### Copy files from the hooklog pool to the classified directories.
1. Read the MD5s from the given pickle dictionary (generated by Mike's AvVendorReport.py).
2. Check whether a trace for each MD5 exists in the hooklog pool.
3. If it does, copy the trace to the target family directory (i.e. do the classification).

In [1]:
import pickle
import os
import shutil

dataPickle = "Report/Aries/aries_top3_voting.pickle"
family_root_dir = "11939data/aries_top3_party/"
hkPoolDir = "11952data/trace_origin/"

# Read the target pickle file: a dict {family name: collection of md5s}.
# NOTE: pickle.load can execute arbitrary code; only load pickles you produced yourself.
with open(dataPickle, 'rb') as handle:
    dataContent = pickle.load(handle)

# Index the hooklog pool once instead of re-walking the whole tree for every family.
# poolIndex maps md5 -> list of (full path, file name); the md5 is the part of the
# file name before the first underscore.
poolIndex = dict()
for root, dirs, files in os.walk(hkPoolDir):
    for fEntry in files:
        md5 = fEntry.split("_")[0]
        # os.walk roots have no trailing separator for sub-directories, so the path
        # must be joined rather than concatenated.
        path = os.path.join(root, fEntry)
        poolIndex.setdefault(md5, []).append((path, fEntry))

for famName, md5s in dataContent.items():
    familyDir = os.path.join(family_root_dir, famName)

    for md5, entries in poolIndex.items():
        if md5 not in md5s:
            continue
        # Create the family directory lazily, only when there is something to copy.
        os.makedirs(familyDir, exist_ok=True)
        for path, fEntry in entries:
            shutil.copyfile(path, os.path.join(familyDir, fEntry))

### Classify the generations of a variant
1. Check whether a variant forks child processes and grandchild processes.
2. Define their relationships.
3. Separate these processes into different directories inside the family directory.

In [2]:
import os
import shutil

# Key API Name = CreateProcess
# Key Attribute = dwProcessId

def _readCreatedProcessIds(tracePath, keyAPI, keyAttribute, maxParamLines=14):
    """Return the process ids reported by ``keyAPI`` records in one trace file.

    A record starts with a line beginning with '#', followed by a line holding
    the API name. For ``keyAPI`` records, up to ``maxParamLines`` further lines
    are inspected: a ``Return=...`` line whose value is not ``SUCCESS`` abandons
    the record, and the first line starting with ``keyAttribute`` contributes
    the text after its first '='.
    """
    created = list()
    with open(tracePath, 'rb') as handle:
        while True:
            raw = handle.readline()
            if not raw:
                # readline() returns b'' only at end of file; a blank line is
                # b'\n' and must not stop the scan.
                break
            line = raw.decode("ISO 8859-1").strip()
            if not line.startswith('#'):
                continue

            api = handle.readline().decode("ISO 8859-1").strip()  # API name follows the header
            if api != keyAPI:
                continue

            # The attribute may appear among the call's parameters; bounding the
            # look-ahead also guarantees the inner scan always terminates.
            for _ in range(maxParamLines):
                newLine = handle.readline().decode("ISO 8859-1").strip()
                fields = newLine.split('=')
                value = fields[1] if len(fields) > 1 else None

                if newLine.startswith("Return") and value != "SUCCESS":
                    break  # failed call: no process was created
                if newLine.startswith(keyAttribute):
                    if value is not None:
                        created.append(value)
                    break
    return created


def getSampleRelation(familyPath, sampleMD5Dict):
    """Work out which traces of a family were spawned by which other traces.

    Trace files are named ``<md5>_<pid>.trace.hooklog``. For every md5 that has
    more than one trace, each trace is scanned for successful ``CreateProcess``
    calls; a reported ``dwProcessId`` that matches another trace of the same
    md5 makes that trace a child of the scanned one.

    Args:
        familyPath: directory that holds the trace files.
        sampleMD5Dict: dict mapping md5 -> list of trace file names.

    Returns:
        A tuple ``(result, possibleRoot, totalTraceCount)`` where ``result``
        maps each scanned trace to the list of its child traces,
        ``possibleRoot`` is the set of traces that are nobody's child
        (including all single-trace samples), and ``totalTraceCount`` is the
        number of distinct traces seen.
    """
    keyAPI = "CreateProcess"
    keyAttribute = "dwProcessId"

    possibleRoot = set()
    md5RelationDict = dict()  # trace file name -> list of pids it created

    for md5 in sorted(sampleMD5Dict):
        traceFiles = sampleMD5Dict[md5]
        if len(traceFiles) == 1:
            # A sample with a single trace cannot have relatives; no need to read it.
            possibleRoot.add(traceFiles[0])
            continue

        for trace in traceFiles:
            md5RelationDict[trace] = _readCreatedProcessIds(
                os.path.join(familyPath, trace), keyAPI, keyAttribute)

    # Translate the created pids into names of traces that actually exist.
    result = dict()
    for trace in sorted(md5RelationDict):
        hashValue = trace.split("_")[0]
        traceList = list()
        for child in md5RelationDict[trace]:
            fName = hashValue + "_" + child + ".trace.hooklog"
            if fName in sampleMD5Dict[hashValue]:
                traceList.append(fName)
        result[trace] = traceList

    # child -> parent; report children that are claimed more than once.
    reverseRelation = dict()
    for parent in sorted(result):
        for ele in result[parent]:
            if ele in reverseRelation: print(ele,";",parent)
            reverseRelation[ele] = parent

    possibleRoot.update(md5RelationDict)
    totalTraceCount = len(possibleRoot)

    # Whatever has a parent cannot be a root.
    possibleRoot.difference_update(reverseRelation)

    return result, possibleRoot, totalTraceCount

def moveOtherMainProcs(familyPath, mainDir):
    """Move every regular file found directly in familyPath into mainDir.

    Intended for the remaining main processes that did not fork any child.
    Sub-directories of familyPath are left untouched. familyPath is used as a
    plain string prefix, so it has to end with a path separator.
    """
    # The directory listing is taken once, before anything is moved.
    candidates = [familyPath + entry for entry in os.listdir(familyPath)]
    for source in candidates:
        if not os.path.isfile(source):
            continue
        shutil.move(source, mainDir)
            

def separateProcessByGeneration(familyPath, levels):
    """Copy each trace into the sub-directory that matches its generation.

    Args:
        familyPath: family directory, used as a string prefix (must end with a
            path separator).
        levels: dict mapping generation number -> iterable of trace file names
            located directly in familyPath.

    Generation 1 goes to 'main/', generation 2 to 'child/', and any other
    generation N to 'N child/'. The originals stay where they are.
    """
    print(familyPath)

    fixedNames = {1: 'main/', 2: 'child/'}

    for generation in levels:
        subDir = fixedNames.get(generation)
        if subDir is None:
            subDir = str(generation) + ' child/'
        targetDir = familyPath + subDir

        os.makedirs(targetDir, exist_ok=True)

        for traceName in levels[generation]:
            shutil.copyfile(familyPath + traceName, targetDir + traceName)

### Check the main-child process relationships of a family
1. Set the specific family.
2. Get the relation dict, which contains the main-child relations
(key = main, value = list of children).
3. If there are grandchildren, the dict value is a dict {key=child, value=list of grandchildren}.
## Run the main/child separation: call 'separateProcessByGeneration()'

In [76]:
familyPath = '11952data/top3_label/5up_test_firstseen/family'

if os.path.isdir(familyPath):
    sampleMD5Dict = dict() # A dict which key=md5, value=list of md5_pid.trace file names

    # Only files directly inside familyPath are considered: the helpers open and copy
    # traces as familyPath + name, so names found in sub-directories (e.g. 'main/'
    # from an earlier run) could not be resolved anyway.
    for fEntry in sorted(os.listdir(familyPath)):
        if(fEntry == '.DS_Store'): continue # MacOS file system file.
        if not os.path.isfile(os.path.join(familyPath, fEntry)): continue

        md5 = fEntry.split("_")[0]
        sampleMD5Dict.setdefault(md5, []).append(fEntry) # classifying traces by md5

    print("MD5 kinds: ", len(sampleMD5Dict.keys()))
    print("Have multi-procs md5:")
    ctr = 0
    for key, value in sampleMD5Dict.items():
        if len(value) > 1:
            ctr+=1
            print(key, len(value))
    print("multi-process samples count:", ctr)
    print("single-process:", len(sampleMD5Dict.keys()) - ctr)

    result, possibleRoot, totalTraceCount = getSampleRelation(familyPath + '/' , sampleMD5Dict)

    # Build the generations breadth-first: level 1 holds the roots, level N+1 the
    # children of level N.
    levels = dict()
    currentLevel = 1

    levels[currentLevel] = possibleRoot
    classifiedTraces = set(possibleRoot) # every trace that already has a level
    classifiedTraceCount = len(classifiedTraces)
    print("currentLevel:",currentLevel , " - ", len(levels[currentLevel]))
    while(classifiedTraceCount < totalTraceCount):
        nextLevel = set()

        for parent in levels[currentLevel]:
            for t in result.get(parent, ()):
                # Count each trace once, even when several parents list it.
                if t not in classifiedTraces:
                    nextLevel.add(t)

        if not nextLevel:
            # The remaining traces are not reachable from any root (e.g. a
            # parent/child cycle); stop instead of looping forever.
            print("unclassified traces:", totalTraceCount - classifiedTraceCount)
            break

        classifiedTraces |= nextLevel
        classifiedTraceCount = len(classifiedTraces)
        currentLevel+=1
        levels[currentLevel] = nextLevel

MD5 kinds:  2
Have multi-procs md5:
f39d60f0cffbfb6541c3e074e24e2f6589e89efb089a1797514901ebc7766cd1 2
multi-process samples count: 1
single-process: 1


In [None]:
# Build the generations breadth-first from the relation data computed above:
# level 1 holds the roots, level N+1 the children of level N.
levels = dict()
currentLevel = 1

levels[currentLevel] = possibleRoot
classifiedTraces = set(possibleRoot) # every trace that already has a level
classifiedTraceCount = len(classifiedTraces)
print("currentLevel:",currentLevel , " - ", len(levels[currentLevel]))
while(classifiedTraceCount < totalTraceCount):
    nextLevel = set()

    for parent in levels[currentLevel]:
        for t in result.get(parent, ()):
            # Count each trace once, even when several parents list it.
            if t not in classifiedTraces:
                nextLevel.add(t)

    if not nextLevel:
        # The remaining traces are not reachable from any root (e.g. a
        # parent/child cycle); stop instead of looping forever.
        print("unclassified traces:", totalTraceCount - classifiedTraceCount)
        break

    classifiedTraces |= nextLevel
    classifiedTraceCount = len(classifiedTraces)
    currentLevel+=1
    levels[currentLevel] = nextLevel
    print("currentLevel:",currentLevel , " - ", len(levels[currentLevel]))

    

In [None]:
# Report how many traces ended up in each level, then the grand total.
tl = 0
for lv in levels:
    size = len(levels[lv])
    print("Level:", lv, " - 個數:", size)
    tl = tl + size
print("總共 ", tl, " 個")

### Moving trace logs according to the level relationships above.

In [None]:
# Do generation separation: copy every trace into the sub-directory of its level.
# Relies on `familyPath` and `levels` set by the cells above; the trailing '/' is
# needed because the function builds paths by string concatenation.
separateProcessByGeneration(familyPath + '/', levels)

### Process all families at once.

In [3]:
def _buildLevels(result, possibleRoot, totalTraceCount):
    """Group traces into generations: level 1 = roots, level N+1 = children of level N."""
    levels = {1: possibleRoot}
    currentLevel = 1
    classifiedTraces = set(possibleRoot)  # every trace that already has a level

    while len(classifiedTraces) < totalTraceCount:
        nextLevel = set()
        for parent in levels[currentLevel]:
            for t in result.get(parent, ()):
                # Count each trace once, even when several parents list it.
                if t not in classifiedTraces:
                    nextLevel.add(t)

        if not nextLevel:
            # The remaining traces are not reachable from any root (e.g. a
            # parent/child cycle); stop instead of looping forever.
            print("unclassified traces:", totalTraceCount - len(classifiedTraces))
            break

        classifiedTraces |= nextLevel
        currentLevel += 1
        levels[currentLevel] = nextLevel

    return levels


def count(familyPath):
    """Classify the traces of one family by generation and copy them into per-level directories.

    Args:
        familyPath: path of the family directory, with or without a trailing
            separator. Paths that are not directories are ignored.

    The traces directly inside the directory are grouped by md5 (the part of
    the file name before the first underscore), their parent/child relations
    are resolved with getSampleRelation(), and the resulting levels are handed
    to separateProcessByGeneration().
    """
    if not os.path.isdir(familyPath):
        return  # nothing to classify

    # Both helpers build paths by string concatenation, so make sure the
    # directory ends with exactly one separator.
    familyDir = os.path.join(familyPath, '')

    sampleMD5Dict = dict()  # A dict which key=md5, value=list of md5_pid.trace file names

    # Only files directly inside the family directory are considered: names found
    # in sub-directories (e.g. 'main/' from an earlier run) cannot be opened as
    # familyDir + name by the helpers.
    for fEntry in sorted(os.listdir(familyDir)):
        if fEntry == '.DS_Store': continue  # MacOS file system file.
        if not os.path.isfile(familyDir + fEntry): continue

        md5 = fEntry.split("_")[0]
        sampleMD5Dict.setdefault(md5, []).append(fEntry)  # classifying traces by md5

    result, possibleRoot, totalTraceCount = getSampleRelation(familyDir, sampleMD5Dict)

    levels = _buildLevels(result, possibleRoot, totalTraceCount)

    # Do generation separation
    separateProcessByGeneration(familyDir, levels)
    
familyPath = '11939data/top3_party_0622_test_分世代用/'
# Every sub-directory of familyPath is one family. Stray files in the listing
# (e.g. the macOS '.DS_Store') are skipped, and the names are sorted so that the
# processing order is the same on every run.
for famName in sorted(os.listdir(familyPath)):
    path = familyPath + famName + '/'
    if not os.path.isdir(path):
        continue
    print("processing - ", path.split('/')[-2])
    count(path)
print('Done.')

processing -  allaple
11939data/top3_party_0622_test_分世代用/allaple/
processing -  almanahe
11939data/top3_party_0622_test_分世代用/almanahe/
processing -  bayrob
11939data/top3_party_0622_test_分世代用/bayrob/
processing -  berbew
11939data/top3_party_0622_test_分世代用/berbew/
processing -  cerber
11939data/top3_party_0622_test_分世代用/cerber/
processing -  chir
11939data/top3_party_0622_test_分世代用/chir/
processing -  cycbot
11939data/top3_party_0622_test_分世代用/cycbot/
processing -  devir
11939data/top3_party_0622_test_分世代用/devir/
processing -  eggnog
11939data/top3_party_0622_test_分世代用/eggnog/
processing -  elkern
11939data/top3_party_0622_test_分世代用/elkern/
processing -  expiro
11939data/top3_party_0622_test_分世代用/expiro/
processing -  fakeav
11939data/top3_party_0622_test_分世代用/fakeav/
processing -  fareit
11939data/top3_party_0622_test_分世代用/fareit/
processing -  fesber
11939data/top3_party_0622_test_分世代用/fesber/
processing -  fujacks
11939data/top3_party_0622_test_分世代用/fujacks/
processing -  gamarue
1