In [2]:
import pickle

class CollectForestInfo:
    """Query helper over one family's forest, loaded from two pickle files.

    The *intermediate* pickle is read as a dict whose values are indexed as
    ``value[1][0]`` (cluster name), ``value[1][1]`` (list of common motif
    sequences) and ``value[2]`` (iterable of member names).  ``value[0]`` is
    not used here (presumably a score -- TODO confirm against the producer).

    The *residual* pickle is read as a dict whose values are indexed as
    ``value[0][0]`` (cluster name) and ``value[1]`` (members).

    Member names whose first character is ``"G"`` are treated as references
    to other clusters; every other member is treated as a leaf.
    """

    # Class-level defaults; each instance overwrites all of them in __init__.
    intermediateDict = None
    residualDict = None
    descendant_dict = None
    repCommMotifSeq_dict = None  # cluster name -> list of common motif sequences
    treeList = None

    def __init__(self, intermidiatePicklePath, residualPicklePath, includePairwiseTree, forceMerge=False):
        """Load both pickles and precompute descendants and the tree-root list.

        Args:
            intermidiatePicklePath: path of the intermediate pickle file.
            residualPicklePath: path of the residual pickle file.
            includePairwiseTree: when false, residual clusters with exactly
                two members are kept only if one member is itself a cluster.
            forceMerge: when true, a cluster's descendants are just its
                direct members (cluster references are not expanded).
        """
        # NOTE(security): pickle.load can execute arbitrary code embedded in the
        # file; only point this class at pickle files from a trusted source.
        with open(intermidiatePicklePath, 'rb') as handle:
            self.intermediateDict = pickle.load(handle)
        with open(residualPicklePath, 'rb') as handle:
            self.residualDict = pickle.load(handle)

        self._setForestOutputs(forceMerge)
        self._setTreeList(includePairwiseTree)

    def _setForestOutputs(self, forceMerge):
        """Build ``descendant_dict`` and ``repCommMotifSeq_dict`` from the intermediate dict."""
        descendant_dict = dict()
        repCommMotifSeq_dict = dict()

        # Entries are visited in ascending key order: expanding a "G" member
        # below requires that cluster to have been processed already.
        for _, value in sorted(self.intermediateDict.items(), key=lambda x: x[0]):
            clusterName = value[1][0]
            commonMotifSeq = value[1][1]  # list of common motif sequences
            memberSet = value[2]

            descendants = set()
            for member in memberSet:
                if not forceMerge and member[0] == "G":
                    # Replace a cluster reference with that cluster's descendants.
                    descendants.update(descendant_dict[member])
                else:
                    descendants.add(member)

            descendant_dict[clusterName] = descendants
            repCommMotifSeq_dict[clusterName] = commonMotifSeq

        self.descendant_dict = descendant_dict
        self.repCommMotifSeq_dict = repCommMotifSeq_dict

    def _setTreeList(self, includePairwiseTree):
        """Collect the residual clusters that are not singular into ``treeList``.

        ``treeList`` becomes a list of ``(clusterName, members)`` tuples sorted
        by the integer that follows the first character of the cluster name.
        """
        notLonerList = []

        for value in self.residualDict.values():
            clusterName = value[0][0]
            members = value[1]

            if len(members) <= 1:
                continue  # a single sample is not a tree

            # A two-member cluster counts as a tree only when pairwise trees are
            # allowed or when one of the two members is itself a cluster.
            isPlainPair = len(members) == 2 and not any(member[0] == 'G' for member in members)
            if includePairwiseTree or not isPlainPair:
                notLonerList.append((clusterName, members))

        notLonerList.sort(key=lambda x: int(x[0][1:]))
        self.treeList = notLonerList

    def getDescendant_dict(self):
        """Return the descendant sets of all clusters (roots and intermediate nodes)."""
        return self.descendant_dict

    def getTreeList(self):
        """Return the list of ``(rootName, members)`` tuples, one per tree."""
        return self.treeList

    def getTreeRootNameList(self):
        """Return the root names of all trees, in ``treeList`` order."""
        # Each treeList entry is (parentNodeName, {childNodeNames}).
        return [treeRoot[0] for treeRoot in self.treeList]

    def getTreeRootCount(self):
        """Return how many trees the forest contains."""
        return len(self.treeList)

    def getForestMembers(self):
        """Return the union of the member sets of all trees."""
        forestMemberSet = set()
        for members in self.getTreeMembers_dict().values():
            forestMemberSet.update(members)
        return forestMemberSet

    def getForestMemberCount(self):
        """Return how many distinct members the forest contains."""
        return len(self.getForestMembers())

    def getTreeMembers_dict(self):
        """Return a dict mapping each tree root name to its descendant set."""
        return {rootName: self.descendant_dict[rootName]
                for rootName in self.getTreeRootNameList()}

    def getTreeMembers(self, rootName):
        """Return the member set of the tree rooted at ``rootName``.

        Raises:
            KeyError: if ``rootName`` is not the root of a tree in ``treeList``.
        """
        return self.getTreeMembers_dict()[rootName]

    def getTreeSamples(self, rootName):
        """Return the set of member-name prefixes (text before the first ``'_'``) in a tree."""
        return {member.split('_')[0] for member in self.getTreeMembers(rootName)}

    def getRepAPISeq_dict(self):
        """Return a dict mapping each tree root name to its flattened API sequence."""
        return {rootName: self.getRepAPISeq(rootName)
                for rootName in self.getTreeRootNameList()}

    def getRepAPISeq(self, rootName):
        """Return the cluster's common motif sequences concatenated into one flat list."""
        commonAPISeq = []
        for motifAPI in self.repCommMotifSeq_dict[rootName]:
            commonAPISeq.extend(motifAPI)
        return commonAPISeq

    def getRepMotifCount(self, rootName):
        """Return how many motifs the cluster's common motif sequence list holds."""
        return len(self.repCommMotifSeq_dict[rootName])

    def getRepMotifSequence(self, rootName):
        """Return the cluster's list of common motif sequences (not flattened)."""
        return self.repCommMotifSeq_dict[rootName]

### Dump Tree's Rep & Hooklog

把每棵樹所包含的 hooklogs 以及 rep sequence，依照 family 統整到 tree-rep-logs 資料夾。

In [None]:
import pprint
# Shared pretty-printer instance configured with a 4-space indent.
pp = pprint.PrettyPrinter(indent=4)

In [None]:
import os
import re
from shutil import copy

# Directory locations used by the dump helpers below.
# NOTE(review): these are absolute, machine-specific paths; adjust before running elsewhere.
input_path = '/home/master/r07725027/dataset/rasMMA-output/'  # one sub-folder per family, each with a pickle/ folder
output_path = '/home/master/r07725027/dataset/tree-rep-logs/'  # destination root for the dumped reps and hooklogs
data_directory = '/home/master/r07725027/dataset/aries_v2_simplified_15up/'  # hooklog folders, searched by family name

# Directory listings are taken once, when this cell runs.
families_dir = os.listdir(input_path)
family_folders = os.listdir(data_directory)

def dump_tree_rep(tree, rep, output_path):
    """Pickle a tree's representative sequence to ``<output_path><tree>/rep.pickle``.

    Args:
        tree: tree (root) name; used as the sub-folder name.
        rep: picklable object to store.
        output_path: destination folder path, expected to end with ``/``.
    """
    tree_path = f'{output_path}{tree}/'

    # exist_ok=True lets the dump be re-run over an existing output folder
    # instead of failing with FileExistsError.
    os.makedirs(tree_path, mode=0o744, exist_ok=True)

    # The context manager closes (and flushes) the file even if pickling fails.
    with open(f'{tree_path}rep.pickle', "wb") as handle:
        pickle.dump(rep, handle)
    print('TREE REP DUMP: ' + tree)
        
def dump_tree_hooklog(family_name, tree_name, member_list):
    """Copy the hooklog files of a tree's members into the tree's output folder.

    Reads the module-level ``family_folders``, ``data_directory`` and
    ``output_path``.  Files are copied to
    ``<output_path><family_name>/<tree_name>/<log file name>``.

    Args:
        family_name: family folder name; the text before its first ``'_'`` is
            used to find the matching folder in ``family_folders``.
        tree_name: tree (root) name, i.e. the destination sub-folder.
        member_list: iterable of member names of the form ``"<start>_<end>"``.

    Raises:
        ValueError: if a member name does not contain exactly one ``'_'``.
    """
    # Find the first hooklog folder whose name contains the family key.
    family_key = family_name.split('_')[0]
    family_data_folder = next(
        (folder for folder in family_folders if family_key in folder), None)
    if family_data_folder is None:
        # Without a match the old code listed data_directory itself, which can
        # only produce wrong copies; report and skip instead.
        print(f'-----{family_name} hooklog folder not found.')
        return

    family_data_folder = f'{data_directory}{family_data_folder}/'

    # Build one pattern per member up front (not once per log/member pair);
    # re.escape keeps regex metacharacters in member names literal.
    patterns = []
    for member in member_list:
        start, end = member.split('_')
        patterns.append(re.compile(
            r'^' + re.escape(start) + r'.*_' + re.escape(end) + r'\.trace\.hooklog'))

    for log in os.listdir(family_data_folder):
        for pattern in patterns:
            # copy hooklog to /output-path/family_name/tree_name/*.trace.hooklog
            if pattern.search(log):
                copy(family_data_folder + log, f'{output_path}{family_name}/{tree_name}/{log}')
                print('TREE LOG DUMP: ' + tree_name)
        
def main():
    """Dump every tree's rep sequence and hooklogs for each family under ``input_path``.

    For each entry of ``families_dir`` the two pickles
    ``<family>_intermediate.pickle`` and ``<family>_residual.pickle`` are
    looked up under ``<input_path><family>/pickle/``; families missing either
    file are reported and skipped.
    """
    for family_name in families_dir:
        pickle_path = f'{input_path}{family_name}/pickle/'
        intermidiate_pickle_path = f'{pickle_path}{family_name}_intermediate.pickle'
        residual_pickle_path = f'{pickle_path}{family_name}_residual.pickle'
        # Derived from the module-level output_path (the same root that
        # dump_tree_hooklog copies into) rather than repeating the absolute
        # path, so reps and hooklogs always land in the same folder.
        family_output_path = f'{output_path}{family_name}/'

        # check if pickle file exist
        if not os.path.exists(intermidiate_pickle_path) or not os.path.exists(residual_pickle_path):
            print(f'-----{family_name} pickle not exists.')
            continue

        collect_forest_info = CollectForestInfo(intermidiate_pickle_path, residual_pickle_path, True)
        trees = collect_forest_info.getTreeList()

        print('----- FAMILY: Dumping ' + family_name + ' -----')

        for tree in trees:
            tree_name = tree[0]

            # dump tree's rep sequence (this also creates the tree folder
            # that the hooklogs are copied into next)
            rep = collect_forest_info.getRepMotifSequence(tree_name)
            dump_tree_rep(tree_name, rep, family_output_path)

            # dump tree's hooklogs
            members = collect_forest_info.getTreeMembers(tree_name)
            dump_tree_hooklog(family_name, tree_name, members)
    
        
# Run the dump for every family (writes folders and files under output_path).
main()

### Dumping Data Correctness Test

檢查三種數量，將dump出來的資料(test)跟collect_forest_info(truth)比較：
- 每個family下tree的數量
- 每個tree下的hooklog的數量
- 每個tree的Rep Sequence motif數

In [None]:
# Compare the dumped folders (test) against CollectForestInfo (truth) for:
#   1. the number of trees per family,
#   2. the number of hooklogs per tree,
#   3. the number of motifs in each tree's rep sequence.
input_path = '/home/master/r07725027/dataset/rasMMA-output/'
dump_path = '/home/master/r07725027/dataset/tree-rep-logs/'
family_dump_folders = os.listdir(dump_path)

any_error = False
tree_error = []
log_error = []
rep_error = []

for family_name in family_dump_folders:
    # test data
    trees_test = os.listdir(f'{dump_path}{family_name}')
    trees_count_test = len(trees_test)

    pickle_path = f'{input_path}{family_name}/pickle/'
    intermidiate_pickle_path = f'{pickle_path}{family_name}_intermediate.pickle'
    residual_pickle_path = f'{pickle_path}{family_name}_residual.pickle'

    # truth data
    collect_forest_info = CollectForestInfo(intermidiate_pickle_path, residual_pickle_path, True)
    trees = collect_forest_info.getTreeList()
    trees_count = len(trees)

    # check each family tree count
    if trees_count == trees_count_test:
        print('(MATCH) TREE COUNT: ' + family_name)
    else:
        print('(ERROR) TREE COUNT: ' + family_name)
        print(trees_count_test, trees_count)
        any_error = True
        tree_error.append(family_name)

    # check each tree hooklog count
    for tree in trees_test:
        # test data
        logs_test = os.listdir(f'{dump_path}{family_name}/{tree}/')
        logs_count_test = len(logs_test)

        # truth data
        logs_count = len(collect_forest_info.getTreeMembers(tree))

        # The tree folder also holds rep.pickle, hence the "- 1".
        if logs_count == logs_count_test - 1:
            print('    - (MATCH) LOG COUNT: ' + tree)
        else:
            print('    - (ERROR) LOG COUNT: ' + tree)
            any_error = True
            log_error.append(f'{family_name}/{tree}')

        # check rep lines count
        # NOTE(security): pickle.load runs code embedded in the file; only
        # load dumps this notebook produced.
        with open(f'{dump_path}{family_name}/{tree}/rep.pickle', "rb") as handle:
            tree_rep_test = pickle.load(handle)
        tree_rep = collect_forest_info.getRepMotifSequence(tree)

        if len(tree_rep_test) == len(tree_rep):
            print('    - (MATCH) REP SEQ COUNT: ' + tree)
        else:
            print('    - (ERROR) REP SEQ COUNT: ' + tree)
            any_error = True
            rep_error.append(f'{family_name}/{tree}')


if not any_error:
    print("\n----- Dump Successfully -----")
else:
    print('\nTree Error: ', tree_error)
    print('Log Error: ', log_error)
    # rep_error was collected above but previously never reported.
    print('Rep Error: ', rep_error)

## Other tests

In [15]:
# ### unit test
# Takes two pickle files (intermediate and residual) of one family as input.
pkl_dir_path = 'output/RasMMA_forest/40.picsys_0.8/pickle/'
interPkl = pkl_dir_path + '40.picsys_0.8_intermediate.pickle'
resPkl = pkl_dir_path + '40.picsys_0.8_residual.pickle'
TreeUtil = CollectForestInfo  # alias for the class, not an instance
testFamilyForest = TreeUtil(interPkl, resPkl, True)  # True: includePairwiseTree



# for root in rootNames:
#     rootAPISeq = testFamilyForest.getRepAPISeq(root)
#     motifCount = testFamilyForest.getRepMotifCount(root)
#     print(len(rootAPISeq), motifCount)
    
#     motifSeq = testFamilyForest.getRepAPISeq(root)
#     motifLenList = [len(motif) for motif in motifSeq]
#     print(motifLenList)

201 5
[108, 86, 36, 37, 37, 46, 102, 74, 100, 86, 107, 36, 36, 86, 74, 93, 134, 110, 131, 34, 35, 73, 78, 88, 79, 79, 36, 66, 79, 92, 110, 82, 94, 104, 111, 122, 102, 112, 93, 98, 112, 114, 92, 97, 92, 104, 92, 206, 92, 113, 92, 97, 92, 105, 92, 117, 92, 100, 92, 107, 92, 93, 92, 94, 79, 36, 66, 79, 79, 36, 66, 79, 79, 36, 96, 79, 96, 79, 36, 96, 79, 96, 79, 36, 96, 79, 96, 79, 36, 79, 96, 79, 79, 79, 36, 96, 79, 96, 79, 36, 66, 79, 36, 66, 79, 36, 66, 79, 79, 36, 96, 79, 96, 79, 36, 96, 79, 96, 79, 36, 96, 79, 96, 79, 36, 96, 79, 96, 79, 36, 96, 79, 96, 79, 36, 79, 36, 96, 79, 96, 79, 36, 79, 36, 96, 79, 96, 79, 96, 79, 96, 79, 36, 96, 79, 96, 79, 36, 96, 79, 96, 79, 36, 79, 36, 79, 36, 79, 36, 66, 79, 79, 36, 79, 36, 79, 36, 79, 36, 79, 79, 36, 79, 79, 36, 79, 51, 44, 51, 44, 110, 79, 36, 96, 79, 96, 165, 280, 89, 92, 85]
76 4
[35, 37, 55, 108, 86, 36, 86, 46, 102, 74, 100, 86, 74, 37, 35, 36, 33, 102, 131, 109, 116, 112, 101, 1885, 101, 1885, 101, 1885, 101, 1885, 101, 1885, 101, 18

In [11]:
# Query the forest directly: the `rootNames` variable is only assigned in a
# later cell, so displaying it here fails on Restart & Run All.
testFamilyForest.getTreeRootNameList()

['G80', 'G111', 'G114']

In [19]:
# Collect, per tree root, its member set (K) and its rep motif sequence (M).
# K[i] and M[i] both correspond to rootNames[i].
rootNames = testFamilyForest.getTreeRootNameList()
M=[]
K = []
for roots in rootNames:
    kk = testFamilyForest.getTreeMembers(roots) # which hooklogs belong to this tree
    K.append(kk)
#     print(kk)
    mm = testFamilyForest.getRepMotifSequence(roots)  # this tree's representative (REP) motif sequence
    M.append(mm)
#     print(mm)
# print(kk)
# print(mm)

In [23]:
# Display, for every tree root, its motif sequences flattened into one API list.
testFamilyForest.getRepAPISeq_dict()

{'G80': ['RegQueryValue#PR@HKLM@sys_curCtlSet_ctl_sessionManager\\*#PR@SUBK@criticalsectiontimeout#PR@0#PR@12f9b0#Ret#0',
  'RegQueryValue#PR@HKLM@soft_ms_ole\\*#PR@SUBK@rwlockresourcetimeout#PR@0#PR@12f9b4#Ret#P',
  'LoadLibrary#PR@SYS@wininet@DLL#Ret#P',
  'LoadLibrary#PR@SYS@advapi32@DLL#Ret#P',
  'LoadLibrary#PR@SYS@advapi32@DLL#Ret#P',
  'LoadLibrary#PR@ARB@windowsshell@MANIFEST#Ret#N',
  'CreateFile#PR@ARB@MANIFEST#PR@GENERIC_READ#PR@OPEN_EXISTING#PR@FILE_SHARE_DELETE;FILE_SHARE_READ#Ret#P',
  'RegQueryValue#PR@HKCU@desktop\\#PR@SUBK@smoothscroll#PR@0#PR@77462a48#Ret#P',
  'RegQueryValue#PR@HKCU@soft_ms_win_explorer\\advanced#PR@SUBK@enableballoontips#PR@0#PR@77462170#Ret#P',
  'RegEnumValue#PR@HKLM@soft_ms_winNT_languagepack\\*#PR@SUBK@surr#PR@REG_DWORD#PR@2#Ret#0',
  'RegCreateKey#PR@HKCU@hkey_current_user#PR@SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Internet Settings#Ret#0',
  'LoadLibrary#PR@SYS@wsock32@DLL#Ret#P',
  'LoadLibrary#PR@SYS@shell32@DLL#Ret#P',
  'RegQueryValu

In [24]:
# Display the per-root rep motif sequences gathered into M by the loop above.
M

[[['RegQueryValue#PR@HKLM@sys_curCtlSet_ctl_sessionManager\\*#PR@SUBK@criticalsectiontimeout#PR@0#PR@12f9b0#Ret#0',
   'RegQueryValue#PR@HKLM@soft_ms_ole\\*#PR@SUBK@rwlockresourcetimeout#PR@0#PR@12f9b4#Ret#P',
   'LoadLibrary#PR@SYS@wininet@DLL#Ret#P',
   'LoadLibrary#PR@SYS@advapi32@DLL#Ret#P',
   'LoadLibrary#PR@SYS@advapi32@DLL#Ret#P',
   'LoadLibrary#PR@ARB@windowsshell@MANIFEST#Ret#N',
   'CreateFile#PR@ARB@MANIFEST#PR@GENERIC_READ#PR@OPEN_EXISTING#PR@FILE_SHARE_DELETE;FILE_SHARE_READ#Ret#P',
   'RegQueryValue#PR@HKCU@desktop\\#PR@SUBK@smoothscroll#PR@0#PR@77462a48#Ret#P',
   'RegQueryValue#PR@HKCU@soft_ms_win_explorer\\advanced#PR@SUBK@enableballoontips#PR@0#PR@77462170#Ret#P',
   'RegEnumValue#PR@HKLM@soft_ms_winNT_languagepack\\*#PR@SUBK@surr#PR@REG_DWORD#PR@2#Ret#0',
   'RegCreateKey#PR@HKCU@hkey_current_user#PR@SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Internet Settings#Ret#0',
   'LoadLibrary#PR@SYS@wsock32@DLL#Ret#P',
   'LoadLibrary#PR@SYS@shell32@DLL#Ret#P',
   'RegQu

* 