### Get sample's filename

In [1]:
import pandas as pd
import pickle
import sys, os
from collections import Counter
from pathlib import Path

from cc_module_cust import AttackGraph, Node, Edge, FileTable, build # 增加 AttackGraph.step_list

In [2]:
# ! pip install graphviz

In [3]:
def get_executable_sample(csv_path: str = "../C malware info/sample_info.csv",
                          platforms=('X86_64', 'X86', 'ARM')) -> dict:
    """Group sample filenames by malware family, keeping only selected platforms.

    Args:
        csv_path: Path of the sample-info CSV. The file must provide the
            columns ``family``, ``platform`` and ``filename``.
        platforms: Platform labels to keep; rows with any other platform
            are ignored.

    Returns:
        Dict mapping every family found in the CSV to the list of filenames
        (in CSV row order) whose platform is in ``platforms``. A family with
        no matching sample is still present, with an empty list.
    """
    df = pd.read_csv(csv_path)
    # Every family gets a key up front so families without a match are kept.
    sample_dct = {family: [] for family in df.family.unique()}
    # Filter once with a vectorised mask instead of testing each row in Python.
    selected = df[df.platform.isin(list(platforms))]
    for family, filename in zip(selected.family, selected.filename):
        sample_dct[family].append(filename)
    return sample_dct

# Family name -> filenames of the samples whose platform is X86_64, X86 or ARM.
sample_dct = get_executable_sample()

# Disabled: cache the dictionary to disk so it can be reloaded without the CSV.
# with open('sample_dct.pkl', 'wb') as outp:
#     pickle.dump(sample_dct, outp, pickle.HIGHEST_PROTOCOL)

In [4]:
# load dictionary
# with open('../sample_dct.pkl', 'rb') as inp:
#     sample_dct = pickle.load(inp)

In [5]:
# Abridged example of sample_dct: each family name maps to a list of sample filenames.
'''
{
'Xorddos': ['07c070b717a23453a2b71c7582d6a928.bin',
    '0aefb67c01a24d05351b093455203fa2.bin',...],
'Mirai': ['3d9487191dd4e712cbfb8f4dcf916a707f60c3fb23807d4c02fb941e216f951d.bin',
  'ac13002f74249e0eab2dacb596a60323130664b8c19d938af726508fdc7500a2.bin',...],
  ...
}
'''

"\n{\n'Xorddos': ['07c070b717a23453a2b71c7582d6a928.bin',\n    '0aefb67c01a24d05351b093455203fa2.bin',...],\n'Mirai': ['3d9487191dd4e712cbfb8f4dcf916a707f60c3fb23807d4c02fb941e216f951d.bin',\n  'ac13002f74249e0eab2dacb596a60323130664b8c19d938af726508fdc7500a2.bin',...],\n  ...\n}\n"

### Build: functions
存檔內容
- asg 的物件 (pickle file)
- triplet.csv (1115 ver4)
- node.csv (1115 ver4)

[solved] Must be run as a `.py` file; it can't be run in `.ipynb`. Reason unknown.
- 注意：trace folder 和 csv 中的家族名稱不同（一個有 `-all` 後綴、一個沒有）；兩者必須一致，否則建出來的圖 node 數會都是 0。
- 跑到 xorddos 的 ./trace/Dofloo/xxx 噴 FileNotFoundError: [WinError 2] 系統找不到指定的檔案。
    - f477b05826c46ac8a482e85d89b00de0.bin (有時候又OK)

In [6]:
# State shared by blockPrint()/enablePrint(): the stream that was active before
# printing was blocked, and the devnull handle that replaced it.
_saved_stdout = None
_devnull_stdout = None


# Disable
def blockPrint():
    """Silence print() by redirecting sys.stdout to os.devnull.

    The stream that is active at call time is remembered so enablePrint() can
    put exactly that stream back. Calling this again while printing is already
    blocked does nothing, so no extra devnull handle is opened.
    """
    global _saved_stdout, _devnull_stdout
    if _devnull_stdout is not None:
        return
    _saved_stdout = sys.stdout
    _devnull_stdout = open(os.devnull, 'w')
    sys.stdout = _devnull_stdout


# Restore
def enablePrint():
    """Undo blockPrint(): restore the remembered stream and close devnull.

    The remembered stream is restored rather than sys.__stdout__, because
    sys.stdout may have been replaced by the host environment (IPython kernels
    do this) and resetting to sys.__stdout__ would lose that replacement.
    Does nothing when printing is not currently blocked.
    """
    global _saved_stdout, _devnull_stdout
    if _devnull_stdout is None:
        return
    # Only swap back if nobody else has replaced sys.stdout in the meantime.
    if sys.stdout is _devnull_stdout:
        sys.stdout = _saved_stdout
    _devnull_stdout.close()
    _saved_stdout = None
    _devnull_stdout = None
    
def translate_type(s: str) -> str:
    """Expand a short node-type code into its long label.

    'p' -> 'proc', 'f' -> 'file', 'n' -> 'net', 'm' and 'm_addr' -> 'mem';
    every other value yields 'other'.
    """
    # Both memory codes collapse onto the same label.
    if s in ('m', 'm_addr'):
        return 'mem'
    long_labels = {'p': 'proc', 'f': 'file', 'n': 'net'}
    return long_labels.get(s, 'other')
    
def analysis_graph(asg: AttackGraph) -> pd.DataFrame:
    """Build a per-node summary table for an attack graph.

    Args:
        asg: Graph object. This function reads ``asg.graph`` (its keys are
            node objects exposing ``name`` and ``type``) and ``asg.step_list``
            (an iterable of ``(src_node, dst_node, edgename)`` triples).

    Returns:
        DataFrame with the columns ``node_name``, ``type`` (the node type
        passed through translate_type) and ``appearance`` (how many times the
        node occurs as source or destination in ``asg.step_list``; 0 for a
        node that occurs in no step).
    """
    node_counter = Counter()
    node_type = {}

    # Seed both tables from the graph itself, so a node that never shows up in
    # a step still has a type and does not raise KeyError when rows are built.
    for node in asg.graph.keys():
        node_counter[node.name] = 0
        node_type[node.name] = translate_type(node.type)

    for src_node, dst_node, _edgename in asg.step_list:
        for node in (src_node, dst_node):
            node_counter[node.name] += 1
            node_type[node.name] = translate_type(node.type)

    # One row per node: name, translated type, number of appearances.
    rows = [[name, node_type[name], count] for name, count in node_counter.items()]
    return pd.DataFrame(rows, columns=['node_name', 'type', 'appearance'])

In [7]:
def is_important(edgename: str) -> bool:
    """Return True when the syscall name is on the list of calls treated as
    affecting the system, False otherwise."""
    affect_sys = (
        "set_thread_area", "setsid", "fcntl", "fchown", "fchmod", "rename",
        "umask", "set_tid_address", "futex", "set_robust_list", "arch_prctl",
        "execve", "exit_group", "clone", "write", "connect", "send", "brk",
        "socket", "mprotect", "mmap", "openat", "mmap2", "munmap", "open",
        "close",
    )
    return edgename in affect_sys

def get_node_importance(asg: AttackGraph) -> dict:
    """Flag the nodes that are the destination of an important syscall.

    Every node name in ``asg.graph`` starts at 0. A node is set to 1 when
    some edge in ``asg.edges`` targets it and is_important() accepts that
    edge's name. Importance is therefore decided by the syscalls acting on the
    destination node.
    """
    node_importance = {node.name: 0 for node in asg.graph.keys()}
    for (_src_node, dst_node, _), edge in asg.edges.items():
        if is_important(edge.name):
            node_importance[dst_node.name] = 1
    return node_importance

def analysis_triplet(asg: AttackGraph) -> pd.DataFrame:
    """List every step of an attack graph as a (source, edge, destination) row.

    Args:
        asg: Graph object. This function reads ``asg.graph`` (keys are node
            objects exposing ``name`` and ``type``), ``asg.edges`` (a mapping
            whose keys are 3-tuples starting with the source and destination
            node) and ``asg.step_list`` (an iterable of
            ``(src_node, dst_node, edgename)`` triples).

    Returns:
        DataFrame with one row per entry of ``asg.step_list`` and the columns
        ``step, src_node, type, deg, edge, dst_node, type, deg, importance``.
        ``step`` is the 1-based position in ``asg.step_list``, ``deg`` is the
        number of ``asg.edges`` entries touching the node, and ``importance``
        is 1 when is_important() accepts the edge name, else 0.
    """
    # Degree of each node = number of edges in which it is source or destination.
    node_counter = Counter({node.name: 0 for node in asg.graph.keys()})
    for (src_node, dst_node, _), _edge in asg.edges.items():
        node_counter[src_node.name] += 1
        node_counter[dst_node.name] += 1

    triplet_rows = []
    # Number the steps from this loop. The index used to be borrowed from the
    # edge loop above, which stamped the same number on every row and raised
    # NameError when the graph had no edges.
    for step, (src_node, dst_node, edgename) in enumerate(asg.step_list, start=1):
        src_type = translate_type(src_node.type)
        dst_type = translate_type(dst_node.type)
        imp = 1 if is_important(edgename) else 0  # does this syscall change the system?
        src_deg = node_counter[src_node.name]
        dst_deg = node_counter[dst_node.name]
        if edgename in ['read', 'getsockopt']:
            # For these two syscalls the row is written with source and
            # destination swapped.
            triplet_rows.append([step, dst_node.name, dst_type, dst_deg, edgename,
                                 src_node.name, src_type, src_deg, imp])
        else:
            triplet_rows.append([step, src_node.name, src_type, src_deg, edgename,
                                 dst_node.name, dst_type, dst_deg, imp])

    # NOTE: 'type' and 'deg' each appear twice (first for the source side, then
    # for the destination side); the header is kept as-is so existing consumers
    # of the CSV keep working.
    return pd.DataFrame(
        triplet_rows,
        columns=['step', 'src_node', 'type', 'deg', 'edge', 'dst_node', 'type', 'deg', 'importance'],
    )

# filename = 'Dofloo_V4_ALL_wo_system_memory.pkl'
# with open(filename, 'rb') as inp:
#     asg_all = pickle.load(inp)
# df = analysis_triplate(asg_all)

# filename_lst = ['Dofloo_V1_ALL_Memory_and_System.pkl','Dofloo_V2_ALL_Memory_sink.pkl',]
#     # 'Dofloo_V3_ALL_wo_system.pkl','Dofloo_V4_ALL_wo_system_memory.pkl']
# triplate_stat_lst = []
# node_type_set = set()
    
# for i,filename in enumerate(filename_lst):
#     # f = open(f'1115_Dofloo_V{i+1}.txt', 'w')

#     with open(f'1115_dofloo_V1~V2_pkl_cust/{filename}', 'rb') as inp:
#         asg_object = pickle.load(inp)
#         df = analysis_triplate(asg_object)
#         triplate_stat_lst.append(df)

# with pd.ExcelWriter("1115_triplate_statistics.xlsx") as writer:
#     # df.to_excel(writer, sheet_name=f"ver4")
#     [df.to_excel(writer, sheet_name=f"ver{i+1}") for i,df in enumerate(triplate_stat_lst)]

### Build: run script

In [11]:
# Restrict the run to one family and to the samples whose filename starts with
# this prefix; edit these two values to build other graphs.
TARGET_FAMILY = 'Dofloo'
TARGET_SAMPLE_PREFIX = '0046'

Path("./saved_pkl").mkdir(parents=True, exist_ok=True)

for family_name in sample_dct:
    if family_name != TARGET_FAMILY:  # only build graphs for this family
        continue
    print(f'building family {family_name}')
    family_path = f'./saved_pkl/{family_name}'
    Path(family_path).mkdir(parents=True, exist_ok=True)
    for s in sample_dct[family_name]:
        if not s.startswith(TARGET_SAMPLE_PREFIX):  # only build graphs for this sample
            continue
        print(f'  building sample {s}')
        try:
            # Build the graph, then save it together with its node/triplet tables.
            asg_object, digraph = build(family_name, s, save_file=False, draw=False)
            if asg_object is None:
                continue
            # blockPrint()
            pkl_name = s.split('.')[0]
            with open(f'{family_path}/{pkl_name}.pkl', 'wb') as outp:
                # Persist the graph object itself (previously sample_dct was
                # dumped here by mistake, so the .pkl never held the graph).
                pickle.dump(asg_object, outp, pickle.HIGHEST_PROTOCOL)
            df_node = analysis_graph(asg_object)
            df_triplet = analysis_triplet(asg_object)
            node_importance = get_node_importance(asg_object)
            # .get(): a node name without an importance entry counts as not
            # important instead of raising KeyError.
            df_node['importance'] = df_node['node_name'].map(
                lambda name: bool(node_importance.get(name, 0))
            )
            df_node.to_csv(f'{family_path}/{pkl_name}_node.csv', index=False)
            df_triplet.to_csv(f'{family_path}/{pkl_name}_triplet.csv', index=False)
            # enablePrint()
            # break
        except FileNotFoundError:
            print('  [Error] FileNotFoundError')
    # [build(family_name, s) for s in sample_dct[family_name]]  # one-liner version

building family Xorddos
  building sample 07c070b717a23453a2b71c7582d6a928.bin
  path: ../C ASG/trace/Xorddos/07c070b717a23453a2b71c7582d6a928.bin


KeyError: 'path'

### Evaluate