## 2022/11/08 針對 Dofloo 單一個樣本，4 種 ASG 的結果進行分析

In [1]:
import pandas as pd
import pickle
import sys
from collections import Counter
from pathlib import Path

從上層資料夾import模組，但build()的相對路徑會出錯，還是直接把模組搬過來測。
- https://www.geeksforgeeks.org/python-import-from-parent-directory/
- https://stackoverflow.com/questions/714063/importing-modules-from-parent-folder
- https://stackoverflow.com/questions/39125532/file-does-not-exist-in-jupyter-notebook

In [2]:
# setting path to import cc_module
# import sys
# import os
# current = os.path.abspath('')
# # current = os.path.dirname(os.path.realpath(__file__))
# parent = os.path.dirname(current)
# sys.path.append(parent)
from cc_module import build
from cc_module import AttackGraph, Node, Edge, FileTable

### Excel 1: 包裝成 function，印出統計結果 (Node)
結果直接輸出到txt中喔!

In [3]:
def translate_type(s: str) -> str:
    """Map a short node type code to a readable label.

    Args:
        s: Type code as stored on a node's ``type`` attribute.

    Returns:
        ``'proc'``, ``'file'``, ``'net'`` or ``'mem'`` for the codes ``'p'``,
        ``'f'``, ``'n'`` and ``'m'``; ``'other'`` for every other code.
    """
    # Named `labels` (not `type`) so the builtin `type` is not shadowed.
    labels = {
        'p': 'proc',
        'f': 'file',
        'n': 'net',
        'm': 'mem',
    }
    # NOTE(review): the node-type counters in this notebook use the code
    # 'm_addr', which falls through to 'other' here -- confirm whether it
    # should be reported as 'mem'.
    return labels.get(s, 'other')

In [4]:
def analysis_graph(asg: AttackGraph) -> pd.DataFrame:
    """Print summary statistics of an attack graph and tabulate its nodes.

    Printed to stdout: node and edge counts, a count per raw node type,
    how often every node name appears as an edge endpoint, how often every
    edge name (syscall) occurs, and the number of distinct edge names.

    Args:
        asg: Graph object exposing ``graph`` (its keys are the nodes, each
            with ``name`` and ``type``) and ``edges`` (a mapping from
            ``(src_node, dst_node, <third element>)`` to an edge with
            ``name``).

    Returns:
        DataFrame with columns ``node_name``, ``type`` and ``appearance``,
        one row per distinct node name.
    """
    graph_nodes = list(asg.graph.keys())
    node_set = [node.name for node in graph_nodes]
    print('# of nodes:',len(node_set))
    print('# of edges:',len(asg.edges),'(edges equal to steps)\n')

    nodeType_counter = Counter({n:0 for n in ['f','c','n','p','m_addr','pipe','else']})
    for node in graph_nodes:
        nodeType_counter[node.type] += 1
    print('# of node type:\n',nodeType_counter)

    # Seed the type of every graph node up front. Previously only edge
    # endpoints were recorded, so a node without any edge raised KeyError
    # when the result rows were built. Edge endpoints still overwrite the
    # seeded value below, so nodes that do have edges are reported as before.
    node_type = {node.name: translate_type(node.type) for node in graph_nodes}

    node_counter = Counter({n:0 for n in node_set})
    edge_counter = Counter()
    # Per the original author's note, the third key element is effectively
    # the same as edge.name, so it is ignored here.
    for (src_node, dst_node, _), edge in asg.edges.items():
        node_counter[src_node.name] += 1
        node_counter[dst_node.name] += 1
        edge_counter[edge.name] += 1
        node_type[src_node.name] = translate_type(src_node.type)
        node_type[dst_node.name] = translate_type(dst_node.type)
    print('\nfrequency of node:\n',node_counter)
    print('\nfrequency of edge:\n',edge_counter)
    print('\n# of syscall',len(edge_counter))

    # One row per node name: node_name, type, appearance.
    rows = [[name, node_type[name], count] for name, count in node_counter.items()]
    return pd.DataFrame(rows, columns=['node_name', 'type', 'appearance'])


- 這邊直接將 print 的輸出寫入檔案 1115_Dofloo_V1.txt、1115_Dofloo_V2.txt ...，並另外產生一個 node 統計資訊的 Excel 檔 (1115_node_statistics.xlsx)。
- https://stackoverflow.com/questions/7152762/how-to-redirect-print-output-to-a-file

In [10]:
# filename = 'Dofloo_V1_ALL有畫出-Memory-and-System.pkl'
# with open(filename, 'rb') as inp:
#     asg_all = pickle.load(inp)

from contextlib import redirect_stdout

filename_lst = ['Dofloo_V1_ALL_Memory_and_System.pkl','Dofloo_V2_ALL_Memory_sink.pkl',]
    # 'Dofloo_V3_ALL_wo_system.pkl','Dofloo_V4_ALL_wo_system_memory.pkl']
node_stat_lst = []

for i, filename in enumerate(filename_lst):
    # NOTE: pickle.load can execute arbitrary code; only load .pkl files
    # that were produced locally by this project.
    with open(f'1115_dofloo_V1~V2_pkl/{filename}', 'rb') as inp:
        asg_object = pickle.load(inp)

    # redirect_stdout restores sys.stdout and the `with` closes the report
    # file even when analysis_graph raises, so a failure can no longer
    # leave the notebook's output redirected into a dangling file.
    with open(f'1115_Dofloo_V{i+1}.txt', 'w') as f, redirect_stdout(f):
        df = analysis_graph(asg_object)
    node_stat_lst.append(df)

# Write the node statistics to Excel, one sheet per version
# (overwrites an existing file of the same name -- be careful!).
with pd.ExcelWriter("1115_node_statistics.xlsx") as writer:
    for version, df in enumerate(node_stat_lst, start=1):
        df.to_excel(writer, sheet_name=f"ver{version}")

### Excel 2: 列印出圖中的 Triplet (edge)
用來觀察 x -> A 的 x 有哪些，又執行了甚麼動作。 
 
- 影響系統的函數關鍵字：set、change、create（因為 create 也是在系統中多出一個新的東西，所以 open() 包含在這裡）：

  `affect_sys = ["set_thread_area", "setsid", "fcntl", "fchown", "fchmod", "rename", "umask", "set_tid_address", "futex", "set_robust_list", "arch_prctl", "execve", "exit_group", "clone", "write", "connect", "send", "brk", "socket", "mprotect", "mmap", "openat", "mmap2", "munmap", "open", "close"]`

- 有些 syscall 不會影響系統，但會取得系統資訊，仍然相當重要（例如 uname）：

  `non_affect_sys = ["ugetrlimit", "readlink", "getcwd", "uname", "waitpid", "getuid", "getgid", "getppid", "getegid", "wait4", "prlimit64", "getpid", "geteuid", "statfs", "_newselect", "getsockopt", "stat", "access", "fstat", "ioctl", "time", "fstat64", "nanosleep", "lseek", "read"]`

In [11]:
# Syscalls treated as changing system state (set / change / create).
# 'exec' is listed next to 'execve' because the attack graph labels that
# edge 'exec' (the graph-side edge counts in this notebook contain 'exec'
# and no 'execve'); without the alias exec edges were never flagged.
_AFFECT_SYS = frozenset([
    "set_thread_area", "setsid", "fcntl", "fchown", "fchmod", "rename",
    "umask", "set_tid_address", "futex", "set_robust_list", "arch_prctl",
    "execve", "exec", "exit_group", "clone", "write", "connect", "send",
    "brk", "socket", "mprotect", "mmap", "openat", "mmap2", "munmap",
    "open", "close",
])


def is_important(edgename: str) -> bool:
    """Return True when the syscall named *edgename* changes the system.

    Args:
        edgename: Edge (syscall) name as stored on a graph edge.

    Returns:
        True if the name is in the set of system-affecting syscalls,
        otherwise False.
    """
    return edgename in _AFFECT_SYS

def analysis_triplate(asg: AttackGraph) -> pd.DataFrame:
    """Tabulate every edge of the graph as a (source, syscall, target) row.

    Each row holds the 1-based step number, then name / translated type /
    degree of the first node, the edge name, name / translated type /
    degree of the second node, and an importance flag (1 when
    ``is_important`` accepts the edge name, else 0). A node's degree is the
    number of times its name appears as either endpoint of an edge. For
    ``'read'`` edges the two nodes are swapped in the row.

    Args:
        asg: Graph object exposing ``graph`` (keys are nodes) and ``edges``
            (mapping from a 3-element key starting with the two endpoint
            nodes to an edge with ``name``).

    Returns:
        DataFrame with columns ``step, src_node, type, deg, edge,
        dst_node, type, deg, importance``.
    """
    # First pass: degree of every node name, starting from 0 for all
    # names present in the graph.
    degree = Counter(dict.fromkeys((vertex.name for vertex in asg.graph.keys()), 0))
    for (tail, head, _), _unused_edge in asg.edges.items():
        degree[tail.name] += 1
        degree[head.name] += 1

    # Second pass: one output row per edge.
    rows = []
    for step, ((tail, head, _), link) in enumerate(asg.edges.items(), start=1):
        tail_cells = [tail.name, translate_type(tail.type), degree[tail.name]]
        head_cells = [head.name, translate_type(head.type), degree[head.name]]
        # Whether this syscall changes the system.
        flag = 1 if is_important(link.name) else 0
        # 'read' rows list the nodes in reverse order.
        if link.name == 'read':
            left, right = head_cells, tail_cells
        else:
            left, right = tail_cells, head_cells
        rows.append([step, *left, link.name, *right, flag])

    header = ['step','src_node', 'type', 'deg', 'edge', 'dst_node', 'type', 'deg','importance']
    return pd.DataFrame(rows, columns=header)

# filename = 'Dofloo_V4_ALL_wo_system_memory.pkl'
# with open(filename, 'rb') as inp:
#     asg_all = pickle.load(inp)
# df = analysis_triplate(asg_all)

filename_lst = ['Dofloo_V1_ALL_Memory_and_System.pkl','Dofloo_V2_ALL_Memory_sink.pkl',]
    # 'Dofloo_V3_ALL_wo_system.pkl','Dofloo_V4_ALL_wo_system_memory.pkl']
triplate_stat_lst = []
node_type_set = set()

for filename in filename_lst:
    # NOTE: pickle.load can execute arbitrary code; only load .pkl files
    # that were produced locally by this project.
    with open(f'1115_dofloo_V1~V2_pkl/{filename}', 'rb') as inp:
        asg_object = pickle.load(inp)
    # The file is closed before the analysis runs; only the loaded object
    # is needed from here on.
    df = analysis_triplate(asg_object)
    triplate_stat_lst.append(df)

# One sheet per version ("ver1", "ver2", ...); a plain loop replaces the
# former list comprehension that was evaluated only for its side effects.
with pd.ExcelWriter("1115_triplate_statistics.xlsx") as writer:
    for version, df in enumerate(triplate_stat_lst, start=1):
        df.to_excel(writer, sheet_name=f"ver{version}")

- 使用 xlwings 套件（從 Python 操作 Excel），將 importance 欄位為 1 的列上色。

In [None]:
# ! pip install xlwings

In [7]:
import string
def char(idx: int) -> str:
    """Convert a zero-based column index to Excel column letters.

    0 -> 'A', 1 -> 'B', ..., 25 -> 'Z', 26 -> 'AA', 27 -> 'AB', ...

    Args:
        idx: Zero-based column index.

    Returns:
        The column label in Excel's letter notation.

    Raises:
        ValueError: If *idx* is negative.
    """
    if idx < 0:
        # A negative index used to wrap around silently ('Z' for -1).
        raise ValueError(f"column index must be non-negative, got {idx}")
    letters = []
    remaining = idx + 1  # column letters form a bijective base-26 numeral (no zero digit)
    while remaining > 0:
        remaining, digit = divmod(remaining - 1, 26)
        letters.append(string.ascii_uppercase[digit])
    return ''.join(reversed(letters))

In [12]:
# Highlight every row whose 'importance' cell equals 1, in each sheet of
# the triplet statistics workbook, then save the workbook.
import xlwings as xw
from xlwings.constants import Direction
workbook = xw.Book('1115_triplate_statistics.xlsx')
sheets = workbook.sheets

for i,sheet in enumerate(sheets):
    # Extent of the data, measured from the header cell B1:
    # lenx = row reached going down from B1, minus 1 (used below as the
    #        number of rows after the header row);
    # colx = column number reached going right from B1, minus 1, i.e. the
    #        zero-based index that char() expects for that column.
    lenx = sheet.range("B1").end(Direction.xlDown).row - 1 # number of data rows
    colx = sheet.range("B1").end(Direction.xlToRight).column - 1
    titles = sheet.range(f"B1:{char(colx)}1").value
    # titles starts at column B, so position k in titles is column index
    # k + 1 for char(); col_imp is the letter of the 'importance' column.
    col_imp = char(titles.index('importance') + 1) # column letter of 'importance'

    for row in range(2, lenx+2):
        # if importance cell == 1, color that row
        if sheet.range(f"{col_imp}{row}").value == 1:
            # print('yes')
            sheet.range(f"B{row}:{col_imp}{row}").color = (255, 255, 204) # light yellow
workbook.save()

In [19]:
# test: 為特定一張表的重要syscall 加上顏色
# import xlwings as xw
# from xlwings.constants import Direction
# workbook = xw.Book('triplate_statistics.xlsx')
# sheet = workbook.sheets['ver4']
# lenx = sheet.range("B1").end(Direction.xlDown).row - 1 # 資料有幾欄&幾列
# colx = sheet.range("B1").end(Direction.xlToRight).column - 1
# titles = sheet.range(f"B1:{char(colx)}1").value
# col_imp = char(titles.index('importance') + 1) # 'importance'在第幾欄 (第H欄)

# for row in range(2, lenx+2):
#     # if importance cell == 1, color that row
#     if sheet.range(f"{col_imp}{row}").value == 1:
#         # print('yes')
#         sheet.range(f"B{row}:{col_imp}{row}").color = (255, 255, 204) # 淺黃色
# workbook.save()

### 以下是測試的 code

In [3]:
# Build the graph for one Dofloo sample; build() returns a pair that is
# unpacked into `asg` (used by the cells below) and `g`.
# NOTE(review): what `g` holds is not visible in this notebook -- confirm in cc_module.
asg, g = build('Dofloo', '9a37fcc7eab08d59532bc5c66390bc30.bin')

path: ../trace/Dofloo/9a37fcc7eab08d59532bc5c66390bc30.bin
Nodes after reduction:  36


In [10]:
# Collect the name of every node in the graph, then report how many nodes
# and edges the graph holds.
node_set = []
for vertex in asg.graph.keys():
    node_set.append(vertex.name)
n_nodes, n_edges = len(node_set), len(asg.edges)
print('# of nodes:', n_nodes)
print('# of edges:', n_edges, '(edges equal to steps)')

# of nodes: 36
# of edges: 40 (edges equal to steps)


In [31]:
# Count nodes per raw type code; the known codes start at 0 so that
# they are listed even when no node has that type.
nodeType_counter = Counter(dict.fromkeys(['f','c','n','p','m_addr','pipe','else'], 0))
nodeType_counter.update(vertex.type for vertex in asg.graph.keys())
nodeType_counter

Counter({'f': 7, 'c': 1, 'n': 11, 'p': 17, 'm_addr': 0, 'pipe': 0, 'else': 0})

### Calc frequency of node appearance
指出指入都計算一次，暫無分開統計指出幾次，指入幾次。

In [33]:
# calc node's frequency
node_counter = Counter({n:0 for n in node_set})
edge_counter = Counter()
for (nodes, edge) in asg.edges.items():
    # print(nodes, edge)
    # print(edge.name)
    src_node, dst_node, _ = nodes # 其實底線_等同於 edge.name
    # print(src_node.name, dst_node.name)
    node_counter[src_node.name] += 1
    node_counter[dst_node.name] += 1
    edge_counter[edge.name] += 1
    # break
node_counter

Counter({'malware': 8,
         'uname': 1,
         'sh': 12,
         '1529': 16,
         'sed': 14,
         '/etc/rc.local': 6,
         '/etc/rc.d/rc.local': 1,
         '/etc/init.d/boot.local': 1,
         '1530': 4,
         '1531': 1,
         '1532': 1,
         '/dev/urandom': 1,
         'eth0': 1,
         'eth1': 1,
         'eth2': 1,
         'eth3': 1,
         'eth4': 1,
         'eth5': 1,
         'eth6': 1,
         'eth7': 1,
         'eth8': 1,
         'eth9': 1,
         '183.131.83.38:50050': 1,
         '/proc/net/dev': 1,
         '/sys/devices/system/cpu/online': 1,
         '/proc/stat': 1})

In [34]:
# Display the per-edge-name (syscall) counts.
edge_counter

Counter({'read': 18,
         'clone': 10,
         'exec': 6,
         'write': 3,
         'open': 2,
         'connect': 1})

In [17]:
# 測試 df 建構
# node_name, type, appearance
node_set = [node.name for node in asg.graph.keys()]
node_type = {}
node_counter = Counter({n:0 for n in node_set})
for (nodes, edge) in asg.edges.items():
    src_node, dst_node, _ = nodes
    node_type[src_node.name] = translate_type(src_node.type)
    node_type[dst_node.name] = translate_type(dst_node.type)
    node_counter[src_node.name] += 1
    node_counter[dst_node.name] += 1

Path("/outcsv").mkdir(exist_ok=True)
df = [[node,node_type[node],node_counter[node]] for node in node_counter]
df = pd.DataFrame(df, columns=['node_name', 'type', 'appearance'])
df

Unnamed: 0,node_name,type,appearance
0,malware,proc,8
1,uname,other,1
2,sh,proc,12
3,1529,proc,16
4,sed,proc,14
5,/etc/rc.local,file,6
6,/etc/rc.d/rc.local,file,1
7,/etc/init.d/boot.local,file,1
8,1530,proc,4
9,1531,proc,1


In [64]:
# Run the node analysis on `asg_all`.
# NOTE(review): the only assignments to `asg_all` in this notebook are
# commented out (pickle.load of a .pkl file), so it has to be loaded before
# this cell can run.
analysis_graph(asg_all)

# of nodes: 440
# of edges: 803 (edges equal to steps)
# of node type: Counter({'m_addr': 181, 'else': 170, 'f': 60, 'p': 17, 'n': 11, 'c': 1, 'pipe': 0})
frequency of node: Counter({'sed': 407, 'sh': 198, '1527': 164, '/lib/x86_64-linux-gnu/libc.so.6': 36, '1526': 27, 'malware': 26, '/etc/ld.so.cache': 24, '0': 24, '/lib/x86_64-linux-gnu/libselinux.so.1': 18, '/lib/x86_64-linux-gnu/libpcre.so.3': 18, '/lib/x86_64-linux-gnu/libdl.so.2': 18, '/lib/x86_64-linux-gnu/libpthread.so.0': 18, '/etc/ld.so.nohwcap': 12, '/etc/ld.so.preload': 12, '/proc/filesystems': 12, '/usr/lib/locale/locale-archive': 12, '/usr/lib/x86_64-linux-gnu/gconv/gconv-modules.cache': 12, '/etc/rc.local': 9, '8192*1024 bytes': 7, 'status:0': 7, '1513': 6, '/prober': 6, '.': 6, '/usr/local/sbin/sed': 6, '/usr/local/bin/sed': 6, '/usr/sbin/sed': 6, '/usr/bin/sed': 6, '/sbin/sed': 6, '/bin/sed': 6, '/sys/fs/selinux': 6, '/selinux': 6, '/etc/selinux/config': 6, '/usr/lib/x86_64-linux-gnu/charset.alias': 6, '/usr/share/loca

In [62]:
# A teammate noticed that the ASG data structure is missing some syscalls.
# dict_from_net: syscall -> count from the reference source;
# dict_from_asg: edge name -> count taken from the graph.
# The final expression lists the names present in the former only.
dict_from_net = {'set_thread_area': 1, 'ugetrlimit': 1, 'readlink': 1, 'getcwd': 1, 'setsid': 1, 'uname': 2, 'fcntl': 3, 'fchown': 3, 'fchmod': 3, 'rename': 3, 'waitpid': 6, 'getuid': 6, 'getgid': 6, 'getppid': 6, 'getegid': 6, 'wait4': 6, 'rt_sigreturn': 6, 'prlimit64': 6, 'umask': 6, 'set_tid_address': 7, 'futex': 8, 'getpid': 9, 'set_robust_list': 11, 'arch_prctl': 12, 'geteuid': 12, 'statfs': 12, 'execve': 13, 'exit_group': 13, 'clone': 16, 'write': 28, 'connect': 39, '_newselect': 39, 'getsockopt': 39, 'send': 39, 'brk': 40, 'socket': 40, 'stat': 48, 'access': 66, 'fstat': 78, 'ioctl': 91, 'mprotect': 100, 'mmap': 138, 'openat': 143, 'time': 189, 'fstat64': 299, 'mmap2': 303, 'munmap': 312, 'nanosleep': 336, 'rt_sigaction': 385, 'rt_sigprocmask': 624, 'lseek': 1500, 'open': 2041, 'read': 2101, 'close': 2463}
dict_from_asg = {'time': 150, 'mprotect': 100, 'fstat': 75, 'open': 68, 'read': 63, 'mmap': 60, 'stat': 48, 'access': 30, 'brk': 28, 'munmap': 16, 'exit_group': 13, 'arch_prctl': 12, 'statfs': 12, 'set_robust_list': 11, 'clone': 10, 'getpid': 9, 'set_tid_address': 7, 'futex': 7, 'waitpid': 6, 'getuid': 6, 'getgid': 6, 'geteuid': 6, 'getppid': 6, 'getegid': 6, 'exec': 6, 'prlimit64': 6, 'umask': 6, 'write': 6, 'fcntl': 3, 'fchown': 3, 'fchmod': 3, 'fstat64': 2, 'mmap2': 2, 'nanosleep': 2, 'set_thread_area': 1, 'ugetrlimit': 1, 'readlink': 1, 'getcwd': 1, 'setsid': 1, '_newselect': 1, 'getsockopt': 1, 'lseek': 1}
setA = set(dict_from_net)
setB = set(dict_from_asg)
setA.difference(setB)

{'close',
 'connect',
 'execve',
 'ioctl',
 'openat',
 'rename',
 'rt_sigaction',
 'rt_sigprocmask',
 'rt_sigreturn',
 'send',
 'socket',
 'uname',
 'wait4'}

In [59]:
# Check that the graph-side names contain 'exec' (the reference list
# contains 'execve' instead, which shows up in setA - setB).
'exec' in setB

True