In [1]:
import pandas as pd

### 解釋
- syscall <-> verb 規則的成品為 `rule_dataset_final.csv`。
- 總共有 513 條規則，收錄 354 個 system call，能轉換的動詞有 113 個。（註：下方 In [5] 的實際輸出為 683 條規則、395 個 system call、155 個動詞，此處數字可能是舊版資料的統計，請以實際輸出為準。）
- 由於規則數量眾多，且其中許多 system call 並沒有被惡意程式使用，故此檔可先留存備用，目前先使用 `/Model` 的 `rule_dataset.csv`。
- `/trans_syndata_to_rule` 會將已標註 EntityType 與 ActionType 的 syndata 轉換成 rules 的格式，再從 root 資料夾的 `rule_dataset_final.csv` 取出來源為 CTI 的 rules 併入產出；這份產出就是最終成品 `rule_dataset_final.csv`。本資料夾中的同名檔案是手動複製過來做數據分析用的，請隨時確認這兩份同名的 rule 檔內容一致。
- 若需要更改標籤，請至 /trans_syndata_to_rule/before 的檔案做修改。

In [2]:
# Load the final syscall <-> verb rule set and leave 'rm()' and 'exec()' out of the
# statistics: per the original note, these two calls were made up by a senior lab member.
df = pd.read_csv('rule_dataset_final.csv').loc[
    lambda rules: ~rules['Syscall'].isin(['rm()', 'exec()'])
]

def clean_braces(x):
    """Return ``x`` as text with every "()", "," and " " removed.

    Example: ``" io_cancel(), "`` becomes ``"io_cancel"``.

    Args:
        x: Value to normalise. It is usually a string, but any object is
            accepted because it is converted with ``str`` before cleaning.
            Note that a float NaN therefore becomes the text ``"nan"``.

    Returns:
        str: The cleaned text.
    """
    # Coerce first: calling .replace on a non-string value would raise AttributeError.
    text = str(x)
    # Same removal order as before: call parentheses, then commas, then spaces.
    for junk in ("()", ",", " "):
        text = text.replace(junk, "")
    return text
def add_braces(x):
    """Clean ``x`` with ``clean_braces`` and return the result followed by "()"."""
    return clean_braces(x) + "()"

# Rewrite every Syscall value in the "name()" form; add_braces cleans the value first.
# (clean_braces alone would give the bare name instead.)
df['Syscall'] = df['Syscall'].map(add_braces)
df.head(3)

# Optional export of the normalised table (kept disabled):
# df.to_csv("rule_dataset.csv", index=False)

Unnamed: 0,EntityType,ActionType,Source,Syscall,EnVerb,Sentence
0,FILE,DEVICE,man,io_cancel(),cancel,
1,FILE,DEVICE,man,io_destroy(),destroy,
2,FILE,DEVICE,man,io_getevents(),read,


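- 印出各類別的個數和內容，共 7 種 EntityType: FILE, PROC, DEVICE, ID, TIME, INFO, NET（註：下方 In [5] 的實際輸出共有 9 種：FILE, RESOURCE_BYTE, PERMISSION, MEM, PROC, TIME, ID, NET, EXIT_STATUS，此處列表可能是舊版資料的分類，請以實際輸出為準。）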

In [3]:
# Show the first rule (positional row 0) as a Series to inspect the fields of one row.
df.iloc[0]

EntityType           FILE
ActionType         DEVICE
Source                man
Syscall       io_cancel()
EnVerb             cancel
Sentence              NaN
Name: 0, dtype: object

In [4]:
# Distinct ActionTypes, then distinct verbs split by whether the rule's Source is 'man'.
# nunique(dropna=False) counts a missing value as one entry, exactly like len(unique()).
print('ActionTypes:', df['ActionType'].nunique(dropna=False))
print('Verb from manpage:', df.loc[df['Source'] == 'man', 'EnVerb'].nunique(dropna=False))
print('Verb from reports:', df.loc[df['Source'] != 'man', 'EnVerb'].nunique(dropna=False))

ActionTypes: 34
Verb from manpage: 136
Verb from reports: 31


In [5]:
# Per-EntityType statistics: rule / syscall / verb counts, split by Source, followed by
# one line per ActionType inside that EntityType.
printDetail = False  # True: also list the syscalls and verbs of every ActionType
entityType_lst = df['EntityType'].unique()
lenSyscall = df['Syscall'].nunique(dropna=False)
lenEnVerb = df['EnVerb'].nunique(dropna=False)
print(f"Total rules: {len(df)}, EntityType: {entityType_lst}")
print(f"For total {lenSyscall:2} uq syscalls & {lenEnVerb:2} uq verbs\n")
ttl_lenRule_manpage = 0  # rules whose Source is 'man', summed over all EntityTypes

for entityType in entityType_lst:
    entity_rules = df[df['EntityType'] == entityType]
    from_manpage = entity_rules['Source'] == 'man'
    entity_syscalls = list(entity_rules['Syscall'].unique())
    entity_verbs = list(entity_rules['EnVerb'].unique())
    actionType_lst = entity_rules['ActionType'].unique()

    # Count the verbs of each source directly. A verb can occur in both sources, so
    # "total unique - manpage unique" would undercount the verbs that come from reports.
    lenEnVerb_manpage = entity_rules.loc[from_manpage, 'EnVerb'].nunique(dropna=False)
    lenEnVerb_report = entity_rules.loc[~from_manpage, 'EnVerb'].nunique(dropna=False)
    # Rules do partition cleanly by source, so the subtraction is exact here.
    lenRule_manpage = int(from_manpage.sum())
    lenRule_report = len(entity_rules) - lenRule_manpage
    ttl_lenRule_manpage += lenRule_manpage

    # Join explicit lines so that no source-code indentation ends up in the output.
    print("\n".join([
        f"EntityType: {entityType:<6} has {len(entity_rules):2} rules | {len(entity_syscalls):2} uq_syscalls"
        f" | {len(entity_verbs):2} uq_verbs | {len(actionType_lst):2} actionType",
        f"\t=>syscall: {entity_syscalls}",
        f"\t=>verb:    {entity_verbs}",
        f"\t=>verb:    from manpage {lenEnVerb_manpage}, from report {lenEnVerb_report}",
        f"\t=>rule:    from manpage {lenRule_manpage}, from report {lenRule_report}",
    ]))

    for actionType in actionType_lst:
        action_rules = entity_rules[entity_rules['ActionType'] == actionType]
        action_syscalls = list(action_rules['Syscall'].unique())
        action_verbs = list(action_rules['EnVerb'].unique())

        summary = f"\t{actionType:<12} has {len(action_syscalls):2} uq syscalls & {len(action_verbs):2} uq verbs"
        if printDetail:
            summary += f" {action_syscalls} {action_verbs}"
        print(summary)

    print()

print('ttl_lenRule_manpage:', ttl_lenRule_manpage)


Total rules: 683, EntityType: ['FILE' 'RESOURCE_BYTE' 'PERMISSION' 'MEM' 'PROC' 'TIME' 'ID' 'NET'
 'EXIT_STATUS']
For total 395 uq syscalls & 155 uq verbs

EntityType: FILE   has 297 rules | 170 uq_syscalls | 84 uq_verbs | 21 actionType          
	=>syscall: ['io_cancel()', 'io_destroy()', 'io_getevents()', 'io_setup()', 'io_submit()', 'ioctl()', 'ioperm()', 'iopl()', 'pciconfig_iobase()', 'pciconfig_read()', 'pciconfig_write()', 'swapoff()', 'swapon()', 'access()', 'faccessat()', 'close()', 'creat()', 'epoll_create()', 'epoll_create1()', 'flock()', 'link()', 'linkat()', 'memfd_create()', 'mkdir()', 'mkdirat()', 'mknod()', 'mknodat()', 'mmap()', 'mmap2()', 'mount()', 'name_to_handle_at()', 'open()', 'open_by_handle_at()', 'openat()', 'symlink()', 'symlinkat()', 'fremovexattr()', 'remove()', 'rmdir()', 'umount()', 'umount2()', 'unlink()', 'unlinkat()', 'getcwd()', 'getdents()', 'dup()', 'dup2()', 'dup3()', 'eventfd()', 'eventfd2()', 'signalfd()', 'userfaultfd()', '_newselect()', 'epoll_

In [10]:
# Manual tally. NOTE(review): what the individual terms stand for is not recorded
# in this notebook — TODO document where each number comes from.
# 42 + 37 + 1 + 6 + 29 = 115
# 368*1 + 23*2 + 4*3 = 426
# 97*1 + 17*2 + 6*3 + 4*2 = 157
# Sum of the two weighted totals above.
426 + 157

583

### 動詞過濾 除去非動詞的 action word
- https://spacy.io/usage/linguistic-features
- https://machinelearningknowledge.ai/tutorial-on-spacy-part-of-speech-pos-tagging/
- 由於 pos 標籤一次只能貼上一個，所以有些詞會被誤標成名詞，如 name, hook, format。反而有些不該是動詞的單字被標籤上動詞，如 resolv.conf, m6_6n3。
- 經由這段測試，推薦自建黑名單剔除誤判成動詞的單字。

In [8]:
import spacy

# NOTE(review): `convertion_df` is not defined in the cells visible in this section; it must
# already exist in the kernel for this cell to run — TODO add/restore the cell that builds it.
oov_verbs = convertion_df['verb_origin']
# Hand-picked words to inspect separately; per the original note, "wget" is acceptable.
none_verb_lst = ["resolv.conf", "wget", "m6_6n3", "se"]

nlp = spacy.load("en_core_web_sm")

# Part 1: print every token of the candidate verbs that spaCy does NOT tag as VERB.
for verb in oov_verbs:
    for token in nlp(verb):
        if token.pos_ == 'VERB':
            continue
        # Output columns: token text, coarse POS, fine-grained tag (e.g. "hardcode NOUN NN").
        print(token.text, token.pos_, token.tag_)

print("======")
# Part 2: show how spaCy tags each hand-picked word, whatever the tag is.
for word in none_verb_lst:
    for token in nlp(word):
        print(token.text, token.pos_, token.tag_)


hardcode NOUN NN
format NOUN NN
name NOUN NN
se X FW
hook NOUN NN
se X FW
resolv.conf VERB VB
wget VERB VB
m6_6n3 VERB VB
se X FW


### 查看哪些 Syscall 未包含在 Rule 中

In [26]:
# Load Malware Sample ASG
# import re
import os
# from os import listdir
# from os.path import isfile, join
# from pathlib import Path
# import pandas as pd
# import pickle
from IPython.display import display

''' 自己寫的模組 '''
from Exp4_autore.ASG import AttackGraph, Node, Edge, FileTable #, build

# Malware sample to load. The trailing comments presumably hold an alternative
# family / sample hash to switch to — TODO confirm.
family = 'GafGyt' # Darlloz
samplename = '933e9eff97f50b196b2bb0fef499640e' # dfeb77cb0ba28ac3ba4be55d7bc91fad
# Relative path of the trace file, resolved against the kernel's working directory.
trace_path = f"../C ASG/trace/{family}/{samplename}.bin"
# Sanity check: report whether the trace file exists before building the graph.
print("path is correct:", os.path.exists(trace_path))
# Build the graph object from the trace using the project class imported from Exp4_autore.ASG.
asg = AttackGraph(trace_path)
asg.create()

path is correct: True


In [30]:
# Gather the distinct values stored at index 2 of every step in the graph
# (the syscall name, judging by the printed sample).
syscalls_in_asg = set()
for step in asg.step_list:
    syscalls_in_asg.add(step[2])

# Print three of them (set order is arbitrary) together with the total count.
print(list(syscalls_in_asg)[:3], 'num of unique syscalls:', len(syscalls_in_asg))

['open', 'kill', 'fork'] num of unique syscalls: 15


In [31]:
# Print every syscall seen in the ASG trace that no rule covers.
# The brace-less names are built on a separate frame so that df['Syscall'] keeps
# its "name()" form for the other cells of the notebook.
tmp = df.assign(Syscall=df['Syscall'].apply(clean_braces))
syscalls_in_rule = tmp['Syscall'].unique()
known_syscalls = set(syscalls_in_rule)  # set membership instead of scanning the array each time
for asg_sy in syscalls_in_asg:
    if asg_sy not in known_syscalls:
        print(asg_sy)