## Synonym of Baseline Object
Q: Why do we need to list the synonyms of each baseline object?  
A: We want the matching to be fuzzy, so each baseline object needs a list of alternative names to match against.

In [1]:
# Candidate baseline objects mapped to an integer score
# (presumably occurrence counts -- TODO confirm against the statistics sheet).
org_base = {
    'sh': 60,
    'ID': 40,
    'sed': 38,
    'malware': 18,
    '1526': 13,
    '/etc/rc.local': 9,
    '1527': 9,
    '8192*1024 bytes': 7,
    'status:0': 7,
    '23.236.66.13:50050': 6,
    'status:2': 4,
    'Permission:0700': 3,
    'Permission:022': 3,
    'status:1': 2,
    'Sleep Duration': 2,
    '/proc/net/dev': 2,
    '/proc/stat': 2,
    'uname': 1,
    '/proc/self/exe': 1,
    '"/prober"': 1,
    '/etc/rc.d/rc.local': 1,
    '/etc/init.d/boot.local': 1,
    '1528': 1,
    '1529': 1,
    '/dev/urandom': 1,
    'NIC': 1,
    'Timestamp': 1,
    '/sys/devices/system/cpu/online': 1,
}

# Alternative names per baseline object, meant for fuzzy matching.
synonyms = {
    'ID': ['process id', 'pid'],
    # An empty-string synonym is a substring of every sentence, so NIC keeps an
    # empty list until real synonyms are chosen.  TODO: add NIC synonyms.
    'NIC': [],
}

從 `triplate_statistics.xlsx` 的 excel 檔建立 baseline，只取出 file, net 的類型，注意 other 的 uname 也要加進來。  
ver4 dst_node set with length: `27` 

In [16]:
# ! pip install graphviz

Collecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
     -------------------------------------- 47.0/47.0 kB 471.8 kB/s eta 0:00:00
Installing collected packages: graphviz
Successfully installed graphviz-0.20.1


In [1]:
import re
import os
from os import listdir
from os.path import isfile, join
from pathlib import Path
import pandas as pd
from typing import Dict, List, get_type_hints
from styleframe import StyleFrame, Styler
from IPython.display import display

### Load Baseline from triplate_statistics xlsx/csv
整段程式會以這一隻 sample 當作 baseline，baseline 只取 ASG 中作為受詞的 file, net 和 uname

- Dofloo 96f125505ca736ff5d85ed33fa1c9b15, 9a37fcc7eab08d59532bc5c66390bc30, 60f879ed44de37351c973f01d0d77f79
- Xor 0aefb67c01a24d05351b093455203fa2, 0bc90c333f08237475a08c7158aba345, 07c070b717a23453a2b71c7582d6a928

In [396]:
# base_data is the dict object that is fed into the evaluation system.
# df = pd.read_excel(open('triplate_statistics.xlsx', 'rb'), sheet_name='ver4', index_col=0)
# Alternative samples (uncomment exactly one assignment):
# family, samplename, outputfolder = 'Dofloo', 'f477b05826c46ac8a482e85d89b00de0', 'outputfolder'
family, samplename, outputfolder = 'Xorddos', 'eadf8abf7738144e929a25e9eb7d8076', 'outputfolder'
# family, samplename, outputfolder = 'Mirai', 'e666e0c720387db27e23c65d6a252f79587ca1b9d1c38e96d6db13b05d5b73fa', 'outputfolder'
# family, samplename, outputfolder = 'Tsunami', '46389c117c5f41b60e10f965b3674b3b77189b504b0aeb5c2da67adf55a7129f', 'outputfolder'

df_triplet = pd.read_csv(f'../C ASG statistics 1115ver/saved_pkl/{family}/{samplename}_triplet.csv')
df_node = pd.read_csv(f'../C ASG statistics 1115ver/saved_pkl/{family}/{samplename}_node.csv')
Path(outputfolder).mkdir(parents=True, exist_ok=True)  # make sure the output folder exists

# Three parallel lists: object text, object type, and weight.
base_data = {'text': [], 'type': [], 'weight': []}
exclude_node = ['NO_SOCKET','Unknown','malware']

for _, triplet in df_triplet.iterrows():
    dst_node_name = re.sub(r"\"", '', triplet.dst_node)  # strip double quotes from the node name
    # Skip names that were already collected and names on the exclusion list.
    if dst_node_name in base_data['text'] or dst_node_name in exclude_node:
        continue
    # Keep only destination nodes of type file/net, plus the special case uname.
    if dst_node_name != 'uname' and triplet['type.1'] not in ['file','net']:
        continue
    base_data['text'].append(dst_node_name)
    base_data['type'].append(triplet['type.1'])
    # The importance flag lives in df_node and is looked up by the raw (unstripped) node name.
    node_info = df_node[df_node['node_name'] == triplet.dst_node]
    # Important nodes weigh 2, all others weigh 1.
    weight = 2 if list(node_info['importance'])[0] else 1
    base_data['weight'].append(weight)

print('num of baseline objects', len(base_data['text']))
view = pd.DataFrame(base_data).sort_values(by=['type','text'])
view.to_csv(f'{outputfolder}/baseline_{family}_{samplename[:3]}.csv', index=False)
view

num of baseline objects 93


Unnamed: 0,text,type,weight
2,/boot,file,1
67,/boot/aimxscnxph,file,2
24,/boot/anzzkrxtdb,file,2
54,/boot/beqminucub,file,2
47,/boot/bhwbhqbqcj,file,2
...,...,...,...
85,103.25.9.228:53,net,2
90,103.25.9.245:8005,net,2
88,66.102.253.30:8005,net,2
86,8.8.8.8:53,net,2


### 使用指定家族（`family`，目前為 Xorddos）的文章來測試

In [397]:
# Directory that holds the per-report sentence CSV files.  The trailing slash is
# required: file paths are built from it by plain string concatenation.
report_fir_path = '../C parse report/sentence csvs/'
# target_family = 'Dofloo'

def get_all_filenames(dir: str='./') -> list:
    '''Return the sorted names of the regular files located directly in `dir`.

    Sub-directories are neither returned nor descended into; only bare file
    names (without the directory part) are returned.
    '''
    return sorted(entry for entry in listdir(dir) if isfile(join(dir, entry)))

# NOTE: despite the `dofloo_` prefix, these hold the reports of whatever `family` is set to.
dofloo_report_names = [f for f in get_all_filenames(report_fir_path) if f.startswith(family)] # keep only the reports of this family
dofloo_report_dfs = [pd.read_csv(f"{report_fir_path}{f}") for f in dofloo_report_names]
dofloo_report_names

['Xorddos-Cdnetworks.csv',
 'Xorddos-Crowdstrike.csv',
 'Xorddos-Intezer.csv',
 'Xorddos-Microsoft.csv',
 'Xorddos-Trendmicro.csv',
 'Xorddos-Trendmicro2.csv']

In [398]:
# Preview the first loaded report; the sentence text is in the 'Content' column.
dofloo_report_dfs[0]

Unnamed: 0,Number,Content
0,"""Sentence 1","understanding a xor.ddos attack\r\n2月 17, 2021..."
1,"""Sentence 2","xor.ddos is the name of the malware, not the a..."
2,"""Sentence 3",it was detected in september 2014 and the anal...
3,"""Sentence 4",is a distributed denial-of-service attack from...
4,"""Sentence 5","in many ways it is and in this guide, we will ..."
5,"""Sentence 6",what is xor.ddos malware?
6,"""Sentence 7",the traditional attack utilized the existing v...
7,"""Sentence 8","however, xor.ddos makes windows pcs into zombi..."
8,"""Sentence 9",the xor.ddos attack is used to defeat the netw...
9,"""Sentence 10",this is a very serious threat to the network b...


In [399]:
# Sanity check: the pattern finds 'sh' only when both neighbours are characters
# other than a-z, and the match includes those two neighbouring characters.
res = re.search(r"[^a-z]sh[^a-z]", "www sh www")
res

<re.Match object; span=(3, 7), match=' sh '>

In [400]:
class ReportEvalModel:
    '''Match a list of baseline objects against the sentences of one report.

    The constructor runs `match()` immediately, so after construction:

    - `match_tbl` is a sentences x baselines table of 0/1 flags
      (row i = i-th sentence, one column per baseline text);
    - `match_baseline` is the set of baseline texts found in at least one sentence.
    '''

    def __init__(self, baseline: Dict[str, list], sentences: pd.DataFrame, reportname:str=''):
        '''
        baseline:   dict with parallel lists under the keys 'text' and 'type'.
        sentences:  DataFrame whose 'Content' column holds one sentence per row.
        reportname: label of the report, stored as-is.
        '''
        self.baseline = baseline
        self.sentences = sentences # a df
        self.reportname = reportname
        cols = baseline['text']
        # All-zero table with a 0..n-1 row index, so rows can be addressed by sentence position.
        self.match_tbl = pd.DataFrame(0, index=range(len(sentences)), columns=cols)
        self.match_baseline = set()
        self.match()

    def match(self):
        '''Search every baseline in every sentence and record the hits in
        `self.match_tbl` and `self.match_baseline`.'''
        for idx_sent, sent in enumerate(self.sentences['Content']):
            sentence = str(sent)  # non-string cells (e.g. NaN) are searched as their string form
            for base, base_type in zip(self.baseline['text'], self.baseline['type']):
                if self.find_baseline(sentence, base, base_type) != -1:
                    self.match_tbl.loc[idx_sent, base] = 1 # mark as found
                    self.match_baseline.add(base)

    def find_baseline(self, sentence: str, baseline: str, baseline_type: str='file'):
        '''Return the position of `baseline` in `sentence`, or -1 when it is absent.

        - 'net' baselines: plain substring search (no special handling of NIC/ip/port yet).
        - 'sed' / 'sh': matched only as a stand-alone word, i.e. not directly
          preceded or followed by a lowercase letter, so 'finish' does not count
          as 'sh'.  Only the requested baseline is searched, and the word may sit
          at the very start or end of the sentence.
        - everything else: plain substring search.
        '''
        if baseline_type == 'net':
            return sentence.find(baseline)
        if baseline in ('sed', 'sh'):
            hit = re.search(rf"(?<![a-z]){re.escape(baseline)}(?![a-z])", sentence)
            return hit.start() if hit else -1
        # Ordinary file paths are compared directly.
        return sentence.find(baseline)

    def get_match_sentences(self) -> pd.DataFrame:
        '''Return a copy of the match table with an extra 'match' column that
        holds the number of baselines found in each sentence.'''
        result = self.match_tbl.copy()
        result['match'] = result.sum(axis=1)
        return result

    def get_match_baselines(self) -> set:
        '''Return the set of baseline texts found anywhere in the report.'''
        return self.match_baseline

# Evaluate a single report (position 4 in the loaded list) against the baseline.
rem = ReportEvalModel(base_data, dofloo_report_dfs[4])

In [401]:
# Per-sentence 0/1 match table of this report, plus a 'match' column with the row sum.
rem.get_match_sentences()

Unnamed: 0,uname,/proc/self/exe,/boot,/lib,/lib/udev,/var,/var/run,/lib/udev/udev,/boot/glwwlcevzg,/etc/init.d/glwwlcevzg,...,/var/run/sftp.pid,103.25.9.228:53,8.8.8.8:53,0.0.0.0:8005,66.102.253.30:8005,103.240.141.50:8005,103.25.9.245:8005,0.0.0.0:80,/proc/1529/exe,match
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [402]:
# Baseline objects that were found in at least one sentence of this report.
rem.get_match_baselines()

{'/lib', '/var', '/var/run'}

In [403]:
class FamilySet:
    '''The set of reports of one malware family, all scored against one baseline.'''

    def __init__(self, familyname:str, baseline:Dict[str, list]):
        '''
        familyname: label of the malware family, stored as-is.
        baseline:   dict with parallel lists under the keys 'text' and 'weight'
                    (and whatever else ReportEvalModel reads from it).
        '''
        self.familyname = familyname
        self.baseline = baseline
        self.rem_lst: List[ReportEvalModel] = [] # list of ReportEvalModel under this family
        self.result_tbl = None # filled by show_result()

    def add_rem(self, sentences: pd.DataFrame, reportname: str=''):
        '''Add a report to this FamilySet from its sentences and its name;
        the report is evaluated against the FamilySet's own baseline.'''
        rem = ReportEvalModel(self.baseline, sentences, reportname)
        self.rem_lst.append(rem)

    def calc_report_coverage_score(self, match_baselines: set, apply_weight=True) -> float:
        '''Return the coverage score of one report.

        match_baselines: baseline texts found in the report.
        apply_weight:    True  -> matched weight / total weight;
                         False -> number of matches / number of baselines.

        Returns 0.0 when the baseline is empty or its total weight is zero,
        instead of raising ZeroDivisionError.
        '''
        if not apply_weight:
            total = len(self.baseline['text'])
            return len(match_baselines) / total if total else 0.0
        denominator = sum(self.baseline['weight'])
        if not denominator:
            return 0.0
        numerator = sum(
            w for b, w in zip(self.baseline['text'], self.baseline['weight'])
            if b in match_baselines
        )
        return numerator / denominator

    def show_result(self):
        '''Print, display and return the result table: one row per report, one
        column per baseline (1 = found), preceded by 'report_name', 'ttl_match'
        and 'coverage_score'.  The table is also kept in `self.result_tbl`.'''
        column_names = ['report_name','ttl_match','coverage_score']
        column_names.extend(self.baseline['text'])
        # Collect complete records first and build the frame once; this avoids
        # writing strings into integer-initialised columns cell by cell.
        records = []
        for rem in self.rem_lst:
            match_baselines = rem.get_match_baselines() # set of baselines found in this report
            c_score = self.calc_report_coverage_score(match_baselines, apply_weight=True)
            record = {b: int(b in match_baselines) for b in self.baseline['text']}
            record['report_name'] = rem.reportname
            record['ttl_match'] = len(match_baselines)
            record['coverage_score'] = f'{c_score:.4}' # kept as a 4-significant-digit string
            records.append(record)
            print(rem.reportname, match_baselines)
        result_tbl = pd.DataFrame(records, columns=column_names)
        self.result_tbl = result_tbl
        display(result_tbl)
        return result_tbl

# Score every loaded report of the selected family against the baseline.
# The family label comes from `family` rather than a hard-coded name, so it
# stays in sync with the sample and reports that were actually loaded.
fset = FamilySet(family, base_data)
for report_name, rdf in zip(dofloo_report_names, dofloo_report_dfs):
    fset.add_rem(rdf, report_name)
result_tbl = fset.show_result()
result_tbl.to_csv(f'{outputfolder}/{family}_{samplename[:3]}_FamilySet.csv', index=False)

Xorddos-Cdnetworks.csv set()
Xorddos-Crowdstrike.csv set()
Xorddos-Intezer.csv {'uname'}
Xorddos-Microsoft.csv {'/boot', '/proc/rs_dev', '/var', '/var/run', '/lib', '/etc/crontab', '/proc/self/exe'}
Xorddos-Trendmicro.csv {'/lib', '/var', '/var/run'}
Xorddos-Trendmicro2.csv set()


Unnamed: 0,report_name,ttl_match,coverage_score,uname,/proc/self/exe,/boot,/lib,/lib/udev,/var,/var/run,...,/etc/crontab,/var/run/sftp.pid,103.25.9.228:53,8.8.8.8:53,0.0.0.0:8005,66.102.253.30:8005,103.240.141.50:8005,103.25.9.245:8005,0.0.0.0:80,/proc/1529/exe
0,Xorddos-Cdnetworks.csv,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Xorddos-Crowdstrike.csv,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Xorddos-Intezer.csv,1,0.005682,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Xorddos-Microsoft.csv,7,0.05114,0,1,1,1,0,1,1,...,1,0,0,0,0,0,0,0,0,0
4,Xorddos-Trendmicro.csv,3,0.01705,0,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
5,Xorddos-Trendmicro2.csv,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 印出/存檔 Report 中的 matched sentence
- matched baseline
- sentence 內文
- sentence number [1~n] 作為排序依據
- subject of sentence
- verb of sentence
- object of sentence (等於 matched baseline)

In [404]:
def save_match_sent_to_excel(fset: FamilySet, output_xlsx=None):
    '''Write the matched sentences of every report in `fset` to one Excel file.

    One sheet is written per report that has at least one match.  Each sheet
    row is one (baseline, sentence) pair with the columns 'baseline',
    'sentence number' (1-based) and 'sentence'.

    fset:        the FamilySet whose reports are exported.
    output_xlsx: target path; when omitted, the path is derived from the
                 notebook globals `outputfolder`, `family` and `samplename`.

    Nothing is written (and a message is printed) when no report has a match.
    '''
    columns = ['baseline','sentence number','sentence']
    if output_xlsx is None:
        output_xlsx = f'{outputfolder}/{family}_{samplename[:3]}_report_match_sent.xlsx'
    # The writer must receive at least one sheet, so do not open the file at all when there is nothing to write.
    if not any(rem.get_match_baselines() for rem in fset.rem_lst):
        print('no matches in CTI reports')
        return
    with pd.ExcelWriter(output_xlsx, engine='openpyxl') as writer:

        # One rem (ReportEvalModel) stands for one report.
        for rem in fset.rem_lst:
            if len(rem.get_match_baselines()) == 0: # reports without any baseline are skipped
                continue
            sheet_data = [] # one dict per (sentence, matched baseline) pair
            sent_df = rem.get_match_sentences()
            sent_content_lst = rem.sentences['Content'] # every sentence of the report

            # Walk over the sentences that contain at least one baseline.
            for sid, row in sent_df.iterrows():
                if row['match'] == 0:
                    continue
                hits = row.drop('match') # new Series; the iterated row is left untouched
                hits = hits[hits > 0]
                # sid is the row position in the match table, so the sentence is
                # fetched by position; str() guards against non-string cells.
                sentence = str(sent_content_lst.iloc[sid]).strip()
                # A sentence may contain several baselines: one output row for each.
                for matched_baseline in hits.index:
                    sheet_data.append({
                        'baseline': matched_baseline,
                        'sentence number': sid + 1,
                        'sentence': sentence,
                    })

            # Write the sheet of this report.
            print(rem.reportname,'has', len(sheet_data),'matches')
            output_sheet = pd.DataFrame(sheet_data, columns=columns)
            styler = Styler(horizontal_alignment='left', vertical_alignment='top')
            sf = StyleFrame(output_sheet, styler)
            sf = sf.set_column_width(columns=['baseline'], width=20.0)
            sf = sf.set_column_width(columns=['sentence number'], width=15.0)
            sf = sf.set_column_width(columns=['sentence'], width=80.0)
            sf.to_excel(writer, sheet_name=rem.reportname, index=False)
    return

# Running this call overwrites the Excel file; row/column formatting has to be
# adjusted by hand afterwards, so use it with care.
save_match_sent_to_excel(fset)

Xorddos-Intezer.csv has 1 matches
Xorddos-Microsoft.csv has 28 matches
Xorddos-Trendmicro.csv has 3 matches


調整 excel style 的參考
- API https://styleframe.readthedocs.io/en/3.0.6/styleframe.html
- 欄寬 https://stackoverflow.com/questions/40162098/changing-column-width-from-excel-files
- 欄寬(not working) https://stackoverflow.com/questions/39680147/can-i-set-variable-column-widths-in-pandas

以下是測試寫入 excel 的 code

# Scratch version of the sheet-building step, run on one report (position 4).
columns = ['baseline','sentence number','sentence']
sheet_data = [] # one dict per (sentence, matched baseline) pair
# Alternative way to get the positions of the sentences that contain a baseline:
# matched_series = fset.rem_lst[4].get_match_sentences()['match'] > 0
# sent_idxlst = matched_series[matched_series].index.tolist()

sent_df = fset.rem_lst[4].get_match_sentences()
sent_content_lst = fset.rem_lst[4].sentences['Content']
# Walk over the sentences that contain at least one baseline.
for sid, row in sent_df.iterrows():
    if row['match'] == 0:
        continue
    hits = row.drop('match') # new Series; the iterated row is left untouched
    hits = hits[hits > 0]
    # A sentence may contain several baselines: one output row for each.
    # (Series.iteritems() no longer exists in pandas 2.x, so iterate the index.)
    for matched_baseline in hits.index:
        insert_data = dict.fromkeys(columns)
        insert_data['baseline'] = matched_baseline
        insert_data['sentence number'] = sid + 1
        # sid is the row position in the match table, so fetch the sentence by position.
        insert_data['sentence'] = str(sent_content_lst.iloc[sid]).strip()
        sheet_data.append(insert_data)

# Show the sentences of the report used in this scratch section.
fset.rem_lst[4].sentences

# Method 1: the resulting style is ugly but easy to modify.
from styleframe import StyleFrame, Styler
output_xlsx = 'report_match_result.xlsx'
output_sheet = pd.DataFrame(sheet_data)
styler = Styler(horizontal_alignment='left', vertical_alignment='top')
# StyleFrame(output_sheet, styler).to_excel('test.xlsx', index=False).save()
# https://stackoverflow.com/questions/17326973/is-there-a-way-to-auto-adjust-excel-column-widths-with-pandas-excelwriter

# Method 2: write through pandas with the xlsxwriter engine.
# NOTE(review): ExcelFormatter is imported but not used in the code shown here.
from pandas.io.formats.excel import ExcelFormatter
with pd.ExcelWriter('test.xlsx', engine="xlsxwriter") as writer:
    # NOTE(review): cell_format is created and coloured but never passed to any cell or column below.
    cell_format = writer.book.add_format()
    cell_format.set_font_color('green')
    
    writer.book.formats[0].set_text_wrap()  # update global format with text wrap
    output_sheet.to_excel(writer, index=False)