In [1]:
import re
import os
from os import listdir
from os.path import isfile, join
from pathlib import Path
import pandas as pd
import pickle
from typing import Dict, List, get_type_hints
from styleframe import StyleFrame, Styler
from IPython.display import display

''' 自己寫的模組 '''
from cc_regex_script import RegexMatchResult, RegexMaster 
from cc_module_cust import AttackGraph, Node, Edge, FileTable #, build



### Load 所有 Dofloo 的 object，計算並儲存其 regex

In [2]:
# Malware family analysed in this notebook.
family = 'Dofloo'

# Per-sample metadata; only the 'family' and 'filename' columns are read here.
sample_info = pd.read_csv('../../C malware info/sample_info.csv')

# Keep the rows of this family, leaving out one explicitly excluded sample file.
is_family = sample_info['family'] == family
is_excluded = sample_info['filename'].isin(['8845355172c485813872f1bb1834de15.bin'])
dofloo_info = sample_info[is_family & ~is_excluded]

# A sample name is the file name up to its first dot.
dofloo_samplename = [filename.split('.')[0] for filename in dofloo_info['filename']]
print(f"family {family} has numOfSample {len(dofloo_samplename)}, first is {dofloo_samplename[0]}")

family Dofloo has numOfSample 13, first is 0046a78514a658b9b7a4e29e8581f85b


In [3]:
class Sample:
    '''One malware sample: its name, the regexes it matched and its special tokens.

    Args:
        samplename: identifier of the sample.
        regex_set: optional set of regex patterns already known to match this
            sample; a fresh empty set is used when omitted or empty.
        special_token_dict: optional mapping of the sample's special tokens;
            a fresh empty dict is used when omitted or empty.
    '''
    def __init__(self, samplename:str, regex_set:set=None, special_token_dict=None) -> None:
        self.samplename: str = samplename
        # Build the fallbacks per instance so samples never share a container.
        self.regex_set: set = regex_set if regex_set else set()
        self.special_token_dict: dict = special_token_dict if special_token_dict else {}

    def __repr__(self) -> str:
        # The name must be interpolated; without braces the f-string prints the
        # literal text "self.samplename".
        return f"<Sample {self.samplename}>"

# For every sample: load its pickled ASG, use asg.set_of_object as the sample's
# special_token_dict, and collect the regexes that match any of those tokens.
regexMaster = RegexMaster()
regex_pool = regexMaster.get_all_regex()
samples: list[Sample] = []
for samplename in dofloo_samplename:
    # NOTE(review): pickle.load can execute arbitrary code from the file; only
    # load .pkl files produced by a trusted pipeline.
    with open(f'../../C ASG statistics 1115ver/saved_pkl/{family}/{samplename}.pkl', 'rb') as inp:
        # Per the original note, asg.set_of_object is a dict
        # (key: special token, value: capitalised type name).
        asg: AttackGraph = pickle.load(inp)
    # The file is only needed for loading, so the matching runs after it is closed.
    sample = Sample(samplename, special_token_dict=asg.set_of_object)
    for obj in sample.special_token_dict.keys():
        matched_list: list[RegexMatchResult] = regexMaster.find_spacial_token(obj)
        if matched_list:
            # set.update instead of a list comprehension used only for its side effects.
            sample.regex_set.update(m.match_regex for m in matched_list)
    samples.append(sample)
print(f'Exam: sample S1 matches {len(samples[0].regex_set)} of regex')
print(f'Size of regex_pool: {len(regex_pool)}, size of total_used_regex: {len(regexMaster.used_regex_set)}')

Exam: sample S1 matches 13 of regex
Size of regex_pool: 23, size of total_used_regex: 17


In [4]:
''' Save result to txt file '''
total_regex = set()
with open('./result/countof_regexset.txt', 'w', encoding='utf-8') as opf:
    # Header: three aligned rows (sample label, regex count, first 3 chars of the sample name).
    label_cells = [f' S{no} ' if no < 10 else f'S{no} ' for no in range(1, len(samples) + 1)]
    count_cells = [f'{len(s.regex_set):>3} ' for s in samples]
    hash_cells = [f'{s.samplename[:3]:>3} ' for s in samples]
    opf.write('{:12s}'.format('Sample no.') + ''.join(label_cells) + '\n')
    opf.write('{:12s}'.format('# of regex') + ''.join(count_cells))
    opf.write('\n{:12s}'.format('Sample hash') + ''.join(hash_cells) + '\n\n')
    opf.write(f'Size of regex_pool: {len(regex_pool)}, size of total_used_regex: {len(regexMaster.used_regex_set)}\n\n')

    # Body: one paragraph per sample, separated by blank lines.
    for no, s in enumerate(samples, start=1):
        opf.write(f"S{no} has {len(s.regex_set)} of regex\n")
        opf.write(f"regex_set: {s.regex_set}\n")
        opf.write(f"special_token_dict: {s.special_token_dict.keys()}\n")
        opf.write('\n\n')

### Load Reports

In [5]:
# Directory holding the per-report sentence CSV files.
report_fir_path = '../../C parse report/sentence csvs/'
# Family to evaluate and the directory results are written to.
family = 'Dofloo'
# NOTE(review): '???' looks like a placeholder value - confirm whether it is still needed.
samplename = '???'
outputfolder = './result'

def get_all_filenames(dir: str='./') -> list:
    ''' Return the sorted names of the regular files located directly inside dir; sub-directories are not listed and not descended into. '''
    return sorted(entry for entry in listdir(dir) if isfile(join(dir, entry)))

# Keep only the reports of this family (file name starts with the family name)
# and load each of them into a DataFrame, in the same order as the names.
dofloo_report_names = []
dofloo_report_dfs = []
for report_file in get_all_filenames(report_fir_path):
    if not report_file.startswith(family):
        continue
    dofloo_report_names.append(report_file)
    dofloo_report_dfs.append(pd.read_csv(f"{report_fir_path}{report_file}"))
dofloo_report_names

['Dofloo-BleepingComputer.csv',
 'Dofloo-MalwareMustDie.csv',
 'Dofloo-Securityaffairs.csv',
 'Dofloo-SyscallParty.csv',
 'Dofloo-Trendmicro.csv']

### 單篇文章的 Class 和 整個 Family 的 Class
`baseline: Dict[str, list]` 改為 `regexMaster: RegexMaster`

In [6]:
class ReportEvalModel:
    '''Regex matches found in the sentences of a single report.

    Args:
        regexMaster: object providing get_all_regex(), used_regex_set and
            find_spacial_token(text); it is used to scan every sentence.
        sentences: DataFrame of the report; its 'Content' column holds one
            sentence per row.
        reportname: label of the report (the file name in this notebook).

    Attributes:
        match_tbl: 0/1 DataFrame, one row per sentence and one column per regex.
        match_word: maps (sentence position, regex) to the matched word.
        match_regex: set of all regexes matched anywhere in the report.
    '''
    def __init__(self, regexMaster: RegexMaster, sentences: pd.DataFrame, reportname:str=''):
        # Original note: this could be a class-level function so that no object has to be passed in.
        self.regexMaster = regexMaster
        self.regex_pool: set[str] = regexMaster.used_regex_set
        self.sentences = sentences # a df
        self.reportname = reportname
        # Take the columns from the injected regexMaster rather than from the
        # notebook-level global `regex_pool`, so the class does not depend on
        # hidden cell state.
        cols = list(regexMaster.get_all_regex())
        self.match_tbl = pd.DataFrame(0, index=range(len(sentences)), columns=cols)
        self.match_word: dict[tuple[int, str], str] = dict() # key (idx_sent, regex) -> matched word
        self.match_regex = set()
        self.match()

    def match(self) -> None:
        '''Scan every sentence of the report and record the result in
        self.match_tbl, self.match_word and self.match_regex.'''
        for idx_sent, sent in enumerate(self.sentences['Content']):
            matched_list: list[RegexMatchResult] = self.regexMaster.find_spacial_token(sent)
            if not matched_list:
                continue
            for m in matched_list:
                this_re = m.match_regex
                # Remember the matched word; a later match of the same regex in
                # the same sentence overwrites the earlier one.
                self.match_word[(idx_sent, this_re)] = m.word
                self.match_tbl.loc[idx_sent, this_re] = 1 # mark as found
                self.match_regex.add(this_re)

    def get_match_sentences(self) -> pd.DataFrame:
        '''Return a copy of match_tbl with an extra 'match' column holding the row sums.'''
        result = self.match_tbl.copy()
        result['match'] = result.sum(axis=1)
        return result

    def get_match_regexs(self) -> set:
        '''Return the set of regexes matched anywhere in this report.'''
        return self.match_regex

    def get_match_word(self, idx_sent:int, regex:str) -> str:
        '''Return the word recorded for (idx_sent, regex); raises KeyError if there was no match.'''
        return self.match_word[(idx_sent, regex)]

class FamilySet:
    '''The collection of reports that belong to one malware family.

    Args:
        familyname: name of the malware family.
        regexMaster: shared regex helper; its used_regex_set supplies the regex
            columns of the result table and the denominator of the coverage score.
    '''
    def __init__(self, familyname:str, regexMaster: RegexMaster):
        self.familyname = familyname
        self.regexMaster = regexMaster
        # Number of regexes in used_regex_set at construction time (original
        # note: the union over all samples of the family).
        self.num_of_used_regex = len(regexMaster.used_regex_set)
        self.rem_lst: List[ReportEvalModel] = [] # list of ReportEvalModel under this family
        self.result_tbl = None # filled by show_result()

    def add_rem(self, sentences: pd.DataFrame, reportname: str=''):
        '''Add a report to this family from its sentence DataFrame and report name;
        the report is evaluated with this FamilySet's regexMaster.'''
        rem = ReportEvalModel(self.regexMaster, sentences, reportname)
        self.rem_lst.append(rem)

    def calc_report_coverage_score(self, match_regexs: list[str], apply_weight=False) -> float:
        '''Return the coverage score of one report.

        Unweighted score = number of matched regexes / number of regexes used
        by the family. Returns 0.0 when the family uses no regex at all.

        Raises:
            NotImplementedError: if apply_weight is true; the weighted variant
                (sum of matched weights / sum of all weights) has not been
                ported to RegexMaster yet.
        '''
        if apply_weight:
            # Fail loudly instead of returning None, which only surfaced later
            # as a formatting error in show_result().
            raise NotImplementedError('weighted coverage score is not implemented')
        if self.num_of_used_regex == 0:
            return 0.0
        return len(match_regexs) / self.num_of_used_regex # denominator is family-wide

    def show_result(self, apply_weight: bool=False):
        '''Print, display and return the result table: one row per report, one
        0/1 column per regex, plus report_name, ttl_match and coverage_score.
        The table is also stored in self.result_tbl.'''
        matched_per_report = [rem.get_match_regexs() for rem in self.rem_lst]

        # Regex columns: this object's regexMaster (not the notebook global),
        # followed by any matched regex that is not in used_regex_set.
        regex_columns = list(self.regexMaster.used_regex_set)
        known = set(regex_columns)
        for match_regexes in matched_per_report:
            for regex in match_regexes:
                if regex not in known:
                    known.add(regex)
                    regex_columns.append(regex)
        column_names = ['report_name','ttl_match','coverage_score'] + regex_columns

        # Build all rows first and create the frame once, so string cells are
        # never written into integer columns.
        records = []
        for rem, match_regexes in zip(self.rem_lst, matched_per_report):
            c_score = self.calc_report_coverage_score(match_regexes, apply_weight=apply_weight)
            record = {
                'report_name': rem.reportname,
                'ttl_match': len(match_regexes),
                'coverage_score': f'{c_score:.4}',
            }
            record.update(dict.fromkeys(regex_columns, 0))
            record.update(dict.fromkeys(match_regexes, 1))
            records.append(record)
            print(rem.reportname, match_regexes)

        result_tbl = pd.DataFrame(records, columns=column_names)
        self.result_tbl = result_tbl
        display(result_tbl)
        return result_tbl

### Run Script: 存檔每篇報告的 matched sentence

In [7]:
# Register every loaded report with the family set, then build and save the
# per-report regex coverage table.
fset = FamilySet(family, regexMaster)
for report_name, report_df in zip(dofloo_report_names, dofloo_report_dfs):
    fset.add_rem(report_df, report_name)
result_tbl = fset.show_result(apply_weight=False)
result_tbl.to_csv(f'{outputfolder}/{family}_FamilySet_by_Regex.csv', index=False)

Dofloo-BleepingComputer.csv set()
Dofloo-MalwareMustDie.csv {'\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}', '0x[0-9a-zA-Z]{8}', '^sed$'}
Dofloo-Securityaffairs.csv {'\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}:\\d+', '/etc/rc', '\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}'}
Dofloo-SyscallParty.csv {'/etc/init.d/', '/etc/rc', '0x[0-9a-zA-Z]{8}', '\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}', '/proc/', 'uname(.)($)', '^sed$'}
Dofloo-Trendmicro.csv {'/sys/', '/etc/init.d/', '/etc/rc', 'resolv.conf', '/proc/', 'nsswitch.conf', '^sed$'}


Unnamed: 0,report_name,ttl_match,coverage_score,mtab,/etc/rc,nsswitch.conf,/proc/,bin/,Permission.*,/sys/,...,^sh$,/etc/init.d/,0x[0-9a-zA-Z]{8},"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}",.*bin/sed,"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d+",/selinux,resolv.conf,/etc/sed,uname(.)($)
0,Dofloo-BleepingComputer.csv,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Dofloo-MalwareMustDie.csv,3,0.1765,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
2,Dofloo-Securityaffairs.csv,3,0.1765,0,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
3,Dofloo-SyscallParty.csv,7,0.4118,0,1,0,1,0,0,0,...,0,1,1,1,0,0,0,0,0,1
4,Dofloo-Trendmicro.csv,7,0.4118,0,1,1,1,0,0,1,...,0,1,0,0,0,0,0,1,0,0


In [8]:
def save_match_sent_to_excel(fset: FamilySet):
    '''Write every (report, regex, sentence) match of the family to a CSV file.

    One output row is produced per matched regex per sentence, with the columns
    report, regex, matched word, sentence number (1-based) and sentence. The
    file is `{outputfolder}/{fset.familyname}_report_match_sent.csv`, where
    `outputfolder` is the notebook-level setting. Nothing is written when no
    report contains a match.

    NOTE(review): despite its name the function writes a CSV; the name is kept
    so existing calls keep working.
    '''
    columns = ['report','regex','matched word','sentence number','sentence']
    # Use the family of the set being exported rather than the notebook global.
    output_csv = f'{outputfolder}/{fset.familyname}_report_match_sent.csv'
    if not any(rem.get_match_regexs() for rem in fset.rem_lst):
        print('no matches in CTI reports') # nothing to write, so the file is not created
        return

    csv_data = []
    # One rem (ReportEvalModel) represents one report.
    for rem in fset.rem_lst:
        if not rem.get_match_regexs(): # reports without any match are not recorded
            continue
        reportname = rem.reportname.split('.')[0]
        sent_df = rem.get_match_sentences()
        sent_content_lst = rem.sentences['Content'] # every sentence of the report

        for sid, row in sent_df.iterrows():
            if row['match'] == 0:
                continue
            # Drop the helper column without mutating the row in place, then
            # keep only the regexes flagged for this sentence.
            hits = row.drop('match')
            hits = hits[hits > 0]
            # A sentence may match several regexes: write one row for each.
            for regex in hits.index:
                csv_data.append({
                    'report': reportname,
                    'regex': regex,
                    'matched word': rem.get_match_word(sid, regex),
                    'sentence number': sid + 1,
                    'sentence': str(sent_content_lst[sid]).strip(),
                })
    pd.DataFrame(csv_data, columns=columns).to_csv(output_csv, index=False)
    return

# Running this line executes the export and overwrites the existing output file.
# The original note adds that the row/line-spacing format must be adjusted by
# hand afterwards, so use it with care.
# NOTE(review): that note refers to an Excel file, while the function currently
# writes a CSV via DataFrame.to_csv - confirm whether the manual step still applies.
save_match_sent_to_excel(fset)