In [1]:
import re
import os
from os import listdir
from os.path import isfile, join
from pathlib import Path
import pandas as pd
import json
import pickle
from typing import Dict, List, get_type_hints
from styleframe import StyleFrame, Styler
from IPython.display import display

''' 自己寫的模組 '''
from cc_regex_script import RegexMatchResult, RegexMaster 
from cc_nlp_script import find_verb_of_vocab, OperationMode, OperationEvaluator
from ASG import AttackGraph, Node, Edge, FileTable #, build
import Utility



In [4]:
# Malware family to analyse; used below as the sub-directory name when building each trace path.
family = 'Dofloo'
# Sample names to load; each one is used as the file stem of a `<samplename>.bin` trace file.
# (They look like MD5 hex digests - TODO confirm.)
samplename_lst = ['0046a78514a658b9b7a4e29e8581f85b']

def prettifyName(item:str) -> str:
    '''Translate a known raw pattern string into its display label.

    The lookup is an exact string comparison against the pattern text itself;
    the keys are never compiled or applied as regular expressions. Any value
    that is not one of the known keys is returned unchanged.
    '''
    ip_label = "ip_addr/ip_addr:port/port"
    permission_label = 'Permission/Permission:[0-9]{3}'
    aliases = {
        # IP address, optionally followed by a port
        r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}": ip_label,
        r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d+": ip_label,
        r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:d+": ip_label,
        # memory address
        r"0x[0-9a-zA-Z]{8}": "0x[0-9a-zA-Z]{8} (mem)",       # old
        r"0x[0-9a-zA-Z]{1,16}": "0x[0-9a-zA-Z]{1,16} (mem)", # new
        # file permission
        r"Permission": permission_label,
        r"Permission:[0-9]{3}": permission_label,
    }
    # Every label is a non-empty string, so falling back to `item` itself
    # covers the "no alias" case.
    return aliases.get(item, item)

class PrettyJson(dict):
    """A dict whose `str()` form is indented JSON with keys in sorted order."""

    def __str__(self):
        # 4-space indent, keys sorted at every nesting level.
        rendered = json.dumps(self, sort_keys=True, indent=4)
        return rendered
# myJSON = PrettyJson(student) # usage

class OperationPair:
    '''An <action, object> operation, where the action is either a syscall or a verb.

    Two pairs compare equal (and hash alike) when their `action` and `object`
    are equal; every other attribute (subject, step number, object type, ...)
    is ignored by equality, so sets and dicts collapse such pairs into one.
    '''
    SYSCALL = 0
    VERB = 1    # not used now (may cause confusion)

    def __init__(self, action:str, object:str, isSyscall:bool=False, subject:str=None, \
                 step_number:int=None , original_object=None, \
                 original_sentence=None, object_type=None):
        '''Store the fields of the operation.

        Args:
            action: verb or syscall name.
            object: the object of the action (a regex string).
            isSyscall: True when `action` is a syscall, False when it is a verb.
            subject: optional subject of the operation.
            step_number: optional ordinal shown in the string form.
            original_object: optional original matched word (object) from the CTD.
            original_sentence: optional sentence the match came from.
            object_type: optional object category (FILE, NET, PROC, MEM, INFO).
        '''
        self.object:str = object # regex
        self.action:str = action # verb or syscall
        self.isSyscall:bool = isSyscall # action type (syscall or verb)
        self.object_type:str = object_type # FILE, NET, PROC, MEM, INFO
        self.subject:str = subject
        self.step_number:int = step_number
        self.original_object:str = original_object # can store original matched word (object) from CTD
        self.original_sentence:str = original_sentence # can store the matched sentence
        self.altVerb:str = None # can store the alternative verb if the original verb is not in the rule

    def __eq__(self, other):
        if not isinstance(other, OperationPair):
            # don't attempt to compare against unrelated types
            return NotImplemented
        # Same if the <action, object> pair is the same (type is not considered yet)
        return self.action == other.action and self.object == other.object

    def __hash__(self) -> int:
        # Must agree with __eq__ so instances behave sanely in dicts and sets.
        return hash((self.action, self.object))

    def _getActionStr(self) -> str:
        '''Return the action, with a trailing "()" when it is a syscall.'''
        if self.isSyscall:
            return f"{self.action}()"
        return self.action

    def __str__(self) -> str:
        # A missing (or zero) step number is simply left out of the tag.
        if self.step_number:
            return f"<OP{self.step_number:>2} act={self._getActionStr()}, obj={prettifyName(self.object)}>"
        return f"<OP act={self._getActionStr()}, obj={prettifyName(self.object)}>"

    def __repr__(self) -> str:
        return str(self)

    def getSentStr(self) -> str:
        '''Return a string containing original_object and the flattened original sentence.

        `step_number` and `original_sentence` both default to None in the
        constructor. A None step number is omitted from the tag (applying a
        width format spec to None raises TypeError), and a None sentence is
        rendered as an empty string instead of failing on `.splitlines()`.
        '''
        sentence = self.original_sentence or ""
        flat_sentence = " ".join(sentence.splitlines())
        step_tag = "" if self.step_number is None else f"{self.step_number:>2}"
        return f"<OP{step_tag} act={self._getActionStr()}, obj={prettifyName(self.original_object)}, sent={flat_sentence}>"
        
# Sanity check: pairs sharing the same <action, object> are equal and hash alike.
p_aa = OperationPair('a', 'a')
p_aaz = OperationPair('a', 'a')
p_ab = OperationPair('a', 'b')

print(p_aa == p_aaz)   # expected: True
print({p_aa, p_aaz})   # expected: a set with a single element

def construct_sample_OPset(step_list: list, regexMaster:RegexMaster, keep_src_node=False, debug=False, 
                           rt_order=True, draw_memory_etc=False) -> set | tuple[set, list]:
    '''Build the OperationPair set of one sample from its ASG steps.

    Each step is a `(src_node, dst_node, syscall)` triple whose nodes expose a
    `.name`. Steps are de-duplicated on `(src name, dst name, syscall)` keeping
    first-seen order; each destination name is then passed to
    `regexMaster.find_spacial_token`, and one syscall OperationPair is created
    per returned match, using the match's `match_regex` as the object.

    Args:
        step_list: iterable of `(src_node, dst_node, syscall)` triples.
        regexMaster: object providing `find_spacial_token(name)`.
        keep_src_node: when True, store the source node name as the OP's subject.
        debug: when True, print intermediate sizes and previews.
        rt_order: when True, also return the first-appearance ordered list.
        draw_memory_etc: when True, destination names for memory, NIC,
            IP/port and permission nodes get a fixed, hand-written match
            (used for drawing) instead of the regexMaster result.

    Returns:
        `(OPset, OPset_ordered)` when `rt_order` is True, otherwise `OPset`.
        `OPset_ordered` holds the same elements as `OPset`, in order of first
        appearance.
    '''
    if debug:
        print('step_list len:', len(set(step_list)))
    # Raw (still un-normalised) triples of names, then order-preserving de-duplication.
    OPlst_raw = [(src_node.name, dst_node.name, syscall) for (src_node, dst_node, syscall) in step_list]
    OPset_raw = list(dict.fromkeys(OPlst_raw))
    if debug:
        # Original author's open question: OPlst_raw len was once 123100 while
        # step_list len was 7775, although the two should be equal.
        print('OPlst_raw len:', len(OPlst_raw))
        print(f"OPlst_raw: {OPlst_raw[:10]}")
        print('OPset_raw len:', len(OPset_raw)) # len is 5925 for xor
        print(f"OPset_raw: {OPset_raw[:10]}")

    OPset = set()
    OPset_ordered = list()
    for (src_node, dst_node, syscall) in OPset_raw:
        # Always query the regexMaster first, even if the result is overridden
        # below, so the sequence of calls it sees is unchanged.
        regex_dst_node_list:list[RegexMatchResult] = regexMaster.find_spacial_token(dst_node)

        # Added 05/09 for drawing: fixed matches for special destination nodes.
        if draw_memory_etc:
            if dst_node == 'Memory Address':
                regex_dst_node_list = [RegexMatchResult('Memory Address', match_regex='0x[0-9a-zA-Z]{1,16}', type='MEM')]
            elif dst_node == 'NIC':
                regex_dst_node_list = [RegexMatchResult('NIC', match_regex='^eth.*', type='NET')]
            elif dst_node in ['\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}', 'port \\d+', 
                              '\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}:\\d+']:
                txt = 'ip_addr/ip_addr:port/port'
                regex_dst_node_list = [RegexMatchResult(txt, match_regex=txt, type='NET')]
            elif dst_node in ['Permission', 'Permission:[0-9]{3}']:
                txt = 'Permission/Permission:[0-9]{3}'
                regex_dst_node_list = [RegexMatchResult(txt, match_regex=txt, type='FILE')]

        if not regex_dst_node_list:
            continue

        # The subject is only recorded on request; it never affects OP equality.
        subject = src_node if keep_src_node else None
        for regex_result in regex_dst_node_list:
            op = OperationPair(syscall, regex_result.match_regex, isSyscall=True, subject=subject,
                               step_number=len(OPset) + 1, object_type=regex_result.type)
            # OPset and OPset_ordered always hold the same elements, so the
            # O(1) set lookup replaces a linear scan of the ordered list.
            if op not in OPset:
                OPset.add(op)
                OPset_ordered.append(op)
    if debug:
        print('OPset len:', len(OPset), 'OPset_ordered len:', len(OPset_ordered))
    if rt_order:
        return OPset, OPset_ordered
    return OPset

# test
# sample_OPset = construct_sample_OPset(asg.step_list, debug=True)

class Sample:
    '''One malware sample together with its regex set, special tokens and OperationPairs.

    When a non-empty `step_list` is given, the OperationPair collections are
    built immediately via `construct_sample_OPset` (with `draw_memory_etc=True`);
    otherwise they are left empty. All attributes always exist, so callers can
    read them without checking how the sample was constructed.
    '''
    def __init__(self, samplename:str, regex_set:set=None, special_token_dict=None, \
                 step_list=None, regexMaster=None) -> None:
        '''
        Args:
            samplename: identifier of the sample.
            regex_set: regexes associated with the sample; a falsy value yields a new empty set.
            special_token_dict: special tokens of the sample; a falsy value yields a new empty dict.
            step_list: ASG steps used to build the OperationPair collections.
            regexMaster: matcher forwarded to `construct_sample_OPset`.
        '''
        self.samplename: str = samplename
        self.regexMaster = regexMaster if regexMaster else None
        self.regex_set: set = regex_set if regex_set else set()
        self.special_token_dict: dict = special_token_dict if special_token_dict else {}

        # Defaults first, so every attribute exists even without a step_list.
        self.step_list = step_list if step_list is not None else []
        self.OPset: set[OperationPair] = set()
        self.OPset_ordered: list[OperationPair] = []  # same elements as OPset, in order of first appearance
        self.OPset_sysOnly_ordered = []        # not initialised yet: only the OP objects that change system state
        self.OPset_sysOnly_ordered_index = []  # not initialised yet: indices of those OPs
        self.ASGstep_sysOnly_ordered_index = [] # not initialised yet: indices of the matching ASG steps

        if step_list:
            self.OPset, self.OPset_ordered = construct_sample_OPset(
                step_list, self.regexMaster, draw_memory_etc=True)

    def __repr__(self) -> str:
        # Interpolate the name; the original emitted the literal text "self.samplename".
        return f"<Sample {self.samplename}>"

# Read each sample's set_of_object from its ASG (used as the sample's
# special_token_dict) and collect the regex set used by every sample.
''' 讀取 ASG 中的 set_of_object (就是這個樣本的 special_token_dict)，並歸納樣本的 regex_set '''
samples: list[Sample] = []
total_used_regex = set()  # union of the regex sets of all loaded samples
for samplename in samplename_lst:

    # Trace file location is derived from the family and sample name.
    trace_path = f"../../C ASG/trace/{family}/{samplename}.bin"
    # Only reports whether the file exists; execution continues either way.
    print("path is correct:", os.path.exists(trace_path))
    asg = AttackGraph(trace_path)
    asg.create()

    regexMaster = RegexMaster(asg)

    # Passing step_list makes Sample build its OperationPair collections right away.
    sample = Sample(samplename, special_token_dict=asg.set_of_object, step_list=asg.step_list, \
                    regexMaster=regexMaster)
    sample.regex_set = regexMaster.get_used_regex() # NOTE: used_regex_set is buggy - some regexes are not captured
    samples.append(sample)
    total_used_regex = total_used_regex | sample.regex_set

# NOTE(review): `asg` (and `regexMaster`) below are whatever the last loop
# iteration left behind. With more than one sample, the object/step counts
# describe the last sample while the regex count is taken from samples[0].
seen_node_S, seen_node_O = Utility.create_set_of_objects(asg)
raw_objects = set(seen_node_S)
raw_objects.update(seen_node_O)
print(f'Exam: sample S1 matches {len(samples[0].regex_set)} of regex, # of unique raw objects: {len(raw_objects)}\
, # of unique steps: {len(Utility.get_uni_step(asg))}')
print(f'Size of total_used_regex: {len(total_used_regex)}')
regex_match_file, regex_non_match_file = Utility.build_file_regex(Utility.get_set_of_objects_file(asg), Utility.build_RULES_DICT()) # file regex -> object relation

# Output directory for visualisations (not used within this cell).
img_path = '../../Graph & Diagram/'

True
{<OP act=a, obj=a>}
path is correct: True
Exam: sample S1 matches 24 of regex, # of unique raw objects: 55, # of unique steps: 284
Size of total_used_regex: 24


In [5]:
# Manual check: a free-text excerpt mentioning several file paths, passed to the
# regex matcher to inspect which special tokens it reports.
verify_text = '''
The /etc/rc.local will execute certain commands after all of the systems' services have started.

Trend Micro researchers also discovered that the latest variant of the AESDDoS bot can modify files i.e., /etc/rc.local and /etc/rc.d/rc.local, as an autostart technique by appending the {malware path}/{malware file name} reboot command.

Trend Micro researchers also discovered that the latest variant of the AESDDoS bot can modify files i.e., /etc/rc.local and /etc/rc.d/rc.local, 

The /etc/rc.local will execute certain commands after all of the systems' services have started.

Reads the following information from /proc: 
/proc/stat /proc/meminfo /proc/cpuinfo /proc/net/dev /proc/self/exe /proc/self/maps /proc/sys/vm/overcommit_memory /proc/sys/kernel/rtsig-max /proc/sys/kernel/ngroups_max /proc/sys/kernel/osrelease /proc/self/fd/%d/%s /proc/self/fd /proc/net 
'''

# Relies on the `regexMaster` left over from the sample-loading cell; the
# result is shown as the cell output because it is the last expression.
regexMaster.find_spacial_token(verify_text)

[<RegexMatchResult word=/etc/rc.local, type=FILE, match_regex=/etc/rc\.local>,
 <RegexMatchResult word=/etc/rc.local, type=FILE, match_regex=/etc/rc\.local>,
 <RegexMatchResult word=/etc/rc.d/rc.local,, type=FILE, match_regex=/etc/rc.*\.d/.*>,
 <RegexMatchResult word=/etc/rc.local, type=FILE, match_regex=/etc/rc\.local>,
 <RegexMatchResult word=/etc/rc.d/rc.local,, type=FILE, match_regex=/etc/rc.*\.d/.*>,
 <RegexMatchResult word=/etc/rc.local, type=FILE, match_regex=/etc/rc\.local>,
 <RegexMatchResult word=/proc/stat, type=FILE, match_regex=/proc/stat>,
 <RegexMatchResult word=/proc/net/dev, type=FILE, match_regex=/proc/net/dev>,
 <RegexMatchResult word=/proc/self/exe, type=FILE, match_regex=/proc/self>,
 <RegexMatchResult word=/proc/self/maps, type=FILE, match_regex=/proc/self>,
 <RegexMatchResult word=/proc/self/fd/%d/%s, type=FILE, match_regex=/proc/self>,
 <RegexMatchResult word=/proc/self/fd, type=FILE, match_regex=/proc/self>]

In [None]:
/proc/stat
/proc/net/dev
/proc/self