In [2]:
%run Hooklog3.ipynb
import pandas as pd
import shutil
import os, pickle
import numpy as np
from tqdm import tqdm
from collections import Counter
Hooklog = Hooklog3

In [3]:
root_dir = "./data/Hooklog/MikeGithub/"
in_parseFirstPar = False # False=只拿API function name
fam_dirs = next(os.walk(root_dir))[1]
combine_dict = []
for fam in tqdm(fam_dirs):
    in_directory = root_dir + fam + '/'
    hl_list = next(os.walk(in_directory))[2] # get all filenames in the in_directory
    hl_list = [os.path.join(in_directory, f) for f in hl_list] # filepathname list
    hl_list = list(filter(lambda f: f.endswith(".hooklog"), hl_list)) # in case some non-hooklog file in the folder
    for file in hl_list:
        hl3 = Hooklog(file, in_parseFirstPar)
        hl_li = []
        for value in hl3.li:
            hl_li.append(value[1])
        combine_dict.append(hl_li)

100%|██████████| 13/13 [00:00<00:00, 34.52it/s]


In [8]:
def basic_statistics(all_length):
    '''
    input: length list of elements e.g.[1,1,1,3,5,9,4,2,1,3,54,78,5...]
    output1: mean、std、mode、min、q1、median(q2)、q3、max、iqr、outlier、far out
    output2: statistics graph、10%~90% form
    '''
    stat_dict = {}
    stat_dict['mean'] = np.mean(all_length)
    stat_dict['std'] = np.std(all_length)
    stat_dict['mode'] = np.argmax(np.bincount(all_length))
    stat_dict['min'] = np.min(all_length)
    stat_dict['q1'] = np.quantile(all_length,0.25)
    stat_dict['median'] = np.quantile(all_length,0.5)
    stat_dict['q3'] = np.quantile(all_length,0.75)
    stat_dict['max'] = np.max(all_length)
    stat_dict['iqr'] = stat_dict['q3'] - stat_dict['q1']
    stat_dict['outlier'] = stat_dict['q3'] + 1.5*stat_dict['iqr']
    stat_dict['far_out'] = stat_dict['q3'] + 3*stat_dict['iqr']
    for i in [10,20,30,40,50,60,70,80,90,100]:
        stat_dict[str(i)+'%'] = np.percentile(all_length,i)
    return pd.DataFrame.from_dict(stat_dict,orient='index',columns=['length'])

def knee_point(length_li,k=5):
    """
    length_li: 長度list
    k: slope topK's knee points
    Return dict format (key is the answer)
    """
    length_dict = dict(Counter(length_li))
    sorted_dict = {k: v for k, v in sorted(length_dict.items(), key=lambda x: x[1])}
    all_items_num = sum(list(length_dict.values()))
#     for item in sorted_dict.items():
    all_keys = list(sorted_dict.keys())
    all_values = list(sorted_dict.values())
    slope_li = []
    for i in range(len(sorted_dict)):
        length1 = all_keys[i]
        try:
            length2 = all_keys[i+1]
        except IndexError:
            break
        value1 = sum(all_values[:i+1])
        value2 = sum(all_values[:i+2])
        slope = ((value2-value1)/all_items_num)/(length2-length1)
        slope_li.append(slope)
    change_rate_li = []
    for i in range(len(slope_li)):
        try:
            slope1 = slope_li[i]
            slope2 = slope_li[i+1]
        except IndexError:
            break
        change_rate_li.append(abs(slope2-slope1)) #陡變緩或是緩變陡的都一起算
    idx_li = sorted(range(len(change_rate_li)), key=lambda i: change_rate_li[i], reverse=True)[:k]
    return_dict = {}
    for idx in idx_li:
        return_dict[all_keys[idx+1]] = change_rate_li[idx]
    return pd.DataFrame.from_dict(return_dict,orient='index',columns=['knee_length'])
#     return return_dict        
        

以下兩格擇一
* single-gram一格
* n-gram四格

In [7]:
# single-gram
all_apis = []
for api_li in combine_dict:
    for api in api_li:
        all_apis.append(api)
all_apis_set = set(all_apis)
print(len(all_apis_set))

single_dict_enc = {}
for i,gram in enumerate(list(all_apis_set)):
    single_dict_enc[gram] = i+1
pickle.dump(file=open(root_dir + "single_dict_enc_Mike.pkl",'wb'),obj=single_dict_enc)
single_dict_enc

23


{'WinHttpConnect': 1,
 'WinExec': 2,
 'OpenProcess': 3,
 'RegQueryValue': 4,
 'CopyFile': 5,
 'InternetOpen': 6,
 'WinHttpOpenRequest': 7,
 'WinHttpSendRequest': 8,
 'RegCreateKey': 9,
 'RegDeleteKey': 10,
 'LoadLibrary': 11,
 'InternetConnect': 12,
 'HttpSendRequest': 13,
 'CreateThread': 14,
 'DeleteFile': 15,
 'ExitProcess': 16,
 'WinHttpOpen': 17,
 'RegSetValue': 18,
 'CreateFile': 19,
 'CreateProcess': 20,
 'RegEnumValue': 21,
 'CreateProcessInternal': 22,
 'CreateRemoteThread': 23}

In [3]:
#n-gram
stride = 1
all_window_li = []
for hkl_li in combine_dict:
    for i in range(len(hkl_li)):
        if i%stride != 0:
            continue
        try:
            all_window_li.append((hkl_li[i],hkl_li[i+1])) #tri-gram:(hkl_li[i],hkl_li[i+1],hkl_li[i+2])
        except IndexError:
            break

In [5]:
#n-gram
statistic_gram = dict(Counter(all_window_li))
print(len(statistic_gram))
tri_dictionary_li = sorted(statistic_gram, key=statistic_gram.get,reverse=True)
tri_dictionary_li

133


[('RegQueryValue', 'RegQueryValue'),
 ('CreateFile', 'CreateFile'),
 ('CreateFile', 'CopyFile'),
 ('RegEnumValue', 'RegEnumValue'),
 ('CopyFile', 'CreateFile'),
 ('LoadLibrary', 'LoadLibrary'),
 ('RegQueryValue', 'LoadLibrary'),
 ('LoadLibrary', 'RegQueryValue'),
 ('RegCreateKey', 'RegSetValue'),
 ('RegCreateKey', 'RegQueryValue'),
 ('OpenProcess', 'OpenProcess'),
 ('RegQueryValue', 'RegCreateKey'),
 ('CreateFile', 'RegQueryValue'),
 ('RegQueryValue', 'CreateFile'),
 ('RegSetValue', 'RegSetValue'),
 ('LoadLibrary', 'CreateFile'),
 ('RegSetValue', 'RegCreateKey'),
 ('RegSetValue', 'RegQueryValue'),
 ('CreateFile', 'RegCreateKey'),
 ('RegQueryValue', 'RegEnumValue'),
 ('RegSetValue', 'CreateFile'),
 ('LoadLibrary', 'RegCreateKey'),
 ('RegQueryValue', 'RegSetValue'),
 ('CreateFile', 'LoadLibrary'),
 ('RegEnumValue', 'RegQueryValue'),
 ('RegDeleteKey', 'RegDeleteKey'),
 ('DeleteFile', 'DeleteFile'),
 ('RegEnumValue', 'LoadLibrary'),
 ('RegCreateKey', 'RegCreateKey'),
 ('RegEnumValue', 'Reg

In [6]:
#n-gram
length_dist = list(statistic_gram.values())
stat_df = pd.DataFrame(sorted(length_dist,reverse=True),columns=['bi_num'],index=tri_dictionary_li)
stat_df.to_excel('./results/Hooklog/bigram_numbers_Mike.xlsx')
basic_statistics(length_dist)

Unnamed: 0,length
mean,1007.090226
std,5179.414928
mode,1.0
min,1.0
q1,9.0
median,43.0
q3,174.0
max,55829.0
iqr,165.0
outlier,421.5


In [7]:
#n-gram
tri_dict_enc = {}
for i,gram in enumerate(tri_dictionary_li):
    tri_dict_enc[gram] = i+1
pickle.dump(file=open(root_dir + "bi_dict_enc_Mike.pkl",'wb'),obj=tri_dict_enc)
tri_dict_enc

{('RegQueryValue', 'RegQueryValue'): 1,
 ('CreateFile', 'CreateFile'): 2,
 ('CreateFile', 'CopyFile'): 3,
 ('RegEnumValue', 'RegEnumValue'): 4,
 ('CopyFile', 'CreateFile'): 5,
 ('LoadLibrary', 'LoadLibrary'): 6,
 ('RegQueryValue', 'LoadLibrary'): 7,
 ('LoadLibrary', 'RegQueryValue'): 8,
 ('RegCreateKey', 'RegSetValue'): 9,
 ('RegCreateKey', 'RegQueryValue'): 10,
 ('OpenProcess', 'OpenProcess'): 11,
 ('RegQueryValue', 'RegCreateKey'): 12,
 ('CreateFile', 'RegQueryValue'): 13,
 ('RegQueryValue', 'CreateFile'): 14,
 ('RegSetValue', 'RegSetValue'): 15,
 ('LoadLibrary', 'CreateFile'): 16,
 ('RegSetValue', 'RegCreateKey'): 17,
 ('RegSetValue', 'RegQueryValue'): 18,
 ('CreateFile', 'RegCreateKey'): 19,
 ('RegQueryValue', 'RegEnumValue'): 20,
 ('RegSetValue', 'CreateFile'): 21,
 ('LoadLibrary', 'RegCreateKey'): 22,
 ('RegQueryValue', 'RegSetValue'): 23,
 ('CreateFile', 'LoadLibrary'): 24,
 ('RegEnumValue', 'RegQueryValue'): 25,
 ('RegDeleteKey', 'RegDeleteKey'): 26,
 ('DeleteFile', 'DeleteFile

convert hkl to single or bi-gram
* 以下挑一格

In [9]:
# single gram
root_dir = "./data/Hooklog/MikeGithub/"
in_parseFirstPar = False # False=只拿API function name
fam_dirs = next(os.walk(root_dir))[1]
all_enc_length = []
fam_names = []
for fam in tqdm(fam_dirs):
    in_directory = root_dir + fam + '/'
    hl_list = next(os.walk(in_directory))[2] # get all filenames in the in_directory
    hl_list = [os.path.join(in_directory, f) for f in hl_list] # filepathname list
    hl_list = list(filter(lambda f: f.endswith(".hooklog"), hl_list)) # in case some non-hooklog file in the folder
    for file in hl_list:
        hl3 = Hooklog(file, in_parseFirstPar)
        hl_li = []
        for i,value in enumerate(hl3.li):
            hl_li.append((hl3.li[i][1])) #tri-gram: (hl3.li[i][1],hl3.li[i+1][1],hl3.li[i+2][1])
        enc_li = []
        for n_gram in hl_li:
            try:
                enc_li.append(single_dict_enc[n_gram])
            except KeyError:
                print("Not in encoding set:",n_gram)
                enc_li.append(len(tri_dictionary_li)+1)
        all_enc_length.append(len(enc_li))
        fam_names.append(fam)
        np.save(file.replace('.trace.hooklog','.single.npy'),np.array(enc_li))
#         combine_dict.append(hl_li)
stat_df = pd.DataFrame(all_enc_length,columns=['encoded_length'],index=fam_names)
stat_df.to_excel('./results/Hooklog/single_encLength_MikeGithub.xlsx')
basic_statistics(all_enc_length)

100%|██████████| 13/13 [00:00<00:00, 33.26it/s]


Unnamed: 0,length
mean,249.964684
std,236.606674
mode,219.0
min,3.0
q1,80.0
median,219.0
q3,306.0
max,1226.0
iqr,226.0
outlier,645.0


In [8]:
#bi-gram
root_dir = "./data/Hooklog/MikeGithub/"
in_parseFirstPar = False # False=只拿API function name
fam_dirs = next(os.walk(root_dir))[1]
all_enc_length = []
fam_names = []
for fam in tqdm(fam_dirs):
    in_directory = root_dir + fam + '/'
    hl_list = next(os.walk(in_directory))[2] # get all filenames in the in_directory
    hl_list = [os.path.join(in_directory, f) for f in hl_list] # filepathname list
    hl_list = list(filter(lambda f: f.endswith(".hooklog"), hl_list)) # in case some non-hooklog file in the folder
    for file in hl_list:
        hl3 = Hooklog(file, in_parseFirstPar)
        hl_li = []
        for i,value in enumerate(hl3.li):
            if i%stride!=0:
                continue
            try:
                hl_li.append((hl3.li[i][1],hl3.li[i+1][1])) #tri-gram: (hl3.li[i][1],hl3.li[i+1][1],hl3.li[i+2][1])
            except IndexError:
                break
        enc_li = []
        for n_gram in hl_li:
            try:
                enc_li.append(tri_dict_enc[n_gram])
            except KeyError:
                print("Not in n-gram set:",n_gram)
                enc_li.append(len(tri_dictionary_li)+1)
        all_enc_length.append(len(enc_li))
        fam_names.append(fam)
        np.save(file.replace('.trace.hooklog','.enc.npy'),np.array(enc_li))
#         combine_dict.append(hl_li)
stat_df = pd.DataFrame(all_enc_length,columns=['encoded_length'],index=fam_names)
stat_df.to_excel('./results/Hooklog/afterBigram_encLength_MikeGithub.xlsx')
basic_statistics(all_enc_length)

100%|██████████| 13/13 [00:00<00:00, 29.19it/s]


Unnamed: 0,length
mean,248.964684
std,236.606674
mode,218.0
min,2.0
q1,79.0
median,218.0
q3,305.0
max,1225.0
iqr,226.0
outlier,644.0


In [10]:
stat_df2 = knee_point(all_enc_length,k=50)
stat_df2[(stat_df2.index>1)&(stat_df2.index<300)]
#30~115
#29~114

Unnamed: 0,knee_length
115,0.009454
109,0.009058
30,0.008752
34,0.007716
3,0.002502
6,0.002458
17,0.000885
121,0.000728
77,0.000485
12,0.000472
