In [1]:
import os
import re
import time
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from shutil import move
from sklearn.metrics import log_loss, accuracy_score, precision_recall_fscore_support

In [2]:
# Registers the %memit / %%memit magics used in the efficiency section below.
%load_ext memory_profiler

In [3]:
# Execute the companion notebooks inside this kernel.
# NOTE(review): presumably these define feature_engineering, load_data,
# train_model, predict_result*, and submit_result, which later cells call
# without defining them here — confirm.
%run train.ipynb
%run predict.ipynb
%run feature_engineering.ipynb

In [2]:
# Dataset locations.
# The roots can be overridden per machine through the BDCI_BASE_PATH and
# BIG_BASE_PATH environment variables; the defaults are the original
# hard-coded locations, so existing setups keep working unchanged.
bdci_base_path = os.environ.get("BDCI_BASE_PATH", "E:/Dataset/ccf-malware-classification")
bdci_data_path = f"{bdci_base_path}/raw_data"            # raw samples (train/ and test/ subfolders)
bdci_inter_path = f"{bdci_base_path}/user_data"          # intermediate artefacts (file lists, label arrays)
bdci_label_path = f"{bdci_base_path}/train_label.csv"    # label table read by label_partition
bdci_result_path = f"{bdci_base_path}/prediction_result" # result_*.csv files and evaluation metrics

big_base_path = os.environ.get("BIG_BASE_PATH", "E:/Dataset/microsoft-malware-classification")
big_data_path = f"{big_base_path}/raw_data"
big_inter_path = f"{big_base_path}/user_data"
big_label_path = f"{big_base_path}/trainLabels.csv"
big_result_path = f"{big_base_path}/prediction_result"

## Data preparation

### Data Preprocessing for Microsoft BIG-15 Dataset

In [31]:
# A token is one byte of the dump only if it is exactly two hexadecimal digits.
_HEX_BYTE_TOKEN = re.compile(rb"[0-9A-Fa-f]{2}")


def bytes_to_pe(base_path, data_path):
    """Rebuild raw binaries from the hex-dump ``*bytes`` files of a dataset.

    Every entry of ``{base_path}/train`` whose name ends with ``bytes`` is
    read, its whitespace-separated two-hex-digit tokens are concatenated and
    decoded, and the resulting bytes are written to
    ``{data_path}/train/pe/<file name up to the first dot>``.

    Tokens that are not exactly two hexadecimal digits are skipped. This
    covers longer tokens and ``??``-style placeholders (presumably the address
    column and unknown bytes of the BIG-15 dump layout — TODO confirm) and,
    unlike a plain "short and alphanumeric" test, also rejects one-character
    or non-hex tokens that would misalign the nibbles or make
    ``bytes.fromhex`` raise ``ValueError``.

    Args:
        base_path: Dataset root containing the ``train`` folder with the dumps.
        data_path: Root under which ``train/pe`` receives the rebuilt files.
            The folder is created when it does not exist yet.
    """
    src_dir = f"{base_path}/train"
    dst_dir = f"{data_path}/train/pe"
    os.makedirs(dst_dir, exist_ok=True)

    samples = [name for name in os.listdir(src_dir) if name.endswith('bytes')]
    with tqdm(total=len(samples), ncols=80, desc="bytes_to_pe") as pbar:
        for sample in samples:
            with open(f"{src_dir}/{sample}", "rb") as f:
                dump = f.read()
            # Only validated hex pairs are joined, so the text is pure ASCII
            # and always has an even number of digits.
            hex_text = b''.join(
                tok for tok in dump.split() if _HEX_BYTE_TOKEN.fullmatch(tok)
            ).decode('ascii')
            with open(f"{dst_dir}/{sample.split('.')[0]}", 'wb') as f:
                f.write(bytes.fromhex(hex_text))
            pbar.update(1)

In [32]:
# Convert the BIG-15 hex dumps; the recorded run took about 44 minutes for 10868 files.
bytes_to_pe(big_base_path, big_data_path)

bytes_to_pe: 100%|████████████████████████| 10868/10868 [43:59<00:00,  4.12it/s]


In [5]:
def repl(prefix):
    """Substitution callback for ``re.sub``: keep only the final character
    of the text captured by the named group ``prefix``."""
    captured = prefix['prefix']
    return captured[-1]

def asm_format(base_path, data_path):
    """Normalise the ``*asm`` listings of a dataset, one output file per input.

    Each entry of ``{base_path}/train`` whose name ends with ``asm`` is decoded
    as UTF-8 (undecodable bytes dropped), split on ``\\r\\n`` and cleaned line
    by line:

    1. the shortest leading span that contains no ``;`` and ends with two or
       more whitespace characters followed by a non-whitespace character is
       replaced by that final character (via ``repl``);
    2. a line that then consists solely of ``<chars>:<UPPERCASE/DIGITS>`` is
       blanked.

    The cleaned lines are joined with ``\\n`` and written under the same file
    name to ``{data_path}/train/asm``.

    NOTE(review): the character class in the second pattern also accepts
    ``|`` and ``[`` as written — confirm whether that is intended.
    """
    leading_span = r'(?P<prefix>^[^;]*?\s{2,}\S)'
    address_only = r'(?P<prefix>^[.|[A-Za-z]+:[A-Z0-9]+$)'

    asm_names = [name for name in os.listdir(f"{base_path}/train") if name.endswith('asm')]
    with tqdm(total=len(asm_names), ncols=80, desc="asm_format") as pbar:
        for asm_name in asm_names:
            with open(f"{base_path}/train/{asm_name}", "rb") as src:
                text = src.read().decode('utf-8', errors='ignore')

            cleaned = []
            for raw_line in text.split('\r\n'):
                without_prefix = re.sub(leading_span, repl, raw_line)
                cleaned.append(re.sub(address_only, '', without_prefix))

            with open(f"{data_path}/train/asm/{asm_name}", 'w', encoding='utf-8', errors='ignore') as dst:
                dst.write('\n'.join(cleaned))
            pbar.update(1)

In [6]:
# Clean the BIG-15 listings; the recorded run took about 2 h 20 min for 10868 files.
asm_format(big_base_path, big_data_path)

asm_format: 100%|███████████████████████| 10868/10868 [2:20:36<00:00,  1.29it/s]


### Partition of training set and test set

In [29]:
def label_partition(label_path, inter_path):
    """Split a label table into train and test halves, class by class.

    The CSV at ``label_path`` must have the sample name in its first column
    and the class label in its second. When the smallest label is positive,
    all labels are shifted down by one (so 1-based labels become 0-based).
    Within each class the first ``n // 2`` rows go to the training split and
    the remaining rows to the test split; the per-class sizes and the totals
    are printed.

    Files written to ``inter_path``:
        ``train_filename.txt`` / ``test_filename.txt``: one sample name per line.
        ``train_y.npy`` / ``test_y.npy``: the matching labels.

    The splits are assembled with a single ``pd.concat`` per side instead of
    ``DataFrame.append`` (removed in pandas 2.0). As a consequence the label
    arrays are saved with the label column's own dtype; readers that call
    ``np.load(..., allow_pickle=True).astype('int')`` keep working.

    Raises:
        ValueError: If the label table contains no rows.
    """
    label = pd.read_csv(label_path)
    if label.empty:
        raise ValueError(f"no labels found in {label_path}")

    filename, family = label.columns[0], label.columns[1]
    if label[family].min() > 0:
        label[family] = label[family] - 1

    train_parts, test_parts = [], []
    for lab, fam in label.groupby(family):
        n_train = fam.shape[0] // 2
        train_parts.append(fam.iloc[:n_train])
        test_parts.append(fam.iloc[n_train:])
        print(lab, n_train, fam.shape[0] - n_train)

    # One concatenation per split keeps group order and avoids quadratic copying.
    df_train = pd.concat(train_parts)
    df_test = pd.concat(test_parts)

    df_train[filename].to_csv(f"{inter_path}/train_filename.txt", header=False, index=False)
    np.save(f"{inter_path}/train_y.npy", df_train[family].to_numpy())
    df_test[filename].to_csv(f"{inter_path}/test_filename.txt", header=False, index=False)
    np.save(f"{inter_path}/test_y.npy", df_test[family].to_numpy())
    print(df_train.shape[0], df_test.shape[0])

In [7]:
# Output below: one "label train_count test_count" line per class, then the split totals.
label_partition(bdci_label_path, bdci_inter_path)

0 214 214
1 373 373
2 10 10
3 130 131
4 160 161
5 90 91
6 388 388
7 675 675
8 297 297
9 582 582
2919 2922


In [5]:
# Fraction of each class label in the BDCI training split written by label_partition.
bdci_train_y = np.load(f"{bdci_inter_path}/train_y.npy", allow_pickle=True).astype('int')
np.bincount(bdci_train_y) / bdci_train_y.shape[0]

array([0.07331278, 0.12778349, 0.00342583, 0.0445358 , 0.05481329,
       0.03083248, 0.13292223, 0.23124358, 0.10174717, 0.19938335])

In [30]:
# Same per-class 50/50 split for the BIG-15 label table.
label_partition(big_label_path, big_inter_path)

0 770 771
1 1239 1239
2 1471 1471
3 237 238
4 21 21
5 375 376
6 199 199
7 614 614
8 506 507
5432 5436


In [7]:
# Fraction of each class label in the BIG-15 training split written by label_partition.
big_train_y = np.load(f"{big_inter_path}/train_y.npy", allow_pickle=True).astype('int')
np.bincount(big_train_y) / big_train_y.shape[0]

array([0.14175258, 0.22809278, 0.27080265, 0.04363034, 0.00386598,
       0.06903535, 0.03663476, 0.11303387, 0.09315169])

In [22]:
def file_movement(inter_path, data_path):
    """Move the samples listed in the test split from ``train/`` to ``test/``.

    Sample names are read (whitespace-separated) from
    ``{inter_path}/test_filename.txt``. A file in ``{data_path}/train/pe`` is
    moved to ``{data_path}/test/pe`` when its full name is listed; a file in
    ``{data_path}/train/asm`` is moved to ``{data_path}/test/asm`` when the
    part of its name before the first dot is listed. Afterwards the number of
    files left in ``train`` and present in ``test`` is printed for each kind.

    The destination folders are created up front: if a destination is not an
    existing directory, ``shutil.move`` treats it as the target *file name*,
    so a sample would be renamed to ``pe``/``asm`` (or the move would fail)
    instead of landing inside the folder.
    """
    with open(f"{inter_path}/test_filename.txt", 'r') as fp:
        # A set gives constant-time membership tests for the loops below.
        test_filename = set(fp.read().split())

    for kind in ('pe', 'asm'):
        os.makedirs(f"{data_path}/test/{kind}", exist_ok=True)

    for file in os.listdir(f"{data_path}/train/pe"):
        if file in test_filename:
            move(f"{data_path}/train/pe/{file}", f"{data_path}/test/pe")
    for file in os.listdir(f"{data_path}/train/asm"):
        if file.split('.')[0] in test_filename:
            move(f"{data_path}/train/asm/{file}", f"{data_path}/test/asm")

    print(len(os.listdir(f"{data_path}/train/pe")), len(os.listdir(f"{data_path}/test/pe")))
    print(len(os.listdir(f"{data_path}/train/asm")), len(os.listdir(f"{data_path}/test/asm")))

In [8]:
# Output below: "train test" file counts, first for pe and then for asm.
file_movement(bdci_inter_path, bdci_data_path)

2919 2922
2919 2922


In [31]:
# Output below: "train test" file counts, first for pe and then for asm.
file_movement(big_inter_path, big_data_path)

5432 5436
5432 5436


## Comparison

In [3]:
# Names evaluated one by one in the feature comparison; each is also used as
# the <name> of a result_<name>.csv file by classification_performance.
feature_list = [
    'histogram',
    'byteentropy',
    'strings',
    'section',
    'imports',
    'exports',
    'words_1000',
    'ins_1000',
    'semantic',
    'ember',
    'ember_section_ins_words',
    'ember_section_ins_semantic',
]

# Ensemble result names appended to feature_list when metrics are computed.
model_list = [
    'vote',
    'weighted',
    'final',
]

# Names passed to the model-comparison and efficiency cells.
# Despite the variable name this is an ordered list, not a set.
feature_set = [
    'ember',
    'section',
    'imports',
    'exports',
    'words_1000',
    'ins_1000',
    'semantic',
    'ember_section_ins_words',
    'ember_section_ins_semantic',
]

### Effectiveness of Features

In [24]:
def result_by_features(data_path, inter_path, index_list):
    """Train and then predict for every name in ``index_list``.

    Order of operations: ``feature_engineering`` on the ``"train"`` split,
    ``load_data`` for ``index_list``, loading ``{inter_path}/train_y.npy`` as
    integers, ``train_model``; a one-second pause; then
    ``feature_engineering`` on the ``"test"`` split and
    ``predict_result_base``.

    NOTE(review): the helpers are not defined in this notebook — presumably
    they come from the notebooks executed with %run; confirm their contracts
    there.
    """
    # ---- training stage ----
    feature_engineering("train", data_path, inter_path)
    train_model(
        load_data('train', index_list, inter_path),
        np.load(f"{inter_path}/train_y.npy", allow_pickle=True).astype('int'),
        inter_path,
    )

    # Pause between the two stages, kept exactly as in the original flow.
    time.sleep(1)

    # ---- prediction stage ----
    feature_engineering("test", data_path, inter_path)
    predict_result_base(index_list, inter_path)

In [18]:
# Output below: one recorded wall time per name in feature_list (BDCI data).
result_by_features(bdci_data_path, bdci_inter_path, feature_list)

------------------------ Prediction ------------------------
histogram
Wall time: 0.04632925987243652 s
byteentropy
Wall time: 0.06550908088684082 s
strings
Wall time: 0.05738067626953125 s
section
Wall time: 0.047260284423828125 s
imports
Wall time: 0.0708627700805664 s
exports
Wall time: 0.04539990425109863 s
words_1000
Wall time: 0.0357060432434082 s
ins_1000
Wall time: 0.034131765365600586 s
semantic
Wall time: 0.05620884895324707 s
ember
Wall time: 0.055074214935302734 s
ember_section_ins_words
Wall time: 0.03131604194641113 s
ember_section_ins_semantic
Wall time: 0.034371137619018555 s


In [11]:
# Output below: one recorded wall time per name in feature_list (BIG-15 data).
result_by_features(big_data_path, big_inter_path, feature_list)

------------------------ Prediction ------------------------
histogram
Wall time: 0.06801152229309082 s
byteentropy
Wall time: 0.08501935005187988 s
strings
Wall time: 0.07101607322692871 s
section
Wall time: 0.07101750373840332 s
imports
Wall time: 0.08287501335144043 s
exports
Wall time: 0.048011064529418945 s
words_1000
Wall time: 0.07201910018920898 s
ins_1000
Wall time: 0.06499838829040527 s
semantic
Wall time: 0.08101797103881836 s
ember
Wall time: 0.0690157413482666 s
ember_section_ins_words
Wall time: 0.05841255187988281 s
ember_section_ins_semantic
Wall time: 0.06001162528991699 s


### Effectiveness of models

In [12]:
def result_by_models(data_path, inter_path, index_list):
    """Run both ensemble predictors on ``index_list``.

    ``predict_result_vote`` (soft vote) is called first, then
    ``predict_result_weighted`` (weighted soft vote), each with
    ``(index_list, inter_path)``. ``data_path`` is accepted for signature
    parity with the other drivers and is not used.
    """
    ensemble_predictors = (
        predict_result_vote,       # soft vote
        predict_result_weighted,   # weighted soft vote
    )
    for run_predictor in ensemble_predictors:
        run_predictor(index_list, inter_path)

In [17]:
# Soft-vote and weighted soft-vote ensembles over feature_set (BDCI data).
result_by_models(bdci_data_path, bdci_inter_path, feature_set)

------------------------ Prediction (vote) ------------------------
Wall time: 1.05 s
------------------------ Prediction (weighted) ------------------------
Wall time: 53.1 ms


In [13]:
# Soft-vote and weighted soft-vote ensembles over feature_set (BIG-15 data).
result_by_models(big_data_path, big_inter_path, feature_set)

------------------------ Prediction (vote) ------------------------
Wall time: 1.41 s
------------------------ Prediction (weighted) ------------------------
Wall time: 98.5 ms


### Efficiency and resource overhead

In [17]:
def _run_with_memory_report(func, *args):
    """Call ``func(*args)`` once and print its peak memory like ``%memit`` does."""
    # Imported locally: the notebook only loads memory_profiler as an IPython
    # extension and never imports the module itself.
    from memory_profiler import memory_usage

    baseline = memory_usage()[0]
    # max_iterations=1 mirrors what %memit passes, so the callable is executed
    # exactly once instead of being re-run to collect more samples.
    peak = memory_usage((func, args), max_usage=True, max_iterations=1)
    # Depending on the memory_profiler release this is a float or a
    # one-element list; np.max handles both.
    peak = float(np.max(peak))
    print(f"peak memory: {peak:.2f} MiB, increment: {peak - baseline:.2f} MiB")


def li2022imbalanced(data_path, inter_path, index_list):
    """Measure wall time and peak memory of the training and prediction stages.

    Training: ``load_data`` for ``index_list`` plus the labels from
    ``{inter_path}/train_y.npy`` are passed to ``train_model``.
    Prediction: ``predict_result`` is evaluated on four fixed groups of names,
    the four results are averaged, and the average is handed to
    ``submit_result`` under the name ``"result_final"``.

    Memory is measured with ``memory_profiler.memory_usage`` rather than the
    ``%memit`` line magic: the magic executes its statement in the IPython
    user namespace, so it cannot see this function's locals
    (``train_data_dict``, ``train_y``, the nested ``predicting``) and would
    raise ``NameError`` or silently pick up unrelated notebook globals.

    Feature extraction is left disabled, as in the original cell.
    NOTE(review): presumably the features were already extracted by an earlier
    ``result_by_features`` run — re-enable the two calls if that is not the case.
    """
    # feature_engineering("train", data_path, inter_path)
    train_data_dict = load_data('train', index_list, inter_path)
    train_y = np.load(f"{inter_path}/train_y.npy", allow_pickle=True).astype('int')
    t1 = time.time()
    _run_with_memory_report(train_model, train_data_dict, train_y, inter_path)
    t2 = time.time()
    print(f'Train time: {t2 - t1}s')

    def predicting(data_path, inter_path):
        # feature_engineering("test", data_path, inter_path)
        # Decision-based multimodal fusion
        print("------------------------ Predicting ------------------------")
        feature_list1 = ['ember', 'section', 'imports', 'exports']
        result1 = predict_result(feature_list1, inter_path)
        feature_list2 = ['section', 'exports', 'ember_section_ins_words', 'ember_section_ins_semantic']
        result2 = predict_result(feature_list2, inter_path)
        feature_list3 = ['section', 'exports', 'words_1000', 'ember_section_ins_semantic']
        result3 = predict_result(feature_list3, inter_path)
        feature_list4 = ['section', 'exports', 'words_1000', 'semantic']
        result4 = predict_result(feature_list4, inter_path)
        # Model ensemble: unweighted mean of the four fused predictions
        result_np = (result1 + result2 + result3 + result4) / 4
        # Submit result
        submit_result(inter_path, result_np, "result_final")

    t1 = time.time()
    _run_with_memory_report(predicting, data_path, inter_path)
    t2 = time.time()
    print(f'Predict time: {t2 - t1}s')



In [19]:
%%memit 
# NOTE(review): `training` has no definition in this notebook's visible cells —
# presumably it comes from a %run notebook or earlier kernel state; confirm
# before relying on Restart & Run All.
training(bdci_data_path, bdci_inter_path, feature_set)

train_histogram: 100%|█████████████████████| 2919/2919 [00:08<00:00, 362.54it/s]
train_byteentropy: 100%|████████████████████| 2919/2919 [00:33<00:00, 86.97it/s]
train_strings: 100%|████████████████████████| 2919/2919 [01:39<00:00, 29.35it/s]
train_section: 100%|████████████████████████| 2919/2919 [00:33<00:00, 86.95it/s]
train_imports: 100%|███████████████████████| 2919/2919 [00:07<00:00, 375.00it/s]
train_exports: 100%|███████████████████████| 2919/2919 [00:05<00:00, 550.63it/s]
words_train: 100%|██████████████████████████| 2919/2919 [00:59<00:00, 48.79it/s]
words_test: 100%|███████████████████████████| 2922/2922 [01:03<00:00, 46.34it/s]
ins_train: 100%|████████████████████████████| 2919/2919 [01:13<00:00, 39.95it/s]
ins_test: 100%|█████████████████████████████| 2922/2922 [01:17<00:00, 37.67it/s]
train_asm2txt: 100%|████████████████████████| 2919/2919 [01:23<00:00, 34.98it/s]
test_asm2txt: 100%|█████████████████████████| 2922/2922 [01:26<00:00, 33.93it/s]
semantic: 100%|█████████████

------------------------ Training ------------------------
ember
Wall time: 8.21 s
section
Wall time: 879 ms
imports
Wall time: 2.58 s
exports
Wall time: 619 ms
words_1000
Wall time: 1.44 s
ins_1000
Wall time: 1.32 s
semantic
Wall time: 4.51 s
ember_section_ins_words
Wall time: 2.39 s
ember_section_ins_semantic


test_histogram:   3%|▋                       | 78/2922 [00:00<00:03, 764.93it/s]

Wall time: 3.21 s
Wall time: 31.6 s


test_histogram: 100%|██████████████████████| 2922/2922 [00:09<00:00, 316.99it/s]
test_byteentropy: 100%|█████████████████████| 2922/2922 [00:34<00:00, 84.29it/s]
test_strings: 100%|█████████████████████████| 2922/2922 [01:40<00:00, 29.19it/s]
test_section: 100%|█████████████████████████| 2922/2922 [00:29<00:00, 99.75it/s]
test_imports: 100%|████████████████████████| 2922/2922 [00:08<00:00, 341.06it/s]
test_exports: 100%|████████████████████████| 2922/2922 [00:05<00:00, 500.34it/s]
semantic: 100%|████████████████████████████| 2922/2922 [00:06<00:00, 436.64it/s]


------------------------ Prediction start ------------------------
------------------------ Prediction finish ------------------------
Predict time: 1.4037315845489502s
peak memory: 2713.88 MiB, increment: 2556.73 MiB


In [48]:
%%memit 
# NOTE(review): in this notebook `predicting` is only defined nested inside
# li2022imbalanced, so this top-level call needs a global definition from
# elsewhere — confirm where it comes from.
predicting(bdci_data_path, bdci_inter_path)

------------------------ Prediction start ------------------------
------------------------ Prediction finish ------------------------
------------------------ Result submit ------------------------
Wall time: 1.67 s


In [18]:
%%memit 
# NOTE(review): `training` has no definition in this notebook's visible cells —
# confirm where it comes from.
training(big_data_path, big_inter_path, feature_set)

------------------------ Training ------------------------
ember
Wall time: 42.7 s
section
Wall time: 2.69 s
imports
Wall time: 6.96 s
exports
Wall time: 1.55 s
words_1000
Wall time: 4.68 s
ins_1000
Wall time: 8.41 s
semantic
Wall time: 43.7 s
ember_section_ins_words
Wall time: 14.8 s
ember_section_ins_semantic
Wall time: 27.8 s
Wall time: 2min 50s
peak memory: 1092.16 MiB, increment: 399.55 MiB


In [20]:
%%memit
# NOTE(review): relies on a global `predicting`; the only definition visible
# here is nested inside li2022imbalanced — confirm where it comes from.
predicting(big_data_path, big_inter_path)

------------------------ Predicting ------------------------
Wall time: 2.104006767272949s
peak memory: 352.85 MiB, increment: 182.97 MiB


### Result analysis

In [10]:
def classification_performance(inter_path, result_path, index_list):
    """Score the saved predictions of every name in ``index_list``.

    For each name, ``{result_path}/result_{name}.csv`` is read, its first
    column is dropped, and the remaining columns are treated as per-class
    scores: the arg-max gives the predicted label and the full matrix feeds
    the log loss. Predictions are compared against the integer labels in
    ``{inter_path}/test_y.npy``.

    The table (rows: names; columns: acc, macro-p, macro-r, macro-f1,
    logloss) is written to ``{result_path}/evaluation_metrics.csv`` and
    printed.

    NOTE(review): assumes the CSV rows are in the same order as test_y.npy —
    confirm against the code that writes the result files.
    """
    metric_names = ['acc', 'macro-p', 'macro-r', 'macro-f1', 'logloss']
    y_true = np.load(f"{inter_path}/test_y.npy", allow_pickle=True).astype('int')
    df_metrics = pd.DataFrame(index=index_list, columns=metric_names)

    for name in index_list:
        # Drop the leading column and keep only the per-class score columns.
        scores = pd.read_csv(f"{result_path}/result_{name}.csv").iloc[:, 1:].to_numpy()
        predicted = scores.argmax(axis=1)

        accuracy = accuracy_score(y_true, predicted)
        precision, recall, f_score, _ = precision_recall_fscore_support(
            y_true, predicted, average='macro'
        )
        class_ids = list(range(scores.shape[1]))
        loss = log_loss(y_true, scores, labels=class_ids)

        df_metrics.loc[name, :] = [accuracy, precision, recall, f_score, loss]

    df_metrics.to_csv(f"{result_path}/evaluation_metrics.csv")
    print(df_metrics)

In [13]:
# Metrics for every single-feature model and every ensemble result on the BDCI test split.
classification_performance(bdci_inter_path, bdci_result_path, feature_list + model_list)

                                 acc   macro-p   macro-r  macro-f1   logloss
histogram                   0.975017  0.950061  0.930002  0.938953  0.082651
byteentropy                 0.979124  0.964959  0.966733  0.965563  0.075416
strings                     0.977755  0.936364  0.954937  0.944718  0.072705
section                     0.991786  0.966167  0.990602  0.976858  0.033631
imports                     0.926762  0.922428   0.87799  0.865087  0.251241
exports                      0.34976  0.323642  0.261582  0.233058  1.668876
words_1000                  0.990418  0.977851  0.984967  0.981104  0.035151
ins_1000                    0.992813  0.988375  0.990224  0.989185  0.029193
semantic                    0.988022  0.978758  0.974376  0.976079  0.044488
ember                       0.986653  0.978295  0.968926  0.973194  0.052805
ember_section_ins_words     0.994867   0.99044   0.99366  0.991934  0.019895
ember_section_ins_semantic  0.995209  0.970643  0.993817  0.980718  0.018585

In [23]:
# Metrics for every single-feature model and every ensemble result on the BIG-15 test split.
classification_performance(big_inter_path, big_result_path, feature_list + model_list)

                                 acc   macro-p   macro-r  macro-f1   logloss
histogram                   0.978845  0.979787  0.941858   0.95807   0.07321
byteentropy                  0.97351  0.953589  0.918221  0.931537  0.089103
strings                     0.977557   0.96776  0.932936  0.947676  0.081044
section                     0.967071  0.940721  0.950775  0.945161  0.106358
imports                     0.959713   0.94057  0.850859  0.856238  0.136761
exports                     0.360743  0.217952  0.185669  0.154771  1.721513
words_1000                  0.978845  0.967074  0.906762  0.927288  0.063254
ins_1000                    0.984915  0.984831  0.972269  0.978238  0.052846
semantic                    0.978109  0.962304  0.909685  0.926282  0.079467
ember                       0.984547  0.981237  0.929299  0.946866  0.054321
ember_section_ins_words     0.989882  0.982294  0.970489  0.976014  0.032105
ember_section_ins_semantic  0.988595  0.982465  0.979046  0.980636  0.039149