## 利用OCSVM进行novelty detection

In [1]:
import pandas as pd
import os
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from tensorflow import keras
from tensorflow.keras import layers, optimizers, Sequential, metrics
from tensorflow.keras.preprocessing.sequence import pad_sequences

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [2]:
# 加载数据
def judge_1(time):
    time = time[:7].replace('-', '')
    if time <= '201803':
        return True
    else:
        return False

def judge_2(time):
    time = time[:7].replace('-', '')
    if time > '201803':
        return True
    else:
        return False

name2label = {'trojan':0, 'virus':1, 'worm':2, 'backdoor':3}
data_csv = pd.read_csv('../../csv/dataset_handled.csv')
data_2017 = data_csv[data_csv['first_seen'].apply(lambda x: x[:4]) == '2017']
data_2018 = data_csv[data_csv['first_seen'].apply(lambda x: x[:4]) == '2018']
data_2019 = data_csv[data_csv['first_seen'].apply(lambda x: x[:4]) == '2019']
    
data_2018_1 = data_2018[data_2018['first_seen'].apply(judge_1)]
data_2018_2 = data_2018[data_2018['first_seen'].apply(judge_2)]

data_train = data_2017.append(data_2018_１)
data_test_1 = data_2018_2
data_test_2 = data_2019
print(data_train.shape, data_test_1.shape, data_test_2.shape)

(6929, 9) (2668, 9) (3878, 9)


In [3]:
# 对测试集进行下采样
data_train = data_train.sample(frac=1, random_state=1).reset_index(drop=True)
data_test_1 = data_test_1.sample(frac=0.73, random_state=1).reset_index(drop=True)
data_test_2 = data_test_2.sample(frac=0.54, random_state=1).reset_index(drop=True)
print(data_train.shape, data_test_1.shape, data_test_2.shape,)

(6929, 9) (1948, 9) (2094, 9)


In [4]:
from utils import *
data_path_1, labels_1 = path_loader(data_train)
data_path_2, labels_2 = path_loader(data_test_1)
data_path_3, labels_3 = path_loader(data_test_2)

In [12]:
from tensorflow.keras import Model

model = keras.models.load_model('../model/malconv_split.h5')
# 模型输出为全连接层的输入，即去掉全连接层
layer_model = Model(inputs=model.input, outputs=model.layers[8].output)
# 分批预测，防止内存过载
batch_size = 200
for idx in range(0, 6000, batch_size):
    codes_tmp, labels_tmp = train_data_loader(data_path_1[idx:idx+batch_size], labels_1[idx:idx+batch_size])
    feature_tmp = layer_model.predict(codes_tmp)
    feature_train = feature_tmp if idx == 0 else np.concatenate((feature_train, feature_tmp), axis=0)

# feature_train = layer_model.predict(codes_train)
print(type(feature_train))

In [None]:
# 将特征向量保存为文件
np.save('feature_train.npy', feature_train)

In [5]:
feature_train = np.load('feature_train.npy')
feature_train.shape

(6000, 64)

In [6]:
data_train = data_train[:6000]
data_train['vector'] = ''
for idx in range(data_train.shape[0]):
    data_train.at[idx, 'vector'] = list(feature_train[idx])
    
data_backdoor = data_train[data_train['label']=='backdoor']
data_virus = data_train[data_train['label']=='virus']
data_worm = data_train[data_train['label']=='worm']
data_trojan = data_train[data_train['label']=='trojan']

data_backdoor.shape, data_virus.shape, data_worm.shape, data_trojan.shape

((1200, 10), (1600, 10), (1649, 10), (1551, 10))

In [7]:
data_backdoor = np.array(data_backdoor['vector'].to_list())
data_backdoor_train = data_backdoor[: 1100]
data_backdoor_val = data_backdoor[1100: ]

data_virus = np.array(data_virus['vector'].to_list())
data_virus_train = data_virus[: 1300]
data_virus_val = data_virus[1300: ]

data_worm = np.array(data_worm['vector'].to_list())
data_worm_train = data_worm[: 1400]
data_worm_val = data_worm[1400: ]

data_trojan = np.array(data_trojan['vector'].to_list())
data_trojan_train = data_trojan[: 1300]
data_trojan_val = data_trojan[1300: ]

In [2]:
from sklearn.svm import OneClassSVM

In [1]:
one_svm_worm = OneClassSVM(nu=0.001, kernel="rbf", gamma='scale').fit(data_worm_train)
one_svm_trojan = OneClassSVM(nu=0.001, kernel="rbf", gamma='scale').fit(data_trojan_train)
one_svm_backdoor = OneClassSVM(nu=0.001, kernel="rbf", gamma='scale').fit(data_backdoor_train)
one_svm_virus = OneClassSVM(nu=0.001, kernel="rbf", gamma='scale').fit(data_virus_train)

NameError: name 'OneClassSVM' is not defined

In [10]:
# one_svm_trojan = OneClassSVM(nu=0.001, kernel="rbf", gamma='scale').fit(data_trojan_train)
# percen(one_svm_trojan.predict(data_trojan_val))

In [13]:
# 对第一个测试子集进行novelty detection
# 分批预测，防止内存过载
batch_size = 200
for idx in range(0, len(data_test_1), batch_size):
    codes_tmp, labels_tmp = path_loader(data_test_1[idx:idx+batch_size])
    codes_tmp, labels_tmp = test_data_loader(codes_tmp, labels_tmp)
    feature_tmp = layer_model.predict(codes_tmp)
    feature_test = feature_tmp if idx == 0 else np.concatenate((feature_test, feature_tmp), axis=0)

data_test_1['ocsvm'] = ''

res_1 = one_svm_backdoor.predict(feature_test)
res_2 = one_svm_virus.predict(feature_test)
res_3 = one_svm_worm.predict(feature_test)
res_4 = one_svm_trojan.predict(feature_test)

# 综合不同分类器的结果判断最终结果,若分类器分类成功，则有一个结果趋近于1,另外三个趋近于-1,res>0或res==-4说明分类器并不能准确判定类型，
for idx in range(data_test_1.shape[0]):
    res = res_1[idx]+res_2[idx]+res_3[idx]+res_4[idx]
    if res == -4 or res >= 0:
        data_test_1.at[idx, 'ocsvm'] = 1
    else:
        data_test_1.at[idx, 'ocsvm'] = 0
        
data_normal = data_test_1[data_test_1['ocsvm']==0]
print(data_normal.shape)

100%|██████████| 200/200 [00:01<00:00, 176.82it/s]
100%|██████████| 200/200 [00:04<00:00, 48.34it/s]
100%|██████████| 200/200 [00:03<00:00, 51.06it/s]
100%|██████████| 200/200 [00:03<00:00, 50.06it/s]
100%|██████████| 200/200 [00:04<00:00, 49.99it/s]
100%|██████████| 200/200 [00:03<00:00, 51.67it/s]
100%|██████████| 200/200 [00:04<00:00, 49.48it/s]
100%|██████████| 200/200 [00:03<00:00, 52.98it/s]
100%|██████████| 200/200 [00:03<00:00, 51.90it/s]
100%|██████████| 148/148 [00:02<00:00, 55.83it/s]
(1742, 10)


In [15]:
from sklearn.metrics import classification_report
# 使用时间切分模型预测

codes_normal, labels_normal = path_loader(data_normal)
# 分批预测
batch_size = 200
for idx in range(0, len(codes_normal), batch_size):
    codes_tmp, labels_tmp = test_data_loader(codes_normal[idx: idx+batch_size], labels_normal[idx: idx+batch_size])
    y_pred_tmp = model.predict(codes_tmp)
    y_true_tmp = np.argmax(labels_tmp, axis=1)
    y_pred_tmp = np.argmax(y_pred_tmp, axis=1)
    y_pred = y_pred_tmp if idx == 0 else np.concatenate((y_pred, y_pred_tmp), axis=0)
    y_true = y_true_tmp if idx == 0 else np.concatenate((y_true, y_true_tmp), axis=0)

# codes_normal, labels_normal = test_data_loader(codes_normal, labels_normal)

target_names = {'trojan', 'virus', 'worm', 'backdoor'}
print(classification_report(y_true, y_pred, target_names=target_names, digits=4))

100%|██████████| 200/200 [00:01<00:00, 179.03it/s]
100%|██████████| 200/200 [00:00<00:00, 217.71it/s]
100%|██████████| 200/200 [00:00<00:00, 216.56it/s]
100%|██████████| 200/200 [00:01<00:00, 197.43it/s]
100%|██████████| 200/200 [00:01<00:00, 153.06it/s]
100%|██████████| 200/200 [00:00<00:00, 249.29it/s]
100%|██████████| 200/200 [00:00<00:00, 232.33it/s]
100%|██████████| 200/200 [00:00<00:00, 260.41it/s]
100%|██████████| 142/142 [00:00<00:00, 200.51it/s]
              precision    recall  f1-score   support

       virus     0.9337    0.9931    0.9625       581
    backdoor     0.9818    0.9963    0.9890       541
        worm     0.9863    0.9031    0.9429       320
      trojan     1.0000    0.9400    0.9691       300

    accuracy                         0.9684      1742
   macro avg     0.9754    0.9581    0.9659      1742
weighted avg     0.9697    0.9684    0.9682      1742



In [16]:
# 将模型预测错误的数据标注为OOD数据
# 分批预测，防止内存过载
batch_size = 200
for idx in range(0, len(data_test_1), batch_size):
    codes_tmp, labels_tmp = path_loader(data_test_1[idx:idx+batch_size])
    codes_tmp, labels_tmp = test_data_loader(codes_tmp, labels_tmp)
    test_pred_tmp = model.predict(codes_tmp)
    test_pred_tmp = np.argmax(test_pred_tmp, axis=1)
    test_true_tmp = np.argmax(labels_tmp, axis=1)
    bool_pred_tmp = np.equal(test_pred_tmp, test_true_tmp)
    bool_pred = bool_pred_tmp if idx == 0 else np.concatenate((bool_pred, bool_pred_tmp), axis=0)

data_test_1['id'] = bool_pred
data_test_1.to_csv('data_test_1.csv', index=0)

100%|██████████| 200/200 [00:04<00:00, 46.42it/s]
100%|██████████| 200/200 [00:03<00:00, 62.19it/s]
100%|██████████| 200/200 [00:03<00:00, 52.14it/s]
100%|██████████| 200/200 [00:03<00:00, 58.41it/s]
100%|██████████| 200/200 [00:04<00:00, 47.28it/s]
100%|██████████| 200/200 [00:03<00:00, 51.00it/s]
100%|██████████| 200/200 [00:03<00:00, 57.56it/s]
100%|██████████| 200/200 [00:03<00:00, 58.61it/s]
100%|██████████| 200/200 [00:02<00:00, 68.53it/s]
100%|██████████| 148/148 [00:02<00:00, 63.35it/s]


In [17]:
# 计算ocsvm识别的准确率

svm_pred = np.array(data_test_1['ocsvm'])
svm_pred = svm_pred.astype('int64')
ood2label = {True: 0, False: 1}
label_id = np.array(data_test_1['id'].map(lambda x: ood2label[x]))
print(classification_report(label_id, svm_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9684    0.9425    0.9553      1790
           1     0.5000    0.6519    0.5659       158

    accuracy                         0.9189      1948
   macro avg     0.7342    0.7972    0.7606      1948
weighted avg     0.9304    0.9189    0.9237      1948



In [18]:
# 对第二个测试子集进行novelty detection
# 分批预测，防止内存过载
batch_size = 200
for idx in range(0, len(data_test_2), batch_size):
    codes_tmp, labels_tmp = path_loader(data_test_2[idx:idx+batch_size])
    codes_tmp, labels_tmp = test_data_loader(codes_tmp, labels_tmp)
    feature_tmp = layer_model.predict(codes_tmp)
    feature_test_2 = feature_tmp if idx == 0 else np.concatenate((feature_test_2, feature_tmp), axis=0)

data_test_２['novelty'] = ''

res_1 = one_svm_backdoor.predict(feature_test_2)
res_2 = one_svm_virus.predict(feature_test_2)
res_3 = one_svm_worm.predict(feature_test_2)
res_4 = one_svm_trojan.predict(feature_test_2)

# 综合不同分类器的结果判断最终结果
for idx in range(data_test_２.shape[0]):
    res = res_1[idx]+res_2[idx]+res_3[idx]+res_4[idx]
    if res == -4 or res >= 0:
        data_test_２.at[idx, 'novelty'] = 1
    else:
        data_test_２.at[idx, 'novelty'] = 0
        
data_normal = data_test_２[data_test_２['novelty']==0]
print(data_normal.shape)

100%|██████████| 200/200 [00:08<00:00, 23.75it/s]
100%|██████████| 200/200 [00:09<00:00, 21.01it/s]
100%|██████████| 200/200 [00:07<00:00, 27.87it/s]
100%|██████████| 200/200 [00:06<00:00, 32.67it/s]
100%|██████████| 200/200 [00:04<00:00, 44.56it/s]
100%|██████████| 200/200 [00:08<00:00, 24.00it/s]
100%|██████████| 200/200 [00:05<00:00, 39.06it/s]
100%|██████████| 200/200 [00:04<00:00, 47.76it/s]
100%|██████████| 200/200 [00:04<00:00, 43.39it/s]
100%|██████████| 200/200 [00:04<00:00, 46.37it/s]
100%|██████████| 94/94 [00:02<00:00, 42.02it/s]
(1780, 10)


In [19]:
from sklearn.metrics import classification_report

codes_normal, labels_normal = path_loader(data_normal)

# 分批预测
batch_size = 200
for idx in range(0, len(codes_normal), batch_size):
    codes_tmp, labels_tmp = test_data_loader(codes_normal[idx: idx+batch_size], labels_normal[idx: idx+batch_size])
    y_pred_tmp = model.predict(codes_tmp)
    y_true_tmp = np.argmax(labels_tmp, axis=1)
    y_pred_tmp = np.argmax(y_pred_tmp, axis=1)
    y_pred = y_pred_tmp if idx == 0 else np.concatenate((y_pred, y_pred_tmp), axis=0)
    y_true = y_true_tmp if idx == 0 else np.concatenate((y_true, y_true_tmp), axis=0)

target_names = {'trojan', 'virus', 'worm', 'backdoor'}
print(classification_report(y_true, y_pred, target_names=target_names, digits=4))

100%|██████████| 200/200 [00:01<00:00, 193.42it/s]
100%|██████████| 200/200 [00:01<00:00, 174.05it/s]
100%|██████████| 200/200 [00:00<00:00, 221.00it/s]
100%|██████████| 200/200 [00:01<00:00, 199.21it/s]
100%|██████████| 200/200 [00:00<00:00, 224.40it/s]
100%|██████████| 200/200 [00:01<00:00, 169.56it/s]
100%|██████████| 200/200 [00:01<00:00, 193.66it/s]
100%|██████████| 200/200 [00:01<00:00, 176.09it/s]
100%|██████████| 180/180 [00:01<00:00, 166.43it/s]
              precision    recall  f1-score   support

       virus     0.9676    0.9676    0.9676       524
    backdoor     0.9580    0.9887    0.9731       530
        worm     0.9877    0.9902    0.9890       407
      trojan     0.9900    0.9342    0.9613       319

    accuracy                         0.9730      1780
   macro avg     0.9758    0.9701    0.9727      1780
weighted avg     0.9733    0.9730    0.9730      1780



In [20]:
# 将模型预测错误的数据标注为OOD数据
# 分批预测，防止内存过载
batch_size = 200
for idx in range(0, len(data_test_2), batch_size):
    codes_tmp, labels_tmp = path_loader(data_test_2[idx:idx+batch_size])
    codes_tmp, labels_tmp = test_data_loader(codes_tmp, labels_tmp)
    test_pred_tmp = model.predict(codes_tmp)
    test_pred_tmp = np.argmax(test_pred_tmp, axis=1)
    test_true_tmp = np.argmax(labels_tmp, axis=1)
    bool_pred_tmp = np.equal(test_pred_tmp, test_true_tmp)
    bool_pred = bool_pred_tmp if idx == 0 else np.concatenate((bool_pred, bool_pred_tmp), axis=0)

data_test_2['id'] = bool_pred
data_test_2.to_csv('data_test_2.csv', index=0)

100%|██████████| 200/200 [00:01<00:00, 131.16it/s]
100%|██████████| 200/200 [00:01<00:00, 179.64it/s]
100%|██████████| 200/200 [00:01<00:00, 186.90it/s]
100%|██████████| 200/200 [00:01<00:00, 188.54it/s]
100%|██████████| 200/200 [00:00<00:00, 208.06it/s]
100%|██████████| 200/200 [00:00<00:00, 219.18it/s]
100%|██████████| 200/200 [00:01<00:00, 144.77it/s]
100%|██████████| 200/200 [00:00<00:00, 208.64it/s]
100%|██████████| 200/200 [00:01<00:00, 150.11it/s]
100%|██████████| 200/200 [00:00<00:00, 259.84it/s]
100%|██████████| 94/94 [00:00<00:00, 125.94it/s]


In [21]:
# 计算ocsvm识别的准确率

svm_pred = np.array(data_test_2['novelty'])
svm_pred = svm_pred.astype('int64')
ood2label = {True: 0, False: 1}
label_id = np.array(data_test_2['id'].map(lambda x: ood2label[x]))
print(classification_report(label_id, svm_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9730    0.8956    0.9327      1934
           1     0.3567    0.7000    0.4726       160

    accuracy                         0.8806      2094
   macro avg     0.6649    0.7978    0.7026      2094
weighted avg     0.9259    0.8806    0.8975      2094

