## 利用OCSVM进行novelty detection

In [1]:
import pandas as pd
import os
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from tensorflow import keras
from tensorflow.keras import layers, optimizers, Sequential, metrics
from tensorflow.keras.preprocessing.sequence import pad_sequences

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [2]:
# 加载数据
def judge_1(time):
    time = time[:7].replace('-', '')
    if time <= '201803':
        return True
    else:
        return False

def judge_2(time):
    time = time[:7].replace('-', '')
    if time > '201803':
        return True
    else:
        return False

name2label = {'trojan':0, 'virus':1, 'worm':2, 'backdoor':3}
data_csv = pd.read_csv('res_handle.csv')
data_2017 = data_csv[data_csv['first_seen'].apply(lambda x: x[:4]) == '2017']
data_2018 = data_csv[data_csv['first_seen'].apply(lambda x: x[:4]) == '2018']
data_2019 = data_csv[data_csv['first_seen'].apply(lambda x: x[:4]) == '2019']
    
data_2018_1 = data_2018[data_2018['first_seen'].apply(judge_1)]
data_2018_2 = data_2018[data_2018['first_seen'].apply(judge_2)]

data_train = data_2017.append(data_2018_１)
data_test_1 = data_2018_２
data_test_2 = data_2019
print(data_train.shape, data_test_1.shape, data_test_2.shape)

(6104, 5) (2642, 5) (3656, 5)


In [3]:
# 对测试集进行下采样
data_train = data_train.sample(frac=1, random_state=1).reset_index(drop=True)
data_test_1 = data_test_1.sample(frac=0.7, random_state=1).reset_index(drop=True)
data_test_2 = data_test_2.sample(frac=0.5, random_state=1).reset_index(drop=True)
print(data_train.shape, data_test_1.shape, data_test_2.shape)

(6104, 5) (1849, 5) (1828, 5)


In [4]:
from utils import *
data_path_1, labels_1 = path_loader(data_train)
data_path_2, labels_2 = path_loader(data_test_1)
data_path_3, labels_3 = path_loader(data_test_2)

In [5]:
# 利用labelencoder构建opcode编码器

from sklearn import preprocessing
from tqdm import tqdm

opcode_all = []
for idx in tqdm(range(data_csv.shape[0])):
    opcode_str = data_csv.at[idx, 'opcode'].split()
    opcode_all += opcode_str
    opcode_all = list(set(opcode_all))

le = preprocessing.LabelEncoder()
le.fit(opcode_all)
len(list(le.classes_))

100%|██████████| 12402/12402 [00:06<00:00, 1839.10it/s]


641

In [6]:
def train_data_loader(codes, labels, mode):
    
    if mode == 'train':    
        codes = codes[: 5100]
        labels = labels[: 5100]
    elif mode == 'val':    
        codes = codes[5100: ]
        labels = labels[5100: ]
    
    labels_res = np.eye(4)[labels]
    res_all = []
    for idx in tqdm(range(len(codes))):
        fn = codes[idx]
        op_string = data_csv[data_csv.name == fn]
        op_string = op_string.iloc[0, 1].split()
        res = list(le.transform(op_string))
        res = pad_data(res)
        res_all.append(res)
    return np.array(res_all), labels_res

In [7]:
from tensorflow.keras import Model

model = keras.models.load_model('./model/textcnn_split.h5')
codes_train, labels_train = train_data_loader(data_path_1, labels_1, 'test')
layer_model = Model(inputs=model.input, outputs=model.layers[12].output)
feature_train = layer_model.predict(codes_train)

100%|██████████| 6104/6104 [00:27<00:00, 222.87it/s]


In [8]:
# 将特征向量保存为文件
np.save('feature_train.npy', feature_train)

In [9]:
feature_train.shape

(6104, 64)

In [10]:
data_train['vector'] = ''
for idx in range(data_train.shape[0]):
    data_train.at[idx, 'vector'] = list(feature_train[idx])
    
data_backdoor = data_train[data_train['label']=='backdoor']
data_virus = data_train[data_train['label']=='virus']
data_worm = data_train[data_train['label']=='worm']
data_trojan = data_train[data_train['label']=='trojan']

data_backdoor.shape, data_virus.shape, data_worm.shape, data_trojan.shape

((1411, 6), (1804, 6), (1324, 6), (1565, 6))

In [11]:
data_backdoor = np.array(data_backdoor['vector'].to_list())
data_backdoor_train = data_backdoor[: 1200]
data_backdoor_val = data_backdoor[1200: ]

data_virus = np.array(data_virus['vector'].to_list())
data_virus_train = data_virus[: 1600]
data_virus_val = data_virus[1600: ]

data_worm = np.array(data_worm['vector'].to_list())
data_worm_train = data_worm[: 1100]
data_worm_val = data_worm[1100: ]

data_trojan = np.array(data_trojan['vector'].to_list())
data_trojan_train = data_trojan[: 1300]
data_trojan_val = data_trojan[1300: ]

In [12]:
from sklearn.svm import OneClassSVM
def percen(arr):
    num = sum(arr==1)
    return num / arr.size

one_svm_backdoor = OneClassSVM(nu=0.001, kernel="rbf", gamma='scale').fit(data_backdoor_train)
percen(one_svm_backdoor.predict(data_backdoor_val))

1.0

In [13]:
one_svm_virus = OneClassSVM(nu=0.001, kernel="rbf", gamma='scale').fit(data_virus_train)
percen(one_svm_virus.predict(data_virus_val))

1.0

In [14]:
one_svm_worm = OneClassSVM(nu=0.001, kernel="rbf", gamma='scale').fit(data_worm_train)
percen(one_svm_worm.predict(data_worm_val))

0.9821428571428571

In [15]:
one_svm_trojan = OneClassSVM(nu=0.001, kernel="rbf", gamma='scale').fit(data_trojan_train)
percen(one_svm_trojan.predict(data_trojan_val))

1.0

In [16]:
# 对第一个测试子集进行novelty detection
codes_test_1, labels_test_1 = path_loader(data_test_1)
codes_test_1, labels_test_1 = train_data_loader(codes_test_1, labels_test_1, 'test')
feature_test = layer_model.predict(codes_test_1)
data_test_1['novelty'] = ''

res_1 = one_svm_backdoor.predict(feature_test)
res_2 = one_svm_virus.predict(feature_test)
res_3 = one_svm_worm.predict(feature_test)
res_4 = one_svm_trojan.predict(feature_test)

for idx in range(data_test_1.shape[0]):
    res = res_1[idx]+res_2[idx]+res_3[idx]+res_4[idx]
    if res == -4 or res >= 0:
        data_test_1.at[idx, 'novelty'] = 1
    else:
        data_test_1.at[idx, 'novelty'] = 0
        
data_normal = data_test_1[data_test_1['novelty']==0]
print(data_normal.shape)

100%|██████████| 1849/1849 [00:13<00:00, 137.98it/s]


(1454, 6)


In [17]:
from sklearn.metrics import classification_report

codes_normal, labels_normal = path_loader(data_normal)
codes_normal, labels_normal = train_data_loader(codes_normal, labels_normal, 'test')

y_pred = model.predict(codes_normal)
y_true = np.argmax(labels_normal, axis=1)
y_pred = np.argmax(y_pred, axis=1)
target_names = {'trojan', 'virus', 'worm', 'backdoor'}
print(classification_report(y_true, y_pred, target_names=target_names))

100%|██████████| 1454/1454 [00:10<00:00, 133.50it/s]


              precision    recall  f1-score   support

      trojan       0.99      1.00      0.99       455
    backdoor       0.99      1.00      1.00       521
       virus       1.00      0.88      0.94       150
        worm       0.95      0.98      0.96       328

    accuracy                           0.98      1454
   macro avg       0.98      0.96      0.97      1454
weighted avg       0.98      0.98      0.98      1454



In [18]:
# 将模型预测错误的数据标注为OOD数据

test_pred_1 = model.predict(codes_test_1)
test_pred_1 = np.argmax(test_pred_1, axis=1)
test_true_1 = np.argmax(labels_test_1, axis=1)
bool_pred = np.equal(test_pred_1, test_true_1)
data_test_1['id'] = bool_pred
data_test_1.to_csv('data_test_1.csv', index=0)

In [19]:
# 计算ocsvm识别的准确率

svm_pred = np.array(data_test_1['novelty'])
svm_pred = svm_pred.astype('int64')
ood2label = {True: 0, False: 1}
label_id = np.array(data_test_1['id'].map(lambda x: ood2label[x]))
print(classification_report(label_id, svm_pred))

              precision    recall  f1-score   support

           0       0.98      0.87      0.92      1643
           1       0.45      0.87      0.60       206

    accuracy                           0.87      1849
   macro avg       0.72      0.87      0.76      1849
weighted avg       0.92      0.87      0.89      1849



In [20]:
# 对第二个测试子集进行novelty detection
codes_test_２, labels_test_２ = path_loader(data_test_２)
codes_test_２, labels_test_２ = train_data_loader(codes_test_２, labels_test_２, 'test')
feature_test_２ = layer_model.predict(codes_test_２)
data_test_２['novelty'] = ''

res_1 = one_svm_backdoor.predict(feature_test_２)
res_2 = one_svm_virus.predict(feature_test_２)
res_3 = one_svm_worm.predict(feature_test_２)
res_4 = one_svm_trojan.predict(feature_test_２)

for idx in range(data_test_２.shape[0]):
    res = res_1[idx]+res_2[idx]+res_3[idx]+res_4[idx]
    if res == -4 or res >= 0:
        data_test_２.at[idx, 'novelty'] = 1
    else:
        data_test_２.at[idx, 'novelty'] = 0
        
data_normal = data_test_２[data_test_２['novelty']==0]
print(data_normal.shape)

100%|██████████| 1828/1828 [00:06<00:00, 304.01it/s]


(1123, 6)


In [21]:
from sklearn.metrics import classification_report

codes_normal, labels_normal = path_loader(data_normal)
codes_normal, labels_normal = train_data_loader(codes_normal, labels_normal, 'test')

y_pred = model.predict(codes_normal)
y_true = np.argmax(labels_normal, axis=1)
y_pred = np.argmax(y_pred, axis=1)
target_names = {'trojan', 'virus', 'worm', 'backdoor'}
print(classification_report(y_true, y_pred, target_names=target_names))

100%|██████████| 1123/1123 [00:04<00:00, 262.01it/s]


              precision    recall  f1-score   support

    backdoor       0.99      0.80      0.88       292
        worm       0.86      1.00      0.92       319
      trojan       0.98      0.88      0.93       182
       virus       0.94      1.00      0.97       330

    accuracy                           0.93      1123
   macro avg       0.94      0.92      0.93      1123
weighted avg       0.94      0.93      0.93      1123



In [22]:
# 将模型预测错误的数据标注为OOD数据

test_pred_2 = model.predict(codes_test_2)
test_pred_2 = np.argmax(test_pred_2, axis=1)
test_true_2 = np.argmax(labels_test_2, axis=1)
bool_pred = np.equal(test_pred_2, test_true_2)
data_test_2['id'] = bool_pred
data_test_2.to_csv('data_test_2.csv', index=0)

# 计算ocsvm识别的准确率

svm_pred = np.array(data_test_2['novelty'])
svm_pred = svm_pred.astype('int64')
ood2label = {True: 0, False: 1}
label_id = np.array(data_test_2['id'].map(lambda x: ood2label[x]))
print(classification_report(label_id, svm_pred))

              precision    recall  f1-score   support

           0       0.93      0.69      0.79      1505
           1       0.34      0.75      0.47       323

    accuracy                           0.70      1828
   macro avg       0.64      0.72      0.63      1828
weighted avg       0.83      0.70      0.74      1828

