## 利用OCSVM进行novelty detection

In [7]:
# tensorflowGPU的设置
import tensorflow as tf

# tf.config.experimental.list_physical_devices('CPU')
gpus = tf.config.experimental.list_physical_devices('GPU')
# 在同时具有CPU和GPU的设备上，优先使用GPU运算, 实际中可以使用with tf.device创建设备上下文
# 例如： with tf.decice('/CPU:0'): 
# 限制内存增长
tf.config.experimental.set_memory_growth(gpus[0], True)

In [8]:
import pandas as pd
import os
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from tensorflow import keras
from tensorflow.keras import layers, optimizers, Sequential, metrics
from tensorflow.keras.preprocessing.sequence import pad_sequences

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [9]:
# 加载数据
def judge_1(time):
    time = time[:7].replace('-', '')
    if time <= '201803':
        return True
    else:
        return False

def judge_2(time):
    time = time[:7].replace('-', '')
    if time > '201803':
        return True
    else:
        return False

name2label = {'trojan':0, 'virus':1, 'worm':2, 'backdoor':3}
data_csv = pd.read_csv('res_handle.csv')
data_2017 = data_csv[data_csv['first_seen'].apply(lambda x: x[:4]) == '2017']
data_2018 = data_csv[data_csv['first_seen'].apply(lambda x: x[:4]) == '2018']
data_2019 = data_csv[data_csv['first_seen'].apply(lambda x: x[:4]) == '2019']
    
data_2018_1 = data_2018[data_2018['first_seen'].apply(judge_1)]
data_2018_2 = data_2018[data_2018['first_seen'].apply(judge_2)]

data_train = data_2017.append(data_2018_１)
data_test_1 = data_2018_２
data_test_2 = data_2019
print("训练集，2018数据集，2019数据集统计：", data_train.shape, data_test_1.shape, data_test_2.shape)

# 对测试集进行下采样
data_train = data_train.sample(frac=1, random_state=1).reset_index(drop=True)
data_test_1 = data_test_1.sample(frac=0.7, random_state=1).reset_index(drop=True)
data_test_2 = data_test_2.sample(frac=0.5, random_state=1).reset_index(drop=True)
print("采样后训练集，2018数据集，2019数据集统计：", data_train.shape, data_test_1.shape, data_test_2.shape)

训练集，2018数据集，2019数据集统计： (6104, 5) (2642, 5) (3656, 5)
采样后训练集，2018数据集，2019数据集统计： (6104, 5) (1849, 5) (1828, 5)


In [10]:
from utils import *
data_name_1, labels_1 = name_loader(data_train)
data_name_2, labels_2 = name_loader(data_test_1)
data_name_3, labels_3 = name_loader(data_test_2)

In [11]:
# 利用labelencoder构建opcode编码器

from sklearn import preprocessing
from tqdm import tqdm

opcode_all = []
for idx in tqdm(range(data_csv.shape[0])):
    opcode_str = data_csv.at[idx, 'opcode'].split()
    opcode_all += opcode_str
    opcode_all = list(set(opcode_all))

le = preprocessing.LabelEncoder()
le.fit(opcode_all)
len(list(le.classes_))

100%|██████████| 12402/12402 [00:07<00:00, 1720.76it/s]


641

In [12]:
def train_data_loader(codes, labels, mode):
    
    if mode == 'train':    
        codes = codes[: 5100]
        labels = labels[: 5100]
    elif mode == 'val':    
        codes = codes[5100: ]
        labels = labels[5100: ]
    
    labels_res = np.eye(4)[labels]
    res_all = []
    for idx in tqdm(range(len(codes))):
        fn = codes[idx]
        op_string = data_csv[data_csv.name == fn]
        op_string = op_string.iloc[0, 1].split()
        res = list(le.transform(op_string))
        res = pad_data(res)
        res_all.append(res)
    return np.array(res_all), labels_res

In [13]:
from tensorflow.keras import Model

model = keras.models.load_model('./model/textcnn_split.h5')
codes_train, labels_train = train_data_loader(data_name_1, labels_1, 'test')
layer_model = Model(inputs=model.input, outputs=model.layers[12].output)
feature_train = layer_model.predict(codes_train)

# 将特征向量保存为文件
np.save('feature_train.npy', feature_train)

print("特征向量维度：", feature_train.shape)

100%|██████████| 6104/6104 [00:32<00:00, 187.53it/s]
特征向量维度： (6104, 64)


In [14]:
data_train['vector'] = ''
for idx in range(data_train.shape[0]):
    data_train.at[idx, 'vector'] = list(feature_train[idx])
    
data_backdoor = data_train[data_train['label']=='backdoor']
data_virus = data_train[data_train['label']=='virus']
data_worm = data_train[data_train['label']=='worm']
data_trojan = data_train[data_train['label']=='trojan']

print("backdoor，virus, worm, trojan数据统计：", data_backdoor.shape, data_virus.shape, data_worm.shape, data_trojan.shape)

data_backdoor = np.array(data_backdoor['vector'].to_list())
data_backdoor_train = data_backdoor[: 1200]
data_backdoor_val = data_backdoor[1200: ]

data_virus = np.array(data_virus['vector'].to_list())
data_virus_train = data_virus[: 1600]
data_virus_val = data_virus[1600: ]

data_worm = np.array(data_worm['vector'].to_list())
data_worm_train = data_worm[: 1100]
data_worm_val = data_worm[1100: ]

data_trojan = np.array(data_trojan['vector'].to_list())
data_trojan_train = data_trojan[: 1300]
data_trojan_val = data_trojan[1300: ]

backdoor，virus, worm, trojan数据统计： (1411, 6) (1804, 6) (1324, 6) (1565, 6)


In [15]:
from sklearn.svm import OneClassSVM
def percen(arr):
    num = sum(arr==1)
    return num / arr.size

one_svm_backdoor = OneClassSVM(nu=0.001, kernel="rbf", gamma='scale').fit(data_backdoor_train)
print("backdoor验证分数：", percen(one_svm_backdoor.predict(data_backdoor_val)))

one_svm_virus = OneClassSVM(nu=0.001, kernel="rbf", gamma='scale').fit(data_virus_train)
print("virus验证分数：", percen(one_svm_virus.predict(data_virus_val)))

one_svm_worm = OneClassSVM(nu=0.001, kernel="rbf", gamma='scale').fit(data_worm_train)
print("worm验证分数：", percen(one_svm_worm.predict(data_worm_val)))

one_svm_trojan = OneClassSVM(nu=0.001, kernel="rbf", gamma='scale').fit(data_trojan_train)
print("trojan验证分数：", percen(one_svm_trojan.predict(data_trojan_val)))

backdoor验证分数： 1.0
virus验证分数： 0.9950980392156863
worm验证分数： 0.9866071428571429
trojan验证分数： 0.9962264150943396


In [18]:
# 对第一个测试子集进行novelty detection
codes_test_1, labels_test_1 = name_loader(data_test_1)
codes_test_1, labels_test_1 = train_data_loader(codes_test_1, labels_test_1, 'test')
feature_test = layer_model.predict(codes_test_1)
data_test_1['novelty'] = ''

res_1 = one_svm_backdoor.predict(feature_test)
res_2 = one_svm_virus.predict(feature_test)
res_3 = one_svm_worm.predict(feature_test)
res_4 = one_svm_trojan.predict(feature_test)

for idx in range(data_test_1.shape[0]):
    res = res_1[idx]+res_2[idx]+res_3[idx]+res_4[idx]
    if res == -4 or res >= 0:
        data_test_1.at[idx, 'novelty'] = 1
    else:
        data_test_1.at[idx, 'novelty'] = 0
        
data_normal = data_test_1[data_test_1['novelty']==0]
print(data_normal.shape)

# textcnn模型的performance
from sklearn.metrics import classification_report

codes_normal, labels_normal = name_loader(data_normal)
codes_normal, labels_normal = train_data_loader(codes_normal, labels_normal, 'test')

y_pred = model.predict(codes_normal)
y_true = np.argmax(labels_normal, axis=1)
y_pred = np.argmax(y_pred, axis=1)
target_names = {'trojan', 'virus', 'worm', 'backdoor'}
print(classification_report(y_true, y_pred, target_names=target_names, digits=4))

# 将模型预测错误的数据标注为OOD数据

test_pred_1 = model.predict(codes_test_1)
test_pred_1 = np.argmax(test_pred_1, axis=1)
test_true_1 = np.argmax(labels_test_1, axis=1)
bool_pred = np.equal(test_pred_1, test_true_1)
data_test_1['id'] = bool_pred
data_test_1.to_csv('data_test_1.csv', index=0)

# 计算ocsvm识别的准确率

svm_pred = np.array(data_test_1['novelty'])
svm_pred = svm_pred.astype('int64')
ood2label = {True: 0, False: 1}
label_id = np.array(data_test_1['id'].map(lambda x: ood2label[x]))
print(classification_report(label_id, svm_pred, digits=4))

100%|██████████| 1849/1849 [00:12<00:00, 145.76it/s]
  2%|▏         | 29/1328 [00:00<00:06, 207.14it/s](1328, 6)
100%|██████████| 1328/1328 [00:11<00:00, 120.30it/s]
              precision    recall  f1-score   support

       virus     0.9877    0.9907    0.9892       323
    backdoor     0.9887    1.0000    0.9943       526
        worm     1.0000    0.8239    0.9034       159
      trojan     0.9238    0.9844    0.9531       320

    accuracy                         0.9729      1328
   macro avg     0.9750    0.9497    0.9600      1328
weighted avg     0.9742    0.9729    0.9723      1328

              precision    recall  f1-score   support

           0     0.9729    0.7840    0.8683      1648
           1     0.3167    0.8209    0.4571       201

    accuracy                         0.7880      1849
   macro avg     0.6448    0.8024    0.6627      1849
weighted avg     0.9016    0.7880    0.8236      1849



In [19]:
# 对第二个测试子集进行novelty detection
codes_test_２, labels_test_２ = name_loader(data_test_2)
codes_test_２, labels_test_２ = train_data_loader(codes_test_２, labels_test_２, 'test')
feature_test_２ = layer_model.predict(codes_test_２)
data_test_２['novelty'] = ''

res_1 = one_svm_backdoor.predict(feature_test_２)
res_2 = one_svm_virus.predict(feature_test_２)
res_3 = one_svm_worm.predict(feature_test_２)
res_4 = one_svm_trojan.predict(feature_test_２)

for idx in range(data_test_２.shape[0]):
    res = res_1[idx]+res_2[idx]+res_3[idx]+res_4[idx]
    if res == -4 or res >= 0:
        data_test_２.at[idx, 'novelty'] = 1
    else:
        data_test_２.at[idx, 'novelty'] = 0
        
data_normal = data_test_２[data_test_２['novelty']==0]
print(data_normal.shape)

# text_cnn模型performance
from sklearn.metrics import classification_report

codes_normal, labels_normal = name_loader(data_normal)
codes_normal, labels_normal = train_data_loader(codes_normal, labels_normal, 'test')

y_pred = model.predict(codes_normal)
y_true = np.argmax(labels_normal, axis=1)
y_pred = np.argmax(y_pred, axis=1)
target_names = {'trojan', 'virus', 'worm', 'backdoor'}
print(classification_report(y_true, y_pred, target_names=target_names, digits=4))

# 将模型预测错误的数据标注为OOD数据

test_pred_2 = model.predict(codes_test_2)
test_pred_2 = np.argmax(test_pred_2, axis=1)
test_true_2 = np.argmax(labels_test_2, axis=1)
bool_pred = np.equal(test_pred_2, test_true_2)
data_test_2['id'] = bool_pred
data_test_2.to_csv('data_test_2.csv', index=0)

# 计算ocsvm识别的准确率

svm_pred = np.array(data_test_2['novelty'])
svm_pred = svm_pred.astype('int64')
ood2label = {True: 0, False: 1}
label_id = np.array(data_test_2['id'].map(lambda x: ood2label[x]))
print(classification_report(label_id, svm_pred, digits=4))

100%|██████████| 1828/1828 [00:07<00:00, 250.59it/s]
  2%|▏         | 21/1205 [00:00<00:08, 142.79it/s](1205, 6)
100%|██████████| 1205/1205 [00:05<00:00, 217.39it/s]
              precision    recall  f1-score   support

       virus     0.9907    0.7970    0.8833       266
    backdoor     0.8628    0.9971    0.9251       347
        worm     0.9945    0.6830    0.8098       265
      trojan     0.7990    0.9969    0.8871       327

    accuracy                         0.8838      1205
   macro avg     0.9118    0.8685    0.8763      1205
weighted avg     0.9027    0.8838    0.8802      1205

              precision    recall  f1-score   support

           0     0.8838    0.7355    0.8029      1448
           1     0.3852    0.6316    0.4786       380

    accuracy                         0.7139      1828
   macro avg     0.6345    0.6835    0.6407      1828
weighted avg     0.7802    0.7139    0.7354      1828

