## 进行时间切分实验

In [1]:
import tensorflow as tf 

# Print the installed TensorFlow version so the environment is recorded with the notebook.
print(tf.__version__)

2.4.1


In [5]:
import pandas as pd
import os
import numpy as np
import tensorflow as tf
from tqdm import tqdm

# Set the TF_CPP_MIN_LOG_LEVEL environment variable to '2' (intended to quiet TensorFlow logging).
# NOTE(review): this assignment runs after `import tensorflow` above; confirm it still
# takes effect, otherwise set it before the first TensorFlow import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [6]:
# Load the data. To balance the split sizes, samples first seen in 2018 up to and including March are merged with the 2017 samples.
def judge_1(time):
    """Return True when the timestamp's year-month is 2018-03 or earlier.

    Args:
        time: Timestamp string whose first seven characters are ``YYYY-MM``.

    Returns:
        bool: ``True`` if the ``YYYYMM`` prefix compares ``<= '201803'``
        (plain string comparison), ``False`` otherwise.
    """
    year_month = time[:7].replace('-', '')
    return year_month <= '201803'

def judge_2(time):
    """Return True when the timestamp's year-month is later than 2018-03.

    Args:
        time: Timestamp string whose first seven characters are ``YYYY-MM``.

    Returns:
        bool: ``True`` if the ``YYYYMM`` prefix compares ``> '201803'``
        (plain string comparison), ``False`` otherwise.
    """
    year_month = time[:7].replace('-', '')
    return year_month > '201803'

# Class name -> integer index mapping.
name2label = {'trojan':0, 'virus':1, 'worm':2, 'backdoor':3}
data_csv = pd.read_csv('../../csv/dataset_handled.csv')

# Partition the samples by the year prefix (first four characters) of `first_seen`.
first_seen_year = data_csv['first_seen'].apply(lambda x: x[:4])
data_2017 = data_csv[first_seen_year == '2017']
data_2018 = data_csv[first_seen_year == '2018']
data_2019 = data_csv[first_seen_year == '2019']

# Split 2018 at the end of March: judge_1 keeps <= 2018-03, judge_2 keeps > 2018-03.
data_2018_1 = data_2018[data_2018['first_seen'].apply(judge_1)]
data_2018_2 = data_2018[data_2018['first_seen'].apply(judge_2)]

# Training set: 2017 plus early 2018. pd.concat replaces DataFrame.append, which is
# deprecated and no longer exists in pandas 2.x; like append, it keeps the original index.
data_train = pd.concat([data_2017, data_2018_1])
# Test set 1: the rest of 2018. Test set 2: all of 2019.
data_test_1 = data_2018_2
data_test_2 = data_2019
print(data_train.shape, data_test_1.shape, data_test_2.shape)

(6929, 9) (2668, 9) (3878, 9)


In [9]:
from collections import Counter

# Print the per-class sample counts of the training split and both test splits.
for split in (data_train, data_test_1, data_test_2):
    print(Counter(split.label))

Counter({'worm': 1900, 'virus': 1865, 'trojan': 1783, 'backdoor': 1381})
Counter({'trojan': 817, 'virus': 735, 'worm': 559, 'backdoor': 557})
Counter({'trojan': 1000, 'virus': 1000, 'worm': 1000, 'backdoor': 878})


In [7]:
# Write both test splits to CSV files in the current working directory.
for csv_name, split in (("data_test_1.csv", data_test_1), ("data_test_2.csv", data_test_2)):
    split.to_csv(csv_name)

In [10]:
# Shuffle the training set and down-sample the two test sets.
# Each split is sampled from its unsampled source frame (the same frames the split was
# built from in the loading cell), so re-running this cell gives the same result instead
# of shrinking the test sets again or reshuffling an already shuffled training set.
data_train = pd.concat([data_2017, data_2018_1]).sample(frac=1, random_state=1).reset_index(drop=True)
data_test_1 = data_2018_2.sample(frac=0.73, random_state=1).reset_index(drop=True)
data_test_2 = data_2019.sample(frac=0.54, random_state=1).reset_index(drop=True)
print(data_train.shape, data_test_1.shape, data_test_2.shape)

(6929, 9) (1948, 9) (2094, 9)


In [11]:
from collections import Counter

# Label distribution of each split, followed by the full dataset for reference.
# The identifiers use plain ASCII digits: the original spelled `data_test_1` with a
# full-width digit, which only resolved because Python NFKC-normalises identifiers.
tuple(
    Counter(frame['label'])
    for frame in (data_train, data_test_1, data_test_2, data_csv)
)

(Counter({'worm': 1900, 'trojan': 1783, 'backdoor': 1381, 'virus': 1865}),
 Counter({'worm': 415, 'trojan': 589, 'virus': 541, 'backdoor': 403}),
 Counter({'worm': 523, 'backdoor': 480, 'trojan': 553, 'virus': 538}),
 Counter({'backdoor': 2816, 'trojan': 3600, 'virus': 3600, 'worm': 3459}))

In [12]:
# NOTE(review): the wildcard import hides which names come from `utils`; `path_loader`
# is presumably one of them — confirm, and consider importing the needed names explicitly.
from utils import *
# path_loader returns two values per split, unpacked here as paths and labels
# (meaning inferred from the variable names — TODO confirm against utils).
data_path_1, labels_1 = path_loader(data_train)
data_path_2, labels_2 = path_loader(data_test_1)
data_path_3, labels_3 = path_loader(data_test_2)

In [7]:
# Build the training/validation input pipelines and train the model.
from tensorflow import keras
from tensorflow.keras import layers, optimizers, Sequential, metrics
from tensorflow.keras import Input, Model, regularizers
from malconv import get_malconv


batch_size = 128
# Both pipelines stream from train_data_generator over the same paths/labels; the third
# argument ('train' / 'val') presumably selects the subset — confirm in utils.
# Each element is declared as a variable-length float32 vector plus an int32 vector of length 4.
db_train = tf.data.Dataset.from_generator(train_data_generator, args=[data_path_1, labels_1, 'train'],
                                          output_types=(tf.float32, tf.int32), output_shapes=((None,), (4, )))
db_train = db_train.batch(batch_size).repeat(9)

db_val = tf.data.Dataset.from_generator(train_data_generator, args=[data_path_1, labels_1, 'val'],
                                        output_types=(tf.float32, tf.int32), output_shapes=((None,), (4, )))
db_val = db_val.batch(batch_size).repeat(8)

model = get_malconv()
# Fixed: the loss lives in `keras.losses`; the original `keras..CategoricalCrossentropy`
# was a syntax error.
# NOTE(review): from_logits=True is only right if get_malconv() outputs raw logits
# (no final softmax) — confirm against malconv.py.
model.compile(optimizer=keras.optimizers.Adam(),
              loss=keras.losses.CategoricalCrossentropy(from_logits=True),
              metrics=[keras.metrics.CategoricalAccuracy()])

# `history` is consumed by the plotting cell further down.
history = model.fit(db_train,
                    epochs=7,
                    steps_per_epoch=46,
                    validation_data=db_val,
                    validation_steps=7)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [13]:
# Plot the loss and accuracy curves.
def PlotLoss(model_name, history):
    """Plot training/validation loss and accuracy side by side and save the figure.

    Args:
        model_name: File stem; the figure is written to ``./figures/<model_name>.png``.
        history: Object whose ``history`` attribute maps metric names to per-epoch
            sequences. The keys ``loss``, ``val_loss``, ``categorical_accuracy`` and
            ``val_categorical_accuracy`` are read.

    Raises:
        KeyError: If one of the four metric keys is missing from ``history.history``.
    """
    # Imported locally because no cell visible in this notebook imports pyplot.
    import os
    import matplotlib.pyplot as plt

    curves = history.history
    fig = plt.figure(figsize=(10, 3.5), dpi=300)

    # Left panel: loss.
    ax_loss = fig.add_subplot(1, 2, 1)
    ax_loss.plot(curves['loss'])
    ax_loss.plot(curves['val_loss'])
    ax_loss.set_title("model loss")
    ax_loss.set_ylabel("loss")
    ax_loss.set_xlabel("epoch")
    ax_loss.legend(["train", "val"], loc="upper right")

    # Right panel: accuracy.
    ax_acc = fig.add_subplot(1, 2, 2)
    ax_acc.plot(curves['categorical_accuracy'])
    ax_acc.plot(curves['val_categorical_accuracy'])
    ax_acc.set_title("model accuracy")
    ax_acc.set_ylabel("accuracy")
    ax_acc.set_xlabel("epoch")
    ax_acc.legend(["train", "val"], loc="lower right")

    # Create the output directory first so saving does not fail on a fresh checkout.
    os.makedirs("./figures", exist_ok=True)
    fig.savefig("./figures/" + model_name + '.png')

# Needs `history` from the model.fit(...) training cell in the same kernel session;
# without it this call raises NameError (as in the captured output below).
PlotLoss('malconv_split', history)

NameError: name 'history' is not defined

In [9]:
# Save the model to disk; the evaluation cells below load it back from this same path.
model.save('../model/malconv_split.h5')

In [14]:
# Evaluate on the first test split (data_path_2 / labels_2, built from data_test_1).
from sklearn.metrics import classification_report
from tensorflow import keras
model = keras.models.load_model('../model/malconv_split.h5')
# Predict in batches and collect the per-batch class indices. Collecting into lists and
# concatenating once avoids repeated array copies, and avoids silently reusing a stale
# y_pred / y_true from an earlier cell.
batch_size = 100
pred_chunks, true_chunks = [], []
for idx in range(0, len(data_path_2), batch_size):
    code_tmp, label_tmp = test_data_loader(data_path_2[idx:idx+batch_size], labels_2[idx:idx+batch_size])
    pred_chunks.append(np.argmax(model.predict(code_tmp), axis=1))
    true_chunks.append(np.argmax(label_tmp, axis=1))
y_pred = np.concatenate(pred_chunks, axis=0)
y_true = np.concatenate(true_chunks, axis=0)
# Must be an ordered list: a set has arbitrary iteration order, so report rows could be
# attached to the wrong class. The order follows name2label (trojan=0 ... backdoor=3).
# NOTE(review): confirm utils.path_loader encodes labels with that same mapping.
target_names = ['trojan', 'virus', 'worm', 'backdoor']

print(classification_report(y_true, y_pred, target_names=target_names, digits=4))

100%|██████████| 100/100 [00:01<00:00, 65.43it/s]
100%|██████████| 100/100 [00:01<00:00, 60.08it/s]
100%|██████████| 100/100 [00:01<00:00, 63.91it/s]
100%|██████████| 100/100 [00:01<00:00, 74.15it/s]
100%|██████████| 100/100 [00:01<00:00, 72.04it/s]
100%|██████████| 100/100 [00:01<00:00, 64.46it/s]
100%|██████████| 100/100 [00:01<00:00, 60.88it/s]
100%|██████████| 100/100 [00:01<00:00, 66.58it/s]
100%|██████████| 100/100 [00:01<00:00, 53.95it/s]
100%|██████████| 100/100 [00:01<00:00, 67.25it/s]
100%|██████████| 100/100 [00:01<00:00, 51.36it/s]
100%|██████████| 100/100 [00:02<00:00, 45.68it/s]
100%|██████████| 100/100 [00:03<00:00, 31.12it/s]
100%|██████████| 100/100 [00:02<00:00, 38.35it/s]
100%|██████████| 100/100 [00:02<00:00, 45.98it/s]
100%|██████████| 100/100 [00:04<00:00, 20.15it/s]
100%|██████████| 100/100 [00:02<00:00, 36.23it/s]
100%|██████████| 100/100 [00:02<00:00, 39.76it/s]
100%|██████████| 100/100 [00:04<00:00, 20.33it/s]
100%|██████████| 48/48 [00:01<00:00, 35.07it/s]
  

In [15]:
# Evaluate on the 2019 test split (data_path_3 / labels_3, built from data_test_2).
# Uses `model` and `classification_report` from the previous evaluation cell.
# Predict in batches and collect the per-batch class indices. Collecting into lists and
# concatenating once avoids repeated array copies, and avoids silently reusing a stale
# y_pred / y_true from an earlier cell.
batch_size = 100
pred_chunks, true_chunks = [], []
for idx in range(0, len(data_path_3), batch_size):
    code_tmp, label_tmp = test_data_loader(data_path_3[idx:idx+batch_size], labels_3[idx:idx+batch_size])
    pred_chunks.append(np.argmax(model.predict(code_tmp), axis=1))
    true_chunks.append(np.argmax(label_tmp, axis=1))
y_pred = np.concatenate(pred_chunks, axis=0)
y_true = np.concatenate(true_chunks, axis=0)

# Must be an ordered list: a set has arbitrary iteration order, so report rows could be
# attached to the wrong class. The order follows name2label (trojan=0 ... backdoor=3).
# NOTE(review): confirm utils.path_loader encodes labels with that same mapping.
target_names = ['trojan', 'virus', 'worm', 'backdoor']
print(classification_report(y_true, y_pred, target_names=target_names, digits=4))

100%|██████████| 100/100 [00:04<00:00, 22.61it/s]
100%|██████████| 100/100 [00:03<00:00, 25.45it/s]
100%|██████████| 100/100 [00:05<00:00, 19.00it/s]
100%|██████████| 100/100 [00:03<00:00, 27.47it/s]
100%|██████████| 100/100 [00:03<00:00, 29.13it/s]
100%|██████████| 100/100 [00:03<00:00, 27.45it/s]
100%|██████████| 100/100 [00:04<00:00, 23.72it/s]
100%|██████████| 100/100 [00:03<00:00, 27.86it/s]
100%|██████████| 100/100 [00:03<00:00, 30.38it/s]
100%|██████████| 100/100 [00:02<00:00, 35.15it/s]
100%|██████████| 100/100 [00:03<00:00, 31.48it/s]
100%|██████████| 100/100 [00:03<00:00, 32.11it/s]
100%|██████████| 100/100 [00:04<00:00, 24.90it/s]
100%|██████████| 100/100 [00:02<00:00, 35.43it/s]
100%|██████████| 100/100 [00:02<00:00, 37.35it/s]
100%|██████████| 100/100 [00:02<00:00, 36.69it/s]
100%|██████████| 100/100 [00:02<00:00, 39.93it/s]
100%|██████████| 100/100 [00:03<00:00, 31.27it/s]
100%|██████████| 100/100 [00:02<00:00, 36.39it/s]
100%|██████████| 100/100 [00:02<00:00, 41.23it/s]


In [17]:
# Evaluate on the training data (data_path_1 / labels_1).
from sklearn.metrics import classification_report
from tensorflow import keras
model = keras.models.load_model('../model/malconv_split.h5')

# Predict in batches and collect the per-batch class indices. Collecting into lists and
# concatenating once avoids repeated array copies, and avoids silently reusing a stale
# y_pred / y_true from an earlier cell.
batch_size = 100
pred_chunks, true_chunks = [], []
for idx in range(0, len(data_path_1), batch_size):
    code_tmp, label_tmp = test_data_loader(data_path_1[idx:idx+batch_size], labels_1[idx:idx+batch_size])
    pred_chunks.append(np.argmax(model.predict(code_tmp), axis=1))
    true_chunks.append(np.argmax(label_tmp, axis=1))
y_pred = np.concatenate(pred_chunks, axis=0)
y_true = np.concatenate(true_chunks, axis=0)

# Must be an ordered list: a set has arbitrary iteration order, so report rows could be
# attached to the wrong class. The order follows name2label (trojan=0 ... backdoor=3).
# NOTE(review): confirm utils.path_loader encodes labels with that same mapping.
target_names = ['trojan', 'virus', 'worm', 'backdoor']
print(classification_report(y_true, y_pred, target_names=target_names, digits=6))

100%|██████████| 100/100 [00:03<00:00, 25.84it/s]
100%|██████████| 100/100 [00:04<00:00, 23.55it/s]
100%|██████████| 100/100 [00:03<00:00, 27.13it/s]
100%|██████████| 100/100 [00:03<00:00, 25.77it/s]
100%|██████████| 100/100 [00:03<00:00, 29.38it/s]
100%|██████████| 100/100 [00:03<00:00, 26.98it/s]
100%|██████████| 100/100 [00:04<00:00, 24.63it/s]
100%|██████████| 100/100 [00:04<00:00, 22.85it/s]
100%|██████████| 100/100 [00:02<00:00, 33.67it/s]
100%|██████████| 100/100 [00:03<00:00, 28.74it/s]
100%|██████████| 100/100 [00:04<00:00, 22.67it/s]
100%|██████████| 100/100 [00:04<00:00, 21.49it/s]
100%|██████████| 100/100 [00:04<00:00, 20.39it/s]
100%|██████████| 100/100 [00:04<00:00, 22.85it/s]
100%|██████████| 100/100 [00:06<00:00, 16.01it/s]
100%|██████████| 100/100 [00:07<00:00, 13.02it/s]
100%|██████████| 100/100 [00:05<00:00, 16.75it/s]
100%|██████████| 100/100 [00:05<00:00, 16.85it/s]
100%|██████████| 100/100 [00:09<00:00, 10.77it/s]
100%|██████████| 100/100 [00:07<00:00, 12.77it/s]
