In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras import layers, optimizers, Sequential, metrics

tf.__version__

'2.0.0'

In [2]:
# Further split the 2018 data into two periods: up to and including March, and after March

def judge_1(time):
    """Return True when the timestamp falls in March 2018 or earlier.

    Only the first seven characters of ``time`` are inspected; the dash is
    removed and the result is compared as a string against ``'201803'``.
    """
    year_month = time[:7].replace('-', '')
    return year_month <= '201803'

def judge_2(time):
    """Return True when the timestamp falls strictly after March 2018.

    Only the first seven characters of ``time`` are inspected; the dash is
    removed and the result is compared as a string against ``'201803'``.
    """
    year_month = time[:7].replace('-', '')
    return year_month > '201803'

    
data_csv = pd.read_csv('../../../csv/dataset.csv')

# First four characters of `first_seen`, computed once and reused for each
# per-year selection below.
first_seen_year = data_csv['first_seen'].apply(lambda x: x[:4])
data_2017 = data_csv[first_seen_year == '2017']
data_2018 = data_csv[first_seen_year == '2018']
data_2019 = data_csv[first_seen_year == '2019']

# 2018 is divided at the end of March (see judge_1 / judge_2).
data_2018_1 = data_2018[data_2018['first_seen'].apply(judge_1)]
data_2018_2 = data_2018[data_2018['first_seen'].apply(judge_2)]

# Training pool: all 2017 rows, shuffled with a fixed seed and re-indexed.
data_train = data_2017.sample(frac=1, random_state=1).reset_index(drop=True)

In [3]:
# Class name -> integer class index.
name2label = {'trojan':0, 'virus':1, 'worm':2, 'backdoor':3}

# Training data: sample names and their integer labels, as plain lists.
codes_train = data_train['name'].to_list()
labels_train = (
    data_train['label']
    .map(lambda label: name2label[label])
    .to_list()
)

In [4]:
def pad_data(res, max_len=200000):
    """Force ``res`` to exactly ``max_len`` items.

    Longer sequences are truncated, shorter ones are right-padded with
    zeros, and a sequence that already has the right length is returned
    unchanged (the same object).
    """
    shortfall = max_len - len(res)
    if shortfall < 0:
        return res[:max_len]
    if shortfall > 0:
        return res + [0] * shortfall
    return res


def load_data(codes, labels, mode, train_size=3000):
    """Yield ``(padded_bytes, one_hot_label)`` pairs for one split of the data.

    Args:
        codes: Sequence of sample names (``str`` or ``bytes``). The first
            eight characters of each name are dropped and the remainder is
            appended to ``'../../../dataset/'`` to form the file path.
        labels: Sequence of integer class indices in ``[0, 4)``, aligned
            with ``codes``.
        mode: ``'train'`` selects the first ``train_size`` samples, ``'val'``
            selects the rest; any other value selects everything. May be
            ``str`` or ``bytes``.
        train_size: Number of leading samples that make up the training
            split (default 3000).

    Yields:
        Tuples of (list of byte values padded/truncated by ``pad_data``,
        one-hot label row of length 4). Files that do not exist are reported
        on stdout and skipped.
    """
    # tf.data.Dataset.from_generator delivers string arguments as bytes
    # (which is why the file names need decoding below). Without this
    # normalisation b'train' == 'train' is False and no split is applied.
    if isinstance(mode, np.ndarray):
        mode = mode.item()
    if isinstance(mode, bytes):
        mode = mode.decode()

    if mode == 'train':
        codes = codes[:train_size]
        labels = labels[:train_size]
    elif mode == 'val':
        codes = codes[train_size:]
        labels = labels[train_size:]

    labels = np.eye(4)[labels]
    for idx in range(len(codes)):
        fn = codes[idx]
        if isinstance(fn, bytes):
            fn = fn.decode()
        fn = '../../../dataset/' + fn[8:]
        if not os.path.isfile(fn):
            print(fn, 'not exist')
            continue
        with open(fn, 'rb') as f:
            res = pad_data(list(f.read()))
        yield res, labels[idx]

                
def load_test_data(codes, labels):
    """Yield ``(padded_bytes, label)`` pairs for every existing sample file.

    Args:
        codes: Sequence of sample names, ``str`` or ``bytes`` (names arrive
            as bytes when this generator is driven by
            ``tf.data.Dataset.from_generator``). The first eight characters
            are dropped and the remainder is appended to
            ``'../../../dataset/'``.
        labels: Sequence of labels aligned with ``codes``; yielded unchanged.

    Yields:
        Tuples of (list of byte values padded/truncated by ``pad_data``,
        ``labels[idx]``). Files that do not exist are reported on stdout and
        skipped.
    """
    for idx in range(len(codes)):
        fn = codes[idx]
        # Accept both bytes and str so the generator can also be called
        # directly with plain Python lists.
        if isinstance(fn, bytes):
            fn = fn.decode()
        fn = '../../../dataset/' + fn[8:]
        if not os.path.isfile(fn):
            print(fn, 'not exist')
            continue
        with open(fn, 'rb') as f:
            res = pad_data(list(f.read()))
        yield res, labels[idx]
                
                
def load_val_data(codes, labels):
    """Eagerly load all existing sample files into one array.

    Args:
        codes: Sequence of sample names (``str``). The first eight characters
            are dropped and the remainder is appended to
            ``'../../../dataset/'``.
        labels: Sequence of labels aligned with ``codes``.

    Returns:
        ``(samples, kept_labels)`` where ``samples`` is a numpy array with one
        padded/truncated row per file that was found, and ``kept_labels`` is
        the list of labels for exactly those files. Missing files are reported
        on stdout and skipped together with their labels, so the two return
        values always have the same length.
    """
    res_all = []
    kept_labels = []
    for idx in tqdm(range(len(codes))):
        fn = '../../../dataset/' + codes[idx][8:]
        if not os.path.isfile(fn):
            print(fn, 'not exist')
            continue
        with open(fn, 'rb') as f:
            res_all.append(pad_data(list(f.read())))
        # Record the label only for files that were actually loaded; returning
        # the full label list would misalign samples and labels.
        kept_labels.append(labels[idx])
    return np.array(res_all), kept_labels

In [5]:
# Define the network architecture
from tensorflow.keras import Input, Model, regularizers
from tensorflow import keras


# Input length (matches pad_data's default) and the convolution window,
# which is also used as the stride so windows do not overlap.
max_len = 200000
win_size = 500

# Embed each input value, then run two parallel strided convolutions over
# the embedding; the second one is squashed by a sigmoid and multiplied
# into the first.
x = Input(shape=(max_len,))
emb = layers.Embedding(input_dim=256, output_dim=8)(x)
conv1 = layers.Conv1D(
    filters=128, kernel_size=win_size, strides=win_size, padding='same'
)(emb)
conv2 = layers.Conv1D(
    filters=128, kernel_size=win_size, strides=win_size, padding='same'
)(emb)
a = layers.Activation('sigmoid', name='sigmoid')(conv2)
mul = layers.multiply([conv1, a])
a = layers.Activation('relu', name='relu')(mul)

# Global max pooling followed by a regularised hidden layer.
p = layers.GlobalMaxPool1D()(a)
d = layers.Dense(
    units=64,
    activation='relu',
    kernel_regularizer=regularizers.l2(0.01),
    activity_regularizer=regularizers.l1(0.01),
)(p)

# Disabled two-head variant, kept for reference:
# out1 = layers.Dense(4, activation='softmax')(d)
# out2 = layers.Dense(4, activation='softmax')(d)
# model = Model(inputs=x, outputs=[out1, out2])

# Single head with no activation: the model outputs raw scores (logits).
out = layers.Dense(units=4, activation=None)(d)
model = Model(inputs=x, outputs=out)

In [None]:
batch_size = 128


def _split_dataset(mode):
    """Build an unbatched tf.data pipeline over `load_data` for one split."""
    return tf.data.Dataset.from_generator(
        load_data,
        args=[codes_train, labels_train, mode],
        output_types=(tf.float32, tf.int32),
        output_shapes=((None,), (4, )),
    )


# Both splits are batched and then repeated 8 times.
db_train = _split_dataset('train').batch(batch_size).repeat(8)
db_val = _split_dataset('val').batch(batch_size).repeat(8)

# The model emits logits, hence from_logits=True.
model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.CategoricalAccuracy()],
)

history = model.fit(
    db_train,
    epochs=5,
    steps_per_epoch=3,
    validation_data=db_val,
)

Train for 3 steps
Epoch 1/5

In [None]:
# Custom loss function
from tensorflow.keras import backend as K

class discrepancyloss(keras.losses.Loss):
    """Loss that shrinks as the gap between the largest and smallest output grows.

    For each row the spread ``max - min`` along axis 1 is taken, and the loss
    is the mean of ``1 - |spread|`` over all rows.
    """

    def __init__(self, reduction=keras.losses.Reduction.NONE, name='discrepancyLoss'):
        """Forward ``reduction`` and ``name`` to the Keras ``Loss`` base class."""
        super().__init__(name=name, reduction=reduction)

    def call(self, out, _):
        """Return the mean of ``1 - |max(out) - min(out)|`` taken along axis 1.

        ``out`` arrives in the slot Keras normally uses for ``y_true``; the
        second argument is ignored.
        """
        # presumably `out` is (batch, classes) -- TODO confirm against callers
        spread = K.max(out, axis=1) - K.min(out, axis=1)
        return K.mean(1 - K.abs(spread))

In [8]:
# Prepare the datasets

# Maps the boolean `ID` column to an integer label (True -> 0, False -> 1).
ood2label = {True: 0, False: 1}
data_test_1 = pd.read_csv('data_test_1.csv')
codes_test_1 = data_test_1['name'].to_list()
labels_test_1 = (
    data_test_1['ID']
    .map(lambda flag: ood2label[flag])
    .to_list()
)

# Supervised training stream: the 'train' split of the 2017 data, batched.
batch_size = 128
sup_train = tf.data.Dataset.from_generator(
    load_data,
    args=[codes_train, labels_train, 'train'],
    output_types=(tf.float32, tf.int32),
    output_shapes=((None,), (4, )),
).batch(batch_size)

# Disabled unsupervised stream, kept for reference:
# unsup_train = tf.data.Dataset.from_generator(load_test_data, args=[codes_test_1, labels_test_1], 
#                                           output_types=(tf.float32, tf.int32), output_shapes=((None,), ()))
# unsup_train = unsup_train.batch(batch_size)

# Validation samples are loaded fully into memory.
codes_val, labels_val = load_val_data(codes_test_1, labels_test_1)

100%|██████████████████████████████████████████████████████████████████████████████| 1048/1048 [00:34<00:00, 30.70it/s]


In [None]:
# Start training
import visdom
from evaluate import evaluate

# Fine-tune the model with a custom loop: supervised cross-entropy plus the
# discrepancy loss, then score the validation set with AUROC after each epoch
# and keep the best weights.

# NOTE: this rebinds the notebook-level name `metrics` (imported from
# tensorflow.keras in the first cell) to sklearn.metrics, as the original
# cell did; it is imported once here instead of on every epoch.
from sklearn import metrics

optimizer = keras.optimizers.Adam(learning_rate=0.001)
# The model's last Dense layer has no activation, so it emits logits; the
# loss must be told so (the earlier compile() call does the same).
loss_sup = keras.losses.CategoricalCrossentropy(from_logits=True)
loss_av = discrepancyloss()
vis = visdom.Visdom(server='http://localhost')

print('>> Fine-tune a Model.')
best_auc = 0.0  # was initialised as `best_roc` but read as `best_auc` (NameError)
num_epochs = 10
iters = 0
plot_data = {'X': [], 'Y': [], 'legend': ['Sup. Loss', 'Av. Loss', 'Tot. Loss']}

# save_weights does not create missing directories.
os.makedirs('./out', exist_ok=True)

for epoch in range(num_epochs):
    for step, batch in enumerate(sup_train):

        print('Start of epoch %d step %d' % (epoch,step))
        x_batch_train, y_batch_train = batch
        with tf.GradientTape() as tape:
            out = model(x_batch_train)
            loss_sup_value = loss_sup(y_batch_train, out)
            # discrepancyloss ignores its second argument; pass `out` again
            # rather than relying on IPython's `_` last-output variable.
            loss_av_value = loss_av(out, out)
            loss_value = loss_sup_value + loss_av_value
            # NOTE(review): the kernel/activity regularisation terms collected
            # in model.losses are not added here -- confirm this is intended.
        grads = tape.gradient(loss_value, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))

        # Visualisation: plot the three losses every second step.
        if step % 2 == 0:
            plot_data['X'].append(iters)
            plot_data['Y'].append([
                float(loss_sup_value.numpy()),
                float(loss_av_value.numpy()),
                float(loss_value.numpy()),
            ])
            vis.line(
                X=np.stack([np.array(plot_data['X'])] * len(plot_data['legend']), 1),
                Y=np.array(plot_data['Y']),
                opts={
                    'title': 'Loss over Time',
                    'legend': plot_data['legend'],
                    'xlabel': 'Iterations',
                    'ylabel': 'Loss',
                    'width': 1200,
                    'height': 390,
                },
                win=2
            )
        iters += 1

    # Validation: the per-sample score is the largest model output.
    # NOTE(review): labels come from ood2label (True -> 0, False -> 1); confirm
    # that a higher max output should correspond to label 1.
    out_val = model.predict(codes_val)
    labels = np.array(labels_val)
    dists = np.max(out_val, axis=1).reshape((labels.shape[0], ))
    auc = metrics.roc_auc_score(labels, dists)
    print('Epoch{} AUROC: {:.3f}'.format(epoch, auc))
    if best_auc < auc:
        best_auc = auc
        model.save_weights('./out/malconv_2.h5')
        print('Model saved.')
print('>> Finished.')

Setting up a new session...


>> Fine-tune a Model.
Start of epoch 0 step 0
Start of epoch 0 step 1
Start of epoch 0 step 2
Start of epoch 0 step 3
Start of epoch 0 step 4
Start of epoch 0 step 5
Start of epoch 0 step 6
Start of epoch 0 step 7
