In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras import layers, optimizers, Sequential, metrics

tf.__version__

'2.1.0'

In [2]:
# 将2018年的数据作进一步拆分

def judge_1(time):
    time = time[:7].replace('-', '')
    if time <= '201803':
        return True
    else:
        return False

def judge_2(time):
    time = time[:7].replace('-', '')
    if time > '201803':
        return True
    else:
        return False

    
data_csv = pd.read_csv('../../../csv/dataset.csv')
data_2017 = data_csv[data_csv['first_seen'].apply(lambda x: x[:4]) == '2017']
data_2018 = data_csv[data_csv['first_seen'].apply(lambda x: x[:4]) == '2018']
data_2019 = data_csv[data_csv['first_seen'].apply(lambda x: x[:4]) == '2019']
data_2018_1 = data_2018[data_2018['first_seen'].apply(judge_1)]
data_2018_2 = data_2018[data_2018['first_seen'].apply(judge_2)]

data_train = data_2017
data_train = data_train.sample(frac=1, random_state=1).reset_index(drop=True)

In [3]:
name2label = {'trojan':0, 'virus':1, 'worm':2, 'backdoor':3}

# 训练数据
codes_train = data_train['name'].to_list()
labels_train = data_train['label'].map(lambda x: name2label[x])
labels_train = labels_train.to_list()

In [4]:
def pad_data(res, max_len=200000):
    
    length = len(res)
    if length > max_len:
        return res[ :max_len]
    elif length < max_len:
        return res + [0]*(max_len-length)
    return res


def load_data(codes, labels, mode):
    
    if mode == 'train':    
        codes = codes[: 3000]
        labels = labels[: 3000]
    elif mode == 'val':    
        codes = codes[3000: ]
        labels = labels[3000: ]
    
    labels = np.eye(4)[labels]
    for idx in range(len(codes)):
        fn = codes[idx]
        fn = bytes.decode(fn)
        fn = '../../../dataset/' + fn[8: ]
        if not os.path.isfile(fn):
            print(fn, 'not exist')
        else:
            with open(fn, 'rb') as f:
                res = f.read()
                res = [byte for byte in res]
                res = pad_data(res)
                yield res, labels[idx]

                
def load_test_data(codes, labels):
    
    for idx in range(len(codes)):
        fn = codes[idx]
        fn = bytes.decode(fn)
        fn = '../../../dataset/' + fn[8: ]
        if not os.path.isfile(fn):
            print(fn, 'not exist')
        else:
            with open(fn, 'rb') as f:
                res = f.read()
                res = [byte for byte in res]
                res = pad_data(res)
                yield res, labels[idx]

In [5]:
# 定义网络结构
from tensorflow.keras import Input, Model, regularizers
from tensorflow import keras


max_len = 200000
win_size = 500

x = Input((max_len,))
emb = layers.Embedding(256, 8)(x)
conv1 = layers.Conv1D(kernel_size
                      =(win_size), filters=128, strides=(win_size), padding='same')(emb)
conv2 = layers.Conv1D(kernel_size=(win_size), filters=128, strides=(win_size), padding='same')(emb)
a = layers.Activation('sigmoid', name='sigmoid')(conv2)
mul = layers.multiply([conv1, a])
a = layers.Activation('relu', name='relu')(mul)
p = layers.GlobalMaxPool1D()(a)
d = layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01),
                activity_regularizer=regularizers.l1(0.01))(p)
# out1 = layers.Dense(4, activation='softmax')(d)
# out2 = layers.Dense(4, activation='softmax')(d)
# model = Model(inputs=x, outputs=[out1, out2])
out = layers.Dense(4, activation=None)(d)
model = Model(inputs=x, outputs=out)

In [6]:
batch_size = 128  
db_train = tf.data.Dataset.from_generator(load_data, args=[codes_train, labels_train, 'train'], 
                                          output_types=(tf.float32, tf.int32), output_shapes=((None,), (4, )))
db_train = db_train.batch(batch_size).repeat(8)

db_val = tf.data.Dataset.from_generator(load_data, args=[codes_train, labels_train, 'val'], 
                                          output_types=(tf.float32, tf.int32), output_shapes=((None,), (4, )))
db_val = db_val.batch(batch_size).repeat(6)

model.compile(optimizer=keras.optimizers.Adam(),
             loss=keras.losses.CategoricalCrossentropy(from_logits=True),
             metrics=[keras.metrics.CategoricalAccuracy()])

history = model.fit(db_train,
                    epochs=6,
                    steps_per_epoch=24,
                    validation_data=db_val,
                    validation_steps = 4)

Train for 24 steps, validate for 4 steps
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [11]:
model.save('./out/malconv_2.h5')

In [6]:
model = keras.models.load_model('./out/malconv_2.h5')

In [None]:
from tensorflow.keras import backend as K

class MembershipLoss(keras.losses.Loss):
    
    def __init__(self, reduction=keras.losses.Reduction.NONE, 
                 from_logits=True, name='membership loss'):
        super().__init__(reduction=reduction, name=name)
    
    def call(self, y_true, y_pred, gamma = 5.0):
        y_pred = K.sigmoid(y_pred)
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
        pt_1 = K.sum(K.pow((1. - pt_1), 2), axis=1)
        pt_0 = gamma*(K.mean(K.pow(pt_0, 2), axis=1))
        cc = keras.losses.CategoricalCrossentropy(from_logits=True)
        cc_value = cc(y_true, y_pred)
        
        return K.mean(pt_1+pt_0) + cc_value

In [8]:
# 准备数据集

ood2label = {True: 0, False: 1}
data_test_1 = pd.read_csv('data_test_1.csv')
codes_test_1 = data_test_1['name'].to_list()
labels_test_1 = data_test_1['ID'].map(lambda x: ood2label[x])
labels_test_1 = labels_test_1.to_list()

batch_size = 128
sup_train = tf.data.Dataset.from_generator(load_data, args=[codes_train, labels_train, 'train'], 
                                          output_types=(tf.float32, tf.int32), output_shapes=((None,), (4, )))
sup_train = sup_train.batch(batch_size, drop_remainder=True)

sup_val = tf.data.Dataset.from_generator(load_data, args=[codes_train, labels_train, 'val'], 
                                          output_types=(tf.float32, tf.int32), output_shapes=((None,), (4, )))
sup_val = sup_val.batch(batch_size, drop_remainder=True)


# unsup_train = tf.data.Dataset.from_generator(load_test_data, args=[codes_test_1, labels_test_1], 
#                                           output_types=(tf.float32, tf.int32), output_shapes=((None,), ()))
# unsup_train = unsup_train.batch(batch_size)

# codes_val, labels_val = load_val_data(codes_test_1, labels_test_1)

In [9]:
# 开始训练
import visdom
from evaluate import evaluate

optimizer = keras.optimizers.Adam(learning_rate=0.001)
loss_sup = keras.losses.CategoricalCrossentropy(from_logits=True)
loss_ml = MembershipLoss()
train_acc_metric = keras.metrics.CategoricalAccuracy()
val_acc_metric = keras.metrics.CategoricalAccuracy()
vis = visdom.Visdom(server='http://localhost')

print('>> Fine-tune a Model.')
best_roc = 0.0
num_epochs = 6
iters = 0
plot_data = {'X': [], 'Y': [], 'legend': ['Sup. Loss', 'Ml. Loss', 'Tot. Loss']}
for epoch in range(num_epochs):
    for step, (x_batch_train, y_batch_train) in enumerate(sup_train):
        
        print('Start of epoch %d step %d' % (epoch,step))
        with tf.GradientTape() as tape:
            out = model(x_batch_train)
            loss_sup_value = loss_sup(y_batch_train, out)
            loss_ml_value = loss_ml(y_batch_train, out)
            loss_value = loss_sup_value + loss_ml_value
        grads = tape.gradient(loss_value, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))

        # 可视化
        if step % 5 == 0:
            
            loss_sup_value = loss_sup_value.numpy()
            loss_ml_value = loss_ml_value.numpy()
            loss_value = loss_value.numpy()
            
            plot_data['X'].append(iters)
            plot_data['Y'].append([
                loss_sup_value, loss_ml_value, loss_value])
            vis.line(
                X=np.stack([np.array(plot_data['X'])] * len(plot_data['legend']), 1),
                Y=np.array(plot_data['Y']),
                opts={
                    'title': 'Loss over Time',
                    'legend': plot_data['legend'],
                    'xlabel': 'Iterations',
                    'ylabel': 'Loss',
                    'width': 1200,
                    'height': 390,
                },
                win=2
            )
        iters += 1
    
    for x_batch_val, y_batch_val in sup_val:
        val_logits = model(x_batch_val)
        val_acc_metric(y_batch_val, val_logits)
    val_acc = val_acc_metric.result()
    val_acc_metric.reset_states()
    print('Validation acc: %s' % (float(val_acc),))
#     # 进行验证
#     from sklearn import metrics
#     out_val = model.predict(codes_val)
#     labels = np.array(labels_val)
#     dists = np.max(out_val, axis=1).reshape((labels.shape[0], ))
#     auc = metrics.roc_auc_score(labels, dists)
#     print('Epoch{} AUROC: {:.3f}'.format(epoch, auc))
#     if best_auc < auc:
#         best_auc = auc
#         model.save_weights('./out/malconv_2.h5')
#         print('Model saved.')
# print('>> Finished.')

Setting up a new session...


>> Fine-tune a Model.
Start of epoch 0 step 0
Start of epoch 0 step 1
Start of epoch 0 step 2
Start of epoch 0 step 3
Start of epoch 0 step 4
Start of epoch 0 step 5
Start of epoch 0 step 6
Start of epoch 0 step 7
Start of epoch 0 step 8
Start of epoch 0 step 9
Start of epoch 0 step 10
Start of epoch 0 step 11
Start of epoch 0 step 12
Start of epoch 0 step 13
Start of epoch 0 step 14
Start of epoch 0 step 15
Start of epoch 0 step 16
Start of epoch 0 step 17
Start of epoch 0 step 18
Start of epoch 0 step 19
Start of epoch 0 step 20
Start of epoch 0 step 21
Start of epoch 0 step 22
Start of epoch 0 step 23
Start of epoch 0 step 24
Start of epoch 0 step 25
Start of epoch 0 step 26
Validation acc: 0.9997106194496155
Start of epoch 1 step 0
Start of epoch 1 step 1
Start of epoch 1 step 2
Start of epoch 1 step 3
Start of epoch 1 step 4
Start of epoch 1 step 5
Start of epoch 1 step 6
Start of epoch 1 step 7
Start of epoch 1 step 8
Start of epoch 1 step 9
Start of epoch 1 step 10
Start of epoc

In [10]:
model.save('./out/malconv_3.h5')

In [5]:
model = keras.models.load_model('./out/malconv_3.h5')

In [6]:
def load_test_data(codes, labels):
    
    res_all = []
    for idx in tqdm(range(len(codes))):
        fn = codes[idx]
        fn = '../../../dataset/' + fn[8: ]
        if not os.path.isfile(fn):
            print(fn, 'not exist')
        else:
            with open(fn, 'rb') as f:
                res = f.read()
                res = [byte for byte in res]
                res = pad_data(res)
                res_all.append(res)
    return np.array(res_all), labels

codes_train, labels_train = load_test_data(codes_train, labels_train)
ood2label = {True: 0, False: 1}
data_test_1 = pd.read_csv('data_test_1.csv')
codes_test_1 = data_test_1['name'].to_list()
labels_test_1 = data_test_1['ID'].map(lambda x: ood2label[x])
labels_test_1 = labels_test_1.to_list()
codes_test_1, labels_test_1 = load_test_data(codes_test_1, labels_test_1)

layer_model = Model(inputs=model.input, outputs=model.layers[8].output)
feature_test_8 = layer_model.predict(codes_test_1)
feature_train_8 = layer_model.predict(codes_train)

100%|██████████████████████████████████████████████████████████████████████████████| 3476/3476 [05:04<00:00, 11.40it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1048/1048 [00:35<00:00, 29.28it/s]


In [None]:
from methods.kpca import kPCA
from sklearn import metrics
import matplotlib.pyplot as plt
from utils.utils import Seedy,param_heatmap,param_scatter

model = kPCA(q = 68, sigma = 1.0985)
model.fit(feature_train_8) #still using model data
test_scores = model.decision_function(feature_test_8)
# scores.append(test_scores)
test_auc = metrics.roc_auc_score(labels_test_1, test_scores)
# test_aucs.append(test_auc)
print('test auc:', test_auc)

In [45]:
codes_test_id = codes_test_1[test_scores>model.threshold_]
model_pre = keras.models.load_model('../model/malconv_1.h5')
labels_test_id = data_test_1['label'].map(lambda x: name2label[x])

In [46]:
labels_test_id = np.array(labels_test_id)[test_scores>model.threshold_]

In [47]:
y_pred = model_pre.predict(codes_test_id)
y_true = labels_test_id

from sklearn.metrics import classification_report
# y_true = np.argmax(y_true, axis=1)
y_pred = np.argmax(y_pred, axis=1)
target_names = {'trojan', 'virus', 'worm', 'backdoor'}
print(classification_report(y_true, y_pred, target_names=target_names))

              precision    recall  f1-score   support

        worm       0.40      1.00      0.57       137
       virus       0.94      0.55      0.70       246
      trojan       0.72      0.79      0.76       252
    backdoor       1.00      0.19      0.33       159

    accuracy                           0.63       794
   macro avg       0.77      0.64      0.59       794
weighted avg       0.79      0.63      0.62       794

