## 利用Mahalanobis detector进行异常检测

In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras import layers, optimizers, Sequential, metrics

tf.__version__

'2.0.0'

In [2]:
# Helpers used to split the 2018 samples further by month.

def judge_1(time):
    """Return True when the year-month prefix of `time` is at or before 2018-03.

    The first seven characters of `time` are taken, every '-' is removed, and
    the result is compared as a string against '201803'.
    Assumes `time` is a string that starts with 'YYYY-MM' -- TODO confirm
    against the `first_seen` column format.
    """
    year_month = time[:7].replace('-', '')
    return year_month <= '201803'

def judge_2(time):
    """Return True when the year-month prefix of `time` is after 2018-03.

    The first seven characters of `time` are taken, every '-' is removed, and
    the result is compared as a string against '201803'.
    Assumes `time` is a string that starts with 'YYYY-MM' -- TODO confirm
    against the `first_seen` column format.
    """
    year_month = time[:7].replace('-', '')
    return year_month > '201803'

    
# Load the sample index and split it by the year prefix (first four
# characters) of the `first_seen` column.
data_csv = pd.read_csv('../../../csv/dataset.csv')
data_2017 = data_csv[data_csv['first_seen'].apply(lambda x: x[:4]) == '2017']
data_2018 = data_csv[data_csv['first_seen'].apply(lambda x: x[:4]) == '2018']
data_2019 = data_csv[data_csv['first_seen'].apply(lambda x: x[:4]) == '2019']
# 2018 is split again: judge_1 keeps months up to and including March,
# judge_2 keeps the months after March.
data_2018_1 = data_2018[data_2018['first_seen'].apply(judge_1)]
data_2018_2 = data_2018[data_2018['first_seen'].apply(judge_2)]

# 2017 is the training period; the three later periods are the test sets.
data_train = data_2017
data_test_1 = data_2018_1
data_test_2 = data_2018_2
data_test_3 = data_2019

# Shuffle all training rows and draw a 30% sample of every test period.
# random_state=1 makes both the shuffle and the sampling reproducible.
data_train = data_train.sample(frac=1, random_state=1).reset_index(drop=True)
data_test_1 = data_test_1.sample(frac=0.3, random_state=1).reset_index(drop=True)
data_test_2 = data_test_2.sample(frac=0.3, random_state=1).reset_index(drop=True)
data_test_3 = data_test_3.sample(frac=0.3, random_state=1).reset_index(drop=True)

In [3]:
# Malware family name -> integer class id
name2label = {'trojan':0, 'virus':1, 'worm':2, 'backdoor':3}

# Training data: sample names and integer class ids
codes_train = data_train['name'].tolist()
labels_train = [name2label[family] for family in data_train['label']]

# Test set 1 (built from data_test_1)
codes_test_1 = data_test_1['name'].tolist()
labels_test_1 = [name2label[family] for family in data_test_1['label']]

# Test set 2 (built from data_test_2)
codes_test_2 = data_test_2['name'].tolist()
labels_test_2 = [name2label[family] for family in data_test_2['label']]

# Test set 3 (built from data_test_3)
codes_test_3 = data_test_3['name'].tolist()
labels_test_3 = [name2label[family] for family in data_test_3['label']]

In [4]:
def pad_data(res, max_len=200000):
    """Force the list `res` to exactly `max_len` items.

    Longer inputs are truncated to their first `max_len` items, shorter inputs
    are right-padded with zeros, and an input that already has the right
    length is returned unchanged (the same object, not a copy).
    """
    missing = max_len - len(res)
    if missing < 0:
        return res[ :max_len]
    if missing > 0:
        return res + [0] * missing
    return res

def load_test_data(codes, labels):
    """Read the raw bytes of every sample and return them with one-hot labels.

    Parameters
    ----------
    codes : sequence of str
        Sample names. The first 8 characters of each name are dropped and the
        remainder is appended to '../../../dataset/' to form the file path.
    labels : sequence of int
        Integer class id of each sample, aligned with `codes`. Ids must be in
        0..3 because the one-hot encoding is built from a 4x4 identity matrix.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The byte values of every file that was found, each padded/truncated by
        `pad_data`, and the one-hot labels of exactly those files.

    Files that do not exist are reported with a printed message and skipped.
    Their labels are dropped as well, so the two returned arrays always stay
    aligned row by row.
    """
    labels = np.eye(4)[labels]
    res_all = []
    kept_idx = []                      # positions in `codes` that were loaded
    for idx in tqdm(range(len(codes))):
        fn = '../../../dataset/' + codes[idx][8: ]
        if not os.path.isfile(fn):
            print(fn, 'not exist')
            continue
        with open(fn, 'rb') as f:
            res = list(f.read())       # iterating bytes yields ints
        res_all.append(pad_data(res))
        kept_idx.append(idx)
    # Keep only the labels of the files that were actually read.
    return np.array(res_all), labels[np.asarray(kept_idx, dtype=int)]

In [6]:
# Load the pre-trained model
model = keras.models.load_model('../model/malconv_1.h5')

# Sub-models that expose the outputs of model.layers[7] and model.layers[8]
layer_model_7 = Model(inputs=model.input, outputs=model.layers[7].output)
layer_model_8 = Model(inputs=model.input, outputs=model.layers[8].output)

# Bind the loaded data to new names instead of overwriting codes_train /
# labels_train. load_test_data returns one-hot labels, while the per-class
# grouping later in the notebook uses labels_train[idx] as an integer index,
# so labels_train must stay a list of class ids. Not overwriting the inputs
# also keeps this cell safe to re-run.
bytes_train, onehot_train = load_test_data(codes_train, labels_train)

# load_test_data skips files that are missing on disk. The saved features are
# later matched with labels_train by position, so a skipped file would shift
# every following row onto the wrong label.
if len(bytes_train) != len(codes_train):
    raise RuntimeError(
        'only %d of %d training files were found; features would not line up '
        'with labels_train' % (len(bytes_train), len(codes_train)))

feature_train_7 = layer_model_7.predict(bytes_train)
feature_train_8 = layer_model_8.predict(bytes_train)

# Cache the extracted features on disk

np.save('feature_train_7.npy', feature_train_7)
np.save('feature_train_8.npy', feature_train_8)

  1%|▍                                                                               | 19/3476 [00:03<09:16,  6.21it/s]


KeyboardInterrupt: 

In [5]:
# Load the pre-trained model
model = keras.models.load_model('../model/malconv_1.h5')

# Sub-models that expose the outputs of model.layers[7] and model.layers[8]
layer_model_7, layer_model_8 = (
    Model(inputs=model.input, outputs=model.layers[7].output),
    Model(inputs=model.input, outputs=model.layers[8].output),
)

# Features previously written to disk with np.save
feature_7 = np.load('feature_train_7.npy')
feature_8 = np.load('feature_train_8.npy')

# The first 2476 rows are used for fitting, the remaining rows for validation.
feature_train_7, feature_val_7 = feature_7[: 2476], feature_7[2476: ]
feature_train_8, feature_val_8 = feature_8[: 2476], feature_8[2476:]

# One entry per layer
data_feature = [feature_train_7, feature_train_8]

In [8]:
import sklearn.covariance


num_classes = 4
# NOTE: despite the variable name this is a plain EmpiricalCovariance
# estimator. The name is kept because a later cell calls group_lasso.fit().
group_lasso = sklearn.covariance.EmpiricalCovariance(assume_centered=False)
num_output = len(data_feature)    # number of layers in data_feature

# Integer class id of every training feature row. Only the first
# len(data_feature[0]) labels are used, matched with the feature rows by position.
num_train = len(data_feature[0])
train_class_ids = np.asarray(labels_train[:num_train], dtype=int)
if train_class_ids.ndim != 1 or len(train_class_ids) != num_train:
    raise ValueError('labels_train must provide one integer class id per '
                     'training feature row')

# Number of training samples per class (float array of length num_classes)
num_sample_per_class = np.bincount(train_class_ids, minlength=num_classes).astype(float)

# Group the rows of every layer by class with a boolean mask. This keeps the
# original row order within each class and avoids re-copying the accumulated
# array for every sample, as repeated np.concatenate calls would.
list_features = [
    [layer_features[train_class_ids == j] for j in range(num_classes)]
    for layer_features in data_feature
]

# list_features[layer][class] is a 2-D array whose rows are the feature
# vectors of the training samples of that class for that layer.

In [9]:
# Average over the sample axis: for every layer this gives one mean feature
# vector per class (the original note states 128-d and 64-d vectors for the
# two layers).

# sample_class_mean[layer] is an array with one row per class. The layers are
# enumerated with range(num_output) rather than a hard-coded list of feature
# widths, so the cell follows whatever data_feature contains.
sample_class_mean = [
    np.array([np.mean(list_features[k][j], axis=0) for j in range(num_classes)])
    for k in range(num_output)
]

In [10]:
# For every layer: subtract each class's mean from that class's samples, stack
# the centred rows of all classes, fit the covariance estimator on them and
# keep its precision_ matrix. precision[layer] is the result for that layer.
precision = []
for k in range(num_output):
    X = np.concatenate(
        [list_features[k][i] - sample_class_mean[k][i] for i in range(num_classes)],
        axis=0,
    )
    group_lasso.fit(X)
    precision.append(group_lasso.precision_)

In [34]:
def get_Mahalanobis_score(codes_test, num_classes, sample_mean,
                          layer_index, out_flag, precision):
    """Compute the Mahalanobis confidence score of every sample for one layer.

    For each class c the score of a sample x is
    -0.5 * (x - mean_c) @ precision[layer_index] @ (x - mean_c); the returned
    score of a sample is the maximum of these values over all classes.

    Parameters
    ----------
    codes_test : numpy.ndarray
        2-D array of feature vectors, one row per sample.
    num_classes : int
        Number of classes; must be at least 1.
    sample_mean : sequence
        sample_mean[layer_index][c] is the mean feature vector of class c.
    layer_index : int
        Selects the entry of `sample_mean` and `precision` to use; it is also
        part of the output file name.
    out_flag : bool
        When true the scores are written to './confidence_Ga<layer>_In.txt',
        otherwise to './confidence_Ga<layer>_Out.txt'.
    precision : sequence
        precision[layer_index] is the precision matrix used for this layer.

    Returns
    -------
    list
        One score per row of `codes_test`.

    Raises
    ------
    ValueError
        If `num_classes` is smaller than 1.
    """
    if num_classes < 1:
        raise ValueError('num_classes must be at least 1')

    if out_flag:
        temp_file_name = './confidence_Ga%s_In.txt'%(str(layer_index))
    else:
        temp_file_name = './confidence_Ga%s_Out.txt'%(str(layer_index))

    layer_precision = precision[layer_index]
    per_class_scores = []
    for i in range(num_classes):
        zero_f = codes_test - sample_mean[layer_index][i]   # centre on the class mean
        # Row-wise quadratic form. This equals the diagonal of
        # zero_f @ P @ zero_f.T without building the N x N matrix.
        term_gau = -0.5 * np.sum(np.dot(zero_f, layer_precision) * zero_f, axis=1)
        per_class_scores.append(term_gau)

    # Columns are classes; keep the best (largest) class score of each sample.
    gaussian_score = np.max(np.stack(per_class_scores, axis=1), axis=1)

    # The context manager closes the file even if a write fails.
    with open(temp_file_name, 'w') as g:
        for score in gaussian_score:
            g.write("{}\n".format(score))

    return list(gaussian_score)

In [None]:
# Compute the Mahalanobis scores of the validation features

val_all = [feature_val_7, feature_val_8]
score_columns = []
for i in range(num_output):
    # num_classes replaces the literal 4 so this call stays in sync with the
    # class count used when the means and precisions were fitted.
    M_in = get_Mahalanobis_score(val_all[i], num_classes, sample_class_mean, i, True, precision)
    M_in = np.asarray(M_in, dtype=np.float32)
    score_columns.append(M_in.reshape((M_in.shape[0], -1)))   # column vector

# One row per validation sample, one column per layer
Mahalanobis_in = np.concatenate(score_columns, axis=1)

> <ipython-input-34-df9399094ec9>(15)get_Mahalanobis_score()
-> for i in range(num_classes):


(1000, 128)