# 异常日志识别模型
取日志中单词的词频作为特征，使用支持向量机SVM模型进行训练，识别异常日志。
## 加载数据

In [1]:
import json
import random
import math

# Path of the separate test-log file; it is not read anywhere in this section.
testset_dir = 'test/bgl_logs_test.json'

# Load the labelled logs; the context manager closes the file once parsed.
with open('train/bgl_logs_train.json') as dataset_file:
    dataset = json.load(dataset_file)

# Bucket the records by label: anything not labelled 'Normal' is abnormal.
dataset_normal = []
dataset_abnormal = []
for log_item in dataset:
    if log_item['label'] == 'Normal':
        dataset_normal.append(log_item)
    else:
        dataset_abnormal.append(log_item)


def _holdout_split(items, fraction):
    """Randomly split ``items`` into ``(held_out, kept)``.

    ``floor(fraction * len(items))`` positions are drawn without replacement.
    Splitting by position keeps the pass linear and never compares records
    with each other, so a record is dropped from ``kept`` only if it was
    itself sampled.
    NOTE(review): the previous membership test also dropped records that were
    merely equal to a sampled one; this only differs if the data contains
    exact duplicate records -- confirm that is acceptable.
    """
    held_positions = random.sample(range(len(items)), math.floor(fraction * len(items)))
    held_lookup = set(held_positions)
    held_out = [items[pos] for pos in held_positions]
    kept = [item for pos, item in enumerate(items) if pos not in held_lookup]
    return held_out, kept


# Set k to 0 for real use: nothing is held out and all logs are used to train.
k = 0
# Split into training and test sets: the fraction k (0..1, not a percentage)
# of the normal logs and of the abnormal logs is held out separately.
test_normal, train_normal = _holdout_split(dataset_normal, k)
test_abnormal, train_abnormal = _holdout_split(dataset_abnormal, k)
testset = test_normal + test_abnormal
trainset = train_normal + train_abnormal



## 特征提取

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
# Word-count feature extractor; its vocabulary is fitted on the training logs.
vectorizer = CountVectorizer()

# Encode labels as 1 for 'Normal' and 0 for everything else.
train_label = [1 if entry['label'] == 'Normal' else 0 for entry in trainset]

# The first three characters of every log string are skipped before counting.
train_data = vectorizer.fit_transform([entry['log'][3:] for entry in trainset])

## 训练svm

In [3]:
from sklearn import svm

# RBF-kernel SVM. Labels are 1 = 'Normal' and 0 = otherwise, so class 0 is
# given the larger weight (0.94 vs 0.06).
clf = svm.SVC(
    kernel='rbf',
    gamma=0.1,
    class_weight={0: 0.94, 1: 0.06},
)
clf.fit(train_data, train_label)


## 生成测试集特征向量

In [4]:
# Encode the held-out labels the same way as the training labels
# (1 for 'Normal', 0 for everything else).
test_label = [int(entry['label'] == 'Normal') for entry in testset]

# Reuse the already-fitted vectorizer; as for training, the first three
# characters of each log string are skipped.
test_data = vectorizer.transform([entry['log'][3:] for entry in testset])

## 预测测试集的结果

In [5]:
# Predict labels for the held-out logs and write them to a CSV file.
# Skipped entirely when k == 0, i.e. when no test split was made.
if k > 0:
    predict_label = clf.predict(test_data)
    import pandas as pd

    # One row per test log: its index and the predicted class name.
    result = pd.DataFrame({
        'log_index': [entry['index'] for entry in testset],
        'lable': ['Normal' if flag == 1 else 'Anomalous' for flag in predict_label],
    })
    filename = 'test.csv'
    # Written without the row index and without a header line.
    result.to_csv(filename, index=False, header=False)

## 测试阶段，评测性能

In [6]:
def _ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is 0."""
    return numerator / denominator if denominator else float('nan')


# Print the evaluation metrics (only when a test split exists, i.e. k > 0).
# Label 1 ('Normal') is the positive class for precision/recall/F1, so
# "Specificity" is the fraction of label-0 logs that were predicted as 0.
if k > 0:
    # Pair predictions with ground truth once, then derive every count from it.
    label_pairs = list(zip(predict_label, test_label))
    correct = sum(1 for pred, truth in label_pairs if pred == truth)
    true_pos = sum(1 for pred, truth in label_pairs if pred == 1 and truth == 1)
    true_neg = sum(1 for pred, truth in label_pairs if pred == 0 and truth == 0)
    predicted_pos = sum(1 for pred, _ in label_pairs if pred == 1)
    actual_pos = sum(1 for _, truth in label_pairs if truth == 1)
    actual_neg = sum(1 for _, truth in label_pairs if truth == 0)

    # A metric whose denominator is zero is reported as nan instead of raising.
    print('Accuracy:', _ratio(correct, len(label_pairs)))
    print('Precision:', _ratio(true_pos, predicted_pos))
    print('Recall:', _ratio(true_pos, actual_pos))
    print('F1:', _ratio(2 * true_pos, actual_pos + predicted_pos))
    print('Specificity:', _ratio(true_neg, actual_neg))

## 保存模型和提取器

In [7]:
from joblib import dump
# Persist the trained classifier first, then the fitted vectorizer.
for artifact, target_path in ((clf, 'model.joblib'), (vectorizer, 'vectorizer.joblib')):
    dump(artifact, target_path)


['vectorizer.joblib']