In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
sys.path.append('../')

import argparse
import numpy as np
import pandas as pd
import random
from importlib import reload  
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.utils import shuffle

from loglizer.models import InvariantsMiner, PCA, IsolationForest, OneClassSVM, LogClustering, LR, SVM
from loglizer import dataloader, preprocessing
from loglizer.utils import metrics

In [2]:
# Dataset location settings; the load cell passes these positionally to
# dataloader.load_data in the order (directory, middle_dir, log_file).
output_dir = "../output/custom/"  # correctly spelled name for the dataset directory
ouput_dir = output_dir  # legacy misspelled alias, kept because the load cell still references it
middle_dir = ""  # presumably an optional sub-directory under the output dir; empty here -- confirm against dataloader
log_file = "comb"  # presumably the name of the log dataset to load -- confirm against dataloader

<!-- # Produce event templates from train test dataset -->

# Load and split train/test data

In [3]:
# Load the dataset described by the config cell. load_data hands back two
# (inputs, labels) pairs: the first is unpacked as the training split and the
# second as the test split.
train_split, test_split = dataloader.load_data(ouput_dir, middle_dir, log_file, is_mapping=True)
x_train, y_train = train_split
x_test, y_test = test_split

Train normal size: 1881
Train abnormal size: 4
Total logkey(exclude 0:UNK) 226
Test normal size: 2822
Test abnormal size: 7
num_unk_event in test data: 0


  train = np.array(train).reshape(-1,1)
  test_normal = np.array(test_normal).reshape(-1,1)
  abnormal = np.array(abnormal).reshape(-1,1)


In [4]:
# Turn the loaded event sequences into feature matrices.
# The extractor is fitted on the training split only; the same fitted
# extractor is then applied to the test split.
feature_extractor = preprocessing.FeatureExtractor()
# NOTE(review): x_train / x_test are rebound to their transformed versions, so
# this cell is not idempotent -- re-running it without first re-running the
# load cell feeds already-transformed data back into the extractor.
x_train = feature_extractor.fit_transform(x_train)
x_test = feature_extractor.transform(x_test)

Train data shape: 1885-by-139

Test data shape: 2829-by-139



In [5]:
%%time
print("="*20 + " Model: PCA " + "="*20)
for th in np.arange(1):
    print("theshold", th)
    model = PCA(n_components=0.8, threshold=1, c_alpha = 1.9600)
    model.fit(x_train)
    print('Train validation:')
    precision, recall, f1 = model.evaluate(x_train, y_train)
    print('Test validation:')
    precision, recall, f1 = model.evaluate(x_test, y_test)

theshold 0
n_components: 2
Project matrix shape: 139-by-139
SPE threshold: 1

Train validation:
Confusion Matrix: TP: 4, FP: 1774, TN: 107, FN: 0
Precision: 0.225%, recall: 100.000%, F1-measure: 0.449%

Test validation:
Confusion Matrix: TP: 7, FP: 2672, TN: 150, FN: 0
Precision: 0.261%, recall: 100.000%, F1-measure: 0.521%

CPU times: user 244 ms, sys: 226 ms, total: 470 ms
Wall time: 483 ms


In [6]:
%%time
print("="*20 + " Model: IsolationForest " + "="*20)
model = IsolationForest(n_estimators=100, max_samples='auto', contamination='auto', random_state=19)
model.fit(x_train)
print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)
print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)

Train validation:


  f1 = 2 * precision * recall / (precision + recall)


Confusion Matrix: TP: 0, FP: 80, TN: 1801, FN: 4
Precision: 0.000, recall: 0.000, F1-measure: nan

Test validation:
Confusion Matrix: TP: 0, FP: 105, TN: 2717, FN: 7
Precision: 0.000, recall: 0.000, F1-measure: nan

CPU times: user 566 ms, sys: 1.37 ms, total: 567 ms
Wall time: 756 ms


  f1 = 2 * precision * recall / (precision + recall)


In [20]:
%%time
print("="*20 + " Model: one class SVM " + "="*20)
model = OneClassSVM(kernel='rbf')
model.fit(x_train, y_train)

print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)
print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)

Train validation:
Confusion Matrix: TP: 70, FP: 328, TN: 0, FN: 94
Precision: 17.588, recall: 42.683, F1-measure: 24.911

Test validation:
Confusion Matrix: TP: 171, FP: 494, TN: 0, FN: 222
Precision: 25.714, recall: 43.511, F1-measure: 32.325

CPU times: user 91 ms, sys: 0 ns, total: 91 ms
Wall time: 101 ms


In [21]:
%%time
print("="*20 + " Model: LogClustering " + "="*20)
max_dist = 0.3  # the threshold to stop the clustering process
anomaly_threshold = 0.3  # the threshold for anomaly detection
model = LogClustering(max_dist=max_dist, anomaly_threshold=anomaly_threshold)
model.fit(x_train[y_train == 0, :])  # Use only normal samples for training
print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)
print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)


Starting offline clustering...
Processed 328 instances.
Found 4 clusters offline.

Train validation:
Confusion Matrix: TP: 0, FP: 0, TN: 328, FN: 164
Precision: 0.000, recall: 0.000, F1-measure: nan

Test validation:
Confusion Matrix: TP: 0, FP: 0, TN: 494, FN: 393
Precision: 0.000, recall: 0.000, F1-measure: nan

CPU times: user 696 ms, sys: 0 ns, total: 696 ms
Wall time: 707 ms


  f1 = 2 * precision * recall / (precision + recall)
