In [2]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
sys.path.append('../')

import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn import svm

from loglizer.models import PCA, IsolationForest, LogClustering, OneClassSVM
from loglizer import dataloader, preprocessing
from loglizer.utils import metrics

In [3]:
# Location of the pre-split dataset consumed by dataloader.load_data.
ouput_dir = "../output/hdfs/"

# load_data returns a (train, test) pair; each element is an (x, y) pair.
train_split, test_split = dataloader.load_data(data_dir=ouput_dir)
x_train, y_train = train_split
x_test, y_test = test_split

# Fit the extractor on the training split, then apply that same fitted
# extractor to the test split so both splits go through one transformation.
feature_extractor = preprocessing.FeatureExtractor()
x_train = feature_extractor.fit_transform(x_train)
x_test = feature_extractor.transform(x_test)


  train = np.array(train).reshape(-1,1)
  test_normal = np.array(test_normal).reshape(-1,1)
  abnormal = np.array(abnormal).reshape(-1,1)


Train normal size: 167466
Train abnormal size: 6735
Test normal size: 390757
Test abnormal size: 10103
Train data shape: 174201-by-45

Test data shape: 400860-by-45



In [4]:
%%time
print("="*20 + " Model: PCA " + "="*20)
for th in np.arange(1):
    print("theshold", th)
    model = PCA(n_components=0.8, threshold=1, c_alpha = 1.9600)
    model.fit(x_train)
    print('Train validation:')
    precision, recall, f1 = model.evaluate(x_train, y_train)
    print('Test validation:')
    precision, recall, f1 = model.evaluate(x_test, y_test)

theshold 0
n_components: 1


Project matrix shape: 45-by-45
SPE threshold: 1

Train validation:
Confusion Matrix: TP: 6735, FP: 166839, TN: 627, FN: 0
Precision: 3.880%, recall: 100.000%, F1-measure: 7.471%

Test validation:
Confusion Matrix: TP: 10103, FP: 389175, TN: 1582, FN: 0
Precision: 2.530%, recall: 100.000%, F1-measure: 4.936%

CPU times: total: 766 ms
Wall time: 2.26 s


In [5]:
%%time
print("="*20 + " Model: IsolationForest " + "="*20)
model = IsolationForest(n_estimators=100, max_samples='auto', contamination='auto', random_state=19)
model.fit(x_train)
print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)
print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)

Train validation:
Confusion Matrix: TP: 6152, FP: 8055, TN: 159411, FN: 583
Precision: 43.303, recall: 91.344, F1-measure: 58.753

Test validation:
Confusion Matrix: TP: 9251, FP: 18737, TN: 372020, FN: 852
Precision: 33.053, recall: 91.567, F1-measure: 48.573

CPU times: total: 11.7 s
Wall time: 27.2 s


In [6]:
%%time
print("="*20 + " Model: one class SVM " + "="*20)
model = OneClassSVM(kernel='rbf')
model.fit(x_train, y_train)

print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)
print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)

Train validation:


In [None]:
# %%time
# Disabled experiment -- every line in this cell is commented out, so nothing
# here executes. It sets up a 5-fold grid search over kernel / gamma / nu for
# scikit-learn's OneClassSVM.
# NOTE(review): scoring="f1_micro" compares predictions with y_train, but
# sklearn's OneClassSVM.predict returns +1 / -1, while this notebook appears to
# encode normal samples as 0 (see `y_train == 0` in the LogClustering cell).
# Unless the encodings are aligned the score is not meaningful -- TODO confirm
# before re-enabling.
# print("="*20 + " Model: one class SVM " + "="*20)

# nus = [0.001, 0.01, 0.1, 1]
# gammas = [0.001, 0.01, 0.1, 1]
# tuned_parameters = {'kernel' : ['rbf','poly','linear','sigmoid'], 'gamma' : gammas, 'nu': nus}

# ocsvm = svm.OneClassSVM()
# model = GridSearchCV(ocsvm, tuned_parameters, cv=5, scoring="f1_micro")

# model.fit(x_train, y_train.astype(int))

# # print('Train validation:')
# # precision, recall, f1 = model.predict(x_train, y_train.astype(int))
# # print('Test validation:')
# # precision, recall, f1 = model.predict(x_test, y_test.astype(int))

CPU times: user 2min 50s, sys: 3.56 s, total: 2min 54s
Wall time: 2min 54s


GridSearchCV(cv=5, estimator=OneClassSVM(),
             param_grid={'gamma': [0.001, 0.01, 0.1, 1],
                         'kernel': ['rbf', 'poly', 'linear', 'sigmoid'],
                         'nu': [0.001, 0.01, 0.1, 1]},
             scoring='f1_micro')

In [None]:
# Disabled evaluation -- every line in this cell is commented out, so nothing
# here executes. For each split it predicts with `model`, passes the
# predictions and true labels to `metrics` (imported from loglizer.utils), and
# prints the resulting precision / recall / F1.
# NOTE(review): presumably `model` is meant to be the grid-search estimator
# from the preceding disabled cell -- confirm before re-enabling, since `model`
# is rebound by several other cells.
# print('Train validation:')
# y_eval = model.predict(x_train)
# precision, recall, f1 = metrics(y_eval, y_train)
# print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
    
# print('Test validation:')
# y_pred = model.predict(x_test)
# precision, recall, f1 = metrics(y_pred, y_test)
# print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))


Train validation:
Confusion Matrix: TP: 1543, FP: 5000, TN: 0, FN: 957
Precision: 23.582, recall: 61.720, F1-measure: 34.126

Test validation:
Confusion Matrix: TP: 9114, FP: 553223, TN: 0, FN: 5224
Precision: 1.621, recall: 63.565, F1-measure: 3.161



In [None]:
%%time
print("="*20 + " Model: LogClustering " + "="*20)
max_dist = 0.3  # the threshold to stop the clustering process
anomaly_threshold = 0.3  # the threshold for anomaly detection
model = LogClustering(max_dist=max_dist, anomaly_threshold=anomaly_threshold)
model.fit(x_train[y_train == 0, :])  # Use only normal samples for training
print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)
print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)




Starting offline clustering...
Processed 1000 instances.
Found 4 clusters offline.

Starting online clustering...
Processed 2000 instances.
Processed 4000 instances.
Processed 5000 instances.
Found 4 clusters online.

Train validation:
Confusion Matrix: TP: 960, FP: 0, TN: 5000, FN: 1540
Precision: 100.000, recall: 38.400, F1-measure: 55.491

Test validation:
Confusion Matrix: TP: 5251, FP: 40, TN: 553183, FN: 9087
Precision: 99.244, recall: 36.623, F1-measure: 53.502

CPU times: user 26.9 s, sys: 4.13 ms, total: 27 s
Wall time: 26.9 s
