[Notebook of task](https://github.com/DistributedSystemsGroup/Algorithmic-Machine-Learning/blob/master/Challenges/Anomaly_Detection/anomaly_detection_challenge.ipynb)

### Articles to read about rule-based systems

- [RIPPER/JRIP](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.107.2612&rep=rep1&type=pdf)
- [Bayesian Rule Sets](http://jmlr.org/papers/volume18/16-003/16-003.pdf)
- [Scalable Bayesian Rule Lists](https://arxiv.org/pdf/1602.08610.pdf)

In [1]:
import os
import sys
import re
import random
import matplotlib
import implicit
import warnings
# For data manipulation
import pandas as pd
import numpy as np
# For visualization
import matplotlib.pyplot as plt
import seaborn as sns
# For prediction
from tqdm import tqdm

# Directory that holds the challenge dataset (data.csv is read from here).
# The location can be overridden with the ANOMALY_DATA_DIR environment
# variable so the notebook also runs outside the original cluster layout;
# when the variable is unset the original path is used unchanged.
# os.path.join(..., "") guarantees a trailing separator, because later cells
# build file paths with plain string concatenation (basepath + 'data.csv').
basepath = os.path.join(os.environ.get("ANOMALY_DATA_DIR", "/mnt/datasets/anomaly/"), "")

In [2]:
# The eight Check* flags are the label columns; they are the last fields of each record.
labels = ["Check1", "Check2", "Check3", "Check4", "Check5", "Check6", "Check7", "Check8"]

# Column names in file order: identifier and measurement columns first, labels last.
column_names = [
    "SessionNumber",
    "SystemID",
    "Date",
    "HighPriorityAlerts",
    "Dumps",
    "CleanupOOMDumps",
    "CompositeOOMDums",
    "IndexServerRestarts",
    "NameServerRestarts",
    "XSEngineRestarts",
    "PreprocessorRestarts",
    "DaemonRestarts",
    "StatisticsServerRestarts",
    "CPU",
    "PhysMEM",
    "InstanceMEM",
    "TablesAllocation",
    "IndexServerAllocationLimit",
    "ColumnUnloads",
    "DeltaSize",
    "MergeErrors",
    "BlockingPhaseSec",
    "Disk",
    "LargestTableSize",
    "LargestPartitionSize",
    "DiagnosisFiles",
    "DiagnosisFilesSize",
    "DaysWithSuccessfulDataBackups",
    "DaysWithSuccessfulLogBackups",
    "DaysWithFailedDataBackups",
    "DaysWithFailedfulLogBackups",
    "MinDailyNumberOfSuccessfulDataBackups",
    "MinDailyNumberOfSuccessfulLogBackups",
    "MaxDailyNumberOfFailedDataBackups",
    "MaxDailyNumberOfFailedLogBackups",
    "LogSegmentChange",
] + labels

# The file is semicolon-separated and has no header row, so names are supplied here.
dataDF = pd.read_csv(basepath + 'data.csv', sep=";", names=column_names, header=None)

# Frame without the label columns.
puredataDF = dataDF.drop(columns=labels)

# Frame with only the label columns, kept separately for exploration.
anomaliesDF = dataDF.filter(items=labels)

# Share of missing values per label, in percent (fraction rounded to 4 decimals, then scaled).
print(anomaliesDF.isna().mean().round(4).mul(100))

# Copy of the labels in which every missing value is replaced by 0.
anomaliesDF_withoutNA = anomaliesDF.fillna(value=0)

Check1     8.54
Check2     8.53
Check3     7.86
Check4    12.77
Check5    12.21
Check6     2.57
Check7    12.45
Check8     0.02
dtype: float64


In [3]:
# Summary statistics of the raw label columns; missing values are still present in this frame.
anomaliesDF.describe()

Unnamed: 0,Check1,Check2,Check3,Check4,Check5,Check6,Check7,Check8
count,262520.0,262545.0,264463.0,250384.0,251997.0,279647.0,251309.0,286979.0
mean,0.006232,0.033381,0.030602,0.099108,0.012222,0.28812,0.030592,0.010171
std,0.078696,0.17963,0.172236,0.298807,0.109877,0.452888,0.172209,0.10034
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# Summary statistics of the label columns after missing values were replaced by 0.
anomaliesDF_withoutNA.describe()

Unnamed: 0,Check1,Check2,Check3,Check4,Check5,Check6,Check7,Check8
count,287031.0,287031.0,287031.0,287031.0,287031.0,287031.0,287031.0,287031.0
mean,0.0057,0.030533,0.028196,0.086454,0.010731,0.280708,0.026785,0.01017
std,0.075281,0.17205,0.165531,0.281034,0.103031,0.449346,0.161454,0.100331
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
# Percentage of missing values (NA) in each non-label column of puredataDF
print(puredataDF.isna().mean().round(4) * 100)

SessionNumber                             0.00
SystemID                                  0.00
Date                                      0.00
HighPriorityAlerts                        0.00
Dumps                                     0.00
CleanupOOMDumps                           0.00
CompositeOOMDums                          0.00
IndexServerRestarts                       0.00
NameServerRestarts                        0.00
XSEngineRestarts                          0.00
PreprocessorRestarts                      0.00
DaemonRestarts                            0.00
StatisticsServerRestarts                  0.00
CPU                                       8.78
PhysMEM                                   7.17
InstanceMEM                               7.71
TablesAllocation                          0.80
IndexServerAllocationLimit                9.21
ColumnUnloads                             0.00
DeltaSize                                 0.07
MergeErrors                               2.69
BlockingPhase