[Notebook of task](https://github.com/DistributedSystemsGroup/Algorithmic-Machine-Learning/blob/master/Challenges/Anomaly_Detection/anomaly_detection_challenge.ipynb)

In [2]:
# Install pysbrl (scalable Bayesian rule lists) and the `fim` package installed alongside it
!pip3 install --user 'fim'
!pip3 install --user 'pysbrl'

Collecting fim
[?25l  Downloading https://files.pythonhosted.org/packages/cd/a8/66fbb303236eb7e4caa63096814aa2675073f20aee95104920636af84a7e/fim-6.27.tar.gz (343kB)
[K    100% |################################| 348kB 1.2MB/s 
[?25hBuilding wheels for collected packages: fim
  Running setup.py bdist_wheel for fim ... [?25ldone
[?25h  Stored in directory: /mnt/workspace/.cache/pip/wheels/5c/1c/94/b96c6b9a2eb858e26a675f86a908abfa53a593185b1c058823
Successfully built fim
Installing collected packages: fim
Successfully installed fim-6.27
[33mYou are using pip version 18.0, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 18.0, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [1]:
# Elementary
import os
import sys
import re
import random
import matplotlib
import implicit
import warnings
from tqdm import tqdm

# For elementary data manipulation
import pandas as pd
import numpy as np

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns

# For scalable bayesian rule lists
import pysbrl

# Column names and the names of the label columns come from the local `names` module
from names import column_names, labels

# Read the header-less, semicolon-separated data file and attach the column names
basepath = "/mnt/datasets/anomaly/"
dataDF = pd.read_csv(
    basepath + 'data.csv',
    sep=";",
    header=None,
    names=column_names,
)

# Column-wise split: everything except the label columns / only the label columns
pure_dataDF = dataDF.drop(columns=labels)
anomaliesDF = dataDF.filter(items=labels, axis=1)

Useful articles about stratified shuffle split:
* [StratifiedShuffleSplit](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html)
* [Visualizing cross-validation behavior in scikit-learn](https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html#sphx-glr-auto-examples-model-selection-plot-cv-indices-py)
* [User guide: cross validation](https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation)

# Pre-processing

## Temporary error handling in dataDF

In [2]:
# Two alternative encodings for missing labels:
#    0 -> a missing label is considered "not an anomaly"
#   -1 -> a missing label is kept as a separate class for the classifier
anomaliesDF_with_zerNA = anomaliesDF.fillna(value=0)
anomaliesDF_with_negNA = anomaliesDF.fillna(value=-1)

# Missing measurements are replaced by -1 as well
pure_dataDF_with_negNA = pure_dataDF.fillna(value=-1)

# Feature matrix: all measurement columns except 'Date'
X_t = pure_dataDF_with_negNA.drop(columns='Date')

# Sanity checks: no NaN may remain and every value has to be finite
print('Any nan values   :', X_t.isnull().values.any())
print('All values finite:', np.isfinite(X_t.values).all())

Any nan values   : False
All values finite: True


In [3]:
# Removal: these columns are dropped from the feature matrix
X_temp = X_t.drop(['CleanupOOMDumps', 'PreprocessorRestarts', 'DaemonRestarts'], axis=1)

# Direct recasting: columns that are cast to int64 as they are
direct_recast = ['Dumps', 'CompositeOOMDums', 'DeltaSize', 'MergeErrors', 'BlockingPhaseSec',
                 'LargestTableSize', 'LargestPartitionSize', 'DiagnosisFiles', 'DiagnosisFilesSize',
                 'LogSegmentChange']
for column in direct_recast:
    # errors='ignore' keeps the column unchanged if the cast is not possible
    X_temp[column] = X_temp[column].astype(np.int64, errors='ignore')

# Format recasting: columns that are scaled by 100 before being cast to int64
format_recast = ['CPU', 'PhysMEM', 'InstanceMEM', 'TablesAllocation', 'IndexServerAllocationLimit',
                 'Disk']
for column in format_recast:
    # Round before casting: a plain float -> int cast truncates toward zero, so a
    # product such as 1.15 * 100 == 114.99999999999999 would silently become 114.
    # NOTE: the -1 placeholder used for missing values becomes -100 in these columns.
    scaled = (100 * X_temp[column]).round()
    X_temp[column] = scaled.astype(np.int64, errors='ignore')

In [4]:
#print(X_temp.dtypes)

## Remove rows with a NaN label in the selected anomaly column

In [5]:
def create_binary_classification(puredataDF, anomaliesDF, label):
    """Build a single-label data set, discarding every row whose label is missing.

    Parameters
    ----------
    puredataDF : pandas.DataFrame
        Feature rows. Rows are removed by index label, so it has to contain the
        index labels of ``anomaliesDF`` whose label value is NaN.
    anomaliesDF : pandas.DataFrame
        Frame holding the label columns.
    label : str
        Name of the column of ``anomaliesDF`` that is used as the target.

    Returns
    -------
    X_mod : pandas.DataFrame
        ``puredataDF`` without the rows whose label is NaN, with a fresh 0..n-1 index.
    y_mod : pandas.DataFrame
        One-column frame (column name ``label``) holding the remaining labels as
        int64, with a fresh 0..n-1 index.
    """
    y = anomaliesDF[label]
    indices_nan_labels = list(y.index[y.isnull()])

    X_mod = puredataDF.drop(indices_nan_labels, axis=0)
    y_mod = y.drop(indices_nan_labels)
    y_mod = y_mod.astype(np.int64, errors='raise')

    # drop=True throws the old index away directly. Materialising it as a column and
    # dropping a column called 'index' afterwards fails for a named index and removes
    # real data when the features themselves contain a column named 'index'.
    X_mod = X_mod.reset_index(drop=True)
    # Callers index the labels with two axes, so hand them back as a one-column frame
    y_mod = y_mod.reset_index(drop=True).to_frame()
    return X_mod, y_mod

# Build the single-label data set for the 'Check1' column
X, y = create_binary_classification(puredataDF=X_temp,
                                    anomaliesDF=anomaliesDF,
                                    label='Check1')

# Shapes before and after the rows with a missing label were removed
print('X_temp.shape:', X_temp.shape, '\t', 'X.shape:', X.shape)

X_temp.shape: (287031, 32) 	 X.shape: (262520, 32)


In [6]:
# Show the dtype of the label frame returned by create_binary_classification
y.dtypes

Check1    int64
dtype: object

## Stratified-shuffle-split function

This function splits a given dataframe X and the corresponding label frame y (a single column) into train, validation and test sets such that the distribution of the labels is retained in each of the resulting sets.

In [7]:
from sklearn.model_selection import StratifiedShuffleSplit

# Splits a SINGLE-LABEL data set into 0.6 train, 0.2 validation and 0.2 test
def train_val_test_split(X, y, seed):
    """Stratified 60/20/20 split of ``X`` and ``y``.

    The data is first cut into 80 % / 20 % (train+validation vs. test); the 80 %
    part is then cut into 75 % / 25 %, i.e. 60 % / 20 % of the whole data set.
    Both cuts use ``seed`` as their random state.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows.
    y : pandas.DataFrame
        Labels used for stratification; indexed positionally along two axes,
        so it has to be a frame rather than a Series.
    seed : int
        Random state passed to both splitters.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test, X_big, y_big)`` where
        the last two are the undivided 80 % train+validation part.
    """
    # First cut: train+validation part vs. test part
    outer_splitter = StratifiedShuffleSplit(n_splits=1,
                                            test_size=0.2,
                                            train_size=0.8,
                                            random_state=seed)
    # Second cut: applied to the train+validation part only
    inner_splitter = StratifiedShuffleSplit(n_splits=1,
                                            test_size=0.25,
                                            train_size=0.75,
                                            random_state=seed)

    # n_splits=1, so each splitter yields exactly one pair of index arrays
    big_idx, test_idx = next(outer_splitter.split(X, y))
    X_big, y_big = X.iloc[big_idx, :], y.iloc[big_idx, :]
    X_test, y_test = X.iloc[test_idx, :], y.iloc[test_idx, :]

    train_idx, val_idx = next(inner_splitter.split(X_big, y_big))
    X_train, y_train = X_big.iloc[train_idx, :], y_big.iloc[train_idx, :]
    X_val, y_val = X_big.iloc[val_idx, :], y_big.iloc[val_idx, :]

    return X_train, y_train, X_val, y_val, X_test, y_test, X_big, y_big

# TEST ---------------------------------------------------------------------------------------
# Fixed seed, so the same split is produced on every run
seed = 42
(X_train, y_train,
 X_val, y_val,
 X_test, y_test,
 X_train_big, y_train_big) = train_val_test_split(X=X, y=y, seed=seed)

# Uncomment to inspect the label statistics of each part
#print('####### Y TRAIN #######\n', y_train.describe())
#print('\n######## Y VAL ########\n', y_val.describe())
#print('\n####### Y TEST ########\n', y_test.describe())

## Scalable Bayesian Rule Lists
### [github repo](https://github.com/myaooo/pysbrl)

### Problems
* WHEN USING DATAFRAME
    * All elements after x[y == label] become NaN
* WHEN USING NUMPY ARRAY

In [8]:
class HiddenPrints:
    """Context manager that silences everything written to ``sys.stdout``.

    On entry ``sys.stdout`` is replaced by a handle on ``os.devnull``; on exit the
    stream that was active on entry is put back and the devnull handle is closed.
    Only output that goes through ``sys.stdout`` is affected. Exceptions raised
    inside the ``with`` block are not suppressed.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        # Keep a reference of our own, so that __exit__ closes exactly the handle
        # opened here even if sys.stdout was replaced again inside the block.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore first, so stdout is usable again even if closing fails
        sys.stdout = self._original_stdout
        self._devnull.close()

In [9]:
from time import time

# categorical2pysbrl_data prints a lot of uninteresting data to stdout, so the call
# below is wrapped in HiddenPrints to keep the cell output readable.

# Save the training data in the format the SBRL library requires and time the conversion
t0 = time()
# Convert to numpy ndarrays: the whole feature frame and the first column of the label frame
_X = X_train_big.values
_y = y_train_big.values[:,0]

with HiddenPrints():
    # The two file names given here are the ones passed to pysbrl.train_sbrl afterwards.
    # NOTE(review): method / supp / zmin / zmax presumably select the itemset-mining
    # algorithm, its minimum support and the minimum / maximum number of conditions
    # per rule - confirm against the pysbrl documentation.
    pysbrl.utils.categorical2pysbrl_data(_X,
                                        _y,
                                        'X_train_big.out',
                                        'y_train_big.label',
                                        method='eclat',
                                        supp=0.05,
                                        zmin=1,
                                        zmax=3)
t1 = time()
print('Converting dataframe took %.2f seconds' % (t1 - t0))
# Alternative message for runs in which the (one-off) conversion is skipped:
# print('We dont need to convert every time, only once.\nLast time converting dataframe took 61.00 seconds')

Converting dataframe took 61.00 seconds


In [10]:
# Start of the training-time measurement
t0 = time()

# Using SBRL Library from https://github.com/myaooo/pysbrl
# Trains on the two files written by the conversion step. As used further down:
#   rule_ids     - indices into rule_strings, one per rule of the resulting list
#   outputs      - one row of two probabilities per rule
#   rule_strings - textual form of the candidate rules
# NOTE(review): the positional 20.0 is presumably the prior on the expected rule-list
# length (lambda) - confirm against the pysbrl signature.
rule_ids, outputs, rule_strings = pysbrl.train_sbrl("X_train_big.out", 
                                                    "y_train_big.label", 
                                                    20.0, 
                                                    eta=2.0, 
                                                    max_iters=2000) 
                                                    # Further arguments that were tried and are currently disabled:
                                                    #nchain=10, 
                                                    #alphas=[1,1])

print('Training the SBRL based model took %.2f seconds' % (time() - t0))

Training the SBRL based model took 31.40 seconds


In [15]:
# rule_ids holds indices into rule_strings; look up the text of every selected rule
rules = [rule_strings[i] for i in rule_ids]

# One rule per line
for rule in rules:
    print(rule)
    
# Number of rules in the list (this count includes the final 'default' entry)
print(len(rules))

{x8=0,x9=10000}
{x2=2,x30=0}
{x25=0,x2=0}
{x2=1,x7=0,x8=0}
{x2=2,x5=0,x7=0}
{x28=1,x29=0,x7=0}
{x21=-1,x22=-1}
{x16=0,x23=6,x29=0}
{x1=73,x25=0,x3=0}
{x2=3,x6=0}
{x2=5,x8=0}
{x31=-1,x5=0}
{x14=0,x2=6,x8=0}
{x2=4,x8=0}
{x23=1,x29=0,x5=0}
{x16=0,x26=0}
default
17


In [32]:
# One row per rule; translate_output_to_text uses the two columns as P(label = 0) and P(label = 1)
print(outputs)

[[1.00000000e-01 9.00000000e-01]
 [9.96979817e-01 3.02018295e-03]
 [9.99823702e-01 1.76298438e-04]
 [9.98553485e-01 1.44651510e-03]
 [9.99150815e-01 8.49184783e-04]
 [9.65008201e-01 3.49917988e-02]
 [9.97164218e-01 2.83578242e-03]
 [9.79787234e-01 2.02127660e-02]
 [9.36708861e-01 6.32911392e-02]
 [9.96690528e-01 3.30947156e-03]
 [9.91091487e-01 8.90851344e-03]
 [9.88790104e-01 1.12098956e-02]
 [9.91901408e-01 8.09859155e-03]
 [9.95512505e-01 4.48749521e-03]
 [9.15697674e-01 8.43023256e-02]
 [9.68692022e-01 3.13079777e-02]
 [9.78089334e-01 2.19106665e-02]]


In [28]:
from IPython.display import display, Markdown

def translate_output_to_text(rule_ids, outputs, rule_strings, filename, dataDF, label):
    """Render a trained rule list as a Markdown table, save it and display it.

    Each condition ``x<k>=<value>`` of a rule is rewritten as
    ``<name of column k of dataDF>=<value>``; the conditions of one rule are
    joined with ' and '. The table has one row per rule plus a final 'Default'
    row and shows the two probabilities of the matching row of ``outputs``.

    Parameters
    ----------
    rule_ids : sequence of int
        Indices into ``rule_strings``; the last one has to refer to the
        'default' rule, which is not parsed.
    outputs : sequence of pairs
        One ``(P(label = 0), P(label = 1))`` pair per entry of ``rule_ids``.
    rule_strings : sequence of str
        Rules in the form ``'{x8=0,x9=10000}'``.
    filename : str
        File the Markdown table is written to (overwritten if it exists).
    dataDF : pandas.DataFrame
        Frame whose column order matches the ``x<k>`` numbering of the rules.
    label : str
        Label name shown in the table header.
    """
    rules = [rule_strings[i] for i in rule_ids]

    # Translate 'x<k>=<value>' into '<column name>=<value>' for every rule but the
    # last one, which is only the literal 'default'.
    rules_with_column_names = []
    for rule in rules[:-1]:
        rule_with_column_names = []
        for sub_rule in rule[1:-1].split(','):  # [1:-1] strips the surrounding braces
            parts = sub_rule.split('=')
            col_number = int(parts[0][1:])      # [1:] strips the leading 'x'
            col_name = dataDF.columns[col_number]
            rule_with_column_names.append(col_name + '=' + parts[1])
        rules_with_column_names.append(rule_with_column_names)

    # Build the whole table in memory first, so that a failure while formatting
    # does not leave a half-written file behind.
    table_lines = [
        '| Rule | $P(%s = 0)$ | $P(%s = 1)$ |\n' % (label, label),
        '|:-----|:----------------|:----------------|\n',
    ]

    separator = ' and '
    for i, output in enumerate(outputs[:-1]):
        rule_to_write = separator.join(rules_with_column_names[i])
        table_lines.append('| ' + rule_to_write + ' | %.5f | %.5f |' % (output[0], output[1]) + '\n')

    default_prob = outputs[-1]
    table_lines.append('| Default | %.5f | %.5f |\n' % (default_prob[0], default_prob[1]))
    content = ''.join(table_lines)

    # The context manager closes the file even if writing raises
    with open(filename, 'w') as f:
        f.write(content)

    display(Markdown(content))
    
# Render the trained rule list for 'Check1' as a Markdown table. The column names are
# taken from X_train_big, the frame whose values were converted for training.
translate_output_to_text(rule_ids=rule_ids, 
                         outputs=outputs, 
                         rule_strings=rule_strings, 
                         filename='created_rules.md', 
                         dataDF=X_train_big,
                         label='Check1')


| Rule | $P(Check1 = 0)$ | $P(Check1 = 1)$ |
|:-----|:----------------|:----------------|
| MinDailyNumberOfSuccessfulDataBackups=1 and NameServerRestarts=0 and CPU=10000 | 0.10256 | 0.89744 |
| DaysWithFailedfulLogBackups=0 and HighPriorityAlerts=5 | 0.99012 | 0.00988 |
| HighPriorityAlerts=0 and NameServerRestarts=0 | 0.99982 | 0.00018 |
| BlockingPhaseSec=2 and Dumps=0 and StatisticsServerRestarts=0 | 0.99938 | 0.00062 |
| DaysWithSuccessfulDataBackups=7 and HighPriorityAlerts=3 and NameServerRestarts=0 | 0.99571 | 0.00429 |
| DaysWithSuccessfulLogBackups=11 and MinDailyNumberOfSuccessfulDataBackups=1 and StatisticsServerRestarts=0 | 0.98615 | 0.01385 |
| HighPriorityAlerts=1 and IndexServerRestarts=0 | 0.99884 | 0.00116 |
| HighPriorityAlerts=2 | 0.99748 | 0.00252 |
| MaxDailyNumberOfFailedDataBackups=0 and CPU=10000 | 0.14286 | 0.85714 |
| MinDailyNumberOfSuccessfulLogBackups=1 and MaxDailyNumberOfFailedLogBackups=0 | 0.96375 | 0.03625 |
| MergeErrors=1 and HighPriorityAlerts=3 and XSEngineRestarts=0 | 0.99814 | 0.00186 |
| SystemID=73 and DaysWithFailedfulLogBackups=0 | 0.91045 | 0.08955 |
| HighPriorityAlerts=4 | 0.99542 | 0.00458 |
| ColumnUnloads=0 and MaxDailyNumberOfFailedDataBackups=1 and IndexServerRestarts=0 | 0.99454 | 0.00546 |
| DiagnosisFiles=-1 and DiagnosisFilesSize=-1 | 0.99524 | 0.00476 |
| DaysWithSuccessfulDataBackups=6 and MaxDailyNumberOfFailedLogBackups=0 and StatisticsServerRestarts=0 | 0.97845 | 0.02155 |
| HighPriorityAlerts=3 and CompositeOOMDums=0 | 0.99581 | 0.00419 |
| HighPriorityAlerts=5 and Dumps=0 | 0.99084 | 0.00916 |
| ColumnUnloads=0 and HighPriorityAlerts=6 and StatisticsServerRestarts=0 | 0.99028 | 0.00972 |
| DaysWithSuccessfulDataBackups=1 and DaysWithFailedDataBackups=0 and Dumps=0 | 0.91124 | 0.08876 |
| BlockingPhaseSec=-1 and DaysWithSuccessfulDataBackups=7 | 0.98236 | 0.01764 |
| Default | 0.97377 | 0.02623 |


In [25]:
def test_function(rule_ids, outputs, rule_strings, dataDF, label):
    rules = [rule_strings[i] for i in rule_ids]
    split_rules = [rule[1:-1].split(',') for rule in rules[:-1]] # LAST RULE IS ONLY 'default'
    
    rules_with_column_names = []
    for num_rule in split_rules:
        print(num_rule)
        rule_with_column_names = []
        for sub_rule in num_rule:
            col_number = int(sub_rule.split('=')[0][1:])
            col_name = dataDF.columns[col_number]
            new_sub_rule = col_name + '=' + sub_rule.split('=')[1]
            rule_with_column_names.append(new_sub_rule)
        rules_with_column_names.append(rule_with_column_names)
    
            

# Debug run of the rule translation on the trained rule list
test_function(rule_ids=rule_ids, outputs=outputs, rule_strings=rule_strings, dataDF=X_train_big, label='Check1')

['x27=1', 'x6=0', 'x9=10000']
['x26=0', 'x2=5']
['x2=0', 'x6=0']
['x17=2', 'x3=0', 'x8=0']
['x23=7', 'x2=3', 'x6=0']
['x24=11', 'x27=1', 'x8=0']
['x2=1', 'x5=0']
['x2=2']
['x29=0', 'x9=10000']
['x28=1', 'x30=0']
['x16=1', 'x2=3', 'x7=0']
['x1=73', 'x26=0']
['x2=4']
['x14=0', 'x29=1', 'x5=0']
['x21=-1', 'x22=-1']
['x23=6', 'x30=0', 'x8=0']
['x2=3', 'x4=0']
['x2=5', 'x3=0']
['x14=0', 'x2=6', 'x8=0']
['x23=1', 'x25=0', 'x3=0']
['x17=-1', 'x23=7']
['MinDailyNumberOfSuccessfulDataBackups=1', 'NameServerRestarts=0', 'CPU=10000']
['DaysWithFailedfulLogBackups=0', 'HighPriorityAlerts=5']
['HighPriorityAlerts=0', 'NameServerRestarts=0']
['BlockingPhaseSec=2', 'Dumps=0', 'StatisticsServerRestarts=0']
['DaysWithSuccessfulDataBackups=7', 'HighPriorityAlerts=3', 'NameServerRestarts=0']
['DaysWithSuccessfulLogBackups=11', 'MinDailyNumberOfSuccessfulDataBackups=1', 'StatisticsServerRestarts=0']
['HighPriorityAlerts=1', 'IndexServerRestarts=0']
['HighPriorityAlerts=2']
['MaxDailyNumberOfFailedDataBack