[Notebook of task](https://github.com/DistributedSystemsGroup/Algorithmic-Machine-Learning/blob/master/Challenges/Anomaly_Detection/anomaly_detection_challenge.ipynb)

In [2]:
# Package for scalable bayesian rule lists
!pip3 install --user 'fim'
!pip3 install --user 'pysbrl'

Collecting fim
[?25l  Downloading https://files.pythonhosted.org/packages/cd/a8/66fbb303236eb7e4caa63096814aa2675073f20aee95104920636af84a7e/fim-6.27.tar.gz (343kB)
[K    100% |################################| 348kB 1.2MB/s 
[?25hBuilding wheels for collected packages: fim
  Running setup.py bdist_wheel for fim ... [?25ldone
[?25h  Stored in directory: /mnt/workspace/.cache/pip/wheels/5c/1c/94/b96c6b9a2eb858e26a675f86a908abfa53a593185b1c058823
Successfully built fim
Installing collected packages: fim
Successfully installed fim-6.27
[33mYou are using pip version 18.0, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 18.0, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [1]:
# Elementary
import os
import sys
import re
import random
import matplotlib
import implicit
import warnings
from tqdm import tqdm

# For elementary data manipulation
import pandas as pd
import numpy as np

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns

# For scalable bayesian rule lists
import pysbrl

# Import dataframe and cast names
from names import column_names, labels
basepath = "/mnt/datasets/anomaly/"
dataDF = pd.read_csv(basepath + 'data.csv', delimiter=";", header=None, names=column_names)
pure_dataDF = dataDF.drop(labels, axis=1)
anomaliesDF = dataDF.filter(labels, axis=1)

Nyttige artikler om stratified shuffle split
* [StratifiedShuffleSplit](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html)
* [Visualizing cross-validation behavior in scikit-learn](https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html#sphx-glr-auto-examples-model-selection-plot-cv-indices-py)
* [User guide: cross validation](https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation)

# Pre-processing

## Temporary error handling in dataDF

In [26]:
anomaliesDF_with_zerNA = anomaliesDF.fillna(0) # Fill NaNs with 0s, considering them as "not an anomaly"
anomaliesDF_with_negNA = anomaliesDF.fillna(-1) # Fill NaNs with -1 considering them as a separate class for the classifier.
pure_dataDF_with_negNA = pure_dataDF.fillna(-1)

X_temp = pure_dataDF_with_negNA.drop('Date', axis=1)
print('Any nan values   :', X_temp.isnull().any().any())
print('All values finite:', np.isfinite(np.array(X_temp)).all())

Any nan values   : False
All values finite: True


In [27]:
# Removal
X_temp.drop(['CleanupOOMDumps', 'PreprocessorRestarts', 'DaemonRestarts'], axis=1, inplace=True)

# Direct recasting
direct_recast = ['Dumps', 'CompositeOOMDums', 'DeltaSize', 'MergeErrors', 'BlockingPhaseSec', 
                 'LargestTableSize', 'LargestPartitionSize', 'DiagnosisFiles', 'DiagnosisFilesSize', 
                 'LogSegmentChange']
for column in direct_recast:
    X_temp[column] = X_temp[column].astype(np.int64, errors='ignore')

#Format recasting
format_recast = ['CPU', 'PhysMEM', 'InstanceMEM', 'TablesAllocation', 'IndexServerAllocationLimit', 
                    'Disk']
for column in format_recast:
    X_temp[column] = 100*X_temp[column]
    X_temp[column] = X_temp[column].astype(np.int64, errors='ignore')

In [28]:
print(X_temp.dtypes)

SessionNumber                            int64
SystemID                                 int64
HighPriorityAlerts                       int64
Dumps                                    int64
CompositeOOMDums                         int64
IndexServerRestarts                      int64
NameServerRestarts                       int64
XSEngineRestarts                         int64
StatisticsServerRestarts                 int64
CPU                                      int64
PhysMEM                                  int64
InstanceMEM                              int64
TablesAllocation                         int64
IndexServerAllocationLimit               int64
ColumnUnloads                            int64
DeltaSize                                int64
MergeErrors                              int64
BlockingPhaseSec                         int64
Disk                                     int64
LargestTableSize                         int64
LargestPartitionSize                     int64
DiagnosisFile

## Remove data corresponding to one NaN column

In [30]:
def create_binary_classification(puredataDF, anomaliesDF, label):
    y = anomaliesDF[label]
    indices_nan_labels = list(y.index[y.isnull()])
    
    X_mod = puredataDF.drop(indices_nan_labels, axis=0)
    y_mod = y.drop(indices_nan_labels)
    y_mod = y_mod.astype(np.int64, errors='raise')
    
    X_mod = X_mod.reset_index().drop('index', axis=1)
    y_mod = y_mod.reset_index().drop('index', axis=1)
    return X_mod, y_mod

X, y = create_binary_classification(X_temp, anomaliesDF, 'Check1')

print('X_temp.shape:', X_temp.shape, '\t', 'X.shape:', X.shape)

X_temp.shape: (287031, 32) 	 X.shape: (262520, 32)


In [31]:
y.dtypes

Check1    int64
dtype: object

## Stratified-shuffle-split function

This function will split the a given dataframe X, and corresponding label-series y (only one column), into train, validation and test sets such that the distribution of the different labels is retained in the different data sets.

In [32]:
from sklearn.model_selection import StratifiedShuffleSplit

# This function splits the dataset into 0.6 train, 0.2 val and 0.2 test sets ONLY SINGLE LABEL
def train_val_test_split(X, y, seed):
    # This generator splits the OG dataset into train and test sets
    sss_train_test = StratifiedShuffleSplit(n_splits = 1, 
                                   test_size = 0.2, 
                                   train_size = 0.8, 
                                   random_state = seed)

    # This generator splits the newly created train-set into train and validate sets
    sss_train_val = StratifiedShuffleSplit(n_splits = 1, 
                                   test_size = 0.25, 
                                   train_size = 0.75, 
                                   random_state = seed)

    for train_index, test_index in sss_train_test.split(X,y):
        X_temp = X.iloc[train_index, :]
        y_temp = y.iloc[train_index, :]
        X_test = X.iloc[test_index, :]
        y_test = y.iloc[test_index, :]

    for train_index, test_index in sss_train_val.split(X_temp,y_temp):
        X_train = X_temp.iloc[train_index, :]
        y_train = y_temp.iloc[train_index, :]
        X_val = X_temp.iloc[test_index, :]
        y_val = y_temp.iloc[test_index, :]
    
    return X_train, y_train, X_val, y_val, X_test, y_test

# TEST ---------------------------------------------------------------------------------------
seed = 42
X_train, y_train, X_val, y_val, X_test, y_test = train_val_test_split(X, y, seed)

#print('####### Y TRAIN #######\n', y_train.describe())
#print('\n######## Y VAL ########\n', y_val.describe())
#print('\n####### Y TEST ########\n', y_test.describe())

## Scalable Bayesian Rule Lists
### [github repo](https://github.com/myaooo/pysbrl)

In [63]:
def categorical2transactions(x):
    # type: (np.ndarray) -> List
    """
    Convert a 2D int array into a transaction list:
        [
            ['x0=1', 'x1=0', ...],
            ...
        ]
    :param x:
    :return:
    """
    assert len(x.shape) == 2

    transactions = []
    for entry in x:
        transactions.append(['x%d=%d' % (i, val) for i, val in enumerate(entry)])

    return transactions

X_train = X_train.reset_index().drop('index', axis=1)
y_train = y_train.reset_index().drop('index', axis=1)

_X = np.array(X_train)
_y = np.array(y_train)

x = _X.astype(np.int, casting='safe')
y = y_train.astype(np.int, casting='safe')

labels = np.unique(y)
labels = np.arange(np.max(labels) + 1)

x_by_labels = []
label = 0
print(x[y == label])
x_by_labels.append(x[y == label])


# transactions_by_labels = [categorical2transactions(_x) for _x in x_by_labels]

'''X_train = X_train.reset_index().drop('index', axis=1)
y_train = y_train.reset_index().drop('index', axis=1)
print(X_train[:5])
print(y_train[:5])'''

IndexError: boolean index did not match indexed array along dimension 1; dimension is 33 but corresponding boolean dimension is 1

In [33]:
# Save dataframes to the format the SBRL library requires
pysbrl.utils.categorical2pysbrl_data(X_train,
                                    y_train,
                                    'X.out',
                                    'y.label',
                                    method='eclat',
                                    supp=0.05,
                                    zmin=1,
                                    zmax=3)

TypeError: %d format: a number is required, not str

In [None]:
# Use SBRL Library from https://github.com/myaooo/pysbrl
rule_ids, outputs, rule_strings = pysbrl.train_sbrl("data/ttt_train.out", 
                                                    "data/ttt_train.label", 
                                                    20.0, 
                                                    eta=2.0, 
                                                    max_iters=2000, 
                                                    nchain=10, 
                                                    alphas=[1,1])