[Notebook of task](https://github.com/DistributedSystemsGroup/Algorithmic-Machine-Learning/blob/master/Challenges/Anomaly_Detection/anomaly_detection_challenge.ipynb)

In [2]:
# Package for scalable bayesian rule lists
!pip3 install --user 'fim'
!pip3 install --user 'pysbrl'

Collecting fim
[?25l  Downloading https://files.pythonhosted.org/packages/cd/a8/66fbb303236eb7e4caa63096814aa2675073f20aee95104920636af84a7e/fim-6.27.tar.gz (343kB)
[K    100% |################################| 348kB 1.2MB/s 
[?25hBuilding wheels for collected packages: fim
  Running setup.py bdist_wheel for fim ... [?25ldone
[?25h  Stored in directory: /mnt/workspace/.cache/pip/wheels/5c/1c/94/b96c6b9a2eb858e26a675f86a908abfa53a593185b1c058823
Successfully built fim
Installing collected packages: fim
Successfully installed fim-6.27
[33mYou are using pip version 18.0, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 18.0, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [1]:
# Elementary
import os
import sys
import re
import random
import matplotlib
import implicit
import warnings
from tqdm import tqdm

# For elementary data manipulation
import pandas as pd
import numpy as np

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns

# For scalable bayesian rule lists
import pysbrl

# Import dataframe and cast names, datatypes and NaNs
from names import column_names, labels
basepath = "/mnt/datasets/anomaly/"
dataDF = pd.read_csv(basepath + 'data.csv', delimiter=";", header=None, names=column_names)
pure_dataDF = dataDF.drop(labels, axis=1)
anomaliesDF = dataDF.filter(labels, axis=1) 

anomaliesDF_with_zerNA = anomaliesDF.fillna(0) # Fill NaNs with 0s, considering them as "not an anomaly"
anomaliesDF_with_negNA = anomaliesDF.fillna(-1) # Fill NaNs with -1 considering them as a separate class for the classifier.
pure_dataDF_with_negNA = pure_dataDF.fillna(-1)

In [2]:
check1DF = anomaliesDF['Check1']
indices_of_zero_elements = list(check1DF[check1DF == 0].index)
indices_of_nan_elements = list(check1DF[check1DF.isna()].index)
print(len(indices_of_nan_elements))

24511


Nyttige artikler om stratified shuffle split
* [StratifiedShuffleSplit](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html)
* [Visualizing cross-validation behavior in scikit-learn](https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html#sphx-glr-auto-examples-model-selection-plot-cv-indices-py)
* [User guide: cross validation](https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation)

# Pre-processing

## Stratified-shuffle-split function

This function will split the a given dataframe X, and corresponding label-series y (only one columns), into train, validation and test sets such that the distribution of the different labels is retained in the different data sets.

In [22]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score

X = pure_dataDF_with_negNA
y = anomaliesDF_with_negNA['Check1']
seed = 42

# This function splits the dataset into 0.6 train, 0.2 val and 0.2 test sets
def train_val_test_split(X, y, seed):
    # This generator splits the OG dataset into train and test sets
    sss_train_test = StratifiedShuffleSplit(n_splits = 1, 
                                   test_size = 0.2, 
                                   train_size = 0.8, 
                                   random_state = seed)

    # This generator splits the newly created train-set into train and validate sets
    sss_train_val = StratifiedShuffleSplit(n_splits = 1, 
                                   test_size = 0.25, 
                                   train_size = 0.75, 
                                   random_state = seed)

    for train_index, test_index in sss_train_test.split(X,y):
        X_temp = X.loc[train_index, :]
        y_temp = y[train_index]
        X_test = X.loc[test_index, :]
        y_test = y[test_index]

    for train_index, test_index in sss_train_val.split(X_temp,y_temp):
        X_train = X_temp.loc[train_index, :]
        y_train = y_temp[train_index]
        X_val = X_temp.loc[test_index, :]
        y_val = y_temp[test_index]
    
    return X_train, y_train, X_val, y_val, X_test, y_test

X_train, y_train, X_val, y_val, X_test, y_test = train_val_test_split(X, y, seed)

print('####### Y TRAIN #######\n', y_train.describe())
print('\n######## Y VAL ########\n', y_val.describe())
print('\n####### Y TEST ########\n', y_test.describe())

####### Y TRAIN #######
 count    137918.000000
mean         -0.079127
std           0.290658
min          -1.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: Check1, dtype: float64

######## Y VAL ########
 count    45721.000000
mean        -0.081779
std          0.294571
min         -1.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: Check1, dtype: float64

####### Y TEST ########
 count    57407.000000
mean        -0.079694
std          0.291096
min         -1.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: Check1, dtype: float64


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


## Self-made train-val-test-split-function

In [3]:
def train_val_test_split(train_size=0.6, val_size=0.2, with_nan=False, check, label=anomaliesDF, pure_data):
    test_size = 1 - train_size - test_size
    if check == 'all':
        # perform split for all categories
        pass
    else:
        check_series = label_dataframe[check]
        indices_of_zero_elements = list(check_series[check_series == 0].index)
        indices_of_one_elements = list(check_series[check_series == 1].index)
        
        if with_nan == True:
            indices_of_nan_elements = list(check_series[check_series.isna()].index)

SyntaxError: non-default argument follows default argument (<ipython-input-3-1cc464ab153d>, line 1)

In [None]:
# Save dataframes to the format the SBRL library requires

In [None]:
# Use SBRL Library from https://github.com/myaooo/pysbrl
rule_ids, outputs, rule_strings = pysbrl.train_sbrl("data/ttt_train.out", 
                                                    "data/ttt_train.label", 
                                                    20.0, 
                                                    eta=2.0, 
                                                    max_iters=2000, 
                                                    nchain=10, 
                                                    alphas=[1,1])