[Notebook of task](https://github.com/DistributedSystemsGroup/Algorithmic-Machine-Learning/blob/master/Challenges/Anomaly_Detection/anomaly_detection_challenge.ipynb)

In [2]:
# Package for scalable bayesian rule lists
!pip3 install --user 'fim'
!pip3 install --user 'pysbrl'

Collecting fim
[?25l  Downloading https://files.pythonhosted.org/packages/cd/a8/66fbb303236eb7e4caa63096814aa2675073f20aee95104920636af84a7e/fim-6.27.tar.gz (343kB)
[K    100% |################################| 348kB 1.2MB/s 
[?25hBuilding wheels for collected packages: fim
  Running setup.py bdist_wheel for fim ... [?25ldone
[?25h  Stored in directory: /mnt/workspace/.cache/pip/wheels/5c/1c/94/b96c6b9a2eb858e26a675f86a908abfa53a593185b1c058823
Successfully built fim
Installing collected packages: fim
Successfully installed fim-6.27
[33mYou are using pip version 18.0, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 18.0, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
# Elementary
import os
import sys
import re
import random
import matplotlib
import implicit
import warnings
from tqdm import tqdm

# For elementary data manipulation
import pandas as pd
import numpy as np

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns

# For scalable bayesian rule lists
import pysbrl

# Import dataframe and cast names
from names import column_names, labels
basepath = "/mnt/datasets/anomaly/"
dataDF = pd.read_csv(basepath + 'data.csv', delimiter=";", header=None, names=column_names)
pure_dataDF = dataDF.drop(labels, axis=1)
anomaliesDF = dataDF.filter(labels, axis=1)

Nyttige artikler om stratified shuffle split
* [StratifiedShuffleSplit](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html)
* [Visualizing cross-validation behavior in scikit-learn](https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html#sphx-glr-auto-examples-model-selection-plot-cv-indices-py)
* [User guide: cross validation](https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation)

# Pre-processing

## Temporary error handling in dataDF

In [3]:
anomaliesDF_with_zerNA = anomaliesDF.fillna(0) # Fill NaNs with 0s, considering them as "not an anomaly"
anomaliesDF_with_negNA = anomaliesDF.fillna(-1) # Fill NaNs with -1 considering them as a separate class for the classifier.
pure_dataDF_with_negNA = pure_dataDF.fillna(-1)

X_t = pure_dataDF_with_negNA.drop('Date', axis=1)
print('Any nan values   :', X_t.isnull().any().any())
print('All values finite:', np.isfinite(np.array(X_t)).all())

Any nan values   : False
All values finite: True


In [4]:
# Removal
X_temp = X_t.drop(['CleanupOOMDumps', 'PreprocessorRestarts', 'DaemonRestarts'], axis=1)

# Direct recasting
direct_recast = ['Dumps', 'CompositeOOMDums', 'DeltaSize', 'MergeErrors', 'BlockingPhaseSec', 
                 'LargestTableSize', 'LargestPartitionSize', 'DiagnosisFiles', 'DiagnosisFilesSize', 
                 'LogSegmentChange']
for column in direct_recast:
    X_temp[column] = X_temp[column].astype(np.int64, errors='ignore')

#Format recasting
format_recast = ['CPU', 'PhysMEM', 'InstanceMEM', 'TablesAllocation', 'IndexServerAllocationLimit', 
                    'Disk']
for column in format_recast:
    X_temp[column] = 100*X_temp[column]
    X_temp[column] = X_temp[column].astype(np.int64, errors='ignore')

In [5]:
#print(X_temp.dtypes)

## Remove data corresponding to one NaN column

In [6]:
def create_binary_classification(puredataDF, anomaliesDF, label):
    y = anomaliesDF[label]
    indices_nan_labels = list(y.index[y.isnull()])
    
    X_mod = puredataDF.drop(indices_nan_labels, axis=0)
    y_mod = y.drop(indices_nan_labels)
    y_mod = y_mod.astype(np.int64, errors='raise')
    
    X_mod = X_mod.reset_index().drop('index', axis=1)
    y_mod = y_mod.reset_index().drop('index', axis=1)
    return X_mod, y_mod

X, y = create_binary_classification(X_temp, anomaliesDF, 'Check1')

print('X_temp.shape:', X_temp.shape, '\t', 'X.shape:', X.shape)

X_temp.shape: (287031, 32) 	 X.shape: (262520, 32)


In [7]:
y.dtypes

Check1    int64
dtype: object

## Stratified-shuffle-split function

This function will split the a given dataframe X, and corresponding label-series y (only one column), into train, validation and test sets such that the distribution of the different labels is retained in the different data sets.

In [8]:
from sklearn.model_selection import StratifiedShuffleSplit

# This function splits the dataset into 0.6 train, 0.2 val and 0.2 test sets ONLY SINGLE LABEL
def train_val_test_split(X, y, seed):
    # This generator splits the OG dataset into train and test sets
    sss_train_test = StratifiedShuffleSplit(n_splits = 1, 
                                   test_size = 0.2, 
                                   train_size = 0.8, 
                                   random_state = seed)

    # This generator splits the newly created train-set into train and validate sets
    sss_train_val = StratifiedShuffleSplit(n_splits = 1, 
                                   test_size = 0.25, 
                                   train_size = 0.75, 
                                   random_state = seed)

    for train_index, test_index in sss_train_test.split(X,y):
        X_temp = X.iloc[train_index, :]
        y_temp = y.iloc[train_index, :]
        X_test = X.iloc[test_index, :]
        y_test = y.iloc[test_index, :]

    for train_index, test_index in sss_train_val.split(X_temp,y_temp):
        X_train = X_temp.iloc[train_index, :]
        y_train = y_temp.iloc[train_index, :]
        X_val = X_temp.iloc[test_index, :]
        y_val = y_temp.iloc[test_index, :]
    
    return X_train, y_train, X_val, y_val, X_test, y_test, X_temp, y_temp

# TEST ---------------------------------------------------------------------------------------
seed = 42
X_train, y_train, X_val, y_val, X_test, y_test, X_train_big, y_train_big = train_val_test_split(X, y, seed)

#print('####### Y TRAIN #######\n', y_train.describe())
#print('\n######## Y VAL ########\n', y_val.describe())
#print('\n####### Y TEST ########\n', y_test.describe())

## Scalable Bayesian Rule Lists
### [github repo](https://github.com/myaooo/pysbrl)

In [9]:
class HiddenPrints:
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stdout = self._original_stdout

In [10]:
from time import time

# We block print becuase categorical2pysbrl_data prints out a lot of uninteresting data to stdout. 

# Save dataframes to the format the SBRL library requires
def format_dataframe(X, y, file_X, file_y):
    # Convert to numpy ndarray
    _X = X.values
    _y = y.values[:,0]
    
    name_X = file_X + '.out'
    name_y = file_y + '.label'
    
    with HiddenPrints():
        pysbrl.utils.categorical2pysbrl_data(_X,
                                            _y,
                                            name_X,
                                            name_y,
                                            method='eclat',
                                            supp=0.05,
                                            zmin=1,
                                            zmax=3)

t0 = time()
format_dataframe(X_train_big, y_train_big, 'X_train_big', 'y_train_big')   
t1 = time()
print('Converting dataframe took %.2f seconds' % (t1 - t0))
# print('We dont need to convert every time, only once.\nLast time converting dataframe took 61.00 seconds')

Converting dataframe took 57.93 seconds


In [11]:
t0 = time()

# Using SBRL Library from https://github.com/myaooo/pysbrl
rule_ids, outputs, rule_strings = pysbrl.train_sbrl("X_train_big.out", 
                                                    "y_train_big.label", 
                                                    20.0, 
                                                    eta=2.0, 
                                                    max_iters=2000) 
                                                    #nchain=10, 
                                                    #alphas=[1,1])

print('Training the SBRL based model took %.2f seconds' % (time() - t0))

Training the SBRL based model took 32.48 seconds


In [12]:
from IPython.display import display, Markdown

def translate_output_to_text(rule_ids, outputs, rule_strings, filename, column_names, label, verbose):
    rules = [rule_strings[i] for i in rule_ids]
    split_rules = [rule[1:-1].split(',') for rule in rules[:-1]] # LAST RULE IS ONLY 'default'
    
    rules_with_column_names = []
    for num_rule in split_rules:
        rule_with_column_names = []
        for sub_rule in num_rule:
            col_number = int(sub_rule.split('=')[0][1:])
            col_name = column_names[col_number]
            new_sub_rule = col_name + '=' + sub_rule.split('=')[1]
            rule_with_column_names.append(new_sub_rule)
        rules_with_column_names.append(rule_with_column_names)
    
    to_return = rules_with_column_names.copy()
    
    f = open(filename,'w+')
    f.write('| Rule | $P(%s = 0)$ | $P(%s = 1)$ |\n' % (label, label))
    f.write('|:-----|:----------------|:----------------|\n')
    
    separator = ' and '
    for i in range(len(outputs[:-1])):
        rule_to_write = separator.join(rules_with_column_names[i])
        output = outputs[i]
        string_to_write = '| ' + rule_to_write + ' | %.5f | %.5f |' % (output[0], output[1]) + '\n'
        f.write(string_to_write)
        #print(string_to_write[:-1])
    
    default_prob = outputs[-1]
    f.write('| Default | %.5f | %.5f |\n' % (default_prob[0], default_prob[1]))
    f.close()
    
    if verbose == True:
        with open(filename, 'r') as fh:
            content = fh.read()
        display(Markdown(content))
    
    return to_return
    
rules = translate_output_to_text(rule_ids=rule_ids, 
                         outputs=outputs, 
                         rule_strings=rule_strings, 
                         filename='created_rules.md', 
                         column_names=X_train_big.columns,
                         label=list(y_train_big.columns)[0],
                         verbose=True)


| Rule | $P(Check1 = 0)$ | $P(Check1 = 1)$ |
|:-----|:----------------|:----------------|
| DiagnosisFilesSize=-1 | 0.99761 | 0.00239 |
| CompositeOOMDums=0 and NameServerRestarts=0 and CPU=10000 | 0.02857 | 0.97143 |
| HighPriorityAlerts=0 and Dumps=0 | 0.99982 | 0.00018 |
| DaysWithSuccessfulLogBackups=8 and HighPriorityAlerts=1 and Dumps=0 | 0.99902 | 0.00098 |
| ColumnUnloads=0 and DaysWithSuccessfulLogBackups=8 and HighPriorityAlerts=2 | 0.99815 | 0.00185 |
| MinDailyNumberOfSuccessfulDataBackups=1 and CPU=10000 | 0.07143 | 0.92857 |
| BlockingPhaseSec=3 and StatisticsServerRestarts=0 | 0.99670 | 0.00330 |
| BlockingPhaseSec=2 | 0.99832 | 0.00168 |
| HighPriorityAlerts=2 and NameServerRestarts=0 and StatisticsServerRestarts=0 | 0.99456 | 0.00544 |
| HighPriorityAlerts=1 and NameServerRestarts=0 and XSEngineRestarts=0 | 0.99616 | 0.00384 |
| MinDailyNumberOfSuccessfulLogBackups=1 and Dumps=0 and NameServerRestarts=0 | 0.96511 | 0.03489 |
| SystemID=73 and DaysWithFailedfulLogBackups=0 and MaxDailyNumberOfFailedDataBackups=0 | 0.92206 | 0.07794 |
| HighPriorityAlerts=3 and IndexServerRestarts=0 and StatisticsServerRestarts=0 | 0.99603 | 0.00397 |
| ColumnUnloads=0 and DaysWithFailedfulLogBackups=0 and HighPriorityAlerts=4 | 0.99343 | 0.00657 |
| ColumnUnloads=0 and HighPriorityAlerts=6 | 0.99031 | 0.00969 |
| HighPriorityAlerts=4 and IndexServerRestarts=0 and XSEngineRestarts=0 | 0.99671 | 0.00329 |
| HighPriorityAlerts=5 and XSEngineRestarts=0 | 0.99084 | 0.00916 |
| DaysWithSuccessfulDataBackups=1 and DaysWithSuccessfulLogBackups=8 and StatisticsServerRestarts=0 | 0.91540 | 0.08460 |
| Default | 0.97631 | 0.02369 |


In [13]:
'''test_row = X_train_big.iloc[0,:]

exit = 0
#print(element)
for el in X_test.iterrows():
    print((el[1]['SystemID']))
    exit += 1
    if exit == 3:
        break; '''

"test_row = X_train_big.iloc[0,:]\n\nexit = 0\n#print(element)\nfor el in X_test.iterrows():\n    print((el[1]['SystemID']))\n    exit += 1\n    if exit == 3:\n        break; "

In [14]:
def check_rule(formatted_rule, row_series):
    pass_var = True
    for sub_rule in formatted_rule:
        if (row_series[sub_rule.split('=')[0]] != int(sub_rule.split('=')[1])):
            pass_var = False
            break
    return pass_var


test_rule = rules[0]
test_row = X_train_big.iloc[0,:]
i = check_rule(test_rule, test_row)


In [15]:
def predict_row(formatted_rule_list, outputs, row_series):
    prediction = 7
    for i in range(len(formatted_rule_list)):
        if(check_rule(formatted_rule_list[i], row_series) == True):
            prediction = int(np.round(outputs[i,1]))
            break
    if prediction == 7:
        prediction = int(np.round(outputs[-1,1]))
    return prediction

print(predict_row(rules,outputs,test_row))

0


In [16]:
def SBRL_predict(formatted_rule_list, outputs, X_test, y_test):
    y_pred = pd.Series(np.zeros(y_test.shape[0], dtype=int))
    for index, row_series in X_test.iterrows():
        y_pred.iloc[index] = predict_row(formatted_rule_list, outputs, row_series)
    return y_pred
 
y_pred = SBRL_predict(rules, outputs, X_test.reset_index(), y_test.reset_index())

In [17]:
y_pred_np = y_pred.values
print(y_pred_np[:3])
y_test_np = y_test.values[:,0]
print(y_test_np[:3])

[0 0 0]
[0 0 0]


In [18]:
from sklearn.metrics import confusion_matrix, f1_score

print(y_pred.shape)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
f1 = f1_score(y_test, y_test, average='macro')
print('F1 Score of model = %.2f' %(f1*100))
print('Accuracy in 1s = %.2f' %(tp/(tp + fn)*100))
print('Accuracy in 0s = %.5f' %(tn/(tn + fp)*100))

(52504,)
F1 Score of model = 100.00
Accuracy in 1s = 4.28
Accuracy in 0s = 99.99808


In [19]:
def full_pipeline_SBRL(X_train, y_train, X_test, y_test, verbose):
    # REFORMATTING input_data ----------------------------------------------------------------
    print('Reformatting')
    format_dataframe(X_train, y_train, 'file_X', 'file_y')
    
    # TRAINING SBRL MODEL ---------------------------------------------------------------
    print('Training SBRL model')
    t2 = time()

    # Using SBRL Library from https://github.com/myaooo/pysbrl
    rule_ids, outputs, rule_strings = pysbrl.train_sbrl('file_X.out', 
                                                        'file_y.label', 
                                                        20.0, 
                                                        eta=2.0, 
                                                        max_iters=2000) 
                                                        #nchain=10, 
                                                        #alphas=[1,1])
    
    t3 = time()
    print('Training the SBRL based model took %.2f seconds' % (t3 - t2))
    
    # PRINTING DATA IN NICE FORMAT -------------------------------------------------------
    formatted_rule_list = translate_output_to_text(rule_ids=rule_ids, 
                                                     outputs=outputs, 
                                                     rule_strings=rule_strings, 
                                                     filename='function_rule_file.md', 
                                                     column_names=X_train.columns,
                                                     label=list(y_train.columns)[0],
                                                     verbose=verbose)
    
    # TESTING MODEL ----------------------------------------------------------------------
    test_data = X_test.reset_index().drop('index', axis=1)
    test_labels = y_test.reset_index().drop('index', axis=1)
    print('Using model for predictions')
    y_pred = SBRL_predict(formatted_rule_list=formatted_rule_list, 
                          outputs=outputs, 
                          X_test=test_data, 
                          y_test=test_labels)
    
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    f1 = f1_score(y_test, y_test, average='macro')
    print('F1 Score of model = %.2f' %(f1*100))
    print('Accuracy in 1s = %.2f' %(tp/(tp + fn)*100))
    print('Accuracy in 0s = %.5f' %(tn/(tn + fp)*100))

full_pipeline_SBRL(X_train_big, y_train_big, X_test, y_test, verbose=False)

Reformatting
Training SBRL model
Training the SBRL based model took 33.25 seconds
Using model for predictions
F1 Score of model = 100.00
Accuracy in 1s = 4.28
Accuracy in 0s = 99.99808
