@author Nassir Mohammad

# Preliminaries

In [1]:
import os
import sys 
sys.path.append('../')
sys.path.append('../scripts')

import warnings
from perception_nassir import Perception

import dataframe_image as dfi

import numpy as np
import pandas as pd

from scripts.utilities import apply_classifiers
from scripts.utilities import get_file_data

from sklearn.preprocessing import StandardScaler
from scripts.rendering_functions import highlight_max, highlight_min

# Prefix prepended to the file names of the table images (see the
# `path_save = image_save_path + ...` assignments in the cells below).
image_save_path = ''
# NOTE(review): presumably meant to toggle image export, but it is not read
# anywhere in the visible cells (the dfi.export calls are commented out).
image_save_switch = False

# Paper 1 datasets 

> **Note on creditcard.csv**  
> The file `creditcard.csv` is not included in this repository under `data/ODDS_multivariate` due to its large size.  
> Download it from: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud and place it inside `data/ODDS_multivariate/`.  

In [2]:
# Build a summary table with one row per dataset:
# name, # examples, # features, % anomalies.

base_path = "../data/ODDS_multivariate/"
summary_columns = ['Name', '# examples', '# features', '% anomalies']

# One dict per dataset; the DataFrame is built once after the loop instead of
# re-concatenating a growing frame (starting from None) on every iteration.
summary_rows = []

# sorted() makes the processing order independent of the file system
for file_name in sorted(os.listdir(base_path)):

    dataset_name = file_name.split('.')[0]
    file_path = os.path.join(base_path, file_name)

    if dataset_name == "creditcard":
        # creditcard is read as a CSV: every column but the last is used as a
        # feature, the last column as the label vector.
        df_temp = pd.read_csv(file_path, low_memory=False)
        X_original = df_temp.iloc[:, :-1].values.astype(float)
        y = df_temp.iloc[:, -1].values.astype(float)
    else:
        dataset_name, X_original, y = get_file_data(base_path, file_name)

    # skip directory entries for which no dataset name was returned
    if dataset_name is None:
        continue

    # assumes y marks anomalies with 1 and normal points with 0, so that
    # y.sum() is the anomaly count -- TODO confirm against get_file_data
    summary_rows.append({
        'Name': dataset_name,
        '# examples': X_original.shape[0],
        '# features': X_original.shape[1],
        # '# anomalies': y.sum(),
        '% anomalies': round(y.sum() / X_original.shape[0] * 100, 2),
    })

# Explicit columns keep the frame well-formed (empty, but with the expected
# header) even when no dataset could be loaded.
data_properties_df = pd.DataFrame(summary_rows, columns=summary_columns)
            

In [5]:
# Render the dataset summary as a styled table, rows ordered by dataset name.
img_title = "Dataset properties"
path_save = image_save_path + "dataset_properties.png"

# alphabetical order with a fresh 0..n-1 index
data_properties_df = data_properties_df.sort_values(by=['Name'])
data_properties_df = data_properties_df.reset_index(drop=True)

# two-decimal percentages; hide() with no arguments hides the index column
percent_format = {'% anomalies': "{:.2f}"}
data_properties_df_styled = data_properties_df.style.format(percent_format)
data_properties_df_styled = data_properties_df_styled.hide()

# dfi.export(data_properties_df, path_save)

data_properties_df_styled

Name,# examples,# features,% anomalies
cardio,1831,21,9.61
creditcard,284807,30,0.17
http,567498,3,0.39
musk,3062,166,3.17
satimage-2,5803,36,1.22
shuttle,49097,9,7.15
smtp,95156,3,0.03
thyroid,3772,6,2.47
wbc,378,30,5.56


In [None]:
# Run every classifier on every dataset under base_path and collect the
# resulting metrics in a single frame, metrics_df.
#
# file names
# ########################
# file_name = "wbc.mat"
# file_name = "cardio.mat"
# file_name = "thyroid.mat"
# file_name = "musk.mat"
# file_name = "shuttle.mat"
# file_name = "satimage-2.mat"
# file_name = "http.matv7"
# file_name = "smtp.matv7"
# file_name = "creditcard.csv"

classifiers = [
    'HBOS',  # to be ignored, first run in loop slower
    'HBOS',
    'IForest',
    'KNN',
    'LOF',
    'MCD',
    'OCSVM',
    'Perception',
]

# one metrics frame per dataset; concatenated once after the loop
metrics_frames = []

# A single filter covers loading, scaling and all classifier runs (the
# original nested a second, redundant catch_warnings block inside this one).
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    # sorted() makes the processing order independent of the file system
    for file_name in sorted(os.listdir(base_path)):

        dataset_name = file_name.split('.')[0]
        file_path = os.path.join(base_path, file_name)

        if dataset_name == "creditcard":
            # creditcard is read as a CSV: every column but the last is used
            # as a feature, the last column as the label vector.
            df_temp = pd.read_csv(file_path, low_memory=False)
            X_original = df_temp.iloc[:, :-1].values.astype(float)
            y = df_temp.iloc[:, -1].values.astype(float)
        else:
            dataset_name, X_original, y = get_file_data(base_path, file_name)

        # skip directory entries for which no dataset name was returned
        if dataset_name is None:
            continue

        # scaling (very important to get right)
        # with_mean=False: each feature is scaled to unit standard deviation
        # but is NOT centred to zero mean.
        # NOTE(review): the previous comment claimed zero mean as well; the
        # code is left unchanged so results stay the same -- confirm which
        # of the two is intended.
        sc = StandardScaler(with_mean=False)
        X = sc.fit_transform(X_original)

        print('current file in progress ...: {}'.format(dataset_name))

        # the same scaled data is used for fitting and for prediction
        metrics_temp = apply_classifiers(classifiers, dataset_name,
                                         predict_data=X,
                                         predict_labels=y,
                                         train_data=X)
        metrics_frames.append(metrics_temp)

if not metrics_frames:
    raise RuntimeError(
        'no datasets could be loaded from {}'.format(base_path))

# ignore_index=True yields a clean 0..n-1 index; the original called
# reset_index(drop=True) but discarded its result, so the index was never
# actually reset.
metrics_df = pd.concat(metrics_frames, ignore_index=True)

current file in progress ...: cardio
current classifier in progress: HBOS
total run time: 2.453952300013043
current classifier in progress: HBOS
total run time: 0.0037052001571282744
current classifier in progress: IForest
total run time: 0.1695482999784872
current classifier in progress: KNN
total run time: 1.9343896999489516
current classifier in progress: LOF
total run time: 0.3189489000942558
current classifier in progress: MCD
total run time: 0.3408673999365419
current classifier in progress: OCSVM
total run time: 0.3343865000642836
current classifier in progress: Perception
total run time: 0.0007828000234439969


ValueError: could not convert string to float: 'Time'

In [None]:
# Precision table: one row per classifier, one column per dataset.
precision_long = metrics_df[['Dataset', 'Classifier', 'Precision']]
precision_wide = pd.pivot_table(
    precision_long,
    values='Precision',
    index=['Classifier'],
    columns='Dataset',
).reset_index()
precision_wide.columns.name = None

# every column except 'Classifier' is a per-dataset score column
sub = [col for col in precision_wide.columns.values.tolist()
       if col != 'Classifier']
formatdict = {col: "{:.3f}" for col in sub}

# hide the index, apply the project's highlight_max helper to the score
# columns and print scores with three decimals
df_precision = (
    precision_wide.style
    .hide()
    .apply(highlight_max, subset=sub)
    .format(formatdict)
)

# img_title = "Precision results"
# path_save = image_save_path + "dataset_precision.png"

# dfi.export(df_precision,path_save)

df_precision

In [None]:
# Recall table: one row per classifier, one column per dataset.
recall_long = metrics_df[['Dataset', 'Classifier', 'Recall']]
recall_wide = pd.pivot_table(
    recall_long,
    values='Recall',
    index=['Classifier'],
    columns='Dataset',
).reset_index()
recall_wide.columns.name = None

# every column except 'Classifier' is a per-dataset score column
sub = [col for col in recall_wide.columns.values.tolist()
       if col != 'Classifier']
formatdict = {col: "{:.2f}" for col in sub}

# hide the index, apply the project's highlight_max helper to the score
# columns and print scores with two decimals
df_recall = (
    recall_wide.style
    .hide()
    .apply(highlight_max, subset=sub)
    .format(formatdict)
)

img_title = "Recall results"
path_save = image_save_path + "dataset_recall.png"

# dfi.export(df_recall,path_save)

df_recall

In [None]:
# F1-score table: one row per classifier, one column per dataset.
f1_long = metrics_df[['Dataset', 'Classifier', 'F1']]
f1_wide = pd.pivot_table(
    f1_long,
    values='F1',
    index=['Classifier'],
    columns='Dataset',
).reset_index()
f1_wide.columns.name = None

# every column except 'Classifier' is a per-dataset score column
sub = [col for col in f1_wide.columns.values.tolist()
       if col != 'Classifier']
formatdict = {col: "{:.3f}" for col in sub}

# hide the index, apply the project's highlight_max helper to the score
# columns and print scores with three decimals
df_f1 = (
    f1_wide.style
    .hide()
    .apply(highlight_max, subset=sub)
    .format(formatdict)
)

img_title = "F1-score results"
path_save = image_save_path + "dataset_f1-score.png"

# dfi.export(df_f1,path_save)

df_f1

In [None]:
# Area-under-ROC-curve table: one row per classifier, one column per dataset.
auc_long = metrics_df[['Dataset', 'Classifier', 'AUC']]
auc_wide = pd.pivot_table(
    auc_long,
    values='AUC',
    index=['Classifier'],
    columns='Dataset',
).reset_index()
auc_wide.columns.name = None

# every column except 'Classifier' is a per-dataset score column
sub = [col for col in auc_wide.columns.values.tolist()
       if col != 'Classifier']
formatdict = {col: "{:.2f}" for col in sub}

# hide the index, apply the project's highlight_max helper to the score
# columns and print scores with two decimals
df1 = (
    auc_wide.style
    .hide()
    .apply(highlight_max, subset=sub)
    .format(formatdict)
)

# was "F1-score results", copied over from the F1 cell
img_title = "AUC results"
path_save = image_save_path + "dataset_auc.png"

# dfi.export(df1,path_save)

df1

In [None]:
# Total training + prediction time: one row per classifier, one column per
# dataset.
runtime_long = metrics_df[['Dataset', 'Classifier', 'Runtime']]

# The classifier list runs 'HBOS' twice; the first run is a warm-up that is
# meant to be ignored. pivot_table aggregates duplicates with the mean, so
# without this step the slow warm-up would be averaged into the HBOS runtime.
# keep='last' retains the later of the duplicated rows.
# assumes rows appear in the order the classifiers were run -- TODO confirm
# against apply_classifiers
runtime_long = runtime_long.drop_duplicates(
    subset=['Dataset', 'Classifier'], keep='last')

runtime_wide = pd.pivot_table(
    runtime_long,
    values='Runtime',
    index=['Classifier'],
    columns='Dataset',
).reset_index()
runtime_wide.columns.name = None

# every column except 'Classifier' is a per-dataset runtime column
sub = [col for col in runtime_wide.columns.values.tolist()
       if col != 'Classifier']
formatdict = {col: "{:.4f}" for col in sub}

# hide the index, apply the project's highlight_min helper to the runtime
# columns and print runtimes with four decimals
df1 = (
    runtime_wide.style
    .hide()
    .apply(highlight_min, subset=sub)
    .format(formatdict)
)

path_save = image_save_path + "dataset_total_time.png"

# dfi.export(df1,path_save)

df1