# Benchmarking anomaly detection methods 

## Libraries import

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.svm import OneClassSVM
from sklearn.metrics import confusion_matrix, classification_report, auc, precision_recall_curve
from sklearn import metrics
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import DBSCAN

from numpy import random, where
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial import distance
import array
from statsmodels.stats.outliers_influence import variance_inflation_factor
import keras
from tensorflow.keras.layers import Dense, Dropout
from keras.models import Model, Sequential
from keras import initializers
from keras.layers import Input, Dense
import tensorflow as tf

import time
from scipy.stats import wilcoxon, friedmanchisquare, rankdata
import baycomp 
from baycomp import SignedRankTest

## DBSCAN hyperparameter tuner
Based on Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (1998). Density-based clustering in spatial databases: The algorithm GDBSCAN and its applications. Data Mining and Knowledge Discovery, 2(2), 169-194.

In [None]:
def dbscan_tuner(data,dim):
    """Plot and return the sorted k-distance curve used to choose DBSCAN's ``eps``.

    For every observation the pairwise Euclidean distances to all observations
    are sorted in ascending order and the entry at position ``dim`` is kept
    (position 0 is the observation's own zero distance).  Observations are
    then ordered by that distance and plotted so the elbow of the curve can be
    read off.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table.  Columns named ``y``, ``Outlier``, ``Class``, ``Unusual``
        or ``class`` are excluded from the distance computation.
    dim : int
        Position in each observation's ascending distance list.

    Returns
    -------
    pandas.DataFrame
        Copy of the remaining columns sorted by the added ``dist`` column,
        with an ``i`` column holding the 1-based position on the x axis.

    Raises
    ------
    IndexError
        If ``dim`` is out of range for the number of observations.
    """
    label_columns = ['y', 'Outlier', 'Class', 'Unusual', 'class']
    # Explicit copy: the helper columns added below must never be written
    # through to the caller's frame (and must not raise SettingWithCopyWarning).
    features = data.loc[:, ~data.columns.isin(label_columns)].copy()

    points = features.to_numpy()
    pairwise = distance.cdist(points, points)
    # Sort all rows in one NumPy call instead of sorted() per observation.
    if len(pairwise):
        k_dist = np.sort(pairwise, axis=1)[:, dim]
    else:
        k_dist = np.empty(0)

    features['dist'] = k_dist
    features = features.sort_values(by='dist')
    # 1-based rank of every observation after sorting; used as the x axis.
    features['i'] = array.array('i', range(1, len(data) + 1))

    plt.plot(features['i'], features['dist'])
    plt.xlabel("Observation index")
    plt.ylabel("k-distance")

    return features

## Prediction translation

Defines a function that translates a detector's prediction into a binary anomaly label: a prediction of -1 (anomaly) becomes 1, and any other prediction becomes 0.

In [None]:
def def_outlier(df):
    """Return 1 when the row's ``y_pred`` equals -1 (anomaly), otherwise 0.

    ``df`` is any mapping-like row exposing a ``'y_pred'`` entry, e.g. a row
    passed by ``DataFrame.apply(..., axis=1)``.
    """
    return 1 if df['y_pred'] == -1 else 0

## CADE threshold

Defines a function that labels an observation as an anomaly (1) when its `Target` score exceeds a given threshold and as normal (0) otherwise; it is used in the classification step of the CADE algorithm.

In [None]:
def def_outlier_cade(df, threshold):
    """Return 1 when the row's ``Target`` is strictly above ``threshold``, else 0."""
    return 1 if df['Target'] > threshold else 0

## Arrhythmia

**Dataset source**: http://odds.cs.stonybrook.edu/arrhythmia-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016).  ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources**:

Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation forest.” 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

K. M. Ting, J. T. S. Chuan, and F. T. Liu. “Mass: A New Ranking Measure for Anomaly Detection.” IEEE Transactions on Knowledge and Data Engineering, 2009.

F. Keller, E. Muller, K. Bohm. “HiCS: High-contrast subspaces for density-based outlier ranking.” ICDE, 2012.

In [None]:
data = pd.read_csv('./arrhythmia.csv', sep = ',')

In [None]:
# Drop the columns that consist only of 0's.
all_zero_columns = [
    'Col15', 'Col63', 'Col65', 'Col79', 'Col127', 'Col128',
    'Col135', 'Col137', 'Col139', 'Col141', 'Col147', 'Col152',
    'Col153', 'Col160', 'Col200', 'Col260', 'Col270',
]
data = data.drop(columns=all_zero_columns)

In [None]:
data.head()

In [None]:
data.shape

In [None]:
# Count of non-null Col1 values for each value of the label y.
pd.pivot_table(data, index='y', values='Col1', aggfunc='count')

### iForest

In [None]:
train_data = data.copy()

In [None]:
# Fit an Isolation Forest on every column except the label 'y' and time the
# complete fit / predict / score sequence.
start = time.process_time()  # CPU time of this process, not wall-clock time
rng = np.random.RandomState(42)
clf = IsolationForest(max_samples = 256, random_state=rng, n_estimators = 100)
clf.fit(train_data.loc[:, train_data.columns != 'y'])
# predict() labels anomalies as -1 and inliers as 1.
y_pred = clf.predict(train_data.loc[:, train_data.columns != 'y'])
# score_samples() is lower for more abnormal observations.
y_scores = clf.score_samples(train_data.loc[:, train_data.columns != 'y'])
end = time.process_time()
iforest_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
# Vectorised form of def_outlier: the detector's -1 (anomaly) becomes 1 and
# every other value becomes 0, without building one Series per row.
train_data['prediction'] = (train_data['y_pred'] == -1).astype(int)
# score_samples() is lower for more abnormal points; negate it so that a
# higher y_scores value means "more anomalous" for the ROC / PR curves.
train_data['y_scores'] = -y_scores

In [None]:
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
iforest_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
iforest_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
iforest_auc_precision_recall = metrics.auc(recall, precision)
print(iforest_auc_precision_recall)

### LOF

In [None]:
# Rescale every column of `data` (the label column 'y' is passed through the
# scaler as well) to [0, 1] and keep the original column names.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = pd.DataFrame(min_max_scaler.fit_transform(data.copy()), columns=data.columns)

In [None]:
# Fit Local Outlier Factor on every column except 'y' and time it.
start = time.process_time()  # CPU time of this process, not wall-clock time
lof = LocalOutlierFactor(n_neighbors=5, contamination=.1)
# fit_predict() labels anomalies as -1 and inliers as 1.
y_pred = lof.fit_predict(train_data.loc[:, train_data.columns != 'y'])
# Negated LOF of the training samples; it is negated again before the ROC / PR curves.
y_scores = lof.negative_outlier_factor_
end = time.process_time()
lof_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
# Vectorised form of def_outlier: the detector's -1 (anomaly) becomes 1 and
# every other value becomes 0, without building one Series per row.
train_data['prediction'] = (train_data['y_pred'] == -1).astype(int)
# negative_outlier_factor_ is the negated LOF; flip the sign so that a higher
# y_scores value means "more anomalous" for the ROC / PR curves.
train_data['y_scores'] = -y_scores

In [None]:
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
lof_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
lof_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
lof_auc_precision_recall = metrics.auc(recall, precision)
print(lof_auc_precision_recall)

### DBSCAN

In [None]:
# Rescale every column of `data` (the label column 'y' is passed through the
# scaler as well) to [0, 1] and keep the original column names.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = pd.DataFrame(min_max_scaler.fit_transform(data.copy()), columns=data.columns)

In [None]:
dist = dbscan_tuner(train_data,25)

In [None]:
# Cluster every column except 'y' with DBSCAN and time it.
start = time.process_time()  # CPU time of this process, not wall-clock time
dbscan = DBSCAN(eps = 2.5, min_samples = 40)
# fit_predict() returns one cluster label per row; noise points get -1.
clusters = dbscan.fit_predict(train_data.loc[:, train_data.columns != 'y'])
end = time.process_time()
dbscan_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = clusters
# DBSCAN marks noise points with the label -1; treat exactly those as
# anomalies (1) and every clustered point as normal (0).  Vectorised form of
# def_outlier, avoiding a Python call per row.
train_data['prediction'] = (train_data['y_pred'] == -1).astype(int)

In [None]:
original_columns = data.columns.to_list()
original_columns.remove('y')

# Score every clustered point by its Euclidean distance to the centroid of
# its own cluster.
scored_clusters = []
for i in set(clusters):
    if i == -1:
        # Noise points have no cluster of their own; they are scored separately.
        continue
    score_data = train_data[train_data['y_pred'] == i].copy()
    center = score_data[original_columns].mean().to_numpy()
    score_data['score'] = np.linalg.norm(
        score_data[original_columns].to_numpy() - center, axis=1)
    scored_clusters.append(score_data)

# DataFrame.append was removed in pandas 2.0; collect the per-cluster frames
# and concatenate once.  This also avoids seeding the result with an empty,
# object-typed frame.
if scored_clusters:
    data_w_score = pd.concat(scored_clusters, ignore_index = True)
else:
    data_w_score = pd.DataFrame(columns = train_data.columns.to_list() + ['score'])

anomalies_data = train_data[train_data['y_pred'] == -1].copy()
# Float placeholder; the real scores are filled in by the next cell.
anomalies_data['score'] = np.nan
# Mean of every column per DBSCAN label (one row per cluster, incl. -1).
cluster_centers = pd.pivot_table(train_data,
              index = 'y_pred',
              aggfunc = 'mean')

In [None]:
# Noise points belong to no cluster, so each one is scored by its Euclidean
# distance to the closest cluster centroid.  A single column assignment
# replaces the chained `anomalies_data['score'].iloc[i] = ...`, which raises
# SettingWithCopyWarning and does not write at all under pandas Copy-on-Write.
real_centers = cluster_centers.loc[cluster_centers.index != -1, original_columns]
if len(anomalies_data) and len(real_centers):
    anomalies_data['score'] = distance.cdist(
        anomalies_data[original_columns].to_numpy(dtype=float),
        real_centers.to_numpy(dtype=float),
    ).min(axis=1)

data_for_auprc = pd.concat([data_w_score, anomalies_data])

In [None]:
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(data_for_auprc['y'], data_for_auprc['score'])
dbscan_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
dbscan_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(data_for_auprc['y'], data_for_auprc['score'])
dbscan_auc_precision_recall = metrics.auc(recall, precision)
print(dbscan_auc_precision_recall)

### CADE

Source: Aric LaBarr, webinar "Modern Approaches To Anomaly Detection", November 2, 2021

In [None]:
train_data = data.copy()

In [None]:
# Real observations, labelled Target = 0.  Copy the slice so that adding the
# label column never writes into (or warns about) train_data.
data_for_fake = train_data.loc[:, train_data.columns != 'y'].copy()
feature_columns = data_for_fake.columns.to_list()
data_for_fake['Target'] = 0

# Artificial observations, labelled Target = 1: every feature is drawn
# independently and uniformly between its observed minimum and maximum.
# Building the frame from a dict in one step avoids the fragmentation caused
# by inserting hundreds of columns one at a time, and no draw is wasted on
# the constant 'Target' column.
fake = pd.DataFrame({
    col: np.random.uniform(np.min(data_for_fake[col]), np.max(data_for_fake[col]),
                           size = len(data_for_fake))
    for col in feature_columns
})
fake['Target'] = 1

In [None]:
data_combined = pd.concat([data_for_fake, fake], axis = 0)

In [None]:
# CADE classification step: train a random forest to separate the real rows
# (Target = 0) from the artificial ones (Target = 1), then score the real data.
start = time.process_time()  # CPU time of this process, not wall-clock time
model = RandomForestClassifier(n_estimators = 100)
model.fit(data_combined.loc[:, data_combined.columns != 'Target'],data_combined.loc[:, data_combined.columns == 'Target']['Target'] )
# Column 1 of predict_proba is the probability of the class Target = 1;
# it is stored in 'Target' and used as the anomaly score.
train_data['Target'] = model.predict_proba(train_data.loc[:, train_data.columns != 'y'])[:,1]
end = time.process_time()
cade_time = end - start
print(end - start)

In [None]:
# ROC AUC of the hard 0/1 labels obtained at each candidate threshold.
# The dict is deliberately not called `auc`: that name is imported from
# sklearn.metrics at the top of the file and would be shadowed.
# NOTE(review): the threshold is selected using the true labels 'y', so the
# reported CADE classification metrics are optimistic.
auc_by_threshold = {}
for i in np.arange(0,0.55,0.05):
    # Vectorised form of def_outlier_cade: 1 when Target > threshold, else 0.
    train_data['prediction'] = (train_data['Target'] > i).astype(int)
    fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['prediction'])
    auc_by_threshold[i] = metrics.auc(fpr, tpr)
# Threshold with the highest AUC (the first one wins on ties).
max_key = max(auc_by_threshold, key=auc_by_threshold.get)

In [None]:
train_data['prediction'] = train_data.apply(def_outlier_cade,args = (max_key,), axis = 1)

In [None]:
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['Target'])
cade_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
cade_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['Target'])
cade_auc_precision_recall = metrics.auc(recall, precision)
print(cade_auc_precision_recall)

### Deep Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
class AutoEncoder(Model):
    """Deep autoencoder: 128-64-32 encoder mirrored by a 64-128-output decoder.

    Dropout (rate 0.1) is placed between the dense layers.  The final layer
    has ``output_units`` sigmoid units, i.e. outputs in (0, 1); in this
    notebook the inputs are min-max scaled to [0, 1] before training.
    """

    def __init__(self, output_units):
        """Build encoder and decoder; ``output_units`` is the number of input features."""
        super().__init__()
        # Encoder: compresses the input down to a 32-unit code.
        self.encoder = Sequential([
          Dense(128,input_dim = output_units, activation='sigmoid'),
          Dropout(0.1),
          Dense(64, activation='relu'),
          Dropout(0.1),
          Dense(32, activation='relu')
        ])
        # Decoder: expands the code back to the original number of features.
        self.decoder = Sequential([
          Dense(64, activation='relu'),
          Dropout(0.1),
          Dense(128,  activation='relu'),
          Dropout(0.1),
          Dense(output_units, activation='sigmoid')
        ])
  
    def call(self, inputs):
        """Return the reconstruction of ``inputs`` (encoder followed by decoder)."""
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        return decoded

In [None]:
def find_threshold(model, x_train_scaled):
    """Return the anomaly threshold: mean plus one standard deviation of the
    MSLE reconstruction errors of ``model`` on ``x_train_scaled``."""
    reconstructed = model.predict(x_train_scaled)
    error_values = tf.keras.losses.msle(reconstructed, x_train_scaled).numpy()
    return np.mean(error_values) + np.std(error_values)

def get_predictions(model, x_train_scaled, threshold):
    """Return ``(preds, errors)`` for ``x_train_scaled``.

    ``errors`` holds the MSLE reconstruction errors and ``preds`` is a float
    Series that is 1.0 where the error exceeds ``threshold`` and 0.0 elsewhere.
    """
    reconstructed = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(reconstructed, x_train_scaled)
    is_anomaly = pd.Series(errors) > threshold
    return is_anomaly.astype(float), errors

In [None]:
# Train the autoencoder to reconstruct its own (scaled) input, then derive
# the threshold, predictions and scores; the whole sequence is timed.
start = time.process_time()  # CPU time of this process, not wall-clock time

model = AutoEncoder(output_units=x_train_scaled.shape[1])

model.compile(loss='msle', metrics=['mse'], optimizer='adam')

# Input and target are the same array: the network learns to reproduce its input.
history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=64,
)

threshold = find_threshold(model, x_train_scaled)

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
deep_autoencoders_time = end - start
print(end - start)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
deep_autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
deep_autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, scores)
deep_ae_auc_precision_recall = metrics.auc(recall, precision)
print(deep_ae_auc_precision_recall)

### Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
class AutoEncoder(Model):
    """Shallow autoencoder: one 128-unit ReLU layer followed by a sigmoid output layer.

    Dropout (rate 0.1) follows the hidden layer.  The output layer has
    ``output_units`` sigmoid units, i.e. outputs in (0, 1); in this notebook
    the inputs are min-max scaled to [0, 1] before training.
    """

    def __init__(self, output_units):
        """Build encoder and decoder; ``output_units`` is the number of input features."""
        super().__init__()
        # Encoder: a single hidden layer.
        self.encoder = Sequential([
          Dense(128,input_dim = output_units, activation='relu'),
          Dropout(0.1),
        ])
        # Decoder: maps the hidden representation back to the input width.
        self.decoder = Sequential([
          Dense(output_units, activation='sigmoid')
        ])
  
    def call(self, inputs):
        """Return the reconstruction of ``inputs`` (encoder followed by decoder)."""
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        return decoded

In [None]:
def find_threshold(model, x_train_scaled):
    """Return the anomaly threshold: mean plus one standard deviation of the
    MSLE reconstruction errors of ``model`` on ``x_train_scaled``."""
    reconstructed = model.predict(x_train_scaled)
    error_values = tf.keras.losses.msle(reconstructed, x_train_scaled).numpy()
    return np.mean(error_values) + np.std(error_values)

def get_predictions(model, x_train_scaled, threshold):
    """Return ``(preds, errors)`` for ``x_train_scaled``.

    ``errors`` holds the MSLE reconstruction errors and ``preds`` is a float
    Series that is 1.0 where the error exceeds ``threshold`` and 0.0 elsewhere.
    """
    reconstructed = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(reconstructed, x_train_scaled)
    is_anomaly = pd.Series(errors) > threshold
    return is_anomaly.astype(float), errors

In [None]:
# Train the autoencoder to reconstruct its own (scaled) input, then derive
# the threshold, predictions and scores; the whole sequence is timed.
start = time.process_time()  # CPU time of this process, not wall-clock time

model = AutoEncoder(output_units=x_train_scaled.shape[1])

model.compile(loss='msle', metrics=['mse'], optimizer='adam')

# Input and target are the same array: the network learns to reproduce its input.
history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=64,
)

threshold = find_threshold(model, x_train_scaled)

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
autoencoders_time = end - start
print(end - start)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, scores)
ae_auc_precision_recall = metrics.auc(recall, precision)
print(ae_auc_precision_recall)

### One-Class SVM rbf

In [None]:
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
# Fit a One-Class SVM (default kernel, gamma='scale') on the scaled features
# and time the fit / predict / score sequence.
start = time.process_time()  # CPU time of this process, not wall-clock time

model = OneClassSVM(gamma='scale', nu=0.1)
model.fit(x_train_scaled)
# predict() labels anomalies as -1 and inliers as 1.
yhat = model.predict(x_train_scaled)
# decision_function() is negative for outliers; negate it so that a higher
# y_scores value means "more anomalous".
y_scores = -model.decision_function(x_train_scaled)

end = time.process_time()
rbf_oc_svm_time = end - start
print(end - start)

In [None]:
# Vectorised form of def_outlier: the SVM's -1 (anomaly) becomes 1 and every
# other value 0, without wrapping yhat in a DataFrame and calling a Python
# function per row.
predictions = pd.Series(np.where(np.asarray(yhat) == -1, 1, 0))

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, y_scores)
rbf_oc_svm_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
rbf_oc_svm_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, y_scores)
ocsvm_rbf_auc_precision_recall = metrics.auc(recall, precision)
print(ocsvm_rbf_auc_precision_recall)

### One-Class SVM linear kernel

In [None]:
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
# Fit a One-Class SVM with a linear kernel on the scaled features and time
# the fit / predict / score sequence.
start = time.process_time()  # CPU time of this process, not wall-clock time

model = OneClassSVM(kernel = 'linear', nu=0.1)
model.fit(x_train_scaled)
# predict() labels anomalies as -1 and inliers as 1.
yhat = model.predict(x_train_scaled)
# decision_function() is negative for outliers; negate it so that a higher
# y_scores value means "more anomalous".
y_scores = -model.decision_function(x_train_scaled)

end = time.process_time()
lin_oc_svm_time = end - start
print(end - start)

In [None]:
# Vectorised form of def_outlier: the SVM's -1 (anomaly) becomes 1 and every
# other value 0, without wrapping yhat in a DataFrame and calling a Python
# function per row.
predictions = pd.Series(np.where(np.asarray(yhat) == -1, 1, 0))

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, y_scores)
lin_oc_svm_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
lin_oc_svm_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, y_scores)
ocsvm_lin_auc_precision_recall = metrics.auc(recall, precision)
print(ocsvm_lin_auc_precision_recall)

### Performance

In [None]:
arrhythmia_performance = pd.DataFrame(columns = ['method','f1-score','sensitivity',
                                                 'precision','Time','AUC', 'AU precision-recall curve'])

In [None]:
def _performance_row(method, report, elapsed, roc_auc, pr_auc):
    """Collect one method's metrics for the anomaly class ('1') into a results-table row."""
    anomaly_class = report['1']
    return {'method': method,
            'f1-score': anomaly_class['f1-score'],
            'sensitivity': anomaly_class['recall'],
            'precision': anomaly_class['precision'],
            'Time': elapsed,
            'AUC': roc_auc,
            'AU precision-recall curve': pr_auc}


# One row per method; the helper replaces eight copy-pasted dict literals so
# that a key can no longer be paired with another method's variable by mistake.
iforest_performance = _performance_row(
    'iForest', iforest_report, iforest_time,
    iforest_auc, iforest_auc_precision_recall)
lof_performance = _performance_row(
    'LOF', lof_report, lof_time,
    lof_auc, lof_auc_precision_recall)
dbscan_performance = _performance_row(
    'DBSCAN', dbscan_report, dbscan_time,
    dbscan_auc, dbscan_auc_precision_recall)
cade_performance = _performance_row(
    'CADE', cade_report, cade_time,
    cade_auc, cade_auc_precision_recall)
deep_autoencoders_performance = _performance_row(
    'Deep Autoencoders', deep_autoencoders_report, deep_autoencoders_time,
    deep_autoencoders_auc, deep_ae_auc_precision_recall)
autoencoders_performance = _performance_row(
    'Autoencoders', autoencoders_report, autoencoders_time,
    autoencoders_auc, ae_auc_precision_recall)
rbf_oc_svm_performance = _performance_row(
    'OC-SVM rbf', rbf_oc_svm_report, rbf_oc_svm_time,
    rbf_oc_svm_auc, ocsvm_rbf_auc_precision_recall)
lin_oc_svm_performance = _performance_row(
    'OC-SVM linear', lin_oc_svm_report, lin_oc_svm_time,
    lin_oc_svm_auc, ocsvm_lin_auc_precision_recall)

In [None]:
# DataFrame.append was removed in pandas 2.0.  Build the table in one step
# from the per-method rows, keeping the column order declared above; this
# also makes the cell safe to re-run (rows are no longer appended twice).
arrhythmia_performance = pd.DataFrame(
    [
        iforest_performance,
        lof_performance,
        dbscan_performance,
        cade_performance,
        deep_autoencoders_performance,
        autoencoders_performance,
        rbf_oc_svm_performance,
        lin_oc_svm_performance,
    ],
    columns = arrhythmia_performance.columns,
)

## Cardiotocography

**Dataset source**: http://odds.cs.stonybrook.edu/cardiotocogrpahy-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016).  ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources**:

C. C. Aggarwal and S. Sathe, “Theoretical foundations and algorithms for outlier ensembles.” ACM SIGKDD Explorations Newsletter, vol. 17, no. 1, pp. 24–47, 2015.

Saket Sathe and Charu C. Aggarwal. LODES: Local Density meets Spectral Outlier Detection. SIAM Conference on Data Mining, 2016.

In [None]:
data = pd.read_csv('./Cardiotocography.csv')

In [None]:
data.shape

In [None]:
data['y'] = data['y'].astype(int)

In [None]:
# Count of non-null Col1 values for each value of the label y.
pd.pivot_table(data, index='y', values='Col1', aggfunc='count')

In [None]:
data.head()

### iForest

In [None]:
train_data = data.copy()

In [None]:
# Fit an Isolation Forest on every column except the label 'y' and time the
# complete fit / predict / score sequence.
start = time.process_time()  # CPU time of this process, not wall-clock time
rng = np.random.RandomState(42)
clf = IsolationForest(max_samples=256, random_state=rng, n_estimators=100)
clf.fit(train_data.loc[:, train_data.columns != 'y'])
# predict() labels anomalies as -1 and inliers as 1.
y_pred = clf.predict(train_data.loc[:, train_data.columns != 'y'])
# score_samples() is lower for more abnormal observations.
y_scores = clf.score_samples(train_data.loc[:, train_data.columns != 'y'])
end = time.process_time()
iforest_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
# Vectorised form of def_outlier: the detector's -1 (anomaly) becomes 1 and
# every other value becomes 0, without building one Series per row.
train_data['prediction'] = (train_data['y_pred'] == -1).astype(int)
# score_samples() is lower for more abnormal points; negate it so that a
# higher y_scores value means "more anomalous" for the ROC / PR curves.
train_data['y_scores'] = -y_scores

In [None]:
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
iforest_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
iforest_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
iforest_auc_precision_recall = metrics.auc(recall, precision)
print(iforest_auc_precision_recall)

### LOF

In [None]:
# Rescale every column of `data` (the label column 'y' is passed through the
# scaler as well) to [0, 1] and keep the original column names.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = pd.DataFrame(min_max_scaler.fit_transform(data.copy()), columns=data.columns)

In [None]:
# Fit Local Outlier Factor on every column except 'y' and time it.
start = time.process_time()  # CPU time of this process, not wall-clock time
lof = LocalOutlierFactor(n_neighbors=18, contamination=.1, novelty=False)
# fit_predict() labels anomalies as -1 and inliers as 1.
y_pred = lof.fit_predict(train_data.loc[:, train_data.columns != 'y'])
# Negated LOF of the training samples; it is negated again before the ROC / PR curves.
y_scores = lof.negative_outlier_factor_
end = time.process_time()
lof_time = end - start 
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
# Vectorised form of def_outlier: the detector's -1 (anomaly) becomes 1 and
# every other value becomes 0, without building one Series per row.
train_data['prediction'] = (train_data['y_pred'] == -1).astype(int)
# negative_outlier_factor_ is the negated LOF; flip the sign so that a higher
# y_scores value means "more anomalous" for the ROC / PR curves.
train_data['y_scores'] = -y_scores

In [None]:
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
lof_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
lof_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
lof_auc_precision_recall = metrics.auc(recall, precision)
print(lof_auc_precision_recall)

### DBSCAN

In [None]:
# Rescale every column of `data` (the label column 'y' is passed through the
# scaler as well) to [0, 1] and keep the original column names.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = pd.DataFrame(min_max_scaler.fit_transform(data.copy()), columns=data.columns)

In [None]:
dist = dbscan_tuner(train_data,41)

In [None]:
# Cluster every column except 'y' with DBSCAN and time it.
start = time.process_time()  # CPU time of this process, not wall-clock time
dbscan = DBSCAN(eps = 0.7, min_samples = 42)
# fit_predict() returns one cluster label per row; noise points get -1.
clusters = dbscan.fit_predict(train_data.loc[:, train_data.columns != 'y'])
end = time.process_time()
dbscan_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = clusters
# DBSCAN marks noise points with the label -1; treat exactly those as
# anomalies (1) and every clustered point as normal (0).  Vectorised form of
# def_outlier, avoiding a Python call per row.
train_data['prediction'] = (train_data['y_pred'] == -1).astype(int)

In [None]:
original_columns = data.columns.to_list()
original_columns.remove('y')

# Score every clustered point by its Euclidean distance to the centroid of
# its own cluster.
scored_clusters = []
for i in set(clusters):
    if i == -1:
        # Noise points have no cluster of their own; they are scored separately.
        continue
    score_data = train_data[train_data['y_pred'] == i].copy()
    center = score_data[original_columns].mean().to_numpy()
    score_data['score'] = np.linalg.norm(
        score_data[original_columns].to_numpy() - center, axis=1)
    scored_clusters.append(score_data)

# DataFrame.append was removed in pandas 2.0; collect the per-cluster frames
# and concatenate once.  This also avoids seeding the result with an empty,
# object-typed frame.
if scored_clusters:
    data_w_score = pd.concat(scored_clusters, ignore_index = True)
else:
    data_w_score = pd.DataFrame(columns = train_data.columns.to_list() + ['score'])

anomalies_data = train_data[train_data['y_pred'] == -1].copy()
# Float placeholder; the real scores are filled in by the next cell.
anomalies_data['score'] = np.nan
# Mean of every column per DBSCAN label (one row per cluster, incl. -1).
cluster_centers = pd.pivot_table(train_data,
              index = 'y_pred',
              aggfunc = 'mean')

In [None]:
# Noise points belong to no cluster, so each one is scored by its Euclidean
# distance to the closest cluster centroid.  A single column assignment
# replaces the chained `anomalies_data['score'].iloc[i] = ...`, which raises
# SettingWithCopyWarning and does not write at all under pandas Copy-on-Write.
real_centers = cluster_centers.loc[cluster_centers.index != -1, original_columns]
if len(anomalies_data) and len(real_centers):
    anomalies_data['score'] = distance.cdist(
        anomalies_data[original_columns].to_numpy(dtype=float),
        real_centers.to_numpy(dtype=float),
    ).min(axis=1)

data_for_auprc = pd.concat([data_w_score, anomalies_data])

In [None]:
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(data_for_auprc['y'], data_for_auprc['score'])
dbscan_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
dbscan_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(data_for_auprc['y'], data_for_auprc['score'])
dbscan_auc_precision_recall = metrics.auc(recall, precision)
print(dbscan_auc_precision_recall)

### CADE

Source: Aric LaBarr, webinar "Modern Approaches To Anomaly Detection", November 2, 2021

In [None]:
train_data = data.copy()

In [None]:
# Real observations, labelled Target = 0.  Copy the slice so that adding the
# label column never writes into (or warns about) train_data.
data_for_fake = train_data.loc[:, train_data.columns != 'y'].copy()
feature_columns = data_for_fake.columns.to_list()
data_for_fake['Target'] = 0

# Artificial observations, labelled Target = 1: every feature is drawn
# independently and uniformly between its observed minimum and maximum.
# Building the frame from a dict in one step avoids inserting the columns one
# at a time, and no draw is wasted on the constant 'Target' column.
fake = pd.DataFrame({
    col: np.random.uniform(np.min(data_for_fake[col]), np.max(data_for_fake[col]),
                           size = len(data_for_fake))
    for col in feature_columns
})
fake['Target'] = 1

In [None]:
data_combined = pd.concat([data_for_fake, fake], axis = 0)

In [None]:
# CADE classification step: train a random forest to separate the real rows
# (Target = 0) from the artificial ones (Target = 1), then score the real data.
start = time.process_time()  # CPU time of this process, not wall-clock time
model = RandomForestClassifier(n_estimators = 100)
model.fit(data_combined.loc[:, data_combined.columns != 'Target'],data_combined.loc[:, data_combined.columns == 'Target']['Target'] )
# Column 1 of predict_proba is the probability of the class Target = 1;
# it is stored in 'Target' and used as the anomaly score.
train_data['Target'] = model.predict_proba(train_data.loc[:, train_data.columns != 'y'])[:,1]
end = time.process_time()
cade_time = end - start
print(end - start)

In [None]:
# ROC AUC of the hard 0/1 labels obtained at each candidate threshold.
# The dict is deliberately not called `auc`: that name is imported from
# sklearn.metrics at the top of the file and would be shadowed.
# NOTE(review): the threshold is selected using the true labels 'y', so the
# reported CADE classification metrics are optimistic.
auc_by_threshold = {}
for i in np.arange(0,0.55,0.05):
    # Vectorised form of def_outlier_cade: 1 when Target > threshold, else 0.
    train_data['prediction'] = (train_data['Target'] > i).astype(int)
    fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['prediction'])
    auc_by_threshold[i] = metrics.auc(fpr, tpr)
# Threshold with the highest AUC (the first one wins on ties).
max_key = max(auc_by_threshold, key=auc_by_threshold.get)

In [None]:
train_data['prediction'] = train_data.apply(def_outlier_cade, args = (max_key,), axis = 1)

In [None]:
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['Target'])
cade_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
cade_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['Target'])
cade_auc_precision_recall = metrics.auc(recall, precision)
print(cade_auc_precision_recall)

### Deep Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
class AutoEncoder(Model):
    """Deep autoencoder: 10-5-3 encoder mirrored by a 5-10-output decoder.

    Dropout (rate 0.1) is placed between the dense layers.  The final layer
    has ``output_units`` sigmoid units, i.e. outputs in (0, 1); in this
    notebook the inputs are min-max scaled to [0, 1] before training.
    """

    def __init__(self, output_units):
        """Build encoder and decoder; ``output_units`` is the number of input features."""
        super().__init__()
        # Encoder: compresses the input down to a 3-unit code.
        self.encoder = Sequential([
          Dense(10,input_dim = output_units, activation='sigmoid'),
          Dropout(0.1),
          Dense(5, activation='relu'),
          Dropout(0.1),
          Dense(3, activation='relu')
        ])
        # Decoder: expands the code back to the original number of features.
        self.decoder = Sequential([
          Dense(5, activation='relu'),
          Dropout(0.1),
          Dense(10,  activation='relu'),
          Dropout(0.1),
          Dense(output_units, activation='sigmoid')
        ])
  
    def call(self, inputs):
        """Return the reconstruction of ``inputs`` (encoder followed by decoder)."""
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        return decoded

In [None]:
def find_threshold(model, x_train_scaled):
    """Return the anomaly threshold: mean plus one standard deviation of the
    MSLE reconstruction errors of ``model`` on ``x_train_scaled``."""
    reconstructed = model.predict(x_train_scaled)
    error_values = tf.keras.losses.msle(reconstructed, x_train_scaled).numpy()
    return np.mean(error_values) + np.std(error_values)

def get_predictions(model, x_train_scaled, threshold):
    """Return ``(preds, errors)`` for ``x_train_scaled``.

    ``errors`` holds the MSLE reconstruction errors and ``preds`` is a float
    Series that is 1.0 where the error exceeds ``threshold`` and 0.0 elsewhere.
    """
    reconstructed = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(reconstructed, x_train_scaled)
    is_anomaly = pd.Series(errors) > threshold
    return is_anomaly.astype(float), errors

In [None]:
# Train the autoencoder to reconstruct its own (scaled) input, then derive
# the threshold, predictions and scores; the whole sequence is timed.
start = time.process_time()  # CPU time of this process, not wall-clock time

model = AutoEncoder(output_units=x_train_scaled.shape[1])

model.compile(loss='msle', metrics=['mse'], optimizer='adam')

# Input and target are the same array: the network learns to reproduce its input.
history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=64,
)

threshold = find_threshold(model, x_train_scaled)

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
deep_autoencoders_time = end - start
print(end - start)

In [None]:
confusion_matrix(target, predictions)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
deep_autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
deep_autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, scores)
deep_ae_auc_precision_recall = metrics.auc(recall, precision)
print(deep_ae_auc_precision_recall)

### Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
class AutoEncoder(Model):
    """Shallow autoencoder: one 10-unit ReLU layer with dropout as encoder and
    a sigmoid layer of ``output_units`` units as decoder."""

    def __init__(self, output_units):
        super().__init__()
        encoder_layers = [
            Dense(10, input_dim=output_units, activation='relu'),
            Dropout(0.1),
        ]
        decoder_layers = [
            Dense(output_units, activation='sigmoid'),
        ]
        self.encoder = Sequential(encoder_layers)
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Encode ``inputs`` and return their reconstruction."""
        return self.decoder(self.encoder(inputs))

In [None]:
def find_threshold(model, x_train_scaled):
    """Return the anomaly cut-off for ``model`` on ``x_train_scaled``.

    The cut-off is the mean plus one standard deviation of the MSLE between
    the model's reconstructions and the inputs.
    """
    reconstructed = model.predict(x_train_scaled)
    error_values = tf.keras.losses.msle(reconstructed, x_train_scaled).numpy()
    return np.mean(error_values) + np.std(error_values)

def get_predictions(model, x_train_scaled, threshold):
    """Flag inputs whose reconstruction error exceeds ``threshold``.

    Returns a pair ``(preds, errors)``: ``preds`` is a pandas Series holding
    1.0 where the MSLE is above ``threshold`` and 0.0 otherwise; ``errors`` is
    the MSLE tensor itself, usable as a continuous anomaly score.
    """
    reconstructed = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(reconstructed, x_train_scaled)
    exceeds_threshold = pd.Series(errors) > threshold
    preds = exceeds_threshold.astype(float)
    return preds, errors

In [None]:
# Time (CPU seconds via time.process_time) the build, training and scoring of
# the autoencoder on the scaled features.
start = time.process_time()

model = AutoEncoder(output_units=x_train_scaled.shape[1])

# MSLE is the training loss; MSE is only tracked as an extra metric.
model.compile(loss='msle', metrics=['mse'], optimizer='adam')

# Input and target are the same array: the network learns to reconstruct it.
history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=64,
)

threshold = find_threshold(model, x_train_scaled)

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
autoencoders_time = end - start
print(end - start)

In [None]:
# Confusion matrix of true labels (rows) against thresholded predictions (columns).
confusion_matrix(target, predictions)

In [None]:
# ROC AUC computed from the continuous reconstruction-error scores.
fpr, tpr, _ = metrics.roc_curve(target, scores)
autoencoders_auc = metrics.auc(fpr, tpr)
# Repeated as the last expression so the notebook displays the value.
metrics.auc(fpr, tpr)

In [None]:
# Keep the report as a dict (read later under the '1' key) and print the text version.
autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve built from the continuous scores.
precision, recall, thresholds = precision_recall_curve(target, scores)
ae_auc_precision_recall = metrics.auc(recall, precision)
print(ae_auc_precision_recall)

### One-Class SVM rbf

In [None]:
# Split the frame into model inputs and the ground-truth label column 'y'.
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
# Rescale every feature to [0, 1] before fitting the SVM.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
# Time (CPU seconds) fitting and scoring a One-Class SVM with the default RBF kernel.
start = time.process_time()

model = OneClassSVM(gamma='scale', nu=0.1)
model.fit(x_train_scaled)
yhat = model.predict(x_train_scaled)
# decision_function is positive for inliers, so negate it to make larger = more anomalous.
y_scores = -model.decision_function(x_train_scaled)

end = time.process_time()
rbf_oc_svm_time = end - start
print(end - start)

In [None]:
# def_outlier is defined elsewhere in the notebook; presumably it maps the
# detector's -1/+1 output in 'y_pred' to a 1/0 outlier flag -- confirm there.
predictions = pd.DataFrame(yhat, columns = ['y_pred']).apply(def_outlier, axis = 1)

In [None]:
# Confusion matrix of true labels (rows) against predicted outlier flags (columns).
confusion_matrix(target, predictions)

In [None]:
# ROC AUC computed from the negated decision-function scores.
fpr, tpr, _ = metrics.roc_curve(target, y_scores)
rbf_oc_svm_auc = metrics.auc(fpr, tpr)
# Repeated as the last expression so the notebook displays the value.
metrics.auc(fpr, tpr)

In [None]:
# Keep the report as a dict (read later under the '1' key) and print the text version.
rbf_oc_svm_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve built from the continuous scores.
precision, recall, thresholds = precision_recall_curve(target, y_scores)
ocsvm_rbf_auc_precision_recall = metrics.auc(recall, precision)
print(ocsvm_rbf_auc_precision_recall)

### One-Class SVM linear

In [None]:
# Split the frame into model inputs and the ground-truth label column 'y'.
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
# Rescale every feature to [0, 1] before fitting the SVM.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
# Time (CPU seconds) fitting and scoring a One-Class SVM with a linear kernel.
start = time.process_time()

model = OneClassSVM(kernel = 'linear', nu=0.1)
model.fit(x_train_scaled)
yhat = model.predict(x_train_scaled)
# decision_function is positive for inliers, so negate it to make larger = more anomalous.
y_scores = -model.decision_function(x_train_scaled)

end = time.process_time()
lin_oc_svm_time = end - start
print(end - start)

In [None]:
# def_outlier is defined elsewhere in the notebook; presumably it maps the
# detector's -1/+1 output in 'y_pred' to a 1/0 outlier flag -- confirm there.
predictions = pd.DataFrame(yhat, columns = ['y_pred']).apply(def_outlier, axis = 1)

In [None]:
# Confusion matrix of true labels (rows) against predicted outlier flags (columns).
confusion_matrix(target, predictions)

In [None]:
# ROC AUC computed from the negated decision-function scores.
fpr, tpr, _ = metrics.roc_curve(target, y_scores)
lin_oc_svm_auc = metrics.auc(fpr, tpr)
# Repeated as the last expression so the notebook displays the value.
metrics.auc(fpr, tpr)

In [None]:
# Keep the report as a dict (read later under the '1' key) and print the text version.
lin_oc_svm_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve built from the continuous scores.
precision, recall, thresholds = precision_recall_curve(target, y_scores)
ocsvm_lin_auc_precision_recall = metrics.auc(recall, precision)
print(ocsvm_lin_auc_precision_recall)

### Performance

In [None]:
# Empty summary table; one row per detection method is added in the cells below.
cardio_performance = pd.DataFrame(columns = ['method','f1-score','sensitivity',
                                             'precision','Time','AUC','AU precision-recall curve' ])

In [None]:
# One result row per method for this dataset. The f1/recall/precision values
# are read from the '1' entry of each classification report; timings and AUCs
# come from the variables set in the per-method cells above.
iforest_performance = {'method':'iForest',
               'f1-score':iforest_report['1']['f1-score'], 
               'sensitivity':iforest_report['1']['recall'],
                'precision':iforest_report['1']['precision'],
              'Time':iforest_time,
              'AUC':iforest_auc,
                      'AU precision-recall curve':iforest_auc_precision_recall}
lof_performance = {'method':'LOF',
               'f1-score':lof_report['1']['f1-score'], 
               'sensitivity':lof_report['1']['recall'],
               'precision':lof_report['1']['precision'],    
              'Time':lof_time,
              'AUC':lof_auc,
                   'AU precision-recall curve':lof_auc_precision_recall}
dbscan_performance = { 'method':'DBSCAN',
               'f1-score':dbscan_report['1']['f1-score'], 
               'sensitivity':dbscan_report['1']['recall'],
               'precision':dbscan_report['1']['precision'],       
              'Time':dbscan_time,
              'AUC':dbscan_auc,
                     'AU precision-recall curve':dbscan_auc_precision_recall}
cade_performance = { 'method':'CADE',
               'f1-score':cade_report['1']['f1-score'], 
               'sensitivity':cade_report['1']['recall'],
               'precision':cade_report['1']['precision'],     
              'Time':cade_time,
              'AUC':cade_auc,
                   'AU precision-recall curve':cade_auc_precision_recall}
deep_autoencoders_performance = {'method':'Deep Autoencoders',
               'f1-score':deep_autoencoders_report['1']['f1-score'], 
               'sensitivity':deep_autoencoders_report['1']['recall'],
               'precision':deep_autoencoders_report['1']['precision'],                 
              'Time':deep_autoencoders_time,
              'AUC':deep_autoencoders_auc,
                                'AU precision-recall curve':deep_ae_auc_precision_recall}
autoencoders_performance = {'method':'Autoencoders',
               'f1-score':autoencoders_report['1']['f1-score'], 
               'sensitivity':autoencoders_report['1']['recall'],
               'precision':autoencoders_report['1']['precision'],             
              'Time':autoencoders_time,
              'AUC':autoencoders_auc,
                           'AU precision-recall curve':ae_auc_precision_recall}
rbf_oc_svm_performance = {'method':'OC-SVM rbf',
               'f1-score':rbf_oc_svm_report['1']['f1-score'], 
               'sensitivity':rbf_oc_svm_report['1']['recall'],
               'precision':rbf_oc_svm_report['1']['precision'],         
              'Time':rbf_oc_svm_time,
              'AUC':rbf_oc_svm_auc,
                         'AU precision-recall curve':ocsvm_rbf_auc_precision_recall}
lin_oc_svm_performance = {'method':'OC-SVM linear',
               'f1-score':lin_oc_svm_report['1']['f1-score'], 
               'sensitivity':lin_oc_svm_report['1']['recall'],
               'precision':lin_oc_svm_report['1']['precision'],           
              'Time':lin_oc_svm_time,
              'AUC':lin_oc_svm_auc,
                         'AU precision-recall curve':ocsvm_lin_auc_precision_recall}

In [None]:
# Add one row per method to the Cardio summary table.
# DataFrame.append was removed in pandas 2.0, so the result dicts are turned
# into a frame and concatenated instead.
cardio_rows = pd.DataFrame(
    [
        iforest_performance,
        lof_performance,
        dbscan_performance,
        cade_performance,
        deep_autoencoders_performance,
        autoencoders_performance,
        rbf_oc_svm_performance,
        lin_oc_svm_performance,
    ],
    columns=cardio_performance.columns,
)
# Skip the concat when the table is still empty: concatenating onto an empty
# frame is deprecated in recent pandas and would needlessly force object dtype.
if cardio_performance.empty:
    cardio_performance = cardio_rows
else:
    cardio_performance = pd.concat([cardio_performance, cardio_rows], ignore_index=True)

## ForestCover

**Dataset source**: http://odds.cs.stonybrook.edu/forestcovercovertype-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016).  ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources**:

Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation forest.” 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

K. M. Ting, J. T. S. Chuan, and F. T. Liu. “Mass: A New Ranking Measure for Anomaly Detection.”, IEEE Transactions on Knowledge and Data Engineering, 2009.

Kai Ming Ting, Guang-Tong Zhou, Fei Tony Liu & Tan Swee Chuan. (2010). Mass Estimation and Its Applications. Proceedings of The 16th ACM SIGKDD Conference on Knowledge Discovery and Data Mining 2010. pp. 989-998.

Swee Chuan Tan, Kai Ming Ting & Fei Tony Liu. (2011). Fast Anomaly Detection for Streaming Data. Proceedings of the International Joint Conference on Artificial Intelligence 2011. pp.1151-1156.

In [None]:
# Load the ForestCover dataset (CSV expected in the working directory).
data = pd.read_csv('./ForestCover.csv')

In [None]:
# Display (rows, columns) of the loaded frame.
data.shape

In [None]:
# Count rows per value of the label column 'y' (non-null entries of 'Col2').
pd.pivot_table(data,
             values = 'Col2',
               index = 'y', 
              aggfunc = 'count')

In [None]:
# Preview the first rows.
data.head()

### iForest

In [None]:
# Work on a copy: the cells below add 'y_pred', 'prediction' and 'y_scores'
# columns to train_data. Without the copy they would be written into `data`
# itself and then be picked up as input features by every later model that
# starts from `data` (LOF, DBSCAN, CADE, autoencoders, OC-SVM).
train_data = data.copy()

In [None]:
# Time (CPU seconds) fitting and scoring an Isolation Forest on every column except 'y'.
start = time.process_time()
rng = np.random.RandomState(42)
clf = IsolationForest(max_samples=256, random_state=rng, n_estimators=100)
clf.fit(train_data.loc[:, train_data.columns != 'y'])
y_pred = clf.predict(train_data.loc[:, train_data.columns != 'y'])
# score_samples: the lower, the more abnormal (negated in the next cell).
y_scores = clf.score_samples(train_data.loc[:, train_data.columns != 'y'])
end = time.process_time()
iforest_time = end - start
print(end - start)

In [None]:
# Attach the detector output to the frame. def_outlier is defined elsewhere in
# the notebook; presumably it maps 'y_pred' (-1/+1) to a 1/0 outlier flag -- confirm there.
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
# Negate so that larger values mean "more anomalous".
train_data['y_scores'] = -y_scores 

In [None]:
# Confusion matrix of true labels (rows) against predicted outlier flags (columns).
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
# ROC AUC computed from the continuous anomaly scores.
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
iforest_auc = metrics.auc(fpr, tpr)
# Repeated as the last expression so the notebook displays the value.
metrics.auc(fpr, tpr)

In [None]:
# Keep the report as a dict (read later under the '1' key) and print the text version.
iforest_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve built from the continuous scores.
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
iforest_auc_precision_recall = metrics.auc(recall, precision)
print(iforest_auc_precision_recall)

### LOF

In [None]:
# Min-max scale every column of `data` (including 'y') to [0, 1] and restore
# the column names, since fit_transform returns a bare array.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = min_max_scaler.fit_transform(data.copy())
train_data = pd.DataFrame(train_data)
train_data.columns = data.columns

In [None]:
# Time (CPU seconds) fitting Local Outlier Factor on every column except 'y'.
start = time.process_time()
lof = LocalOutlierFactor(n_neighbors=2860, contamination=.1)
y_pred = lof.fit_predict(train_data.loc[:, train_data.columns != 'y'])
# negative_outlier_factor_: the lower, the more abnormal (negated in the next cell).
y_scores = lof.negative_outlier_factor_
end = time.process_time()
lof_time = end - start
print(end - start)

In [None]:
# Attach the detector output to the frame. def_outlier is defined elsewhere in
# the notebook; presumably it maps 'y_pred' (-1/+1) to a 1/0 outlier flag -- confirm there.
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
# Negate so that larger values mean "more anomalous".
train_data['y_scores'] = -y_scores

In [None]:
# Confusion matrix of true labels (rows) against predicted outlier flags (columns).
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
# ROC AUC computed from the continuous anomaly scores.
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
lof_auc = metrics.auc(fpr, tpr)
# Repeated as the last expression so the notebook displays the value.
metrics.auc(fpr, tpr)

In [None]:
# Keep the report as a dict (read later under the '1' key) and print the text version.
lof_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve built from the continuous scores.
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
lof_auc_precision_recall = metrics.auc(recall, precision)
print(lof_auc_precision_recall)

### DBSCAN

In [None]:
# Min-max scale every column of `data` (including 'y') to [0, 1] and restore
# the column names, since fit_transform returns a bare array.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = min_max_scaler.fit_transform(data.copy())
train_data = pd.DataFrame(train_data)
train_data.columns = data.columns

In [None]:
# Random 10% subsample used for the tuning plot below (dbscan_tuner builds a
# full pairwise distance matrix). No random_state is set, so it differs per run.
data_dbscan = train_data.sample(frac=0.1)

In [None]:
# Sorted k-distance plot on the subsample, used to choose eps for DBSCAN.
dist = dbscan_tuner(data_dbscan ,19)

In [None]:
# Time (CPU seconds) clustering every column except 'y' with DBSCAN.
start = time.process_time()
dbscan = DBSCAN(eps = 0.1, min_samples = 20)
# fit_predict labels noise points -1; other values are cluster ids.
clusters = dbscan.fit_predict(train_data.loc[:, train_data.columns != 'y'])
end = time.process_time()
dbscan_time = end - start
print(end - start)

In [None]:
# Store the DBSCAN cluster label of each row (-1 = noise).
train_data['y_pred'] = clusters

In [None]:
# def_outlier is defined elsewhere in the notebook; presumably it flags rows
# whose 'y_pred' is -1 as outliers (1) and everything else as 0 -- confirm there.
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)

In [None]:
# Build a continuous anomaly score for the points DBSCAN assigned to a
# cluster: the Euclidean distance of each point to its own cluster centre.
original_columns = data.columns.to_list()
original_columns.remove('y')

# DataFrame.append was removed in pandas 2.0, so per-cluster frames are
# collected and concatenated once. The distance is computed with a vectorised
# norm rather than a row-wise apply.
scored_clusters = []
for cluster_id in set(clusters):
    if cluster_id == -1:
        # Noise points are scored separately (distance to the nearest centre).
        continue
    score_data = train_data[train_data['y_pred'] == cluster_id].copy()
    center = score_data[original_columns].mean().to_numpy()
    score_data['score'] = np.linalg.norm(
        score_data[original_columns].to_numpy() - center, axis=1
    )
    scored_clusters.append(score_data)

if scored_clusters:
    data_w_score = pd.concat(scored_clusters, ignore_index=True)
else:
    # No cluster found: keep an empty frame with the expected columns.
    data_w_score = pd.DataFrame(columns=train_data.columns.to_list() + ['score'])

# Noise points; their 'score' placeholder is filled in by the next cell.
anomalies_data = train_data[train_data['y_pred'] == -1].copy()
anomalies_data['score'] = ''
# Per-cluster mean of every column, indexed by cluster label.
cluster_centers = pd.pivot_table(train_data,
              index = 'y_pred',
              aggfunc = 'mean')

In [None]:
# Score each noise point by its Euclidean distance to the nearest cluster
# centre. The whole column is assigned at once: the former
# `anomalies_data['score'].iloc[i] = ...` was a chained assignment, which
# raises SettingWithCopyWarning and does not write through under pandas
# Copy-on-Write. cdist also replaces the per-row Python loop.
centers = cluster_centers.loc[cluster_centers.index != -1, original_columns].to_numpy(dtype=float)
anomaly_points = anomalies_data[original_columns].to_numpy(dtype=float)
if len(anomaly_points):
    anomalies_data['score'] = distance.cdist(anomaly_points, centers).min(axis=1)
else:
    # No noise points: keep an empty float column.
    anomalies_data['score'] = np.array([], dtype=float)

# Clustered points and noise points together, each with a 'score' column.
data_for_auprc = pd.concat([data_w_score, anomalies_data])

In [None]:
# Confusion matrix of true labels (rows) against predicted outlier flags (columns).
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
# ROC AUC from the distance-based scores; labels and scores both come from
# data_for_auprc so they stay row-aligned.
fpr, tpr, _ = metrics.roc_curve(data_for_auprc['y'], data_for_auprc['score'])
dbscan_auc = metrics.auc(fpr, tpr)
# Repeated as the last expression so the notebook displays the value.
metrics.auc(fpr, tpr)

In [None]:
# Keep the report as a dict (read later under the '1' key) and print the text version.
dbscan_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve built from the distance-based scores.
precision, recall, thresholds = precision_recall_curve(data_for_auprc['y'], data_for_auprc['score'])
dbscan_auc_precision_recall = metrics.auc(recall, precision)
print(dbscan_auc_precision_recall)

### CADE

Source: Aric LaBarr, webinar "Modern Approaches To Anomaly Detection", November 2, 2021

In [None]:
# Work on a copy so the columns added below do not end up in `data`.
train_data = data.copy()

In [None]:
# CADE: label the real observations Target=0 and build an equally sized
# artificial set (Target=1) by sampling each feature uniformly between its
# observed minimum and maximum.
# .copy() makes data_for_fake an independent frame, so adding 'Target' does
# not raise SettingWithCopyWarning.
data_for_fake = train_data.loc[:, train_data.columns != 'y'].copy()
feature_columns = data_for_fake.columns.to_list()
# Only the real features are sampled; 'Target' is set explicitly below.
fake = pd.DataFrame({
    column: np.random.uniform(
        data_for_fake[column].min(),
        data_for_fake[column].max(),
        size=len(data_for_fake),
    )
    for column in feature_columns
})
data_for_fake['Target'] = 0
fake['Target'] = 1

In [None]:
# Stack real (Target=0) and artificial (Target=1) rows into one training set.
data_combined = pd.concat([data_for_fake, fake], axis = 0)

In [None]:
# Time (CPU seconds) training a random forest to tell real rows from
# artificial ones, then scoring the real data.
start = time.process_time()
model = RandomForestClassifier(n_estimators = 100)
model.fit(data_combined.loc[:, data_combined.columns != 'Target'],data_combined.loc[:, data_combined.columns == 'Target']['Target'] )
# Column 1 of predict_proba is the probability of Target=1 (artificial); it is
# stored in train_data['Target'] and used as the anomaly score below.
train_data['Target'] = model.predict_proba(train_data.loc[:, train_data.columns != 'y'])[:,1]
end = time.process_time()
cade_time = end - start
print(end - start)

In [None]:
# Try probability cut-offs from 0 to 0.5 in steps of 0.05 and keep the one
# with the highest ROC AUC of the resulting binary predictions.
# The dict was previously named `auc`, which shadowed the `auc` function
# imported from sklearn.metrics.
# NOTE(review): the cut-off is selected using the true labels 'y', so the
# metrics reported for it are optimistic.
# def_outlier_cade is defined elsewhere in the notebook.
threshold_auc = {}
for cutoff in np.arange(0, 0.55, 0.05):
    train_data['prediction'] = train_data.apply(def_outlier_cade, args=(cutoff,), axis = 1)
    fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['prediction'])
    threshold_auc[cutoff] = metrics.auc(fpr, tpr)
max_key = max(threshold_auc, key=threshold_auc.get)

In [None]:
# Recompute the binary predictions with the cut-off selected above (max_key).
train_data['prediction'] = train_data.apply(def_outlier_cade, args = (max_key,), axis = 1)

In [None]:
# Confusion matrix of true labels (rows) against predicted outlier flags (columns).
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
# ROC AUC computed from the continuous probability stored in 'Target'.
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['Target'])
cade_auc = metrics.auc(fpr, tpr)
# Repeated as the last expression so the notebook displays the value.
metrics.auc(fpr, tpr)

In [None]:
# Keep the report as a dict (read later under the '1' key) and print the text version.
cade_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve built from the 'Target' probability.
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['Target'])
cade_auc_precision_recall = metrics.auc(recall, precision)
print(cade_auc_precision_recall)

### Deep Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
# Split the frame into model inputs and the ground-truth label column 'y'.
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
# Rescale every feature to [0, 1]; the autoencoder's sigmoid output layer produces values in that range.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
class AutoEncoder(Model):
    """Deep autoencoder: a 5-3-3 encoder and a 3-5-``output_units`` decoder,
    with 10% dropout between the dense layers and a sigmoid output."""

    def __init__(self, output_units):
        super().__init__()
        encoder_layers = [
            Dense(5, input_dim=output_units, activation='sigmoid'),
            Dropout(0.1),
            Dense(3, activation='relu'),
            Dropout(0.1),
            Dense(3, activation='relu'),
        ]
        decoder_layers = [
            Dense(3, activation='relu'),
            Dropout(0.1),
            Dense(5, activation='relu'),
            Dropout(0.1),
            Dense(output_units, activation='sigmoid'),
        ]
        self.encoder = Sequential(encoder_layers)
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Encode ``inputs`` and return their reconstruction."""
        return self.decoder(self.encoder(inputs))

In [None]:
def find_threshold(model, x_train_scaled):
    """Return the anomaly cut-off for ``model`` on ``x_train_scaled``.

    The cut-off is the mean plus one standard deviation of the MSLE between
    the model's reconstructions and the inputs.
    """
    reconstructed = model.predict(x_train_scaled)
    error_values = tf.keras.losses.msle(reconstructed, x_train_scaled).numpy()
    return np.mean(error_values) + np.std(error_values)

def get_predictions(model, x_train_scaled, threshold):
    """Flag inputs whose reconstruction error exceeds ``threshold``.

    Returns a pair ``(preds, errors)``: ``preds`` is a pandas Series holding
    1.0 where the MSLE is above ``threshold`` and 0.0 otherwise; ``errors`` is
    the MSLE tensor itself, usable as a continuous anomaly score.
    """
    reconstructed = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(reconstructed, x_train_scaled)
    exceeds_threshold = pd.Series(errors) > threshold
    preds = exceeds_threshold.astype(float)
    return preds, errors

In [None]:
# Time (CPU seconds via time.process_time) the build, training and scoring of
# the autoencoder on the scaled features.
start = time.process_time()

model = AutoEncoder(output_units=x_train_scaled.shape[1])

# MSLE is the training loss; MSE is only tracked as an extra metric.
model.compile(loss='msle', metrics=['mse'], optimizer='adam')

# Input and target are the same array: the network learns to reconstruct it.
history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=512,
)

threshold = find_threshold(model, x_train_scaled)

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
deep_autoencoders_time = end - start
print(end - start)

In [None]:
# Confusion matrix of true labels (rows) against thresholded predictions (columns).
confusion_matrix(target, predictions)

In [None]:
# ROC AUC computed from the continuous reconstruction-error scores.
fpr, tpr, _ = metrics.roc_curve(target, scores)
deep_autoencoders_auc = metrics.auc(fpr, tpr)
# Repeated as the last expression so the notebook displays the value.
metrics.auc(fpr, tpr)

In [None]:
# Keep the report as a dict (read later under the '1' key) and print the text version.
deep_autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve built from the continuous scores.
precision, recall, thresholds = precision_recall_curve(target, scores)
deep_ae_auc_precision_recall = metrics.auc(recall, precision)
print(deep_ae_auc_precision_recall)

### Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
# Split the frame into model inputs and the ground-truth label column 'y'.
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
# Rescale every feature to [0, 1]; the autoencoder's sigmoid output layer produces values in that range.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
class AutoEncoder(Model):
    """Shallow autoencoder: one 5-unit ReLU layer with dropout as encoder and
    a sigmoid layer of ``output_units`` units as decoder."""

    def __init__(self, output_units):
        super().__init__()
        encoder_layers = [
            Dense(5, input_dim=output_units, activation='relu'),
            Dropout(0.1),
        ]
        decoder_layers = [
            Dense(output_units, activation='sigmoid'),
        ]
        self.encoder = Sequential(encoder_layers)
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Encode ``inputs`` and return their reconstruction."""
        return self.decoder(self.encoder(inputs))

In [None]:
def find_threshold(model, x_train_scaled):
    """Return the anomaly cut-off for ``model`` on ``x_train_scaled``.

    The cut-off is the mean plus one standard deviation of the MSLE between
    the model's reconstructions and the inputs.
    """
    reconstructed = model.predict(x_train_scaled)
    error_values = tf.keras.losses.msle(reconstructed, x_train_scaled).numpy()
    return np.mean(error_values) + np.std(error_values)

def get_predictions(model, x_train_scaled, threshold):
    """Flag inputs whose reconstruction error exceeds ``threshold``.

    Returns a pair ``(preds, errors)``: ``preds`` is a pandas Series holding
    1.0 where the MSLE is above ``threshold`` and 0.0 otherwise; ``errors`` is
    the MSLE tensor itself, usable as a continuous anomaly score.
    """
    reconstructed = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(reconstructed, x_train_scaled)
    exceeds_threshold = pd.Series(errors) > threshold
    preds = exceeds_threshold.astype(float)
    return preds, errors

In [None]:
# Time (CPU seconds via time.process_time) the build, training and scoring of
# the autoencoder on the scaled features.
start = time.process_time()

model = AutoEncoder(output_units=x_train_scaled.shape[1])

# MSLE is the training loss; MSE is only tracked as an extra metric.
model.compile(loss='msle', metrics=['mse'], optimizer='adam')

# Input and target are the same array: the network learns to reconstruct it.
history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=512,
)

threshold = find_threshold(model, x_train_scaled)

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
autoencoders_time = end - start
print(end - start)

In [None]:
# Confusion matrix of true labels (rows) against thresholded predictions (columns).
confusion_matrix(target, predictions)

In [None]:
# ROC AUC computed from the continuous reconstruction-error scores.
fpr, tpr, _ = metrics.roc_curve(target, scores)
autoencoders_auc = metrics.auc(fpr, tpr)
# Repeated as the last expression so the notebook displays the value.
metrics.auc(fpr, tpr)

In [None]:
# Keep the report as a dict (read later under the '1' key) and print the text version.
autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve built from the continuous scores.
precision, recall, thresholds = precision_recall_curve(target, scores)
ae_auc_precision_recall = metrics.auc(recall, precision)
print(ae_auc_precision_recall)

### One-Class SVM rbf

In [None]:
# Split the frame into model inputs and the ground-truth label column 'y'.
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
# Rescale every feature to [0, 1] before fitting the SVM.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
# Time (CPU seconds) fitting and scoring a One-Class SVM with the default RBF kernel.
start = time.process_time()

model = OneClassSVM(gamma='scale', nu=0.1)
model.fit(x_train_scaled)
yhat = model.predict(x_train_scaled)
# decision_function is positive for inliers, so negate it to make larger = more anomalous.
y_scores = -model.decision_function(x_train_scaled)

end = time.process_time()
rbf_oc_svm_time = end - start
print(end - start)

In [None]:
# def_outlier is defined elsewhere in the notebook; presumably it maps the
# detector's -1/+1 output in 'y_pred' to a 1/0 outlier flag -- confirm there.
predictions = pd.DataFrame(yhat, columns = ['y_pred']).apply(def_outlier, axis = 1)

In [None]:
# Confusion matrix of true labels (rows) against predicted outlier flags (columns).
confusion_matrix(target, predictions)

In [None]:
# ROC AUC computed from the negated decision-function scores.
fpr, tpr, _ = metrics.roc_curve(target, y_scores)
rbf_oc_svm_auc = metrics.auc(fpr, tpr)
# Repeated as the last expression so the notebook displays the value.
metrics.auc(fpr, tpr)

In [None]:
# Keep the report as a dict (read later under the '1' key) and print the text version.
rbf_oc_svm_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve built from the continuous scores.
precision, recall, thresholds = precision_recall_curve(target, y_scores)
ocsvm_rbf_auc_precision_recall = metrics.auc(recall, precision)
print(ocsvm_rbf_auc_precision_recall)

### One-Class SVM linear

In [None]:
# Split the frame into model inputs and the ground-truth label column 'y'.
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
# Rescale every feature to [0, 1] before fitting the SVM.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
# Time (CPU seconds) fitting and scoring a One-Class SVM with a linear kernel.
start = time.process_time()

model = OneClassSVM(kernel = 'linear', nu=0.1)
model.fit(x_train_scaled)
yhat = model.predict(x_train_scaled)
# decision_function is positive for inliers, so negate it to make larger = more anomalous.
y_scores = -model.decision_function(x_train_scaled)

end = time.process_time()
lin_oc_svm_time = end - start
print(end - start)

In [None]:
# def_outlier is defined elsewhere in the notebook; presumably it maps the
# detector's -1/+1 output in 'y_pred' to a 1/0 outlier flag -- confirm there.
predictions = pd.DataFrame(yhat, columns = ['y_pred']).apply(def_outlier, axis = 1)

In [None]:
# Confusion matrix of true labels (rows) against predicted outlier flags (columns).
confusion_matrix(target, predictions)

In [None]:
# ROC AUC computed from the negated decision-function scores.
fpr, tpr, _ = metrics.roc_curve(target, y_scores)
lin_oc_svm_auc = metrics.auc(fpr, tpr)
# Repeated as the last expression so the notebook displays the value.
metrics.auc(fpr, tpr)

In [None]:
# Keep the report as a dict (read later under the '1' key) and print the text version.
lin_oc_svm_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve built from the continuous scores.
precision, recall, thresholds = precision_recall_curve(target, y_scores)
ocsvm_lin_auc_precision_recall = metrics.auc(recall, precision)
print(ocsvm_lin_auc_precision_recall)

### Performance

In [None]:
# Empty summary table; one row per detection method is added in the cells below.
forestcover_performance = pd.DataFrame(columns = ['method','f1-score','sensitivity',
                                                  'precision','Time','AUC', 'AU precision-recall curve'])

In [None]:
# One result row per method for this dataset. The f1/recall/precision values
# are read from the '1' entry of each classification report; timings and AUCs
# come from the variables set in the per-method cells above.
iforest_performance = {'method':'iForest',
               'f1-score':iforest_report['1']['f1-score'], 
               'sensitivity':iforest_report['1']['recall'],
               'precision':iforest_report['1']['precision'],        
              'Time':iforest_time,
              'AUC':iforest_auc,
                      'AU precision-recall curve':iforest_auc_precision_recall}
lof_performance = {'method':'LOF',
               'f1-score':lof_report['1']['f1-score'], 
               'sensitivity':lof_report['1']['recall'],
               'precision':lof_report['1']['precision'],    
              'Time':lof_time,
              'AUC':lof_auc,
                  'AU precision-recall curve':lof_auc_precision_recall}
dbscan_performance = { 'method':'DBSCAN',
               'f1-score':dbscan_report['1']['f1-score'], 
               'sensitivity':dbscan_report['1']['recall'],
               'precision':dbscan_report['1']['precision'],       
              'Time':dbscan_time,
              'AUC':dbscan_auc,
                     'AU precision-recall curve':dbscan_auc_precision_recall}
cade_performance = { 'method':'CADE',
               'f1-score':cade_report['1']['f1-score'], 
               'sensitivity':cade_report['1']['recall'],
               'precision':cade_report['1']['precision'],     
              'Time':cade_time,
              'AUC':cade_auc,
                   'AU precision-recall curve':cade_auc_precision_recall}
deep_autoencoders_performance = {'method':'Deep Autoencoders',
               'f1-score':deep_autoencoders_report['1']['f1-score'], 
               'sensitivity':deep_autoencoders_report['1']['recall'],
               'precision':deep_autoencoders_report['1']['precision'],                  
              'Time':deep_autoencoders_time,
              'AUC':deep_autoencoders_auc,
                                'AU precision-recall curve':deep_ae_auc_precision_recall}
autoencoders_performance = {'method':'Autoencoders',
               'f1-score':autoencoders_report['1']['f1-score'], 
               'sensitivity':autoencoders_report['1']['recall'],
               'precision':autoencoders_report['1']['precision'],             
              'Time':autoencoders_time,
              'AUC':autoencoders_auc,
                           'AU precision-recall curve':ae_auc_precision_recall}
rbf_oc_svm_performance = {'method':'OC-SVM rbf',
               'f1-score':rbf_oc_svm_report['1']['f1-score'], 
               'sensitivity':rbf_oc_svm_report['1']['recall'],
               'precision':rbf_oc_svm_report['1']['precision'],           
              'Time':rbf_oc_svm_time,
              'AUC':rbf_oc_svm_auc,
                         'AU precision-recall curve':ocsvm_rbf_auc_precision_recall}
lin_oc_svm_performance = {'method':'OC-SVM linear',
               'f1-score':lin_oc_svm_report['1']['f1-score'], 
               'sensitivity':lin_oc_svm_report['1']['recall'],
               'precision':lin_oc_svm_report['1']['precision'],           
              'Time':lin_oc_svm_time,
              'AUC':lin_oc_svm_auc,
                         'AU precision-recall curve':ocsvm_lin_auc_precision_recall}

In [None]:
# Add one row per method to the ForestCover summary table.
# DataFrame.append was removed in pandas 2.0, so the result dicts are turned
# into a frame and concatenated instead.
forestcover_rows = pd.DataFrame(
    [
        iforest_performance,
        lof_performance,
        dbscan_performance,
        cade_performance,
        deep_autoencoders_performance,
        autoencoders_performance,
        rbf_oc_svm_performance,
        lin_oc_svm_performance,
    ],
    columns=forestcover_performance.columns,
)
# Skip the concat when the table is still empty: concatenating onto an empty
# frame is deprecated in recent pandas and would needlessly force object dtype.
if forestcover_performance.empty:
    forestcover_performance = forestcover_rows
else:
    forestcover_performance = pd.concat([forestcover_performance, forestcover_rows], ignore_index=True)

## Annthyroid

**Dataset source**: http://odds.cs.stonybrook.edu/annthyroid-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016).  ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources**:

Abe, Naoki, Bianca Zadrozny, and John Langford. “Outlier detection by active learning.” Proceedings of the 12th ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2006.

Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation forest.” 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

K. M. Ting, J. T. S. Chuan, and F. T. Liu. “Mass: A New Ranking Measure for Anomaly Detection.”, IEEE Transactions on Knowledge and Data Engineering, 2009.

In [None]:
# Load the Annthyroid dataset (CSV expected in the working directory).
data = pd.read_csv('./annthyroid.csv')

In [None]:
# Preview the first rows.
data.head()

In [None]:
# Display (rows, columns) of the loaded frame.
data.shape

In [None]:
# Count rows per value of the label column 'y' (non-null entries of 'Col1').
pd.pivot_table(data,
             values = 'Col1',
               index = 'y', 
              aggfunc = 'count')

### iForest

In [None]:
# Work on a copy so the helper columns added below do not end up in `data`.
train_data = data.copy()

In [None]:
# Time (CPU seconds) fitting and scoring an Isolation Forest on every column except 'y'.
start = time.process_time()

rng = np.random.RandomState(42)
clf = IsolationForest(max_samples=256, random_state=rng, n_estimators=100)
clf.fit(train_data.loc[:, train_data.columns != 'y'])
y_pred = clf.predict(train_data.loc[:, train_data.columns != 'y'])
# score_samples: the lower, the more abnormal (negated in the next cell).
y_scores = clf.score_samples(train_data.loc[:, train_data.columns != 'y'])

end = time.process_time()
iforest_time = end - start
print(end - start)

In [None]:
# Attach the detector output to the frame. def_outlier is defined elsewhere in
# the notebook; presumably it maps 'y_pred' (-1/+1) to a 1/0 outlier flag -- confirm there.
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
# Negate so that larger values mean "more anomalous".
train_data['y_scores'] = -y_scores

In [None]:
# Confusion matrix of true labels (rows) against predicted outlier flags (columns).
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
# ROC AUC computed from the continuous anomaly scores.
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
iforest_auc = metrics.auc(fpr, tpr)
# Repeated as the last expression so the notebook displays the value.
metrics.auc(fpr, tpr)

In [None]:
# Keep the report as a dict (read later under the '1' key) and print the text version.
iforest_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve built from the continuous scores.
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
iforest_auc_precision_recall = metrics.auc(recall, precision)
print(iforest_auc_precision_recall)

### LOF

In [None]:
# Min-max scale every column of `data` (including 'y') to [0, 1] and restore
# the column names, since fit_transform returns a bare array.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = min_max_scaler.fit_transform(data.copy())
train_data = pd.DataFrame(train_data)
train_data.columns = data.columns

In [None]:
# Time (CPU seconds) fitting Local Outlier Factor on every column except 'y'.
start = time.process_time()

lof = LocalOutlierFactor(n_neighbors=72, contamination=.1, novelty=False)
y_pred = lof.fit_predict(train_data.loc[:, train_data.columns != 'y'])
# negative_outlier_factor_: the lower, the more abnormal (negated in the next cell).
y_scores = lof.negative_outlier_factor_

end = time.process_time()
lof_time = end - start
print(end - start)

In [None]:
# Attach the detector output to the frame. def_outlier is defined elsewhere in
# the notebook; presumably it maps 'y_pred' (-1/+1) to a 1/0 outlier flag -- confirm there.
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
# Negate so that larger values mean "more anomalous".
train_data['y_scores'] = -y_scores

In [None]:
# Confusion matrix of true labels (rows) against predicted outlier flags (columns).
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
# ROC AUC computed from the continuous anomaly scores.
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
lof_auc = metrics.auc(fpr, tpr)
# Repeated as the last expression so the notebook displays the value.
metrics.auc(fpr, tpr)

In [None]:
# Keep the report as a dict (read later under the '1' key) and print the text version.
lof_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve built from the continuous scores.
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
lof_auc_precision_recall = metrics.auc(recall, precision)
print(lof_auc_precision_recall)

### DBSCAN

In [None]:
# Min-max scale every column of `data` (including 'y') to [0, 1] and restore
# the column names, since fit_transform returns a bare array.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = min_max_scaler.fit_transform(data.copy())
train_data = pd.DataFrame(train_data)
train_data.columns = data.columns

In [None]:
dist = dbscan_tuner(train_data ,11)

In [None]:
start = time.process_time()
dbscan = DBSCAN(eps = 0.1, min_samples = 12)
clusters = dbscan.fit_predict(train_data.loc[:, train_data.columns != 'y'])
end = time.process_time()
dbscan_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = clusters

In [None]:
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)

In [None]:
# Feature columns = every original column except the ground-truth label.
original_columns = data.columns.to_list()
original_columns.remove('y')

# Score each clustered observation by the Euclidean distance to the centroid of
# its own cluster. The per-cluster frames are collected in a list and
# concatenated once: DataFrame.append was removed in pandas 2.0, and growing a
# frame inside a loop copies it on every iteration.
scored_clusters = []
for i in sorted(set(clusters) - {-1}):  # -1 is DBSCAN's noise label, scored separately
    score_data = train_data[train_data['y_pred'] == i].copy()
    cluster_points = score_data[original_columns].to_numpy(dtype=float)
    center = cluster_points.mean(axis=0)
    # Vectorised row-wise Euclidean norm instead of a Python-level apply.
    score_data['score'] = np.linalg.norm(cluster_points - center, axis=1)
    scored_clusters.append(score_data)

if scored_clusters:
    data_w_score = pd.concat(scored_clusters, ignore_index=True)
else:
    # Everything was labelled as noise: keep an empty frame with the expected columns.
    data_w_score = pd.DataFrame(columns=train_data.columns.to_list() + ['score'])

# Noise points; their 'score' column is a float placeholder that is filled in
# afterwards with the distance to the nearest cluster centroid.
anomalies_data = train_data[train_data['y_pred'] == -1].copy()
anomalies_data['score'] = np.nan

# Per-cluster mean of every column, indexed by cluster label (including -1).
cluster_centers = pd.pivot_table(train_data,
              index = 'y_pred',
              aggfunc = 'mean')

In [None]:
# Score every noise point (cluster label -1) by its Euclidean distance to the
# nearest real cluster centroid. The whole column is assigned at once: the
# former `anomalies_data['score'].iloc[i] = ...` was a chained assignment,
# which raises SettingWithCopyWarning and does not write through under pandas
# copy-on-write.
real_centers = cluster_centers.loc[cluster_centers.index != -1, original_columns]
if len(anomalies_data) and len(real_centers):
    center_distances = distance.cdist(
        anomalies_data[original_columns].to_numpy(dtype=float),
        real_centers.to_numpy(dtype=float),
    )  # shape: (number of noise points, number of clusters)
    anomalies_data['score'] = center_distances.min(axis=1)
else:
    # No noise points, or no cluster to measure against: no distance is defined.
    anomalies_data['score'] = np.nan

# Clustered and noise observations together, each with its anomaly score.
data_for_auprc = pd.concat([data_w_score, anomalies_data])

In [None]:
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(data_for_auprc['y'], data_for_auprc['score'])
dbscan_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
dbscan_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(data_for_auprc['y'], data_for_auprc['score'])
dbscan_auc_precision_recall = metrics.auc(recall, precision)
print(dbscan_auc_precision_recall)

### CADE

Source: Aric LaBarr, webinar "Modern Approaches To Anomaly Detection", November 2, 2021

In [None]:
train_data = data.copy()

In [None]:
# Real observations (Target = 0). The explicit copy makes this an independent
# frame, so adding the 'Target' column below neither warns
# (SettingWithCopyWarning) nor risks touching train_data.
data_for_fake = train_data.loc[:, train_data.columns != 'y'].copy()

# Synthetic observations (Target = 1): every feature is drawn independently
# from a uniform distribution over the observed [min, max] range of that
# feature. Only the feature columns are sampled; 'Target' is set explicitly.
n_rows = len(data_for_fake)
fake = pd.DataFrame({
    column: np.random.uniform(data_for_fake[column].min(),
                              data_for_fake[column].max(),
                              size=n_rows)
    for column in data_for_fake.columns
})

data_for_fake['Target'] = 0
fake['Target'] = 1

In [None]:
data_combined = pd.concat([data_for_fake, fake], axis = 0)

In [None]:
start = time.process_time()

# Train a random forest to tell real rows (Target 0) from synthetic ones (Target 1).
is_target = data_combined.columns == 'Target'
model = RandomForestClassifier(n_estimators=100)
model.fit(data_combined.loc[:, ~is_target], data_combined['Target'])

# Second probability column (class 1) becomes each real observation's CADE score.
class_probabilities = model.predict_proba(train_data.loc[:, train_data.columns != 'y'])
train_data['Target'] = class_probabilities[:, 1]

end = time.process_time()
cade_time = end - start
print(cade_time)

In [None]:
# ROC AUC of the hard predictions for each candidate value passed to
# def_outlier_cade. The results live under a dedicated name: calling the dict
# `auc` would rebind (and hide) the `auc` function imported from
# sklearn.metrics at the top of the notebook.
auc_by_threshold = {}
for i in np.arange(0,0.55,0.05):
    train_data['prediction'] = train_data.apply(def_outlier_cade, args=(i,), axis = 1)
    fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['prediction'])
    auc_by_threshold[i] = metrics.auc(fpr, tpr)
# Candidate with the highest AUC (the first one wins on ties).
max_key = max(auc_by_threshold, key=auc_by_threshold.get)

In [None]:
train_data['prediction'] = train_data.apply(def_outlier_cade,args=(max_key,), axis = 1)

In [None]:
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['Target'])
cade_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
cade_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['Target'])
cade_auc_precision_recall = metrics.auc(recall, precision)
print(cade_auc_precision_recall)

### Deep Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
class AutoEncoder(Model):
    """Deep dense autoencoder whose hidden layers all have 3 units.

    Encoder: Dense(3, sigmoid) -> Dropout(0.1) -> Dense(3, relu)
             -> Dropout(0.1) -> Dense(3, relu).
    Decoder: Dense(3, relu) -> Dropout(0.1) -> Dense(3, relu)
             -> Dropout(0.1) -> Dense(output_units, sigmoid).

    Args:
        output_units: number of input features; also the width of the
            reconstructed output.
    """

    def __init__(self, output_units):
        super().__init__()
        encoder_layers = [
            Dense(3, input_dim=output_units, activation='sigmoid'),
            Dropout(0.1),
            Dense(3, activation='relu'),
            Dropout(0.1),
            Dense(3, activation='relu'),
        ]
        decoder_layers = [
            Dense(3, activation='relu'),
            Dropout(0.1),
            Dense(3, activation='relu'),
            Dropout(0.1),
            Dense(output_units, activation='sigmoid'),
        ]
        self.encoder = Sequential(encoder_layers)
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Encode ``inputs`` and return their reconstruction."""
        return self.decoder(self.encoder(inputs))

In [None]:
def find_threshold(model, x_train_scaled):
    """Return the anomaly cut-off: mean + one standard deviation of the
    MSLE reconstruction errors of ``model`` on ``x_train_scaled``."""
    reconstructed = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(reconstructed, x_train_scaled).numpy()
    return np.mean(errors) + np.std(errors)

def get_predictions(model, x_train_scaled, threshold):
    """Flag observations whose reconstruction error exceeds ``threshold``.

    Returns:
        (preds, errors): ``preds`` is a float Series holding 1.0 where the
        MSLE reconstruction error is strictly above ``threshold`` and 0.0
        elsewhere; ``errors`` is the MSLE tensor itself, usable as a
        continuous anomaly score.
    """
    reconstructed = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(reconstructed, x_train_scaled)
    exceeds_threshold = pd.Series(errors) > threshold
    return exceeds_threshold.astype(float), errors

In [None]:
start = time.process_time()

model = AutoEncoder(output_units=x_train_scaled.shape[1])

model.compile(loss='msle', metrics=['mse'], optimizer='adam')

history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=512,
)

threshold = find_threshold(model, x_train_scaled)

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
deep_autoencoders_time = end - start
print(end - start)

In [None]:
confusion_matrix(target, predictions)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
deep_autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
deep_autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, scores)
deep_ae_auc_precision_recall = metrics.auc(recall, precision)
print(deep_ae_auc_precision_recall)

### Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
class AutoEncoder(Model):
    """Shallow dense autoencoder with a single 3-unit hidden layer.

    Encoder: Dense(3, relu) -> Dropout(0.1).
    Decoder: Dense(output_units, sigmoid).

    Args:
        output_units: number of input features; also the width of the
            reconstructed output.
    """

    def __init__(self, output_units):
        super().__init__()
        hidden = Dense(3, input_dim=output_units, activation='relu')
        self.encoder = Sequential([hidden, Dropout(0.1)])
        self.decoder = Sequential([Dense(output_units, activation='sigmoid')])

    def call(self, inputs):
        """Encode ``inputs`` and return their reconstruction."""
        return self.decoder(self.encoder(inputs))

In [None]:
def find_threshold(model, x_train_scaled):
    """Compute the anomaly threshold for ``model`` on ``x_train_scaled``.

    The threshold is the mean of the MSLE reconstruction errors plus one
    standard deviation of those errors.
    """
    error_values = tf.keras.losses.msle(
        model.predict(x_train_scaled), x_train_scaled
    ).numpy()
    spread = np.std(error_values)
    return np.mean(error_values) + spread

def get_predictions(model, x_train_scaled, threshold):
    """Return ``(preds, errors)`` for ``x_train_scaled``.

    ``errors`` is the MSLE between the model's reconstruction and the input;
    ``preds`` is a float Series with 1.0 wherever the error is strictly
    greater than ``threshold`` and 0.0 otherwise.
    """
    errors = tf.keras.losses.msle(model.predict(x_train_scaled), x_train_scaled)
    preds = (pd.Series(errors) > threshold).astype(float)
    return preds, errors

In [None]:
start = time.process_time()

model = AutoEncoder(output_units=x_train_scaled.shape[1])

model.compile(loss='msle', metrics=['mse'], optimizer='adam')

history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=512,
)

threshold = find_threshold(model, x_train_scaled)

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
autoencoders_time = end - start
print(end - start)

In [None]:
confusion_matrix(target, predictions)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, scores)
ae_auc_precision_recall = metrics.auc(recall, precision)
print(ae_auc_precision_recall)

### One-Class SVM rbf

In [None]:
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
start = time.process_time()

# One-Class SVM with the default RBF kernel, fitted and scored on the same data.
model = OneClassSVM(gamma='scale', nu=0.1).fit(x_train_scaled)
yhat = model.predict(x_train_scaled)
# Negated so that a larger score means "more anomalous".
y_scores = np.negative(model.decision_function(x_train_scaled))

end = time.process_time()
rbf_oc_svm_time = end - start
print(rbf_oc_svm_time)

In [None]:
predictions = pd.DataFrame(yhat, columns = ['y_pred']).apply(def_outlier, axis = 1)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, y_scores)
rbf_oc_svm_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
rbf_oc_svm_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, y_scores)
ocsvm_rbf_auc_precision_recall = metrics.auc(recall, precision)
print(ocsvm_rbf_auc_precision_recall)

### One-Class SVM linear

In [None]:
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
start = time.process_time()

model = OneClassSVM(kernel = 'linear', nu=0.1)
model.fit(x_train_scaled)
yhat = model.predict(x_train_scaled)
y_scores = -model.decision_function(x_train_scaled)

end = time.process_time()
lin_oc_svm_time = end - start
print(end - start)

In [None]:
predictions = pd.DataFrame(yhat, columns = ['y_pred']).apply(def_outlier, axis = 1)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, y_scores)
lin_oc_svm_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
lin_oc_svm_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, y_scores)
ocsvm_lin_auc_precision_recall = metrics.auc(recall, precision)
print(ocsvm_lin_auc_precision_recall)

### Performance

In [None]:
annthyroid_performance = pd.DataFrame(columns = ['method','f1-score','sensitivity',
                                                 'precision','Time','AUC','AU precision-recall curve'])

In [None]:
def _performance_row(method, report, elapsed, roc_auc, pr_auc):
    """Build one summary row from a method's metrics.

    ``report`` is a classification_report dict; only the entry for class '1'
    is used (f1-score, recall reported as sensitivity, precision).
    """
    positive_class = report['1']
    return {'method': method,
            'f1-score': positive_class['f1-score'],
            'sensitivity': positive_class['recall'],
            'precision': positive_class['precision'],
            'Time': elapsed,
            'AUC': roc_auc,
            'AU precision-recall curve': pr_auc}


iforest_performance = _performance_row(
    'iForest', iforest_report, iforest_time,
    iforest_auc, iforest_auc_precision_recall)
lof_performance = _performance_row(
    'LOF', lof_report, lof_time,
    lof_auc, lof_auc_precision_recall)
dbscan_performance = _performance_row(
    'DBSCAN', dbscan_report, dbscan_time,
    dbscan_auc, dbscan_auc_precision_recall)
cade_performance = _performance_row(
    'CADE', cade_report, cade_time,
    cade_auc, cade_auc_precision_recall)
deep_autoencoders_performance = _performance_row(
    'Deep Autoencoders', deep_autoencoders_report, deep_autoencoders_time,
    deep_autoencoders_auc, deep_ae_auc_precision_recall)
autoencoders_performance = _performance_row(
    'Autoencoders', autoencoders_report, autoencoders_time,
    autoencoders_auc, ae_auc_precision_recall)
rbf_oc_svm_performance = _performance_row(
    'OC-SVM rbf', rbf_oc_svm_report, rbf_oc_svm_time,
    rbf_oc_svm_auc, ocsvm_rbf_auc_precision_recall)
lin_oc_svm_performance = _performance_row(
    'OC-SVM linear', lin_oc_svm_report, lin_oc_svm_time,
    lin_oc_svm_auc, ocsvm_lin_auc_precision_recall)

In [None]:
# Add one row per method to the summary table. DataFrame.append was removed in
# pandas 2.0, so the rows are turned into a frame and concatenated in one step.
new_rows = pd.DataFrame(
    [
        iforest_performance,
        lof_performance,
        dbscan_performance,
        cade_performance,
        deep_autoencoders_performance,
        autoencoders_performance,
        rbf_oc_svm_performance,
        lin_oc_svm_performance,
    ],
    columns=annthyroid_performance.columns,
)
if annthyroid_performance.empty:
    # Nothing to keep yet; skipping the empty frame avoids concat's dtype
    # ambiguity for all-empty inputs.
    annthyroid_performance = new_rows
else:
    annthyroid_performance = pd.concat([annthyroid_performance, new_rows],
                                       ignore_index=True)

## Kaggle Credit card fraud detection

**Dataset source**: https://www.kaggle.com/mlg-ulb/creditcardfraud

**Additional sources**:

Andrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015

Dal Pozzolo, Andrea; Caelen, Olivier; Le Borgne, Yann-Ael; Waterschoot, Serge; Bontempi, Gianluca. Learned lessons in credit card fraud detection from a practitioner perspective, Expert systems with applications,41,10,4915-4928,2014, Pergamon

Dal Pozzolo, Andrea; Boracchi, Giacomo; Caelen, Olivier; Alippi, Cesare; Bontempi, Gianluca. Credit card fraud detection: a realistic modeling and a novel learning strategy, IEEE transactions on neural networks and learning systems,29,8,3784-3797,2018,IEEE

Dal Pozzolo, Andrea. Adaptive Machine learning for credit card fraud detection. ULB MLG PhD thesis (supervised by G. Bontempi)

Carcillo, Fabrizio; Dal Pozzolo, Andrea; Le Borgne, Yann-Aël; Caelen, Olivier; Mazzer, Yannis; Bontempi, Gianluca. Scarff: a scalable framework for streaming credit card fraud detection with Spark, Information fusion,41, 182-194,2018,Elsevier

Carcillo, Fabrizio; Le Borgne, Yann-Aël; Caelen, Olivier; Bontempi, Gianluca. Streaming active learning strategies for real-life credit card fraud detection: assessment and visualization, International Journal of Data Science and Analytics, 5,4,285-300,2018,Springer International Publishing

Bertrand Lebichot, Yann-Aël Le Borgne, Liyun He, Frederic Oblé, Gianluca Bontempi. Deep-Learning Domain Adaptation Techniques for Credit Cards Fraud Detection, INNSBDDL 2019: Recent Advances in Big Data and Deep Learning, pp 78-88, 2019

Fabrizio Carcillo, Yann-Aël Le Borgne, Olivier Caelen, Frederic Oblé, Gianluca Bontempi. Combining Unsupervised and Supervised Learning in Credit Card Fraud Detection, Information Sciences, 2019

Yann-Aël Le Borgne, Gianluca Bontempi. Machine Learning for Credit Card Fraud Detection - Practical Handbook

In [None]:
data = pd.read_csv('./creditcard.csv')

In [None]:
data = data.drop(columns = ['Time'])

In [None]:
data.shape

In [None]:
data.head()

In [None]:
pd.pivot_table(data,
             values = 'V1',
               index = 'Class', 
              aggfunc = 'count')

### iForest

In [None]:
train_data = data.copy()

In [None]:
start = time.process_time()

# Everything except the label column is used as model input.
iforest_inputs = train_data.loc[:, train_data.columns != 'Class']

rng = np.random.RandomState(42)
clf = IsolationForest(max_samples=256, random_state=rng, n_estimators=100)
clf.fit(iforest_inputs)
y_pred = clf.predict(iforest_inputs)
y_scores = clf.score_samples(iforest_inputs)

end = time.process_time()
iforest_time = end - start
print(iforest_time)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
confusion_matrix(train_data['Class'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['Class'], train_data['y_scores'])
iforest_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
iforest_report = classification_report(train_data['Class'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['Class'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['Class'], train_data['y_scores'])
iforest_auc_precision_recall = metrics.auc(recall, precision)
print(iforest_auc_precision_recall)

### LOF

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = min_max_scaler.fit_transform(data.copy())
train_data = pd.DataFrame(train_data)
train_data.columns = data.columns

In [None]:
start = time.process_time()

lof = LocalOutlierFactor(n_neighbors=2848, contamination=.1, novelty=False)
y_pred = lof.fit_predict(train_data.loc[:, train_data.columns != 'Class'])
y_scores = lof.negative_outlier_factor_

end = time.process_time()
lof_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
confusion_matrix(train_data['Class'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['Class'], train_data['y_scores'])
lof_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
lof_report = classification_report(train_data['Class'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['Class'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['Class'], train_data['y_scores'])
lof_auc_precision_recall = metrics.auc(recall, precision)
print(lof_auc_precision_recall)

### DBSCAN

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = min_max_scaler.fit_transform(data.copy())
train_data = pd.DataFrame(train_data)
train_data.columns = data.columns

In [None]:
data_dbscan = train_data.sample(frac=0.1)

In [None]:
dist = dbscan_tuner(data_dbscan,57)

In [None]:
start = time.process_time()
dbscan = DBSCAN(eps = 0.15, min_samples = 58)
clusters = dbscan.fit_predict(train_data.loc[:, train_data.columns != 'Class'])
end = time.process_time()
dbscan_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = clusters
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)

In [None]:
# Feature columns = every original column except the ground-truth label.
original_columns = data.columns.to_list()
original_columns.remove('Class')

# Score each clustered observation by the Euclidean distance to the centroid of
# its own cluster. The per-cluster frames are collected in a list and
# concatenated once: DataFrame.append was removed in pandas 2.0, and growing a
# frame inside a loop copies it on every iteration.
scored_clusters = []
for i in sorted(set(clusters) - {-1}):  # -1 is DBSCAN's noise label, scored separately
    score_data = train_data[train_data['y_pred'] == i].copy()
    cluster_points = score_data[original_columns].to_numpy(dtype=float)
    center = cluster_points.mean(axis=0)
    # Vectorised row-wise Euclidean norm instead of a Python-level apply.
    score_data['score'] = np.linalg.norm(cluster_points - center, axis=1)
    scored_clusters.append(score_data)

if scored_clusters:
    data_w_score = pd.concat(scored_clusters, ignore_index=True)
else:
    # Everything was labelled as noise: keep an empty frame with the expected columns.
    data_w_score = pd.DataFrame(columns=train_data.columns.to_list() + ['score'])

# Noise points; their 'score' column is a float placeholder that is filled in
# afterwards with the distance to the nearest cluster centroid.
anomalies_data = train_data[train_data['y_pred'] == -1].copy()
anomalies_data['score'] = np.nan

# Per-cluster mean of every column, indexed by cluster label (including -1).
cluster_centers = pd.pivot_table(train_data,
              index = 'y_pred',
              aggfunc = 'mean')

In [None]:
# Score every noise point (cluster label -1) by its Euclidean distance to the
# nearest real cluster centroid. The whole column is assigned at once: the
# former `anomalies_data['score'].iloc[i] = ...` was a chained assignment,
# which raises SettingWithCopyWarning and does not write through under pandas
# copy-on-write. A single cdist call also replaces one DataFrame.apply per
# noise point.
real_centers = cluster_centers.loc[cluster_centers.index != -1, original_columns]
if len(anomalies_data) and len(real_centers):
    center_distances = distance.cdist(
        anomalies_data[original_columns].to_numpy(dtype=float),
        real_centers.to_numpy(dtype=float),
    )  # shape: (number of noise points, number of clusters)
    anomalies_data['score'] = center_distances.min(axis=1)
else:
    # No noise points, or no cluster to measure against: no distance is defined.
    anomalies_data['score'] = np.nan

# Clustered and noise observations together, each with its anomaly score.
data_for_auprc = pd.concat([data_w_score, anomalies_data])

In [None]:
confusion_matrix(train_data['Class'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(data_for_auprc['Class'], data_for_auprc['score'])
dbscan_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
dbscan_report = classification_report(train_data['Class'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['Class'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(data_for_auprc['Class'], data_for_auprc['score'])
dbscan_auc_precision_recall = metrics.auc(recall, precision)
print(dbscan_auc_precision_recall)

### CADE

Source: Aric LaBarr, webinar "Modern Approaches To Anomaly Detection", November 2, 2021

In [None]:
train_data = data.copy()

In [None]:
# Real observations (Target = 0). The explicit copy makes this an independent
# frame, so adding the 'Target' column below neither warns
# (SettingWithCopyWarning) nor risks touching train_data.
data_for_fake = train_data.loc[:, train_data.columns != 'Class'].copy()

# Synthetic observations (Target = 1): every feature is drawn independently
# from a uniform distribution over the observed [min, max] range of that
# feature. Only the feature columns are sampled; 'Target' is set explicitly.
n_rows = len(data_for_fake)
fake = pd.DataFrame({
    column: np.random.uniform(data_for_fake[column].min(),
                              data_for_fake[column].max(),
                              size=n_rows)
    for column in data_for_fake.columns
})

data_for_fake['Target'] = 0
fake['Target'] = 1

In [None]:
data_combined = pd.concat([data_for_fake, fake], axis = 0)

In [None]:
start = time.process_time()
model = RandomForestClassifier(n_estimators = 100)
model.fit(data_combined.loc[:, data_combined.columns != 'Target'],data_combined.loc[:, data_combined.columns == 'Target']['Target'] )
train_data['Target'] = model.predict_proba(train_data.loc[:, train_data.columns != 'Class'])[:,1]
end = time.process_time()
cade_time = end - start
print(end - start)

In [None]:
# ROC AUC of the hard predictions for each candidate value passed to
# def_outlier_cade. The results live under a dedicated name: calling the dict
# `auc` would rebind (and hide) the `auc` function imported from
# sklearn.metrics at the top of the notebook.
auc_by_threshold = {}
for i in np.arange(0,0.55,0.05):
    train_data['prediction'] = train_data.apply(def_outlier_cade, args=(i,), axis = 1)
    fpr, tpr, _ = metrics.roc_curve(train_data['Class'], train_data['prediction'])
    auc_by_threshold[i] = metrics.auc(fpr, tpr)
# Candidate with the highest AUC (the first one wins on ties).
max_key = max(auc_by_threshold, key=auc_by_threshold.get)

In [None]:
train_data['prediction'] = train_data.apply(def_outlier_cade,args=(max_key,), axis = 1)

In [None]:
confusion_matrix(train_data['Class'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['Class'], train_data['Target'])
cade_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
cade_report = classification_report(train_data['Class'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['Class'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['Class'], train_data['Target'])
cade_auc_precision_recall = metrics.auc(recall, precision)
print(cade_auc_precision_recall)

### Deep Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
features = data.drop(columns = ['Class'])
target = data['Class']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
class AutoEncoder(Model):
    """Deep dense autoencoder narrowing 14 -> 7 -> 3 units and widening back.

    Encoder: Dense(14, sigmoid) -> Dropout(0.1) -> Dense(7, relu)
             -> Dropout(0.1) -> Dense(3, relu).
    Decoder: Dense(7, relu) -> Dropout(0.1) -> Dense(14, relu)
             -> Dropout(0.1) -> Dense(output_units, sigmoid).

    Args:
        output_units: number of input features; also the width of the
            reconstructed output.
    """

    def __init__(self, output_units):
        super().__init__()
        encoder_layers = [
            Dense(14, input_dim=output_units, activation='sigmoid'),
            Dropout(0.1),
            Dense(7, activation='relu'),
            Dropout(0.1),
            Dense(3, activation='relu'),
        ]
        decoder_layers = [
            Dense(7, activation='relu'),
            Dropout(0.1),
            Dense(14, activation='relu'),
            Dropout(0.1),
            Dense(output_units, activation='sigmoid'),
        ]
        self.encoder = Sequential(encoder_layers)
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Encode ``inputs`` and return their reconstruction."""
        return self.decoder(self.encoder(inputs))

In [None]:
def find_threshold(model, x_train_scaled):
    """Return mean + one standard deviation of the MSLE reconstruction
    errors that ``model`` produces on ``x_train_scaled``."""
    rebuilt = model.predict(x_train_scaled)
    msle_values = tf.keras.losses.msle(rebuilt, x_train_scaled).numpy()
    return np.mean(msle_values) + np.std(msle_values)

def get_predictions(model, x_train_scaled, threshold):
    """Label observations as anomalous (1.0) or normal (0.0).

    An observation is anomalous when the MSLE between its reconstruction and
    the original input is strictly greater than ``threshold``.

    Returns:
        (preds, errors): the float label Series and the MSLE tensor.
    """
    rebuilt = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(rebuilt, x_train_scaled)
    is_anomaly = pd.Series(errors) > threshold
    return is_anomaly.astype(float), errors

In [None]:
start = time.process_time()

model = AutoEncoder(output_units=x_train_scaled.shape[1])

model.compile(loss='msle', metrics=['mse'], optimizer='adam')

history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=512,
)

threshold = find_threshold(model, x_train_scaled)

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
deep_autoencoders_time = end - start
print(end - start)

In [None]:
confusion_matrix(target, predictions)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
deep_autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
deep_autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, scores)
deep_ae_auc_precision_recall = metrics.auc(recall, precision)
print(deep_ae_auc_precision_recall)

### Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
features = data.drop(columns = ['Class'])
target = data['Class']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
class AutoEncoder(Model):
    """Shallow dense autoencoder with a single 14-unit hidden layer.

    Encoder: Dense(14, relu) -> Dropout(0.1).
    Decoder: Dense(output_units, sigmoid).

    Args:
        output_units: number of input features; also the width of the
            reconstructed output.
    """

    def __init__(self, output_units):
        super().__init__()
        hidden = Dense(14, input_dim=output_units, activation='relu')
        self.encoder = Sequential([hidden, Dropout(0.1)])
        self.decoder = Sequential([Dense(output_units, activation='sigmoid')])

    def call(self, inputs):
        """Encode ``inputs`` and return their reconstruction."""
        return self.decoder(self.encoder(inputs))

In [None]:
def find_threshold(model, x_train_scaled):
    """Compute the anomaly threshold for ``model`` on ``x_train_scaled``.

    The threshold is the mean of the MSLE reconstruction errors plus one
    standard deviation of those errors.
    """
    error_values = tf.keras.losses.msle(
        model.predict(x_train_scaled), x_train_scaled
    ).numpy()
    spread = np.std(error_values)
    return np.mean(error_values) + spread

def get_predictions(model, x_train_scaled, threshold):
    """Flag the samples whose reconstruction error exceeds ``threshold``.

    Returns ``(preds, errors)``: ``preds`` is a pandas Series holding 1.0
    where the error is above the threshold and 0.0 elsewhere; ``errors`` is
    the value returned by ``tf.keras.losses.msle``.
    """
    reconstructed = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(reconstructed, x_train_scaled)
    is_anomaly = pd.Series(errors) > threshold
    return is_anomaly.astype(float), errors

In [None]:
# Time the full run (build, train, threshold, score) using process CPU time.
start = time.process_time()

# The number of input columns is passed as the width of the reconstruction layer.
model = AutoEncoder(output_units=x_train_scaled.shape[1])

# MSLE is also the loss find_threshold/get_predictions use to score samples.
model.compile(loss='msle', metrics=['mse'], optimizer='adam')

# The inputs are also the targets: the network is trained to reproduce its input.
history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=512,
)

# The cut-off and the predictions are computed on the same data the model was fitted on.
threshold = find_threshold(model, x_train_scaled)

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
autoencoders_time = end - start
print(end - start)

In [None]:
confusion_matrix(target, predictions)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, scores)
ae_auc_precision_recall = metrics.auc(recall, precision)
print(ae_auc_precision_recall)

### One-Class SVM rbf

In [None]:
features = data.drop(columns = ['Class'])
target = data['Class']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
# Fit a One-Class SVM (default RBF kernel) on the scaled features; timing is process CPU time.
start = time.process_time()

model = OneClassSVM(nu=0.1, gamma='scale')
model.fit(x_train_scaled)
yhat = model.predict(x_train_scaled)
# Negate the decision function so that larger scores mean "more anomalous".
y_scores = np.negative(model.decision_function(x_train_scaled))

end = time.process_time()
rbf_oc_svm_time = end - start
print(rbf_oc_svm_time)

In [None]:
predictions = pd.DataFrame(yhat, columns = ['y_pred']).apply(def_outlier, axis = 1)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, y_scores)
rbf_oc_svm_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
rbf_oc_svm_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, y_scores)
ocsvm_rbf_auc_precision_recall = metrics.auc(recall, precision)
print(ocsvm_rbf_auc_precision_recall)

### One-Class SVM linear

In [None]:
features = data.drop(columns = ['Class'])
target = data['Class']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
# Fit a linear-kernel One-Class SVM on the scaled features; timing is process CPU time.
start = time.process_time()

model = OneClassSVM(nu=0.1, kernel='linear')
model.fit(x_train_scaled)
yhat = model.predict(x_train_scaled)
# Negate the decision function so that larger scores mean "more anomalous".
y_scores = np.negative(model.decision_function(x_train_scaled))

end = time.process_time()
lin_oc_svm_time = end - start
print(lin_oc_svm_time)

In [None]:
predictions = pd.DataFrame(yhat, columns = ['y_pred']).apply(def_outlier, axis = 1)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, y_scores)
lin_oc_svm_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
lin_oc_svm_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, y_scores)
ocsvm_lin_auc_precision_recall = metrics.auc(recall, precision)
print(ocsvm_lin_auc_precision_recall)

### Performance

In [None]:
kaggle_performance = pd.DataFrame(columns = ['method','f1-score','sensitivity',
                                             'precision','Time','AUC', 'AU precision-recall curve'])

In [None]:
def _performance_row(method, report, elapsed, roc_auc, pr_auc):
    """Build one performance-table row from a method's stored results.

    ``report`` is a ``classification_report(..., output_dict=True)`` result;
    only the entry for class '1' is used.
    """
    positive = report['1']
    return {'method': method,
            'f1-score': positive['f1-score'],
            'sensitivity': positive['recall'],
            'precision': positive['precision'],
            'Time': elapsed,
            'AUC': roc_auc,
            'AU precision-recall curve': pr_auc}


iforest_performance = _performance_row(
    'iForest', iforest_report, iforest_time,
    iforest_auc, iforest_auc_precision_recall)
lof_performance = _performance_row(
    'LOF', lof_report, lof_time,
    lof_auc, lof_auc_precision_recall)
dbscan_performance = _performance_row(
    'DBSCAN', dbscan_report, dbscan_time,
    dbscan_auc, dbscan_auc_precision_recall)
cade_performance = _performance_row(
    'CADE', cade_report, cade_time,
    cade_auc, cade_auc_precision_recall)
deep_autoencoders_performance = _performance_row(
    'Deep Autoencoders', deep_autoencoders_report, deep_autoencoders_time,
    deep_autoencoders_auc, deep_ae_auc_precision_recall)
autoencoders_performance = _performance_row(
    'Autoencoders', autoencoders_report, autoencoders_time,
    autoencoders_auc, ae_auc_precision_recall)
rbf_oc_svm_performance = _performance_row(
    'OC-SVM rbf', rbf_oc_svm_report, rbf_oc_svm_time,
    rbf_oc_svm_auc, ocsvm_rbf_auc_precision_recall)
lin_oc_svm_performance = _performance_row(
    'OC-SVM linear', lin_oc_svm_report, lin_oc_svm_time,
    lin_oc_svm_auc, ocsvm_lin_auc_precision_recall)

In [None]:
# DataFrame.append was removed in pandas 2.0, so the rows are collected into one
# frame and concatenated in a single step (same rows, same order as before).
# NOTE(review): 'Autoencoders' is added before 'Deep Autoencoders' here, while the
# Mammography performance cell adds them the other way round - confirm nothing
# downstream compares the tables by row position.
kaggle_rows = pd.DataFrame(
    [
        iforest_performance,
        lof_performance,
        dbscan_performance,
        cade_performance,
        autoencoders_performance,
        deep_autoencoders_performance,
        rbf_oc_svm_performance,
        lin_oc_svm_performance,
    ],
    columns=kaggle_performance.columns,
)
kaggle_performance = pd.concat([kaggle_performance, kaggle_rows], ignore_index=True)

## Mammography

**Dataset source**: http://odds.cs.stonybrook.edu/mammography-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016).  ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources**:

Abe, Naoki, Bianca Zadrozny, and John Langford. “Outlier detection by active learning.” Proceedings of the 12th ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2006.

Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation forest.” 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

K. M. Ting, J. T. S. Chuan, and F. T. Liu. “Mass: A New Ranking Measure for Anomaly Detection.”, IEEE Transactions on Knowledge and Data Engineering, 2009.

In [None]:
data = pd.read_csv('./mammography.csv')

In [None]:
data.head()

In [None]:
pd.pivot_table(data,
             values = 'Col1',
               index = 'y', 
              aggfunc = 'count')

In [None]:
data.shape

### iForest

In [None]:
train_data = data.copy()

In [None]:
# Fit an Isolation Forest on every column except the label 'y'; timing is process CPU time.
start = time.process_time()

rng = np.random.RandomState(42)
feature_mask = train_data.columns != 'y'
clf = IsolationForest(n_estimators=100, max_samples=256, random_state=rng)
clf.fit(train_data.loc[:, feature_mask])
y_pred = clf.predict(train_data.loc[:, feature_mask])
y_scores = clf.score_samples(train_data.loc[:, feature_mask])

end = time.process_time()
iforest_time = end - start
print(iforest_time)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
iforest_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
iforest_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
iforest_auc_precision_recall = metrics.auc(recall, precision)
print(iforest_auc_precision_recall)

### LOF

In [None]:
# Rescale every column of a copy of `data` to [0, 1], keeping the original column names.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = pd.DataFrame(min_max_scaler.fit_transform(data.copy()),
                          columns=data.columns)

In [None]:
# Fit LOF on every column except the label 'y'; timing is process CPU time.
start = time.process_time()

lof = LocalOutlierFactor(n_neighbors=111, contamination=.1, novelty=False)
y_pred = lof.fit_predict(train_data.loc[:, train_data.columns != 'y'])
# Kept with its original sign here; the next cell negates it before storing it as the score.
y_scores = lof.negative_outlier_factor_

end = time.process_time()
lof_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
lof_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
lof_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
lof_auc_precision_recall = metrics.auc(recall, precision)
print(lof_auc_precision_recall)

### DBSCAN

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = min_max_scaler.fit_transform(data.copy())
train_data = pd.DataFrame(train_data)
train_data.columns = data.columns

In [None]:
dist = dbscan_tuner(train_data,11)

In [None]:
# Cluster every column except the label 'y'; timing is process CPU time.
start = time.process_time()
dbscan = DBSCAN(eps = 0.07, min_samples = 12)
# Label -1 marks noise points; the scoring cells below handle those rows separately.
clusters = dbscan.fit_predict(train_data.loc[:, train_data.columns != 'y'])
end = time.process_time()
dbscan_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = clusters
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)

In [None]:
# Feature columns only: everything in `data` except the label.
original_columns = data.columns.to_list()
original_columns.remove('y')

# Score each clustered point by its Euclidean distance to the centroid of its own
# cluster. The per-cluster frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every call.
scored_clusters = []
for i in set(clusters):
    if i == -1:
        # Noise points have no cluster of their own; they are scored in the next cell.
        continue
    score_data = train_data[train_data['y_pred'] == i].copy()
    center = score_data[original_columns].mean().to_numpy()
    score_data['score'] = np.linalg.norm(
        score_data[original_columns].to_numpy() - center, axis=1)
    scored_clusters.append(score_data)

if scored_clusters:
    data_w_score = pd.concat(scored_clusters, ignore_index=True)
else:
    # DBSCAN found no cluster at all: keep an empty frame with the expected columns.
    data_w_score = pd.DataFrame(columns=train_data.columns.to_list() + ['score'])

# Noise points get a placeholder score here; the next cell fills it in.
anomalies_data = train_data[train_data['y_pred'] == -1].copy()
anomalies_data['score'] = ''
# Per-label column means, i.e. the cluster centroids (the -1 row is the mean of the noise points).
cluster_centers = pd.pivot_table(train_data,
              index = 'y_pred',
              aggfunc = 'mean')

In [None]:
# Score each noise point by its Euclidean distance to the nearest cluster centroid.
# The column is assigned in one step: the previous chained
# `anomalies_data['score'].iloc[i] = ...` write is not guaranteed to reach the
# frame (it is a no-op under pandas copy-on-write) and looped row by row in Python.
centroids = cluster_centers.loc[cluster_centers.index != -1, original_columns]
if len(centroids):
    anomalies_data['score'] = distance.cdist(
        anomalies_data[original_columns].to_numpy(dtype=float),
        centroids.to_numpy(dtype=float),
    ).min(axis=1)
else:
    # No cluster was found, so there is no centroid to measure against.
    anomalies_data['score'] = np.nan

data_for_auprc = pd.concat([data_w_score, anomalies_data])

In [None]:
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(data_for_auprc['y'], data_for_auprc['score'])
dbscan_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
dbscan_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(data_for_auprc['y'], data_for_auprc['score'])
dbscan_auc_precision_recall = metrics.auc(recall, precision)
print(dbscan_auc_precision_recall)

### CADE

Source: Aric LaBarr, webinar "Modern Approaches To Anomaly Detection", November 2, 2021

In [None]:
train_data = data.copy()

In [None]:
# Build the CADE training set: the real rows get Target 0 and an equally sized
# synthetic set, drawn uniformly within each feature's observed range, gets Target 1.
fake = pd.DataFrame()
# Explicit copy so that adding 'Target' does not write into a slice of train_data
# (avoids pandas' SettingWithCopyWarning).
data_for_fake = train_data.loc[:, train_data.columns != 'y'].copy()
feature_columns = data_for_fake.columns.to_list()
data_for_fake['Target'] = 0
# Sample only the real features; 'Target' is a constant label, not a feature to simulate.
for i in feature_columns:
    fake[i] = np.random.uniform(np.min(data_for_fake[i]), np.max(data_for_fake[i]), size = len(data_for_fake[i]))
fake['Target'] = 1

In [None]:
data_combined = pd.concat([data_for_fake, fake], axis = 0)

In [None]:
# Train a random forest to tell real rows (Target 0) from synthetic ones (Target 1),
# then store each real row's predicted probability of class 1 as its 'Target' score.
start = time.process_time()
model = RandomForestClassifier(n_estimators=100)
model.fit(data_combined.drop(columns='Target'), data_combined['Target'])
class_proba = model.predict_proba(train_data.loc[:, train_data.columns != 'y'])
train_data['Target'] = class_proba[:, 1]
end = time.process_time()
cade_time = end - start
print(cade_time)

In [None]:
# Try cut-offs from 0 to 0.5 in steps of 0.05 and keep the one whose binary
# predictions give the highest ROC AUC against the true labels.
# NOTE(review): this dict rebinds the name `auc`, hiding the `auc` function imported
# from sklearn.metrics at the top of the file; the code here calls `metrics.auc` instead.
auc = {}
for i in np.arange(0,0.55,0.05):
    train_data['prediction'] = train_data.apply(def_outlier_cade, args=(i,), axis = 1)
    fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['prediction'])
    auc[i] = metrics.auc(fpr, tpr)
# Cut-off with the best AUC (the first one if several tie).
max_key = max(auc, key=auc.get)

In [None]:
train_data['prediction'] = train_data.apply(def_outlier_cade,args=(max_key,), axis = 1)

In [None]:
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['Target'])
cade_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
cade_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['Target'])
cade_auc_precision_recall = metrics.auc(recall, precision)
print(cade_auc_precision_recall)

### Deep Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
class AutoEncoder(Model):
    """Deep autoencoder: three 3-unit Dense layers each way, 10% dropout between them, sigmoid output."""

    def __init__(self, output_units):
        """Build the encoder and decoder stacks.

        ``output_units`` is used both as the first layer's ``input_dim`` and
        as the width of the final reconstruction layer.
        """
        super().__init__()
        encoder_layers = [
            Dense(3, input_dim=output_units, activation='sigmoid'),
            Dropout(0.1),
            Dense(3, activation='relu'),
            Dropout(0.1),
            Dense(3, activation='relu'),
        ]
        self.encoder = Sequential(encoder_layers)
        decoder_layers = [
            Dense(3, activation='relu'),
            Dropout(0.1),
            Dense(3, activation='relu'),
            Dropout(0.1),
            Dense(output_units, activation='sigmoid'),
        ]
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Run the inputs through the encoder and then the decoder."""
        return self.decoder(self.encoder(inputs))

In [None]:
def find_threshold(model, x_train_scaled):
    """Return the anomaly cut-off: mean plus one standard deviation of the reconstruction errors.

    The errors are what ``tf.keras.losses.msle`` returns for
    ``model.predict(x_train_scaled)`` against ``x_train_scaled``.
    """
    reconstructed = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(reconstructed, x_train_scaled).numpy()
    return np.mean(errors) + np.std(errors)

def get_predictions(model, x_train_scaled, threshold):
    """Flag the samples whose reconstruction error exceeds ``threshold``.

    Returns ``(preds, errors)``: ``preds`` is a pandas Series holding 1.0
    where the error is above the threshold and 0.0 elsewhere; ``errors`` is
    the value returned by ``tf.keras.losses.msle``.
    """
    reconstructed = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(reconstructed, x_train_scaled)
    is_anomaly = pd.Series(errors) > threshold
    return is_anomaly.astype(float), errors

In [None]:
# Time the full run (build, train, threshold, score) using process CPU time.
start = time.process_time()

# The number of input columns is passed as the width of the reconstruction layer.
model = AutoEncoder(output_units=x_train_scaled.shape[1])

# MSLE is also the loss find_threshold/get_predictions use to score samples.
model.compile(loss='msle', metrics=['mse'], optimizer='adam')

# The inputs are also the targets: the network is trained to reproduce its input.
history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=512,
)

# The cut-off and the predictions are computed on the same data the model was fitted on.
threshold = find_threshold(model, x_train_scaled)

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
deep_autoencoders_time = end - start
print(end - start)

In [None]:
confusion_matrix(target, predictions)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
deep_autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
deep_autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, scores)
deep_ae_auc_precision_recall = metrics.auc(recall, precision)
print(deep_ae_auc_precision_recall)

### Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
class AutoEncoder(Model):
    """Shallow autoencoder: a 3-unit ReLU layer with dropout, then a sigmoid reconstruction layer."""

    def __init__(self, output_units):
        """Build the encoder and decoder.

        ``output_units`` is used both as the encoder's ``input_dim`` and as
        the width of the decoder's output layer.
        """
        super().__init__()
        hidden = Dense(3, input_dim=output_units, activation='relu')
        self.encoder = Sequential([hidden, Dropout(0.1)])
        reconstruction = Dense(output_units, activation='sigmoid')
        self.decoder = Sequential([reconstruction])

    def call(self, inputs):
        """Run the inputs through the encoder and then the decoder."""
        return self.decoder(self.encoder(inputs))

In [None]:
def find_threshold(model, x_train_scaled):
    """Return the anomaly cut-off: mean plus one standard deviation of the reconstruction errors.

    The errors are what ``tf.keras.losses.msle`` returns for
    ``model.predict(x_train_scaled)`` against ``x_train_scaled``.
    """
    reconstructed = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(reconstructed, x_train_scaled).numpy()
    return np.mean(errors) + np.std(errors)

def get_predictions(model, x_train_scaled, threshold):
    """Flag the samples whose reconstruction error exceeds ``threshold``.

    Returns ``(preds, errors)``: ``preds`` is a pandas Series holding 1.0
    where the error is above the threshold and 0.0 elsewhere; ``errors`` is
    the value returned by ``tf.keras.losses.msle``.
    """
    reconstructed = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(reconstructed, x_train_scaled)
    is_anomaly = pd.Series(errors) > threshold
    return is_anomaly.astype(float), errors

In [None]:
# Time the full run (build, train, threshold, score) using process CPU time.
start = time.process_time()

# The number of input columns is passed as the width of the reconstruction layer.
model = AutoEncoder(output_units=x_train_scaled.shape[1])

# MSLE is also the loss find_threshold/get_predictions use to score samples.
model.compile(loss='msle', metrics=['mse'], optimizer='adam')

# The inputs are also the targets: the network is trained to reproduce its input.
history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=512,
)

# The cut-off and the predictions are computed on the same data the model was fitted on.
threshold = find_threshold(model, x_train_scaled)

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
autoencoders_time = end - start
print(end - start)

In [None]:
confusion_matrix(target, predictions)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, scores)
ae_auc_precision_recall = metrics.auc(recall, precision)
print(ae_auc_precision_recall)

### One-Class SVM rbf

In [None]:
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
# Fit a One-Class SVM (default RBF kernel) on the scaled features; timing is process CPU time.
start = time.process_time()

model = OneClassSVM(nu=0.1, gamma='scale')
model.fit(x_train_scaled)
yhat = model.predict(x_train_scaled)
# Negate the decision function so that larger scores mean "more anomalous".
y_scores = np.negative(model.decision_function(x_train_scaled))

end = time.process_time()
rbf_oc_svm_time = end - start
print(rbf_oc_svm_time)

In [None]:
predictions = pd.DataFrame(yhat, columns = ['y_pred']).apply(def_outlier, axis = 1)

In [None]:
confusion_matrix(target, predictions)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, y_scores)
rbf_oc_svm_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
rbf_oc_svm_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, y_scores)
ocsvm_rbf_auc_precision_recall = metrics.auc(recall, precision)
print(ocsvm_rbf_auc_precision_recall)

### One-Class SVM linear

In [None]:
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
# Fit a linear-kernel One-Class SVM on the scaled features; timing is process CPU time.
start = time.process_time()

model = OneClassSVM(nu=0.1, kernel='linear')
model.fit(x_train_scaled)
yhat = model.predict(x_train_scaled)
# Negate the decision function so that larger scores mean "more anomalous".
y_scores = np.negative(model.decision_function(x_train_scaled))

end = time.process_time()
lin_oc_svm_time = end - start
print(lin_oc_svm_time)

In [None]:
predictions = pd.DataFrame(yhat, columns = ['y_pred']).apply(def_outlier, axis = 1)

In [None]:
confusion_matrix(target, predictions)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, y_scores)
lin_oc_svm_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
lin_oc_svm_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, y_scores)
ocsvm_lin_auc_precision_recall = metrics.auc(recall, precision)
print(ocsvm_lin_auc_precision_recall)

### Performance

In [None]:
mammography_performance = pd.DataFrame(columns = ['method','f1-score','sensitivity',
                                                  'precision','Time','AUC', 'AU precision-recall curve'])

In [None]:
def _performance_row(method, report, elapsed, roc_auc, pr_auc):
    """Build one performance-table row from a method's stored results.

    ``report`` is a ``classification_report(..., output_dict=True)`` result;
    only the entry for class '1' is used.
    """
    positive = report['1']
    return {'method': method,
            'f1-score': positive['f1-score'],
            'sensitivity': positive['recall'],
            'precision': positive['precision'],
            'Time': elapsed,
            'AUC': roc_auc,
            'AU precision-recall curve': pr_auc}


iforest_performance = _performance_row(
    'iForest', iforest_report, iforest_time,
    iforest_auc, iforest_auc_precision_recall)
lof_performance = _performance_row(
    'LOF', lof_report, lof_time,
    lof_auc, lof_auc_precision_recall)
dbscan_performance = _performance_row(
    'DBSCAN', dbscan_report, dbscan_time,
    dbscan_auc, dbscan_auc_precision_recall)
cade_performance = _performance_row(
    'CADE', cade_report, cade_time,
    cade_auc, cade_auc_precision_recall)
deep_autoencoders_performance = _performance_row(
    'Deep Autoencoders', deep_autoencoders_report, deep_autoencoders_time,
    deep_autoencoders_auc, deep_ae_auc_precision_recall)
autoencoders_performance = _performance_row(
    'Autoencoders', autoencoders_report, autoencoders_time,
    autoencoders_auc, ae_auc_precision_recall)
rbf_oc_svm_performance = _performance_row(
    'OC-SVM rbf', rbf_oc_svm_report, rbf_oc_svm_time,
    rbf_oc_svm_auc, ocsvm_rbf_auc_precision_recall)
lin_oc_svm_performance = _performance_row(
    'OC-SVM linear', lin_oc_svm_report, lin_oc_svm_time,
    lin_oc_svm_auc, ocsvm_lin_auc_precision_recall)

In [None]:
# DataFrame.append was removed in pandas 2.0, so the rows are collected into one
# frame and concatenated in a single step (same rows, same order as before).
mammography_rows = pd.DataFrame(
    [
        iforest_performance,
        lof_performance,
        dbscan_performance,
        cade_performance,
        deep_autoencoders_performance,
        autoencoders_performance,
        rbf_oc_svm_performance,
        lin_oc_svm_performance,
    ],
    columns=mammography_performance.columns,
)
mammography_performance = pd.concat([mammography_performance, mammography_rows],
                                    ignore_index=True)

## Shuttle

**Dataset source**: http://odds.cs.stonybrook.edu/shuttle-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016).  ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources**:

Abe, Naoki, Bianca Zadrozny, and John Langford. “Outlier detection by active learning.” Proceedings of the 12th ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2006.

Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation forest.” 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

K. M. Ting, J. T. S. Chuan, and F. T. Liu. “Mass: A New Ranking Measure for Anomaly Detection.”, IEEE Transactions on Knowledge and Data Engineering, 2009.

Kai Ming Ting, Guang-Tong Zhou, Fei Tony Liu & Tan Swee Chuan. (2010). Mass Estimation and Its Applications. Proceedings of The 16th ACM SIGKDD Conference on Knowledge Discovery and Data Mining 2010. pp. 989-998.

Swee Chuan Tan, Kai Ming Ting & Fei Tony Liu. (2011). Fast Anomaly Detection for Streaming Data. Proceedings of the International Joint Conference on Artificial Intelligence 2011. pp.1151-1156.

In [None]:
data = pd.read_csv('./shuttle.csv', sep = ',')

In [None]:
data.head()

In [None]:
data.shape

In [None]:
pd.pivot_table(data,
             values = 'Col1',
               index = 'y', 
              aggfunc = 'count')

### Isolation Forest

In [None]:
train_data = data.copy()

In [None]:
# Fit an Isolation Forest on every column except the label 'y'; timing is process CPU time.
start = time.process_time()

rng = np.random.RandomState(42)
feature_mask = train_data.columns != 'y'
clf = IsolationForest(n_estimators=100, max_samples=256, random_state=rng)
clf.fit(train_data.loc[:, feature_mask])
y_pred = clf.predict(train_data.loc[:, feature_mask])
y_scores = clf.score_samples(train_data.loc[:, feature_mask])

end = time.process_time()
iforest_time = end - start
print(iforest_time)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
iforest_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
iforest_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
iforest_auc_precision_recall = metrics.auc(recall, precision)
print(iforest_auc_precision_recall)

### LOF

In [None]:
# Rescale every column of a copy of `data` to [0, 1], keeping the original column names.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = pd.DataFrame(min_max_scaler.fit_transform(data.copy()),
                          columns=data.columns)

In [None]:
# Fit LOF on every column except the label 'y'; timing is process CPU time.
start = time.process_time()

lof = LocalOutlierFactor(n_neighbors=491, contamination=.1, novelty=False)
y_pred = lof.fit_predict(train_data.loc[:, train_data.columns != 'y'])
# Kept with its original sign here; the next cell negates it before storing it as the score.
y_scores = lof.negative_outlier_factor_

end = time.process_time()
lof_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
lof_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
lof_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
lof_auc_precision_recall = metrics.auc(recall, precision)
print(lof_auc_precision_recall)

### DBSCAN

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = min_max_scaler.fit_transform(data.copy())
train_data = pd.DataFrame(train_data)
train_data.columns = data.columns

In [None]:
dist = dbscan_tuner(train_data, 17)

In [None]:
# Cluster every column except the label 'y'; timing is process CPU time.
start = time.process_time()

dbscan = DBSCAN(eps = 0.005, min_samples = 18)
# Label -1 marks noise points; the scoring cells below handle those rows separately.
clusters = dbscan.fit_predict(train_data.loc[:, train_data.columns != 'y'])

end = time.process_time()
dbscan_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = clusters
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)

In [None]:
# Feature columns only: everything in `data` except the label.
original_columns = data.columns.to_list()
original_columns.remove('y')

# Score each clustered point by its Euclidean distance to the centroid of its own
# cluster. The per-cluster frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every call.
scored_clusters = []
for i in set(clusters):
    if i == -1:
        # Noise points have no cluster of their own; they are scored in the next cell.
        continue
    score_data = train_data[train_data['y_pred'] == i].copy()
    center = score_data[original_columns].mean().to_numpy()
    score_data['score'] = np.linalg.norm(
        score_data[original_columns].to_numpy() - center, axis=1)
    scored_clusters.append(score_data)

if scored_clusters:
    data_w_score = pd.concat(scored_clusters, ignore_index=True)
else:
    # DBSCAN found no cluster at all: keep an empty frame with the expected columns.
    data_w_score = pd.DataFrame(columns=train_data.columns.to_list() + ['score'])

# Noise points get a placeholder score here; the next cell fills it in.
anomalies_data = train_data[train_data['y_pred'] == -1].copy()
anomalies_data['score'] = ''
# Per-label column means, i.e. the cluster centroids (the -1 row is the mean of the noise points).
cluster_centers = pd.pivot_table(train_data,
              index = 'y_pred',
              aggfunc = 'mean')

In [None]:
# Noise points (label -1): the anomaly score is the distance to the nearest
# cluster centroid. One vectorised cdist call replaces the per-row chained
# `['score'].iloc[i] = ...` assignment, which raises SettingWithCopyWarning and
# does not write through when pandas copy-on-write is enabled.
centers = cluster_centers.loc[cluster_centers.index != -1, original_columns].to_numpy(dtype=float)
if not anomalies_data.empty and len(centers) > 0:
    anomalies_data['score'] = distance.cdist(
        anomalies_data[original_columns].to_numpy(dtype=float), centers
    ).min(axis=1)
# If there are no clusters at all, the placeholder scores are left untouched.
data_for_auprc = pd.concat([data_w_score, anomalies_data])

In [None]:
# Confusion matrix of the ground truth against the DBSCAN predictions.
confusion_matrix(y_true=train_data['y'], y_pred=train_data['prediction'])

In [None]:
# ROC AUC of the distance-based DBSCAN score; stored and displayed.
fpr, tpr, _ = metrics.roc_curve(y_true=data_for_auprc['y'], y_score=data_for_auprc['score'])
dbscan_auc = metrics.auc(fpr, tpr)
dbscan_auc

In [None]:
# Per-class precision/recall/F1: kept as a dict for later use and printed as text.
dbscan_report = classification_report(
    train_data['y'], train_data['prediction'],
    target_names=['0', '1'], output_dict=True,
)
print(classification_report(train_data['y'], train_data['prediction'], target_names=['0', '1']))

In [None]:
# Area under the precision-recall curve built from the DBSCAN distance score.
precision, recall, thresholds = precision_recall_curve(data_for_auprc['y'], data_for_auprc['score'])
dbscan_auc_precision_recall = metrics.auc(x=recall, y=precision)
print(dbscan_auc_precision_recall)

### CADE

Source: Aric LaBarr, webinar "Modern Approaches To Anomaly Detection", November 2, 2021

In [None]:
# CADE works on an unscaled, independent copy of the dataset.
train_data = data.copy(deep=True)

In [None]:
# Real observations (Target = 0). The explicit .copy() makes this an
# independent frame, so adding 'Target' below does not write into a slice of
# train_data (SettingWithCopyWarning).
data_for_fake = train_data.loc[:, train_data.columns != 'y'].copy()

# Synthetic observations (Target = 1): every feature is sampled independently
# and uniformly between that feature's observed minimum and maximum.
fake = pd.DataFrame({
    col: np.random.uniform(
        data_for_fake[col].min(), data_for_fake[col].max(), size=len(data_for_fake)
    )
    for col in data_for_fake.columns
})

data_for_fake['Target'] = 0
fake['Target'] = 1

In [None]:
# Stack the real rows on top of the synthetic rows (row-wise concatenation).
data_combined = pd.concat((data_for_fake, fake), axis='index')

In [None]:
# Measure the CPU time needed to fit the real-vs-synthetic forest and score the real rows.
start = time.process_time()
model = RandomForestClassifier(n_estimators=100)
model.fit(data_combined.drop(columns='Target'), data_combined['Target'])
# Probability of the synthetic class (column 1) is used as the anomaly score.
train_data['Target'] = model.predict_proba(train_data.drop(columns='y'))[:, 1]
end = time.process_time()
cade_time = end - start
print(cade_time)

In [None]:
# Sweep the decision threshold from 0 to 0.5 in steps of 0.05 and keep the one
# whose binary predictions give the highest ROC AUC.
# The dict is deliberately not called `auc`: that name is imported from
# sklearn.metrics at the top of the notebook and would be shadowed.
auc_by_threshold = {}
for threshold in np.arange(0, 0.55, 0.05):
    train_data['prediction'] = train_data.apply(def_outlier_cade, args=(threshold,), axis=1)
    fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['prediction'])
    auc_by_threshold[threshold] = metrics.auc(fpr, tpr)
max_key = max(auc_by_threshold, key=auc_by_threshold.get)

In [None]:
# Re-label every row with def_outlier_cade (defined elsewhere in the notebook),
# using max_key, the threshold with the highest ROC AUC in the sweep.
train_data['prediction'] = train_data.apply(def_outlier_cade,args=(max_key,), axis = 1)

In [None]:
# Confusion matrix of the ground truth against the CADE predictions.
confusion_matrix(y_true=train_data['y'], y_pred=train_data['prediction'])

In [None]:
# ROC AUC of the CADE synthetic-class probability; stored and displayed.
fpr, tpr, _ = metrics.roc_curve(y_true=train_data['y'], y_score=train_data['Target'])
cade_auc = metrics.auc(fpr, tpr)
cade_auc

In [None]:
# Per-class precision/recall/F1: kept as a dict for later use and printed as text.
cade_report = classification_report(
    train_data['y'], train_data['prediction'],
    target_names=['0', '1'], output_dict=True,
)
print(classification_report(train_data['y'], train_data['prediction'], target_names=['0', '1']))

In [None]:
# Area under the precision-recall curve built from the CADE probability.
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['Target'])
cade_auc_precision_recall = metrics.auc(x=recall, y=precision)
print(cade_auc_precision_recall)

### Deep Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
# Split the dataset into the ground-truth labels and the feature matrix.
target = data['y']
features = data.drop(columns='y')

In [None]:
# Scale every feature into [0, 1]; the result is a NumPy array.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(X=features.copy())

In [None]:
class AutoEncoder(Model):
    """Deep dense autoencoder: encoder 5-3-3, decoder 3-5-`output_units`.

    Dropout (rate 0.1) follows every hidden layer except the innermost encoder
    layer; the final layer uses a sigmoid activation.
    """

    def __init__(self, output_units):
        """Build the encoder and decoder stacks for `output_units` input features."""
        super().__init__()
        encoder_layers = [
            Dense(5, input_dim=output_units, activation='sigmoid'),
            Dropout(0.1),
            Dense(3, activation='relu'),
            Dropout(0.1),
            Dense(3, activation='relu'),
        ]
        self.encoder = Sequential(encoder_layers)
        decoder_layers = [
            Dense(3, activation='relu'),
            Dropout(0.1),
            Dense(5, activation='relu'),
            Dropout(0.1),
            Dense(output_units, activation='sigmoid'),
        ]
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Return the reconstruction of `inputs` (encode, then decode)."""
        return self.decoder(self.encoder(inputs))

In [None]:
def find_threshold(model, x_train_scaled):
    """Return mean + one standard deviation of the MSLE reconstruction errors.

    The model reconstructs `x_train_scaled`; the MSLE between reconstruction
    and input is computed and the threshold is derived from those errors.
    """
    reconstructed = model.predict(x_train_scaled)
    error_values = tf.keras.losses.msle(reconstructed, x_train_scaled).numpy()
    return np.mean(error_values) + np.std(error_values)

def get_predictions(model, x_train_scaled, threshold):
    """Flag samples whose MSLE reconstruction error exceeds `threshold`.

    Returns a tuple ``(preds, errors)``: `preds` is a pandas Series holding
    1.0 for flagged samples and 0.0 otherwise, `errors` is the raw MSLE tensor.
    """
    reconstructed = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(reconstructed, x_train_scaled)
    exceeds_threshold = pd.Series(errors) > threshold
    # True -> 1.0, False -> 0.0
    preds = exceeds_threshold.astype(float)
    return preds, errors

In [None]:
# Measure the CPU time for training the deep autoencoder on its own input,
# deriving the error threshold and labelling the training set.
start = time.process_time()

model = AutoEncoder(output_units=x_train_scaled.shape[1])
model.compile(optimizer='adam', loss='msle', metrics=['mse'])

history = model.fit(x_train_scaled, x_train_scaled, epochs=20, batch_size=512)

threshold = find_threshold(model, x_train_scaled)
predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
deep_autoencoders_time = end - start
print(deep_autoencoders_time)

In [None]:
# Confusion matrix of the ground truth against the deep-autoencoder predictions.
confusion_matrix(y_true=target, y_pred=predictions)

In [None]:
# ROC AUC of the reconstruction error; stored and displayed.
fpr, tpr, _ = metrics.roc_curve(y_true=target, y_score=scores)
deep_autoencoders_auc = metrics.auc(fpr, tpr)
deep_autoencoders_auc

In [None]:
# Per-class precision/recall/F1: kept as a dict for later use and printed as text.
deep_autoencoders_report = classification_report(
    target, predictions,
    target_names=['0', '1'], output_dict=True,
)
print(classification_report(target, predictions, target_names=['0', '1']))

In [None]:
# Area under the precision-recall curve built from the reconstruction error.
precision, recall, thresholds = precision_recall_curve(target, scores)
deep_ae_auc_precision_recall = metrics.auc(x=recall, y=precision)
print(deep_ae_auc_precision_recall)

### Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
# Split the dataset into the ground-truth labels and the feature matrix.
target = data['y']
features = data.drop(columns='y')

In [None]:
# Scale every feature into [0, 1]; the result is a NumPy array.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(X=features.copy())

In [None]:
class AutoEncoder(Model):
    """Shallow dense autoencoder: one 5-unit hidden layer with dropout 0.1.

    The decoder is a single sigmoid layer with `output_units` units.
    """

    def __init__(self, output_units):
        """Build the encoder and decoder stacks for `output_units` input features."""
        super().__init__()
        encoder_layers = [
            Dense(5, input_dim=output_units, activation='relu'),
            Dropout(0.1),
        ]
        self.encoder = Sequential(encoder_layers)
        decoder_layers = [
            Dense(output_units, activation='sigmoid'),
        ]
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Return the reconstruction of `inputs` (encode, then decode)."""
        return self.decoder(self.encoder(inputs))

In [None]:
def find_threshold(model, x_train_scaled):
    """Return mean + one standard deviation of the MSLE reconstruction errors.

    The model reconstructs `x_train_scaled`; the MSLE between reconstruction
    and input is computed and the threshold is derived from those errors.
    """
    reconstructed = model.predict(x_train_scaled)
    error_values = tf.keras.losses.msle(reconstructed, x_train_scaled).numpy()
    return np.mean(error_values) + np.std(error_values)

def get_predictions(model, x_train_scaled, threshold):
    """Flag samples whose MSLE reconstruction error exceeds `threshold`.

    Returns a tuple ``(preds, errors)``: `preds` is a pandas Series holding
    1.0 for flagged samples and 0.0 otherwise, `errors` is the raw MSLE tensor.
    """
    reconstructed = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(reconstructed, x_train_scaled)
    exceeds_threshold = pd.Series(errors) > threshold
    # True -> 1.0, False -> 0.0
    preds = exceeds_threshold.astype(float)
    return preds, errors

In [None]:
# Measure the CPU time for training the shallow autoencoder on its own input,
# deriving the error threshold and labelling the training set.
start = time.process_time()

model = AutoEncoder(output_units=x_train_scaled.shape[1])
model.compile(optimizer='adam', loss='msle', metrics=['mse'])

history = model.fit(x_train_scaled, x_train_scaled, epochs=20, batch_size=512)

threshold = find_threshold(model, x_train_scaled)
predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
autoencoders_time = end - start
print(autoencoders_time)

In [None]:
# Confusion matrix of the ground truth against the autoencoder predictions.
confusion_matrix(y_true=target, y_pred=predictions)

In [None]:
# ROC AUC of the reconstruction error; stored and displayed.
fpr, tpr, _ = metrics.roc_curve(y_true=target, y_score=scores)
autoencoders_auc = metrics.auc(fpr, tpr)
autoencoders_auc

In [None]:
# Per-class precision/recall/F1: kept as a dict for later use and printed as text.
autoencoders_report = classification_report(
    target, predictions,
    target_names=['0', '1'], output_dict=True,
)
print(classification_report(target, predictions, target_names=['0', '1']))

In [None]:
# Area under the precision-recall curve built from the reconstruction error.
precision, recall, thresholds = precision_recall_curve(target, scores)
ae_auc_precision_recall = metrics.auc(x=recall, y=precision)
print(ae_auc_precision_recall)

### One-Class SVM rbf

In [None]:
# Split the dataset into the ground-truth labels and the feature matrix.
target = data['y']
features = data.drop(columns='y')

In [None]:
# Scale every feature into [0, 1]; the result is a NumPy array.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(X=features.copy())

In [None]:
# Measure the CPU time for fitting the default-kernel One-Class SVM and scoring the training set.
start = time.process_time()

model = OneClassSVM(gamma='scale', nu=0.1).fit(x_train_scaled)
yhat = model.predict(x_train_scaled)
# The decision function is negated so that larger values rank first.
y_scores = -model.decision_function(x_train_scaled)

end = time.process_time()
rbf_oc_svm_time = end - start
print(rbf_oc_svm_time)

In [None]:
# Wrap the raw SVM labels in a one-column frame so def_outlier can be applied row by row.
predictions = pd.DataFrame({'y_pred': yhat}).apply(def_outlier, axis=1)

In [None]:
# Confusion matrix of the ground truth against the One-Class SVM predictions.
confusion_matrix(y_true=target, y_pred=predictions)

In [None]:
# ROC AUC of the negated decision function; stored and displayed.
fpr, tpr, _ = metrics.roc_curve(y_true=target, y_score=y_scores)
rbf_oc_svm_auc = metrics.auc(fpr, tpr)
rbf_oc_svm_auc

In [None]:
# Per-class precision/recall/F1: kept as a dict for later use and printed as text.
rbf_oc_svm_report = classification_report(
    target, predictions,
    target_names=['0', '1'], output_dict=True,
)
print(classification_report(target, predictions, target_names=['0', '1']))

In [None]:
# Area under the precision-recall curve built from the negated decision function.
precision, recall, thresholds = precision_recall_curve(target, y_scores)
ocsvm_rbf_auc_precision_recall = metrics.auc(x=recall, y=precision)
print(ocsvm_rbf_auc_precision_recall)

### One-Class SVM linear

In [None]:
# Split the dataset into the ground-truth labels and the feature matrix.
target = data['y']
features = data.drop(columns='y')

In [None]:
# Scale every feature into [0, 1]; the result is a NumPy array.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(X=features.copy())

In [None]:
# Measure the CPU time for fitting the linear-kernel One-Class SVM and scoring the training set.
start = time.process_time()

model = OneClassSVM(kernel='linear', nu=0.1).fit(x_train_scaled)
yhat = model.predict(x_train_scaled)
# The decision function is negated so that larger values rank first.
y_scores = -model.decision_function(x_train_scaled)

end = time.process_time()
lin_oc_svm_time = end - start
print(lin_oc_svm_time)

In [None]:
# Wrap the raw SVM labels in a one-column frame so def_outlier can be applied row by row.
predictions = pd.DataFrame({'y_pred': yhat}).apply(def_outlier, axis=1)

In [None]:
# Confusion matrix of the ground truth against the linear One-Class SVM predictions.
confusion_matrix(y_true=target, y_pred=predictions)

In [None]:
# ROC AUC of the negated decision function; stored and displayed.
fpr, tpr, _ = metrics.roc_curve(y_true=target, y_score=y_scores)
lin_oc_svm_auc = metrics.auc(fpr, tpr)
lin_oc_svm_auc

In [None]:
# Per-class precision/recall/F1: kept as a dict for later use and printed as text.
lin_oc_svm_report = classification_report(
    target, predictions,
    target_names=['0', '1'], output_dict=True,
)
print(classification_report(target, predictions, target_names=['0', '1']))

In [None]:
# Area under the precision-recall curve built from the negated decision function.
precision, recall, thresholds = precision_recall_curve(target, y_scores)
ocsvm_lin_auc_precision_recall = metrics.auc(x=recall, y=precision)
print(ocsvm_lin_auc_precision_recall)

### Performance

In [None]:
# Empty summary table for the shuttle dataset: one row per method is added later.
shuttle_performance = pd.DataFrame(columns=[
    'method', 'f1-score', 'sensitivity', 'precision',
    'Time', 'AUC', 'AU precision-recall curve',
])

In [None]:
# One summary record per method. The f1-score / sensitivity / precision values
# are those of class '1' in the corresponding classification report.
iforest_performance = {
    'method': 'iForest',
    'f1-score': iforest_report['1']['f1-score'],
    'sensitivity': iforest_report['1']['recall'],
    'precision': iforest_report['1']['precision'],
    'Time': iforest_time,
    'AUC': iforest_auc,
    'AU precision-recall curve': iforest_auc_precision_recall,
}
lof_performance = {
    'method': 'LOF',
    'f1-score': lof_report['1']['f1-score'],
    'sensitivity': lof_report['1']['recall'],
    'precision': lof_report['1']['precision'],
    'Time': lof_time,
    'AUC': lof_auc,
    'AU precision-recall curve': lof_auc_precision_recall,
}
dbscan_performance = {
    'method': 'DBSCAN',
    'f1-score': dbscan_report['1']['f1-score'],
    'sensitivity': dbscan_report['1']['recall'],
    'precision': dbscan_report['1']['precision'],
    'Time': dbscan_time,
    'AUC': dbscan_auc,
    'AU precision-recall curve': dbscan_auc_precision_recall,
}
cade_performance = {
    'method': 'CADE',
    'f1-score': cade_report['1']['f1-score'],
    'sensitivity': cade_report['1']['recall'],
    'precision': cade_report['1']['precision'],
    'Time': cade_time,
    'AUC': cade_auc,
    'AU precision-recall curve': cade_auc_precision_recall,
}
deep_autoencoders_performance = {
    'method': 'Deep Autoencoders',
    'f1-score': deep_autoencoders_report['1']['f1-score'],
    'sensitivity': deep_autoencoders_report['1']['recall'],
    'precision': deep_autoencoders_report['1']['precision'],
    'Time': deep_autoencoders_time,
    'AUC': deep_autoencoders_auc,
    'AU precision-recall curve': deep_ae_auc_precision_recall,
}
autoencoders_performance = {
    'method': 'Autoencoders',
    'f1-score': autoencoders_report['1']['f1-score'],
    'sensitivity': autoencoders_report['1']['recall'],
    'precision': autoencoders_report['1']['precision'],
    'Time': autoencoders_time,
    'AUC': autoencoders_auc,
    'AU precision-recall curve': ae_auc_precision_recall,
}
rbf_oc_svm_performance = {
    'method': 'OC-SVM rbf',
    'f1-score': rbf_oc_svm_report['1']['f1-score'],
    'sensitivity': rbf_oc_svm_report['1']['recall'],
    'precision': rbf_oc_svm_report['1']['precision'],
    'Time': rbf_oc_svm_time,
    'AUC': rbf_oc_svm_auc,
    'AU precision-recall curve': ocsvm_rbf_auc_precision_recall,
}
lin_oc_svm_performance = {
    'method': 'OC-SVM linear',
    'f1-score': lin_oc_svm_report['1']['f1-score'],
    'sensitivity': lin_oc_svm_report['1']['recall'],
    'precision': lin_oc_svm_report['1']['precision'],
    'Time': lin_oc_svm_time,
    'AUC': lin_oc_svm_auc,
    'AU precision-recall curve': ocsvm_lin_auc_precision_recall,
}

In [None]:
# Add one row per method to the summary table. DataFrame.append was removed in
# pandas 2.0, so all records are turned into a frame at once and concatenated.
performance_rows = pd.DataFrame(
    [
        iforest_performance,
        lof_performance,
        dbscan_performance,
        cade_performance,
        deep_autoencoders_performance,
        autoencoders_performance,
        rbf_oc_svm_performance,
        lin_oc_svm_performance,
    ],
    columns=shuttle_performance.columns,  # keep the table's column order
)
# Concatenating with a still-empty table triggers a pandas dtype FutureWarning,
# so the new rows simply replace it in that case.
if shuttle_performance.empty:
    shuttle_performance = performance_rows
else:
    shuttle_performance = pd.concat([shuttle_performance, performance_rows], ignore_index=True)

## mnist

**Dataset source**: http://odds.cs.stonybrook.edu/mnist-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016).  ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources**:

Bandaragoda, Tharindu R., et al. “Efficient Anomaly Detection by Isolation Using Nearest Neighbour Ensemble.” 2014 IEEE International Conference on Data Mining Workshop. IEEE, 2014.

In [None]:
# Load the mnist dataset (CSV file expected next to the notebook).
data = pd.read_csv(filepath_or_buffer='./mnist.csv')

In [None]:
# Remove the columns that hold a single constant value.
data = data.drop(columns=[
    'Col1', 'Col4', 'Col7', 'Col22', 'Col27', 'Col29', 'Col38', 'Col41',
    'Col51', 'Col53', 'Col54', 'Col61', 'Col62', 'Col71', 'Col73', 'Col79',
    'Col87', 'Col88', 'Col89', 'Col90', 'Col92', 'Col100',
])

In [None]:
# Count of non-null 'Col2' values per class label, i.e. the class distribution.
pd.pivot_table(data, values='Col2', index='y', aggfunc='count')

In [None]:
# Dimensions of the dataset as (rows, columns).
data.shape

In [None]:
# Preview the first rows of the dataset.
data.head()

### iForest

In [None]:
# iForest works on an unscaled, independent copy of the dataset.
train_data = data.copy(deep=True)

In [None]:
# Measure the CPU time for fitting the isolation forest and scoring the training set.
start = time.process_time()

rng = np.random.RandomState(42)
clf = IsolationForest(n_estimators=100, max_samples=256, random_state=rng)
clf.fit(train_data.drop(columns='y'))
y_pred = clf.predict(train_data.drop(columns='y'))
y_scores = clf.score_samples(train_data.drop(columns='y'))

end = time.process_time()
iforest_time = end - start
print(iforest_time)

In [None]:
# Attach the raw iForest labels, then derive the final per-row prediction with
# def_outlier (helper defined elsewhere in the notebook; presumably maps the
# raw label to a 0/1 outlier flag -- confirm against its definition).
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
# score_samples is negated so that larger values rank first.
train_data['y_scores'] = -y_scores

In [None]:
# Confusion matrix of the ground truth against the iForest predictions.
confusion_matrix(y_true=train_data['y'], y_pred=train_data['prediction'])

In [None]:
# ROC AUC of the iForest anomaly score; stored and displayed.
fpr, tpr, _ = metrics.roc_curve(y_true=train_data['y'], y_score=train_data['y_scores'])
iforest_auc = metrics.auc(fpr, tpr)
iforest_auc

In [None]:
# Per-class precision/recall/F1: kept as a dict for later use and printed as text.
iforest_report = classification_report(
    train_data['y'], train_data['prediction'],
    target_names=['0', '1'], output_dict=True,
)
print(classification_report(train_data['y'], train_data['prediction'], target_names=['0', '1']))

In [None]:
# Area under the precision-recall curve built from the iForest anomaly score.
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
iforest_auc_precision_recall = metrics.auc(x=recall, y=precision)
print(iforest_auc_precision_recall)

### LOF

In [None]:
# Min-max scale every column of `data` (label column included) into [0, 1]
# and rebuild a DataFrame that carries the original column names.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = pd.DataFrame(min_max_scaler.fit_transform(data.copy()), columns=data.columns)

In [None]:
# Measure the CPU time needed to fit LOF and label the training set itself.
start = time.process_time()

lof = LocalOutlierFactor(n_neighbors=76, contamination=0.1, novelty=False)
y_pred = lof.fit_predict(train_data.drop(columns='y'))
y_scores = lof.negative_outlier_factor_

end = time.process_time()
lof_time = end - start
print(lof_time)

In [None]:
# Attach the raw LOF labels, then derive the final per-row prediction with
# def_outlier (helper defined elsewhere in the notebook; presumably maps the
# raw label to a 0/1 outlier flag -- confirm against its definition).
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
# The negative outlier factor is negated so that larger values rank first.
train_data['y_scores'] = -y_scores

In [None]:
# Confusion matrix of the ground truth against the LOF predictions.
confusion_matrix(y_true=train_data['y'], y_pred=train_data['prediction'])

In [None]:
# ROC AUC of the LOF anomaly score; stored and displayed.
fpr, tpr, _ = metrics.roc_curve(y_true=train_data['y'], y_score=train_data['y_scores'])
lof_auc = metrics.auc(fpr, tpr)
lof_auc

In [None]:
# Per-class precision/recall/F1: kept as a dict for later use and printed as text.
lof_report = classification_report(
    train_data['y'], train_data['prediction'],
    target_names=['0', '1'], output_dict=True,
)
print(classification_report(train_data['y'], train_data['prediction'], target_names=['0', '1']))

In [None]:
# Area under the precision-recall curve built from the LOF anomaly score.
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
lof_auc_precision_recall = metrics.auc(x=recall, y=precision)
print(lof_auc_precision_recall)

### DBSCAN

In [None]:
# Fresh [0, 1]-scaled copy of `data` (label column included) for DBSCAN,
# with the original column names restored.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = pd.DataFrame(min_max_scaler.fit_transform(data.copy()), columns=data.columns)

In [None]:
# Sorted k-distance plot (k = 155) used to pick eps for DBSCAN.
dist = dbscan_tuner(data=train_data, dim=155)

In [None]:
# Measure the CPU time needed to cluster the scaled features with DBSCAN.
start = time.process_time()

dbscan = DBSCAN(eps=2.5, min_samples=156)
clusters = dbscan.fit_predict(train_data.drop(columns='y'))

end = time.process_time()
dbscan_time = end - start
print(dbscan_time)

In [None]:
# Store the DBSCAN cluster label of every row, then derive the per-row
# prediction with def_outlier (helper defined elsewhere in the notebook;
# presumably flags the noise label -1 as an outlier -- confirm).
train_data['y_pred'] = clusters
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)

In [None]:
# Feature columns: every original column except the label.
original_columns = [col for col in data.columns if col != 'y']

# Clustered points: the anomaly score is the Euclidean distance to the centroid
# of the point's own cluster. The per-cluster frames are collected in a list and
# concatenated once, because DataFrame.append was removed in pandas 2.0.
scored_clusters = []
for label in sorted(set(clusters) - {-1}):  # -1 is the DBSCAN noise label
    score_data = train_data[train_data['y_pred'] == label].copy()
    center = score_data[original_columns].mean().to_numpy()
    score_data['score'] = np.linalg.norm(
        score_data[original_columns].to_numpy() - center, axis=1
    )
    scored_clusters.append(score_data)

if scored_clusters:
    data_w_score = pd.concat(scored_clusters, ignore_index=True)
else:
    # No cluster was found: keep an empty frame with the expected columns.
    data_w_score = pd.DataFrame(columns=train_data.columns.to_list() + ['score'])

# Noise points get their score in the following cell; NaN is a numeric
# placeholder so the column does not become object-typed.
anomalies_data = train_data[train_data['y_pred'] == -1].copy()
anomalies_data['score'] = np.nan
cluster_centers = pd.pivot_table(train_data,
              index = 'y_pred',
              aggfunc = 'mean')

In [None]:
# Noise points (label -1): the anomaly score is the distance to the nearest
# cluster centroid. One vectorised cdist call replaces the per-row chained
# `['score'].iloc[i] = ...` assignment, which raises SettingWithCopyWarning and
# does not write through when pandas copy-on-write is enabled.
centers = cluster_centers.loc[cluster_centers.index != -1, original_columns].to_numpy(dtype=float)
if not anomalies_data.empty and len(centers) > 0:
    anomalies_data['score'] = distance.cdist(
        anomalies_data[original_columns].to_numpy(dtype=float), centers
    ).min(axis=1)
# If there are no clusters at all, the placeholder scores are left untouched.

data_for_auprc = pd.concat([data_w_score, anomalies_data])

In [None]:
# Confusion matrix of the ground truth against the DBSCAN predictions.
confusion_matrix(y_true=train_data['y'], y_pred=train_data['prediction'])

In [None]:
# ROC AUC of the distance-based DBSCAN score; stored and displayed.
fpr, tpr, _ = metrics.roc_curve(y_true=data_for_auprc['y'], y_score=data_for_auprc['score'])
dbscan_auc = metrics.auc(fpr, tpr)
dbscan_auc

In [None]:
# Per-class precision/recall/F1: kept as a dict for later use and printed as text.
dbscan_report = classification_report(
    train_data['y'], train_data['prediction'],
    target_names=['0', '1'], output_dict=True,
)
print(classification_report(train_data['y'], train_data['prediction'], target_names=['0', '1']))

In [None]:
# Area under the precision-recall curve built from the DBSCAN distance score.
precision, recall, thresholds = precision_recall_curve(data_for_auprc['y'], data_for_auprc['score'])
dbscan_auc_precision_recall = metrics.auc(x=recall, y=precision)
print(dbscan_auc_precision_recall)

### CADE

Source: Aric LaBarr, webinar "Modern Approaches To Anomaly Detection", November 2, 2021

In [None]:
# CADE works on an unscaled, independent copy of the dataset.
train_data = data.copy(deep=True)

In [None]:
# Real observations (Target = 0). The explicit .copy() makes this an
# independent frame, so adding 'Target' below does not write into a slice of
# train_data (SettingWithCopyWarning).
data_for_fake = train_data.loc[:, train_data.columns != 'y'].copy()

# Synthetic observations (Target = 1): every feature is sampled independently
# and uniformly between that feature's observed minimum and maximum.
# Building the frame from a dict also avoids inserting columns one at a time.
fake = pd.DataFrame({
    col: np.random.uniform(
        data_for_fake[col].min(), data_for_fake[col].max(), size=len(data_for_fake)
    )
    for col in data_for_fake.columns
})

data_for_fake['Target'] = 0
fake['Target'] = 1

In [None]:
# Stack the real rows on top of the synthetic rows (row-wise concatenation).
data_combined = pd.concat((data_for_fake, fake), axis='index')

In [None]:
# Measure the CPU time needed to fit the real-vs-synthetic forest and score the real rows.
start = time.process_time()
model = RandomForestClassifier(n_estimators=100)
model.fit(data_combined.drop(columns='Target'), data_combined['Target'])
# Probability of the synthetic class (column 1) is used as the anomaly score.
train_data['Target'] = model.predict_proba(train_data.drop(columns='y'))[:, 1]
end = time.process_time()
cade_time = end - start
print(cade_time)

In [None]:
# Sweep the decision threshold from 0 to 0.5 in steps of 0.05 and keep the one
# whose binary predictions give the highest ROC AUC.
# The dict is deliberately not called `auc`: that name is imported from
# sklearn.metrics at the top of the notebook and would be shadowed.
auc_by_threshold = {}
for threshold in np.arange(0, 0.55, 0.05):
    train_data['prediction'] = train_data.apply(def_outlier_cade, args=(threshold,), axis=1)
    fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['prediction'])
    auc_by_threshold[threshold] = metrics.auc(fpr, tpr)
max_key = max(auc_by_threshold, key=auc_by_threshold.get)

In [None]:
# Re-label every row with def_outlier_cade (defined elsewhere in the notebook),
# using max_key, the threshold with the highest ROC AUC in the sweep.
train_data['prediction'] = train_data.apply(def_outlier_cade,args=(max_key,), axis = 1)

In [None]:
# Confusion matrix of the ground truth against the CADE predictions.
confusion_matrix(y_true=train_data['y'], y_pred=train_data['prediction'])

In [None]:
# ROC AUC of the CADE synthetic-class probability; stored and displayed.
# Labels and scores are both read from train_data, as in every other CADE
# metric cell, so they are guaranteed to come from the same frame.
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['Target'])
cade_auc = metrics.auc(fpr, tpr)
cade_auc

In [None]:
# Per-class precision/recall/F1: kept as a dict for later use and printed as text.
cade_report = classification_report(
    train_data['y'], train_data['prediction'],
    target_names=['0', '1'], output_dict=True,
)
print(classification_report(train_data['y'], train_data['prediction'], target_names=['0', '1']))

In [None]:
# Area under the precision-recall curve built from the CADE probability.
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['Target'])
cade_auc_precision_recall = metrics.auc(x=recall, y=precision)
print(cade_auc_precision_recall)

### Deep Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
# Split the dataset into the ground-truth labels and the feature matrix.
target = data['y']
features = data.drop(columns='y')

In [None]:
# Scale every feature into [0, 1]; the result is a NumPy array.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(X=features.copy())

In [None]:
class AutoEncoder(Model):
    """Deep dense autoencoder: encoder 39-19-10, decoder 19-39-`output_units`.

    Dropout (rate 0.1) follows every hidden layer except the innermost encoder
    layer; the final layer uses a sigmoid activation.
    """

    def __init__(self, output_units):
        """Build the encoder and decoder stacks for `output_units` input features."""
        super().__init__()
        encoder_layers = [
            Dense(39, input_dim=output_units, activation='sigmoid'),
            Dropout(0.1),
            Dense(19, activation='relu'),
            Dropout(0.1),
            Dense(10, activation='relu'),
        ]
        self.encoder = Sequential(encoder_layers)
        decoder_layers = [
            Dense(19, activation='relu'),
            Dropout(0.1),
            Dense(39, activation='relu'),
            Dropout(0.1),
            Dense(output_units, activation='sigmoid'),
        ]
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Return the reconstruction of `inputs` (encode, then decode)."""
        return self.decoder(self.encoder(inputs))

In [None]:
def find_threshold(model, x_train_scaled):
    """Return mean + one standard deviation of the MSLE reconstruction errors.

    The model reconstructs `x_train_scaled`; the MSLE between reconstruction
    and input is computed and the threshold is derived from those errors.
    """
    reconstructed = model.predict(x_train_scaled)
    error_values = tf.keras.losses.msle(reconstructed, x_train_scaled).numpy()
    return np.mean(error_values) + np.std(error_values)

def get_predictions(model, x_train_scaled, threshold):
    """Flag samples whose MSLE reconstruction error exceeds `threshold`.

    Returns a tuple ``(preds, errors)``: `preds` is a pandas Series holding
    1.0 for flagged samples and 0.0 otherwise, `errors` is the raw MSLE tensor.
    """
    reconstructed = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(reconstructed, x_train_scaled)
    exceeds_threshold = pd.Series(errors) > threshold
    # True -> 1.0, False -> 0.0
    preds = exceeds_threshold.astype(float)
    return preds, errors

In [None]:
# Measure the CPU time for training the deep autoencoder on its own input,
# deriving the error threshold and labelling the training set.
start = time.process_time()

model = AutoEncoder(output_units=x_train_scaled.shape[1])
model.compile(optimizer='adam', loss='msle', metrics=['mse'])

history = model.fit(x_train_scaled, x_train_scaled, epochs=20, batch_size=512)

threshold = find_threshold(model, x_train_scaled)
predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
deep_autoencoders_time = end - start
print(deep_autoencoders_time)

In [None]:
# Confusion matrix of the ground truth against the deep-autoencoder predictions.
confusion_matrix(y_true=target, y_pred=predictions)

In [None]:
# ROC AUC of the reconstruction error; stored and displayed.
fpr, tpr, _ = metrics.roc_curve(y_true=target, y_score=scores)
deep_autoencoders_auc = metrics.auc(fpr, tpr)
deep_autoencoders_auc

In [None]:
# Per-class precision/recall/F1: kept as a dict for later use and printed as text.
deep_autoencoders_report = classification_report(
    target, predictions,
    target_names=['0', '1'], output_dict=True,
)
print(classification_report(target, predictions, target_names=['0', '1']))

In [None]:
# Area under the precision-recall curve built from the reconstruction error.
precision, recall, thresholds = precision_recall_curve(target, scores)
deep_ae_auc_precision_recall = metrics.auc(x=recall, y=precision)
print(deep_ae_auc_precision_recall)

### Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
# Split the dataset into the ground-truth labels and the feature matrix.
target = data['y']
features = data.drop(columns='y')

In [None]:
# Scale every feature into [0, 1]; the result is a NumPy array.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(X=features.copy())

In [None]:
class AutoEncoder(Model):
    """Shallow dense autoencoder: one 39-unit hidden layer with dropout 0.1.

    The decoder is a single sigmoid layer with `output_units` units.
    """

    def __init__(self, output_units):
        """Build the encoder and decoder stacks for `output_units` input features."""
        super().__init__()
        encoder_layers = [
            Dense(39, input_dim=output_units, activation='relu'),
            Dropout(0.1),
        ]
        self.encoder = Sequential(encoder_layers)
        decoder_layers = [
            Dense(output_units, activation='sigmoid'),
        ]
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Return the reconstruction of `inputs` (encode, then decode)."""
        return self.decoder(self.encoder(inputs))

In [None]:
def find_threshold(model, x_train_scaled):
    """Return mean + one standard deviation of the MSLE reconstruction errors.

    The model reconstructs `x_train_scaled`; the MSLE between reconstruction
    and input is computed and the threshold is derived from those errors.
    """
    reconstructed = model.predict(x_train_scaled)
    error_values = tf.keras.losses.msle(reconstructed, x_train_scaled).numpy()
    return np.mean(error_values) + np.std(error_values)

def get_predictions(model, x_train_scaled, threshold):
    predictions = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(predictions, x_train_scaled)
    anomaly_mask = pd.Series(errors) > threshold
    preds = anomaly_mask.map(lambda x: 1.0 if x == True else 0.0)
    return preds, errors

In [None]:
# Time the whole pipeline (build, fit, threshold, score) with process_time.
start = time.process_time()

n_features = x_train_scaled.shape[1]
model = AutoEncoder(output_units=n_features)
model.compile(optimizer='adam', loss='msle', metrics=['mse'])

# The network is trained to reproduce its own input.
history = model.fit(x_train_scaled, x_train_scaled, batch_size=512, epochs=20)

threshold = find_threshold(model, x_train_scaled)
predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
autoencoders_time = end - start
print(autoencoders_time)

In [None]:
plt.plot(history.history['loss'])
plt.xlabel('Epochs')
plt.ylabel('MSLE Loss')
plt.legend(['loss'])
plt.show()

In [None]:
confusion_matrix(target, predictions)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, scores)
ae_auc_precision_recall = metrics.auc(recall, precision)
print(ae_auc_precision_recall)

### One-Class SVM rbf

In [None]:
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
start = time.process_time()

# fit() returns the estimator, so construction and fitting are chained.
model = OneClassSVM(nu=0.1, gamma='scale').fit(x_train_scaled)
yhat = model.predict(x_train_scaled)
# The decision function is negated before it is used as the anomaly score.
y_scores = -model.decision_function(x_train_scaled)

end = time.process_time()
rbf_oc_svm_time = end - start
print(rbf_oc_svm_time)

In [None]:
predictions = pd.DataFrame(yhat, columns = ['y_pred']).apply(def_outlier, axis = 1)

In [None]:
confusion_matrix(target, predictions)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, y_scores)
rbf_oc_svm_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
rbf_oc_svm_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, y_scores)
ocsvm_rbf_auc_precision_recall = metrics.auc(recall, precision)
print(ocsvm_rbf_auc_precision_recall)

### One-Class SVM linear

In [None]:
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
start = time.process_time()

# fit() returns the estimator, so construction and fitting are chained.
model = OneClassSVM(nu=0.1, kernel='linear').fit(x_train_scaled)
yhat = model.predict(x_train_scaled)
# The decision function is negated before it is used as the anomaly score.
y_scores = -model.decision_function(x_train_scaled)

end = time.process_time()
lin_oc_svm_time = end - start
print(lin_oc_svm_time)

In [None]:
predictions = pd.DataFrame(yhat, columns = ['y_pred']).apply(def_outlier, axis = 1)

In [None]:
confusion_matrix(target, predictions)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, y_scores)
lin_oc_svm_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
lin_oc_svm_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, y_scores)
ocsvm_lin_auc_precision_recall = metrics.auc(recall, precision)
print(ocsvm_lin_auc_precision_recall)

### Performance

In [None]:
mnist_performance = pd.DataFrame(columns = ['method','f1-score','sensitivity',
                                            'precision','Time','AUC', 'AU precision-recall curve'])

In [None]:
def _performance_row(method, report, elapsed, roc_auc, pr_auc):
    """Build one results row from the class-'1' entry of a classification report."""
    positive = report['1']
    return {'method': method,
            'f1-score': positive['f1-score'],
            'sensitivity': positive['recall'],
            'precision': positive['precision'],
            'Time': elapsed,
            'AUC': roc_auc,
            'AU precision-recall curve': pr_auc}


iforest_performance = _performance_row(
    'iForest', iforest_report, iforest_time,
    iforest_auc, iforest_auc_precision_recall)
lof_performance = _performance_row(
    'LOF', lof_report, lof_time,
    lof_auc, lof_auc_precision_recall)
dbscan_performance = _performance_row(
    'DBSCAN', dbscan_report, dbscan_time,
    dbscan_auc, dbscan_auc_precision_recall)
cade_performance = _performance_row(
    'CADE', cade_report, cade_time,
    cade_auc, cade_auc_precision_recall)
deep_autoencoders_performance = _performance_row(
    'Deep Autoencoders', deep_autoencoders_report, deep_autoencoders_time,
    deep_autoencoders_auc, deep_ae_auc_precision_recall)
autoencoders_performance = _performance_row(
    'Autoencoders', autoencoders_report, autoencoders_time,
    autoencoders_auc, ae_auc_precision_recall)
rbf_oc_svm_performance = _performance_row(
    'OC-SVM rbf', rbf_oc_svm_report, rbf_oc_svm_time,
    rbf_oc_svm_auc, ocsvm_rbf_auc_precision_recall)
lin_oc_svm_performance = _performance_row(
    'OC-SVM linear', lin_oc_svm_report, lin_oc_svm_time,
    lin_oc_svm_auc, ocsvm_lin_auc_precision_recall)

In [None]:
# Add one row per method to the MNIST results table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# rows are turned into a frame once and concatenated in a single step.
performance_rows = pd.DataFrame(
    [iforest_performance, lof_performance, dbscan_performance, cade_performance,
     deep_autoencoders_performance, autoencoders_performance,
     rbf_oc_svm_performance, lin_oc_svm_performance],
    columns=mnist_performance.columns,
)
# Concatenating onto an empty, object-dtype frame can leave the numeric
# columns as object dtype, so an empty table is simply replaced.
if mnist_performance.empty:
    mnist_performance = performance_rows
else:
    mnist_performance = pd.concat([mnist_performance, performance_rows], ignore_index=True)

## Vowels

**Dataset source**: http://odds.cs.stonybrook.edu/japanese-vowels-data/

Shebuti Rayana (2016).  ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources**: 

C. C. Aggarwal and S. Sathe, “Theoretical foundations and algorithms for outlier ensembles.” ACM SIGKDD Explorations Newsletter, vol. 17, no. 1, pp. 24–47, 2015.

Saket Sathe and Charu C. Aggarwal. LODES: Local Density meets Spectral Outlier Detection. SIAM Conference on Data Mining, 2016.

In [None]:
data = pd.read_csv('./vowels.csv')

In [None]:
pd.pivot_table(data,
             values = 'Col1',
               index = 'y', 
              aggfunc = 'count')

In [None]:
data.shape

In [None]:
data.head()

### iForest

In [None]:
train_data = data.copy()

In [None]:
start = time.process_time()

# Fit on every column except the label, then predict and score the same rows.
feature_mask = train_data.columns != 'y'
rng = np.random.RandomState(42)
clf = IsolationForest(n_estimators=100, max_samples=256, random_state=rng)
clf.fit(train_data.loc[:, feature_mask])
y_pred = clf.predict(train_data.loc[:, feature_mask])
y_scores = clf.score_samples(train_data.loc[:, feature_mask])

end = time.process_time()
iforest_time = end - start
print(iforest_time)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
iforest_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
iforest_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
iforest_auc_precision_recall = metrics.auc(recall, precision)
print(iforest_auc_precision_recall)

### LOF

In [None]:
# Rescale every column of `data` to [0, 1] and restore the column names.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
scaled_values = min_max_scaler.fit_transform(data.copy())
train_data = pd.DataFrame(scaled_values, columns=data.columns)

In [None]:
start = time.process_time()

lof = LocalOutlierFactor(n_neighbors=15, contamination=.1, novelty=False)
y_pred = lof.fit_predict(train_data.loc[:, train_data.columns != 'y'])
y_scores = lof.negative_outlier_factor_

end = time.process_time()
lof_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
lof_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
lof_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
lof_auc_precision_recall = metrics.auc(recall, precision)
print(lof_auc_precision_recall)

### DBSCAN

In [None]:
# Rescale every column of `data` to [0, 1] and restore the column names.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
scaled_values = min_max_scaler.fit_transform(data.copy())
train_data = pd.DataFrame(scaled_values, columns=data.columns)

In [None]:
dist = dbscan_tuner(train_data,23)

In [None]:
start = time.process_time()

dbscan = DBSCAN(eps = 0.4, min_samples = 24)
clusters = dbscan.fit_predict(train_data.loc[:, train_data.columns != 'y'])

end = time.process_time()
dbscan_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = clusters
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)

In [None]:
# Feature columns only: the label is excluded from all distance computations.
original_columns = data.columns.to_list()
original_columns.remove('y')

# Score every clustered point by its Euclidean distance to the mean of its own
# cluster. The per-cluster frames are collected and concatenated once, because
# DataFrame.append was removed in pandas 2.0.
scored_clusters = []
for i in set(clusters):
    if i == -1:  # label -1 rows are scored separately via anomalies_data below
        continue
    score_data = train_data[train_data['y_pred'] == i].copy()
    center = score_data[original_columns].mean().to_numpy(dtype=float)
    # cdist against a single centre row yields one distance per point.
    score_data['score'] = distance.cdist(
        score_data[original_columns].to_numpy(dtype=float),
        center[np.newaxis, :],
    )[:, 0]
    scored_clusters.append(score_data)

if scored_clusters:
    data_w_score = pd.concat(scored_clusters, ignore_index=True)
else:
    # No cluster found: keep an empty frame with the expected columns.
    data_w_score = pd.DataFrame(columns=train_data.columns.to_list() + ['score'])

anomalies_data = train_data[train_data['y_pred'] == -1].copy()
# Numeric placeholder (instead of '') so the column stays float once filled.
anomalies_data['score'] = np.nan
cluster_centers = pd.pivot_table(train_data,
              index = 'y_pred',
              aggfunc = 'mean')

In [None]:
# Score each label -1 row by its Euclidean distance to the nearest cluster
# centre. One vectorised column assignment replaces the original
# `anomalies_data['score'].iloc[i] = ...` chained assignment, which pandas
# warns about and which stops writing through under copy-on-write.
real_centers = cluster_centers.loc[cluster_centers.index != -1, original_columns]
if len(anomalies_data) > 0:
    if real_centers.empty:
        raise ValueError('DBSCAN found no clusters, so noise points cannot be scored')
    distances_to_centers = distance.cdist(
        anomalies_data[original_columns].to_numpy(dtype=float),
        real_centers.to_numpy(dtype=float),
    )
    anomalies_data['score'] = distances_to_centers.min(axis=1)

data_for_auprc = pd.concat([data_w_score, anomalies_data])

In [None]:
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(data_for_auprc['y'], data_for_auprc['score'])
dbscan_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
dbscan_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(data_for_auprc['y'], data_for_auprc['score'])
dbscan_auc_precision_recall = metrics.auc(recall, precision)
print(dbscan_auc_precision_recall)

### CADE

Source: Aric LaBarr, webinar "Modern Approaches To Anomaly Detection", November 2, 2021

In [None]:
train_data = data.copy()

In [None]:
# Explicit copy, so adding 'Target' below never writes into a slice of train_data.
data_for_fake = train_data.loc[:, train_data.columns != 'y'].copy()

# Synthetic rows: each feature is drawn independently and uniformly between its
# observed minimum and maximum. The frame is built in one step instead of
# inserting columns one at a time, and no throw-away sample is drawn for 'Target'.
# NOTE(review): np.random is not seeded here, so the fake sample differs between runs.
n_rows = len(data_for_fake)
fake = pd.DataFrame({
    col: np.random.uniform(np.min(data_for_fake[col]), np.max(data_for_fake[col]), size=n_rows)
    for col in data_for_fake.columns
})

# Real rows are labelled 0, synthetic rows 1.
data_for_fake['Target'] = 0
fake['Target'] = 1

In [None]:
data_combined = pd.concat([data_for_fake, fake], axis = 0)

In [None]:
start = time.process_time()

# Fit the forest on all columns except 'Target', using 'Target' as the label.
is_target = data_combined.columns == 'Target'
model = RandomForestClassifier(n_estimators=100)
model.fit(data_combined.loc[:, ~is_target], data_combined['Target'])

# The second predict_proba column is stored as the per-row score.
real_features = train_data.loc[:, train_data.columns != 'y']
train_data['Target'] = model.predict_proba(real_features)[:, 1]

end = time.process_time()
cade_time = end - start
print(cade_time)

In [None]:
# Sweep the candidate cut-offs passed to def_outlier_cade and keep the one whose
# hard predictions give the highest ROC AUC against the labels.
# The results dict is deliberately not named `auc`: that name is imported from
# sklearn.metrics at the top of the file and would be shadowed.
auc_by_threshold = {}
for i in np.arange(0,0.55,0.05):
    train_data['prediction'] = train_data.apply(def_outlier_cade, args=(i,), axis = 1)
    fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['prediction'])
    auc_by_threshold[i] = metrics.auc(fpr, tpr)
max_key = max(auc_by_threshold, key=auc_by_threshold.get)

In [None]:
train_data['prediction'] = train_data.apply(def_outlier_cade,args=(max_key,), axis = 1)

In [None]:
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['Target'])
cade_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
cade_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['Target'])
cade_auc_precision_recall = metrics.auc(recall, precision)
print(cade_auc_precision_recall)

### Deep Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
class AutoEncoder(Model):
    """Deep autoencoder: 6-3-3 encoder and 3-6-output decoder with dropout.

    Parameters
    ----------
    output_units : width of the input; the final sigmoid layer reconstructs
        a vector of the same width.
    """

    def __init__(self, output_units):
        super().__init__()
        # Encoder layers are created before decoder layers, as in the
        # original definition.
        encoder_layers = [
            Dense(6, input_dim=output_units, activation='sigmoid'),
            Dropout(0.1),
            Dense(3, activation='relu'),
            Dropout(0.1),
            Dense(3, activation='relu'),
        ]
        self.encoder = Sequential(encoder_layers)
        decoder_layers = [
            Dense(3, activation='relu'),
            Dropout(0.1),
            Dense(6, activation='relu'),
            Dropout(0.1),
            Dense(output_units, activation='sigmoid'),
        ]
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Run *inputs* through the encoder, then the decoder."""
        return self.decoder(self.encoder(inputs))

In [None]:
def find_threshold(model, x_train_scaled):
    """Return mean + one standard deviation of the MSLE reconstruction errors."""
    reconstructed = model.predict(x_train_scaled)
    error_values = tf.keras.losses.msle(reconstructed, x_train_scaled).numpy()
    return np.mean(error_values) + np.std(error_values)


def get_predictions(model, x_train_scaled, threshold):
    """Flag rows whose MSLE reconstruction error exceeds *threshold*.

    Returns a pair ``(preds, errors)``: ``preds`` is a Series of 1.0
    (error above the threshold) / 0.0, ``errors`` is the raw msle output.
    """
    reconstructed = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(reconstructed, x_train_scaled)
    above_threshold = pd.Series(errors) > threshold
    preds = above_threshold.astype(float)
    return preds, errors

In [None]:
# Time the whole pipeline (build, fit, threshold, score) with process_time.
start = time.process_time()

n_features = x_train_scaled.shape[1]
model = AutoEncoder(output_units=n_features)
model.compile(optimizer='adam', loss='msle', metrics=['mse'])

# The network is trained to reproduce its own input.
history = model.fit(x_train_scaled, x_train_scaled, batch_size=64, epochs=20)

threshold = find_threshold(model, x_train_scaled)
predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
deep_autoencoders_time = end - start
print(deep_autoencoders_time)

In [None]:
confusion_matrix(target, predictions)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
deep_autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
deep_autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, scores)
deep_ae_auc_precision_recall = metrics.auc(recall, precision)
print(deep_ae_auc_precision_recall)

### Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
class AutoEncoder(Model):
    """Shallow autoencoder: one 6-unit ReLU layer with dropout, sigmoid output.

    Parameters
    ----------
    output_units : width of the input; the output layer reconstructs a
        vector of the same width.
    """

    def __init__(self, output_units):
        super().__init__()
        # Encoder layers are created before decoder layers, as in the
        # original definition.
        encoder_layers = [
            Dense(6, input_dim=output_units, activation='relu'),
            Dropout(0.1),
        ]
        self.encoder = Sequential(encoder_layers)
        decoder_layers = [Dense(output_units, activation='sigmoid')]
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Run *inputs* through the encoder, then the decoder."""
        return self.decoder(self.encoder(inputs))

In [None]:
def find_threshold(model, x_train_scaled):
    """Return mean + one standard deviation of the MSLE reconstruction errors."""
    reconstructed = model.predict(x_train_scaled)
    error_values = tf.keras.losses.msle(reconstructed, x_train_scaled).numpy()
    return np.mean(error_values) + np.std(error_values)


def get_predictions(model, x_train_scaled, threshold):
    """Flag rows whose MSLE reconstruction error exceeds *threshold*.

    Returns a pair ``(preds, errors)``: ``preds`` is a Series of 1.0
    (error above the threshold) / 0.0, ``errors`` is the raw msle output.
    """
    reconstructed = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(reconstructed, x_train_scaled)
    above_threshold = pd.Series(errors) > threshold
    preds = above_threshold.astype(float)
    return preds, errors

In [None]:
# Time the whole pipeline (build, fit, threshold, score) with process_time.
start = time.process_time()

n_features = x_train_scaled.shape[1]
model = AutoEncoder(output_units=n_features)
model.compile(optimizer='adam', loss='msle', metrics=['mse'])

# The network is trained to reproduce its own input.
history = model.fit(x_train_scaled, x_train_scaled, batch_size=64, epochs=20)

threshold = find_threshold(model, x_train_scaled)
predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
autoencoders_time = end - start
print(autoencoders_time)

In [None]:
confusion_matrix(target, predictions)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, scores)
ae_auc_precision_recall = metrics.auc(recall, precision)
print(ae_auc_precision_recall)

### One-Class SVM rbf

In [None]:
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
start = time.process_time()

# fit() returns the estimator, so construction and fitting are chained.
model = OneClassSVM(nu=0.1, gamma='scale').fit(x_train_scaled)
yhat = model.predict(x_train_scaled)
# The decision function is negated before it is used as the anomaly score.
y_scores = -model.decision_function(x_train_scaled)

end = time.process_time()
rbf_oc_svm_time = end - start
print(rbf_oc_svm_time)

In [None]:
predictions = pd.DataFrame(yhat, columns = ['y_pred']).apply(def_outlier, axis = 1)

In [None]:
confusion_matrix(target, predictions)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, y_scores)
rbf_oc_svm_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
rbf_oc_svm_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, y_scores)
ocsvm_rbf_auc_precision_recall = metrics.auc(recall, precision)
print(ocsvm_rbf_auc_precision_recall)

### One-Class SVM linear

In [None]:
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
start = time.process_time()

# fit() returns the estimator, so construction and fitting are chained.
model = OneClassSVM(nu=0.1, kernel='linear').fit(x_train_scaled)
yhat = model.predict(x_train_scaled)
# The decision function is negated before it is used as the anomaly score.
y_scores = -model.decision_function(x_train_scaled)

end = time.process_time()
lin_oc_svm_time = end - start
print(lin_oc_svm_time)

In [None]:
predictions = pd.DataFrame(yhat, columns = ['y_pred']).apply(def_outlier, axis = 1)

In [None]:
confusion_matrix(target, predictions)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, y_scores)
lin_oc_svm_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
lin_oc_svm_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, y_scores)
ocsvm_lin_auc_precision_recall = metrics.auc(recall, precision)
print(ocsvm_lin_auc_precision_recall)

### Performance

In [None]:
vowels_performance = pd.DataFrame(columns = ['method','f1-score','sensitivity',
                                             'precision','Time','AUC','AU precision-recall curve'])

In [None]:
def _performance_row(method, report, elapsed, roc_auc, pr_auc):
    """Build one results row from the class-'1' entry of a classification report."""
    positive = report['1']
    return {'method': method,
            'f1-score': positive['f1-score'],
            'sensitivity': positive['recall'],
            'precision': positive['precision'],
            'Time': elapsed,
            'AUC': roc_auc,
            'AU precision-recall curve': pr_auc}


iforest_performance = _performance_row(
    'iForest', iforest_report, iforest_time,
    iforest_auc, iforest_auc_precision_recall)
lof_performance = _performance_row(
    'LOF', lof_report, lof_time,
    lof_auc, lof_auc_precision_recall)
dbscan_performance = _performance_row(
    'DBSCAN', dbscan_report, dbscan_time,
    dbscan_auc, dbscan_auc_precision_recall)
cade_performance = _performance_row(
    'CADE', cade_report, cade_time,
    cade_auc, cade_auc_precision_recall)
deep_autoencoders_performance = _performance_row(
    'Deep Autoencoders', deep_autoencoders_report, deep_autoencoders_time,
    deep_autoencoders_auc, deep_ae_auc_precision_recall)
autoencoders_performance = _performance_row(
    'Autoencoders', autoencoders_report, autoencoders_time,
    autoencoders_auc, ae_auc_precision_recall)
rbf_oc_svm_performance = _performance_row(
    'OC-SVM rbf', rbf_oc_svm_report, rbf_oc_svm_time,
    rbf_oc_svm_auc, ocsvm_rbf_auc_precision_recall)
lin_oc_svm_performance = _performance_row(
    'OC-SVM linear', lin_oc_svm_report, lin_oc_svm_time,
    lin_oc_svm_auc, ocsvm_lin_auc_precision_recall)

In [None]:
# Add one row per method to the vowels results table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# rows are turned into a frame once and concatenated in a single step.
performance_rows = pd.DataFrame(
    [iforest_performance, lof_performance, dbscan_performance, cade_performance,
     deep_autoencoders_performance, autoencoders_performance,
     rbf_oc_svm_performance, lin_oc_svm_performance],
    columns=vowels_performance.columns,
)
# Concatenating onto an empty, object-dtype frame can leave the numeric
# columns as object dtype, so an empty table is simply replaced.
if vowels_performance.empty:
    vowels_performance = performance_rows
else:
    vowels_performance = pd.concat([vowels_performance, performance_rows], ignore_index=True)

## Seismic

**Dataset source**: http://odds.cs.stonybrook.edu/seismic-dataset/ (data is transformed from .arff to .csv format)

Shebuti Rayana (2016).  ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources**: 

Saket Sathe and Charu C. Aggarwal. LODES: Local Density meets Spectral Outlier Detection. SIAM Conference on Data Mining, 2016.

In [None]:
data = pd.read_csv('./seismic.csv', sep = ',')

In [None]:
# dropping the columns that consist only of 0s
data = data.drop(columns = ['nbumps6','nbumps7','nbumps89'])

In [None]:
dummies = pd.get_dummies(data[['seismic','seismoacoustic','shift','ghazard']])

In [None]:
data = pd.concat([data, dummies], axis = 1)

In [None]:
data = data.drop(columns = ['seismic','seismoacoustic','shift','ghazard'])

In [None]:
pd.pivot_table(data,
             values = 'genergy',
               index = 'class', 
              aggfunc = 'count')

In [None]:
data.shape

In [None]:
data.head()

### iForest

In [None]:
train_data = data.copy()

In [None]:
start = time.process_time()

# Fit on every column except the label, then predict and score the same rows.
feature_mask = train_data.columns != 'class'
rng = np.random.RandomState(42)
clf = IsolationForest(n_estimators=100, max_samples=256, random_state=rng)
clf.fit(train_data.loc[:, feature_mask])
y_pred = clf.predict(train_data.loc[:, feature_mask])
y_scores = clf.score_samples(train_data.loc[:, feature_mask])

end = time.process_time()
iforest_time = end - start
print(iforest_time)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
confusion_matrix(train_data['class'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['class'], train_data['y_scores'])
iforest_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
iforest_report = classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['class'], train_data['y_scores'])
iforest_auc_precision_recall = metrics.auc(recall, precision)
print(iforest_auc_precision_recall)

### LOF

In [None]:
# Rescale every column of `data` to [0, 1] and restore the column names.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
scaled_values = min_max_scaler.fit_transform(data.copy())
train_data = pd.DataFrame(scaled_values, columns=data.columns)

In [None]:
start = time.process_time()
lof = LocalOutlierFactor(n_neighbors=26, contamination=.1)
y_pred = lof.fit_predict(train_data.loc[:, train_data.columns != 'class'])
y_scores = lof.negative_outlier_factor_
end = time.process_time()
lof_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
confusion_matrix(train_data['class'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['class'], train_data['y_scores'])
lof_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
lof_report = classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['class'], train_data['y_scores'])
lof_auc_precision_recall = metrics.auc(recall, precision)
print(lof_auc_precision_recall)

### DBSCAN

In [None]:
# Rescale every column of `data` to [0, 1] and restore the column names.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
scaled_values = min_max_scaler.fit_transform(data.copy())
train_data = pd.DataFrame(scaled_values, columns=data.columns)

In [None]:
dist = dbscan_tuner(train_data,41)

In [None]:
start = time.process_time()
dbscan = DBSCAN(eps = 0.25, min_samples = 42)
clusters = dbscan.fit_predict(train_data.loc[:, train_data.columns != 'class'])
end = time.process_time()
dbscan_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = clusters
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)

In [None]:
# Feature columns only: the label is excluded from all distance computations.
original_columns = data.columns.to_list()
original_columns.remove('class')

# Score every clustered point by its Euclidean distance to the mean of its own
# cluster. The per-cluster frames are collected and concatenated once, because
# DataFrame.append was removed in pandas 2.0.
scored_clusters = []
for i in set(clusters):
    if i == -1:  # label -1 rows are scored separately via anomalies_data below
        continue
    score_data = train_data[train_data['y_pred'] == i].copy()
    center = score_data[original_columns].mean().to_numpy(dtype=float)
    # cdist against a single centre row yields one distance per point.
    score_data['score'] = distance.cdist(
        score_data[original_columns].to_numpy(dtype=float),
        center[np.newaxis, :],
    )[:, 0]
    scored_clusters.append(score_data)

if scored_clusters:
    data_w_score = pd.concat(scored_clusters, ignore_index=True)
else:
    # No cluster found: keep an empty frame with the expected columns.
    data_w_score = pd.DataFrame(columns=train_data.columns.to_list() + ['score'])

anomalies_data = train_data[train_data['y_pred'] == -1].copy()
# Numeric placeholder (instead of '') so the column stays float once filled.
anomalies_data['score'] = np.nan
cluster_centers = pd.pivot_table(train_data,
              index = 'y_pred',
              aggfunc = 'mean')

In [None]:
# Score each label -1 row by its Euclidean distance to the nearest cluster
# centre. One vectorised column assignment replaces the original
# `anomalies_data['score'].iloc[i] = ...` chained assignment, which pandas
# warns about and which stops writing through under copy-on-write.
real_centers = cluster_centers.loc[cluster_centers.index != -1, original_columns]
if len(anomalies_data) > 0:
    if real_centers.empty:
        raise ValueError('DBSCAN found no clusters, so noise points cannot be scored')
    distances_to_centers = distance.cdist(
        anomalies_data[original_columns].to_numpy(dtype=float),
        real_centers.to_numpy(dtype=float),
    )
    anomalies_data['score'] = distances_to_centers.min(axis=1)

data_for_auprc = pd.concat([data_w_score, anomalies_data])

In [None]:
confusion_matrix(train_data['class'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(data_for_auprc['class'], data_for_auprc['score'])
dbscan_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
dbscan_report = classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(data_for_auprc['class'], data_for_auprc['score'])
dbscan_auc_precision_recall = metrics.auc(recall, precision)
print(dbscan_auc_precision_recall)

### CADE

Source: Aric LaBarr, webinar "Modern Approaches To Anomaly Detection", November 2, 2021

In [None]:
train_data = data.copy()

In [None]:
# Explicit copy, so adding 'Target' below never writes into a slice of train_data.
data_for_fake = train_data.loc[:, train_data.columns != 'class'].copy()

# Synthetic rows: each feature is drawn independently and uniformly between its
# observed minimum and maximum. The frame is built in one step instead of
# inserting columns one at a time, and no throw-away sample is drawn for 'Target'.
# NOTE(review): np.random is not seeded here, so the fake sample differs between runs.
n_rows = len(data_for_fake)
fake = pd.DataFrame({
    col: np.random.uniform(np.min(data_for_fake[col]), np.max(data_for_fake[col]), size=n_rows)
    for col in data_for_fake.columns
})

# Real rows are labelled 0, synthetic rows 1.
data_for_fake['Target'] = 0
fake['Target'] = 1

In [None]:
data_combined = pd.concat([data_for_fake, fake], axis = 0)

In [None]:
start = time.process_time()

# Fit the forest on all columns except 'Target', using 'Target' as the label.
is_target = data_combined.columns == 'Target'
model = RandomForestClassifier(n_estimators=100)
model.fit(data_combined.loc[:, ~is_target], data_combined['Target'])

# The second predict_proba column is stored as the per-row score.
real_features = train_data.loc[:, train_data.columns != 'class']
train_data['Target'] = model.predict_proba(real_features)[:, 1]

end = time.process_time()
cade_time = end - start
print(cade_time)

In [None]:
# Sweep the candidate cut-offs passed to def_outlier_cade and keep the one whose
# hard predictions give the highest ROC AUC against the labels.
# The results dict is deliberately not named `auc`: that name is imported from
# sklearn.metrics at the top of the file and would be shadowed.
auc_by_threshold = {}
for i in np.arange(0,0.55,0.05):
    train_data['prediction'] = train_data.apply(def_outlier_cade, args=(i,), axis = 1)
    fpr, tpr, _ = metrics.roc_curve(train_data['class'], train_data['prediction'])
    auc_by_threshold[i] = metrics.auc(fpr, tpr)
max_key = max(auc_by_threshold, key=auc_by_threshold.get)

In [None]:
train_data['prediction'] = train_data.apply(def_outlier_cade,args=(max_key,), axis = 1)

In [None]:
confusion_matrix(train_data['class'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['class'], train_data['Target'])
cade_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
cade_report = classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['class'], train_data['Target'])
cade_auc_precision_recall = metrics.auc(recall, precision)
print(cade_auc_precision_recall)

### Deep Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
features = data.drop(columns = ['class'])
target = data['class']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
class AutoEncoder(Model):
    """Dense autoencoder with layer widths 10 -> 5 -> 3 -> 5 -> 10 -> output_units.

    output_units is used both as the input width of the first layer and as the
    width of the final (sigmoid) reconstruction layer.
    """

    def __init__(self, output_units):
        super().__init__()
        # Encoder: compress the input down to the 3-unit bottleneck.
        encoder_layers = [
            Dense(10, input_dim=output_units, activation='sigmoid'),
            Dropout(0.1),
            Dense(5, activation='relu'),
            Dropout(0.1),
            Dense(3, activation='relu'),
        ]
        self.encoder = Sequential(encoder_layers)
        # Decoder: mirror of the encoder, ending in a sigmoid output layer.
        decoder_layers = [
            Dense(5, activation='relu'),
            Dropout(0.1),
            Dense(10, activation='relu'),
            Dropout(0.1),
            Dense(output_units, activation='sigmoid'),
        ]
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Encode the inputs and return their reconstruction."""
        return self.decoder(self.encoder(inputs))

In [None]:
def find_threshold(model, x_train_scaled):
    """Return mean + one standard deviation of the reconstruction MSLE values.

    The model's reconstruction of x_train_scaled is compared with the input
    itself using tf.keras.losses.msle.
    """
    rebuilt = model.predict(x_train_scaled)
    error_values = tf.keras.losses.msle(rebuilt, x_train_scaled).numpy()
    return np.mean(error_values) + np.std(error_values)

def get_predictions(model, x_train_scaled, threshold):
    """Return (labels, errors) for x_train_scaled.

    labels is a pandas Series holding 1.0 where the reconstruction MSLE exceeds
    threshold and 0.0 otherwise; errors is the raw msle result.
    """
    rebuilt = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(rebuilt, x_train_scaled)
    is_anomaly = pd.Series(errors) > threshold
    preds = is_anomaly.map(lambda flagged: 1.0 if flagged else 0.0)
    return preds, errors

In [None]:
start = time.process_time()

model = AutoEncoder(output_units=x_train_scaled.shape[1])

model.compile(loss='msle', metrics=['mse'], optimizer='adam')

history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=64,
)

threshold = find_threshold(model, x_train_scaled)

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
deep_autoencoders_time = end - start
print(end - start)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
deep_autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
deep_autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, scores)
deep_ae_auc_precision_recall = metrics.auc(recall, precision)
print(deep_ae_auc_precision_recall)

### Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
features = data.drop(columns = ['class'])
target = data['class']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
class AutoEncoder(Model):
    """Shallow autoencoder: one 10-unit ReLU layer with dropout, then a sigmoid output.

    output_units is used both as the input width and as the output width.
    """

    def __init__(self, output_units):
        super().__init__()
        hidden = Dense(10, input_dim=output_units, activation='relu')
        self.encoder = Sequential([hidden, Dropout(0.1)])
        reconstruction = Dense(output_units, activation='sigmoid')
        self.decoder = Sequential([reconstruction])

    def call(self, inputs):
        """Encode the inputs and return their reconstruction."""
        return self.decoder(self.encoder(inputs))

In [None]:
def find_threshold(model, x_train_scaled):
    """Return the anomaly cut-off: mean plus one standard deviation of the MSLE
    between the model's reconstruction and x_train_scaled."""
    msle_values = tf.keras.losses.msle(model.predict(x_train_scaled), x_train_scaled)
    msle_array = msle_values.numpy()
    return np.mean(msle_array) + np.std(msle_array)

def get_predictions(model, x_train_scaled, threshold):
    """Score x_train_scaled by reconstruction MSLE and threshold it.

    Returns a pandas Series of 1.0 (error above threshold) / 0.0 labels together
    with the raw msle result.
    """
    errors = tf.keras.losses.msle(model.predict(x_train_scaled), x_train_scaled)
    above_threshold = pd.Series(errors) > threshold
    preds = above_threshold.map(lambda flagged: 1.0 if flagged else 0.0)
    return preds, errors

In [None]:
start = time.process_time()

model = AutoEncoder(output_units=x_train_scaled.shape[1])

model.compile(loss='msle', metrics=['mse'], optimizer='adam')

history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=64,
)

threshold = find_threshold(model, x_train_scaled)

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
autoencoders_time = end - start
print(end - start)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, scores)
ae_auc_precision_recall = metrics.auc(recall, precision)
print(ae_auc_precision_recall)

### One-Class SVM rbf

In [None]:
features = data.drop(columns = ['class'])
target = data['class']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
start = time.process_time()

model = OneClassSVM(gamma='scale', nu=0.1)
model.fit(x_train_scaled)
yhat = model.predict(x_train_scaled)
y_scores = -model.decision_function(x_train_scaled)

end = time.process_time()
rbf_oc_svm_time = end - start
print(end - start)

In [None]:
predictions = pd.DataFrame(yhat, columns = ['y_pred']).apply(def_outlier, axis = 1)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, y_scores)
rbf_oc_svm_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
rbf_oc_svm_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, y_scores)
ocsvm_rbf_auc_precision_recall = metrics.auc(recall, precision)
print(ocsvm_rbf_auc_precision_recall)

### One-Class SVM linear

In [None]:
features = data.drop(columns = ['class'])
target = data['class']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
start = time.process_time()

model = OneClassSVM(kernel = 'linear', nu=0.1)
model.fit(x_train_scaled)
yhat = model.predict(x_train_scaled)
y_scores = -model.decision_function(x_train_scaled)

end = time.process_time()
lin_oc_svm_time = end - start
print(end - start)

In [None]:
predictions = pd.DataFrame(yhat, columns = ['y_pred']).apply(def_outlier, axis = 1)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, y_scores)
lin_oc_svm_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
lin_oc_svm_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, y_scores)
ocsvm_lin_auc_precision_recall = metrics.auc(recall, precision)
print(ocsvm_lin_auc_precision_recall)

### Performance

In [None]:
# Empty results table for this dataset; it fixes the column order and is filled
# with one row per method further down.
seismic_performance = pd.DataFrame(columns = ['method','f1-score','sensitivity',
                                              'precision','Time','AUC', 'AU precision-recall curve'])

In [None]:
def _performance_row(method, report, elapsed, roc_auc, pr_auc):
    """Build one results-table row for a method.

    report is a classification_report dict; only its '1' entry is read
    (f1-score, recall, precision).
    """
    positive = report['1']
    return {'method': method,
            'f1-score': positive['f1-score'],
            'sensitivity': positive['recall'],
            'precision': positive['precision'],
            'Time': elapsed,
            'AUC': roc_auc,
            'AU precision-recall curve': pr_auc}


iforest_performance = _performance_row('iForest', iforest_report, iforest_time,
                                       iforest_auc, iforest_auc_precision_recall)
lof_performance = _performance_row('LOF', lof_report, lof_time,
                                   lof_auc, lof_auc_precision_recall)
dbscan_performance = _performance_row('DBSCAN', dbscan_report, dbscan_time,
                                      dbscan_auc, dbscan_auc_precision_recall)
cade_performance = _performance_row('CADE', cade_report, cade_time,
                                    cade_auc, cade_auc_precision_recall)
deep_autoencoders_performance = _performance_row('Deep Autoencoders', deep_autoencoders_report,
                                                 deep_autoencoders_time, deep_autoencoders_auc,
                                                 deep_ae_auc_precision_recall)
autoencoders_performance = _performance_row('Autoencoders', autoencoders_report,
                                            autoencoders_time, autoencoders_auc,
                                            ae_auc_precision_recall)
rbf_oc_svm_performance = _performance_row('OC-SVM rbf', rbf_oc_svm_report, rbf_oc_svm_time,
                                          rbf_oc_svm_auc, ocsvm_rbf_auc_precision_recall)
lin_oc_svm_performance = _performance_row('OC-SVM linear', lin_oc_svm_report, lin_oc_svm_time,
                                          lin_oc_svm_auc, ocsvm_lin_auc_precision_recall)

In [None]:
# DataFrame.append was removed in pandas 2.0, so the eight rows are collected
# into one frame and attached in a single step.
method_rows = pd.DataFrame(
    [iforest_performance,
     lof_performance,
     dbscan_performance,
     cade_performance,
     deep_autoencoders_performance,
     autoencoders_performance,
     rbf_oc_svm_performance,
     lin_oc_svm_performance],
    columns=seismic_performance.columns,
)
# While the table is still empty the new rows simply become the table; this
# keeps the numeric columns numeric instead of inheriting the empty frame's
# object dtype. Otherwise the rows are appended below the existing ones.
if seismic_performance.empty:
    seismic_performance = method_rows
else:
    seismic_performance = pd.concat([seismic_performance, method_rows], ignore_index=True)

## Musk

**Dataset source**: http://odds.cs.stonybrook.edu/musk-dataset/ (the data was converted from .mat to .csv format)

Shebuti Rayana (2016).  ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources**:

C. C. Aggarwal and S. Sathe, “Theoretical foundations and algorithms for outlier ensembles.” ACM SIGKDD Explorations Newsletter, vol. 17, no. 1, pp. 24–47, 2015.

In [None]:
# Load the Musk dataset from a CSV file in the working directory.
data = pd.read_csv('./musk.csv', sep = ',')

In [None]:
data.head()

In [None]:
# Store the label column 'y' as integers.
data['y'] = data['y'].astype(int)

In [None]:
data.shape

In [None]:
# Number of rows per value of the label 'y' (counted on column 'Col1').
pd.pivot_table(data,
             values = 'Col1',
               index = 'y', 
              aggfunc = 'count')

### iForest

In [None]:
train_data = data.copy()

In [None]:
start = time.process_time()
rng = np.random.RandomState(42)
clf = IsolationForest(max_samples = 256, random_state=rng, n_estimators = 100)
clf.fit(train_data.loc[:, train_data.columns != 'y'])
y_pred = clf.predict(train_data.loc[:, train_data.columns != 'y'])
y_scores = clf.score_samples(train_data.loc[:, train_data.columns != 'y'])
end = time.process_time()
iforest_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
iforest_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
iforest_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
iforest_auc_precision_recall = metrics.auc(recall, precision)
print(iforest_auc_precision_recall)

### LOF

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = min_max_scaler.fit_transform(data.copy())
train_data = pd.DataFrame(train_data)
train_data.columns = data.columns

In [None]:
start = time.process_time()
lof = LocalOutlierFactor(n_neighbors=31, contamination=.1)
y_pred = lof.fit_predict(train_data.loc[:, train_data.columns != 'y'])
y_scores = lof.negative_outlier_factor_
end = time.process_time()
lof_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
lof_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
lof_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
lof_auc_precision_recall = metrics.auc(recall, precision)
print(lof_auc_precision_recall)

### DBSCAN

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = min_max_scaler.fit_transform(data.copy())
train_data = pd.DataFrame(train_data)
train_data.columns = data.columns

In [None]:
dist = dbscan_tuner(train_data,331)

In [None]:
start = time.process_time()
dbscan = DBSCAN(eps = 3.2, min_samples = 332)
clusters = dbscan.fit_predict(train_data.loc[:, train_data.columns != 'y'])
end = time.process_time()
dbscan_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = clusters
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)

In [None]:
# Feature columns only (everything in `data` except the label 'y').
original_columns = data.columns.to_list()
original_columns.remove('y')

# Score every clustered row by its Euclidean distance to the mean of its own
# cluster. The frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and the distance is computed for
# the whole cluster at once instead of row by row.
scored_clusters = []
for i in sorted(set(clusters)):
    if i == -1:
        # Noise points are scored separately, against the nearest cluster centre.
        continue
    score_data = train_data[train_data['y_pred'] == i].copy()
    center = score_data[original_columns].mean().to_list()
    score_data['score'] = np.linalg.norm(
        score_data[original_columns].to_numpy() - np.asarray(center), axis=1)
    scored_clusters.append(score_data)

if scored_clusters:
    data_w_score = pd.concat(scored_clusters, ignore_index=True)
else:
    # DBSCAN found no cluster at all: keep an empty frame with the expected columns.
    data_w_score = pd.DataFrame(columns=train_data.columns.to_list() + ['score'])

# Rows labelled as noise; their score is filled in by the next cell. NaN (not
# an empty string) keeps the column numeric.
anomalies_data = train_data[train_data['y_pred'] == -1].copy()
anomalies_data['score'] = np.nan
# Per-cluster column means, indexed by cluster label (the noise label -1 included).
cluster_centers = pd.pivot_table(train_data,
              index = 'y_pred',
              aggfunc = 'mean')

In [None]:
# Score each noise point by its Euclidean distance to the nearest cluster
# centre. One cdist call replaces the per-row loop, and assigning the whole
# column avoids the chained `['score'].iloc[i] = ...` write, which raises
# SettingWithCopyWarning and has no effect under pandas copy-on-write.
real_centers = cluster_centers.loc[cluster_centers.index != -1, original_columns]
if real_centers.empty:
    # No cluster was found, so there is no centre to measure against.
    anomalies_data['score'] = np.nan
else:
    anomalies_data['score'] = distance.cdist(
        anomalies_data[original_columns].to_numpy(),
        real_centers.to_numpy(),
    ).min(axis=1)

# Clustered rows and noise rows together, each with its label 'y' and score.
data_for_auprc = pd.concat([data_w_score, anomalies_data])

In [None]:
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(data_for_auprc['y'], data_for_auprc['score'])
dbscan_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
dbscan_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(data_for_auprc['y'], data_for_auprc['score'])
dbscan_auc_precision_recall = metrics.auc(recall, precision)
print(dbscan_auc_precision_recall)

### CADE

Source: Aric LaBarr, webinar "Modern Approaches To Anomaly Detection", November 2, 2021

In [None]:
train_data = data.copy()

In [None]:
fake = pd.DataFrame()
data_for_fake = train_data.loc[:, train_data.columns != 'y']
data_for_fake['Target'] = 0
for i in data_for_fake.columns:
    fake[i] = np.random.uniform(np.min(data_for_fake[i]), np.max(data_for_fake[i]), size = len(data_for_fake[i]))
fake['Target'] = 1

In [None]:
data_combined = pd.concat([data_for_fake, fake], axis = 0)

In [None]:
start = time.process_time()
model = RandomForestClassifier(n_estimators = 100)
model.fit(data_combined.loc[:, data_combined.columns != 'Target'],data_combined.loc[:, data_combined.columns == 'Target']['Target'] )
train_data['Target'] = model.predict_proba(train_data.loc[:, train_data.columns != 'y'])[:,1]
end = time.process_time()
cade_time = end - start
print(end - start)

In [None]:
# Sweep candidate cut-offs 0.00, 0.05, ..., 0.50 and keep the one whose hard
# predictions give the highest ROC AUC against the 'y' labels.
# NOTE(review): `auc` rebinds the name imported from sklearn.metrics at the top
# of the file; this cell only calls `metrics.auc`, so it still works.
auc = {}
for i in np.arange(0,0.55,0.05):
    # def_outlier_cade is defined elsewhere in the notebook; presumably it labels
    # a row from its 'Target' score and the cut-off `i` — TODO confirm.
    train_data['prediction'] = train_data.apply(def_outlier_cade, args=(i,), axis = 1)
    fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['prediction'])
    auc[i] = metrics.auc(fpr, tpr)
max_key = max(auc, key=auc.get)

In [None]:
# Re-label every row using the best cut-off (max_key).
train_data['prediction'] = train_data.apply(def_outlier_cade,args=(max_key,), axis = 1)

In [None]:
confusion_matrix(train_data['y'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['Target'])
cade_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
cade_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['Target'])
cade_auc_precision_recall = metrics.auc(recall, precision)
print(cade_auc_precision_recall)

### Deep Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
class AutoEncoder(Model):
    """Dense autoencoder with layer widths 83 -> 41 -> 20 -> 41 -> 83 -> output_units.

    output_units is used both as the input width of the first layer and as the
    width of the final (sigmoid) reconstruction layer.
    """

    def __init__(self, output_units):
        super().__init__()
        # Encoder: compress the input down to the 20-unit bottleneck.
        encoder_layers = [
            Dense(83, input_dim=output_units, activation='sigmoid'),
            Dropout(0.1),
            Dense(41, activation='relu'),
            Dropout(0.1),
            Dense(20, activation='relu'),
        ]
        self.encoder = Sequential(encoder_layers)
        # Decoder: mirror of the encoder, ending in a sigmoid output layer.
        decoder_layers = [
            Dense(41, activation='relu'),
            Dropout(0.1),
            Dense(83, activation='relu'),
            Dropout(0.1),
            Dense(output_units, activation='sigmoid'),
        ]
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Encode the inputs and return their reconstruction."""
        return self.decoder(self.encoder(inputs))

In [None]:
def find_threshold(model, x_train_scaled):
    """Return mean + one standard deviation of the reconstruction MSLE values.

    The model's reconstruction of x_train_scaled is compared with the input
    itself using tf.keras.losses.msle.
    """
    rebuilt = model.predict(x_train_scaled)
    error_values = tf.keras.losses.msle(rebuilt, x_train_scaled).numpy()
    return np.mean(error_values) + np.std(error_values)

def get_predictions(model, x_train_scaled, threshold):
    """Return (labels, errors) for x_train_scaled.

    labels is a pandas Series holding 1.0 where the reconstruction MSLE exceeds
    threshold and 0.0 otherwise; errors is the raw msle result.
    """
    rebuilt = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(rebuilt, x_train_scaled)
    is_anomaly = pd.Series(errors) > threshold
    preds = is_anomaly.map(lambda flagged: 1.0 if flagged else 0.0)
    return preds, errors

In [None]:
start = time.process_time()

model = AutoEncoder(output_units=x_train_scaled.shape[1])

model.compile(loss='msle', metrics=['mse'], optimizer='adam')

history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=64,
)

threshold = find_threshold(model, x_train_scaled)

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
deep_autoencoders_time = end - start
print(end - start)

In [None]:
confusion_matrix(target, predictions)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
deep_autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
deep_autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, scores)
deep_ae_auc_precision_recall = metrics.auc(recall, precision)
print(deep_ae_auc_precision_recall)

### Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
class AutoEncoder(Model):
    """Shallow autoencoder: one 83-unit ReLU layer with dropout, then a sigmoid output.

    output_units is used both as the input width and as the output width.
    """

    def __init__(self, output_units):
        super().__init__()
        hidden = Dense(83, input_dim=output_units, activation='relu')
        self.encoder = Sequential([hidden, Dropout(0.1)])
        reconstruction = Dense(output_units, activation='sigmoid')
        self.decoder = Sequential([reconstruction])

    def call(self, inputs):
        """Encode the inputs and return their reconstruction."""
        return self.decoder(self.encoder(inputs))

In [None]:
def find_threshold(model, x_train_scaled):
    """Return the anomaly cut-off: mean plus one standard deviation of the MSLE
    between the model's reconstruction and x_train_scaled."""
    msle_values = tf.keras.losses.msle(model.predict(x_train_scaled), x_train_scaled)
    msle_array = msle_values.numpy()
    return np.mean(msle_array) + np.std(msle_array)

def get_predictions(model, x_train_scaled, threshold):
    """Score x_train_scaled by reconstruction MSLE and threshold it.

    Returns a pandas Series of 1.0 (error above threshold) / 0.0 labels together
    with the raw msle result.
    """
    errors = tf.keras.losses.msle(model.predict(x_train_scaled), x_train_scaled)
    above_threshold = pd.Series(errors) > threshold
    preds = above_threshold.map(lambda flagged: 1.0 if flagged else 0.0)
    return preds, errors

In [None]:
start = time.process_time()

model = AutoEncoder(output_units=x_train_scaled.shape[1])

model.compile(loss='msle', metrics=['mse'], optimizer='adam')

history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=64,
)

threshold = find_threshold(model, x_train_scaled)

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
autoencoders_time = end - start
print(end - start)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, scores)
ae_auc_precision_recall = metrics.auc(recall, precision)
print(ae_auc_precision_recall)

### One-Class SVM rbf

In [None]:
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
start = time.process_time()

model = OneClassSVM(gamma='scale', nu=0.1)
model.fit(x_train_scaled)
yhat = model.predict(x_train_scaled)
y_scores = -model.decision_function(x_train_scaled)

end = time.process_time()
rbf_oc_svm_time = end - start
print(end - start)

In [None]:
predictions = pd.DataFrame(yhat, columns = ['y_pred']).apply(def_outlier, axis = 1)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, y_scores)
rbf_oc_svm_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
rbf_oc_svm_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, y_scores)
ocsvm_rbf_auc_precision_recall = metrics.auc(recall, precision)
print(ocsvm_rbf_auc_precision_recall)

### One-Class SVM linear

In [None]:
features = data.drop(columns = ['y'])
target = data['y']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
start = time.process_time()

model = OneClassSVM(kernel = 'linear', nu=0.1)
model.fit(x_train_scaled)
yhat = model.predict(x_train_scaled)
y_scores = -model.decision_function(x_train_scaled)

end = time.process_time()
lin_oc_svm_time = end - start
print(end - start)

In [None]:
predictions = pd.DataFrame(yhat, columns = ['y_pred']).apply(def_outlier, axis = 1)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, y_scores)
lin_oc_svm_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
lin_oc_svm_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, y_scores)
ocsvm_lin_auc_precision_recall = metrics.auc(recall, precision)
print(ocsvm_lin_auc_precision_recall)

### Performance

In [None]:
# Empty results table for this dataset; it fixes the column order and is filled
# with one row per method further down.
musk_performance = pd.DataFrame(columns = ['method','f1-score','sensitivity',
                                           'precision','Time','AUC', 'AU precision-recall curve'])

In [None]:
def _performance_row(method, report, elapsed, roc_auc, pr_auc):
    """Build one results-table row for a method.

    report is a classification_report dict; only its '1' entry is read
    (f1-score, recall, precision).
    """
    positive = report['1']
    return {'method': method,
            'f1-score': positive['f1-score'],
            'sensitivity': positive['recall'],
            'precision': positive['precision'],
            'Time': elapsed,
            'AUC': roc_auc,
            'AU precision-recall curve': pr_auc}


iforest_performance = _performance_row('iForest', iforest_report, iforest_time,
                                       iforest_auc, iforest_auc_precision_recall)
lof_performance = _performance_row('LOF', lof_report, lof_time,
                                   lof_auc, lof_auc_precision_recall)
dbscan_performance = _performance_row('DBSCAN', dbscan_report, dbscan_time,
                                      dbscan_auc, dbscan_auc_precision_recall)
cade_performance = _performance_row('CADE', cade_report, cade_time,
                                    cade_auc, cade_auc_precision_recall)
deep_autoencoders_performance = _performance_row('Deep Autoencoders', deep_autoencoders_report,
                                                 deep_autoencoders_time, deep_autoencoders_auc,
                                                 deep_ae_auc_precision_recall)
autoencoders_performance = _performance_row('Autoencoders', autoencoders_report,
                                            autoencoders_time, autoencoders_auc,
                                            ae_auc_precision_recall)
rbf_oc_svm_performance = _performance_row('OC-SVM rbf', rbf_oc_svm_report, rbf_oc_svm_time,
                                          rbf_oc_svm_auc, ocsvm_rbf_auc_precision_recall)
lin_oc_svm_performance = _performance_row('OC-SVM linear', lin_oc_svm_report, lin_oc_svm_time,
                                          lin_oc_svm_auc, ocsvm_lin_auc_precision_recall)

In [None]:
# DataFrame.append was removed in pandas 2.0, so the eight rows are collected
# into one frame and attached in a single step.
method_rows = pd.DataFrame(
    [iforest_performance,
     lof_performance,
     dbscan_performance,
     cade_performance,
     deep_autoencoders_performance,
     autoencoders_performance,
     rbf_oc_svm_performance,
     lin_oc_svm_performance],
    columns=musk_performance.columns,
)
# While the table is still empty the new rows simply become the table; this
# keeps the numeric columns numeric instead of inheriting the empty frame's
# object dtype. Otherwise the rows are appended below the existing ones.
if musk_performance.empty:
    musk_performance = method_rows
else:
    musk_performance = pd.concat([musk_performance, method_rows], ignore_index=True)

## bank 

**Dataset source**: https://github.com/GuansongPang/ADRepository-Anomaly-detection-datasets/tree/main/categorical%20data

Pang, G., Shen, C., Cao, L., & Hengel, A. V. D. (2021). Deep learning for anomaly detection: A review. ACM Computing Surveys (CSUR), 54(2), 1-38.

In [None]:
# Load the bank dataset from a CSV file in the working directory.
data = pd.read_csv('./bank.csv')

In [None]:
data.head()

In [None]:
data.shape

In [None]:
# Number of rows per value of the label 'class' (counted on column 'age').
pd.pivot_table(data,
             values = 'age',
               index = 'class', 
              aggfunc = 'count')

### iForest

In [None]:
train_data = data.copy()

In [None]:
start = time.process_time()
rng = np.random.RandomState(42)
clf = IsolationForest(max_samples = 256, random_state=rng, n_estimators = 100)
clf.fit(train_data.loc[:, train_data.columns != 'class'])
y_pred = clf.predict(train_data.loc[:, train_data.columns != 'class'])
y_scores = clf.score_samples(train_data.loc[:, train_data.columns != 'class'])
end = time.process_time()
iforest_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
confusion_matrix(train_data['class'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['class'], train_data['y_scores'])
iforest_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
iforest_report = classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['class'], train_data['y_scores'])
iforest_auc_precision_recall = metrics.auc(recall, precision)
print(iforest_auc_precision_recall)

### LOF

In [None]:
# Min-max scale every column of a copy of the data (the label column included) to [0, 1]
# and rebuild a DataFrame carrying the original column names.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = pd.DataFrame(min_max_scaler.fit_transform(data.copy()),
                          columns=data.columns)

In [None]:
# Time LOF (412 neighbours, contamination 0.1) on the scaled non-label columns.
start = time.process_time()
lof = LocalOutlierFactor(n_neighbors=412, contamination=.1)
lof_features = train_data.loc[:, train_data.columns != 'class']
y_pred = lof.fit_predict(lof_features)
y_scores = lof.negative_outlier_factor_
end = time.process_time()
lof_time = end - start
print(lof_time)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
confusion_matrix(train_data['class'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['class'], train_data['y_scores'])
lof_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
lof_report = classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['class'], train_data['y_scores'])
lof_auc_precision_recall = metrics.auc(recall, precision)
print(lof_auc_precision_recall)

### DBSCAN

In [None]:
# Min-max scale every column of a copy of the data (the label column included) to [0, 1]
# and rebuild a DataFrame carrying the original column names.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = pd.DataFrame(min_max_scaler.fit_transform(data.copy()),
                          columns=data.columns)

In [None]:
# Plot the sorted distance from each point to its 123rd-nearest neighbour (see dbscan_tuner).
# NOTE(review): presumably the elbow of this curve is used to choose eps for the DBSCAN cell
# below, which uses min_samples = 124 -- confirm.
dist = dbscan_tuner(train_data,123)

In [None]:
# Time DBSCAN (eps = 2, min_samples = 124) on the scaled non-label columns.
start = time.process_time()
dbscan = DBSCAN(eps=2, min_samples=124)
dbscan_features = train_data.loc[:, train_data.columns != 'class']
clusters = dbscan.fit_predict(dbscan_features)
end = time.process_time()
dbscan_time = end - start
print(dbscan_time)

In [None]:
train_data['y_pred'] = clusters
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)

In [None]:
# Feature columns used for distances: every original column except the label.
original_columns = [col for col in data.columns if col != 'class']

# Score each clustered point by its Euclidean distance to the centroid of its own cluster.
# The per-cluster frames are collected and concatenated once: DataFrame.append was removed
# in pandas 2.0, and appending inside the loop re-copied the accumulated frame every time.
scored_clusters = []
for cluster_id in sorted(set(clusters) - {-1}):  # label -1 is skipped here (noise points)
    score_data = train_data[train_data['y_pred'] == cluster_id].copy()
    center = score_data[original_columns].mean().to_numpy()
    score_data['score'] = distance.cdist(score_data[original_columns].to_numpy(),
                                         center.reshape(1, -1)).ravel()
    scored_clusters.append(score_data)

if scored_clusters:
    data_w_score = pd.concat(scored_clusters, ignore_index=True)
else:
    # No cluster was found: keep an empty frame with the expected columns.
    data_w_score = pd.DataFrame(columns=train_data.columns.to_list() + ['score'])

# Points labelled -1 are scored in the following cell; start from NaN so the column is numeric.
anomalies_data = train_data[train_data['y_pred'] == -1].copy()
anomalies_data['score'] = np.nan
# Mean of every column per cluster label (the row for label -1 is the noise group).
cluster_centers = pd.pivot_table(train_data,
              index = 'y_pred',
              aggfunc = 'mean')

In [None]:
# Score every noise point (label -1) by its Euclidean distance to the nearest cluster centroid.
# A single vectorised column assignment replaces the chained
# `anomalies_data['score'].iloc[i] = ...`, which writes to a temporary object (and therefore
# does nothing) under pandas copy-on-write, and which looped over rows in Python.
real_centers = cluster_centers.loc[cluster_centers.index != -1, original_columns].to_numpy()
noise_points = anomalies_data[original_columns].to_numpy()
if len(noise_points):
    if not len(real_centers):
        raise ValueError('DBSCAN found no clusters, so noise points cannot be scored')
    anomalies_data['score'] = distance.cdist(noise_points, real_centers).min(axis=1)

# Clustered points and noise points together, each with its distance-based score.
data_for_auprc = pd.concat([data_w_score, anomalies_data])

In [None]:
confusion_matrix(train_data['class'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(data_for_auprc['class'], data_for_auprc['score'])
dbscan_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
dbscan_report = classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(data_for_auprc['class'], data_for_auprc['score'])
dbscan_auc_precision_recall = metrics.auc(recall, precision)
print(dbscan_auc_precision_recall)

### CADE

Source: Aric LaBarr, webinar "Modern Approaches To Anomaly Detection", November 2, 2021

In [None]:
train_data = data.copy()

In [None]:
# Real observations (label column removed). The explicit copy guarantees that adding
# 'Target' below never writes into a view of train_data (SettingWithCopyWarning).
data_for_fake = train_data.loc[:, train_data.columns != 'class'].copy()

# Synthetic observations: each feature is drawn independently and uniformly between the
# observed minimum and maximum of that feature, one draw per real row. The frame is built
# in one step from the feature columns only, so no sample is drawn for 'Target'.
fake = pd.DataFrame({
    col: np.random.uniform(data_for_fake[col].min(),
                           data_for_fake[col].max(),
                           size=len(data_for_fake))
    for col in data_for_fake.columns
})

# Target = 0 for real rows, 1 for synthetic rows.
data_for_fake['Target'] = 0
fake['Target'] = 1

In [None]:
data_combined = pd.concat([data_for_fake, fake], axis = 0)

In [None]:
# Train a random forest to separate real rows (Target = 0) from synthetic rows (Target = 1),
# then store the predicted probability of the synthetic class for every real row.
start = time.process_time()
model = RandomForestClassifier(n_estimators=100)
combined_features = data_combined.loc[:, data_combined.columns != 'Target']
model.fit(combined_features, data_combined['Target'])
real_features = train_data.loc[:, train_data.columns != 'class']
train_data['Target'] = model.predict_proba(real_features)[:, 1]
end = time.process_time()
cade_time = end - start
print(cade_time)

In [None]:
# Try cut-offs 0.00, 0.05, ..., 0.50 and keep the one whose hard predictions give the
# highest ROC AUC against the true labels.
# The result dict is deliberately not called `auc`: that name is imported from
# sklearn.metrics at the top of the notebook and would be shadowed.
auc_by_threshold = {}
for candidate in np.arange(0, 0.55, 0.05):
    train_data['prediction'] = train_data.apply(def_outlier_cade, args=(candidate,), axis=1)
    fpr, tpr, _ = metrics.roc_curve(train_data['class'], train_data['prediction'])
    auc_by_threshold[candidate] = metrics.auc(fpr, tpr)
max_key = max(auc_by_threshold, key=auc_by_threshold.get)

In [None]:
train_data['prediction'] = train_data.apply(def_outlier_cade,args=(max_key,), axis = 1)

In [None]:
confusion_matrix(train_data['class'], train_data['prediction'])

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['class'], train_data['Target'])
cade_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
cade_report = classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['class'], train_data['Target'])
cade_auc_precision_recall = metrics.auc(recall, precision)
print(cade_auc_precision_recall)

### Deep Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
features = data.drop(columns = ['class'])
target = data['class']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
class AutoEncoder(Model):
    """Dense autoencoder with a 31-15-7 encoder and a 15-31-output_units decoder."""

    def __init__(self, output_units):
        """Build both halves; ``output_units`` is the input and output width."""
        super().__init__()
        encoder_layers = [
            Dense(31, input_dim=output_units, activation='sigmoid'),
            Dropout(0.1),
            Dense(15, activation='relu'),
            Dropout(0.1),
            Dense(7, activation='relu'),
        ]
        self.encoder = Sequential(encoder_layers)
        decoder_layers = [
            Dense(15, activation='relu'),
            Dropout(0.1),
            Dense(31, activation='relu'),
            Dropout(0.1),
            Dense(output_units, activation='sigmoid'),
        ]
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Encode the inputs, then decode them back to the input width."""
        return self.decoder(self.encoder(inputs))

In [None]:
def find_threshold(model, x_train_scaled):
    """Return mean + one standard deviation of the MSLE reconstruction errors.

    The errors are those returned by ``tf.keras.losses.msle`` between the model's
    reconstruction of ``x_train_scaled`` and ``x_train_scaled`` itself.
    """
    reconstructed = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(reconstructed, x_train_scaled).numpy()
    return np.mean(errors) + np.std(errors)

def get_predictions(model, x_train_scaled, threshold):
    """Flag the rows whose reconstruction error exceeds ``threshold``.

    Returns a pair ``(preds, errors)``: ``preds`` is a float Series holding 1.0 where
    the error is strictly greater than ``threshold`` and 0.0 elsewhere; ``errors`` is
    the value returned by ``tf.keras.losses.msle`` for the reconstruction.
    """
    predictions = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(predictions, x_train_scaled)
    anomaly_mask = pd.Series(errors) > threshold
    # A boolean mask converts directly to 1.0 / 0.0; no per-element `x == True` lambda needed.
    preds = anomaly_mask.astype(float)
    return preds, errors

In [None]:
# Train the deep autoencoder to reproduce its own input, derive the error threshold from
# the training data, flag the rows above it, and time the whole sequence.
start = time.process_time()

model = AutoEncoder(output_units=x_train_scaled.shape[1])
model.compile(optimizer='adam', loss='msle', metrics=['mse'])
history = model.fit(x_train_scaled, x_train_scaled, epochs=20, batch_size=512)

threshold = find_threshold(model, x_train_scaled)
predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
deep_autoencoders_time = end - start
print(deep_autoencoders_time)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
deep_autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
deep_autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, scores)
deep_ae_auc_precision_recall = metrics.auc(recall, precision)
print(deep_ae_auc_precision_recall)

### Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
features = data.drop(columns = ['class'])
target = data['class']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
class AutoEncoder(Model):
    """Autoencoder with one hidden layer: Dense(31, relu) + dropout, then a sigmoid output."""

    def __init__(self, output_units):
        """Build both halves; ``output_units`` is the input and output width."""
        super().__init__()
        hidden = Dense(31, input_dim=output_units, activation='relu')
        self.encoder = Sequential([hidden, Dropout(0.1)])
        self.decoder = Sequential([Dense(output_units, activation='sigmoid')])

    def call(self, inputs):
        """Encode the inputs, then decode them back to the input width."""
        return self.decoder(self.encoder(inputs))

In [None]:
def find_threshold(model, x_train_scaled):
    """Return mean + one standard deviation of the MSLE reconstruction errors.

    The errors are those returned by ``tf.keras.losses.msle`` between the model's
    reconstruction of ``x_train_scaled`` and ``x_train_scaled`` itself.
    """
    reconstructed = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(reconstructed, x_train_scaled).numpy()
    return np.mean(errors) + np.std(errors)

def get_predictions(model, x_train_scaled, threshold):
    """Flag the rows whose reconstruction error exceeds ``threshold``.

    Returns a pair ``(preds, errors)``: ``preds`` is a float Series holding 1.0 where
    the error is strictly greater than ``threshold`` and 0.0 elsewhere; ``errors`` is
    the value returned by ``tf.keras.losses.msle`` for the reconstruction.
    """
    predictions = model.predict(x_train_scaled)
    errors = tf.keras.losses.msle(predictions, x_train_scaled)
    anomaly_mask = pd.Series(errors) > threshold
    # A boolean mask converts directly to 1.0 / 0.0; no per-element `x == True` lambda needed.
    preds = anomaly_mask.astype(float)
    return preds, errors

In [None]:
# Train the autoencoder to reproduce its own input, derive the error threshold from the
# training data, flag the rows above it, and time the whole sequence.
start = time.process_time()

model = AutoEncoder(output_units=x_train_scaled.shape[1])
model.compile(optimizer='adam', loss='msle', metrics=['mse'])
history = model.fit(x_train_scaled, x_train_scaled, epochs=20, batch_size=512)

threshold = find_threshold(model, x_train_scaled)
predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
autoencoders_time = end - start
print(autoencoders_time)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, scores)
ae_auc_precision_recall = metrics.auc(recall, precision)
print(ae_auc_precision_recall)

### One-Class SVM rbf

In [None]:
features = data.drop(columns = ['class'])
target = data['class']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
# Time a One-Class SVM (default kernel, gamma='scale', nu=0.1) on the scaled features.
start = time.process_time()

model = OneClassSVM(gamma='scale', nu=0.1).fit(x_train_scaled)
yhat = model.predict(x_train_scaled)
# The decision function is negated before being used as the anomaly score.
y_scores = -model.decision_function(x_train_scaled)

end = time.process_time()
rbf_oc_svm_time = end - start
print(rbf_oc_svm_time)

In [None]:
predictions = pd.DataFrame(yhat, columns = ['y_pred']).apply(def_outlier, axis = 1)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, y_scores)
rbf_oc_svm_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
rbf_oc_svm_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, y_scores)
ocsvm_rbf_auc_precision_recall = metrics.auc(recall, precision)
print(ocsvm_rbf_auc_precision_recall)

### One-Class SVM linear

In [None]:
features = data.drop(columns = ['class'])
target = data['class']

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [None]:
# Time a One-Class SVM with a linear kernel (nu=0.1) on the scaled features.
start = time.process_time()

model = OneClassSVM(kernel='linear', nu=0.1).fit(x_train_scaled)
yhat = model.predict(x_train_scaled)
# The decision function is negated before being used as the anomaly score.
y_scores = -model.decision_function(x_train_scaled)

end = time.process_time()
lin_oc_svm_time = end - start
print(lin_oc_svm_time)

In [None]:
predictions = pd.DataFrame(yhat, columns = ['y_pred']).apply(def_outlier, axis = 1)

In [None]:
fpr, tpr, _ = metrics.roc_curve(target, y_scores)
lin_oc_svm_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
lin_oc_svm_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(target, y_scores)
ocsvm_lin_auc_precision_recall = metrics.auc(recall, precision)
print(ocsvm_lin_auc_precision_recall)

### Performance

In [None]:
bank_performance = pd.DataFrame(columns = ['method','f1-score','sensitivity',
                                           'precision','Time','AUC','AU precision-recall curve'])

In [None]:
def _performance_row(method, report, elapsed, roc_auc, pr_auc):
    """Build one results-table row from a classification report and scalar metrics.

    The f1-score, recall and precision are taken from the report entry labelled '1'.
    """
    positive = report['1']
    return {'method': method,
            'f1-score': positive['f1-score'],
            'sensitivity': positive['recall'],
            'precision': positive['precision'],
            'Time': elapsed,
            'AUC': roc_auc,
            'AU precision-recall curve': pr_auc}


iforest_performance = _performance_row(
    'iForest', iforest_report, iforest_time,
    iforest_auc, iforest_auc_precision_recall)
lof_performance = _performance_row(
    'LOF', lof_report, lof_time,
    lof_auc, lof_auc_precision_recall)
dbscan_performance = _performance_row(
    'DBSCAN', dbscan_report, dbscan_time,
    dbscan_auc, dbscan_auc_precision_recall)
cade_performance = _performance_row(
    'CADE', cade_report, cade_time,
    cade_auc, cade_auc_precision_recall)
deep_autoencoders_performance = _performance_row(
    'Deep Autoencoders', deep_autoencoders_report, deep_autoencoders_time,
    deep_autoencoders_auc, deep_ae_auc_precision_recall)
autoencoders_performance = _performance_row(
    'Autoencoders', autoencoders_report, autoencoders_time,
    autoencoders_auc, ae_auc_precision_recall)
rbf_oc_svm_performance = _performance_row(
    'OC-SVM rbf', rbf_oc_svm_report, rbf_oc_svm_time,
    rbf_oc_svm_auc, ocsvm_rbf_auc_precision_recall)
lin_oc_svm_performance = _performance_row(
    'OC-SVM linear', lin_oc_svm_report, lin_oc_svm_time,
    lin_oc_svm_auc, ocsvm_lin_auc_precision_recall)

In [None]:
# Add one row per method to the bank results table.
# DataFrame.append was removed in pandas 2.0, so the eight rows are built as one frame
# (same column order as the table) and concatenated in a single step.
method_rows = pd.DataFrame(
    [iforest_performance, lof_performance, dbscan_performance, cade_performance,
     deep_autoencoders_performance, autoencoders_performance,
     rbf_oc_svm_performance, lin_oc_svm_performance],
    columns=bank_performance.columns,
)
bank_performance = pd.concat([bank_performance, method_rows], ignore_index=True)

## Overall performance

In [None]:
# Tag every per-dataset results table with the name of its dataset.
labelled_tables = [
    ('arrhythmia', arrhythmia_performance),
    ('cardio', cardio_performance),
    ('forestcover', forestcover_performance),
    ('annthyroid', annthyroid_performance),
    ('kaggle', kaggle_performance),
    ('mammography', mammography_performance),
    ('shuttle', shuttle_performance),
    ('mnist', mnist_performance),
    ('vowels', vowels_performance),
    ('seismic', seismic_performance),
    ('musk', musk_performance),
    ('bank', bank_performance),
]
for dataset_name, performance_table in labelled_tables:
    performance_table['dataset'] = dataset_name

In [None]:
final_performance = pd.concat([arrhythmia_performance,cardio_performance,forestcover_performance,
           annthyroid_performance, kaggle_performance, mammography_performance, shuttle_performance,
          mnist_performance,vowels_performance, seismic_performance, musk_performance, bank_performance])

## Performance comparison with Bayesian tests

**Source**: https://baycomp.readthedocs.io/en/latest/

For the comparison with the Bayesian tests, we have averaged the performance over three runs of the code above and made a separate Excel sheet for each performance measure with the datasets in rows, methods in columns and averaged performance measures in cells.

In [None]:
bayesian_prob = pd.DataFrame(columns = ['comparison','left', 'within', 'right', 'metric'])

### f1-score

We compare top three ranked models with each other.

In [None]:
performance = pd.read_excel('./aggregated_performance.xlsx', 
                            sheet_name = 'F1 score')

In [None]:
# iForest vs Autoencoders
posterior = SignedRankTest(performance['iForest'].to_numpy(),performance['Autoencoders'].to_numpy(), rope=0.01)
left, within, right = posterior.probs()
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
bayesian_prob = pd.concat([bayesian_prob,
                           pd.DataFrame([{'comparison':'iForest vs Autoencoders',
                                          'left': left, 'within':within,
                                          'right': right,'metric':'F1 score'}])],
                          ignore_index=True)
print(left, within, right)

In [None]:
names = ("iForest", "AE")
fig = posterior.plot(names)
fig.set_size_inches(15, 8)
plt.show()

In [None]:
# Autoencoders vs DBSCAN
posterior = SignedRankTest(performance['Autoencoders'].to_numpy(),performance['DBSCAN'].to_numpy(), rope=0.01)
left, within, right = posterior.probs()
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
bayesian_prob = pd.concat([bayesian_prob,
                           pd.DataFrame([{'comparison':'Autoencoders vs DBSCAN',
                                          'left': left, 'within':within,
                                          'right': right,'metric':'F1 score'}])],
                          ignore_index=True)
print(left, within, right)

In [None]:
names = ("AE", "DBSCAN")
fig = posterior.plot(names)
fig.set_size_inches(15, 8)
plt.show()

In [None]:
# iForest vs DBSCAN
posterior = SignedRankTest(performance['iForest'].to_numpy(),performance['DBSCAN'].to_numpy(), rope=0.01)
left, within, right = posterior.probs()
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
bayesian_prob = pd.concat([bayesian_prob,
                           pd.DataFrame([{'comparison':'iForest vs DBSCAN',
                                          'left': left, 'within':within,
                                          'right': right,'metric':'F1 score'}])],
                          ignore_index=True)
print(left, within, right)

In [None]:
names = ("iForest", "DBSCAN")
fig = posterior.plot(names)
fig.set_size_inches(15, 8)
plt.show()

### sensitivity

In [None]:
performance = pd.read_excel('./aggregated_performance.xlsx', 
                            sheet_name = 'sensitivity')

In [None]:
# iForest vs Deep Autoencoders (iForest is the first argument, matching the stored label)
posterior = SignedRankTest(performance['iForest'].to_numpy(),performance['Deep Autoencoders'].to_numpy(), rope=0.01)
left, within, right = posterior.probs()
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
bayesian_prob = pd.concat([bayesian_prob,
                           pd.DataFrame([{'comparison':'iForest vs Deep Autoencoders',
                                          'left': left, 'within':within,
                                          'right': right,'metric':'sensitivity'}])],
                          ignore_index=True)
print(left, within, right)

In [None]:
names = ("iForest", "Deep AE")
fig = posterior.plot(names)
fig.set_size_inches(15, 8)
plt.show()

In [None]:
# iForest vs Autoencoders
posterior = SignedRankTest(performance['iForest'].to_numpy(),performance['Autoencoders'].to_numpy(), rope=0.01)
left, within, right = posterior.probs()
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
bayesian_prob = pd.concat([bayesian_prob,
                           pd.DataFrame([{'comparison':'iForest vs Autoencoders',
                                          'left': left, 'within':within,
                                          'right': right,'metric':'sensitivity'}])],
                          ignore_index=True)
print(left, within, right)

In [None]:
names = ("iForest", "AE")
fig = posterior.plot(names)
fig.set_size_inches(15, 8)
plt.show()

In [None]:
# Autoencoders vs Deep Autoencoders
posterior = SignedRankTest(performance['Autoencoders'].to_numpy(),performance['Deep Autoencoders'].to_numpy(), rope=0.01)
left, within, right = posterior.probs()
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
bayesian_prob = pd.concat([bayesian_prob,
                           pd.DataFrame([{'comparison':'Autoencoders vs Deep Autoencoders',
                                          'left': left, 'within':within,
                                          'right': right,'metric':'sensitivity'}])],
                          ignore_index=True)
print(left, within, right)

In [None]:
names = ("AE", "Deep AE")
fig = posterior.plot(names)
fig.set_size_inches(15, 8)
plt.show()

### precision

In [None]:
performance = pd.read_excel('./aggregated_performance.xlsx', 
                            sheet_name = 'precision')

In [None]:
# iForest vs DBSCAN
posterior = SignedRankTest(performance['iForest'].to_numpy(),performance['DBSCAN'].to_numpy(), rope=0.01)
left, within, right = posterior.probs()
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
bayesian_prob = pd.concat([bayesian_prob,
                           pd.DataFrame([{'comparison':'iForest vs DBSCAN',
                                          'left': left, 'within':within,
                                          'right': right,'metric':'precision'}])],
                          ignore_index=True)
print(left, within, right)

In [None]:
names = ("iForest", "DBSCAN")
fig = posterior.plot(names)
fig.set_size_inches(15, 8)
plt.show()

In [None]:
# iForest vs Autoencoders
posterior = SignedRankTest(performance['iForest'].to_numpy(),performance['Autoencoders'].to_numpy(), rope=0.01)
left, within, right = posterior.probs()
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
bayesian_prob = pd.concat([bayesian_prob,
                           pd.DataFrame([{'comparison':'iForest vs Autoencoders',
                                          'left': left, 'within':within,
                                          'right': right,'metric':'precision'}])],
                          ignore_index=True)
print(left, within, right)

In [None]:
names = ("iForest", "AE")
fig = posterior.plot(names)
fig.set_size_inches(15, 8)
plt.show()

In [None]:
# DBSCAN vs Autoencoders
posterior = SignedRankTest(performance['DBSCAN'].to_numpy(),performance['Autoencoders'].to_numpy(), rope=0.01)
left, within, right = posterior.probs()
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
bayesian_prob = pd.concat([bayesian_prob,
                           pd.DataFrame([{'comparison':'DBSCAN vs Autoencoders',
                                          'left': left, 'within':within,
                                          'right': right,'metric':'precision'}])],
                          ignore_index=True)
print(left, within, right)

In [None]:
names = ("DBSCAN", "AE")
fig = posterior.plot(names)
fig.set_size_inches(15, 8)
plt.show()

### AUC

In [None]:
performance = pd.read_excel('./aggregated_performance.xlsx', 
                            sheet_name = 'AUC')

In [None]:
# iForest vs DBSCAN
posterior = SignedRankTest(performance['iForest'].to_numpy(),performance['DBSCAN'].to_numpy(), rope=0.01)
left, within, right = posterior.probs()
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
bayesian_prob = pd.concat([bayesian_prob,
                           pd.DataFrame([{'comparison':'iForest vs DBSCAN',
                                          'left': left, 'within':within,
                                          'right': right,'metric':'AUC'}])],
                          ignore_index=True)
print(left, within, right)

In [None]:
names = ("iForest", "DBSCAN")
fig = posterior.plot(names)
fig.set_size_inches(15, 8)
plt.show()

In [None]:
# iForest vs Autoencoders
posterior = SignedRankTest(performance['iForest'].to_numpy(),performance['Autoencoders'].to_numpy(), rope=0.01)
left, within, right = posterior.probs()
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
bayesian_prob = pd.concat([bayesian_prob,
                           pd.DataFrame([{'comparison':'iForest vs Autoencoders',
                                          'left': left, 'within':within,
                                          'right': right,'metric':'AUC'}])],
                          ignore_index=True)
print(left, within, right)

In [None]:
names = ("iForest", "AE")
fig = posterior.plot(names)
fig.set_size_inches(15, 8)
plt.show()

In [None]:
# DBSCAN vs Autoencoders
posterior = SignedRankTest(performance['DBSCAN'].to_numpy(),performance['Autoencoders'].to_numpy(), rope=0.01)
left, within, right = posterior.probs()
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
bayesian_prob = pd.concat([bayesian_prob,
                           pd.DataFrame([{'comparison':'DBSCAN vs Autoencoders',
                                          'left': left, 'within':within,
                                          'right': right,'metric':'AUC'}])],
                          ignore_index=True)
print(left, within, right)

In [None]:
names = ("DBSCAN", "AE")
fig = posterior.plot(names)
fig.set_size_inches(15, 8)
plt.show()

### AU precision-recall

In [None]:
performance = pd.read_excel('./aggregated_performance.xlsx', 
                            sheet_name = 'AU precision-recall')

In [None]:
# iForest vs DBSCAN
posterior = SignedRankTest(performance['iForest'].to_numpy(),performance['DBSCAN'].to_numpy(), rope=0.01)
left, within, right = posterior.probs()
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
bayesian_prob = pd.concat([bayesian_prob,
                           pd.DataFrame([{'comparison':'iForest vs DBSCAN',
                                          'left': left, 'within':within,
                                          'right': right,'metric':'AU precision-recall'}])],
                          ignore_index=True)
print(left, within, right)

In [None]:
names = ("iForest", "DBSCAN")
fig = posterior.plot(names)
fig.set_size_inches(15, 8)
plt.show()

In [None]:
# iForest vs Autoencoders
posterior = SignedRankTest(performance['iForest'].to_numpy(),performance['Autoencoders'].to_numpy(), rope=0.01)
left, within, right = posterior.probs()
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
bayesian_prob = pd.concat([bayesian_prob,
                           pd.DataFrame([{'comparison':'iForest vs Autoencoders',
                                          'left': left, 'within':within,
                                          'right': right,'metric':'AU precision-recall'}])],
                          ignore_index=True)
print(left, within, right)

In [None]:
names = ("iForest", "AE")
fig = posterior.plot(names)
fig.set_size_inches(15, 8)
plt.show()

In [None]:
# DBSCAN vs Autoencoders (DBSCAN is the first argument, matching the stored label)
posterior = SignedRankTest(performance['DBSCAN'].to_numpy(),performance['Autoencoders'].to_numpy(), rope=0.01)
left, within, right = posterior.probs()
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
bayesian_prob = pd.concat([bayesian_prob,
                           pd.DataFrame([{'comparison':'DBSCAN vs Autoencoders',
                                          'left': left, 'within':within,
                                          'right': right,'metric':'AU precision-recall'}])],
                          ignore_index=True)
print(left, within, right)

In [None]:
names = ("DBSCAN", "AE")
fig = posterior.plot(names)
fig.set_size_inches(15, 8)
plt.show()

## Frequentist comparison

Firstly, the ranking of all the models is calculated per performance metric.
Subsequently, all the models are compared with the best ranked model per performance metric using the Wilcoxon signed-rank test. For the F1-score, sensitivity and AUC metrics, the best ranked model is the Autoencoder model. For the precision metric, the best ranked model is the DBSCAN model, and for the AUPRC and time metrics, the best ranked model is the iForest model.

In addition, all the models are compared with each other using the Friedman chi square test.

Source: https://edisciplinas.usp.br/pluginfile.php/4129451/mod_resource/content/1/model_selection_evaluation.pdf

### F1 score

In [None]:
performance = pd.read_excel('./aggregated_performance.xlsx', sheet_name = 'F1 score')

In [None]:
# Rank the algorithms within each dataset (rank 1 = highest score) and average per algorithm.
algorithms_names = performance.drop(columns='Dataset').columns
performances_array = performance[algorithms_names].to_numpy()
per_dataset_ranks = np.array([rankdata(-row) for row in performances_array])
average_ranks = per_dataset_ranks.mean(axis=0)
ranks = dict(zip(algorithms_names, average_ranks))
pd.DataFrame(ranks.items())

In [None]:
# Friedman test across algorithms: each argument must hold one algorithm's scores over all
# datasets, so pass the columns (transpose) rather than the per-dataset rows.
friedmanchisquare(*performances_array.T)

In [None]:
autoencoders = np.array(performance['Autoencoders'])
iforest = np.array(performance['iForest'])
lof = np.array(performance['LOF'])
dbscan = np.array(performance['DBSCAN'])
cade = np.array(performance['CADE'])
deepautoencoders = np.array(performance['Deep Autoencoders'])
ocsvmrbf = np.array(performance['OC-SVM rbf'])
ocsvmlin = np.array(performance['OC-SVM linear'])

In [None]:
wilcoxon(autoencoders, iforest, zero_method='zsplit')

In [None]:
wilcoxon(autoencoders, lof, zero_method='zsplit')

In [None]:
wilcoxon(autoencoders, dbscan, zero_method='zsplit')

In [None]:
wilcoxon(autoencoders, cade, zero_method='zsplit')

In [None]:
wilcoxon(autoencoders, deepautoencoders, zero_method='zsplit')

In [None]:
wilcoxon(autoencoders, ocsvmrbf, zero_method='zsplit')

In [None]:
wilcoxon(autoencoders, ocsvmlin, zero_method='zsplit')

### sensitivity

In [None]:
performance = pd.read_excel('./aggregated_performance.xlsx', sheet_name = 'sensitivity')

In [None]:
performance

In [None]:
# Rank the algorithms within each dataset (rank 1 = highest score) and average per algorithm.
algorithms_names = performance.drop(columns='Dataset').columns
performances_array = performance[algorithms_names].to_numpy()
per_dataset_ranks = np.array([rankdata(-row) for row in performances_array])
average_ranks = per_dataset_ranks.mean(axis=0)
ranks = dict(zip(algorithms_names, average_ranks))
pd.DataFrame(ranks.items())

In [None]:
# Friedman test across algorithms: each argument must hold one algorithm's scores over all
# datasets, so pass the columns (transpose) rather than the per-dataset rows.
friedmanchisquare(*performances_array.T)

In [None]:
autoencoders = np.array(performance['Autoencoders'])
iforest = np.array(performance['iForest'])
lof = np.array(performance['LOF'])
dbscan = np.array(performance['DBSCAN'])
cade = np.array(performance['CADE'])
deepautoencoders = np.array(performance['Deep Autoencoders'])
ocsvmrbf = np.array(performance['OC-SVM rbf'])
ocsvmlin = np.array(performance['OC-SVM linear'])

In [None]:
wilcoxon(autoencoders, iforest, zero_method='zsplit')

In [None]:
wilcoxon(autoencoders, lof, zero_method='zsplit')

In [None]:
wilcoxon(autoencoders, dbscan, zero_method='zsplit')

In [None]:
wilcoxon(autoencoders, cade, zero_method='zsplit')

In [None]:
wilcoxon(autoencoders, deepautoencoders, zero_method='zsplit')

In [None]:
wilcoxon(autoencoders, ocsvmrbf, zero_method='zsplit')

In [None]:
wilcoxon(autoencoders, ocsvmlin, zero_method='zsplit')

### precision

In [None]:
performance = pd.read_excel('./aggregated_performance.xlsx', sheet_name = 'precision')

In [None]:
performance

In [None]:
# Rank the algorithms within each dataset (rank 1 = highest score) and average per algorithm.
algorithms_names = performance.drop(columns='Dataset').columns
performances_array = performance[algorithms_names].to_numpy()
per_dataset_ranks = np.array([rankdata(-row) for row in performances_array])
average_ranks = per_dataset_ranks.mean(axis=0)
ranks = dict(zip(algorithms_names, average_ranks))
pd.DataFrame(ranks.items())

In [None]:
# Friedman test across algorithms: each argument must hold one algorithm's scores over all
# datasets, so pass the columns (transpose) rather than the per-dataset rows.
friedmanchisquare(*performances_array.T)

In [None]:
autoencoders = np.array(performance['Autoencoders'])
iforest = np.array(performance['iForest'])
lof = np.array(performance['LOF'])
dbscan = np.array(performance['DBSCAN'])
cade = np.array(performance['CADE'])
deepautoencoders = np.array(performance['Deep Autoencoders'])
ocsvmrbf = np.array(performance['OC-SVM rbf'])
ocsvmlin = np.array(performance['OC-SVM linear'])

In [None]:
wilcoxon(dbscan, iforest, zero_method='zsplit')

In [None]:
wilcoxon(dbscan, lof, zero_method='zsplit')

In [None]:
# DBSCAN is the reference (best ranked) model for precision, so it goes first here as in
# the other precision comparisons.
wilcoxon(dbscan, autoencoders, zero_method='zsplit')

In [None]:
# DBSCAN vs. CADE (Wilcoxon signed-rank test on the precision columns).
wilcoxon(dbscan, cade, zero_method='zsplit')

In [None]:
# DBSCAN vs. Deep Autoencoders.
wilcoxon(dbscan, deepautoencoders, zero_method='zsplit')

In [None]:
# DBSCAN vs. OC-SVM with the rbf kernel column.
wilcoxon(dbscan, ocsvmrbf, zero_method='zsplit')

In [None]:
# DBSCAN vs. OC-SVM with the linear kernel column.
wilcoxon(dbscan, ocsvmlin, zero_method='zsplit')

### AUC

In [None]:
# Load the 'AUC' sheet of the aggregated results workbook.
performance = pd.read_excel(
    './aggregated_performance.xlsx',
    sheet_name='AUC',
)

In [None]:
# Average rank of every algorithm on AUC. Each row of the table is ranked on
# its own; the scores are negated so the largest value in a row receives
# rank 1. The last expression displays the (algorithm, mean rank) pairs as a
# DataFrame.
algorithms_names = performance.drop('Dataset', axis=1).columns
performances_array = performance[algorithms_names].values
average_ranks = np.array(
    [rankdata(-scores) for scores in performances_array]
).mean(axis=0)
ranks = dict(zip(algorithms_names, average_ranks))
pd.DataFrame(ranks.items())

In [None]:
# Friedman test across algorithms on AUC: each positional sample has to be
# one algorithm's scores over all datasets. performances_array has one row
# per entry of the 'Dataset' column and one column per algorithm, so it is
# transposed before unpacking; unpacking the rows would compare the datasets
# with each other instead of the algorithms.
friedmanchisquare(*performances_array.T)

In [None]:
# Pull each algorithm's AUC column out of `performance` as its own NumPy
# array so the pairwise tests below can refer to them by short names.
(autoencoders, iforest, lof, dbscan,
 cade, deepautoencoders, ocsvmrbf, ocsvmlin) = (
    np.array(performance[column])
    for column in (
        'Autoencoders',
        'iForest',
        'LOF',
        'DBSCAN',
        'CADE',
        'Deep Autoencoders',
        'OC-SVM rbf',
        'OC-SVM linear',
    )
)

In [None]:
# Pairwise Wilcoxon signed-rank tests on AUC with Autoencoders as the
# reference algorithm, paired by row of `performance`. zero_method='zsplit'
# keeps zero differences in the ranking and splits their ranks between the
# positive and negative sums.
# NOTE(review): seven pairwise tests are run here with no visible
# multiple-comparison correction — confirm how the p-values are interpreted.
wilcoxon(autoencoders, iforest, zero_method='zsplit')

In [None]:
# Autoencoders vs. LOF.
wilcoxon(autoencoders, lof, zero_method='zsplit')

In [None]:
# Autoencoders vs. DBSCAN.
wilcoxon(autoencoders, dbscan, zero_method='zsplit')

In [None]:
# Autoencoders vs. CADE.
wilcoxon(autoencoders, cade, zero_method='zsplit')

In [None]:
# Autoencoders vs. Deep Autoencoders.
wilcoxon(autoencoders, deepautoencoders, zero_method='zsplit')

In [None]:
# Autoencoders vs. OC-SVM with the rbf kernel column.
wilcoxon(autoencoders, ocsvmrbf, zero_method='zsplit')

In [None]:
# Autoencoders vs. OC-SVM with the linear kernel column.
wilcoxon(autoencoders, ocsvmlin, zero_method='zsplit')

### Time

In [None]:
# Load the 'time' sheet of the aggregated results workbook.
performance = pd.read_excel(
    './aggregated_performance.xlsx',
    sheet_name='time',
)

In [None]:
# Average rank of every algorithm on the 'time' sheet. Each row of the table
# is ranked on its own; unlike the score metrics the values are NOT negated,
# so the smallest value in a row receives rank 1. The last expression
# displays the (algorithm, mean rank) pairs as a DataFrame.
algorithms_names = performance.drop('Dataset', axis=1).columns
performances_array = performance[algorithms_names].values
average_ranks = np.array(
    [rankdata(timings) for timings in performances_array]
).mean(axis=0)
ranks = dict(zip(algorithms_names, average_ranks))
pd.DataFrame(ranks.items())

In [None]:
# Friedman test across algorithms on the 'time' sheet: each positional
# sample has to be one algorithm's values over all datasets.
# performances_array has one row per entry of the 'Dataset' column and one
# column per algorithm, so it is transposed before unpacking; unpacking the
# rows would compare the datasets with each other instead of the algorithms.
friedmanchisquare(*performances_array.T)

In [None]:
# Pull each algorithm's column of the 'time' sheet out of `performance` as
# its own NumPy array so the pairwise tests below can use short names.
(autoencoders, iforest, lof, dbscan,
 cade, deepautoencoders, ocsvmrbf, ocsvmlin) = (
    np.array(performance[column])
    for column in (
        'Autoencoders',
        'iForest',
        'LOF',
        'DBSCAN',
        'CADE',
        'Deep Autoencoders',
        'OC-SVM rbf',
        'OC-SVM linear',
    )
)

In [None]:
# Pairwise Wilcoxon signed-rank tests on the 'time' sheet with iForest as
# the reference algorithm, paired by row of `performance`.
# zero_method='zsplit' keeps zero differences in the ranking and splits
# their ranks between the positive and negative sums.
# NOTE(review): seven pairwise tests are run here with no visible
# multiple-comparison correction — confirm how the p-values are interpreted.
wilcoxon(iforest, autoencoders, zero_method='zsplit')

In [None]:
# iForest vs. LOF.
wilcoxon(iforest, lof, zero_method='zsplit')

In [None]:
# iForest vs. DBSCAN.
wilcoxon(iforest, dbscan, zero_method='zsplit')

In [None]:
# iForest vs. CADE.
wilcoxon(iforest, cade, zero_method='zsplit')

In [None]:
# iForest vs. Deep Autoencoders.
wilcoxon(iforest, deepautoencoders, zero_method='zsplit')

In [None]:
# iForest vs. OC-SVM with the rbf kernel column.
wilcoxon(iforest, ocsvmrbf, zero_method='zsplit')

In [None]:
# iForest vs. OC-SVM with the linear kernel column.
wilcoxon(iforest, ocsvmlin, zero_method='zsplit')

### Area under the precision-recall curve

In [None]:
# Load the 'AU precision-recall' sheet of the aggregated results workbook.
performance = pd.read_excel(
    './aggregated_performance.xlsx',
    sheet_name='AU precision-recall',
)

In [None]:
# Average rank of every algorithm on the 'AU precision-recall' sheet. Each
# row of the table is ranked on its own; the scores are negated so the
# largest value in a row receives rank 1. The last expression displays the
# (algorithm, mean rank) pairs as a DataFrame.
algorithms_names = performance.drop('Dataset', axis=1).columns
performances_array = performance[algorithms_names].values
average_ranks = np.array(
    [rankdata(-scores) for scores in performances_array]
).mean(axis=0)
ranks = dict(zip(algorithms_names, average_ranks))
pd.DataFrame(ranks.items())

In [None]:
# Friedman test across algorithms on the 'AU precision-recall' sheet: each
# positional sample has to be one algorithm's scores over all datasets.
# performances_array has one row per entry of the 'Dataset' column and one
# column per algorithm, so it is transposed before unpacking; unpacking the
# rows would compare the datasets with each other instead of the algorithms.
friedmanchisquare(*performances_array.T)

In [None]:
# Pull each algorithm's column of the 'AU precision-recall' sheet out of
# `performance` as its own NumPy array so the pairwise tests below can use
# short names.
(autoencoders, iforest, lof, dbscan,
 cade, deepautoencoders, ocsvmrbf, ocsvmlin) = (
    np.array(performance[column])
    for column in (
        'Autoencoders',
        'iForest',
        'LOF',
        'DBSCAN',
        'CADE',
        'Deep Autoencoders',
        'OC-SVM rbf',
        'OC-SVM linear',
    )
)

In [None]:
# Pairwise Wilcoxon signed-rank tests on the 'AU precision-recall' sheet
# with iForest as the reference algorithm, paired by row of `performance`.
# zero_method='zsplit' keeps zero differences in the ranking and splits
# their ranks between the positive and negative sums.
# NOTE(review): seven pairwise tests are run here with no visible
# multiple-comparison correction — confirm how the p-values are interpreted.
wilcoxon(iforest, autoencoders, zero_method='zsplit')

In [None]:
# iForest vs. LOF.
wilcoxon(iforest, lof, zero_method='zsplit')

In [None]:
# iForest vs. DBSCAN.
wilcoxon(iforest, dbscan, zero_method='zsplit')

In [None]:
# iForest vs. CADE.
wilcoxon(iforest, cade, zero_method='zsplit')

In [None]:
# iForest vs. Deep Autoencoders.
wilcoxon(iforest, deepautoencoders, zero_method='zsplit')

In [None]:
# iForest vs. OC-SVM with the rbf kernel column.
wilcoxon(iforest, ocsvmrbf, zero_method='zsplit')

In [None]:
# iForest vs. OC-SVM with the linear kernel column.
wilcoxon(iforest, ocsvmlin, zero_method='zsplit')