# Deep autoencoder

In [3]:
import warnings
warnings.filterwarnings('ignore')

## Libraries import

In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve
from sklearn import metrics


from sklearn.preprocessing import  MinMaxScaler, OneHotEncoder
from tensorflow.keras.layers import Dense, Dropout
from keras.models import Model, Sequential
from keras.layers import Dense
import tensorflow as tf

import time


## Arrhythmia

**Dataset source**: http://odds.cs.stonybrook.edu/arrhythmia-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources**:

Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation forest.” 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

K. M. Ting, J. T. S. Chuan, and F. T. Liu. “Mass: A New Ranking Measure for Anomaly Detection.” IEEE Transactions on Knowledge and Data Engineering, 2009.

F. Keller, E. Muller, K. Bohm. “HiCS: High-contrast subspaces for density-based outlier ranking.” ICDE, 2012.

In [5]:
data = pd.read_csv('./arrhythmia.csv', sep = ',')

In [6]:
# Drop the columns that consist only of 0's.
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# have already been removed is a no-op instead of raising a KeyError.
data = data.drop(
    columns=[
        'Col15', 'Col63', 'Col65', 'Col79', 'Col127', 'Col128', 'Col135',
        'Col137', 'Col139', 'Col141', 'Col147', 'Col152', 'Col153', 'Col160',
        'Col200', 'Col260', 'Col270',
    ],
    errors='ignore',
)

In [7]:
data.head()

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10,...,Col265,Col266,Col267,Col268,Col269,Col271,Col272,Col273,Col274,y
0,75.0,0.0,190.0,80.0,91.0,193.0,371.0,174.0,121.0,-16.0,...,-0.3,0.0,9.0,-0.9,0.0,0.9,2.9,23.3,49.4,1
1,56.0,1.0,165.0,64.0,81.0,174.0,401.0,149.0,39.0,25.0,...,-0.5,0.0,8.5,0.0,0.0,0.2,2.1,20.4,38.8,0
2,54.0,0.0,172.0,95.0,138.0,163.0,386.0,185.0,102.0,96.0,...,0.9,0.0,9.5,-2.4,0.0,0.3,3.4,12.3,49.0,0
3,55.0,0.0,175.0,94.0,100.0,202.0,380.0,179.0,143.0,28.0,...,0.1,0.0,12.2,-2.2,0.0,0.4,2.6,34.6,61.6,0
4,75.0,0.0,190.0,80.0,88.0,181.0,360.0,177.0,103.0,-16.0,...,-0.4,0.0,13.1,-3.6,0.0,-0.1,3.9,25.4,62.8,1


In [8]:
data.shape

(452, 258)

In [9]:
pd.pivot_table(data,
             values = 'Col1',
               index = 'y', 
              aggfunc = 'count')

Unnamed: 0_level_0,Col1
y,Unnamed: 1_level_1
0,386
1,66


### Deep Autoencoder

In [10]:
features = data.drop(columns = ['y'])
target = data['y']

In [11]:
# min max scale the input data
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [12]:
class AutoEncoder(Model):
    """Symmetric dense autoencoder that reconstructs its own input.

    The encoder is a stack of ``Dense`` layers (sigmoid on the first layer,
    ReLU afterwards) separated by ``Dropout``. The decoder mirrors the encoder
    widths back up and ends in a sigmoid layer with ``output_units`` units.

    Parameters
    ----------
    output_units : int
        Number of input features; also the width of the reconstruction.
    hidden_units : sequence of int, optional
        Encoder layer widths, outermost first; the last entry is the
        bottleneck. The default ``(128, 64, 32)`` reproduces the architecture
        that was previously hard-coded here.
    dropout_rate : float, optional
        Rate passed to every ``Dropout`` layer (default ``0.1``).

    Raises
    ------
    ValueError
        If ``hidden_units`` is empty.
    """

    def __init__(self, output_units, hidden_units=(128, 64, 32), dropout_rate=0.1):
        super().__init__()
        hidden_units = tuple(hidden_units)
        if not hidden_units:
            raise ValueError("hidden_units must contain at least one layer width")

        # Encoder: Dense -> Dropout -> Dense ... (no dropout after the bottleneck).
        encoder_layers = [
            Dense(hidden_units[0], input_dim=output_units, activation='sigmoid')
        ]
        for width in hidden_units[1:]:
            encoder_layers.append(Dropout(dropout_rate))
            encoder_layers.append(Dense(width, activation='relu'))
        self.encoder = Sequential(encoder_layers)

        # Decoder: encoder widths in reverse (bottleneck excluded), each
        # followed by dropout, then the sigmoid reconstruction layer.
        decoder_layers = []
        for width in reversed(hidden_units[:-1]):
            decoder_layers.append(Dense(width, activation='relu'))
            decoder_layers.append(Dropout(dropout_rate))
        decoder_layers.append(Dense(output_units, activation='sigmoid'))
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Encode ``inputs`` and return the decoder's reconstruction."""
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        return decoded

In [13]:
def find_threshold(model, x_train_scaled):
    """Return the anomaly-score cut-off: mean + one std of the reconstruction errors.

    ``model.predict`` is run on ``x_train_scaled`` and the MSLE between the
    reconstruction and the input is taken per instance; the threshold is the
    mean of those errors plus their standard deviation.
    """
    reconstructed = model.predict(x_train_scaled)
    # One loss value per instance, converted to NumPy a single time.
    per_instance_error = tf.keras.losses.msle(reconstructed, x_train_scaled).numpy()
    return np.mean(per_instance_error) + np.std(per_instance_error)

def get_predictions(model, x_train_scaled, threshold):
    """Flag instances whose reconstruction error exceeds ``threshold``.

    Parameters
    ----------
    model : object exposing ``predict``
        Called as ``model.predict(x_train_scaled)`` to obtain reconstructions.
    x_train_scaled : array-like
        Data to score.
    threshold : float
        Errors strictly greater than this value are labelled anomalous.

    Returns
    -------
    preds : pandas.Series of float
        ``1.0`` for anomalies, ``0.0`` for normal instances.
    errors : value returned by ``tf.keras.losses.msle``
        Per-instance reconstruction errors, returned unchanged.
    """
    predictions = model.predict(x_train_scaled)
    # provides losses of individual instances
    errors = tf.keras.losses.msle(predictions, x_train_scaled)
    # Convert to a plain ndarray first so the Series is numeric and the
    # comparison yields a boolean mask.
    anomaly_mask = pd.Series(np.asarray(errors)) > threshold
    # 1 = anomaly, 0 = normal (vectorised cast instead of a per-element lambda)
    preds = anomaly_mask.astype(float)
    return preds, errors

In [14]:
# Seed NumPy and TensorFlow before the model is built so that repeated
# "Restart & Run All" executions start from the same random state
# (repeatability is still subject to what the backend guarantees).
np.random.seed(42)
tf.random.set_seed(42)

# time.process_time() measures CPU time of this process, not wall-clock time.
start = time.process_time()

model = AutoEncoder(output_units=x_train_scaled.shape[1])

model.compile(loss='msle', metrics=['mse'], optimizer='adam')

# The autoencoder is fitted to reproduce its own input.
history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=64,
)

end = time.process_time()
arrhythmia_deep_autoencoders_train_time = end - start
print(end - start)


# Scoring phase: derive the threshold and label every row of the same data.
start = time.process_time()
threshold = find_threshold(model, x_train_scaled)
print(f"Threshold: {threshold}")

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
arrhythmia_deep_autoencoders_test_time = end - start
print(end - start)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
5.640625
Threshold: 0.01383861452177259
0.625


In [15]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
arrhythmia_deep_autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

0.8025200188412623

In [16]:
arrhythmia_deep_autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

              precision    recall  f1-score   support

           0       0.91      0.94      0.92       386
           1       0.55      0.42      0.48        66

    accuracy                           0.87       452
   macro avg       0.73      0.68      0.70       452
weighted avg       0.85      0.87      0.86       452



In [17]:
precision, recall, thresholds = precision_recall_curve(target, scores)
arrhythmia_deep_ae_auc_precision_recall = metrics.auc(recall, precision)
print(arrhythmia_deep_ae_auc_precision_recall)

0.4652562396606436


## Cardiotocography

**Dataset source**: http://odds.cs.stonybrook.edu/cardiotocogrpahy-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources:**

C. C. Aggarwal and S. Sathe, “Theoretical foundations and algorithms for outlier ensembles.” ACM SIGKDD Explorations Newsletter, vol. 17, no. 1, pp. 24–47, 2015.

Saket Sathe and Charu C. Aggarwal. LODES: Local Density meets Spectral Outlier Detection. SIAM Conference on Data Mining, 2016.

In [18]:
data = pd.read_csv('./Cardiotocography.csv')

In [19]:
data.shape

(1831, 22)

In [20]:
data['y'] = data['y'].astype(int)

In [21]:
pd.pivot_table(data,
             values = 'Col1',
               index = 'y', 
              aggfunc = 'count')

Unnamed: 0_level_0,Col1
y,Unnamed: 1_level_1
0,1655
1,176


In [22]:
data.head()

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10,...,Col13,Col14,Col15,Col16,Col17,Col18,Col19,Col20,Col21,y
0,0.004912,0.693191,-0.20364,0.595322,0.35319,-0.061401,-0.278295,-1.650444,0.759072,-0.420487,...,-0.798376,1.854728,0.622631,0.963083,0.301464,0.193113,0.231498,-0.289786,-0.493294,0
1,0.110729,-0.079903,-0.20364,1.268942,0.396246,-0.061401,-0.278295,-1.71027,0.759072,-0.420487,...,-0.798376,1.854728,0.278625,0.963083,0.301464,0.129265,0.093563,-0.256385,-0.493294,0
2,0.216546,-0.272445,-0.20364,1.050988,0.148753,-0.061401,-0.278295,-1.71027,1.106509,-0.420487,...,-1.332931,0.314688,2.342663,-0.488279,0.061002,0.065417,0.024596,-0.256385,1.140018,0
3,0.004912,0.727346,-0.20364,1.212171,-0.683598,-0.061401,-0.278295,-1.71027,1.106509,-0.420487,...,-1.332931,0.314688,1.65465,-0.488279,0.061002,0.193113,0.093563,-0.323186,1.140018,0
4,-0.100905,0.363595,1.321366,1.02712,0.141359,-0.061401,-0.278295,-0.992364,-0.051613,-0.420487,...,-0.085638,-0.565334,0.278625,-0.488279,-0.059229,0.065417,0.024596,-0.456787,1.140018,0


### Deep Autoencoder

In [23]:
features = data.drop(columns = ['y'])
target = data['y']

In [24]:
# min max scale the input data
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [25]:
class AutoEncoder(Model):
    """Symmetric dense autoencoder that reconstructs its own input.

    The encoder is a stack of ``Dense`` layers (sigmoid on the first layer,
    ReLU afterwards) separated by ``Dropout``. The decoder mirrors the encoder
    widths back up and ends in a sigmoid layer with ``output_units`` units.

    Parameters
    ----------
    output_units : int
        Number of input features; also the width of the reconstruction.
    hidden_units : sequence of int, optional
        Encoder layer widths, outermost first; the last entry is the
        bottleneck. The default ``(10, 5, 3)`` reproduces the architecture
        that was previously hard-coded here.
    dropout_rate : float, optional
        Rate passed to every ``Dropout`` layer (default ``0.1``).

    Raises
    ------
    ValueError
        If ``hidden_units`` is empty.
    """

    def __init__(self, output_units, hidden_units=(10, 5, 3), dropout_rate=0.1):
        super().__init__()
        hidden_units = tuple(hidden_units)
        if not hidden_units:
            raise ValueError("hidden_units must contain at least one layer width")

        # Encoder: Dense -> Dropout -> Dense ... (no dropout after the bottleneck).
        encoder_layers = [
            Dense(hidden_units[0], input_dim=output_units, activation='sigmoid')
        ]
        for width in hidden_units[1:]:
            encoder_layers.append(Dropout(dropout_rate))
            encoder_layers.append(Dense(width, activation='relu'))
        self.encoder = Sequential(encoder_layers)

        # Decoder: encoder widths in reverse (bottleneck excluded), each
        # followed by dropout, then the sigmoid reconstruction layer.
        decoder_layers = []
        for width in reversed(hidden_units[:-1]):
            decoder_layers.append(Dense(width, activation='relu'))
            decoder_layers.append(Dropout(dropout_rate))
        decoder_layers.append(Dense(output_units, activation='sigmoid'))
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Encode ``inputs`` and return the decoder's reconstruction."""
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        return decoded

In [26]:
def find_threshold(model, x_train_scaled):
    """Return the anomaly-score cut-off: mean + one std of the reconstruction errors.

    ``model.predict`` is run on ``x_train_scaled`` and the MSLE between the
    reconstruction and the input is taken per instance; the threshold is the
    mean of those errors plus their standard deviation.
    """
    reconstructed = model.predict(x_train_scaled)
    # One loss value per instance, converted to NumPy a single time.
    per_instance_error = tf.keras.losses.msle(reconstructed, x_train_scaled).numpy()
    return np.mean(per_instance_error) + np.std(per_instance_error)

def get_predictions(model, x_train_scaled, threshold):
    """Flag instances whose reconstruction error exceeds ``threshold``.

    Parameters
    ----------
    model : object exposing ``predict``
        Called as ``model.predict(x_train_scaled)`` to obtain reconstructions.
    x_train_scaled : array-like
        Data to score.
    threshold : float
        Errors strictly greater than this value are labelled anomalous.

    Returns
    -------
    preds : pandas.Series of float
        ``1.0`` for anomalies, ``0.0`` for normal instances.
    errors : value returned by ``tf.keras.losses.msle``
        Per-instance reconstruction errors, returned unchanged.
    """
    predictions = model.predict(x_train_scaled)
    # provides losses of individual instances
    errors = tf.keras.losses.msle(predictions, x_train_scaled)
    # Convert to a plain ndarray first so the Series is numeric and the
    # comparison yields a boolean mask.
    anomaly_mask = pd.Series(np.asarray(errors)) > threshold
    # 1 = anomaly, 0 = normal (vectorised cast instead of a per-element lambda)
    preds = anomaly_mask.astype(float)
    return preds, errors

In [27]:
# Seed NumPy and TensorFlow before the model is built so that repeated
# "Restart & Run All" executions start from the same random state
# (repeatability is still subject to what the backend guarantees).
np.random.seed(42)
tf.random.set_seed(42)

# time.process_time() measures CPU time of this process, not wall-clock time.
start = time.process_time()

model = AutoEncoder(output_units=x_train_scaled.shape[1])

model.compile(loss='msle', metrics=['mse'], optimizer='adam')

# The autoencoder is fitted to reproduce its own input.
history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=64,
)

end = time.process_time()
cardio_deep_autoencoders_train_time = end - start
print(end - start)

# Scoring phase: derive the threshold and label every row of the same data.
start = time.process_time()
threshold = find_threshold(model, x_train_scaled)
print(f"Threshold: {threshold}")

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
cardio_deep_autoencoders_test_time = end - start
print(end - start)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
10.3125
Threshold: 0.02448342176878049
1.25


In [28]:
confusion_matrix(target, predictions)

array([[1536,  119],
       [  46,  130]], dtype=int64)

In [29]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
cardio_deep_autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

0.9459763801153529

In [30]:
cardio_deep_autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

              precision    recall  f1-score   support

           0       0.97      0.93      0.95      1655
           1       0.52      0.74      0.61       176

    accuracy                           0.91      1831
   macro avg       0.75      0.83      0.78      1831
weighted avg       0.93      0.91      0.92      1831



In [31]:
precision, recall, thresholds = precision_recall_curve(target, scores)
cardio_deep_ae_auc_precision_recall = metrics.auc(recall, precision)
print(cardio_deep_ae_auc_precision_recall)

0.6714710076728642


## ForestCover

**Dataset source**: http://odds.cs.stonybrook.edu/forestcovercovertype-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources:**

Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation forest.” 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

K. M. Ting, J. T. S. Chuan, and F. T. Liu. “Mass: A New Ranking Measure for Anomaly Detection.” IEEE Transactions on Knowledge and Data Engineering, 2009.

Kai Ming Ting, Guang-Tong Zhou, Fei Tony Liu & Tan Swee Chuan. (2010). Mass Estimation and Its Applications. Proceedings of The 16th ACM SIGKDD Conference on Knowledge Discovery and Data Mining 2010. pp. 989-998.

Swee Chuan Tan, Kai Ming Ting & Fei Tony Liu. (2011). Fast Anomaly Detection for Streaming Data. Proceedings of the International Joint Conference on Artificial Intelligence 2011. pp.1151-1156.

In [32]:
data = pd.read_csv('./ForestCover.csv')

In [33]:
data.shape

(286048, 11)

In [34]:
pd.pivot_table(data,
             values = 'Col2',
               index = 'y', 
              aggfunc = 'count')

Unnamed: 0_level_0,Col2
y,Unnamed: 1_level_1
0,283301
1,2747


In [35]:
data.head()

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10,y
0,2804,139,9,268,65,3180,234,238,135,6121,0
1,2785,155,18,242,118,3090,238,238,122,6211,0
2,2579,132,6,300,-15,67,230,237,140,6031,0
3,2886,151,11,371,26,5253,234,240,136,4051,0
4,2742,134,22,150,69,3215,248,224,92,6091,0


### Deep Autoencoder

In [36]:
features = data.drop(columns = ['y'])
target = data['y']

In [37]:
# min max scale the input data
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [38]:
class AutoEncoder(Model):
    """Symmetric dense autoencoder that reconstructs its own input.

    The encoder is a stack of ``Dense`` layers (sigmoid on the first layer,
    ReLU afterwards) separated by ``Dropout``. The decoder mirrors the encoder
    widths back up and ends in a sigmoid layer with ``output_units`` units.

    Parameters
    ----------
    output_units : int
        Number of input features; also the width of the reconstruction.
    hidden_units : sequence of int, optional
        Encoder layer widths, outermost first; the last entry is the
        bottleneck. The default ``(5, 3, 3)`` reproduces the architecture
        that was previously hard-coded here.
    dropout_rate : float, optional
        Rate passed to every ``Dropout`` layer (default ``0.1``).

    Raises
    ------
    ValueError
        If ``hidden_units`` is empty.
    """

    def __init__(self, output_units, hidden_units=(5, 3, 3), dropout_rate=0.1):
        super().__init__()
        hidden_units = tuple(hidden_units)
        if not hidden_units:
            raise ValueError("hidden_units must contain at least one layer width")

        # Encoder: Dense -> Dropout -> Dense ... (no dropout after the bottleneck).
        encoder_layers = [
            Dense(hidden_units[0], input_dim=output_units, activation='sigmoid')
        ]
        for width in hidden_units[1:]:
            encoder_layers.append(Dropout(dropout_rate))
            encoder_layers.append(Dense(width, activation='relu'))
        self.encoder = Sequential(encoder_layers)

        # Decoder: encoder widths in reverse (bottleneck excluded), each
        # followed by dropout, then the sigmoid reconstruction layer.
        decoder_layers = []
        for width in reversed(hidden_units[:-1]):
            decoder_layers.append(Dense(width, activation='relu'))
            decoder_layers.append(Dropout(dropout_rate))
        decoder_layers.append(Dense(output_units, activation='sigmoid'))
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Encode ``inputs`` and return the decoder's reconstruction."""
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        return decoded

In [39]:
def find_threshold(model, x_train_scaled):
    """Return the anomaly-score cut-off: mean + one std of the reconstruction errors.

    ``model.predict`` is run on ``x_train_scaled`` and the MSLE between the
    reconstruction and the input is taken per instance; the threshold is the
    mean of those errors plus their standard deviation.
    """
    reconstructed = model.predict(x_train_scaled)
    # One loss value per instance, converted to NumPy a single time.
    per_instance_error = tf.keras.losses.msle(reconstructed, x_train_scaled).numpy()
    return np.mean(per_instance_error) + np.std(per_instance_error)

def get_predictions(model, x_train_scaled, threshold):
    """Flag instances whose reconstruction error exceeds ``threshold``.

    Parameters
    ----------
    model : object exposing ``predict``
        Called as ``model.predict(x_train_scaled)`` to obtain reconstructions.
    x_train_scaled : array-like
        Data to score.
    threshold : float
        Errors strictly greater than this value are labelled anomalous.

    Returns
    -------
    preds : pandas.Series of float
        ``1.0`` for anomalies, ``0.0`` for normal instances.
    errors : value returned by ``tf.keras.losses.msle``
        Per-instance reconstruction errors, returned unchanged.
    """
    predictions = model.predict(x_train_scaled)
    # provides losses of individual instances
    errors = tf.keras.losses.msle(predictions, x_train_scaled)
    # Convert to a plain ndarray first so the Series is numeric and the
    # comparison yields a boolean mask.
    anomaly_mask = pd.Series(np.asarray(errors)) > threshold
    # 1 = anomaly, 0 = normal (vectorised cast instead of a per-element lambda)
    preds = anomaly_mask.astype(float)
    return preds, errors

In [40]:
# Seed NumPy and TensorFlow before the model is built so that repeated
# "Restart & Run All" executions start from the same random state
# (repeatability is still subject to what the backend guarantees).
np.random.seed(42)
tf.random.set_seed(42)

# time.process_time() measures CPU time of this process, not wall-clock time.
start = time.process_time()

model = AutoEncoder(output_units=x_train_scaled.shape[1])

model.compile(loss='msle', metrics=['mse'], optimizer='adam')

# The autoencoder is fitted to reproduce its own input.
history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=512,
)

end = time.process_time()
forestcover_deep_autoencoders_train_time = end - start
print(end - start)

# Scoring phase: derive the threshold and label every row of the same data.
start = time.process_time()

threshold = find_threshold(model, x_train_scaled)
print(f"Threshold: {threshold}")

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
forestcover_deep_autoencoders_test_time = end - start
print(end - start)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
138.546875
Threshold: 0.013307273530092523
73.125


In [41]:
confusion_matrix(target, predictions)

array([[243271,  40030],
       [   509,   2238]], dtype=int64)

In [42]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
forestcover_deep_autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

0.9161161756783036

In [43]:
forestcover_deep_autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

              precision    recall  f1-score   support

           0       1.00      0.86      0.92    283301
           1       0.05      0.81      0.10      2747

    accuracy                           0.86    286048
   macro avg       0.53      0.84      0.51    286048
weighted avg       0.99      0.86      0.92    286048



In [44]:
precision, recall, thresholds = precision_recall_curve(target, scores)
forestcover_deep_ae_auc_precision_recall = metrics.auc(recall, precision)
print(forestcover_deep_ae_auc_precision_recall)

0.09293236535045243


## Annthyroid

**Dataset source**: http://odds.cs.stonybrook.edu/annthyroid-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources:**

Abe, Naoki, Bianca Zadrozny, and John Langford. “Outlier detection by active learning.” Proceedings of the 12th ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2006.

Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation forest.” 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

K. M. Ting, J. T. S. Chuan, and F. T. Liu. “Mass: A New Ranking Measure for Anomaly Detection.” IEEE Transactions on Knowledge and Data Engineering, 2009.

In [45]:
data = pd.read_csv('./annthyroid.csv')

In [46]:
data.head()

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,y
0,0.73,0.0006,0.015,0.12,0.082,0.146,0
1,0.24,0.00025,0.03,0.143,0.133,0.108,0
2,0.47,0.0019,0.024,0.102,0.131,0.078,0
3,0.64,0.0009,0.017,0.077,0.09,0.085,0
4,0.23,0.00025,0.026,0.139,0.09,0.153,0


In [47]:
data.shape

(7200, 7)

In [48]:
pd.pivot_table(data,
             values = 'Col1',
               index = 'y', 
              aggfunc = 'count')

Unnamed: 0_level_0,Col1
y,Unnamed: 1_level_1
0,6666
1,534


### Deep Autoencoder

In [49]:
features = data.drop(columns = ['y'])
target = data['y']

In [50]:
# min max scale the input data
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [51]:
class AutoEncoder(Model):
    """Symmetric dense autoencoder that reconstructs its own input.

    The encoder is a stack of ``Dense`` layers (sigmoid on the first layer,
    ReLU afterwards) separated by ``Dropout``. The decoder mirrors the encoder
    widths back up and ends in a sigmoid layer with ``output_units`` units.

    Parameters
    ----------
    output_units : int
        Number of input features; also the width of the reconstruction.
    hidden_units : sequence of int, optional
        Encoder layer widths, outermost first; the last entry is the
        bottleneck. The default ``(3, 3, 3)`` reproduces the architecture
        that was previously hard-coded here.
    dropout_rate : float, optional
        Rate passed to every ``Dropout`` layer (default ``0.1``).

    Raises
    ------
    ValueError
        If ``hidden_units`` is empty.
    """

    def __init__(self, output_units, hidden_units=(3, 3, 3), dropout_rate=0.1):
        super().__init__()
        hidden_units = tuple(hidden_units)
        if not hidden_units:
            raise ValueError("hidden_units must contain at least one layer width")

        # Encoder: Dense -> Dropout -> Dense ... (no dropout after the bottleneck).
        encoder_layers = [
            Dense(hidden_units[0], input_dim=output_units, activation='sigmoid')
        ]
        for width in hidden_units[1:]:
            encoder_layers.append(Dropout(dropout_rate))
            encoder_layers.append(Dense(width, activation='relu'))
        self.encoder = Sequential(encoder_layers)

        # Decoder: encoder widths in reverse (bottleneck excluded), each
        # followed by dropout, then the sigmoid reconstruction layer.
        decoder_layers = []
        for width in reversed(hidden_units[:-1]):
            decoder_layers.append(Dense(width, activation='relu'))
            decoder_layers.append(Dropout(dropout_rate))
        decoder_layers.append(Dense(output_units, activation='sigmoid'))
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Encode ``inputs`` and return the decoder's reconstruction."""
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        return decoded

In [52]:
def find_threshold(model, x_train_scaled):
    """Return the anomaly-score cut-off: mean + one std of the reconstruction errors.

    ``model.predict`` is run on ``x_train_scaled`` and the MSLE between the
    reconstruction and the input is taken per instance; the threshold is the
    mean of those errors plus their standard deviation.
    """
    reconstructed = model.predict(x_train_scaled)
    # One loss value per instance, converted to NumPy a single time.
    per_instance_error = tf.keras.losses.msle(reconstructed, x_train_scaled).numpy()
    return np.mean(per_instance_error) + np.std(per_instance_error)

def get_predictions(model, x_train_scaled, threshold):
    """Flag instances whose reconstruction error exceeds ``threshold``.

    Parameters
    ----------
    model : object exposing ``predict``
        Called as ``model.predict(x_train_scaled)`` to obtain reconstructions.
    x_train_scaled : array-like
        Data to score.
    threshold : float
        Errors strictly greater than this value are labelled anomalous.

    Returns
    -------
    preds : pandas.Series of float
        ``1.0`` for anomalies, ``0.0`` for normal instances.
    errors : value returned by ``tf.keras.losses.msle``
        Per-instance reconstruction errors, returned unchanged.
    """
    predictions = model.predict(x_train_scaled)
    # provides losses of individual instances
    errors = tf.keras.losses.msle(predictions, x_train_scaled)
    # Convert to a plain ndarray first so the Series is numeric and the
    # comparison yields a boolean mask.
    anomaly_mask = pd.Series(np.asarray(errors)) > threshold
    # 1 = anomaly, 0 = normal (vectorised cast instead of a per-element lambda)
    preds = anomaly_mask.astype(float)
    return preds, errors

In [53]:
# Seed NumPy and TensorFlow before the model is built so that repeated
# "Restart & Run All" executions start from the same random state
# (repeatability is still subject to what the backend guarantees).
np.random.seed(42)
tf.random.set_seed(42)

# time.process_time() measures CPU time of this process, not wall-clock time.
start = time.process_time()

model = AutoEncoder(output_units=x_train_scaled.shape[1])

model.compile(loss='msle', metrics=['mse'], optimizer='adam')

# The autoencoder is fitted to reproduce its own input.
history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=512,
)

end = time.process_time()
annthyroid_deep_autoencoders_train_time = end - start
print(end - start)

# Scoring phase: derive the threshold and label every row of the same data.
start = time.process_time()

threshold = find_threshold(model, x_train_scaled)
print(f"Threshold: {threshold}")

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
annthyroid_deep_autoencoders_test_time = end - start
print(end - start)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
1.890625
Threshold: 0.03053629603437302
0.8125


In [54]:
confusion_matrix(target, predictions)

array([[5942,  724],
       [ 365,  169]], dtype=int64)

In [55]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
annthyroid_deep_autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

0.6818476791499375

In [56]:
annthyroid_deep_autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

              precision    recall  f1-score   support

           0       0.94      0.89      0.92      6666
           1       0.19      0.32      0.24       534

    accuracy                           0.85      7200
   macro avg       0.57      0.60      0.58      7200
weighted avg       0.89      0.85      0.87      7200



In [57]:
precision, recall, thresholds = precision_recall_curve(target, scores)
annthyroid_deep_ae_auc_precision_recall = metrics.auc(recall, precision)
print(annthyroid_deep_ae_auc_precision_recall)

0.19804444580417363


## Credit card

**Dataset source**: https://www.kaggle.com/mlg-ulb/creditcardfraud

**Additional sources:**

Andrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015

Dal Pozzolo, Andrea; Caelen, Olivier; Le Borgne, Yann-Ael; Waterschoot, Serge; Bontempi, Gianluca. Learned lessons in credit card fraud detection from a practitioner perspective, Expert systems with applications,41,10,4915-4928,2014, Pergamon

Dal Pozzolo, Andrea; Boracchi, Giacomo; Caelen, Olivier; Alippi, Cesare; Bontempi, Gianluca. Credit card fraud detection: a realistic modeling and a novel learning strategy, IEEE transactions on neural networks and learning systems,29,8,3784-3797,2018,IEEE

Dal Pozzolo, Andrea Adaptive Machine learning for credit card fraud detection ULB MLG PhD thesis (supervised by G. Bontempi)

Carcillo, Fabrizio; Dal Pozzolo, Andrea; Le Borgne, Yann-Aël; Caelen, Olivier; Mazzer, Yannis; Bontempi, Gianluca. Scarff: a scalable framework for streaming credit card fraud detection with Spark, Information fusion,41, 182-194,2018,Elsevier

Carcillo, Fabrizio; Le Borgne, Yann-Aël; Caelen, Olivier; Bontempi, Gianluca. Streaming active learning strategies for real-life credit card fraud detection: assessment and visualization, International Journal of Data Science and Analytics, 5,4,285-300,2018,Springer International Publishing

Bertrand Lebichot, Yann-Aël Le Borgne, Liyun He, Frederic Oblé, Gianluca Bontempi Deep-Learning Domain Adaptation Techniques for Credit Cards Fraud Detection, INNSBDDL 2019: Recent Advances in Big Data and Deep Learning, pp 78-88, 2019

Fabrizio Carcillo, Yann-Aël Le Borgne, Olivier Caelen, Frederic Oblé, Gianluca Bontempi Combining Unsupervised and Supervised Learning in Credit Card Fraud Detection Information Sciences, 2019

Yann-Aël Le Borgne, Gianluca Bontempi Machine Learning for Credit Card Fraud Detection - Practical Handbook

In [58]:
data = pd.read_csv('./creditcard.csv')

In [59]:
data = data.drop(columns = ['Time'])

In [60]:
data.shape

(284807, 30)

In [61]:
data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [62]:
pd.pivot_table(data,
             values = 'V1',
               index = 'Class', 
              aggfunc = 'count')

Unnamed: 0_level_0,V1
Class,Unnamed: 1_level_1
0,284315
1,492


### Deep Autoencoder

In [63]:
features = data.drop(columns = ['Class'])
target = data['Class']

In [64]:
# min max scale the input data
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [65]:
class AutoEncoder(Model):
    """Symmetric dense autoencoder that reconstructs its own input.

    The encoder is a stack of ``Dense`` layers (sigmoid on the first layer,
    ReLU afterwards) separated by ``Dropout``. The decoder mirrors the encoder
    widths back up and ends in a sigmoid layer with ``output_units`` units.

    Parameters
    ----------
    output_units : int
        Number of input features; also the width of the reconstruction.
    hidden_units : sequence of int, optional
        Encoder layer widths, outermost first; the last entry is the
        bottleneck. The default ``(14, 7, 3)`` reproduces the architecture
        that was previously hard-coded here.
    dropout_rate : float, optional
        Rate passed to every ``Dropout`` layer (default ``0.1``).

    Raises
    ------
    ValueError
        If ``hidden_units`` is empty.
    """

    def __init__(self, output_units, hidden_units=(14, 7, 3), dropout_rate=0.1):
        super().__init__()
        hidden_units = tuple(hidden_units)
        if not hidden_units:
            raise ValueError("hidden_units must contain at least one layer width")

        # Encoder: Dense -> Dropout -> Dense ... (no dropout after the bottleneck).
        encoder_layers = [
            Dense(hidden_units[0], input_dim=output_units, activation='sigmoid')
        ]
        for width in hidden_units[1:]:
            encoder_layers.append(Dropout(dropout_rate))
            encoder_layers.append(Dense(width, activation='relu'))
        self.encoder = Sequential(encoder_layers)

        # Decoder: encoder widths in reverse (bottleneck excluded), each
        # followed by dropout, then the sigmoid reconstruction layer.
        decoder_layers = []
        for width in reversed(hidden_units[:-1]):
            decoder_layers.append(Dense(width, activation='relu'))
            decoder_layers.append(Dropout(dropout_rate))
        decoder_layers.append(Dense(output_units, activation='sigmoid'))
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Encode ``inputs`` and return the decoder's reconstruction."""
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        return decoded

In [66]:
def find_threshold(model, x_train_scaled):
    """Return the anomaly-score cut-off: mean + one std of the reconstruction errors.

    ``model.predict`` is run on ``x_train_scaled`` and the MSLE between the
    reconstruction and the input is taken per instance; the threshold is the
    mean of those errors plus their standard deviation.
    """
    reconstructed = model.predict(x_train_scaled)
    # One loss value per instance, converted to NumPy a single time.
    per_instance_error = tf.keras.losses.msle(reconstructed, x_train_scaled).numpy()
    return np.mean(per_instance_error) + np.std(per_instance_error)

def get_predictions(model, x_train_scaled, threshold):
    """Flag instances whose reconstruction error exceeds ``threshold``.

    Parameters
    ----------
    model : object exposing ``predict``
        Called as ``model.predict(x_train_scaled)`` to obtain reconstructions.
    x_train_scaled : array-like
        Data to score.
    threshold : float
        Errors strictly greater than this value are labelled anomalous.

    Returns
    -------
    preds : pandas.Series of float
        ``1.0`` for anomalies, ``0.0`` for normal instances.
    errors : value returned by ``tf.keras.losses.msle``
        Per-instance reconstruction errors, returned unchanged.
    """
    predictions = model.predict(x_train_scaled)
    # provides losses of individual instances
    errors = tf.keras.losses.msle(predictions, x_train_scaled)
    # Convert to a plain ndarray first so the Series is numeric and the
    # comparison yields a boolean mask.
    anomaly_mask = pd.Series(np.asarray(errors)) > threshold
    # 1 = anomaly, 0 = normal (vectorised cast instead of a per-element lambda)
    preds = anomaly_mask.astype(float)
    return preds, errors

In [67]:
# Seed NumPy and TensorFlow before the model is built so that repeated
# "Restart & Run All" executions start from the same random state
# (repeatability is still subject to what the backend guarantees).
np.random.seed(42)
tf.random.set_seed(42)

# time.process_time() measures CPU time of this process, not wall-clock time.
start = time.process_time()

model = AutoEncoder(output_units=x_train_scaled.shape[1])

model.compile(loss='msle', metrics=['mse'], optimizer='adam')

# The autoencoder is fitted to reproduce its own input.
history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=512,
)

end = time.process_time()
creditcard_deep_autoencoders_train_time = end - start
print(end - start)

# Scoring phase: derive the threshold and label every row of the same data.
start = time.process_time()

threshold = find_threshold(model, x_train_scaled)
print(f"Threshold: {threshold}")

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
creditcard_deep_autoencoders_test_time = end - start
print(end - start)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
50.9375
Threshold: 0.0016509321046353997
26.21875


In [68]:
confusion_matrix(target, predictions)

array([[272342,  11973],
       [    75,    417]], dtype=int64)

In [69]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
creditcard_deep_autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

0.9516381263824948

In [70]:
creditcard_deep_autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

              precision    recall  f1-score   support

           0       1.00      0.96      0.98    284315
           1       0.03      0.85      0.06       492

    accuracy                           0.96    284807
   macro avg       0.52      0.90      0.52    284807
weighted avg       1.00      0.96      0.98    284807



In [71]:
precision, recall, thresholds = precision_recall_curve(target, scores)
creditcard_deep_ae_auc_precision_recall = metrics.auc(recall, precision)
print(creditcard_deep_ae_auc_precision_recall)

0.4081568874565893


## Mammography

**Dataset source**: http://odds.cs.stonybrook.edu/mammography-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources:**

Abe, Naoki, Bianca Zadrozny, and John Langford. “Outlier detection by active learning.” Proceedings of the 12th ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2006.

Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation forest.” 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

K. M. Ting, J. T. S. Chuan, and F. T. Liu. “Mass: A New Ranking Measure for Anomaly Detection.” IEEE Transactions on Knowledge and Data Engineering, 2009.

In [72]:
data = pd.read_csv('./mammography.csv')

In [73]:
data.head()

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,y
0,0.23002,5.072578,-0.276061,0.832444,-0.377866,0.480322,0
1,0.155491,-0.16939,0.670652,-0.859553,-0.377866,-0.945723,0
2,-0.784415,-0.443654,5.674705,-0.859553,-0.377866,-0.945723,0
3,0.546088,0.131415,-0.456387,-0.859553,-0.377866,-0.945723,0
4,-0.102987,-0.394994,-0.140816,0.979703,-0.377866,1.013566,0


In [74]:
pd.pivot_table(data,
             values = 'Col1',
               index = 'y', 
              aggfunc = 'count')

Unnamed: 0_level_0,Col1
y,Unnamed: 1_level_1
0,10923
1,260


In [75]:
data.shape

(11183, 7)

### Deep Autoencoder

In [76]:
features = data.drop(columns = ['y'])
target = data['y']

In [77]:
# min max scale the input data
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [78]:
class AutoEncoder(Model):
    """Symmetric dense autoencoder that reconstructs its own input.

    The encoder is a stack of ``Dense`` layers (sigmoid on the first layer,
    ReLU afterwards) separated by ``Dropout``. The decoder mirrors the encoder
    widths back up and ends in a sigmoid layer with ``output_units`` units.

    Parameters
    ----------
    output_units : int
        Number of input features; also the width of the reconstruction.
    hidden_units : sequence of int, optional
        Encoder layer widths, outermost first; the last entry is the
        bottleneck. The default ``(3, 3, 3)`` reproduces the architecture
        that was previously hard-coded here.
    dropout_rate : float, optional
        Rate passed to every ``Dropout`` layer (default ``0.1``).

    Raises
    ------
    ValueError
        If ``hidden_units`` is empty.
    """

    def __init__(self, output_units, hidden_units=(3, 3, 3), dropout_rate=0.1):
        super().__init__()
        hidden_units = tuple(hidden_units)
        if not hidden_units:
            raise ValueError("hidden_units must contain at least one layer width")

        # Encoder: Dense -> Dropout -> Dense ... (no dropout after the bottleneck).
        encoder_layers = [
            Dense(hidden_units[0], input_dim=output_units, activation='sigmoid')
        ]
        for width in hidden_units[1:]:
            encoder_layers.append(Dropout(dropout_rate))
            encoder_layers.append(Dense(width, activation='relu'))
        self.encoder = Sequential(encoder_layers)

        # Decoder: encoder widths in reverse (bottleneck excluded), each
        # followed by dropout, then the sigmoid reconstruction layer.
        decoder_layers = []
        for width in reversed(hidden_units[:-1]):
            decoder_layers.append(Dense(width, activation='relu'))
            decoder_layers.append(Dropout(dropout_rate))
        decoder_layers.append(Dense(output_units, activation='sigmoid'))
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Encode ``inputs`` and return the decoder's reconstruction."""
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        return decoded

In [79]:
def find_threshold(model, x_train_scaled):
    """Return the anomaly-score cut-off: mean + one std of the reconstruction errors.

    ``model.predict`` is run on ``x_train_scaled`` and the MSLE between the
    reconstruction and the input is taken per instance; the threshold is the
    mean of those errors plus their standard deviation.
    """
    reconstructed = model.predict(x_train_scaled)
    # One loss value per instance, converted to NumPy a single time.
    per_instance_error = tf.keras.losses.msle(reconstructed, x_train_scaled).numpy()
    return np.mean(per_instance_error) + np.std(per_instance_error)

def get_predictions(model, x_train_scaled, threshold):
    """Flag instances whose reconstruction error exceeds ``threshold``.

    Args:
        model: fitted autoencoder exposing ``predict``.
        x_train_scaled: scaled feature matrix to score.
        threshold: cut-off; errors strictly greater than it count as anomalies.

    Returns:
        ``(preds, errors)`` where ``preds`` is a pandas Series holding 1.0 for
        an anomaly and 0.0 for a normal instance, and ``errors`` is the tensor
        of per-instance MSLE values (usable as a continuous anomaly score).
    """
    predictions = model.predict(x_train_scaled)
    # provides losses of individual instances
    errors = tf.keras.losses.msle(predictions, x_train_scaled)
    # Convert to NumPy explicitly instead of relying on pandas to coerce a tensor.
    anomaly_mask = pd.Series(errors.numpy()) > threshold
    # 1 = anomaly, 0 = normal (vectorized cast instead of a per-element lambda)
    preds = anomaly_mask.astype(float)
    return preds, errors

In [80]:
# --- Training phase (timed) ---
# NOTE: time.process_time() counts CPU time of this process, not wall-clock time.
start = time.process_time()

# The network's input/output width is the number of scaled feature columns.
model = AutoEncoder(output_units=x_train_scaled.shape[1])

# Optimise mean squared logarithmic error; plain MSE is tracked as a metric only.
model.compile(loss='msle', metrics=['mse'], optimizer='adam')

# Inputs double as targets: the model is trained to reconstruct its own input.
history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=512,
)

end = time.process_time()
mammography_deep_autoencoders_train_time = end - start
print(end - start)

# --- Scoring phase (timed separately) ---
start = time.process_time()

# Threshold and predictions are both computed on the same data the model was fitted on.
threshold = find_threshold(model, x_train_scaled)
print(f"Threshold: {threshold}")

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
mammography_deep_autoencoders_test_time = end - start
print(end - start)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
1.828125
Threshold: 0.038420574694002355
1.171875


In [81]:
confusion_matrix(target, predictions)

array([[9645, 1278],
       [  50,  210]], dtype=int64)

In [82]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
mammography_deep_autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

0.881576278706188

In [83]:
mammography_deep_autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

              precision    recall  f1-score   support

           0       0.99      0.88      0.94     10923
           1       0.14      0.81      0.24       260

    accuracy                           0.88     11183
   macro avg       0.57      0.85      0.59     11183
weighted avg       0.97      0.88      0.92     11183



In [84]:
precision, recall, thresholds = precision_recall_curve(target, scores)
mammography_deep_ae_auc_precision_recall = metrics.auc(recall, precision)
print(mammography_deep_ae_auc_precision_recall)

0.18556587877781144


## Shuttle

**Dataset source**: http://odds.cs.stonybrook.edu/shuttle-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources:**

Abe, Naoki, Bianca Zadrozny, and John Langford. “Outlier detection by active learning.” Proceedings of the 12th ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2006.

Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation forest.” 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

K. M. Ting, J. T. S. Chuan, and F. T. Liu. “Mass: A New Ranking Measure for Anomaly Detection.” IEEE Transactions on Knowledge and Data Engineering, 2009.

Kai Ming Ting, Guang-Tong Zhou, Fei Tony Liu & Tan Swee Chuan. (2010). Mass Estimation and Its Applications. Proceedings of The 16th ACM SIGKDD Conference on Knowledge Discovery and Data Mining 2010. pp. 989-998.

Swee Chuan Tan, Kai Ming Ting & Fei Tony Liu. (2011). Fast Anomaly Detection for Streaming Data. Proceedings of the International Joint Conference on Artificial Intelligence 2011. pp.1151-1156.

In [85]:
data = pd.read_csv('./shuttle.csv', sep = ',')

In [86]:
data.head()

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,y
0,50,21,77,0,28,0,27,48,22,1
1,53,0,82,0,52,-5,29,30,2,0
2,37,0,76,0,28,18,40,48,8,0
3,37,0,79,0,34,-26,43,46,2,0
4,85,0,88,-4,6,1,3,83,80,1


In [87]:
data.shape

(49097, 10)

In [88]:
pd.pivot_table(data,
             values = 'Col1',
               index = 'y', 
              aggfunc = 'count')

Unnamed: 0_level_0,Col1
y,Unnamed: 1_level_1
0,45586
1,3511


### Deep Autoencoder

In [89]:
features = data.drop(columns = ['y'])
target = data['y']

In [90]:
# min max scale the input data
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [91]:
class AutoEncoder(Model):
    """Dense autoencoder for the shuttle features.

    The encoder maps ``output_units`` inputs through Dense layers of width
    5 -> 3 -> 3; the decoder goes 3 -> 5 -> ``output_units`` and ends with a
    sigmoid activation. Dropout (rate 0.1) sits between the hidden layers.
    """

    def __init__(self, output_units):
        """Build the encoder and decoder stacks.

        Args:
            output_units: number of input features, which is also the width
                of the reconstructed output.
        """
        super().__init__()

        encoder_layers = [
            Dense(5, input_dim=output_units, activation='sigmoid'),
            Dropout(0.1),
            Dense(3, activation='relu'),
            Dropout(0.1),
            Dense(3, activation='relu'),
        ]
        self.encoder = Sequential(encoder_layers)

        decoder_layers = [
            Dense(3, activation='relu'),
            Dropout(0.1),
            Dense(5, activation='relu'),
            Dropout(0.1),
            Dense(output_units, activation='sigmoid'),
        ]
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Return the reconstruction of ``inputs`` (encode, then decode)."""
        return self.decoder(self.encoder(inputs))

In [92]:
def find_threshold(model, x_train_scaled):
    """Return the anomaly cut-off for ``model`` on ``x_train_scaled``.

    The cut-off is the mean plus one standard deviation of the per-instance
    MSLE between the model's reconstructions and the inputs.
    """
    rebuilt = model.predict(x_train_scaled)
    # one loss value per instance, pulled out of the tensor a single time
    instance_errors = tf.keras.losses.msle(rebuilt, x_train_scaled).numpy()
    return np.mean(instance_errors) + np.std(instance_errors)

def get_predictions(model, x_train_scaled, threshold):
    """Flag instances whose reconstruction error exceeds ``threshold``.

    Args:
        model: fitted autoencoder exposing ``predict``.
        x_train_scaled: scaled feature matrix to score.
        threshold: cut-off; errors strictly greater than it count as anomalies.

    Returns:
        ``(preds, errors)`` where ``preds`` is a pandas Series holding 1.0 for
        an anomaly and 0.0 for a normal instance, and ``errors`` is the tensor
        of per-instance MSLE values (usable as a continuous anomaly score).
    """
    predictions = model.predict(x_train_scaled)
    # provides losses of individual instances
    errors = tf.keras.losses.msle(predictions, x_train_scaled)
    # Convert to NumPy explicitly instead of relying on pandas to coerce a tensor.
    anomaly_mask = pd.Series(errors.numpy()) > threshold
    # 1 = anomaly, 0 = normal (vectorized cast instead of a per-element lambda)
    preds = anomaly_mask.astype(float)
    return preds, errors

In [93]:
# --- Training phase (timed) ---
# NOTE: time.process_time() counts CPU time of this process, not wall-clock time.
start = time.process_time()

# The network's input/output width is the number of scaled feature columns.
model = AutoEncoder(output_units=x_train_scaled.shape[1])

# Optimise mean squared logarithmic error; plain MSE is tracked as a metric only.
model.compile(loss='msle', metrics=['mse'], optimizer='adam')

# Inputs double as targets: the model is trained to reconstruct its own input.
history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=512,
)

end = time.process_time()
shuttle_deep_autoencoders_train_time = end - start
print(end - start)

# --- Scoring phase (timed separately) ---
start = time.process_time()

# Threshold and predictions are both computed on the same data the model was fitted on.
threshold = find_threshold(model, x_train_scaled)
print(f"Threshold: {threshold}")

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
shuttle_deep_autoencoders_test_time = end - start
print(end - start)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
7.546875
Threshold: 0.0017639756695981646
5.40625


In [94]:
confusion_matrix(target, predictions)

array([[41871,  3715],
       [ 1081,  2430]], dtype=int64)

In [95]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
shuttle_deep_autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

0.947003927700049

In [96]:
shuttle_deep_autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

              precision    recall  f1-score   support

           0       0.97      0.92      0.95     45586
           1       0.40      0.69      0.50      3511

    accuracy                           0.90     49097
   macro avg       0.69      0.81      0.72     49097
weighted avg       0.93      0.90      0.91     49097



In [97]:
precision, recall, thresholds = precision_recall_curve(target, scores)
shuttle_deep_ae_auc_precision_recall = metrics.auc(recall, precision)
print(shuttle_deep_ae_auc_precision_recall)

0.5504630082909637


## MNIST

**Dataset source**: http://odds.cs.stonybrook.edu/mnist-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources:**

Bandaragoda, Tharindu R., et al. “Efficient Anomaly Detection by Isolation Using Nearest Neighbour Ensemble.” 2014 IEEE International Conference on Data Mining Workshop. IEEE, 2014.

In [98]:
data = pd.read_csv('./mnist.csv')

In [99]:
data = data.drop(columns = ['Col1','Col4', 'Col7', 'Col22', 'Col27', 'Col29', 'Col38', 'Col41', 'Col51', 'Col53', 'Col54', 'Col61', 'Col62', 'Col71', 'Col73', 'Col79', 'Col87', 'Col88', 'Col89', 'Col90',
'Col92', 'Col100'])

In [100]:
pd.pivot_table(data,
             values = 'Col2',
               index = 'y', 
              aggfunc = 'count')

Unnamed: 0_level_0,Col2
y,Unnamed: 1_level_1
0,6903
1,700


In [101]:
data.shape

(7603, 79)

In [102]:
data.head()

Unnamed: 0,Col2,Col3,Col5,Col6,Col8,Col9,Col10,Col11,Col12,Col13,...,Col86,Col91,Col93,Col94,Col95,Col96,Col97,Col98,Col99,y
0,-73.804153,198.205963,-13.124617,-1.1501,-0.141633,179.24939,114.661163,-80.736702,130.659348,162.649841,...,-15.392716,188.055649,-4.469967,158.381409,-137.100632,27.131416,-2.274633,-0.00065,-12.351267,0
1,-73.804153,197.205963,-13.124617,-1.1501,-0.141633,179.24939,-44.338833,-80.736702,128.659348,190.649841,...,-15.392716,186.055649,-4.469967,123.381416,-137.100632,157.131409,-2.274633,-0.00065,-12.351267,0
2,-73.804153,-53.794033,-13.124617,-1.1501,-0.141633,-73.750618,-44.338833,170.263306,130.659348,46.649849,...,-15.392716,188.055649,-4.469967,157.381409,-137.100632,-93.868584,-2.274633,-0.00065,-12.351267,0
3,-73.804153,86.205963,-13.124617,-1.1501,-0.141633,76.249382,208.661163,107.263298,130.659348,190.649841,...,-15.392716,188.055649,-4.469967,157.381409,-137.100632,74.131416,-2.274633,-0.00065,-12.351267,0
4,-27.804153,199.205963,-13.124617,-1.1501,-0.141633,179.24939,-44.338833,-80.736702,130.659348,91.649849,...,-15.392716,188.055649,-4.469967,22.381416,-137.100632,159.131409,-2.274633,-0.00065,-12.351267,0


### Deep Autoencoder

In [103]:
features = data.drop(columns = ['y'])
target = data['y']

In [104]:
# min max scale the input data
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [105]:
class AutoEncoder(Model):
    """Dense autoencoder for the mnist features.

    The encoder maps ``output_units`` inputs through Dense layers of width
    39 -> 19 -> 10; the decoder goes 19 -> 39 -> ``output_units`` and ends
    with a sigmoid activation. Dropout (rate 0.1) sits between the hidden
    layers.
    """

    def __init__(self, output_units):
        """Build the encoder and decoder stacks.

        Args:
            output_units: number of input features, which is also the width
                of the reconstructed output.
        """
        super().__init__()

        encoder_layers = [
            Dense(39, input_dim=output_units, activation='sigmoid'),
            Dropout(0.1),
            Dense(19, activation='relu'),
            Dropout(0.1),
            Dense(10, activation='relu'),
        ]
        self.encoder = Sequential(encoder_layers)

        decoder_layers = [
            Dense(19, activation='relu'),
            Dropout(0.1),
            Dense(39, activation='relu'),
            Dropout(0.1),
            Dense(output_units, activation='sigmoid'),
        ]
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Return the reconstruction of ``inputs`` (encode, then decode)."""
        return self.decoder(self.encoder(inputs))

In [106]:
def find_threshold(model, x_train_scaled):
    """Return the anomaly cut-off for ``model`` on ``x_train_scaled``.

    The cut-off is the mean plus one standard deviation of the per-instance
    MSLE between the model's reconstructions and the inputs.
    """
    rebuilt = model.predict(x_train_scaled)
    # one loss value per instance, pulled out of the tensor a single time
    instance_errors = tf.keras.losses.msle(rebuilt, x_train_scaled).numpy()
    return np.mean(instance_errors) + np.std(instance_errors)

def get_predictions(model, x_train_scaled, threshold):
    """Flag instances whose reconstruction error exceeds ``threshold``.

    Args:
        model: fitted autoencoder exposing ``predict``.
        x_train_scaled: scaled feature matrix to score.
        threshold: cut-off; errors strictly greater than it count as anomalies.

    Returns:
        ``(preds, errors)`` where ``preds`` is a pandas Series holding 1.0 for
        an anomaly and 0.0 for a normal instance, and ``errors`` is the tensor
        of per-instance MSLE values (usable as a continuous anomaly score).
    """
    predictions = model.predict(x_train_scaled)
    # provides losses of individual instances
    errors = tf.keras.losses.msle(predictions, x_train_scaled)
    # Convert to NumPy explicitly instead of relying on pandas to coerce a tensor.
    anomaly_mask = pd.Series(errors.numpy()) > threshold
    # 1 = anomaly, 0 = normal (vectorized cast instead of a per-element lambda)
    preds = anomaly_mask.astype(float)
    return preds, errors

In [107]:
# --- Training phase (timed) ---
# NOTE: time.process_time() counts CPU time of this process, not wall-clock time.
start = time.process_time()

# The network's input/output width is the number of scaled feature columns.
model = AutoEncoder(output_units=x_train_scaled.shape[1])

# Optimise mean squared logarithmic error; plain MSE is tracked as a metric only.
model.compile(loss='msle', metrics=['mse'], optimizer='adam')

# Inputs double as targets: the model is trained to reconstruct its own input.
history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=512,
)

end = time.process_time()
mnist_deep_autoencoders_train_time = end - start
print(end - start)

# --- Scoring phase (timed separately) ---
start = time.process_time()

# Threshold and predictions are both computed on the same data the model was fitted on.
threshold = find_threshold(model, x_train_scaled)
print(f"Threshold: {threshold}")

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
mnist_deep_autoencoders_test_time = end - start
print(end - start)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
4.09375
Threshold: 0.044778300030431234
1.171875


In [108]:
confusion_matrix(target, predictions)

array([[6002,  901],
       [ 405,  295]], dtype=int64)

In [109]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
mnist_deep_autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

0.769351420707353

In [110]:
mnist_deep_autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

              precision    recall  f1-score   support

           0       0.94      0.87      0.90      6903
           1       0.25      0.42      0.31       700

    accuracy                           0.83      7603
   macro avg       0.59      0.65      0.61      7603
weighted avg       0.87      0.83      0.85      7603



In [111]:
precision, recall, thresholds = precision_recall_curve(target, scores)
mnist_deep_ae_auc_precision_recall = metrics.auc(recall, precision)
print(mnist_deep_ae_auc_precision_recall)

0.26397821254092063


## Vowels

**Dataset source**: http://odds.cs.stonybrook.edu/japanese-vowels-data/

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources:**

C. C. Aggarwal and S. Sathe, “Theoretical foundations and algorithms for outlier ensembles.” ACM SIGKDD Explorations Newsletter, vol. 17, no. 1, pp. 24–47, 2015.

Saket Sathe and Charu C. Aggarwal. LODES: Local Density meets Spectral Outlier Detection. SIAM Conference on Data Mining, 2016.

In [112]:
data = pd.read_csv('./vowels.csv')

In [113]:
pd.pivot_table(data,
             values = 'Col1',
               index = 'y', 
              aggfunc = 'count')

Unnamed: 0_level_0,Col1
y,Unnamed: 1_level_1
0.0,1406
1.0,50


In [114]:
data.shape

(1456, 13)

In [115]:
data.head()

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10,Col11,Col12,y
0,0.580469,-0.902534,0.617899,-0.997942,-2.463799,-0.846455,2.349849,0.3754,-0.649334,1.604637,-0.62306,-0.383125,0.0
1,0.784375,-1.077366,0.615781,-0.921911,-2.388553,-0.638047,2.106684,0.361018,-0.714317,1.260236,-0.423339,-0.287791,0.0
2,0.791292,-1.086242,0.669773,-0.806112,-2.260781,-0.538491,2.053282,0.266492,-0.842815,1.081797,-0.267201,-0.172203,0.0
3,1.217306,-1.083425,0.855483,-0.724879,-2.155552,-0.101879,1.768597,0.303151,-1.04471,0.65529,0.214298,-0.34184,0.0
4,1.065352,-1.030178,0.773297,-0.452289,-1.955907,0.248205,1.530474,0.25374,-0.968961,-0.208287,0.331578,0.007288,0.0


### Deep Autoencoder

In [116]:
features = data.drop(columns = ['y'])
target = data['y']

In [117]:
# min max scale the input data
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [118]:
class AutoEncoder(Model):
    """Dense autoencoder for the vowels features.

    The encoder maps ``output_units`` inputs through Dense layers of width
    6 -> 3 -> 3; the decoder goes 3 -> 6 -> ``output_units`` and ends with a
    sigmoid activation. Dropout (rate 0.1) sits between the hidden layers.
    """

    def __init__(self, output_units):
        """Build the encoder and decoder stacks.

        Args:
            output_units: number of input features, which is also the width
                of the reconstructed output.
        """
        super().__init__()

        encoder_layers = [
            Dense(6, input_dim=output_units, activation='sigmoid'),
            Dropout(0.1),
            Dense(3, activation='relu'),
            Dropout(0.1),
            Dense(3, activation='relu'),
        ]
        self.encoder = Sequential(encoder_layers)

        decoder_layers = [
            Dense(3, activation='relu'),
            Dropout(0.1),
            Dense(6, activation='relu'),
            Dropout(0.1),
            Dense(output_units, activation='sigmoid'),
        ]
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Return the reconstruction of ``inputs`` (encode, then decode)."""
        return self.decoder(self.encoder(inputs))

In [119]:
def find_threshold(model, x_train_scaled):
    """Return the anomaly cut-off for ``model`` on ``x_train_scaled``.

    The cut-off is the mean plus one standard deviation of the per-instance
    MSLE between the model's reconstructions and the inputs.
    """
    rebuilt = model.predict(x_train_scaled)
    # one loss value per instance, pulled out of the tensor a single time
    instance_errors = tf.keras.losses.msle(rebuilt, x_train_scaled).numpy()
    return np.mean(instance_errors) + np.std(instance_errors)

def get_predictions(model, x_train_scaled, threshold):
    """Flag instances whose reconstruction error exceeds ``threshold``.

    Args:
        model: fitted autoencoder exposing ``predict``.
        x_train_scaled: scaled feature matrix to score.
        threshold: cut-off; errors strictly greater than it count as anomalies.

    Returns:
        ``(preds, errors)`` where ``preds`` is a pandas Series holding 1.0 for
        an anomaly and 0.0 for a normal instance, and ``errors`` is the tensor
        of per-instance MSLE values (usable as a continuous anomaly score).
    """
    predictions = model.predict(x_train_scaled)
    # provides losses of individual instances
    errors = tf.keras.losses.msle(predictions, x_train_scaled)
    # Convert to NumPy explicitly instead of relying on pandas to coerce a tensor.
    anomaly_mask = pd.Series(errors.numpy()) > threshold
    # 1 = anomaly, 0 = normal (vectorized cast instead of a per-element lambda)
    preds = anomaly_mask.astype(float)
    return preds, errors

In [120]:
# --- Training phase (timed) ---
# NOTE: time.process_time() counts CPU time of this process, not wall-clock time.
start = time.process_time()

# The network's input/output width is the number of scaled feature columns.
model = AutoEncoder(output_units=x_train_scaled.shape[1])

# Optimise mean squared logarithmic error; plain MSE is tracked as a metric only.
model.compile(loss='msle', metrics=['mse'], optimizer='adam')

# Inputs double as targets: the model is trained to reconstruct its own input.
history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=64,
)

end = time.process_time()
vowels_deep_autoencoders_train_time = end - start
print(end - start)

# --- Scoring phase (timed separately) ---
start = time.process_time()

# Threshold and predictions are both computed on the same data the model was fitted on.
threshold = find_threshold(model, x_train_scaled)
print(f"Threshold: {threshold}")

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
vowels_deep_autoencoders_test_time = end - start
print(end - start)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
2.046875
Threshold: 0.021549115668272844
0.3125


In [121]:
confusion_matrix(target, predictions)

array([[1203,  203],
       [  41,    9]], dtype=int64)

In [122]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
vowels_deep_autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

0.5009672830725462

In [123]:
vowels_deep_autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

              precision    recall  f1-score   support

           0       0.97      0.86      0.91      1406
           1       0.04      0.18      0.07        50

    accuracy                           0.83      1456
   macro avg       0.50      0.52      0.49      1456
weighted avg       0.94      0.83      0.88      1456



In [124]:
precision, recall, thresholds = precision_recall_curve(target, scores)
vowels_deep_ae_auc_precision_recall = metrics.auc(recall, precision)
print(vowels_deep_ae_auc_precision_recall)

0.03735112224710033


## Seismic

**Dataset source**: http://odds.cs.stonybrook.edu/seismic-dataset/ (data is transformed from .arff to .csv format)

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources:**

Saket Sathe and Charu C. Aggarwal. LODES: Local Density meets Spectral Outlier Detection. SIAM Conference on Data Mining, 2016.

In [125]:
data = pd.read_csv('./seismic.csv', sep = ',')

In [126]:
data = data.drop(columns = ['nbumps6','nbumps7','nbumps89'])

In [127]:
data.shape

(2584, 16)

In [128]:
drop_enc = OneHotEncoder(drop='first').fit_transform(data[['seismic','seismoacoustic','shift','ghazard']])

In [129]:
cat_var = pd.DataFrame(drop_enc.toarray())
cat_var.columns = ['seismic: b', 'seismoacoustic: b','seismoacoustic: c','shift: W','ghazard: b','ghazard: c']

In [130]:
data = pd.concat([data, cat_var], axis = 1)

In [131]:
data = data.drop(columns = ['seismic','seismoacoustic','shift','ghazard'])

In [132]:
data.head()

Unnamed: 0,genergy,gpuls,gdenergy,gdpuls,nbumps,nbumps2,nbumps3,nbumps4,nbumps5,energy,maxenergy,class,seismic: b,seismoacoustic: b,seismoacoustic: c,shift: W,ghazard: b,ghazard: c
0,15180,48,-72,-72,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,14720,33,-70,-79,1,0,1,0,0,2000,2000,0,0.0,0.0,0.0,0.0,0.0,0.0
2,8050,30,-81,-78,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,28820,171,-23,40,1,0,1,0,0,3000,3000,0,0.0,0.0,0.0,0.0,0.0,0.0
4,12640,57,-63,-52,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [133]:
pd.pivot_table(data,
             values = 'genergy',
               index = 'class', 
              aggfunc = 'count')

Unnamed: 0_level_0,genergy
class,Unnamed: 1_level_1
0,2414
1,170


### Deep Autoencoder

In [134]:
features = data.drop(columns = ['class'])
target = data['class']

In [135]:
# min max scale the input data
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [136]:
class AutoEncoder(Model):
    """Dense autoencoder for the seismic features.

    The encoder maps ``output_units`` inputs through Dense layers of width
    10 -> 5 -> 3; the decoder goes 5 -> 10 -> ``output_units`` and ends with
    a sigmoid activation. Dropout (rate 0.1) sits between the hidden layers.
    """

    def __init__(self, output_units):
        """Build the encoder and decoder stacks.

        Args:
            output_units: number of input features, which is also the width
                of the reconstructed output.
        """
        super().__init__()

        encoder_layers = [
            Dense(10, input_dim=output_units, activation='sigmoid'),
            Dropout(0.1),
            Dense(5, activation='relu'),
            Dropout(0.1),
            Dense(3, activation='relu'),
        ]
        self.encoder = Sequential(encoder_layers)

        decoder_layers = [
            Dense(5, activation='relu'),
            Dropout(0.1),
            Dense(10, activation='relu'),
            Dropout(0.1),
            Dense(output_units, activation='sigmoid'),
        ]
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Return the reconstruction of ``inputs`` (encode, then decode)."""
        return self.decoder(self.encoder(inputs))

In [137]:
def find_threshold(model, x_train_scaled):
    """Return the anomaly cut-off for ``model`` on ``x_train_scaled``.

    The cut-off is the mean plus one standard deviation of the per-instance
    MSLE between the model's reconstructions and the inputs.
    """
    rebuilt = model.predict(x_train_scaled)
    # one loss value per instance, pulled out of the tensor a single time
    instance_errors = tf.keras.losses.msle(rebuilt, x_train_scaled).numpy()
    return np.mean(instance_errors) + np.std(instance_errors)

def get_predictions(model, x_train_scaled, threshold):
    """Flag instances whose reconstruction error exceeds ``threshold``.

    Args:
        model: fitted autoencoder exposing ``predict``.
        x_train_scaled: scaled feature matrix to score.
        threshold: cut-off; errors strictly greater than it count as anomalies.

    Returns:
        ``(preds, errors)`` where ``preds`` is a pandas Series holding 1.0 for
        an anomaly and 0.0 for a normal instance, and ``errors`` is the tensor
        of per-instance MSLE values (usable as a continuous anomaly score).
    """
    predictions = model.predict(x_train_scaled)
    # provides losses of individual instances
    errors = tf.keras.losses.msle(predictions, x_train_scaled)
    # Convert to NumPy explicitly instead of relying on pandas to coerce a tensor.
    anomaly_mask = pd.Series(errors.numpy()) > threshold
    # 1 = anomaly, 0 = normal (vectorized cast instead of a per-element lambda)
    preds = anomaly_mask.astype(float)
    return preds, errors

In [138]:
# --- Training phase (timed) ---
# NOTE: time.process_time() counts CPU time of this process, not wall-clock time.
start = time.process_time()

# The network's input/output width is the number of scaled feature columns.
model = AutoEncoder(output_units=x_train_scaled.shape[1])

# Optimise mean squared logarithmic error; plain MSE is tracked as a metric only.
model.compile(loss='msle', metrics=['mse'], optimizer='adam')

# Inputs double as targets: the model is trained to reconstruct its own input.
history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=64,
)


end = time.process_time()
seismic_deep_autoencoders_train_time = end - start
print(end - start)

# --- Scoring phase (timed separately) ---
start = time.process_time()

# Threshold and predictions are both computed on the same data the model was fitted on.
threshold = find_threshold(model, x_train_scaled)
print(f"Threshold: {threshold}")

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
seismic_deep_autoencoders_test_time = end - start
print(end - start)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
3.640625
Threshold: 0.03599470813590553
0.515625


In [139]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
seismic_deep_autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

0.6340245626005165

In [140]:
seismic_deep_autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

              precision    recall  f1-score   support

           0       0.94      0.88      0.91      2414
           1       0.10      0.20      0.13       170

    accuracy                           0.83      2584
   macro avg       0.52      0.54      0.52      2584
weighted avg       0.88      0.83      0.86      2584



In [141]:
precision, recall, thresholds = precision_recall_curve(target, scores)
seismic_deep_ae_auc_precision_recall = metrics.auc(recall, precision)
print(seismic_deep_ae_auc_precision_recall)

0.09174293166073316


## Musk

**Dataset source**: http://odds.cs.stonybrook.edu/musk-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources:**

C. C. Aggarwal and S. Sathe, “Theoretical foundations and algorithms for outlier ensembles.” ACM SIGKDD Explorations Newsletter, vol. 17, no. 1, pp. 24–47, 2015.

In [142]:
data = pd.read_csv('./musk.csv', sep = ',')

In [143]:
data.head()

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10,...,Col158,Col159,Col160,Col161,Col162,Col163,Col164,Col165,Col166,y
0,46.0,-108.0,-60.0,-69.0,-117.0,49.0,38.0,-161.0,-8.0,5.0,...,-308.0,52.0,-7.0,39.0,126.0,156.0,-50.0,-112.0,96.0,1.0
1,41.0,-188.0,-145.0,22.0,-117.0,-6.0,57.0,-171.0,-39.0,-100.0,...,-59.0,-2.0,52.0,103.0,136.0,169.0,-61.0,-136.0,79.0,1.0
2,46.0,-194.0,-145.0,28.0,-117.0,73.0,57.0,-168.0,-39.0,-22.0,...,-134.0,-154.0,57.0,143.0,142.0,165.0,-67.0,-145.0,39.0,1.0
3,41.0,-188.0,-145.0,22.0,-117.0,-7.0,57.0,-170.0,-39.0,-99.0,...,-60.0,-4.0,52.0,104.0,136.0,168.0,-60.0,-135.0,80.0,1.0
4,41.0,-188.0,-145.0,22.0,-117.0,-7.0,57.0,-170.0,-39.0,-99.0,...,-60.0,-4.0,52.0,104.0,137.0,168.0,-60.0,-135.0,80.0,1.0


In [144]:
data['y'] = data['y'].astype(int)

In [145]:
data.shape

(3062, 167)

In [146]:
pd.pivot_table(data,
             values = 'Col1',
               index = 'y', 
              aggfunc = 'count')

Unnamed: 0_level_0,Col1
y,Unnamed: 1_level_1
0,2965
1,97


### Deep Autoencoder

In [147]:
features = data.drop(columns = ['y'])
target = data['y']

In [148]:
# min max scale the input data
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [149]:
class AutoEncoder(Model):
    """Dense autoencoder for the musk features.

    The encoder maps ``output_units`` inputs through Dense layers of width
    83 -> 41 -> 20; the decoder goes 41 -> 83 -> ``output_units`` and ends
    with a sigmoid activation. Dropout (rate 0.1) sits between the hidden
    layers.
    """

    def __init__(self, output_units):
        """Build the encoder and decoder stacks.

        Args:
            output_units: number of input features, which is also the width
                of the reconstructed output.
        """
        super().__init__()

        encoder_layers = [
            Dense(83, input_dim=output_units, activation='sigmoid'),
            Dropout(0.1),
            Dense(41, activation='relu'),
            Dropout(0.1),
            Dense(20, activation='relu'),
        ]
        self.encoder = Sequential(encoder_layers)

        decoder_layers = [
            Dense(41, activation='relu'),
            Dropout(0.1),
            Dense(83, activation='relu'),
            Dropout(0.1),
            Dense(output_units, activation='sigmoid'),
        ]
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Return the reconstruction of ``inputs`` (encode, then decode)."""
        return self.decoder(self.encoder(inputs))

In [150]:
def find_threshold(model, x_train_scaled):
    """Return the anomaly cut-off for ``model`` on ``x_train_scaled``.

    The cut-off is the mean plus one standard deviation of the per-instance
    MSLE between the model's reconstructions and the inputs.
    """
    rebuilt = model.predict(x_train_scaled)
    # one loss value per instance, pulled out of the tensor a single time
    instance_errors = tf.keras.losses.msle(rebuilt, x_train_scaled).numpy()
    return np.mean(instance_errors) + np.std(instance_errors)

def get_predictions(model, x_train_scaled, threshold):
    """Flag instances whose reconstruction error exceeds ``threshold``.

    Args:
        model: fitted autoencoder exposing ``predict``.
        x_train_scaled: scaled feature matrix to score.
        threshold: cut-off; errors strictly greater than it count as anomalies.

    Returns:
        ``(preds, errors)`` where ``preds`` is a pandas Series holding 1.0 for
        an anomaly and 0.0 for a normal instance, and ``errors`` is the tensor
        of per-instance MSLE values (usable as a continuous anomaly score).
    """
    predictions = model.predict(x_train_scaled)
    # provides losses of individual instances
    errors = tf.keras.losses.msle(predictions, x_train_scaled)
    # Convert to NumPy explicitly instead of relying on pandas to coerce a tensor.
    anomaly_mask = pd.Series(errors.numpy()) > threshold
    # 1 = anomaly, 0 = normal (vectorized cast instead of a per-element lambda)
    preds = anomaly_mask.astype(float)
    return preds, errors

In [151]:
# --- Training phase (timed) ---
# NOTE: time.process_time() counts CPU time of this process, not wall-clock time.
start = time.process_time()

# The network's input/output width is the number of scaled feature columns.
model = AutoEncoder(output_units=x_train_scaled.shape[1])

# Optimise mean squared logarithmic error; plain MSE is tracked as a metric only.
model.compile(loss='msle', metrics=['mse'], optimizer='adam')

# Inputs double as targets: the model is trained to reconstruct its own input.
history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=64,
)

end = time.process_time()
musk_deep_autoencoders_train_time = end - start
print(end - start)

# --- Scoring phase (timed separately) ---
start = time.process_time()

# Threshold and predictions are both computed on the same data the model was fitted on.
threshold = find_threshold(model, x_train_scaled)
print(f"Threshold: {threshold}")

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
musk_deep_autoencoders_test_time = end - start
print(end - start)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
6.609375
Threshold: 0.01072259934954277
0.65625


In [152]:
confusion_matrix(target, predictions)

array([[2491,  474],
       [  79,   18]], dtype=int64)

In [153]:
fpr, tpr, _ = metrics.roc_curve(target, scores)
musk_deep_autoencoders_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

0.603184923766972

In [154]:
musk_deep_autoencoders_report = classification_report(target, predictions, target_names = ['0','1'], output_dict = True)
print(classification_report(target, predictions, target_names = ['0','1']))

              precision    recall  f1-score   support

           0       0.97      0.84      0.90      2965
           1       0.04      0.19      0.06        97

    accuracy                           0.82      3062
   macro avg       0.50      0.51      0.48      3062
weighted avg       0.94      0.82      0.87      3062



In [155]:
precision, recall, thresholds = precision_recall_curve(target, scores)
musk_deep_ae_auc_precision_recall = metrics.auc(recall, precision)
print(musk_deep_ae_auc_precision_recall)

0.07009372144034698


## Bank

**Dataset source**: https://github.com/GuansongPang/ADRepository-Anomaly-detection-datasets/tree/main/categorical%20data

Pang, G., Shen, C., Cao, L., & Hengel, A. V. D. (2021). Deep learning for anomaly detection: A review. ACM Computing Surveys (CSUR), 54(2), 1-38.

In [156]:
data = pd.read_csv('./bank.csv')

In [157]:
data.head()

Unnamed: 0,age,job=housemaid,job=services,job=admin.,job=blue-collar,job=technician,job=retired,job=management,job=unemployed,job=self-employed,...,previous,poutcome=nonexistent,poutcome=failure,poutcome=success,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,class
0,0.209877,0,0,0,0,0,0,0,0,0,...,0.0,1,0,0,1.0,0.882307,0.376569,0.98073,1.0,0
1,0.296296,0,0,1,0,0,0,0,0,0,...,0.0,1,0,0,1.0,0.484412,0.615063,0.981183,1.0,0
2,0.246914,1,0,0,0,0,0,0,0,0,...,0.0,1,0,0,0.9375,0.698753,0.60251,0.957379,0.859735,0
3,0.160494,0,1,0,0,0,0,0,0,0,...,0.142857,0,1,0,0.333333,0.26968,0.192469,0.150759,0.512287,0
4,0.530864,0,0,0,1,0,0,0,0,0,...,0.0,1,0,0,0.333333,0.340608,0.154812,0.17479,0.512287,1


In [158]:
pd.pivot_table(data,
             values = 'age',
               index = 'class', 
              aggfunc = 'count')

Unnamed: 0_level_0,age
class,Unnamed: 1_level_1
0,36548
1,4640


### Deep Autoencoder

In [159]:
features = data.drop(columns = ['class'])
target = data['class']

In [160]:
# min max scale the input data
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = min_max_scaler.fit_transform(features.copy())

In [161]:
class AutoEncoder(Model):
    """Dense autoencoder for the bank features.

    The encoder maps ``output_units`` inputs through Dense layers of width
    31 -> 15 -> 7; the decoder goes 15 -> 31 -> ``output_units`` and ends
    with a sigmoid activation. Dropout (rate 0.1) sits between the hidden
    layers.
    """

    def __init__(self, output_units):
        """Build the encoder and decoder stacks.

        Args:
            output_units: number of input features, which is also the width
                of the reconstructed output.
        """
        super().__init__()

        encoder_layers = [
            Dense(31, input_dim=output_units, activation='sigmoid'),
            Dropout(0.1),
            Dense(15, activation='relu'),
            Dropout(0.1),
            Dense(7, activation='relu'),
        ]
        self.encoder = Sequential(encoder_layers)

        decoder_layers = [
            Dense(15, activation='relu'),
            Dropout(0.1),
            Dense(31, activation='relu'),
            Dropout(0.1),
            Dense(output_units, activation='sigmoid'),
        ]
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Return the reconstruction of ``inputs`` (encode, then decode)."""
        return self.decoder(self.encoder(inputs))

In [162]:
def find_threshold(model, x_train_scaled):
    """Return the anomaly cut-off for ``model`` on ``x_train_scaled``.

    The cut-off is the mean plus one standard deviation of the per-instance
    MSLE between the model's reconstructions and the inputs.
    """
    rebuilt = model.predict(x_train_scaled)
    # one loss value per instance, pulled out of the tensor a single time
    instance_errors = tf.keras.losses.msle(rebuilt, x_train_scaled).numpy()
    return np.mean(instance_errors) + np.std(instance_errors)

def get_predictions(model, x_train_scaled, threshold):
    """Label each instance as anomalous or normal from its reconstruction error.

    Parameters
    ----------
    model
        Object exposing ``predict``; its output is compared with the input.
    x_train_scaled
        Data passed to ``model.predict`` and used as the comparison target.
    threshold : float
        Instances whose error is strictly greater than this value are flagged.

    Returns
    -------
    preds : pandas.Series of float
        1.0 = anomaly, 0.0 = normal, one entry per instance.
    errors
        The per-instance losses exactly as returned by
        ``tf.keras.losses.msle`` (usable as anomaly scores).
    """
    predictions = model.predict(x_train_scaled)
    # provides losses of individual instances
    # NOTE(review): Keras documents msle as (y_true, y_pred); the reconstruction
    # is passed first here. The argument order is kept as-is - confirm intent.
    errors = tf.keras.losses.msle(predictions, x_train_scaled)
    # Convert to a plain ndarray before building the Series so the comparison
    # runs on numeric values instead of relying on how pandas unpacks a tensor.
    anomaly_mask = pd.Series(np.asarray(errors)) > threshold
    # 1 = anomaly, 0 = normal (vectorised bool -> float conversion)
    preds = anomaly_mask.astype(float)
    return preds, errors

In [163]:
# ---- Training phase (timed with process CPU time) ----
# NOTE(review): no random seed is set in this cell, so repeated runs may give
# different weights and metrics unless one is fixed elsewhere.
start = time.process_time()

model = AutoEncoder(output_units=x_train_scaled.shape[1])
model.compile(optimizer='adam', loss='msle', metrics=['mse'])

# The scaled inputs serve as both the input and the reconstruction target
history = model.fit(
    x=x_train_scaled,
    y=x_train_scaled,
    batch_size=512,
    epochs=20,
)

end = time.process_time()
bank_deep_autoencoders_train_time = end - start
print(bank_deep_autoencoders_train_time)

# ---- Scoring phase: derive the threshold, then label every instance ----
start = time.process_time()

threshold = find_threshold(model, x_train_scaled)
print(f"Threshold: {threshold}")

predictions, scores = get_predictions(model, x_train_scaled, threshold)

end = time.process_time()
bank_deep_autoencoders_test_time = end - start
print(bank_deep_autoencoders_test_time)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
15.34375
Threshold: 0.039915910213057765
4.84375


In [164]:
# Area under the ROC curve, using the reconstruction errors as anomaly scores
fpr, tpr, _ = metrics.roc_curve(target, scores)
bank_deep_autoencoders_auc = metrics.auc(fpr, tpr)
bank_deep_autoencoders_auc

0.6505762261626656

In [165]:
# Human-readable report for the notebook output ...
print(classification_report(target, predictions, target_names=['0', '1']))
# ... and the same report as a nested dict, kept for the summary tables
bank_deep_autoencoders_report = classification_report(
    target,
    predictions,
    target_names=['0', '1'],
    output_dict=True,
)

              precision    recall  f1-score   support

           0       0.91      0.87      0.89     36548
           1       0.23      0.30      0.26      4640

    accuracy                           0.81     41188
   macro avg       0.57      0.59      0.58     41188
weighted avg       0.83      0.81      0.82     41188



In [166]:
# Area under the precision-recall curve (recall on the x axis, precision on y)
precision, recall, thresholds = precision_recall_curve(target, scores)
bank_deep_ae_auc_precision_recall = metrics.auc(x=recall, y=precision)
print(bank_deep_ae_auc_precision_recall)

0.20505699949271072


## Performance

In [167]:
# Empty frame holding only the metric column labels.
# NOTE(review): `performance` is not referenced by the summary cells visible
# below, and its labels ('recall', 'Inference time') differ from the ones used
# there ('Recall', 'Testing time') - confirm whether it is still needed.
performance = pd.DataFrame(
    columns=[
        'F1 score',
        'recall',
        'precision',
        'AUC',
        'AUPRC',
        'Training time',
        'Inference time',
        'Total time',
    ]
)

In [168]:
# F1 score of the class labelled '1', collected from each dataset's report dict
f1_score_deep_ae = {
    dataset: report['1']['f1-score']
    for dataset, report in [
        ('arrhythmia', arrhythmia_deep_autoencoders_report),
        ('cardio', cardio_deep_autoencoders_report),
        ('forestcover', forestcover_deep_autoencoders_report),
        ('annthyroid', annthyroid_deep_autoencoders_report),
        ('creditcard', creditcard_deep_autoencoders_report),
        ('mammography', mammography_deep_autoencoders_report),
        ('shuttle', shuttle_deep_autoencoders_report),
        ('mnist', mnist_deep_autoencoders_report),
        ('vowels', vowels_deep_autoencoders_report),
        ('seismic', seismic_deep_autoencoders_report),
        ('musk', musk_deep_autoencoders_report),
        ('bank', bank_deep_autoencoders_report),
    ]
}
# One row per dataset; reset_index() moves the dataset name into an 'index' column
f1_score_deep_ae_df = pd.DataFrame.from_dict(
    f1_score_deep_ae,
    orient='index',
    columns=['F1 score'],
).reset_index()

In [169]:
# Recall of the class labelled '1', collected from each dataset's report dict
recall_deep_ae = {
    dataset: report['1']['recall']
    for dataset, report in [
        ('arrhythmia', arrhythmia_deep_autoencoders_report),
        ('cardio', cardio_deep_autoencoders_report),
        ('forestcover', forestcover_deep_autoencoders_report),
        ('annthyroid', annthyroid_deep_autoencoders_report),
        ('creditcard', creditcard_deep_autoencoders_report),
        ('mammography', mammography_deep_autoencoders_report),
        ('shuttle', shuttle_deep_autoencoders_report),
        ('mnist', mnist_deep_autoencoders_report),
        ('vowels', vowels_deep_autoencoders_report),
        ('seismic', seismic_deep_autoencoders_report),
        ('musk', musk_deep_autoencoders_report),
        ('bank', bank_deep_autoencoders_report),
    ]
}
# One row per dataset; reset_index() moves the dataset name into an 'index' column
recall_deep_ae_df = pd.DataFrame.from_dict(
    recall_deep_ae,
    orient='index',
    columns=['Recall'],
).reset_index()

In [170]:
# Precision of the class labelled '1', collected from each dataset's report dict
precision_deep_ae = {
    dataset: report['1']['precision']
    for dataset, report in [
        ('arrhythmia', arrhythmia_deep_autoencoders_report),
        ('cardio', cardio_deep_autoencoders_report),
        ('forestcover', forestcover_deep_autoencoders_report),
        ('annthyroid', annthyroid_deep_autoencoders_report),
        ('creditcard', creditcard_deep_autoencoders_report),
        ('mammography', mammography_deep_autoencoders_report),
        ('shuttle', shuttle_deep_autoencoders_report),
        ('mnist', mnist_deep_autoencoders_report),
        ('vowels', vowels_deep_autoencoders_report),
        ('seismic', seismic_deep_autoencoders_report),
        ('musk', musk_deep_autoencoders_report),
        ('bank', bank_deep_autoencoders_report),
    ]
}
# One row per dataset; reset_index() moves the dataset name into an 'index' column
precision_deep_ae_df = pd.DataFrame.from_dict(
    precision_deep_ae,
    orient='index',
    columns=['Precision'],
).reset_index()

In [171]:
# ROC AUC recorded for each dataset, keyed by dataset name
auc_deep_ae = dict(
    arrhythmia=arrhythmia_deep_autoencoders_auc,
    cardio=cardio_deep_autoencoders_auc,
    forestcover=forestcover_deep_autoencoders_auc,
    annthyroid=annthyroid_deep_autoencoders_auc,
    creditcard=creditcard_deep_autoencoders_auc,
    mammography=mammography_deep_autoencoders_auc,
    shuttle=shuttle_deep_autoencoders_auc,
    mnist=mnist_deep_autoencoders_auc,
    vowels=vowels_deep_autoencoders_auc,
    seismic=seismic_deep_autoencoders_auc,
    musk=musk_deep_autoencoders_auc,
    bank=bank_deep_autoencoders_auc,
)
# One row per dataset; reset_index() moves the dataset name into an 'index' column
auc_deep_ae_df = pd.DataFrame.from_dict(
    auc_deep_ae,
    orient='index',
    columns=['AUC'],
).reset_index()

In [172]:
# Area under the precision-recall curve recorded for each dataset
auprc_deep_ae = dict(
    arrhythmia=arrhythmia_deep_ae_auc_precision_recall,
    cardio=cardio_deep_ae_auc_precision_recall,
    forestcover=forestcover_deep_ae_auc_precision_recall,
    annthyroid=annthyroid_deep_ae_auc_precision_recall,
    creditcard=creditcard_deep_ae_auc_precision_recall,
    mammography=mammography_deep_ae_auc_precision_recall,
    shuttle=shuttle_deep_ae_auc_precision_recall,
    mnist=mnist_deep_ae_auc_precision_recall,
    vowels=vowels_deep_ae_auc_precision_recall,
    seismic=seismic_deep_ae_auc_precision_recall,
    musk=musk_deep_ae_auc_precision_recall,
    bank=bank_deep_ae_auc_precision_recall,
)
# One row per dataset; reset_index() moves the dataset name into an 'index' column
auprc_deep_ae_df = pd.DataFrame.from_dict(
    auprc_deep_ae,
    orient='index',
    columns=['AUPRC'],
).reset_index()

In [173]:
# Training time measured for each dataset, keyed by dataset name
training_time_deep_ae = dict(
    arrhythmia=arrhythmia_deep_autoencoders_train_time,
    cardio=cardio_deep_autoencoders_train_time,
    forestcover=forestcover_deep_autoencoders_train_time,
    annthyroid=annthyroid_deep_autoencoders_train_time,
    creditcard=creditcard_deep_autoencoders_train_time,
    mammography=mammography_deep_autoencoders_train_time,
    shuttle=shuttle_deep_autoencoders_train_time,
    mnist=mnist_deep_autoencoders_train_time,
    vowels=vowels_deep_autoencoders_train_time,
    seismic=seismic_deep_autoencoders_train_time,
    musk=musk_deep_autoencoders_train_time,
    bank=bank_deep_autoencoders_train_time,
)
# One row per dataset; reset_index() moves the dataset name into an 'index' column
training_time_deep_ae_df = pd.DataFrame.from_dict(
    training_time_deep_ae,
    orient='index',
    columns=['Training time'],
).reset_index()

In [174]:
# Threshold-and-prediction time measured for each dataset, keyed by dataset name
test_time_deep_ae = dict(
    arrhythmia=arrhythmia_deep_autoencoders_test_time,
    cardio=cardio_deep_autoencoders_test_time,
    forestcover=forestcover_deep_autoencoders_test_time,
    annthyroid=annthyroid_deep_autoencoders_test_time,
    creditcard=creditcard_deep_autoencoders_test_time,
    mammography=mammography_deep_autoencoders_test_time,
    shuttle=shuttle_deep_autoencoders_test_time,
    mnist=mnist_deep_autoencoders_test_time,
    vowels=vowels_deep_autoencoders_test_time,
    seismic=seismic_deep_autoencoders_test_time,
    musk=musk_deep_autoencoders_test_time,
    bank=bank_deep_autoencoders_test_time,
)
# One row per dataset; reset_index() moves the dataset name into an 'index' column
test_time_deep_ae_df = pd.DataFrame.from_dict(
    test_time_deep_ae,
    orient='index',
    columns=['Testing time'],
).reset_index()

In [175]:
# Total time per dataset = training time + testing time
total_time_deep_ae = {
    dataset: train_time + test_time
    for dataset, train_time, test_time in [
        ('arrhythmia', arrhythmia_deep_autoencoders_train_time, arrhythmia_deep_autoencoders_test_time),
        ('cardio', cardio_deep_autoencoders_train_time, cardio_deep_autoencoders_test_time),
        ('forestcover', forestcover_deep_autoencoders_train_time, forestcover_deep_autoencoders_test_time),
        ('annthyroid', annthyroid_deep_autoencoders_train_time, annthyroid_deep_autoencoders_test_time),
        ('creditcard', creditcard_deep_autoencoders_train_time, creditcard_deep_autoencoders_test_time),
        ('mammography', mammography_deep_autoencoders_train_time, mammography_deep_autoencoders_test_time),
        ('shuttle', shuttle_deep_autoencoders_train_time, shuttle_deep_autoencoders_test_time),
        ('mnist', mnist_deep_autoencoders_train_time, mnist_deep_autoencoders_test_time),
        ('vowels', vowels_deep_autoencoders_train_time, vowels_deep_autoencoders_test_time),
        ('seismic', seismic_deep_autoencoders_train_time, seismic_deep_autoencoders_test_time),
        ('musk', musk_deep_autoencoders_train_time, musk_deep_autoencoders_test_time),
        ('bank', bank_deep_autoencoders_train_time, bank_deep_autoencoders_test_time),
    ]
}
# One row per dataset; reset_index() moves the dataset name into an 'index' column
total_time_deep_ae_df = pd.DataFrame.from_dict(
    total_time_deep_ae,
    orient='index',
    columns=['Total time'],
).reset_index()

In [176]:
# Combine the per-metric frames into a single summary table.
# No `on=` is given, so each inner join uses the columns the two frames share.
(
    f1_score_deep_ae_df
    .merge(recall_deep_ae_df, how='inner')
    .merge(precision_deep_ae_df, how='inner')
    .merge(auc_deep_ae_df, how='inner')
    .merge(auprc_deep_ae_df, how='inner')
    .merge(training_time_deep_ae_df, how='inner')
    .merge(test_time_deep_ae_df, how='inner')
    .merge(total_time_deep_ae_df, how='inner')
)

Unnamed: 0,index,F1 score,Recall,Precision,AUC,AUPRC,Training time,Testing time,Total time
0,arrhythmia,0.478632,0.424242,0.54902,0.80252,0.465256,5.640625,0.625,6.265625
1,cardio,0.611765,0.738636,0.522088,0.945976,0.671471,10.3125,1.25,11.5625
2,forestcover,0.099434,0.814707,0.052948,0.916116,0.092932,138.546875,73.125,211.671875
3,annthyroid,0.236861,0.316479,0.18925,0.681848,0.198044,1.890625,0.8125,2.703125
4,creditcard,0.064741,0.847561,0.033656,0.951638,0.408157,50.9375,26.21875,77.15625
5,mammography,0.240275,0.807692,0.141129,0.881576,0.185566,1.828125,1.171875,3.0
6,shuttle,0.503314,0.692111,0.395443,0.947004,0.550463,7.546875,5.40625,12.953125
7,mnist,0.311181,0.421429,0.246656,0.769351,0.263978,4.09375,1.171875,5.265625
8,vowels,0.068702,0.18,0.042453,0.500967,0.037351,2.046875,0.3125,2.359375
9,seismic,0.134653,0.2,0.101493,0.634025,0.091743,3.640625,0.515625,4.15625
