In [None]:
import pandas as pd
import time
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn import metrics
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import DBSCAN
from scipy.spatial import distance
from mlxtend.evaluate import mcnemar
from mlxtend.evaluate import mcnemar_table
import array
import matplotlib.pyplot as plt
from scipy.stats import wilcoxon, friedmanchisquare, rankdata
import seaborn as sb

In [None]:
#Prediction translation
def def_outlier(df):
    """Translate a detector label into a binary outlier flag.

    Parameters
    ----------
    df : mapping-like (e.g. a DataFrame row)
        Must expose a ``'y_pred'`` entry holding the detector's label.

    Returns
    -------
    int
        1 when ``df['y_pred']`` equals -1, otherwise 0.
    """
    return int(df['y_pred'] in (-1,))

In [None]:
# k-distances graph
def dbscan_tuner(data,dim):
    """Plot the sorted k-distance curve of ``data`` and return the underlying table.

    The label columns ('y', 'Outlier', 'Class', 'Unusual', 'class') are
    excluded, pairwise distances between all remaining rows are computed, and
    for every observation the value at position ``dim`` of its ascending
    distance list is taken. Position 0 of each list is the observation's zero
    distance to itself. The curve is presumably used to choose DBSCAN's
    ``eps`` (confirm against the caller).

    Parameters
    ----------
    data : pandas.DataFrame
        Observations; any of the label columns listed above are ignored.
    dim : int
        Position in each row's sorted distance list to plot (the "k").

    Returns
    -------
    pandas.DataFrame
        Feature columns plus ``'dist'`` (the k-distance) and ``'i'`` (1-based
        rank after sorting by ``'dist'``), sorted by ``'dist'`` ascending.

    Notes
    -----
    The full pairwise distance matrix is materialised, so memory grows
    quadratically with the number of rows.
    """
    label_cols = ['y', 'Outlier', 'Class', 'Unusual', 'class']
    # Explicit copy: the columns added below must not be written into a slice
    # of `data` (avoids pandas' SettingWithCopyWarning).
    features = data.loc[:, ~data.columns.isin(label_cols)].copy()

    points = features.to_numpy()
    pairwise = distance.cdist(points, points)

    # Row-wise ascending sort, then pick column `dim`; empty input keeps an
    # empty result instead of failing on the column lookup.
    if len(pairwise):
        k_dist = np.sort(pairwise, axis=1)[:, dim]
    else:
        k_dist = np.empty(0)

    features['dist'] = k_dist
    features = features.sort_values(by='dist')
    # Positional 1..n rank along the sorted curve (x axis of the plot).
    features['i'] = np.arange(1, len(data) + 1)

    plt.plot(features['i'], features['dist'])
    plt.xlabel("Observation index")
    plt.ylabel("k-distance")

    return features

# Datasets

In [None]:
arrhythmia_data = pd.read_csv('./arrhythmia.csv', sep = ',')

In [None]:
# dropping columns that consist only of 0's
arrhythmia_data = arrhythmia_data.drop(columns = ['Col15', 'Col63', 'Col65', 'Col79', 'Col127', 'Col128','Col135', 'Col137', 'Col139','Col141',
'Col147', 'Col152', 'Col153','Col160','Col200', 'Col260', 'Col270'])

In [None]:
arrhythmia_data.head()

In [None]:
cardiocotography_data = pd.read_csv('./Cardiotocography.csv')

In [None]:
cardiocotography_data['y'] = cardiocotography_data['y'].astype(int)

In [None]:
cardiocotography_data.head()

In [None]:
forestcover_data = pd.read_csv('./ForestCover.csv')

In [None]:
forestcover_data.head()

In [None]:
annthyroid_data = pd.read_csv('./annthyroid.csv')

In [None]:
annthyroid_data.head()

In [None]:
creditcard_data = pd.read_csv('./creditcard.csv')

In [None]:
creditcard_data = creditcard_data.drop(columns = ['Time'])

In [None]:
creditcard_data.head()

In [None]:
mammography_data = pd.read_csv('./mammography.csv')

In [None]:
mammography_data.head()

In [None]:
shuttle_data = pd.read_csv('./shuttle.csv', sep = ',')

In [None]:
shuttle_data.head()

In [None]:
mnist_data = pd.read_csv('./mnist.csv')

In [None]:
mnist_data = mnist_data.drop(columns = ['Col1','Col4', 'Col7', 'Col22', 'Col27', 'Col29', 'Col38', 'Col41', 'Col51', 'Col53', 'Col54', 'Col61', 'Col62', 'Col71', 'Col73', 'Col79', 'Col87', 'Col88', 'Col89', 'Col90',
'Col92', 'Col100'])

In [None]:
mnist_data.head()

In [None]:
vowels_data = pd.read_csv('./vowels.csv')

In [None]:
vowels_data.head()

In [None]:
seismic_data = pd.read_csv('./seismic.csv', sep = ',')

In [None]:
seismic_data = seismic_data.drop(columns = ['nbumps6','nbumps7','nbumps89'])

In [None]:
dummies = pd.get_dummies(seismic_data[['seismic','seismoacoustic','shift','ghazard']])
seismic_data = pd.concat([seismic_data, dummies], axis = 1)
seismic_data = seismic_data.drop(columns = ['seismic','seismoacoustic','shift','ghazard'])

In [None]:
seismic_data.head()

In [None]:
musk_data = pd.read_csv('./musk.csv', sep = ',')

In [None]:
musk_data['y'] = musk_data['y'].astype(int)

In [None]:
musk_data.head()

In [None]:
bank_data = pd.read_csv('./bank.csv')

In [None]:
bank_data.head()

# Isolation Forest

## Arrhythmia

In [None]:
train_data = arrhythmia_data.copy()

In [None]:
start = time.process_time()
rng = np.random.RandomState(42)
clf = IsolationForest(max_samples = 256, random_state=rng, n_estimators = 100)
clf.fit(train_data.loc[:, train_data.columns != 'y'])
y_pred = clf.predict(train_data.loc[:, train_data.columns != 'y'])

y_scores = clf.score_samples(train_data.loc[:, train_data.columns != 'y'])
end = time.process_time()
arrhythmia_iforest_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
arrhythmia_iforest_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
arrhythmia_iforest_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
arrhythmia_iforest_auc_precision_recall = metrics.auc(recall, precision)
print(arrhythmia_iforest_auc_precision_recall)

In [None]:
arrhythmia_iforest_y = train_data['prediction']
arrhythmia_y_true = train_data['y']

## Cardiotocography

In [None]:
train_data = cardiocotography_data.copy()

In [None]:
start = time.process_time()
rng = np.random.RandomState(42)
clf = IsolationForest(max_samples=256, random_state=rng, n_estimators=100)
clf.fit(train_data.loc[:, train_data.columns != 'y'])
y_pred = clf.predict(train_data.loc[:, train_data.columns != 'y'])
y_scores = clf.score_samples(train_data.loc[:, train_data.columns != 'y'])
end = time.process_time()
cardiocotography_iforest_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
cardiocotography_iforest_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
cardiocotography_iforest_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
cardiocotography_iforest_auc_precision_recall = metrics.auc(recall, precision)
print(cardiocotography_iforest_auc_precision_recall)

In [None]:
cardio_iforest_y = train_data['prediction']
cardio_y_true = train_data['y']

## ForestCover

In [None]:
train_data = forestcover_data.copy()

In [None]:
start = time.process_time()
rng = np.random.RandomState(42)
clf = IsolationForest(max_samples=256, random_state=rng, n_estimators=100)
clf.fit(train_data.loc[:, train_data.columns != 'y'])
y_pred = clf.predict(train_data.loc[:, train_data.columns != 'y'])
y_scores = clf.score_samples(train_data.loc[:, train_data.columns != 'y'])
end = time.process_time()
forestcover_iforest_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores 

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
forestcover_iforest_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
forestcover_iforest_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
forestcover_iforest_auc_precision_recall = metrics.auc(recall, precision)
print(forestcover_iforest_auc_precision_recall)

In [None]:
forestcover_iforest_y = train_data['prediction']
forestcover_y_true = train_data['y']

## Annthyroid

In [None]:
train_data = annthyroid_data.copy()

In [None]:
start = time.process_time()

rng = np.random.RandomState(42)
clf = IsolationForest(max_samples=256, random_state=rng, n_estimators=100)
clf.fit(train_data.loc[:, train_data.columns != 'y'])
y_pred = clf.predict(train_data.loc[:, train_data.columns != 'y'])
y_scores = clf.score_samples(train_data.loc[:, train_data.columns != 'y'])

end = time.process_time()
annthyroid_iforest_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
annthyroid_iforest_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
annthyroid_iforest_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
annthyroid_iforest_auc_precision_recall = metrics.auc(recall, precision)
print(annthyroid_iforest_auc_precision_recall)

In [None]:
annthyroid_iforest_y = train_data['prediction']
annthyroid_y_true = train_data['y']

## Credit card

In [None]:
train_data = creditcard_data.copy()

In [None]:
start = time.process_time()

rng = np.random.RandomState(42)
clf = IsolationForest(max_samples=256, random_state=rng, n_estimators=100)
clf.fit(train_data.loc[:, train_data.columns != 'Class'])
y_pred = clf.predict(train_data.loc[:, train_data.columns != 'Class'])
y_scores = clf.score_samples(train_data.loc[:, train_data.columns != 'Class'])

end = time.process_time()
creditcard_iforest_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['Class'], train_data['y_scores'])
creditcard_iforest_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
creditcard_iforest_report = classification_report(train_data['Class'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['Class'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['Class'], train_data['y_scores'])
creditcard_iforest_auc_precision_recall = metrics.auc(recall, precision)
print(creditcard_iforest_auc_precision_recall)

In [None]:
creditcard_iforest_y = train_data['prediction']
creditcard_y_true = train_data['Class']

## Mammography

In [None]:
train_data = mammography_data.copy()

In [None]:
start = time.process_time()

rng = np.random.RandomState(42)
clf = IsolationForest(max_samples=256, random_state=rng, n_estimators=100)
clf.fit(train_data.loc[:, train_data.columns != 'y'])
y_pred = clf.predict(train_data.loc[:, train_data.columns != 'y'])
y_scores = clf.score_samples(train_data.loc[:, train_data.columns != 'y'])

end = time.process_time()
mammography_iforest_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
mammography_iforest_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
mammography_iforest_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
mammography_iforest_auc_precision_recall = metrics.auc(recall, precision)
print(mammography_iforest_auc_precision_recall)

In [None]:
mammography_iforest_y = train_data['prediction']
mammography_y_true = train_data['y']

## Shuttle

In [None]:
train_data = shuttle_data.copy()

In [None]:
start = time.process_time()

rng = np.random.RandomState(42)
clf = IsolationForest(max_samples=256, random_state=rng, n_estimators = 100)
clf.fit(train_data.loc[:, train_data.columns != 'y'])
y_pred = clf.predict(train_data.loc[:, train_data.columns != 'y'])
y_scores = clf.score_samples(train_data.loc[:, train_data.columns != 'y'])

end = time.process_time()
shuttle_iforest_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
shuttle_iforest_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
shuttle_iforest_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
shuttle_iforest_auc_precision_recall = metrics.auc(recall, precision)
print(shuttle_iforest_auc_precision_recall)

In [None]:
shuttle_iforest_y = train_data['prediction']
shuttle_y_true = train_data['y']

## Mnist

In [None]:
train_data = mnist_data.copy()

In [None]:
start = time.process_time()

rng = np.random.RandomState(42)
clf = IsolationForest(max_samples=256, random_state=rng, n_estimators = 100)
clf.fit(train_data.loc[:, train_data.columns != 'y'])
y_pred = clf.predict(train_data.loc[:, train_data.columns != 'y'])
y_scores = clf.score_samples(train_data.loc[:, train_data.columns != 'y'])

end = time.process_time()
mnist_iforest_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
mnist_iforest_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
mnist_iforest_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
mnist_iforest_auc_precision_recall = metrics.auc(recall, precision)
print(mnist_iforest_auc_precision_recall)

In [None]:
mnist_iforest_y = train_data['prediction']
mnist_y_true = train_data['y']

## Vowels

In [None]:
train_data = vowels_data.copy()

In [None]:
start = time.process_time()

rng = np.random.RandomState(42)
clf = IsolationForest(max_samples=256, random_state=rng, n_estimators = 100)
clf.fit(train_data.loc[:, train_data.columns != 'y'])
y_pred = clf.predict(train_data.loc[:, train_data.columns != 'y'])
y_scores = clf.score_samples(train_data.loc[:, train_data.columns != 'y'])

end = time.process_time()
vowels_iforest_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
vowels_iforest_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
vowels_iforest_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
vowels_iforest_auc_precision_recall = metrics.auc(recall, precision)
print(vowels_iforest_auc_precision_recall)

In [None]:
vowels_iforest_y = train_data['prediction']
vowels_y_true = train_data['y']

## Seismic

In [None]:
train_data = seismic_data.copy()

In [None]:
start = time.process_time()
rng = np.random.RandomState(42)
clf = IsolationForest(max_samples = 256, random_state=rng, n_estimators = 100)
clf.fit(train_data.loc[:, train_data.columns != 'class'])
y_pred = clf.predict(train_data.loc[:, train_data.columns != 'class'])
y_scores = clf.score_samples(train_data.loc[:, train_data.columns != 'class'])
end = time.process_time()
seismic_iforest_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['class'], train_data['y_scores'])
seismic_iforest_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
seismic_iforest_report = classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['class'], train_data['y_scores'])
seismic_iforest_auc_precision_recall = metrics.auc(recall, precision)
print(seismic_iforest_auc_precision_recall)

In [None]:
seismic_iforest_y = train_data['prediction']
seismic_y_true = train_data['class']

## Musk

In [None]:
train_data = musk_data.copy()

In [None]:
start = time.process_time()
rng = np.random.RandomState(42)
clf = IsolationForest(max_samples = 256, random_state=rng, n_estimators = 100)
clf.fit(train_data.loc[:, train_data.columns != 'y'])
y_pred = clf.predict(train_data.loc[:, train_data.columns != 'y'])
y_scores = clf.score_samples(train_data.loc[:, train_data.columns != 'y'])
end = time.process_time()
musk_iforest_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
musk_iforest_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
musk_iforest_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
musk_iforest_auc_precision_recall = metrics.auc(recall, precision)
print(musk_iforest_auc_precision_recall)

In [None]:
musk_iforest_y = train_data['prediction']
musk_y_true = train_data['y']

## Bank

In [None]:
train_data = bank_data.copy()

In [None]:
start = time.process_time()
rng = np.random.RandomState(42)
clf = IsolationForest(max_samples = 256, random_state=rng, n_estimators = 100)
clf.fit(train_data.loc[:, train_data.columns != 'class'])
y_pred = clf.predict(train_data.loc[:, train_data.columns != 'class'])
y_scores = clf.score_samples(train_data.loc[:, train_data.columns != 'class'])
end = time.process_time()
bank_iforest_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['class'], train_data['y_scores'])
bank_iforest_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
bank_iforest_report = classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['class'], train_data['y_scores'])
bank_iforest_auc_precision_recall = metrics.auc(recall, precision)
print(bank_iforest_auc_precision_recall)

In [None]:
bank_iforest_y = train_data['prediction']
bank_y_true = train_data['class']

# LOF

## Arrhythmia

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = min_max_scaler.fit_transform(arrhythmia_data.copy())
train_data = pd.DataFrame(train_data)
train_data.columns = arrhythmia_data.columns

In [None]:
start = time.process_time()
lof = LocalOutlierFactor(n_neighbors=10, contamination=.1)
y_pred = lof.fit_predict(train_data.loc[:, train_data.columns != 'y'])
y_scores = lof.negative_outlier_factor_
end = time.process_time()
arrhythmia_lof_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
arrhythmia_lof_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
arrhythmia_lof_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
arrhythmia_lof_auc_precision_recall = metrics.auc(recall, precision)
print(arrhythmia_lof_auc_precision_recall)

In [None]:
arrhythmia_lof_y = train_data['prediction']

## Cardiotocography

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = min_max_scaler.fit_transform(cardiocotography_data.copy())
train_data = pd.DataFrame(train_data)
train_data.columns = cardiocotography_data.columns

In [None]:
start = time.process_time()
lof = LocalOutlierFactor(n_neighbors=18, contamination=.1, novelty=False)
y_pred = lof.fit_predict(train_data.loc[:, train_data.columns != 'y'])
y_scores = lof.negative_outlier_factor_
end = time.process_time()
cardiocotography_lof_time = end - start 
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
cardiocotography_lof_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
cardiocotography_lof_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
cardiocotography_lof_auc_precision_recall = metrics.auc(recall, precision)
print(cardiocotography_lof_auc_precision_recall)

In [None]:
cardio_lof_y = train_data['prediction']

## ForestCover

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = min_max_scaler.fit_transform(forestcover_data.copy())
train_data = pd.DataFrame(train_data)
train_data.columns = forestcover_data.columns

In [None]:
start = time.process_time()
lof = LocalOutlierFactor(n_neighbors=2860, contamination=.1)
y_pred = lof.fit_predict(train_data.loc[:, train_data.columns != 'y'])
y_scores = lof.negative_outlier_factor_
end = time.process_time()
forestcover_lof_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
forestcover_lof_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
forestcover_lof_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
forestcover_lof_auc_precision_recall = metrics.auc(recall, precision)
print(forestcover_lof_auc_precision_recall)

In [None]:
forestcover_lof_y = train_data['prediction']

## Annthyroid

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = min_max_scaler.fit_transform(annthyroid_data.copy())
train_data = pd.DataFrame(train_data)
train_data.columns = annthyroid_data.columns

In [None]:
start = time.process_time()

lof = LocalOutlierFactor(n_neighbors=72, contamination=.1, novelty=False)
y_pred = lof.fit_predict(train_data.loc[:, train_data.columns != 'y'])
y_scores = lof.negative_outlier_factor_

end = time.process_time()
annthyroid_lof_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
annthyroid_lof_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
annthyroid_lof_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
annthyroid_lof_auc_precision_recall = metrics.auc(recall, precision)
print(annthyroid_lof_auc_precision_recall)

In [None]:
annthyroid_lof_y = train_data['prediction']

## Credit card

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = min_max_scaler.fit_transform(creditcard_data.copy())
train_data = pd.DataFrame(train_data)
train_data.columns = creditcard_data.columns

In [None]:
start = time.process_time()

lof = LocalOutlierFactor(n_neighbors=2848, contamination=.1, novelty=False)
y_pred = lof.fit_predict(train_data.loc[:, train_data.columns != 'Class'])
y_scores = lof.negative_outlier_factor_

end = time.process_time()
creditcard_lof_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['Class'], train_data['y_scores'])
creditcard_lof_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
creditcard_lof_report = classification_report(train_data['Class'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['Class'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['Class'], train_data['y_scores'])
creditcard_lof_auc_precision_recall = metrics.auc(recall, precision)
print(creditcard_lof_auc_precision_recall)

In [None]:
creditcard_lof_y = train_data['prediction']

## Mammography

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = min_max_scaler.fit_transform(mammography_data.copy())
train_data = pd.DataFrame(train_data)
train_data.columns = mammography_data.columns

In [None]:
start = time.process_time()

lof = LocalOutlierFactor(n_neighbors=111, contamination=.1, novelty=False)
y_pred = lof.fit_predict(train_data.loc[:, train_data.columns != 'y'])
y_scores = lof.negative_outlier_factor_

end = time.process_time()
mammography_lof_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
mammography_lof_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
mammography_lof_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
mammography_lof_auc_precision_recall = metrics.auc(recall, precision)
print(mammography_lof_auc_precision_recall)

In [None]:
mammography_lof_y = train_data['prediction']

## Shuttle

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = min_max_scaler.fit_transform(shuttle_data.copy())
train_data = pd.DataFrame(train_data)
train_data.columns = shuttle_data.columns

In [None]:
start = time.process_time()

lof = LocalOutlierFactor(n_neighbors=491, contamination=.1, novelty=False)
y_pred = lof.fit_predict(train_data.loc[:, train_data.columns != 'y'])
y_scores = lof.negative_outlier_factor_

end = time.process_time()
shuttle_lof_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
shuttle_lof_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
shuttle_lof_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
shuttle_lof_auc_precision_recall = metrics.auc(recall, precision)
print(shuttle_lof_auc_precision_recall)

In [None]:
shuttle_lof_y = train_data['prediction']

## Mnist

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = min_max_scaler.fit_transform(mnist_data.copy())
train_data = pd.DataFrame(train_data)
train_data.columns = mnist_data.columns

In [None]:
start = time.process_time()

lof = LocalOutlierFactor(n_neighbors=76, contamination=.1, novelty=False)
y_pred = lof.fit_predict(train_data.loc[:, train_data.columns != 'y'])
y_scores = lof.negative_outlier_factor_

end = time.process_time()
mnist_lof_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
mnist_lof_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
mnist_lof_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
mnist_lof_auc_precision_recall = metrics.auc(recall, precision)
print(mnist_lof_auc_precision_recall)

In [None]:
mnist_lof_y = train_data['prediction']

## Vowels

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = min_max_scaler.fit_transform(vowels_data.copy())
train_data = pd.DataFrame(train_data)
train_data.columns = vowels_data.columns

In [None]:
start = time.process_time()

lof = LocalOutlierFactor(n_neighbors=15, contamination=.1, novelty=False)
y_pred = lof.fit_predict(train_data.loc[:, train_data.columns != 'y'])
y_scores = lof.negative_outlier_factor_

end = time.process_time()
vowels_lof_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
train_data['y_scores'] = -y_scores

In [None]:
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
vowels_lof_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

In [None]:
vowels_lof_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve (trapezoidal rule via metrics.auc).
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
vowels_lof_auc_precision_recall = metrics.auc(recall, precision)
print(vowels_lof_auc_precision_recall)

In [None]:
# Saved under its own name because train_data is reassigned in the next section.
vowels_lof_y = train_data['prediction']

## Seismic

In [None]:
# Rescale every column (label included) to [0, 1]; fit_transform returns a
# bare array, so the original column names are put back.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = pd.DataFrame(min_max_scaler.fit_transform(seismic_data.copy()),
                          columns=seismic_data.columns)

In [None]:
# Time the LOF fit; process_time() counts CPU time of this process, not wall-clock time.
start = time.process_time()
# Fit on the features only (label column 'class' excluded).
lof = LocalOutlierFactor(n_neighbors=26, contamination=.1)
y_pred = lof.fit_predict(train_data.loc[:, train_data.columns != 'class'])
y_scores = lof.negative_outlier_factor_
end = time.process_time()
seismic_lof_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
# def_outlier turns a y_pred of -1 (outlier) into 1 and every other value into 0.
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
# negative_outlier_factor_ is lower for more abnormal points; negate it so a
# higher score means "more anomalous" in the ROC / PR computations.
train_data['y_scores'] = -y_scores

In [None]:
# ROC curve from the LOF outlier scores; the area is stored and displayed.
fpr, tpr, _ = metrics.roc_curve(train_data['class'], train_data['y_scores'])
seismic_lof_auc = metrics.auc(fpr, tpr)
seismic_lof_auc

In [None]:
# Keep the per-class metrics as a dict and also print the readable text report.
seismic_lof_report = classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve (trapezoidal rule via metrics.auc).
precision, recall, thresholds = precision_recall_curve(train_data['class'], train_data['y_scores'])
seismic_lof_auc_precision_recall = metrics.auc(recall, precision)
print(seismic_lof_auc_precision_recall)

In [None]:
# Saved under its own name because train_data is reassigned in the next section.
seismic_lof_y = train_data['prediction']

## Musk

In [None]:
# Rescale every column (label included) to [0, 1]; fit_transform returns a
# bare array, so the original column names are put back.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = pd.DataFrame(min_max_scaler.fit_transform(musk_data.copy()),
                          columns=musk_data.columns)

In [None]:
# Time the LOF fit; process_time() counts CPU time of this process, not wall-clock time.
start = time.process_time()
# Fit on the features only (label column 'y' excluded).
lof = LocalOutlierFactor(n_neighbors=31, contamination=.1)
y_pred = lof.fit_predict(train_data.loc[:, train_data.columns != 'y'])
y_scores = lof.negative_outlier_factor_
end = time.process_time()
musk_lof_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
# def_outlier turns a y_pred of -1 (outlier) into 1 and every other value into 0.
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
# negative_outlier_factor_ is lower for more abnormal points; negate it so a
# higher score means "more anomalous" in the ROC / PR computations.
train_data['y_scores'] = -y_scores

In [None]:
# ROC curve from the LOF outlier scores; the area is stored and displayed.
fpr, tpr, _ = metrics.roc_curve(train_data['y'], train_data['y_scores'])
musk_lof_auc = metrics.auc(fpr, tpr)
musk_lof_auc

In [None]:
# Keep the per-class metrics as a dict and also print the readable text report.
musk_lof_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve (trapezoidal rule via metrics.auc).
precision, recall, thresholds = precision_recall_curve(train_data['y'], train_data['y_scores'])
musk_lof_auc_precision_recall = metrics.auc(recall, precision)
print(musk_lof_auc_precision_recall)

In [None]:
# Saved under its own name because train_data is reassigned in the next section.
musk_lof_y = train_data['prediction']

## Bank

In [None]:
# Rescale every column (label included) to [0, 1]; fit_transform returns a
# bare array, so the original column names are put back.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = pd.DataFrame(min_max_scaler.fit_transform(bank_data.copy()),
                          columns=bank_data.columns)

In [None]:
# Time the LOF fit; process_time() counts CPU time of this process, not wall-clock time.
start = time.process_time()
# Fit on the features only (label column 'class' excluded).
lof = LocalOutlierFactor(n_neighbors=412, contamination=.1)
y_pred = lof.fit_predict(train_data.loc[:, train_data.columns != 'class'])
y_scores = lof.negative_outlier_factor_
end = time.process_time()
bank_lof_time = end - start
print(end - start)

In [None]:
train_data['y_pred'] = y_pred
# def_outlier turns a y_pred of -1 (outlier) into 1 and every other value into 0.
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
# negative_outlier_factor_ is lower for more abnormal points; negate it so a
# higher score means "more anomalous" in the ROC / PR computations.
train_data['y_scores'] = -y_scores

In [None]:
# ROC curve from the LOF outlier scores; the area is stored and displayed.
fpr, tpr, _ = metrics.roc_curve(train_data['class'], train_data['y_scores'])
bank_lof_auc = metrics.auc(fpr, tpr)
bank_lof_auc

In [None]:
# Keep the per-class metrics as a dict and also print the readable text report.
bank_lof_report = classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve (trapezoidal rule via metrics.auc).
precision, recall, thresholds = precision_recall_curve(train_data['class'], train_data['y_scores'])
bank_lof_auc_precision_recall = metrics.auc(recall, precision)
print(bank_lof_auc_precision_recall)

In [None]:
# Saved under its own name because train_data is reassigned in the next section.
bank_lof_y = train_data['prediction']

# DBSCAN

## Arrhythmia

In [None]:
# Rescale every column (label included) to [0, 1]; fit_transform returns a
# bare array, so the original column names are put back.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = pd.DataFrame(min_max_scaler.fit_transform(arrhythmia_data.copy()),
                          columns=arrhythmia_data.columns)

In [None]:
# k-distance graph: sorted 25th-nearest-neighbour distances (presumably used to pick eps).
dist = dbscan_tuner(train_data,25)

In [None]:
# Time the DBSCAN fit; process_time() counts CPU time of this process, not wall-clock time.
start = time.process_time()
# Cluster on the features only (label column 'y' excluded); noise points get label -1.
dbscan = DBSCAN(eps = 2.5, min_samples = 40)
clusters = dbscan.fit_predict(train_data.loc[:, train_data.columns != 'y'])

end = time.process_time()
arrhythmia_dbscan_time = end - start
print(end - start)

In [None]:
# DBSCAN marks noise as -1; def_outlier maps -1 to 1 (outlier) and every cluster label to 0.
train_data['y_pred'] = clusters
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)

In [None]:
# Anomaly score for the ROC / PR curves: Euclidean distance, in the scaled
# feature space, between each observation and a DBSCAN cluster centroid.
original_columns = arrhythmia_data.columns.to_list()
original_columns.remove('y')

# Per-label column means; label -1 is DBSCAN noise, not a real cluster.
cluster_centers = pd.pivot_table(train_data,
              index = 'y_pred',
              aggfunc = 'mean')
real_centers = cluster_centers.loc[cluster_centers.index != -1, original_columns]

is_noise = train_data['y_pred'] == -1

# Clustered points: distance to the centroid of their own cluster.
data_w_score = train_data[~is_noise].copy()
own_centers = real_centers.loc[data_w_score['y_pred'].to_numpy()].to_numpy()
data_w_score['score'] = np.linalg.norm(
    data_w_score[original_columns].to_numpy() - own_centers, axis = 1)

# Noise points: distance to the nearest centroid. Whole-column assignment
# replaces the chained `['score'].iloc[i] = ...` writes, and no
# DataFrame.append is needed (it was removed in pandas 2.0).
anomalies_data = train_data[is_noise].copy()
anomalies_data['score'] = distance.cdist(
    anomalies_data[original_columns].to_numpy(), real_centers.to_numpy()).min(axis = 1)

data_for_auprc = pd.concat([data_w_score, anomalies_data])

In [None]:
# ROC curve from the centroid-distance scores; the area is stored and displayed.
fpr, tpr, _ = metrics.roc_curve(data_for_auprc['y'], data_for_auprc['score'])
arrhythmia_dbscan_auc = metrics.auc(fpr, tpr)
arrhythmia_dbscan_auc

In [None]:
# Keep the per-class metrics as a dict and also print the readable text report.
arrhythmia_dbscan_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve (trapezoidal rule via metrics.auc).
precision, recall, thresholds = precision_recall_curve(data_for_auprc['y'], data_for_auprc['score'])
arrhythmia_dbscan_auc_precision_recall = metrics.auc(recall, precision)
print(arrhythmia_dbscan_auc_precision_recall)

In [None]:
# Saved under its own name because train_data is reassigned in the next section.
arrhythmia_dbscan_y = train_data['prediction']

## Cardiotocography

In [None]:
# Rescale every column (label included) to [0, 1]; fit_transform returns a
# bare array, so the original column names are put back.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = pd.DataFrame(min_max_scaler.fit_transform(cardiocotography_data.copy()),
                          columns=cardiocotography_data.columns)

In [None]:
# k-distance graph: sorted 41st-nearest-neighbour distances (presumably used to pick eps).
dist = dbscan_tuner(train_data,41)

In [None]:
# Time the DBSCAN fit; process_time() counts CPU time of this process, not wall-clock time.
start = time.process_time()
# Cluster on the features only (label column 'y' excluded); noise points get label -1.
dbscan = DBSCAN(eps = 0.7, min_samples = 42)
clusters = dbscan.fit_predict(train_data.loc[:, train_data.columns != 'y'])
end = time.process_time()
cardiocotography_dbscan_time = end - start
print(end - start)

In [None]:
# DBSCAN marks noise as -1; def_outlier maps -1 to 1 (outlier) and every cluster label to 0.
train_data['y_pred'] = clusters
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)

In [None]:
# Anomaly score for the ROC / PR curves: Euclidean distance, in the scaled
# feature space, between each observation and a DBSCAN cluster centroid.
original_columns = cardiocotography_data.columns.to_list()
original_columns.remove('y')

# Per-label column means; label -1 is DBSCAN noise, not a real cluster.
cluster_centers = pd.pivot_table(train_data,
              index = 'y_pred',
              aggfunc = 'mean')
real_centers = cluster_centers.loc[cluster_centers.index != -1, original_columns]

is_noise = train_data['y_pred'] == -1

# Clustered points: distance to the centroid of their own cluster.
data_w_score = train_data[~is_noise].copy()
own_centers = real_centers.loc[data_w_score['y_pred'].to_numpy()].to_numpy()
data_w_score['score'] = np.linalg.norm(
    data_w_score[original_columns].to_numpy() - own_centers, axis = 1)

# Noise points: distance to the nearest centroid. Whole-column assignment
# replaces the chained `['score'].iloc[i] = ...` writes, and no
# DataFrame.append is needed (it was removed in pandas 2.0).
anomalies_data = train_data[is_noise].copy()
anomalies_data['score'] = distance.cdist(
    anomalies_data[original_columns].to_numpy(), real_centers.to_numpy()).min(axis = 1)

data_for_auprc = pd.concat([data_w_score, anomalies_data])

In [None]:
# ROC curve from the centroid-distance scores; the area is stored and displayed.
fpr, tpr, _ = metrics.roc_curve(data_for_auprc['y'], data_for_auprc['score'])
cardiocotography_dbscan_auc = metrics.auc(fpr, tpr)
cardiocotography_dbscan_auc

In [None]:
# Keep the per-class metrics as a dict and also print the readable text report.
cardiocotography_dbscan_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve (trapezoidal rule via metrics.auc).
precision, recall, thresholds = precision_recall_curve(data_for_auprc['y'], data_for_auprc['score'])
cardiocotography_dbscan_auc_precision_recall = metrics.auc(recall, precision)
print(cardiocotography_dbscan_auc_precision_recall)

In [None]:
# Saved under its own name because train_data is reassigned in the next section.
cardio_dbscan_y = train_data['prediction']

## ForestCover

In [None]:
# Rescale every column (label included) to [0, 1]; fit_transform returns a
# bare array, so the original column names are put back.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = pd.DataFrame(min_max_scaler.fit_transform(forestcover_data.copy()),
                          columns=forestcover_data.columns)

In [None]:
# k-distance graph (19th-nearest-neighbour distances) on a 10% sample --
# presumably sampled because dbscan_tuner builds the full pairwise distance
# matrix. The fixed seed makes the sample, and so the plot, reproducible.
data_dbscan = train_data.sample(frac=0.1, random_state=42)
dist = dbscan_tuner(data_dbscan ,19)

In [None]:
# Time the DBSCAN fit; process_time() counts CPU time of this process, not wall-clock time.
start = time.process_time()
# Cluster on the features only (label column 'y' excluded); noise points get label -1.
dbscan = DBSCAN(eps = 0.1, min_samples = 20)
clusters = dbscan.fit_predict(train_data.loc[:, train_data.columns != 'y'])
end = time.process_time()
forestcover_dbscan_time = end - start
print(end - start)

In [None]:
# DBSCAN marks noise as -1; def_outlier maps -1 to 1 (outlier) and every cluster label to 0.
train_data['y_pred'] = clusters
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)

In [None]:
# Anomaly score for the ROC / PR curves: Euclidean distance, in the scaled
# feature space, between each observation and a DBSCAN cluster centroid.
original_columns = forestcover_data.columns.to_list()
original_columns.remove('y')

# Per-label column means; label -1 is DBSCAN noise, not a real cluster.
cluster_centers = pd.pivot_table(train_data,
              index = 'y_pred',
              aggfunc = 'mean')
real_centers = cluster_centers.loc[cluster_centers.index != -1, original_columns]

is_noise = train_data['y_pred'] == -1

# Clustered points: distance to the centroid of their own cluster.
data_w_score = train_data[~is_noise].copy()
own_centers = real_centers.loc[data_w_score['y_pred'].to_numpy()].to_numpy()
data_w_score['score'] = np.linalg.norm(
    data_w_score[original_columns].to_numpy() - own_centers, axis = 1)

# Noise points: distance to the nearest centroid. Whole-column assignment
# replaces the chained `['score'].iloc[i] = ...` writes, and no
# DataFrame.append is needed (it was removed in pandas 2.0).
anomalies_data = train_data[is_noise].copy()
anomalies_data['score'] = distance.cdist(
    anomalies_data[original_columns].to_numpy(), real_centers.to_numpy()).min(axis = 1)

data_for_auprc = pd.concat([data_w_score, anomalies_data])

In [None]:
# ROC curve from the centroid-distance scores; the area is stored and displayed.
fpr, tpr, _ = metrics.roc_curve(data_for_auprc['y'], data_for_auprc['score'])
forestcover_dbscan_auc = metrics.auc(fpr, tpr)
forestcover_dbscan_auc

In [None]:
# Keep the per-class metrics as a dict and also print the readable text report.
forestcover_dbscan_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve (trapezoidal rule via metrics.auc).
precision, recall, thresholds = precision_recall_curve(data_for_auprc['y'], data_for_auprc['score'])
forestcover_dbscan_auc_precision_recall = metrics.auc(recall, precision)
print(forestcover_dbscan_auc_precision_recall)

In [None]:
# Saved under its own name because train_data is reassigned in the next section.
forestcover_dbscan_y = train_data['prediction']

## Annthyroid

In [None]:
# Rescale every column (label included) to [0, 1]; fit_transform returns a
# bare array, so the original column names are put back.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = pd.DataFrame(min_max_scaler.fit_transform(annthyroid_data.copy()),
                          columns=annthyroid_data.columns)

In [None]:
# k-distance graph: sorted 11th-nearest-neighbour distances (presumably used to pick eps).
dist = dbscan_tuner(train_data ,11)

In [None]:
# Time the DBSCAN fit; process_time() counts CPU time of this process, not wall-clock time.
start = time.process_time()
# Cluster on the features only (label column 'y' excluded); noise points get label -1.
dbscan = DBSCAN(eps = 0.1, min_samples = 12)
clusters = dbscan.fit_predict(train_data.loc[:, train_data.columns != 'y'])
end = time.process_time()
annthyroid_dbscan_time = end - start
print(end - start)

In [None]:
# DBSCAN marks noise as -1; def_outlier maps -1 to 1 (outlier) and every cluster label to 0.
train_data['y_pred'] = clusters
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)

In [None]:
# Anomaly score for the ROC / PR curves: Euclidean distance, in the scaled
# feature space, between each observation and a DBSCAN cluster centroid.
original_columns = annthyroid_data.columns.to_list()
original_columns.remove('y')

# Per-label column means; label -1 is DBSCAN noise, not a real cluster.
cluster_centers = pd.pivot_table(train_data,
              index = 'y_pred',
              aggfunc = 'mean')
real_centers = cluster_centers.loc[cluster_centers.index != -1, original_columns]

is_noise = train_data['y_pred'] == -1

# Clustered points: distance to the centroid of their own cluster.
data_w_score = train_data[~is_noise].copy()
own_centers = real_centers.loc[data_w_score['y_pred'].to_numpy()].to_numpy()
data_w_score['score'] = np.linalg.norm(
    data_w_score[original_columns].to_numpy() - own_centers, axis = 1)

# Noise points: distance to the nearest centroid. Whole-column assignment
# replaces the chained `['score'].iloc[i] = ...` writes, and no
# DataFrame.append is needed (it was removed in pandas 2.0).
anomalies_data = train_data[is_noise].copy()
anomalies_data['score'] = distance.cdist(
    anomalies_data[original_columns].to_numpy(), real_centers.to_numpy()).min(axis = 1)

data_for_auprc = pd.concat([data_w_score, anomalies_data])

In [None]:
# ROC curve from the centroid-distance scores; the area is stored and displayed.
fpr, tpr, _ = metrics.roc_curve(data_for_auprc['y'], data_for_auprc['score'])
annthyroid_dbscan_auc = metrics.auc(fpr, tpr)
annthyroid_dbscan_auc

In [None]:
# Keep the per-class metrics as a dict and also print the readable text report.
annthyroid_dbscan_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve (trapezoidal rule via metrics.auc).
precision, recall, thresholds = precision_recall_curve(data_for_auprc['y'], data_for_auprc['score'])
annthyroid_dbscan_auc_precision_recall = metrics.auc(recall, precision)
print(annthyroid_dbscan_auc_precision_recall)

In [None]:
# Saved under its own name because train_data is reassigned in the next section.
annthyroid_dbscan_y = train_data['prediction']

## Credit card

In [None]:
# Rescale every column (label included) to [0, 1]; fit_transform returns a
# bare array, so the original column names are put back.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = pd.DataFrame(min_max_scaler.fit_transform(creditcard_data.copy()),
                          columns=creditcard_data.columns)

In [None]:
# k-distance graph (57th-nearest-neighbour distances) on a 10% sample --
# presumably sampled because dbscan_tuner builds the full pairwise distance
# matrix. The fixed seed makes the sample, and so the plot, reproducible.
data_dbscan = train_data.sample(frac=0.1, random_state=42)
dist = dbscan_tuner(data_dbscan,57)

In [None]:
# Time the DBSCAN fit; process_time() counts CPU time of this process, not wall-clock time.
start = time.process_time()
# Cluster on the features only (label column 'Class' excluded); noise points get label -1.
dbscan = DBSCAN(eps = 0.15, min_samples = 58)
clusters = dbscan.fit_predict(train_data.loc[:, train_data.columns != 'Class'])
end = time.process_time()
creditcard_dbscan_time = end - start
print(end - start)

In [None]:
# DBSCAN marks noise as -1; def_outlier maps -1 to 1 (outlier) and every cluster label to 0.
train_data['y_pred'] = clusters
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)

In [None]:
# Anomaly score for the ROC / PR curves: Euclidean distance, in the scaled
# feature space, between each observation and a DBSCAN cluster centroid.
original_columns = creditcard_data.columns.to_list()
original_columns.remove('Class')

# Per-label column means; label -1 is DBSCAN noise, not a real cluster.
cluster_centers = pd.pivot_table(train_data,
              index = 'y_pred',
              aggfunc = 'mean')
real_centers = cluster_centers.loc[cluster_centers.index != -1, original_columns]

is_noise = train_data['y_pred'] == -1

# Clustered points: distance to the centroid of their own cluster.
data_w_score = train_data[~is_noise].copy()
own_centers = real_centers.loc[data_w_score['y_pred'].to_numpy()].to_numpy()
data_w_score['score'] = np.linalg.norm(
    data_w_score[original_columns].to_numpy() - own_centers, axis = 1)

# Noise points: distance to the nearest centroid. Whole-column assignment
# replaces the chained `['score'].iloc[i] = ...` writes, and no
# DataFrame.append is needed (it was removed in pandas 2.0).
anomalies_data = train_data[is_noise].copy()
anomalies_data['score'] = distance.cdist(
    anomalies_data[original_columns].to_numpy(), real_centers.to_numpy()).min(axis = 1)

data_for_auprc = pd.concat([data_w_score, anomalies_data])

In [None]:
# ROC curve from the centroid-distance scores; the area is stored and displayed.
fpr, tpr, _ = metrics.roc_curve(data_for_auprc['Class'], data_for_auprc['score'])
creditcard_dbscan_auc = metrics.auc(fpr, tpr)
creditcard_dbscan_auc

In [None]:
# Keep the per-class metrics as a dict and also print the readable text report.
creditcard_dbscan_report = classification_report(train_data['Class'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['Class'], train_data['prediction'], target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve (trapezoidal rule via metrics.auc).
precision, recall, thresholds = precision_recall_curve(data_for_auprc['Class'], data_for_auprc['score'])
creditcard_dbscan_auc_precision_recall = metrics.auc(recall, precision)
print(creditcard_dbscan_auc_precision_recall)

In [None]:
# Saved under its own name because train_data is reassigned in the next section.
creditcard_dbscan_y = train_data['prediction']

## Mammography

In [None]:
# Rescale every column (label included) to [0, 1]; fit_transform returns a
# bare array, so the original column names are put back.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = pd.DataFrame(min_max_scaler.fit_transform(mammography_data.copy()),
                          columns=mammography_data.columns)

In [None]:
# k-distance graph: sorted 11th-nearest-neighbour distances (presumably used to pick eps).
dist = dbscan_tuner(train_data,11)

In [None]:
# Time the DBSCAN fit; process_time() counts CPU time of this process, not wall-clock time.
start = time.process_time()
# Cluster on the features only (label column 'y' excluded); noise points get label -1.
dbscan = DBSCAN(eps = 0.07, min_samples = 12)
clusters = dbscan.fit_predict(train_data.loc[:, train_data.columns != 'y'])
end = time.process_time()
mammography_dbscan_time = end - start
print(end - start)

In [None]:
# DBSCAN marks noise as -1; def_outlier maps -1 to 1 (outlier) and every cluster label to 0.
train_data['y_pred'] = clusters
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)

In [None]:
# Anomaly score for the ROC / PR curves: Euclidean distance, in the scaled
# feature space, between each observation and a DBSCAN cluster centroid.
original_columns = mammography_data.columns.to_list()
original_columns.remove('y')

# Per-label column means; label -1 is DBSCAN noise, not a real cluster.
cluster_centers = pd.pivot_table(train_data,
              index = 'y_pred',
              aggfunc = 'mean')
real_centers = cluster_centers.loc[cluster_centers.index != -1, original_columns]

is_noise = train_data['y_pred'] == -1

# Clustered points: distance to the centroid of their own cluster.
data_w_score = train_data[~is_noise].copy()
own_centers = real_centers.loc[data_w_score['y_pred'].to_numpy()].to_numpy()
data_w_score['score'] = np.linalg.norm(
    data_w_score[original_columns].to_numpy() - own_centers, axis = 1)

# Noise points: distance to the nearest centroid. Whole-column assignment
# replaces the chained `['score'].iloc[i] = ...` writes, and no
# DataFrame.append is needed (it was removed in pandas 2.0).
anomalies_data = train_data[is_noise].copy()
anomalies_data['score'] = distance.cdist(
    anomalies_data[original_columns].to_numpy(), real_centers.to_numpy()).min(axis = 1)

data_for_auprc = pd.concat([data_w_score, anomalies_data])

In [None]:
# ROC curve from the centroid-distance scores; the area is stored and displayed.
fpr, tpr, _ = metrics.roc_curve(data_for_auprc['y'], data_for_auprc['score'])
mammography_dbscan_auc = metrics.auc(fpr, tpr)
mammography_dbscan_auc

In [None]:
# Keep the per-class metrics as a dict and also print the readable text report.
mammography_dbscan_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve (trapezoidal rule via metrics.auc).
precision, recall, thresholds = precision_recall_curve(data_for_auprc['y'], data_for_auprc['score'])
mammography_dbscan_auc_precision_recall = metrics.auc(recall, precision)
print(mammography_dbscan_auc_precision_recall)

In [None]:
# Saved under its own name because train_data is reassigned in the next section.
mammography_dbscan_y = train_data['prediction']

## Shuttle

In [None]:
# Rescale every column (label included) to [0, 1]; fit_transform returns a
# bare array, so the original column names are put back.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = pd.DataFrame(min_max_scaler.fit_transform(shuttle_data.copy()),
                          columns=shuttle_data.columns)

In [None]:
# k-distance graph: sorted 17th-nearest-neighbour distances (presumably used to pick eps).
dist = dbscan_tuner(train_data, 17)

In [None]:
# Time the DBSCAN fit; process_time() counts CPU time of this process, not wall-clock time.
start = time.process_time()

# Cluster on the features only (label column 'y' excluded); noise points get label -1.
dbscan = DBSCAN(eps = 0.005, min_samples = 18)
clusters = dbscan.fit_predict(train_data.loc[:, train_data.columns != 'y'])

end = time.process_time()
shuttle_dbscan_time = end - start
print(end - start)

In [None]:
# DBSCAN marks noise as -1; def_outlier maps -1 to 1 (outlier) and every cluster label to 0.
train_data['y_pred'] = clusters
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)

In [None]:
# Anomaly score for the ROC / PR curves: Euclidean distance, in the scaled
# feature space, between each observation and a DBSCAN cluster centroid.
original_columns = shuttle_data.columns.to_list()
original_columns.remove('y')

# Per-label column means; label -1 is DBSCAN noise, not a real cluster.
cluster_centers = pd.pivot_table(train_data,
              index = 'y_pred',
              aggfunc = 'mean')
real_centers = cluster_centers.loc[cluster_centers.index != -1, original_columns]

is_noise = train_data['y_pred'] == -1

# Clustered points: distance to the centroid of their own cluster.
data_w_score = train_data[~is_noise].copy()
own_centers = real_centers.loc[data_w_score['y_pred'].to_numpy()].to_numpy()
data_w_score['score'] = np.linalg.norm(
    data_w_score[original_columns].to_numpy() - own_centers, axis = 1)

# Noise points: distance to the nearest centroid. Whole-column assignment
# replaces the chained `['score'].iloc[i] = ...` writes, and no
# DataFrame.append is needed (it was removed in pandas 2.0).
anomalies_data = train_data[is_noise].copy()
anomalies_data['score'] = distance.cdist(
    anomalies_data[original_columns].to_numpy(), real_centers.to_numpy()).min(axis = 1)

data_for_auprc = pd.concat([data_w_score, anomalies_data])

In [None]:
# ROC curve from the centroid-distance scores; the area is stored and displayed.
fpr, tpr, _ = metrics.roc_curve(data_for_auprc['y'], data_for_auprc['score'])
shuttle_dbscan_auc = metrics.auc(fpr, tpr)
shuttle_dbscan_auc

In [None]:
# Keep the per-class metrics as a dict and also print the readable text report.
shuttle_dbscan_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve (trapezoidal rule via metrics.auc).
precision, recall, thresholds = precision_recall_curve(data_for_auprc['y'], data_for_auprc['score'])
shuttle_dbscan_auc_precision_recall = metrics.auc(recall, precision)
print(shuttle_dbscan_auc_precision_recall)

In [None]:
# Saved under its own name because train_data is reassigned in the next section.
shuttle_dbscan_y = train_data['prediction']

## Mnist

In [None]:
# Rescale every column (label included) to [0, 1]; fit_transform returns a
# bare array, so the original column names are put back.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = pd.DataFrame(min_max_scaler.fit_transform(mnist_data.copy()),
                          columns=mnist_data.columns)

In [None]:
# k-distance graph: sorted 155th-nearest-neighbour distances (presumably used to pick eps).
dist = dbscan_tuner(train_data,155)

In [None]:
# Time the DBSCAN fit; process_time() counts CPU time of this process, not wall-clock time.
start = time.process_time()

# Cluster on the features only (label column 'y' excluded); noise points get label -1.
dbscan = DBSCAN(eps = 2.5, min_samples = 156)
clusters = dbscan.fit_predict(train_data.loc[:, train_data.columns != 'y'])

end = time.process_time()
mnist_dbscan_time = end - start
print(end - start)

In [None]:
# DBSCAN marks noise as -1; def_outlier maps -1 to 1 (outlier) and every cluster label to 0.
train_data['y_pred'] = clusters
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)

In [None]:
# Anomaly score for the ROC / PR curves: Euclidean distance, in the scaled
# feature space, between each observation and a DBSCAN cluster centroid.
original_columns = mnist_data.columns.to_list()
original_columns.remove('y')

# Per-label column means; label -1 is DBSCAN noise, not a real cluster.
cluster_centers = pd.pivot_table(train_data,
              index = 'y_pred',
              aggfunc = 'mean')
real_centers = cluster_centers.loc[cluster_centers.index != -1, original_columns]

is_noise = train_data['y_pred'] == -1

# Clustered points: distance to the centroid of their own cluster.
data_w_score = train_data[~is_noise].copy()
own_centers = real_centers.loc[data_w_score['y_pred'].to_numpy()].to_numpy()
data_w_score['score'] = np.linalg.norm(
    data_w_score[original_columns].to_numpy() - own_centers, axis = 1)

# Noise points: distance to the nearest centroid. Whole-column assignment
# replaces the chained `['score'].iloc[i] = ...` writes, and no
# DataFrame.append is needed (it was removed in pandas 2.0).
anomalies_data = train_data[is_noise].copy()
anomalies_data['score'] = distance.cdist(
    anomalies_data[original_columns].to_numpy(), real_centers.to_numpy()).min(axis = 1)

data_for_auprc = pd.concat([data_w_score, anomalies_data])

In [None]:
# ROC curve from the centroid-distance scores; the area is stored and displayed.
fpr, tpr, _ = metrics.roc_curve(data_for_auprc['y'], data_for_auprc['score'])
mnist_dbscan_auc = metrics.auc(fpr, tpr)
mnist_dbscan_auc

In [None]:
# Keep the per-class metrics as a dict and also print the readable text report.
mnist_dbscan_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve (trapezoidal rule via metrics.auc).
precision, recall, thresholds = precision_recall_curve(data_for_auprc['y'], data_for_auprc['score'])
mnist_dbscan_auc_precision_recall = metrics.auc(recall, precision)
print(mnist_dbscan_auc_precision_recall)

In [None]:
# Saved under its own name because train_data is reassigned in the next section.
mnist_dbscan_y = train_data['prediction']

## Vowels

In [None]:
# Rescale every column (label included) to [0, 1]; fit_transform returns a
# bare array, so the original column names are put back.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = pd.DataFrame(min_max_scaler.fit_transform(vowels_data.copy()),
                          columns=vowels_data.columns)

In [None]:
# k-distance graph: sorted 23rd-nearest-neighbour distances (presumably used to pick eps).
dist = dbscan_tuner(train_data,23)

In [None]:
# Time the DBSCAN fit; process_time() counts CPU time of this process, not wall-clock time.
start = time.process_time()

# Cluster on the features only (label column 'y' excluded); noise points get label -1.
dbscan = DBSCAN(eps = 0.4, min_samples = 24)
clusters = dbscan.fit_predict(train_data.loc[:, train_data.columns != 'y'])

end = time.process_time()
vowels_dbscan_time = end - start
print(end - start)

In [None]:
# DBSCAN marks noise as -1; def_outlier maps -1 to 1 (outlier) and every cluster label to 0.
train_data['y_pred'] = clusters
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)

In [None]:
# Anomaly score for the ROC / PR curves: Euclidean distance, in the scaled
# feature space, between each observation and a DBSCAN cluster centroid.
original_columns = vowels_data.columns.to_list()
original_columns.remove('y')

# Per-label column means; label -1 is DBSCAN noise, not a real cluster.
cluster_centers = pd.pivot_table(train_data,
              index = 'y_pred',
              aggfunc = 'mean')
real_centers = cluster_centers.loc[cluster_centers.index != -1, original_columns]

is_noise = train_data['y_pred'] == -1

# Clustered points: distance to the centroid of their own cluster.
data_w_score = train_data[~is_noise].copy()
own_centers = real_centers.loc[data_w_score['y_pred'].to_numpy()].to_numpy()
data_w_score['score'] = np.linalg.norm(
    data_w_score[original_columns].to_numpy() - own_centers, axis = 1)

# Noise points: distance to the nearest centroid. Whole-column assignment
# replaces the chained `['score'].iloc[i] = ...` writes, and no
# DataFrame.append is needed (it was removed in pandas 2.0).
anomalies_data = train_data[is_noise].copy()
anomalies_data['score'] = distance.cdist(
    anomalies_data[original_columns].to_numpy(), real_centers.to_numpy()).min(axis = 1)

data_for_auprc = pd.concat([data_w_score, anomalies_data])

In [None]:
# ROC curve from the centroid-distance scores; the area is stored and displayed.
fpr, tpr, _ = metrics.roc_curve(data_for_auprc['y'], data_for_auprc['score'])
vowels_dbscan_auc = metrics.auc(fpr, tpr)
vowels_dbscan_auc

In [None]:
# Keep the per-class metrics as a dict and also print the readable text report.
vowels_dbscan_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve (trapezoidal rule via metrics.auc).
precision, recall, thresholds = precision_recall_curve(data_for_auprc['y'], data_for_auprc['score'])
vowels_dbscan_auc_precision_recall = metrics.auc(recall, precision)
print(vowels_dbscan_auc_precision_recall)

In [None]:
# Saved under its own name because train_data is reassigned in the next section.
vowels_dbscan_y = train_data['prediction']

## Seismic

In [None]:
# Rescale every column (label included) to [0, 1]; fit_transform returns a
# bare array, so the original column names are put back.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = pd.DataFrame(min_max_scaler.fit_transform(seismic_data.copy()),
                          columns=seismic_data.columns)

In [None]:
# k-distance graph: sorted 41st-nearest-neighbour distances (presumably used to pick eps).
dist = dbscan_tuner(train_data,41)

In [None]:
# Time the DBSCAN fit; process_time() counts CPU time of this process, not wall-clock time.
start = time.process_time()
# Cluster on the features only (label column 'class' excluded); noise points get label -1.
dbscan = DBSCAN(eps = 0.25, min_samples = 42)
clusters = dbscan.fit_predict(train_data.loc[:, train_data.columns != 'class'])
end = time.process_time()
seismic_dbscan_time = end - start
print(end - start)

In [None]:
# DBSCAN marks noise as -1; def_outlier maps -1 to 1 (outlier) and every cluster label to 0.
train_data['y_pred'] = clusters
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)

In [None]:
# Anomaly score for the ROC / PR curves: Euclidean distance, in the scaled
# feature space, between each observation and a DBSCAN cluster centroid.
original_columns = seismic_data.columns.to_list()
original_columns.remove('class')

# Per-label column means; label -1 is DBSCAN noise, not a real cluster.
cluster_centers = pd.pivot_table(train_data,
              index = 'y_pred',
              aggfunc = 'mean')
real_centers = cluster_centers.loc[cluster_centers.index != -1, original_columns]

is_noise = train_data['y_pred'] == -1

# Clustered points: distance to the centroid of their own cluster.
data_w_score = train_data[~is_noise].copy()
own_centers = real_centers.loc[data_w_score['y_pred'].to_numpy()].to_numpy()
data_w_score['score'] = np.linalg.norm(
    data_w_score[original_columns].to_numpy() - own_centers, axis = 1)

# Noise points: distance to the nearest centroid. Whole-column assignment
# replaces the chained `['score'].iloc[i] = ...` writes, and no
# DataFrame.append is needed (it was removed in pandas 2.0).
anomalies_data = train_data[is_noise].copy()
anomalies_data['score'] = distance.cdist(
    anomalies_data[original_columns].to_numpy(), real_centers.to_numpy()).min(axis = 1)

data_for_auprc = pd.concat([data_w_score, anomalies_data])

In [None]:
# ROC curve from the centroid-distance scores; the area is stored and displayed.
fpr, tpr, _ = metrics.roc_curve(data_for_auprc['class'], data_for_auprc['score'])
seismic_dbscan_auc = metrics.auc(fpr, tpr)
seismic_dbscan_auc

In [None]:
# Keep the per-class metrics as a dict and also print the readable text report.
seismic_dbscan_report = classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['class'], train_data['prediction'], target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve (trapezoidal rule via metrics.auc).
precision, recall, thresholds = precision_recall_curve(data_for_auprc['class'], data_for_auprc['score'])
seismic_dbscan_auc_precision_recall = metrics.auc(recall, precision)
print(seismic_dbscan_auc_precision_recall)

In [None]:
# Saved under its own name because train_data is reassigned in the next section.
seismic_dbscan_y = train_data['prediction']

## Musk

In [None]:
# Rescale every column (label included) to [0, 1]; fit_transform returns a
# bare array, so the original column names are put back.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_data = pd.DataFrame(min_max_scaler.fit_transform(musk_data.copy()),
                          columns=musk_data.columns)

In [None]:
# k-distance graph: sorted 331st-nearest-neighbour distances (presumably used to pick eps).
dist = dbscan_tuner(train_data,331)

In [None]:
# Time the DBSCAN fit; process_time() counts CPU time of this process, not wall-clock time.
start = time.process_time()
# Cluster on the features only (label column 'y' excluded); noise points get label -1.
dbscan = DBSCAN(eps = 3.2, min_samples = 332)
clusters = dbscan.fit_predict(train_data.loc[:, train_data.columns != 'y'])
end = time.process_time()
musk_dbscan_time = end - start
print(end - start)

In [None]:
# DBSCAN marks noise as -1; def_outlier maps -1 to 1 (outlier) and every cluster label to 0.
train_data['y_pred'] = clusters
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)

In [None]:
# Anomaly score for the ROC / PR curves: Euclidean distance, in the scaled
# feature space, between each observation and a DBSCAN cluster centroid.
original_columns = musk_data.columns.to_list()
original_columns.remove('y')

# Per-label column means; label -1 is DBSCAN noise, not a real cluster.
cluster_centers = pd.pivot_table(train_data,
              index = 'y_pred',
              aggfunc = 'mean')
real_centers = cluster_centers.loc[cluster_centers.index != -1, original_columns]

is_noise = train_data['y_pred'] == -1

# Clustered points: distance to the centroid of their own cluster.
data_w_score = train_data[~is_noise].copy()
own_centers = real_centers.loc[data_w_score['y_pred'].to_numpy()].to_numpy()
data_w_score['score'] = np.linalg.norm(
    data_w_score[original_columns].to_numpy() - own_centers, axis = 1)

# Noise points: distance to the nearest centroid. Whole-column assignment
# replaces the chained `['score'].iloc[i] = ...` writes, and no
# DataFrame.append is needed (it was removed in pandas 2.0).
anomalies_data = train_data[is_noise].copy()
anomalies_data['score'] = distance.cdist(
    anomalies_data[original_columns].to_numpy(), real_centers.to_numpy()).min(axis = 1)

data_for_auprc = pd.concat([data_w_score, anomalies_data])

In [None]:
# ROC curve from the centroid-distance scores; the area is stored and displayed.
fpr, tpr, _ = metrics.roc_curve(data_for_auprc['y'], data_for_auprc['score'])
musk_dbscan_auc = metrics.auc(fpr, tpr)
musk_dbscan_auc

In [None]:
# Keep the per-class metrics as a dict and also print the readable text report.
musk_dbscan_report = classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1'], output_dict=True)
print(classification_report(train_data['y'], train_data['prediction'], target_names = ['0','1']))

In [None]:
# Area under the precision-recall curve (trapezoidal rule via metrics.auc).
precision, recall, thresholds = precision_recall_curve(data_for_auprc['y'], data_for_auprc['score'])
musk_dbscan_auc_precision_recall = metrics.auc(recall, precision)
print(musk_dbscan_auc_precision_recall)

In [None]:
# Saved under its own name because train_data is reassigned in the next section.
musk_dbscan_y = train_data['prediction']

## Bank

In [None]:
# Rescale every column of the Bank data (the 'class' label included) to [0, 1]
# and wrap the resulting array in a DataFrame carrying the original column names.
min_max_scaler = MinMaxScaler(feature_range = (0, 1))
train_data = pd.DataFrame(
    min_max_scaler.fit_transform(bank_data.copy()),
    columns = bank_data.columns,
)

In [None]:
# Plot the sorted k-distance curve (index 123 of each point's sorted distances);
# dbscan_tuner drops the label column before measuring distances. Presumably the
# curve is used to pick eps for the DBSCAN run below; confirm.
dist = dbscan_tuner(train_data,123)

In [None]:
# Cluster the Bank features with DBSCAN and record the CPU time of the run.
# Every column except the 'class' label is passed to the clusterer.
start = time.process_time()
dbscan = DBSCAN(min_samples = 124, eps = 2)
feature_mask = train_data.columns != 'class'
clusters = dbscan.fit_predict(train_data.loc[:, feature_mask])
end = time.process_time()
bank_dbscan_time = end - start
print(bank_dbscan_time)

In [None]:
# Attach the DBSCAN labels and derive the binary outlier flag: DBSCAN marks noise
# with the label -1, so the flag is 1 for noise points and 0 for clustered points.
# The comparison is vectorised; it yields the same values as applying def_outlier
# row by row, without a Python call per observation, and also works on an empty frame.
train_data['y_pred'] = clusters
train_data['prediction'] = (train_data['y_pred'] == -1).astype('int64')

In [None]:
# Build a continuous anomaly score from the DBSCAN labels of the Bank data so that
# ROC / precision-recall curves can be computed:
#   * clustered points -> Euclidean distance to the centroid of their own cluster
#   * noise points (-1) -> Euclidean distance to the nearest cluster centroid
# Only the feature columns take part in the distances; the label 'class' is excluded.
original_columns = bank_data.columns.to_list()
original_columns.remove('class')

# Per-cluster frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and repeated appends are quadratic.
scored_clusters = []
for cluster_id in np.unique(clusters):
    if cluster_id == -1:  # DBSCAN's noise label; noise points are scored below
        continue
    score_data = train_data[train_data['y_pred'] == cluster_id].copy()
    center = score_data[original_columns].mean().to_numpy(dtype = float)
    score_data['score'] = distance.cdist(score_data[original_columns].to_numpy(dtype = float),
                                         center[np.newaxis, :]).ravel()
    scored_clusters.append(score_data)

if scored_clusters:
    data_w_score = pd.concat(scored_clusters, ignore_index = True)
else:
    # No cluster was found: keep an empty frame with the expected columns.
    data_w_score = pd.DataFrame(columns = train_data.columns.to_list() + ['score'])

anomalies_data = train_data[train_data['y_pred'] == -1].copy()
cluster_centers = pd.pivot_table(train_data,
              index = 'y_pred',
              aggfunc = 'mean')

# Centroids of the real clusters only (the mean of the noise points is not a cluster).
real_centers = cluster_centers.loc[cluster_centers.index != -1, original_columns]
if len(real_centers) and len(anomalies_data):
    # One vectorised distance matrix (noise points x centroids) replaces the per-row
    # loop with chained assignment, which does not write through under copy-on-write.
    anomalies_data['score'] = distance.cdist(anomalies_data[original_columns].to_numpy(dtype = float),
                                             real_centers.to_numpy(dtype = float)).min(axis = 1)
else:
    # Nothing to measure against (no clusters) or nothing to score (no noise points).
    anomalies_data['score'] = np.nan

data_for_auprc = pd.concat([data_w_score, anomalies_data])

In [None]:
# ROC curve of the distance-based scores against the Bank labels; the area under
# the curve is stored for the summary table and shown as the cell output.
fpr, tpr, _ = metrics.roc_curve(y_true = data_for_auprc['class'],
                                y_score = data_for_auprc['score'])
bank_dbscan_auc = metrics.auc(fpr, tpr)
bank_dbscan_auc

In [None]:
# Precision / recall / F1 of the binary outlier flag against the Bank labels:
# once as a dict (kept in bank_dbscan_report) and once as printed text.
report_kwargs = {'target_names': ['0','1']}
bank_dbscan_report = classification_report(train_data['class'], train_data['prediction'],
                                           output_dict = True, **report_kwargs)
print(classification_report(train_data['class'], train_data['prediction'], **report_kwargs))

In [None]:
# Precision-recall curve of the distance-based scores; its area (recall on the
# x axis, precision on the y axis) is stored and printed.
precision, recall, thresholds = precision_recall_curve(data_for_auprc['class'],
                                                       data_for_auprc['score'])
bank_dbscan_auc_precision_recall = metrics.auc(x = recall, y = precision)
print(bank_dbscan_auc_precision_recall)

In [None]:
# Keep the Bank DBSCAN outlier flags (1 = labelled as noise by DBSCAN) under a
# dataset-specific name so they outlive any later reassignment of train_data.
bank_dbscan_y = train_data['prediction']

# Performance

In [None]:
# Summary table for the Arrhythmia dataset: one row per detector holding the
# class-'1' F1 / recall / precision from its classification report, the recorded
# run time, the ROC AUC and the area under the precision-recall curve.
arrhythmia_iforest_performance = {
    'method': 'iForest',
    'f1-score': arrhythmia_iforest_report['1']['f1-score'],
    'sensitivity': arrhythmia_iforest_report['1']['recall'],
    'precision': arrhythmia_iforest_report['1']['precision'],
    'Time': arrhythmia_iforest_time,
    'AUC': arrhythmia_iforest_auc,
    'AU precision-recall curve': arrhythmia_iforest_auc_precision_recall,
}
arrhythmia_lof_performance = {
    'method': 'LOF',
    'f1-score': arrhythmia_lof_report['1']['f1-score'],
    'sensitivity': arrhythmia_lof_report['1']['recall'],
    'precision': arrhythmia_lof_report['1']['precision'],
    'Time': arrhythmia_lof_time,
    'AUC': arrhythmia_lof_auc,
    'AU precision-recall curve': arrhythmia_lof_auc_precision_recall,
}
arrhythmia_dbscan_performance = {
    'method': 'DBSCAN',
    'f1-score': arrhythmia_dbscan_report['1']['f1-score'],
    'sensitivity': arrhythmia_dbscan_report['1']['recall'],
    'precision': arrhythmia_dbscan_report['1']['precision'],
    'Time': arrhythmia_dbscan_time,
    'AUC': arrhythmia_dbscan_auc,
    'AU precision-recall curve': arrhythmia_dbscan_auc_precision_recall,
}

# Build the frame in one step from the records: DataFrame.append was removed in
# pandas 2.0, and appending to an empty frame leaves object-dtype columns.
arrhythmia_performance = pd.DataFrame(
    [arrhythmia_iforest_performance, arrhythmia_lof_performance, arrhythmia_dbscan_performance],
    columns = ['method','f1-score','sensitivity',
               'precision','Time','AUC','AU precision-recall curve'],
)

In [None]:
# Summary table for the Cardiotocography dataset: one row per detector holding the
# class-'1' F1 / recall / precision from its classification report, the recorded
# run time, the ROC AUC and the area under the precision-recall curve.
cardiocotography_iforest_performance = {
    'method': 'iForest',
    'f1-score': cardiocotography_iforest_report['1']['f1-score'],
    'sensitivity': cardiocotography_iforest_report['1']['recall'],
    'precision': cardiocotography_iforest_report['1']['precision'],
    'Time': cardiocotography_iforest_time,
    'AUC': cardiocotography_iforest_auc,
    'AU precision-recall curve': cardiocotography_iforest_auc_precision_recall,
}
cardiocotography_lof_performance = {
    'method': 'LOF',
    'f1-score': cardiocotography_lof_report['1']['f1-score'],
    'sensitivity': cardiocotography_lof_report['1']['recall'],
    'precision': cardiocotography_lof_report['1']['precision'],
    'Time': cardiocotography_lof_time,
    'AUC': cardiocotography_lof_auc,
    'AU precision-recall curve': cardiocotography_lof_auc_precision_recall,
}
cardiocotography_dbscan_performance = {
    'method': 'DBSCAN',
    'f1-score': cardiocotography_dbscan_report['1']['f1-score'],
    'sensitivity': cardiocotography_dbscan_report['1']['recall'],
    'precision': cardiocotography_dbscan_report['1']['precision'],
    'Time': cardiocotography_dbscan_time,
    'AUC': cardiocotography_dbscan_auc,
    'AU precision-recall curve': cardiocotography_dbscan_auc_precision_recall,
}

# Build the frame in one step from the records: DataFrame.append was removed in
# pandas 2.0, and appending to an empty frame leaves object-dtype columns.
cardiocotography_performance = pd.DataFrame(
    [cardiocotography_iforest_performance, cardiocotography_lof_performance, cardiocotography_dbscan_performance],
    columns = ['method','f1-score','sensitivity',
               'precision','Time','AUC','AU precision-recall curve'],
)

In [None]:
# Summary table for the ForestCover dataset: one row per detector holding the
# class-'1' F1 / recall / precision from its classification report, the recorded
# run time, the ROC AUC and the area under the precision-recall curve.
forestcover_iforest_performance = {
    'method': 'iForest',
    'f1-score': forestcover_iforest_report['1']['f1-score'],
    'sensitivity': forestcover_iforest_report['1']['recall'],
    'precision': forestcover_iforest_report['1']['precision'],
    'Time': forestcover_iforest_time,
    'AUC': forestcover_iforest_auc,
    'AU precision-recall curve': forestcover_iforest_auc_precision_recall,
}
forestcover_lof_performance = {
    'method': 'LOF',
    'f1-score': forestcover_lof_report['1']['f1-score'],
    'sensitivity': forestcover_lof_report['1']['recall'],
    'precision': forestcover_lof_report['1']['precision'],
    'Time': forestcover_lof_time,
    'AUC': forestcover_lof_auc,
    'AU precision-recall curve': forestcover_lof_auc_precision_recall,
}
forestcover_dbscan_performance = {
    'method': 'DBSCAN',
    'f1-score': forestcover_dbscan_report['1']['f1-score'],
    'sensitivity': forestcover_dbscan_report['1']['recall'],
    'precision': forestcover_dbscan_report['1']['precision'],
    'Time': forestcover_dbscan_time,
    'AUC': forestcover_dbscan_auc,
    'AU precision-recall curve': forestcover_dbscan_auc_precision_recall,
}

# Build the frame in one step from the records: DataFrame.append was removed in
# pandas 2.0, and appending to an empty frame leaves object-dtype columns.
forestcover_performance = pd.DataFrame(
    [forestcover_iforest_performance, forestcover_lof_performance, forestcover_dbscan_performance],
    columns = ['method','f1-score','sensitivity',
               'precision','Time','AUC','AU precision-recall curve'],
)

In [None]:
# Summary table for the Annthyroid dataset: one row per detector holding the
# class-'1' F1 / recall / precision from its classification report, the recorded
# run time, the ROC AUC and the area under the precision-recall curve.
annthyroid_iforest_performance = {
    'method': 'iForest',
    'f1-score': annthyroid_iforest_report['1']['f1-score'],
    'sensitivity': annthyroid_iforest_report['1']['recall'],
    'precision': annthyroid_iforest_report['1']['precision'],
    'Time': annthyroid_iforest_time,
    'AUC': annthyroid_iforest_auc,
    'AU precision-recall curve': annthyroid_iforest_auc_precision_recall,
}
annthyroid_lof_performance = {
    'method': 'LOF',
    'f1-score': annthyroid_lof_report['1']['f1-score'],
    'sensitivity': annthyroid_lof_report['1']['recall'],
    'precision': annthyroid_lof_report['1']['precision'],
    'Time': annthyroid_lof_time,
    'AUC': annthyroid_lof_auc,
    'AU precision-recall curve': annthyroid_lof_auc_precision_recall,
}
annthyroid_dbscan_performance = {
    'method': 'DBSCAN',
    'f1-score': annthyroid_dbscan_report['1']['f1-score'],
    'sensitivity': annthyroid_dbscan_report['1']['recall'],
    'precision': annthyroid_dbscan_report['1']['precision'],
    'Time': annthyroid_dbscan_time,
    'AUC': annthyroid_dbscan_auc,
    'AU precision-recall curve': annthyroid_dbscan_auc_precision_recall,
}

# Build the frame in one step from the records: DataFrame.append was removed in
# pandas 2.0, and appending to an empty frame leaves object-dtype columns.
annthyroid_performance = pd.DataFrame(
    [annthyroid_iforest_performance, annthyroid_lof_performance, annthyroid_dbscan_performance],
    columns = ['method','f1-score','sensitivity',
               'precision','Time','AUC','AU precision-recall curve'],
)

In [None]:
# Summary table for the Creditcard dataset: one row per detector holding the
# class-'1' F1 / recall / precision from its classification report, the recorded
# run time, the ROC AUC and the area under the precision-recall curve.
creditcard_iforest_performance = {
    'method': 'iForest',
    'f1-score': creditcard_iforest_report['1']['f1-score'],
    'sensitivity': creditcard_iforest_report['1']['recall'],
    'precision': creditcard_iforest_report['1']['precision'],
    'Time': creditcard_iforest_time,
    'AUC': creditcard_iforest_auc,
    'AU precision-recall curve': creditcard_iforest_auc_precision_recall,
}
creditcard_lof_performance = {
    'method': 'LOF',
    'f1-score': creditcard_lof_report['1']['f1-score'],
    'sensitivity': creditcard_lof_report['1']['recall'],
    'precision': creditcard_lof_report['1']['precision'],
    'Time': creditcard_lof_time,
    'AUC': creditcard_lof_auc,
    'AU precision-recall curve': creditcard_lof_auc_precision_recall,
}
creditcard_dbscan_performance = {
    'method': 'DBSCAN',
    'f1-score': creditcard_dbscan_report['1']['f1-score'],
    'sensitivity': creditcard_dbscan_report['1']['recall'],
    'precision': creditcard_dbscan_report['1']['precision'],
    'Time': creditcard_dbscan_time,
    'AUC': creditcard_dbscan_auc,
    'AU precision-recall curve': creditcard_dbscan_auc_precision_recall,
}

# Build the frame in one step from the records: DataFrame.append was removed in
# pandas 2.0, and appending to an empty frame leaves object-dtype columns.
creditcard_performance = pd.DataFrame(
    [creditcard_iforest_performance, creditcard_lof_performance, creditcard_dbscan_performance],
    columns = ['method','f1-score','sensitivity',
               'precision','Time','AUC','AU precision-recall curve'],
)

In [None]:
# Summary table for the Mammography dataset: one row per detector holding the
# class-'1' F1 / recall / precision from its classification report, the recorded
# run time, the ROC AUC and the area under the precision-recall curve.
mammography_iforest_performance = {
    'method': 'iForest',
    'f1-score': mammography_iforest_report['1']['f1-score'],
    'sensitivity': mammography_iforest_report['1']['recall'],
    'precision': mammography_iforest_report['1']['precision'],
    'Time': mammography_iforest_time,
    'AUC': mammography_iforest_auc,
    'AU precision-recall curve': mammography_iforest_auc_precision_recall,
}
mammography_lof_performance = {
    'method': 'LOF',
    'f1-score': mammography_lof_report['1']['f1-score'],
    'sensitivity': mammography_lof_report['1']['recall'],
    'precision': mammography_lof_report['1']['precision'],
    'Time': mammography_lof_time,
    'AUC': mammography_lof_auc,
    'AU precision-recall curve': mammography_lof_auc_precision_recall,
}
mammography_dbscan_performance = {
    'method': 'DBSCAN',
    'f1-score': mammography_dbscan_report['1']['f1-score'],
    'sensitivity': mammography_dbscan_report['1']['recall'],
    'precision': mammography_dbscan_report['1']['precision'],
    'Time': mammography_dbscan_time,
    'AUC': mammography_dbscan_auc,
    'AU precision-recall curve': mammography_dbscan_auc_precision_recall,
}

# Build the frame in one step from the records: DataFrame.append was removed in
# pandas 2.0, and appending to an empty frame leaves object-dtype columns.
mammography_performance = pd.DataFrame(
    [mammography_iforest_performance, mammography_lof_performance, mammography_dbscan_performance],
    columns = ['method','f1-score','sensitivity',
               'precision','Time','AUC','AU precision-recall curve'],
)

In [None]:
# Summary table for the Shuttle dataset: one row per detector holding the
# class-'1' F1 / recall / precision from its classification report, the recorded
# run time, the ROC AUC and the area under the precision-recall curve.
shuttle_iforest_performance = {
    'method': 'iForest',
    'f1-score': shuttle_iforest_report['1']['f1-score'],
    'sensitivity': shuttle_iforest_report['1']['recall'],
    'precision': shuttle_iforest_report['1']['precision'],
    'Time': shuttle_iforest_time,
    'AUC': shuttle_iforest_auc,
    'AU precision-recall curve': shuttle_iforest_auc_precision_recall,
}
shuttle_lof_performance = {
    'method': 'LOF',
    'f1-score': shuttle_lof_report['1']['f1-score'],
    'sensitivity': shuttle_lof_report['1']['recall'],
    'precision': shuttle_lof_report['1']['precision'],
    'Time': shuttle_lof_time,
    'AUC': shuttle_lof_auc,
    'AU precision-recall curve': shuttle_lof_auc_precision_recall,
}
shuttle_dbscan_performance = {
    'method': 'DBSCAN',
    'f1-score': shuttle_dbscan_report['1']['f1-score'],
    'sensitivity': shuttle_dbscan_report['1']['recall'],
    'precision': shuttle_dbscan_report['1']['precision'],
    'Time': shuttle_dbscan_time,
    'AUC': shuttle_dbscan_auc,
    'AU precision-recall curve': shuttle_dbscan_auc_precision_recall,
}

# Build the frame in one step from the records: DataFrame.append was removed in
# pandas 2.0, and appending to an empty frame leaves object-dtype columns.
shuttle_performance = pd.DataFrame(
    [shuttle_iforest_performance, shuttle_lof_performance, shuttle_dbscan_performance],
    columns = ['method','f1-score','sensitivity',
               'precision','Time','AUC','AU precision-recall curve'],
)

In [None]:
# Summary table for the MNIST dataset: one row per detector holding the
# class-'1' F1 / recall / precision from its classification report, the recorded
# run time, the ROC AUC and the area under the precision-recall curve.
mnist_iforest_performance = {
    'method': 'iForest',
    'f1-score': mnist_iforest_report['1']['f1-score'],
    'sensitivity': mnist_iforest_report['1']['recall'],
    'precision': mnist_iforest_report['1']['precision'],
    'Time': mnist_iforest_time,
    'AUC': mnist_iforest_auc,
    'AU precision-recall curve': mnist_iforest_auc_precision_recall,
}
mnist_lof_performance = {
    'method': 'LOF',
    'f1-score': mnist_lof_report['1']['f1-score'],
    'sensitivity': mnist_lof_report['1']['recall'],
    'precision': mnist_lof_report['1']['precision'],
    'Time': mnist_lof_time,
    'AUC': mnist_lof_auc,
    'AU precision-recall curve': mnist_lof_auc_precision_recall,
}
mnist_dbscan_performance = {
    'method': 'DBSCAN',
    'f1-score': mnist_dbscan_report['1']['f1-score'],
    'sensitivity': mnist_dbscan_report['1']['recall'],
    'precision': mnist_dbscan_report['1']['precision'],
    'Time': mnist_dbscan_time,
    'AUC': mnist_dbscan_auc,
    'AU precision-recall curve': mnist_dbscan_auc_precision_recall,
}

# Build the frame in one step from the records: DataFrame.append was removed in
# pandas 2.0, and appending to an empty frame leaves object-dtype columns.
mnist_performance = pd.DataFrame(
    [mnist_iforest_performance, mnist_lof_performance, mnist_dbscan_performance],
    columns = ['method','f1-score','sensitivity',
               'precision','Time','AUC','AU precision-recall curve'],
)

In [None]:
# Summary table for the Vowels dataset: one row per detector holding the
# class-'1' F1 / recall / precision from its classification report, the recorded
# run time, the ROC AUC and the area under the precision-recall curve.
vowels_iforest_performance = {
    'method': 'iForest',
    'f1-score': vowels_iforest_report['1']['f1-score'],
    'sensitivity': vowels_iforest_report['1']['recall'],
    'precision': vowels_iforest_report['1']['precision'],
    'Time': vowels_iforest_time,
    'AUC': vowels_iforest_auc,
    'AU precision-recall curve': vowels_iforest_auc_precision_recall,
}
vowels_lof_performance = {
    'method': 'LOF',
    'f1-score': vowels_lof_report['1']['f1-score'],
    'sensitivity': vowels_lof_report['1']['recall'],
    'precision': vowels_lof_report['1']['precision'],
    'Time': vowels_lof_time,
    'AUC': vowels_lof_auc,
    'AU precision-recall curve': vowels_lof_auc_precision_recall,
}
vowels_dbscan_performance = {
    'method': 'DBSCAN',
    'f1-score': vowels_dbscan_report['1']['f1-score'],
    'sensitivity': vowels_dbscan_report['1']['recall'],
    'precision': vowels_dbscan_report['1']['precision'],
    'Time': vowels_dbscan_time,
    'AUC': vowels_dbscan_auc,
    'AU precision-recall curve': vowels_dbscan_auc_precision_recall,
}

# Build the frame in one step from the records: DataFrame.append was removed in
# pandas 2.0, and appending to an empty frame leaves object-dtype columns.
vowels_performance = pd.DataFrame(
    [vowels_iforest_performance, vowels_lof_performance, vowels_dbscan_performance],
    columns = ['method','f1-score','sensitivity',
               'precision','Time','AUC','AU precision-recall curve'],
)

In [None]:
# Summary table for the Seismic dataset: one row per detector holding the
# class-'1' F1 / recall / precision from its classification report, the recorded
# run time, the ROC AUC and the area under the precision-recall curve.
seismic_iforest_performance = {
    'method': 'iForest',
    'f1-score': seismic_iforest_report['1']['f1-score'],
    'sensitivity': seismic_iforest_report['1']['recall'],
    'precision': seismic_iforest_report['1']['precision'],
    'Time': seismic_iforest_time,
    'AUC': seismic_iforest_auc,
    'AU precision-recall curve': seismic_iforest_auc_precision_recall,
}
seismic_lof_performance = {
    'method': 'LOF',
    'f1-score': seismic_lof_report['1']['f1-score'],
    'sensitivity': seismic_lof_report['1']['recall'],
    'precision': seismic_lof_report['1']['precision'],
    'Time': seismic_lof_time,
    'AUC': seismic_lof_auc,
    'AU precision-recall curve': seismic_lof_auc_precision_recall,
}
seismic_dbscan_performance = {
    'method': 'DBSCAN',
    'f1-score': seismic_dbscan_report['1']['f1-score'],
    'sensitivity': seismic_dbscan_report['1']['recall'],
    'precision': seismic_dbscan_report['1']['precision'],
    'Time': seismic_dbscan_time,
    'AUC': seismic_dbscan_auc,
    'AU precision-recall curve': seismic_dbscan_auc_precision_recall,
}

# Build the frame in one step from the records: DataFrame.append was removed in
# pandas 2.0, and appending to an empty frame leaves object-dtype columns.
seismic_performance = pd.DataFrame(
    [seismic_iforest_performance, seismic_lof_performance, seismic_dbscan_performance],
    columns = ['method','f1-score','sensitivity',
               'precision','Time','AUC','AU precision-recall curve'],
)

In [None]:
# Summary table for the Musk dataset: one row per detector holding the
# class-'1' F1 / recall / precision from its classification report, the recorded
# run time, the ROC AUC and the area under the precision-recall curve.
musk_iforest_performance = {
    'method': 'iForest',
    'f1-score': musk_iforest_report['1']['f1-score'],
    'sensitivity': musk_iforest_report['1']['recall'],
    'precision': musk_iforest_report['1']['precision'],
    'Time': musk_iforest_time,
    'AUC': musk_iforest_auc,
    'AU precision-recall curve': musk_iforest_auc_precision_recall,
}
musk_lof_performance = {
    'method': 'LOF',
    'f1-score': musk_lof_report['1']['f1-score'],
    'sensitivity': musk_lof_report['1']['recall'],
    'precision': musk_lof_report['1']['precision'],
    'Time': musk_lof_time,
    'AUC': musk_lof_auc,
    'AU precision-recall curve': musk_lof_auc_precision_recall,
}
musk_dbscan_performance = {
    'method': 'DBSCAN',
    'f1-score': musk_dbscan_report['1']['f1-score'],
    'sensitivity': musk_dbscan_report['1']['recall'],
    'precision': musk_dbscan_report['1']['precision'],
    'Time': musk_dbscan_time,
    'AUC': musk_dbscan_auc,
    'AU precision-recall curve': musk_dbscan_auc_precision_recall,
}

# Build the frame in one step from the records: DataFrame.append was removed in
# pandas 2.0, and appending to an empty frame leaves object-dtype columns.
musk_performance = pd.DataFrame(
    [musk_iforest_performance, musk_lof_performance, musk_dbscan_performance],
    columns = ['method','f1-score','sensitivity',
               'precision','Time','AUC','AU precision-recall curve'],
)

In [None]:
# Summary table for the Bank dataset: one row per detector holding the
# class-'1' F1 / recall / precision from its classification report, the recorded
# run time, the ROC AUC and the area under the precision-recall curve.
bank_iforest_performance = {
    'method': 'iForest',
    'f1-score': bank_iforest_report['1']['f1-score'],
    'sensitivity': bank_iforest_report['1']['recall'],
    'precision': bank_iforest_report['1']['precision'],
    'Time': bank_iforest_time,
    'AUC': bank_iforest_auc,
    'AU precision-recall curve': bank_iforest_auc_precision_recall,
}
bank_lof_performance = {
    'method': 'LOF',
    'f1-score': bank_lof_report['1']['f1-score'],
    'sensitivity': bank_lof_report['1']['recall'],
    'precision': bank_lof_report['1']['precision'],
    'Time': bank_lof_time,
    'AUC': bank_lof_auc,
    'AU precision-recall curve': bank_lof_auc_precision_recall,
}
bank_dbscan_performance = {
    'method': 'DBSCAN',
    'f1-score': bank_dbscan_report['1']['f1-score'],
    'sensitivity': bank_dbscan_report['1']['recall'],
    'precision': bank_dbscan_report['1']['precision'],
    'Time': bank_dbscan_time,
    'AUC': bank_dbscan_auc,
    'AU precision-recall curve': bank_dbscan_auc_precision_recall,
}

# Build the frame in one step from the records: DataFrame.append was removed in
# pandas 2.0, and appending to an empty frame leaves object-dtype columns.
bank_performance = pd.DataFrame(
    [bank_iforest_performance, bank_lof_performance, bank_dbscan_performance],
    columns = ['method','f1-score','sensitivity',
               'precision','Time','AUC','AU precision-recall curve'],
)

# AUPRC

In [None]:
# Load the 'AUPRC' sheet of the results workbook; the next cell treats every column
# except 'Dataset' as one algorithm (presumably one row per dataset; verify).
performance = pd.read_excel('./performance.xlsx', sheet_name = 'AUPRC')

In [None]:
# Rank the algorithms within every row of the AUPRC table (values are negated so
# the highest AUPRC gets rank 1) and average each algorithm's rank over all rows.
algorithms_names = performance.columns.drop('Dataset')
performances_array = performance[algorithms_names].to_numpy()
ranks = np.array([rankdata(-row) for row in performances_array])
average_ranks = ranks.mean(axis = 0)
print('\n'.join(f'{name} average rank: {rank}'
                for name, rank in zip(algorithms_names, average_ranks)))

# Time

In [None]:
# Load the 'Time' sheet of the results workbook; the next cell treats every column
# except 'Dataset' as one algorithm (presumably one row per dataset; verify).
performance = pd.read_excel('./performance.xlsx', sheet_name = 'Time')

In [None]:
# Rank the algorithms within every row of the Time table (no negation: the
# smallest time gets rank 1) and average each algorithm's rank over all rows.
algorithms_names = performance.columns.drop('Dataset')
performances_array = performance[algorithms_names].to_numpy()
ranks = np.array([rankdata(row) for row in performances_array])
average_ranks = ranks.mean(axis = 0)
print('\n'.join(f'{name} average rank: {rank}'
                for name, rank in zip(algorithms_names, average_ranks)))

# F1 score

In [None]:
# Load the 'F1 score' sheet of the results workbook; the next cell treats every column
# except 'Dataset' as one algorithm (presumably one row per dataset; verify).
performance = pd.read_excel('./performance.xlsx', sheet_name = 'F1 score')

In [None]:
# Rank the algorithms within every row of the F1 table (values are negated so
# the highest F1 score gets rank 1) and average each algorithm's rank over all rows.
algorithms_names = performance.columns.drop('Dataset')
performances_array = performance[algorithms_names].to_numpy()
ranks = np.array([rankdata(-row) for row in performances_array])
average_ranks = ranks.mean(axis = 0)
print('\n'.join(f'{name} average rank: {rank}'
                for name, rank in zip(algorithms_names, average_ranks)))

# Recall

In [None]:
# Load the 'recall' sheet of the results workbook; the next cell treats every column
# except 'Dataset' as one algorithm (presumably one row per dataset; verify).
performance = pd.read_excel('./performance.xlsx', sheet_name = 'recall')

In [None]:
# Rank the algorithms within every row of the recall table (values are negated so
# the highest recall gets rank 1) and average each algorithm's rank over all rows.
algorithms_names = performance.columns.drop('Dataset')
performances_array = performance[algorithms_names].to_numpy()
ranks = np.array([rankdata(-row) for row in performances_array])
average_ranks = ranks.mean(axis = 0)
print('\n'.join(f'{name} average rank: {rank}'
                for name, rank in zip(algorithms_names, average_ranks)))

# Precision

In [None]:
# Load the 'precision' sheet of the results workbook; the next cell treats every column
# except 'Dataset' as one algorithm (presumably one row per dataset; verify).
performance = pd.read_excel('./performance.xlsx', sheet_name = 'precision')

In [None]:
# Rank the algorithms within every row of the precision table (values are negated so
# the highest precision gets rank 1) and average each algorithm's rank over all rows.
algorithms_names = performance.columns.drop('Dataset')
performances_array = performance[algorithms_names].to_numpy()
ranks = np.array([rankdata(-row) for row in performances_array])
average_ranks = ranks.mean(axis = 0)
print('\n'.join(f'{name} average rank: {rank}'
                for name, rank in zip(algorithms_names, average_ranks)))

# Datasets visualization

In [None]:
# Load the 'Datasets' sheet; the plotting cell below reads its 'Dataset',
# '# observations', '% anomalies' and '# attributes ' columns.
datasets = pd.read_excel('./performance.xlsx', sheet_name = 'Datasets')

In [None]:
# Three side-by-side single-column heatmaps describing the datasets (number of
# observations, share of anomalies, number of attributes), each sorted by its own
# value, saved together as one SVG figure.
sb.set(font_scale=1.5)
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(35, 20))

datasets_by_name = datasets.set_index('Dataset')

# Panel 1: number of observations, colour bar labelled in thousands.
observations = datasets_by_name[['# observations']].sort_values(by = '# observations')
g1 = sb.heatmap(observations, cmap="Greys", annot=True, fmt=".0f",
                ax=ax1, annot_kws={"fontsize":22})
g1.set(ylabel=None)
cbar = ax1.collections[0].colorbar
cbar.set_ticks([50000, 100000, 150000, 200000, 250000])
cbar.set_ticklabels(['50K', '100K', '150K', '200K','250K'])
ax1.tick_params(rotation=30)

# Panel 2: share of anomalies, colour bar labelled in percent.
anomaly_share = datasets_by_name[['% anomalies']].sort_values(by = '% anomalies')
g2 = sb.heatmap(anomaly_share, cmap="Greys", annot=True, fmt=".2%",
                ax=ax2, annot_kws={"fontsize":22})
g2.set(ylabel=None)
cbar = ax2.collections[0].colorbar
cbar.set_ticks([.01, .05, .1, 0.14])
cbar.set_ticklabels(['1%', '5%', '10%', '14%'])
ax2.tick_params(rotation=30)

# Panel 3: number of attributes (the column name carries a trailing space).
attributes = datasets_by_name[['# attributes ']].sort_values(by = '# attributes ')
g3 = sb.heatmap(attributes, cmap="Greys", annot=True, fmt=".0f",
                ax=ax3, annot_kws={"fontsize":22})
g3.set(ylabel=None)
ax3.tick_params(rotation=30)

plt.savefig("Datasets.svg", format = 'svg', dpi=300, bbox_inches='tight')

# McNemar's test

## Arrhythmia

In [None]:
# Arrhythmia: continuity-corrected McNemar's test, `iforest` vs. `dbscan` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=arrhythmia_y_true,
    y_model1=arrhythmia_iforest_y,
    y_model2=arrhythmia_dbscan_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

In [None]:
# Arrhythmia: continuity-corrected McNemar's test, `iforest` vs. `lof` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=arrhythmia_y_true,
    y_model1=arrhythmia_iforest_y,
    y_model2=arrhythmia_lof_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

In [None]:
# Arrhythmia: continuity-corrected McNemar's test, `dbscan` vs. `lof` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=arrhythmia_y_true,
    y_model1=arrhythmia_dbscan_y,
    y_model2=arrhythmia_lof_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

## Cardiotocography

In [None]:
# Cardiotocography: continuity-corrected McNemar's test, `iforest` vs. `dbscan` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=cardio_y_true,
    y_model1=cardio_iforest_y,
    y_model2=cardio_dbscan_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

In [None]:
# Cardiotocography: continuity-corrected McNemar's test, `iforest` vs. `lof` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=cardio_y_true,
    y_model1=cardio_iforest_y,
    y_model2=cardio_lof_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

In [None]:
# Cardiotocography: continuity-corrected McNemar's test, `dbscan` vs. `lof` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=cardio_y_true,
    y_model1=cardio_dbscan_y,
    y_model2=cardio_lof_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

## ForestCover

In [None]:
# ForestCover: continuity-corrected McNemar's test, `iforest` vs. `dbscan` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=forestcover_y_true,
    y_model1=forestcover_iforest_y,
    y_model2=forestcover_dbscan_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

In [None]:
# ForestCover: continuity-corrected McNemar's test, `iforest` vs. `lof` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=forestcover_y_true,
    y_model1=forestcover_iforest_y,
    y_model2=forestcover_lof_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

In [None]:
# ForestCover: continuity-corrected McNemar's test, `dbscan` vs. `lof` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=forestcover_y_true,
    y_model1=forestcover_dbscan_y,
    y_model2=forestcover_lof_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

## Annthyroid

In [None]:
# Annthyroid: continuity-corrected McNemar's test, `iforest` vs. `dbscan` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=annthyroid_y_true,
    y_model1=annthyroid_iforest_y,
    y_model2=annthyroid_dbscan_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

In [None]:
# Annthyroid: continuity-corrected McNemar's test, `iforest` vs. `lof` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=annthyroid_y_true,
    y_model1=annthyroid_iforest_y,
    y_model2=annthyroid_lof_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

In [None]:
# Annthyroid: continuity-corrected McNemar's test, `dbscan` vs. `lof` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=annthyroid_y_true,
    y_model1=annthyroid_dbscan_y,
    y_model2=annthyroid_lof_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

## Credit card

In [None]:
# Credit card: continuity-corrected McNemar's test, `iforest` vs. `dbscan` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=creditcard_y_true,
    y_model1=creditcard_iforest_y,
    y_model2=creditcard_dbscan_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

In [None]:
# Credit card: continuity-corrected McNemar's test, `iforest` vs. `lof` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=creditcard_y_true,
    y_model1=creditcard_iforest_y,
    y_model2=creditcard_lof_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

In [None]:
# Credit card: continuity-corrected McNemar's test, `dbscan` vs. `lof` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=creditcard_y_true,
    y_model1=creditcard_dbscan_y,
    y_model2=creditcard_lof_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

## Mammography

In [None]:
# Mammography: continuity-corrected McNemar's test, `iforest` vs. `dbscan` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=mammography_y_true,
    y_model1=mammography_iforest_y,
    y_model2=mammography_dbscan_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

In [None]:
# Mammography: continuity-corrected McNemar's test, `iforest` vs. `lof` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=mammography_y_true,
    y_model1=mammography_iforest_y,
    y_model2=mammography_lof_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

In [None]:
# Mammography: continuity-corrected McNemar's test, `dbscan` vs. `lof` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=mammography_y_true,
    y_model1=mammography_dbscan_y,
    y_model2=mammography_lof_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

## Shuttle

In [None]:
# Shuttle: continuity-corrected McNemar's test, `iforest` vs. `dbscan` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=shuttle_y_true,
    y_model1=shuttle_iforest_y,
    y_model2=shuttle_dbscan_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

In [None]:
# Shuttle: continuity-corrected McNemar's test, `iforest` vs. `lof` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=shuttle_y_true,
    y_model1=shuttle_iforest_y,
    y_model2=shuttle_lof_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

In [None]:
# Shuttle: continuity-corrected McNemar's test, `dbscan` vs. `lof` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=shuttle_y_true,
    y_model1=shuttle_dbscan_y,
    y_model2=shuttle_lof_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

## Mnist

In [None]:
# Mnist: continuity-corrected McNemar's test, `iforest` vs. `dbscan` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=mnist_y_true,
    y_model1=mnist_iforest_y,
    y_model2=mnist_dbscan_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

In [None]:
# Mnist: continuity-corrected McNemar's test, `iforest` vs. `lof` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=mnist_y_true,
    y_model1=mnist_iforest_y,
    y_model2=mnist_lof_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

In [None]:
# Mnist: continuity-corrected McNemar's test, `dbscan` vs. `lof` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=mnist_y_true,
    y_model1=mnist_dbscan_y,
    y_model2=mnist_lof_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

## Vowels

In [None]:
# Vowels: continuity-corrected McNemar's test, `iforest` vs. `dbscan` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=vowels_y_true,
    y_model1=vowels_iforest_y,
    y_model2=vowels_dbscan_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

In [None]:
# Vowels: continuity-corrected McNemar's test, `iforest` vs. `lof` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=vowels_y_true,
    y_model1=vowels_iforest_y,
    y_model2=vowels_lof_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

In [None]:
# Vowels: continuity-corrected McNemar's test, `dbscan` vs. `lof` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=vowels_y_true,
    y_model1=vowels_dbscan_y,
    y_model2=vowels_lof_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

## Seismic

In [None]:
# Seismic: continuity-corrected McNemar's test, `iforest` vs. `dbscan` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=seismic_y_true,
    y_model1=seismic_iforest_y,
    y_model2=seismic_dbscan_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

In [None]:
# Seismic: continuity-corrected McNemar's test, `iforest` vs. `lof` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=seismic_y_true,
    y_model1=seismic_iforest_y,
    y_model2=seismic_lof_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

In [None]:
# Seismic: continuity-corrected McNemar's test, `dbscan` vs. `lof` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=seismic_y_true,
    y_model1=seismic_dbscan_y,
    y_model2=seismic_lof_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

## Musk

In [None]:
# Musk: continuity-corrected McNemar's test, `iforest` vs. `dbscan` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=musk_y_true,
    y_model1=musk_iforest_y,
    y_model2=musk_dbscan_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

In [None]:
# Musk: continuity-corrected McNemar's test, `iforest` vs. `lof` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=musk_y_true,
    y_model1=musk_iforest_y,
    y_model2=musk_lof_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

In [None]:
# Musk: continuity-corrected McNemar's test, `dbscan` vs. `lof` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=musk_y_true,
    y_model1=musk_dbscan_y,
    y_model2=musk_lof_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

## Bank

In [None]:
# Bank: continuity-corrected McNemar's test, `iforest` vs. `dbscan` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=bank_y_true,
    y_model1=bank_iforest_y,
    y_model2=bank_dbscan_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

In [None]:
# Bank: continuity-corrected McNemar's test, `iforest` vs. `lof` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=bank_y_true,
    y_model1=bank_iforest_y,
    y_model2=bank_lof_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")

In [None]:
# Bank: continuity-corrected McNemar's test, `dbscan` vs. `lof` predictions.
print("Test results:")
table = mcnemar_table(
    y_target=bank_y_true,
    y_model1=bank_dbscan_y,
    y_model2=bank_lof_y,
)
chi2, p = mcnemar(ary=table, corrected=True)
print(f"chi squared statistic: {chi2}", f"p-value: {p}\n", sep="\n")