In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

from sklearn.neighbors import LocalOutlierFactor
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG for repeatability.
# (The train/test splits further down pass their own explicit random_state.)
np.random.seed(42)

In [2]:
# header=None: no row is treated as a header, so the columns get integer labels.
# The last column is the class label; the columns before it are the features.
data = pd.read_csv('mammography.csv', header=None)

In [3]:
def predictAndEvaluate(X, ground_truth, **lof_params):
    """Fit a LocalOutlierFactor on ``X`` and score it against ``ground_truth``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Samples that are both fitted and labelled in one pass (``fit_predict``).
    ground_truth : array-like of shape (n_samples,)
        True labels in LOF's output convention: 1 for an inlier, -1 for an outlier.
    **lof_params
        Optional keyword arguments forwarded to ``LocalOutlierFactor``
        (for example ``n_neighbors=20, contamination=0.1``). With none given
        the estimator is built with its defaults. ``novelty`` must stay at its
        default because this function relies on ``fit_predict``.

    Returns
    -------
    tuple
        ``(auc, f1)``: the area under the ROC curve of the fitted
        ``negative_outlier_factor_`` scores with label 1 as the positive class,
        and the F1 score of the predicted labels.

    Notes
    -----
    ``f1_score`` is called without ``pos_label``, so F1 is reported for label 1,
    i.e. the inlier class under the convention above. When most samples are
    inliers this value stays high even if few outliers are found.
    """
    clf = LocalOutlierFactor(**lof_params)
    y_pred = clf.fit_predict(X)
    # The scores are passed with pos_label=1, so they are ranked against the inlier class.
    fpr, tpr, _ = metrics.roc_curve(ground_truth, clf.negative_outlier_factor_, pos_label=1)
    auc = metrics.auc(fpr, tpr)
    f1 = metrics.f1_score(ground_truth, y_pred)
    return (auc, f1)

In [4]:
# Summary rows collected across the experiments below and printed as a table at the end.
# Experiment cells append to this list, so re-running one without re-running
# this cell adds a duplicate row.
results = [] # (name, auc, f1)

In [5]:
## Need to flip class label, LOF outputs 1 for an inlier and -1 for an outlier
label_col = data.columns[-1]
if pd.api.types.is_numeric_dtype(data[label_col]):
    # This cell has already run: the labels are flipped integers. Flipping again
    # would swap inliers and outliers, so just read them back.
    ground_truth = data[label_col].to_numpy()
else:
    # Raw labels are quoted strings; strip the quotes, convert to int and negate.
    ground_truth = -data[label_col].str.strip("'").astype(int).to_numpy()
    # Replace the column by label (not iloc) so it gets an integer dtype instead
    # of being written element-wise into the existing object column.
    data[label_col] = ground_truth
# Take the features from the feature columns only, so the dtype of the label
# column cannot turn X into an object array.
X = data.iloc[:, :-1].to_numpy()
print(data.shape, X.shape, ground_truth.shape)

(11183, 7) (11183, 6) (11183,)


In [6]:
# Inspect the frame after relabelling: the last column should now hold 1 (inlier) / -1 (outlier).
data

Unnamed: 0,0,1,2,3,4,5,6
0,0.230020,5.072578,-0.276061,0.832444,-0.377866,0.480322,1
1,0.155491,-0.169390,0.670652,-0.859553,-0.377866,-0.945723,1
2,-0.784415,-0.443654,5.674705,-0.859553,-0.377866,-0.945723,1
3,0.546088,0.131415,-0.456387,-0.859553,-0.377866,-0.945723,1
4,-0.102987,-0.394994,-0.140816,0.979703,-0.377866,1.013566,1
...,...,...,...,...,...,...,...
11178,-0.250012,-0.377300,-0.321142,1.269157,3.652984,1.092791,-1
11179,0.281343,-0.417112,-0.366224,0.851010,2.789649,1.345700,-1
11180,1.204988,1.763724,-0.501468,1.562408,6.489072,0.931294,-1
11181,0.736644,-0.222474,-0.050653,1.509665,0.539269,1.315229,-1


In [7]:
# Class balance: number of samples per label and its share of the whole data set.
counter = Counter(ground_truth)
n_samples = len(ground_truth)
for label, count in counter.items():
    share = count / n_samples * 100
    print('Class=%s, Count=%d, Percentage=%.3f%%' % (label, count, share))

Class=1, Count=10923, Percentage=97.675%
Class=-1, Count=260, Percentage=2.325%


In [8]:
# Baseline: fit and score LOF on the full data set (inliers and outliers together),
# then record the result as the first row of the summary.
auc, f1 = predictAndEvaluate(X, ground_truth)
results.append(("Full data", auc, f1))
print(auc, f1)

0.7203702138747455 0.9831637541634348


In [9]:
def dofiftyfiftyTests(data, n, **lof_params):
    """Run ``n`` novelty-detection experiments that train on inliers only.

    In every run the inlier rows (label 1) are shuffled and split: a training
    part of ``len(data) // 2`` rows, and a held-out part with the remaining
    inliers. The test set is the held-out inliers followed by every outlier
    row (label -1). A ``LocalOutlierFactor`` in novelty mode is fitted on the
    training part and evaluated on the test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns followed by a final label column holding 1 (inlier)
        or -1 (outlier).
    n : int
        Number of runs. Run ``i`` splits with ``random_state=42 + i``.
    **lof_params
        Optional keyword arguments forwarded to ``LocalOutlierFactor``
        (for example ``n_neighbors=20``). ``novelty`` is always forced to
        ``True`` because the model is scored on samples it was not fitted on.

    Returns
    -------
    list of tuple
        One ``(run_index, auc, f1)`` tuple per run. AUC uses ``score_samples``
        with label 1 as the positive class; F1 is computed by ``f1_score``
        without ``pos_label``, i.e. for label 1 (the inlier class).

    Notes
    -----
    The training size is half of *all* rows, not half of the inlier rows, so
    the number of inliers must exceed ``len(data) // 2`` or
    ``train_test_split`` cannot produce the split.
    """
    # Everything that does not depend on the run index is computed once up front.
    labels = data.iloc[:, -1]
    features = data.iloc[:, :-1].to_numpy()
    inlier_features = features[(labels == 1).to_numpy()]
    outlier_features = features[(labels == -1).to_numpy()]
    train_size = len(data) // 2
    lof_params = {**lof_params, "novelty": True}

    results = []
    for i in range(n):
        # Split the inliers only; the outliers never enter the training set.
        train_inliers, heldout_inliers = train_test_split(
            inlier_features, train_size=train_size, shuffle=True, random_state=42 + i)
        # Append all outliers to the test set. No need to shuffle these:
        # the fitted model is not updated by the samples it scores.
        X_test = np.concatenate((heldout_inliers, outlier_features))
        # Ground truth in the same order: held-out inliers first, then outliers.
        ground_truth = np.concatenate(
            (np.ones(len(heldout_inliers)), -np.ones(len(outlier_features))))
        clf = LocalOutlierFactor(**lof_params)
        clf.fit(train_inliers)
        y_pred = clf.predict(X_test)
        nof = clf.score_samples(X_test)
        fpr, tpr, _ = metrics.roc_curve(ground_truth, nof, pos_label=1)
        auc = metrics.auc(fpr, tpr)
        f1 = metrics.f1_score(ground_truth, y_pred)
        results.append((i, auc, f1))
    return results

In [10]:
# 50/50 experiment on the data as loaded: per-run table, then one averaged summary row.
n = 10
r = dofiftyfiftyTests(data, n)

print("%-5s %5s %5s" % ("Run", "AUC", "F1"))
for run, auc, f1 in r:
    print("%-5d %.3f %.3f" % (run, auc, f1))

# Column-wise mean over the runs; the leading run-index column is dropped.
mean_auc, mean_f1 = np.array(r).mean(axis=0)[1:].tolist()
results.append(("50/50 data avg of %d runs" % n, mean_auc, mean_f1))

Run     AUC    F1
0     0.852 0.975
1     0.870 0.976
2     0.835 0.973
3     0.835 0.973
4     0.858 0.975
5     0.838 0.975
6     0.862 0.975
7     0.867 0.974
8     0.854 0.974
9     0.870 0.975


In [11]:
# Same 50/50 experiment, but with exact duplicate rows removed first.
n = 10
r = dofiftyfiftyTests(data.drop_duplicates(), n)

print("%-5s %5s %5s" % ("Run", "AUC", "F1"))
for run, auc, f1 in r:
    print("%-5d %.3f %.3f" % (run, auc, f1))

# Column-wise mean over the runs; the leading run-index column is dropped.
mean_auc, mean_f1 = np.array(r).mean(axis=0)[1:].tolist()
results.append(("50/50 dedup avg of %d runs" % n, mean_auc, mean_f1))

Run     AUC    F1
0     0.844 0.967
1     0.814 0.967
2     0.833 0.965
3     0.805 0.964
4     0.817 0.966
5     0.809 0.965
6     0.815 0.968
7     0.824 0.965
8     0.804 0.965
9     0.830 0.961


In [12]:
# Final comparison table: one line per experiment recorded in `results`.
summary_lines = ["%-30s %5s %5s" % ("Dataset", "AUC", "F1")]
summary_lines.extend(
    "%-30s %.3f %.3f" % (name, auc_value, f1_value)
    for name, auc_value, f1_value in results
)
print("\n".join(summary_lines))

Dataset                          AUC    F1
Full data                      0.720 0.983
50/50 data avg of 10 runs      0.854 0.974
50/50 dedup avg of 10 runs     0.820 0.965
