In [1]:
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd

import pathlib

from sklearn.model_selection import ParameterGrid
from sklearn.metrics import auc, roc_auc_score
from sklearn.utils import check_random_state
from joblib import Parallel, delayed

from sklearn.ensemble import IsolationForest

base_dir = pathlib.Path('/Users/vaibhav/MiscProjects/anomaly-detection')

In [2]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
from scipy.spatial.distance import pdist, squareform

In [3]:
df = pd.read_csv(base_dir/'datasets/musk.csv')

df_n = df.loc[df['class'] == 0]
df_o = df.loc[df['class'] == 1]

df_o_vals = df_o.values

# sample outliers
np.random.shuffle(df_o_vals)

df_o_subset = pd.DataFrame(df_o_vals[0:200, :], columns=list(df_o.columns))

df2 = pd.concat([df_n, df_o_subset], axis=0)

#np.unique(df['class'], return_counts=True)
#(array([0, 1]), array([5581, 1017]))

cols = ['f' + str(i) for i in range(1, 167)]

x = np.array(df2[cols].values, dtype='float')
y = np.array(df2['class'].values, dtype='int')
      
# standardize x
avg = np.mean(x, axis=0)
std = np.std(x, axis=0)

x = (x - avg)/std
    
n_samples, n_features = x.shape

In [4]:
x.shape

(5781, 166)

In [17]:
# calculate pairwise dist
# for each point, get neighbor idx

In [7]:
dists = pdist(x, metric='euclidean')

In [8]:
dists2 = squareform(dists)

In [12]:
allnbrs = np.zeros(dists2.shape, dtype='int')

In [13]:
for i in range(0, n_samples):
    allnbrs[i, :] = np.argsort(dists2[i, :])

In [63]:
nbrs_s = np.zeros(dists2.shape, dtype='int')

In [64]:
for k in range(0, n_samples):
    idx, ct = np.unique(allnbrs[:, k], return_counts=True)
    nbrs_s[idx, k] = ct

In [92]:
rho = 0.25
nrho = n_samples*rho

cfof = np.zeros((n_samples,), dtype='int')

for i in range(0, n_samples):
    nn = 0
    for k in range(0, n_samples):
        nn += nbrs_s[i, k]
        if nn >= nrho:
            cfof[i] = k
            break

print(roc_auc_score(y_true=y, y_score=-cfof))

0.5150739114853968


In [73]:
print(roc_auc_score(y_true=y, y_score=-cfof))

0.7994216986203189


In [86]:
print(roc_auc_score(y_true=y, y_score=-cfof))

0.7001276652929582
