In [20]:
import numpy as np
import pandas as pd
from sklearn.metrics import get_scorer_names, roc_auc_score
from sklearn.model_selection import (GridSearchCV, ShuffleSplit,
                                     train_test_split)
from sklearn.neighbors import LocalOutlierFactor

In [11]:
url = 'https://drive.google.com/file/d/1E9bFoMAwMVPMkZHmMQfWvSgaWoCLtKp4/view?usp=share_link'
url = 'https://drive.google.com/uc?id=' + url.split('/')[-2]
y_test_autoencoder = pd.read_pickle(url)

url = 'https://drive.google.com/file/d/1dknbQLJhQsbSFDNIpwjxZQwPc3fF14yF/view?usp=share_link'
url = 'https://drive.google.com/uc?id=' + url.split('/')[-2]
y_train_autoencoder = pd.read_pickle(url)


url = 'https://drive.google.com/file/d/105vmXogIQU8srmtdzfCHYQ9G-T7Iu5hl/view?usp=share_link'
url = 'https://drive.google.com/uc?id=' + url.split('/')[-2]
X_test_autoencoder = pd.read_pickle(url)

url = 'https://drive.google.com/file/d/1QemU274E0c14R8uknuipP20rNgDSLzcg/view?usp=share_link'
url = 'https://drive.google.com/uc?id=' + url.split('/')[-2]
X_train_autoencoder = pd.read_pickle(url)

In [12]:
print(X_train_autoencoder.shape)
print(X_test_autoencoder.shape)
print(y_train_autoencoder.shape)
print(y_test_autoencoder.shape)

(796896, 10)
(103495, 10)
(796896,)
(103495,)


According to the paper [Automatic Hyperparameter Tuning Method for Local Outlier Factor, with Applications to Anomaly Detection](https://arxiv.org/abs/1902.00567) by Zekun Xu et al., we are tuning hyperparameters (neighborhood size, containmination) simultanesouly. The first thing they did was check for skewness and log transform the data to alleviate the effect of extreme outliers, we did the same.

In [13]:
# show skewness
print(X_train_autoencoder.skew())

0    0.507113
1    0.488364
2    0.945407
3    1.036647
4    0.677018
5    0.338238
6    0.662746
7    0.976914
8    0.681232
9    1.016614
dtype: float32


We did noticed the same issue. Log Transform.

In [14]:
# log transform
X_train_autoencoder = np.log1p(X_train_autoencoder) # avoid dvide by zero
X_test_autoencoder = np.log1p(X_test_autoencoder)

The below grid search code doesn't work because [LocalOutlierFactor does not have a predict method](https://stackoverflow.com/questions/50013828/scikitlearn-model-giving-localoutlierfactor-object-has-no-attribute-predict), it instead have **private** method `_predict` that returns the labels (1 inlier, -1 outlier). Therefore, we need to write our own...

In [16]:
"""model = LocalOutlierFactor(n_jobs=-1) # uses all processors

# TODO; tune for contamination as well.
# Define the parameter grid for grid search
param_grid = {'n_neighbors': [3, 5, 10, 15],
              'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}

# Perform grid search with cross validation
grid_search = GridSearchCV(model, param_grid=param_grid, cv=3, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train_autoencoder, y_train_autoencoder)

# Print the best parameters and the corresponding ROC AUC score
print('Best Parameters:', grid_search.best_params_)
print('Best ROC AUC Score:', grid_search.best_score_)
"""

"model = LocalOutlierFactor(n_jobs=-1) # uses all processors\n\n# TODO; tune for contamination as well.\n# Define the parameter grid for grid search\nparam_grid = {'n_neighbors': [3, 5, 10, 15],\n              'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}\n\n# Perform grid search with cross validation\ngrid_search = GridSearchCV(model, param_grid=param_grid, cv=3, scoring='roc_auc', n_jobs=-1)\ngrid_search.fit(X_train_autoencoder, y_train_autoencoder)\n\n# Print the best parameters and the corresponding ROC AUC score\nprint('Best Parameters:', grid_search.best_params_)\nprint('Best ROC AUC Score:', grid_search.best_score_)\n"

Testing: converting outliers as fraud (-1->1) and inliers to not fraud (1->0)

In [34]:
# Fit the Local Outlier Factor algorithm on the features
X_train = X_train_autoencoder
y_train = y_train_autoencoder
clf = LocalOutlierFactor(n_neighbors=20, contamination=0.01)
y_pred = clf.fit_predict(X_train)
y_pred_binary = [1 if p == -1 else 0 for p in y_pred]
y_pred_binary = np.array(y_pred_binary)
auc = roc_auc_score(y_train, y_pred_binary)
print('AUC: %.3f' % auc)

AUC: 0.536


Per the paper's recommendation, we tune for both simultanously. The original paper by Breunig et al recommended at least min(k) = 10. 

In [28]:
# initialize hyperparameter grid and AUC
param_grid = {'n_neighbors': [],
              'contamination': [0.1, 0.2, 0.3, 0.4, 0.5]}
best_params = {}
best_auc = 0

# split X_train using cv
cv = ShuffleSplit(n_splits=3, test_size=0.33, random_state=42) # 3-fold cv with 3 repeats``
for train_index, test_index in cv.split(X_train):
    # using same split for each hyperparameter
    X_train_cv, X_test_cv = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_cv, y_test_cv = y_train.iloc[train_index], y_train.iloc[test_index]

    # grid search
    for n_neighbors in param_grid['n_neighbors']:
        for contamination in param_grid['contamination']:
            # fit model
            clf = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)
            y_pred = clf.fit_predict(X_train_cv)
            y_pred_binary = [1 if p == -1 else 0 for p in y_pred]
            y_pred_binary = np.array(y_pred_binary)
            auc = roc_auc_score(y_train_cv, y_pred_binary)
            print('AUC: %.3f' % auc)

            # update best params
            if auc > best_auc:
                best_params['n_neighbors'] = n_neighbors
                best_params['contamination'] = contamination
                best_auc = auc


TRAIN: 533920 TEST: 262976
TRAIN: 533920 TEST: 262976
TRAIN: 533920 TEST: 262976


In [None]:
# Retrain the model using the best parameters on the entire training set
best_model = LocalOutlierFactor(n_neighbors=grid_search.best_params_['n_neighbors'], 
                                algorithm=grid_search.best_params_['algorithm'], 
                                novelty=True, n_jobs=-1)
best_model.fit(X_train_autoencoder, y_train_autoencoder)

In [None]:

# Evaluate the performance on the test set
y_pred = best_model.predict(X_test_autoencoder)
roc_auc = roc_auc_score(y_test_autoencoder, y_pred)
print('ROC AUC Score on Test Set:', roc_auc)