In [1]:
# Importing core libraries
import numpy as np
import pandas as pd
import os
from sklearn.metrics import recall_score, precision_score, confusion_matrix
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest

# Importing custom estimators
from CustomEstimator import MultivariateGaussian, MultivariateTDistribution

In [14]:
# Importing the Credit Card Fraud dataset
data = pd.read_csv('creditcard.csv')
y_data = data.copy()['Class'].values
original_data = data.copy()

# Clean data set
normal_only_data = data[data['Class']==0]
print('Normal only data shape: ', normal_only_data.shape)
# Fraud data set
fraud_only_data = data[data['Class']==1]
print('Fraud only data shape: ', fraud_only_data.shape)

# Shuffling the data
normal_only_data = normal_only_data.sample(frac=1, random_state=42)
fraud_only_data = fraud_only_data.sample(frac=1, random_state=42)

# 80/10/10 data split for normal data
train_set, dev_set, test_set = np.split(normal_only_data, [int(0.8*len(normal_only_data)), int(0.9*len(normal_only_data))])
train_set = train_set.drop('Class', axis=1)

# 50/50 data split for fraud data
fraud_set_1, fraud_set_2 = np.split(fraud_only_data, [int(0.5*len(fraud_only_data))])

# Appending fraud data to dev and test set
dev_set = dev_set.append(fraud_set_1)
y_dev_set = dev_set['Class']
dev_set = dev_set.drop('Class', axis=1)
test_set = test_set.append(fraud_set_2)
y_test_set = test_set['Class']
test_set = test_set.drop('Class', axis=1)

# Showing shapes
for name, data in zip(['Train data shape: ', 'Dev data shape: ', 'Test data shape: '],[train_set, dev_set, test_set]):
    print(name, data.shape)

Normal only data shape:  (284315, 31)
Fraud only data shape:  (492, 31)
Train data shape:  (227452, 30)
Dev data shape:  (28677, 30)
Test data shape:  (28678, 30)


In [None]:
# Training Multivariate Gaussian Anomaly Detector
mvg = MultivariateGaussian(epsilon=0.05**30)
mvg.fit(train_set)

In [None]:
# Evaluating on the Dev set
mvg_y_dev_preds = mvg.predict(dev_set)
print('Recall score:\n', recall_score(y_dev_set, mvg_y_dev_preds))
print('Precision score:\n', precision_score(y_dev_set, mvg_y_dev_preds))
print('Confusion matrix:\n', confusion_matrix(y_dev_set, mvg_y_dev_preds))

In [None]:
# Training Multivariate T Anomaly Detector
mvt = MultivariateTDistribution(epsilon=0.05**30, df=3)
mvt.fit(train_set)

In [None]:
# Evaluating on the Dev Set
mvt_y_dev_preds = mvt.predict(dev_set)
print('Recall score:\n', recall_score(y_dev_set, mvt_y_dev_preds))
print('Precision score:\n', precision_score(y_dev_set, mvt_y_dev_preds))
print('Confusion matrix:\n', confusion_matrix(y_dev_set, mvt_y_dev_preds))

In [None]:
# Training Local Outlier Factor Anomaly Detector
lof = LocalOutlierFactor(novelty=True, metric='euclidean')
lof.fit(train_set)

In [None]:
# Evaluating on the Dev set
lof_y_dev_preds = lof.predict(dev_set)
lof_y_dev_preds[lof_y_dev_preds==1] = 0
lof_y_dev_preds[lof_y_dev_preds==-1] = 1
print('Recall score:\n', recall_score(y_dev_set, lof_y_dev_preds))
print('Precision score:\n', precision_score(y_dev_set, lof_y_dev_preds))
print('Confusion matrix:\n', confusion_matrix(y_dev_set, lof_y_dev_preds))

In [5]:
# Training Isolation Forest Anomaly Detector
ifr = IsolationForest(random_state=42)
ifr.fit(train_set)



IsolationForest(behaviour='old', bootstrap=False, contamination='legacy',
                max_features=1.0, max_samples='auto', n_estimators=100,
                n_jobs=None, random_state=42, verbose=0, warm_start=False)

In [6]:
# Evaluating on the Dev Set
ifr_y_dev_preds = ifr.predict(dev_set)
ifr_y_dev_preds[ifr_y_dev_preds==1] = 0
ifr_y_dev_preds[ifr_y_dev_preds==-1] = 1
print('Recall score:\n', recall_score(y_dev_set, ifr_y_dev_preds))
print('Precision score:\n', precision_score(y_dev_set, ifr_y_dev_preds))
print('Confusion matrix:\n', confusion_matrix(y_dev_set, ifr_y_dev_preds))



Recall score:
 0.8861788617886179
Precision score:
 0.06982703395259449
Confusion matrix:
 [[25527  2904]
 [   28   218]]


In [16]:
# Trying out Isolation Forest on contaminated dataset
ifr_2 = IsolationForest(random_state=42)
ifr_2_y_preds = ifr_2.fit_predict(original_data)
ifr_2_y_preds[ifr_2_y_preds==1] = 0
ifr_2_y_preds[ifr_2_y_preds==-1] = 1
print('Recall score:\n', recall_score(y_data, ifr_2_y_preds))
print('Precision score:\n', precision_score(y_data, ifr_2_y_preds))
print('Confusion matrix:\n', confusion_matrix(y_data, ifr_2_y_preds))



Recall score:
 0.9065040650406504
Precision score:
 0.015659562515361117
Confusion matrix:
 [[256280  28035]
 [    46    446]]
