<a href="https://colab.research.google.com/github/yashs79/financialaranomaly/blob/main/04_Modeling_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **4. Model Selection and Implementation - Anomaly Detection Methods**


#### Importing libraries

In [None]:
# data handling
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# apply seaborn's "whitegrid" theme to every figure drawn below
sns.set(style="whitegrid")

# pre-processing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# over-sampling
from imblearn.over_sampling import SMOTE

# anomaly-detection models (unsupervised estimators, not classifiers)
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.mixture import GaussianMixture

# performance metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, make_scorer
from sklearn.metrics import auc, roc_curve, average_precision_score, PrecisionRecallDisplay
from imblearn.metrics import classification_report_imbalanced
# seed NumPy's global random number generator so NumPy-driven randomness is repeatable
np.random.seed(23)

#### Loading data

In [None]:
# Directory that holds the raw dataset and the pre-split train/test CSVs.
# '/content' is the Colab working directory; change this single constant to
# run the notebook against data stored elsewhere.
# NOTE(review): the split files are presumably written by an earlier notebook
# in this series -- confirm they exist before running this cell.
DATA_DIR = '/content'

# Full, unsplit dataset
raw_data = pd.read_csv(f'{DATA_DIR}/creditcard.csv')

# Stratified Random Sampling
X_train_strat = pd.read_csv(f'{DATA_DIR}/X_train_strat.csv')
X_test_strat = pd.read_csv(f'{DATA_DIR}/X_test_strat.csv')
y_train_strat = pd.read_csv(f'{DATA_DIR}/y_train_strat.csv')
y_test_strat = pd.read_csv(f'{DATA_DIR}/y_test_strat.csv')

# Over Sampling
X_train_os = pd.read_csv(f'{DATA_DIR}/X_train_os.csv')
X_test_os = pd.read_csv(f'{DATA_DIR}/X_test_os.csv')
y_train_os = pd.read_csv(f'{DATA_DIR}/y_train_os.csv')
y_test_os = pd.read_csv(f'{DATA_DIR}/y_test_os.csv')

# Over Sampling with SMOTE
X_train_smote = pd.read_csv(f'{DATA_DIR}/X_train_smote.csv')
X_test_smote = pd.read_csv(f'{DATA_DIR}/X_test_smote.csv')
y_train_smote = pd.read_csv(f'{DATA_DIR}/y_train_smote.csv')
y_test_smote = pd.read_csv(f'{DATA_DIR}/y_test_smote.csv')

## 1. Outlier Detection (Unsupervised Anomaly Detection)

In [None]:
# Features are every column except the last one; the last column is the target.
X = raw_data.iloc[:, :-1]
y = raw_data.iloc[:, -1]

# Standardise only the first and the last feature column (positions 0 and -1)
# and pass every other column through unchanged. ColumnTransformer places the
# transformed columns first and appends the passthrough columns after them,
# so X_full's column order differs from X's.
scaled_positions = [0, -1]
ct = ColumnTransformer(
    transformers=[('somename', StandardScaler(), scaled_positions)],
    remainder='passthrough',
)

# Fit the scaler on the complete feature matrix and transform it in one step.
X_full = ct.fit_transform(X)

print(f'X:{X.shape}, y:{y.shape}')

X:(284807, 30), y:(284807,)


### 1.1 Isolation Forest

- The hyperparameters below were the best ones found by `GridSearchCV` with scoring = `Average Precision`.

In [None]:
%%time

# Train the isolation forest model
if_model = IsolationForest(n_estimators=120,
                           contamination='auto',
                           random_state=23,
                           bootstrap=True,
                           max_features=5,
                           max_samples=125,)

if_model.fit(X_full)

# Predict the anomalies
if_prediction = if_model.predict(X_full)

# Change the anomalies' values to make it consistent with the true values
if_prediction = [1 if i==-1 else 0 for i in if_prediction]

# Check the model performance
print(confusion_matrix(y, if_prediction))
print(classification_report(y, if_prediction))

[[266777  17538]
 [    65    427]]
              precision    recall  f1-score   support

           0       1.00      0.94      0.97    284315
           1       0.02      0.87      0.05       492

    accuracy                           0.94    284807
   macro avg       0.51      0.90      0.51    284807
weighted avg       1.00      0.94      0.97    284807

CPU times: user 2.84 s, sys: 129 ms, total: 2.97 s
Wall time: 2.96 s


### 1.2 Local Outlier Factor

In [None]:
# Boolean row masks built from the 'Class' column of the stratified training
# labels: 0 is treated as normal, 1 as anomaly.
normal_rows = y_train_strat['Class'] == 0
anomaly_rows = y_train_strat['Class'] == 1

# Normal subset: features and labels
X_train_normal = X_train_strat.loc[normal_rows]
y_train_normal = y_train_strat.loc[normal_rows]

# Anomalous subset: features and labels
X_train_anomaly = X_train_strat.loc[anomaly_rows]
y_train_anomaly = y_train_strat.loc[anomaly_rows]

In [None]:
# Local Outlier Factor with a 1000-neighbour neighbourhood, fitted and applied
# to the full scaled feature matrix in a single call.
clf = LocalOutlierFactor(n_neighbors=1000)
lof_raw_labels = clf.fit_predict(X_full)

# Outliers come back as -1; recode to 1 = anomaly, 0 = normal so the
# predictions use the same encoding as the true labels in y.
clf_prediction = np.where(lof_raw_labels == -1, 1, 0).tolist()

# Report performance against the true labels
lof_confusion = confusion_matrix(y, clf_prediction)
print(lof_confusion)
lof_report = classification_report(y, clf_prediction)
print(lof_report)

[[275351   8964]
 [    69    423]]
              precision    recall  f1-score   support

           0       1.00      0.97      0.98    284315
           1       0.05      0.86      0.09       492

    accuracy                           0.97    284807
   macro avg       0.52      0.91      0.53    284807
weighted avg       1.00      0.97      0.98    284807

