In [48]:
import os
import numpy as np
import pandas as pd
import csv

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV 

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

from sklearn.metrics import classification_report

In [49]:
raw_data =  pd.read_csv("creditcard.csv") 

In [50]:
raw_data.head(10)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0
5,2.0,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,3.67,0
6,4.0,1.229658,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.081213,0.46496,...,-0.167716,-0.27071,-0.154104,-0.780055,0.750137,-0.257237,0.034507,0.005168,4.99,0
7,7.0,-0.644269,1.417964,1.07438,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,...,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,40.8,0
8,7.0,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,0.851084,-0.392048,...,-0.073425,-0.268092,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,93.2,0
9,9.0,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,...,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,3.68,0


In [51]:
X = raw_data.loc[:,raw_data.columns != "Class"] #dataframe of feature variables
y = raw_data["Class"] #dataframe of the target variable 

In [52]:
print(np.unique( y,return_counts = True))

(array([0, 1], dtype=int64), array([284315,    492], dtype=int64))


## I want to then split these dataframes into the testing and training data
## 20 percent of data reserved for testing and the rest for training

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.20)

In [74]:
y_train.value_counts()

0    227447
1       398
Name: Class, dtype: int64

## As we can see, this is a highly imbalanced dataset as one class has many more values than the other.

## I will first undersample the majority class then oversample the minority class

In [75]:
rus = RandomUnderSampler(sampling_strategy = 1/300, random_state=42)
X_train2, y_train2 = rus.fit_resample(X_train, y_train)

In [76]:
y_train2.value_counts()

0    119399
1       398
Name: Class, dtype: int64

## An imblanced dataset will be highly biased towards the majority class. Therefore I will employ an oversampling technique where I oversample the minority class to reduce this bias further

In [78]:
sm = SMOTE(sampling_strategy = 1/200,random_state=42)
X_train3, y_train3 = sm.fit_resample(X_train2, y_train2)
y_train3.value_counts()

0    119399
1       596
Name: Class, dtype: int64

In [92]:
y_train3.count()

119995

In [94]:
type(y_train3.count())

numpy.int64

In [9]:
rfc = RandomForestClassifier()

## Now that we made our dataset less imbalanced, we have to find the hyperparameters that results in the greatest model performance

## For this I am using an hyperparameter tuning method known as Random Search

In [113]:
t1 = [float(x) for x in np.linspace(start = 0.02, stop = 0.10, num = 9)]

In [117]:
t2 = [round(num* y_train3.count()) for num in t1]

In [118]:
t2

[2400, 3600, 4800, 6000, 7200, 8400, 9600, 10800, 12000]

In [174]:
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 100, num = 6)]
max_depth =  [int(x) for x in np.linspace(start = 5, stop = 20, num = 15)]
max_samples = [float(x) for x in np.linspace(start = 0.1, stop = 1.0, num = 10)]
min_samples_split = t2 #following industry standard, I set min_samples_split as 2 - 10 percent of dataset
#min_samples_split = [2, 5, 7]
min_samples_leaf = [1,2,3,4,5]

In [175]:
param_grid = {'n_estimators': n_estimators,
              'min_samples_split': min_samples_split,
              'max_depth': max_depth,
              'max_samples': max_samples,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf}
print(param_grid)

{'n_estimators': [50, 60, 70, 80, 90, 100], 'min_samples_split': [2400, 3600, 4800, 6000, 7200, 8400, 9600, 10800, 12000], 'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20], 'max_samples': [0.1, 0.2, 0.30000000000000004, 0.4, 0.5, 0.6, 0.7000000000000001, 0.8, 0.9, 1.0], 'min_samples_leaf': [1, 2, 3, 4, 5]}


In [176]:
rf_random_grid = RandomizedSearchCV(estimator = rfc , 
                                    param_distributions = param_grid, 
                                    cv = 3, 
                                    verbose = 2,
                                    n_iter = 15,
                                    scoring = 'f1')

In [177]:
rf_random_grid.fit(X_train, y_train)

Fitting 3 folds for each of 15 candidates, totalling 45 fits
[CV] END max_depth=12, max_samples=0.6, min_samples_leaf=1, min_samples_split=10800, n_estimators=90; total time= 1.4min
[CV] END max_depth=12, max_samples=0.6, min_samples_leaf=1, min_samples_split=10800, n_estimators=90; total time= 1.7min
[CV] END max_depth=12, max_samples=0.6, min_samples_leaf=1, min_samples_split=10800, n_estimators=90; total time= 2.3min
[CV] END max_depth=17, max_samples=0.1, min_samples_leaf=5, min_samples_split=6000, n_estimators=90; total time=  12.7s
[CV] END max_depth=17, max_samples=0.1, min_samples_leaf=5, min_samples_split=6000, n_estimators=90; total time=  12.5s
[CV] END max_depth=17, max_samples=0.1, min_samples_leaf=5, min_samples_split=6000, n_estimators=90; total time=  12.7s
[CV] END max_depth=13, max_samples=0.8, min_samples_leaf=3, min_samples_split=8400, n_estimators=90; total time= 3.0min
[CV] END max_depth=13, max_samples=0.8, min_samples_leaf=3, min_samples_split=8400, n_estimators

In [178]:
rf_random_grid.best_params_

{'n_estimators': 70,
 'min_samples_split': 4800,
 'min_samples_leaf': 3,
 'max_samples': 0.4,
 'max_depth': 5}

## Now that we know the best number of trees and the best depth to choose we can now train the Random Forest classifier

In [203]:
rfc2 = RandomForestClassifier(n_estimators = 70, criterion = 'entropy', min_samples_split = 4800, min_samples_leaf = 3,
                              max_depth = 5, max_samples = 0.4) #, class_weight = 'balanced' )
#rfc2.fit(X_train, y_train)

In [204]:
rfc2.fit(X_train, y_train)

In [210]:
rfc3 = RandomForestClassifier(criterion = 'entropy')#, class_weight = 'balanced')

In [211]:
rfc3.fit(X_train, y_train)

In [206]:
rf_predict = rfc2.predict(X_test)

In [212]:
rf_predict3 = rfc3.predict(X_test)

In [213]:
print(classification_report(y_test, rf_predict3)) #nothing just rfc with entropy

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56868
           1       0.99      0.80      0.88        94

    accuracy                           1.00     56962
   macro avg       0.99      0.90      0.94     56962
weighted avg       1.00      1.00      1.00     56962



In [209]:
print(classification_report(y_test, rf_predict3)) #nothing just rfc with entropy and balanced weights

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56868
           1       0.97      0.79      0.87        94

    accuracy                           1.00     56962
   macro avg       0.99      0.89      0.94     56962
weighted avg       1.00      1.00      1.00     56962



In [207]:
print(classification_report(y_test, rf_predict)) #just hyperparameter tuning

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56868
           1       1.00      0.27      0.42        94

    accuracy                           1.00     56962
   macro avg       1.00      0.63      0.71     56962
weighted avg       1.00      1.00      1.00     56962



In [173]:
print(classification_report(y_test, rf_predict)) 
#with class_weight = 'balanced' & with no with no undersampling and oversampling & no hyperparameter tuning(on oversampled data)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56868
           1       0.97      0.80      0.88        94

    accuracy                           1.00     56962
   macro avg       0.99      0.90      0.94     56962
weighted avg       1.00      1.00      1.00     56962



In [167]:
print(classification_report(y_test, rf_predict)) #with class_weight = 'balanced' & with no with no undersampling and oversampling

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56868
           1       0.90      0.60      0.72        94

    accuracy                           1.00     56962
   macro avg       0.95      0.80      0.86     56962
weighted avg       1.00      1.00      1.00     56962



In [150]:
print(classification_report(y_test, rf_predict)) #with class_weight = None & with no undersampling and oversampling

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56868
           1       0.89      0.62      0.73        94

    accuracy                           1.00     56962
   macro avg       0.95      0.81      0.86     56962
weighted avg       1.00      1.00      1.00     56962



In [143]:
print(classification_report(y_test, rf_predict)) #with class_weight = 'balanced' & with undersampling and oversampling

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     56868
           1       0.06      0.89      0.11        94

    accuracy                           0.98     56962
   macro avg       0.53      0.93      0.55     56962
weighted avg       1.00      0.98      0.99     56962

