In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, plot_confusion_matrix
from sklearn.metrics import accuracy_score,precision_score,recall_score,roc_auc_score
from sklearn.model_selection import GridSearchCV , RandomizedSearchCV
from sklearn.metrics import *

from google.colab import drive
from pprint import pprint

# **Set-up**
- train and test data

In [2]:
drive.mount('/content/drive', force_remount=True)
X_train = pd.read_csv('/content/drive/My Drive/BT4012/processed dataset/v4/X_train.csv')
X_test = pd.read_csv('/content/drive/My Drive/BT4012/processed dataset/v4/X_test.csv')
y_train = pd.read_csv('/content/drive/My Drive/BT4012/processed dataset/v4/y_train.csv')
y_test = pd.read_csv('/content/drive/My Drive/BT4012/processed dataset/v4/y_test.csv')

# Drop unnamed columns
X_train = X_train.drop(columns=['Unnamed: 0'])
X_test = X_test.drop(columns=['Unnamed: 0'])
y_train = y_train.drop(columns=['Unnamed: 0'])
y_test = y_test.drop(columns=['Unnamed: 0'])


Mounted at /content/drive


In [3]:
X_train.shape # 469 rows, 12 columns
y_train

Unnamed: 0,Fraud
0,1
1,0
2,0
3,0
4,1
...,...
464,0
465,0
466,0
467,1


# **Selected Features**
- XGBoost selected features: 21 features
- RF selected features: 14
- Chi2 + ANOVA selected features: 30 features

In [4]:
xgb_features = ['transactionAmount', 'No_Transactions', 'No_Orders', 'No_Payments',
       'paymentMethodType_card', 'paymentMethodProvider_Discover',
       'paymentMethodProvider_Maestro', 'paymentMethodProvider_Voyager',
       'customerBillingAddressRegion_Midwest',
       'customerBillingAddressRegion_Northeast',
       'customerBillingAddressRegion_South',
       'customerBillingAddressRegion_West', 'ipCountry_Hong Kong',
       'ipCountry_Switzerland', 'ipCountry_United States']

rf_features = ['transactionAmount', 'No_Transactions', 'No_Orders', 'No_Payments',
       'No_Accounts', 'paymentMethodProvider_Maestro',
       'paymentMethodProvider_Voyager', 'customerBillingAddressRegion_Midwest',
       'customerBillingAddressRegion_South',
       'customerBillingAddressRegion_West', 'ipCountry_Hong Kong',
       'ipCountry_Switzerland', 'ipCountry_United States']

chi_anova_features = ['customerBillingAddressRegion_Multiple',
 'ipCountry_Multiple',
 'customerBillingAddressRegion_South',
 'customerBillingAddressRegion_Midwest',
 'ipCountry_Hong Kong',
 'paymentMethodType_paypal',
 'paymentMethodProvider_Maestro',
 'customerBillingAddressRegion_Others',
 'ipCountry_Romania',
 'ipCountry_Morocco',
 'ipCountry_Italy',
 'ipCountry_United Kingdom',
 'ipCountry_China',
 'ipCountry_Taiwan',
 'paymentMethodProvider_Voyager',
 'paymentMethodProvider_VISA 13 digit',
 'ipCountry_United States',
 'customerBillingAddressRegion_Northeast',
 'ipCountry_Switzerland',
 'ipCountry_France',
 'ipCountry_Iran',
 'ipCountry_South Korea',
 'ipCountry_Armenia',
 'paymentMethodProvider_Diners Club / Carte Blanche',
 'paymentMethodProvider_JCB 15 digit',
 'No_Orders',
 'transactionAmount',
 'No_Payments',
 'No_Transactions',
 'No_Accounts']


# **Helper Functions**
- Evaluation function
- Plotting of feature importance

In [5]:
agg_accuracy_score_dict = {}
agg_f1_score_dict={}
agg_roc_auc_dict={}

train_accuracy_score_dict = {}
train_f1_score_dict={}
train_roc_auc_dict={}
train_recall_dict = {}

val_accuracy_score_dict = {}
val_f1_score_dict={}
val_roc_auc_dict={}

test_accuracy_score_dict = {}
test_f1_score_dict={}
test_roc_auc_dict={}
test_recall_dict = {}

def evaluation(true, pred, title):
    print("================== Evaluation on {} ==================".format(title))
    # accuracy 
    acc = accuracy_score(true, pred)
    print("accuracy: {}\n".format(acc))
    agg_accuracy_score_dict[title] = acc
    # f1 score
    print("classification report: \n{}\n".format(classification_report(true, pred)))
    print("F1 score: \n{}\n".format(f1_score(true, pred)))
    f1 = f1_score(true, pred)
    agg_f1_score_dict[title] = f1
    # confusion matrix
    print("Confusion matrix: \n{}\n".format(confusion_matrix(true, pred)))
    # roc auc result
    fpr, tpr, thresholds = roc_curve(true, pred)
    roc_auc = auc(fpr, tpr)
    print("ROC AUC: {}\n".format(roc_auc))
    agg_roc_auc_dict[title] = roc_auc

    recall = recall_score(true,pred)
    print("Recall: {}\n".format(recall))
    print("================== End of Evaluation on {} ==================".format(title))
    
    if "train dataset" in title:
        train_accuracy_score_dict[title] = acc
        train_f1_score_dict[title] = f1
        train_roc_auc_dict[title] = roc_auc
        train_recall_dict[title] = recall
    elif "test dataset" in title:
        test_accuracy_score_dict[title] = acc
        test_f1_score_dict[title] = f1
        test_roc_auc_dict[title] = roc_auc
        test_recall_dict[title] = recall
    else:
        val_accuracy_score_dict[title] = acc
        val_f1_score_dict[title] = f1
        val_roc_auc_dict[title] = roc_auc

# **RandomForest Model**
- RandomSearch params
- GridSearch params
- Best results

1. Using XGBoost selected features
2. Using RF selected features
3. Using Chi2 + ANOVA selected features

In [6]:
# XGB Features
xgb_train = X_train[xgb_features]
xgb_test = X_test[xgb_features]

# RF Features
rf_train = X_train[rf_features]
rf_test = X_test[rf_features]

# Chi2 + ANOVA features
chi_anova_train = X_train[chi_anova_features]
chi_anova_test = X_test[chi_anova_features]


## **Base Model**

In [7]:
rf_base = RandomForestClassifier(n_estimators = 100, random_state = 2022, min_samples_leaf=1, min_samples_split=5, max_depth=10)
rf_base.fit(X_train, y_train)

# Train Evaluation
evaluation(y_train, rf_base.predict(X_train), "Default RF on train dataset")
print('\n')
y_pred_class = rf_base.predict(X_test)
evaluation(y_test, y_pred_class, "Default RF on test dataset")

  


accuracy: 0.9317697228144989

classification report: 
              precision    recall  f1-score   support

           0       0.90      1.00      0.95       278
           1       1.00      0.83      0.91       191

    accuracy                           0.93       469
   macro avg       0.95      0.92      0.93       469
weighted avg       0.94      0.93      0.93       469


F1 score: 
0.9085714285714286

Confusion matrix: 
[[278   0]
 [ 32 159]]

ROC AUC: 0.9162303664921466

Recall: 0.8324607329842932



accuracy: 0.8898305084745762

classification report: 
              precision    recall  f1-score   support

           0       0.82      1.00      0.90        61
           1       1.00      0.77      0.87        57

    accuracy                           0.89       118
   macro avg       0.91      0.89      0.89       118
weighted avg       0.91      0.89      0.89       118


F1 score: 
0.8712871287128713

Confusion matrix: 
[[61  0]
 [13 44]]

ROC AUC: 0.8859649122807017

Reca

___
## **1. Using XGBoost selected features**


### **RandomSearch params**
- RandomSearch for best params
- Evaluation of results

In [8]:
###########################################################################
# Initializing RandomSearch for Best Params
###########################################################################

max_depth = [int(x) for x in np.linspace(1,12, num=12)]
max_depth.append(None)
random_grid = {'n_estimators': [int(x) for x in np.linspace(start=50, stop=150, num=10)],
               'max_features': ['auto', 'sqrt', 'log2'],
               'criterion': ['gini', 'entropy'],
               'max_depth': max_depth ,
               'min_samples_split': [2,5,10],
               'min_samples_leaf': [1,2,4],
               'bootstrap': [True, False]}
pprint(random_grid)


{'bootstrap': [True, False],
 'criterion': ['gini', 'entropy'],
 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, None],
 'max_features': ['auto', 'sqrt', 'log2'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [50, 61, 72, 83, 94, 105, 116, 127, 138, 150]}


In [9]:
###########################################################################
# Getting best params from RandomSearch
###########################################################################
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=2022, n_jobs = -1)
rf_random.fit(xgb_train, y_train)

# best params
rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


  self.best_estimator_.fit(X, y, **fit_params)


{'n_estimators': 127,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': None,
 'criterion': 'gini',
 'bootstrap': False}

In [10]:
###########################################################################
# Evaluation on train and test dataset using best params
###########################################################################

# RF Modelling
rf_rs = RandomForestClassifier(n_estimators = 127, random_state = 2022, min_samples_leaf=1, min_samples_split=5, max_depth=None, criterion='gini',
                                 bootstrap=False)
rf_rs.fit(xgb_train, y_train)

# Train Evaluation
evaluation(y_train, rf_rs.predict(xgb_train), "RF on xgb_train dataset")
print('\n')
# Test evaluation
y_pred_class = rf_rs.predict(xgb_test)
evaluation(y_test, y_pred_class, "RF on xgb_test dataset")

  


accuracy: 1.0

classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       278
           1       1.00      1.00      1.00       191

    accuracy                           1.00       469
   macro avg       1.00      1.00      1.00       469
weighted avg       1.00      1.00      1.00       469


F1 score: 
1.0

Confusion matrix: 
[[278   0]
 [  0 191]]

ROC AUC: 1.0

Recall: 1.0



accuracy: 0.9745762711864406

classification report: 
              precision    recall  f1-score   support

           0       0.95      1.00      0.98        61
           1       1.00      0.95      0.97        57

    accuracy                           0.97       118
   macro avg       0.98      0.97      0.97       118
weighted avg       0.98      0.97      0.97       118


F1 score: 
0.972972972972973

Confusion matrix: 
[[61  0]
 [ 3 54]]

ROC AUC: 0.9736842105263157

Recall: 0.9473684210526315



### **GridSearch params**
- GridSearch for best params
- Evaluation of results

Random search allowed us to narrow down the range for each hyperparameter. Now that we know where to concentrate our search, we can explicitly specify every combination of settings to try. We do this with GridSearchCV, a method that, instead of sampling randomly from a distribution, evaluates all combinations we define. To use Grid Search, we make another grid based on the best values provided by random search:

In [11]:
###########################################################################
# Get best params for GridSearch
###########################################################################
gs_grid = {'n_estimators': [120, 125, 130, 135, 140],
            'min_samples_split': [2,3,4,5],
            'min_samples_leaf': [1,2],
            'max_features': ['log2'],
            'max_depth': [10,11,12,13,14],
            'criterion': ['gini'],
            'bootstrap': [False]}
rf = RandomForestClassifier(random_state=2022)

# Instantiate Grid Search Model
rf_grid = GridSearchCV(estimator = rf, param_grid = gs_grid,
                          cv = 3, n_jobs = -1, verbose = 2)
rf_grid.fit(xgb_train, y_train)

# best params
rf_grid.best_params_

Fitting 3 folds for each of 200 candidates, totalling 600 fits


  self.best_estimator_.fit(X, y, **fit_params)


{'bootstrap': False,
 'criterion': 'gini',
 'max_depth': 14,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'n_estimators': 135}

In [12]:
###########################################################################
# Evaluation of GridSearch Best params
###########################################################################

# Evaluation for GS best params

# Train Evaluation
evaluation(y_train, rf_grid.best_estimator_.predict(xgb_train), "Best params (GS) RF on xgb_train dataset")
print('\n')
# Test evaluation
y_pred_class = rf_grid.best_estimator_.predict(xgb_test)
evaluation(y_test, y_pred_class, "Best params (GS) on xgb_test dataset")

accuracy: 0.9914712153518124

classification report: 
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       278
           1       1.00      0.98      0.99       191

    accuracy                           0.99       469
   macro avg       0.99      0.99      0.99       469
weighted avg       0.99      0.99      0.99       469


F1 score: 
0.9894179894179894

Confusion matrix: 
[[278   0]
 [  4 187]]

ROC AUC: 0.9895287958115183

Recall: 0.9790575916230366



accuracy: 0.9661016949152542

classification report: 
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        61
           1       1.00      0.93      0.96        57

    accuracy                           0.97       118
   macro avg       0.97      0.96      0.97       118
weighted avg       0.97      0.97      0.97       118


F1 score: 
0.9636363636363636

Confusion matrix: 
[[61  0]
 [ 4 53]]

ROC AUC: 0.9649122807017544

Reca

### **Best params**

- Using the randomsearch params, we got the best results across all evaluation metrics

In [13]:
train_accuracy_score = list(train_accuracy_score_dict.values())
train_f1_score = list(train_f1_score_dict.values())
train_roc_auc = list(train_roc_auc_dict.values())
train_recall = list(train_recall_dict.values()) 

test_accuracy_score = list(test_accuracy_score_dict.values())
test_f1_score = list(test_f1_score_dict.values())
test_roc_auc = list(test_roc_auc_dict.values())
test_recall = list(test_recall_dict.values()) 


xgb_table = pd.DataFrame({'eval':['train_acc', 'train_f1','train_roc_auc','train_recall',
                                  'test_acc', 'test_f1','test_roc_auc','test_recall'],
                          'base': [train_accuracy_score[0]*100,
                                   train_f1_score[0]*100,
                                   train_roc_auc[0]*100,
                                   train_recall[0]*100,
                                   test_accuracy_score[0]*100,
                                   test_f1_score[0]*100,
                                   test_roc_auc[0]*100,
                                   test_recall[0]*100],
                          'xgb_rf_rs': [train_accuracy_score[-2]*100,
                                   train_f1_score[-2]*100,
                                   train_roc_auc[-2]*100,
                                   train_recall[-2]*100,
                                   test_accuracy_score[-2]*100,
                                   test_f1_score[-2]*100,
                                   test_roc_auc[-2]*100,
                                   test_recall[-2]*100],
                          'xgb_rf_gs': [train_accuracy_score[-1]*100,
                                   train_f1_score[-1]*100,
                                   train_roc_auc[-1]*100,
                                   train_recall[-1]*100,
                                   test_accuracy_score[-1]*100,
                                   test_f1_score[-1]*100,
                                   test_roc_auc[-1]*100,
                                   test_recall[-1]*100]})
xgb_table

Unnamed: 0,eval,base,xgb_rf_rs,xgb_rf_gs
0,train_acc,93.176972,100.0,99.147122
1,train_f1,90.857143,100.0,98.941799
2,train_roc_auc,91.623037,100.0,98.95288
3,train_recall,83.246073,100.0,97.905759
4,test_acc,88.983051,97.457627,96.610169
5,test_f1,87.128713,97.297297,96.363636
6,test_roc_auc,88.596491,97.368421,96.491228
7,test_recall,77.192982,94.736842,92.982456


___
## **2. Using RF selected features**

### **RandomSearch params**
- RandomSearch for best params
- Evaluation of results

In [14]:
###########################################################################
# Getting best params from RandomSearch
###########################################################################
rf_rf = RandomForestClassifier()
rf_rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=2022, n_jobs = -1)
# Fit using RF selected features
rf_rf_random.fit(rf_train, y_train)

# best params
rf_rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


  self.best_estimator_.fit(X, y, **fit_params)


{'n_estimators': 127,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': 12,
 'criterion': 'gini',
 'bootstrap': False}

In [15]:
###########################################################################
# Evaluation on train and test dataset using best params
###########################################################################

# RF Modelling
rf_rf_rs = RandomForestClassifier(n_estimators = 127, random_state = 2022, min_samples_leaf=1, min_samples_split=5, max_depth=None, criterion='gini',
                                 bootstrap=False)
rf_rf_rs.fit(rf_train, y_train)

# Train Evaluation
evaluation(y_train, rf_rf_rs.predict(rf_train), "RF (RS) on rf_train dataset")
print('\n')
# Test evaluation
y_pred_class = rf_rf_rs.predict(rf_test)
evaluation(y_test, y_pred_class, "RF(RS) on rf_test dataset")

  


accuracy: 0.997867803837953

classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       278
           1       1.00      0.99      1.00       191

    accuracy                           1.00       469
   macro avg       1.00      1.00      1.00       469
weighted avg       1.00      1.00      1.00       469


F1 score: 
0.9973753280839895

Confusion matrix: 
[[278   0]
 [  1 190]]

ROC AUC: 0.9973821989528796

Recall: 0.9947643979057592



accuracy: 0.9745762711864406

classification report: 
              precision    recall  f1-score   support

           0       0.95      1.00      0.98        61
           1       1.00      0.95      0.97        57

    accuracy                           0.97       118
   macro avg       0.98      0.97      0.97       118
weighted avg       0.98      0.97      0.97       118


F1 score: 
0.972972972972973

Confusion matrix: 
[[61  0]
 [ 3 54]]

ROC AUC: 0.9736842105263157

Recall

### **GridSearch params**
- GridSearch for best params
- Evaluation of results

In [16]:
###########################################################################
# Get best params for GridSearch
# Since parameters from RF same as the one for the XGB_data, we use the same gs grid
###########################################################################
gs_grid = {'n_estimators': [120, 125, 130, 135, 140],
            'min_samples_split': [2,3,4,5],
            'min_samples_leaf': [1,2],
            'max_features': ['log2'],
            'max_depth': [10,11,12,13,14],
            'criterion': ['gini'],
            'bootstrap': [False]}
rf = RandomForestClassifier(random_state=2022)

# Instantiate Grid Search Model
rf_rf_grid = GridSearchCV(estimator = rf, param_grid = gs_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)
rf_rf_grid.fit(rf_train, y_train)

# best params
rf_rf_grid.best_params_

Fitting 3 folds for each of 200 candidates, totalling 600 fits


  self.best_estimator_.fit(X, y, **fit_params)


{'bootstrap': False,
 'criterion': 'gini',
 'max_depth': 13,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'n_estimators': 120}

In [17]:
###########################################################################
# Evaluation of GridSearch Best params
###########################################################################

# Evaluation for GS best params

# Train Evaluation
evaluation(y_train, rf_rf_grid.best_estimator_.predict(rf_train), "Best params (GS) RF on rf_train dataset")
print('\n')
# Test evaluation
y_pred_class = rf_rf_grid.best_estimator_.predict(rf_test)
evaluation(y_test, y_pred_class, "Best params (GS) on rf_test dataset")


accuracy: 0.9914712153518124

classification report: 
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       278
           1       1.00      0.98      0.99       191

    accuracy                           0.99       469
   macro avg       0.99      0.99      0.99       469
weighted avg       0.99      0.99      0.99       469


F1 score: 
0.9894179894179894

Confusion matrix: 
[[278   0]
 [  4 187]]

ROC AUC: 0.9895287958115183

Recall: 0.9790575916230366



accuracy: 0.9661016949152542

classification report: 
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        61
           1       1.00      0.93      0.96        57

    accuracy                           0.97       118
   macro avg       0.97      0.96      0.97       118
weighted avg       0.97      0.97      0.97       118


F1 score: 
0.9636363636363636

Confusion matrix: 
[[61  0]
 [ 4 53]]

ROC AUC: 0.9649122807017544

Reca

### **Best params**
- Again the RandomSearch params performed best for the RF slected features dataset

In [18]:
train_accuracy_score = list(train_accuracy_score_dict.values())
train_f1_score = list(train_f1_score_dict.values())
train_roc_auc = list(train_roc_auc_dict.values())
train_recall = list(train_recall_dict.values()) 

test_accuracy_score = list(test_accuracy_score_dict.values())
test_f1_score = list(test_f1_score_dict.values())
test_roc_auc = list(test_roc_auc_dict.values())
test_recall = list(test_recall_dict.values()) 


rf_table = pd.DataFrame({'eval':['train_acc', 'train_f1','train_roc_auc','train_recall',
                                  'test_acc', 'test_f1','test_roc_auc','test_recall'],
                          'base': [train_accuracy_score[0]*100,
                                   train_f1_score[0]*100,
                                   train_roc_auc[0]*100,
                                   train_recall[0]*100,
                                   test_accuracy_score[0]*100,
                                   test_f1_score[0]*100,
                                   test_roc_auc[0]*100,
                                   test_recall[0]*100],
                          'rf_rf_rs': [train_accuracy_score[-2]*100,
                                   train_f1_score[-2]*100,
                                   train_roc_auc[-2]*100,
                                   train_recall[-2]*100,
                                   test_accuracy_score[-2]*100,
                                   test_f1_score[-2]*100,
                                   test_roc_auc[-2]*100,
                                   test_recall[-2]*100],
                          'rf_rf_gs': [train_accuracy_score[-1]*100,
                                   train_f1_score[-1]*100,
                                   train_roc_auc[-1]*100,
                                   train_recall[-1]*100,
                                   test_accuracy_score[-1]*100,
                                   test_f1_score[-1]*100,
                                   test_roc_auc[-1]*100,
                                   test_recall[-1]*100]})
rf_table

Unnamed: 0,eval,base,rf_rf_rs,rf_rf_gs
0,train_acc,93.176972,99.78678,99.147122
1,train_f1,90.857143,99.737533,98.941799
2,train_roc_auc,91.623037,99.73822,98.95288
3,train_recall,83.246073,99.47644,97.905759
4,test_acc,88.983051,97.457627,96.610169
5,test_f1,87.128713,97.297297,96.363636
6,test_roc_auc,88.596491,97.368421,96.491228
7,test_recall,77.192982,94.736842,92.982456


___
## **3. Using Chi2 + ANOVA selected features**

### **RandomSearch params**
- RandomSearch for best params
- Evaluation of results

In [19]:
###########################################################################
# Getting best params from RandomSearch
###########################################################################
rf_ca = RandomForestClassifier()
rf_ca_random = RandomizedSearchCV(estimator = rf_ca, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=2022, n_jobs = -1)
# Fit using RF selected features
rf_ca_random.fit(chi_anova_train, y_train)

# best params
rf_ca_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


  self.best_estimator_.fit(X, y, **fit_params)


{'n_estimators': 127,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': None,
 'criterion': 'gini',
 'bootstrap': False}

In [20]:
###########################################################################
# Evaluation on train and test dataset using best params
###########################################################################

# RF Modelling
rf_ca_rs = RandomForestClassifier(n_estimators = 127, random_state = 2022, min_samples_leaf=1, min_samples_split=5, max_depth=None, criterion='gini',
                                 bootstrap=False)
rf_ca_rs.fit(chi_anova_train, y_train)

# Train Evaluation
evaluation(y_train, rf_ca_rs.predict(chi_anova_train), "RF (RS) on chi_anova_train dataset")
print('\n')
# Test evaluation
y_pred_class = rf_ca_rs.predict(chi_anova_test)
evaluation(y_test, y_pred_class, "RF(RS) on chi_anova_test dataset")

  


accuracy: 0.997867803837953

classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       278
           1       1.00      0.99      1.00       191

    accuracy                           1.00       469
   macro avg       1.00      1.00      1.00       469
weighted avg       1.00      1.00      1.00       469


F1 score: 
0.9973753280839895

Confusion matrix: 
[[278   0]
 [  1 190]]

ROC AUC: 0.9973821989528796

Recall: 0.9947643979057592



accuracy: 0.9745762711864406

classification report: 
              precision    recall  f1-score   support

           0       0.95      1.00      0.98        61
           1       1.00      0.95      0.97        57

    accuracy                           0.97       118
   macro avg       0.98      0.97      0.97       118
weighted avg       0.98      0.97      0.97       118


F1 score: 
0.972972972972973

Confusion matrix: 
[[61  0]
 [ 3 54]]

ROC AUC: 0.9736842105263157

Recall

### **GridSearch params**
- GridSearch for best params
- Evaluation of results

In [21]:
###########################################################################
# Get best params for GridSearch
# Since parameters from RF same as the one for the XGB_data, we use the same gs grid
###########################################################################
gs_grid = {'n_estimators': [120, 125, 130, 135, 140],
            'min_samples_split': [2,3,4,5],
            'min_samples_leaf': [1,2],
            'max_features': ['log2'],
            'max_depth': [10,11,12,13,14],
            'criterion': ['gini'],
            'bootstrap': [False]}
rf = RandomForestClassifier(random_state=2022)

# Instantiate Grid Search Model
rf_ca_grid = GridSearchCV(estimator = rf, param_grid = gs_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)
rf_ca_grid.fit(chi_anova_train, y_train)

# best params
rf_ca_grid.best_params_

Fitting 3 folds for each of 200 candidates, totalling 600 fits


  self.best_estimator_.fit(X, y, **fit_params)


{'bootstrap': False,
 'criterion': 'gini',
 'max_depth': 14,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'n_estimators': 125}

In [22]:
###########################################################################
# Evaluation of GridSearch Best params
###########################################################################

# Evaluation for GS best params

# Train Evaluation
evaluation(y_train, rf_ca_grid.best_estimator_.predict(chi_anova_train), "Best params (GS) RF on chi_anova_train dataset")
print('\n')
# Test evaluation
y_pred_class = rf_ca_grid.best_estimator_.predict(chi_anova_test)
evaluation(y_test, y_pred_class, "Best params (GS) on chi_anova dataset")

accuracy: 0.9722814498933902

classification report: 
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       278
           1       1.00      0.93      0.96       191

    accuracy                           0.97       469
   macro avg       0.98      0.97      0.97       469
weighted avg       0.97      0.97      0.97       469


F1 score: 
0.964769647696477

Confusion matrix: 
[[278   0]
 [ 13 178]]

ROC AUC: 0.9659685863874345

Recall: 0.9319371727748691



accuracy: 0.940677966101695

classification report: 
              precision    recall  f1-score   support

           0       0.90      1.00      0.95        61
           1       1.00      0.88      0.93        57

    accuracy                           0.94       118
   macro avg       0.95      0.94      0.94       118
weighted avg       0.95      0.94      0.94       118


F1 score: 
0.9345794392523363

Confusion matrix: 
[[61  0]
 [ 7 50]]

ROC AUC: 0.9385964912280702

Recall

### **Best params**
- Again the RandomSearch params performed best for the Chi_Anova features dataset

In [23]:
train_accuracy_score = list(train_accuracy_score_dict.values())
train_f1_score = list(train_f1_score_dict.values())
train_roc_auc = list(train_roc_auc_dict.values())
train_recall = list(train_recall_dict.values()) 

test_accuracy_score = list(test_accuracy_score_dict.values())
test_f1_score = list(test_f1_score_dict.values())
test_roc_auc = list(test_roc_auc_dict.values())
test_recall = list(test_recall_dict.values()) 


chi_anova_table = pd.DataFrame({'eval':['train_acc', 'train_f1','train_roc_auc','train_recall',
                                  'test_acc', 'test_f1','test_roc_auc','test_recall'],
                          'base': [train_accuracy_score[0]*100,
                                   train_f1_score[0]*100,
                                   train_roc_auc[0]*100,
                                   train_recall[0]*100,
                                   test_accuracy_score[0]*100,
                                   test_f1_score[0]*100,
                                   test_roc_auc[0]*100,
                                   test_recall[0]*100],
                          'chi_anova_rf_rs': [train_accuracy_score[-2]*100,
                                   train_f1_score[-2]*100,
                                   train_roc_auc[-2]*100,
                                   train_recall[-2]*100,
                                   test_accuracy_score[-2]*100,
                                   test_f1_score[-2]*100,
                                   test_roc_auc[-2]*100,
                                   test_recall[-2]*100],
                          'chi_anova_rf_gs': [train_accuracy_score[-1]*100,
                                   train_f1_score[-1]*100,
                                   train_roc_auc[-1]*100,
                                   train_recall[-1]*100,
                                   test_accuracy_score[-1]*100,
                                   test_f1_score[-1]*100,
                                   test_roc_auc[-1]*100,
                                   test_recall[-1]*100]})
chi_anova_table

Unnamed: 0,eval,base,chi_anova_rf_rs,chi_anova_rf_gs
0,train_acc,93.176972,99.78678,97.228145
1,train_f1,90.857143,99.737533,96.476965
2,train_roc_auc,91.623037,99.73822,96.596859
3,train_recall,83.246073,99.47644,93.193717
4,test_acc,88.983051,96.610169,97.457627
5,test_f1,87.128713,96.363636,97.297297
6,test_roc_auc,88.596491,96.491228,97.368421
7,test_recall,77.192982,92.982456,94.736842


# **Compiled Results**

In [24]:
compiled_table = pd.concat([xgb_table, rf_table[rf_table.columns[-2:]], chi_anova_table[chi_anova_table.columns[-2:]]], axis=1)
compiled_table

Unnamed: 0,eval,base,xgb_rf_rs,xgb_rf_gs,rf_rf_rs,rf_rf_gs,chi_anova_rf_rs,chi_anova_rf_gs
0,train_acc,93.176972,100.0,99.147122,99.78678,99.147122,99.78678,97.228145
1,train_f1,90.857143,100.0,98.941799,99.737533,98.941799,99.737533,96.476965
2,train_roc_auc,91.623037,100.0,98.95288,99.73822,98.95288,99.73822,96.596859
3,train_recall,83.246073,100.0,97.905759,99.47644,97.905759,99.47644,93.193717
4,test_acc,88.983051,97.457627,96.610169,97.457627,96.610169,96.610169,97.457627
5,test_f1,87.128713,97.297297,96.363636,97.297297,96.363636,96.363636,97.297297
6,test_roc_auc,88.596491,97.368421,96.491228,97.368421,96.491228,96.491228,97.368421
7,test_recall,77.192982,94.736842,92.982456,94.736842,92.982456,92.982456,94.736842
