# Import functions & load data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import seaborn as sns
import matplotlib.pyplot as plt
from keras.preprocessing import sequence
from keras import metrics
from keras.models import Sequential
from tensorflow.keras.models import Model,load_model
from keras.layers import Input, Dense, Embedding, SimpleRNN, Dropout, concatenate, Lambda, BatchNormalization
from keras.optimizers import Adam, SGD, RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.losses import BinaryCrossentropy
import tensorflow as tf 
from pprint import pprint

from math import floor
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, plot_confusion_matrix, precision_score,recall_score,roc_auc_score, make_scorer, f1_score, roc_curve, auc
#from bayes_opt import BayesianOptimization
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
import pickle
from keras.layers import LeakyReLU
LeakyReLU = LeakyReLU(alpha=0.1)

import warnings
warnings.filterwarnings('ignore')
pd.set_option("display.max_columns", None)

from sklearn.feature_selection import chi2
from google.colab import drive

In [None]:
# Mount Google Drive under /content/drive; the CSV paths read later in this
# notebook live beneath that mount point.
# NOTE(review): force_remount=True presumably remounts even when a mount already
# exists -- confirm against the google.colab documentation.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Load the four pre-split CSVs (features / labels, train / test) from the
# shared "v4" folder on the mounted drive.
_DATA_DIR = '/content/drive/My Drive/BT4012/processed dataset/v4'
X_train, X_test, y_train, y_test = (
    pd.read_csv(_DATA_DIR + '/' + split_name + '.csv')
    for split_name in ('X_train', 'X_test', 'y_train', 'y_test')
)

In [None]:
# Drop the 'Unnamed: 0' column from every frame.
# errors='ignore' makes this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
X_train = X_train.drop(columns=['Unnamed: 0'], errors='ignore')
X_test = X_test.drop(columns=['Unnamed: 0'], errors='ignore')
y_train = y_train.drop(columns=['Unnamed: 0'], errors='ignore')
y_test = y_test.drop(columns=['Unnamed: 0'], errors='ignore')

# **Selected Features**
- XGBoost selected features: 15 features
- RF selected features: 13 features
- Chi2 + ANOVA selected features: 30 features

In [None]:
# Column names kept by each feature-selection method. The lists are used
# further down to slice X_train / X_test into per-method feature subsets.

# XGBoost-selected columns (15 entries).
xgb_features = [
    'transactionAmount',
    'No_Transactions',
    'No_Orders',
    'No_Payments',
    'paymentMethodType_card',
    'paymentMethodProvider_Discover',
    'paymentMethodProvider_Maestro',
    'paymentMethodProvider_Voyager',
    'customerBillingAddressRegion_Midwest',
    'customerBillingAddressRegion_Northeast',
    'customerBillingAddressRegion_South',
    'customerBillingAddressRegion_West',
    'ipCountry_Hong Kong',
    'ipCountry_Switzerland',
    'ipCountry_United States',
]

# Random-forest-selected columns (13 entries).
rf_features = [
    'transactionAmount',
    'No_Transactions',
    'No_Orders',
    'No_Payments',
    'No_Accounts',
    'paymentMethodProvider_Maestro',
    'paymentMethodProvider_Voyager',
    'customerBillingAddressRegion_Midwest',
    'customerBillingAddressRegion_South',
    'customerBillingAddressRegion_West',
    'ipCountry_Hong Kong',
    'ipCountry_Switzerland',
    'ipCountry_United States',
]

# Chi2 + ANOVA-selected columns (30 entries), kept in their original order.
chi_anova_features = [
    'customerBillingAddressRegion_Multiple', 'ipCountry_Multiple',
    'customerBillingAddressRegion_South', 'customerBillingAddressRegion_Midwest',
    'ipCountry_Hong Kong', 'paymentMethodType_paypal',
    'paymentMethodProvider_Maestro', 'customerBillingAddressRegion_Others',
    'ipCountry_Romania', 'ipCountry_Morocco',
    'ipCountry_Italy', 'ipCountry_United Kingdom',
    'ipCountry_China', 'ipCountry_Taiwan',
    'paymentMethodProvider_Voyager', 'paymentMethodProvider_VISA 13 digit',
    'ipCountry_United States', 'customerBillingAddressRegion_Northeast',
    'ipCountry_Switzerland', 'ipCountry_France',
    'ipCountry_Iran', 'ipCountry_South Korea',
    'ipCountry_Armenia', 'paymentMethodProvider_Diners Club / Carte Blanche',
    'paymentMethodProvider_JCB 15 digit', 'No_Orders',
    'transactionAmount', 'No_Payments',
    'No_Transactions', 'No_Accounts',
]


# **Helper Functions**
- Evaluation function
- Plotting of feature importance

In [None]:
# Score registries filled in by evaluation(); every dict maps an evaluation
# title to one metric value.

# All evaluations, regardless of split.
agg_accuracy_score_dict, agg_f1_score_dict, agg_roc_auc_dict = {}, {}, {}

# Evaluations whose title contains "train dataset".
train_accuracy_score_dict, train_f1_score_dict, train_roc_auc_dict = {}, {}, {}

# Evaluations whose title mentions neither "train dataset" nor "test dataset".
val_accuracy_score_dict, val_f1_score_dict, val_roc_auc_dict = {}, {}, {}

# Evaluations whose title contains "test dataset".
test_accuracy_score_dict, test_f1_score_dict, test_roc_auc_dict = {}, {}, {}

def evaluation(true, pred, title):
    """Print a metric report for one set of predictions and record the scores.

    Prints accuracy, the classification report, F1, the confusion matrix,
    ROC AUC and recall, then stores accuracy / F1 / ROC AUC under ``title`` in
    the ``agg_*`` dicts and in one split-specific group of dicts.

    Parameters
    ----------
    true : ground-truth labels.
    pred : predicted labels. The ROC curve is built from these values directly;
        callers in this notebook pass ``.predict()`` class labels, so the
        reported ROC AUC is based on hard labels rather than probabilities.
    title : str used in the printed banner and as the dict key. A title
        containing "train dataset" goes to the ``train_*`` dicts, one containing
        "test dataset" goes to ``test_*``, anything else goes to ``val_*``.
        Re-using a title overwrites the earlier entry.

    Returns
    -------
    None
    """
    print("================== Evaluation on {} ==================".format(title))

    # Accuracy
    accuracy_value = accuracy_score(true, pred)
    print("accuracy: {}\n".format(accuracy_value))
    agg_accuracy_score_dict[title] = accuracy_value

    # Per-class report followed by the single F1 figure
    print("classification report: \n{}\n".format(classification_report(true, pred)))
    f1_value = f1_score(true, pred)
    print("F1 score: \n{}\n".format(f1_value))
    agg_f1_score_dict[title] = f1_value

    # Confusion matrix
    print("Confusion matrix: \n{}\n".format(confusion_matrix(true, pred)))

    # Area under the ROC curve
    fpr, tpr, _ = roc_curve(true, pred)
    roc_auc_value = auc(fpr, tpr)
    print("ROC AUC: {}\n".format(roc_auc_value))
    agg_roc_auc_dict[title] = roc_auc_value

    # Recall is printed only; it is not stored in any dict.
    recall_value = recall_score(true, pred)
    print("Recall: {}\n".format(recall_value))
    print("================== End of Evaluation on {} ==================".format(title))

    # Pick the split-specific registries from the title, then store all three scores.
    if "train dataset" in title:
        split_stores = (train_accuracy_score_dict, train_f1_score_dict, train_roc_auc_dict)
    elif "test dataset" in title:
        split_stores = (test_accuracy_score_dict, test_f1_score_dict, test_roc_auc_dict)
    else:
        split_stores = (val_accuracy_score_dict, val_f1_score_dict, val_roc_auc_dict)

    for store, score in zip(split_stores, (accuracy_value, f1_value, roc_auc_value)):
        store[title] = score

In [None]:
def plot_feature_importance(importance,names,model_type):
  """Draw a horizontal bar chart of feature importances, largest first.

  Parameters
  ----------
  importance : sequence of importance values, one per feature.
  names : sequence of feature names, aligned with ``importance``.
  model_type : str prepended to the chart title. It is concatenated directly
      with 'FEATURE IMPORTANCE', so include a trailing space if one is wanted.

  Returns
  -------
  None; the chart is drawn on a new 10x8 matplotlib figure.
  """
  importance_values = np.array(importance)
  name_values = np.array(names)

  # One row per feature, ordered from most to least important.
  ranked = pd.DataFrame(
      {'feature_names': name_values, 'feature_importance': importance_values}
  ).sort_values(by=['feature_importance'], ascending=False)

  # New figure, then a Seaborn bar chart with importance on the x axis.
  plt.figure(figsize=(10,8))
  sns.barplot(x=ranked['feature_importance'], y=ranked['feature_names'])

  # Title and axis labels.
  plt.title(model_type + 'FEATURE IMPORTANCE')
  plt.xlabel('FEATURE IMPORTANCE')
  plt.ylabel('FEATURE NAMES')

In [None]:
def metric_row(true,pred,pred_prob,index):
  """Build a one-row DataFrame of classification metrics.

  Parameters
  ----------
  true : ground-truth labels.
  pred : predicted class labels (used for precision, recall, F1, accuracy).
  pred_prob : scores passed to ``roc_auc_score`` for the AUC column.
  index : label used as the row index of the returned frame.

  Returns
  -------
  pandas.DataFrame with a single row and the columns
  'Precision', 'Recall', 'F1', 'Accuracy', 'AUC'.
  """
  # Insertion order of this dict defines the column order of the result.
  scores = {
      'Precision': precision_score(true,pred,zero_division=0),
      'Recall': recall_score(true,pred),
      'F1': f1_score(true,pred),
      'Accuracy': accuracy_score(true,pred),
      'AUC': roc_auc_score(true,pred_prob),
  }
  return pd.DataFrame([list(scores.values())],
                      index=[index],
                      columns=list(scores.keys()))

In [None]:
# XGB Features
xgb_train = X_train[xgb_features]
xgb_test = X_test[xgb_features]

# RF Features
rf_train = X_train[rf_features]
rf_test = X_test[rf_features]

# Chi2 + ANOVA features
chi_anova_train = X_train[chi_anova_features]
chi_anova_test = X_test[chi_anova_features]

# **XGB Model**

In [None]:
from numpy import loadtxt
from numpy import sort
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot
from sklearn.metrics import accuracy_score, recall_score
from sklearn.feature_selection import SelectFromModel

## Base Model

In [None]:
# Baseline: XGBClassifier with its default settings, fitted on every column of
# X_train. NOTE: evaluation() keys its score dicts by title, and these two
# titles are reused by the other baseline cells, so later runs overwrite them.
base_xgboost_model = XGBClassifier().fit(X_train, y_train)

# Training-set report
evaluation(
    y_train,
    base_xgboost_model.predict(X_train),
    "Default Xgboost on train dataset",
)

# Held-out test-set report
y_pred_class = base_xgboost_model.predict(X_test)
evaluation(
    y_test,
    y_pred_class,
    "Default Xgboost on test dataset",
)

accuracy: 0.9232409381663113

classification report: 
              precision    recall  f1-score   support

           0       0.89      0.99      0.94       278
           1       0.99      0.82      0.90       191

    accuracy                           0.92       469
   macro avg       0.94      0.91      0.92       469
weighted avg       0.93      0.92      0.92       469


F1 score: 
0.8971428571428572

Confusion matrix: 
[[276   2]
 [ 34 157]]

ROC AUC: 0.9073976420957475

Recall: 0.8219895287958116

accuracy: 0.8559322033898306

classification report: 
              precision    recall  f1-score   support

           0       0.81      0.95      0.87        61
           1       0.93      0.75      0.83        57

    accuracy                           0.86       118
   macro avg       0.87      0.85      0.85       118
weighted avg       0.87      0.86      0.85       118


F1 score: 
0.8349514563106796

Confusion matrix: 
[[58  3]
 [14 43]]

ROC AUC: 0.8526028185217142

Recall

## 1. Using XGBoost selected features

### Base Model (with XGB selected features)

In [None]:
# Baseline XGBClassifier (default settings) restricted to the XGBoost-selected
# columns. This rebinds base_xgboost_model, replacing the all-columns baseline.
base_xgboost_model = XGBClassifier().fit(xgb_train, y_train)

# Training-set report
evaluation(
    y_train,
    base_xgboost_model.predict(xgb_train),
    "Default Xgboost on train dataset",
)

# Held-out test-set report
y_pred_class = base_xgboost_model.predict(xgb_test)
evaluation(
    y_test,
    y_pred_class,
    "Default Xgboost on test dataset",
)

accuracy: 0.9339019189765458

classification report: 
              precision    recall  f1-score   support

           0       0.90      1.00      0.95       278
           1       1.00      0.84      0.91       191

    accuracy                           0.93       469
   macro avg       0.95      0.92      0.93       469
weighted avg       0.94      0.93      0.93       469


F1 score: 
0.9116809116809117

Confusion matrix: 
[[278   0]
 [ 31 160]]

ROC AUC: 0.918848167539267

Recall: 0.837696335078534

accuracy: 0.8813559322033898

classification report: 
              precision    recall  f1-score   support

           0       0.81      1.00      0.90        61
           1       1.00      0.75      0.86        57

    accuracy                           0.88       118
   macro avg       0.91      0.88      0.88       118
weighted avg       0.90      0.88      0.88       118


F1 score: 
0.86

Confusion matrix: 
[[61  0]
 [14 43]]

ROC AUC: 0.8771929824561404

Recall: 0.754385964912

### GridSearch params

In [None]:
###########################################################################
# Initializing GridSearch for Best Params
###########################################################################
# Hyper-parameter grid for the exhaustive search: 3 * 3 * 10 * 3 = 270 combinations.
xgb_param_grid = dict(
    max_depth=[1, 2, 5],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[50, 61, 72, 83, 94, 105, 116, 127, 138, 150],  # same as for rf model
    alpha=[0.005, 0.01, 0.05],
)

pprint(xgb_param_grid)

{'alpha': [0.005, 0.01, 0.05],
 'learning_rate': [0.01, 0.05, 0.1],
 'max_depth': [1, 2, 5],
 'n_estimators': [50, 61, 72, 83, 94, 105, 116, 127, 138, 150]}


In [None]:
###########################################################################
# Getting best params from GridSearch
###########################################################################
# Exhaustive search over xgb_param_grid on the XGBoost-selected columns,
# ranking candidates by weighted F1.
xgb_grid_search = GridSearchCV(
    estimator=XGBClassifier(random_state=4240),
    param_grid=xgb_param_grid,
    scoring='f1_weighted',
)

xgb_grid_result = xgb_grid_search.fit(xgb_train, y_train)

# Winning parameter combination
xgb_grid_result.best_params_

{'alpha': 0.005, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 138}

In [None]:
###########################################################################
# Evaluation on train and test dataset using best params
###########################################################################
# Refit an XGBClassifier with the grid-search winner on the XGBoost-selected
# columns, then report train and test metrics.
xgb_grid = XGBClassifier(random_state=4240, **xgb_grid_result.best_params_)
xgb_grid.fit(xgb_train, y_train)

# Training-set report
evaluation(
    y_train,
    xgb_grid.predict(xgb_train),
    "XGB on xgb_train dataset",
)
print('\n')

# Held-out test-set report
y_pred_class = xgb_grid.predict(xgb_test)
evaluation(
    y_test,
    y_pred_class,
    "XGB on xgb_test dataset",
)

accuracy: 0.9936034115138592

classification report: 
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       278
           1       1.00      0.98      0.99       191

    accuracy                           0.99       469
   macro avg       0.99      0.99      0.99       469
weighted avg       0.99      0.99      0.99       469


F1 score: 
0.9920844327176782

Confusion matrix: 
[[278   0]
 [  3 188]]

ROC AUC: 0.9921465968586387

Recall: 0.9842931937172775



accuracy: 0.9576271186440678

classification report: 
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        61
           1       1.00      0.91      0.95        57

    accuracy                           0.96       118
   macro avg       0.96      0.96      0.96       118
weighted avg       0.96      0.96      0.96       118


F1 score: 
0.9541284403669724

Confusion matrix: 
[[61  0]
 [ 5 52]]

ROC AUC: 0.956140350877193

Recal

### RandomSearch params

In [None]:
###########################################################################
# Initializing RandomSearch for Best Params
###########################################################################
# Candidate values for the randomized search (same value lists as the grid search).
xgb_param_rand = dict(
    max_depth=[1, 2, 5],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[50, 61, 72, 83, 94, 105, 116, 127, 138, 150],  # same as for rf model
    alpha=[0.005, 0.01, 0.05],
)

pprint(xgb_param_rand)

{'alpha': [0.005, 0.01, 0.05],
 'learning_rate': [0.01, 0.05, 0.1],
 'max_depth': [1, 2, 5],
 'n_estimators': [50, 61, 72, 83, 94, 105, 116, 127, 138, 150]}


In [None]:
###########################################################################
# Getting best params from RandomSearch
###########################################################################
###########################################################################
# Getting best params from RandomSearch
###########################################################################
# random_state pins which parameter combinations get sampled, so the reported
# best params (and everything derived from them) are reproducible on re-run.
# The same seed value is used as for the XGBClassifier instances.
xgb_rand_search = RandomizedSearchCV(XGBClassifier(random_state=4240),
                               xgb_param_rand,
                               scoring='f1_weighted',
                               random_state=4240)

xgb_rand_result = xgb_rand_search.fit(xgb_train, y_train)

# best params
xgb_rand_search.best_params_

{'n_estimators': 127, 'max_depth': 2, 'learning_rate': 0.1, 'alpha': 0.005}

In [None]:
###########################################################################
# Evaluation on train and test dataset using best params
###########################################################################
# Refit an XGBClassifier with the randomized-search winner on the
# XGBoost-selected columns, then report train and test metrics.
# NOTE: these titles match the grid-search cell's, so evaluation() overwrites
# the grid-search entries in its score dicts.
xgb_rand = XGBClassifier(random_state=4240, **xgb_rand_result.best_params_)
xgb_rand.fit(xgb_train, y_train)

# Training-set report
evaluation(
    y_train,
    xgb_rand.predict(xgb_train),
    "XGB on xgb_train dataset",
)
print('\n')

# Held-out test-set report
y_pred_class = xgb_rand.predict(xgb_test)
evaluation(
    y_test,
    y_pred_class,
    "XGB on xgb_test dataset",
)

accuracy: 0.9147121535181236

classification report: 
              precision    recall  f1-score   support

           0       0.87      1.00      0.93       278
           1       1.00      0.79      0.88       191

    accuracy                           0.91       469
   macro avg       0.94      0.90      0.91       469
weighted avg       0.93      0.91      0.91       469


F1 score: 
0.8830409356725146

Confusion matrix: 
[[278   0]
 [ 40 151]]

ROC AUC: 0.8952879581151832

Recall: 0.7905759162303665



accuracy: 0.8813559322033898

classification report: 
              precision    recall  f1-score   support

           0       0.81      1.00      0.90        61
           1       1.00      0.75      0.86        57

    accuracy                           0.88       118
   macro avg       0.91      0.88      0.88       118
weighted avg       0.90      0.88      0.88       118


F1 score: 
0.86

Confusion matrix: 
[[61  0]
 [14 43]]

ROC AUC: 0.8771929824561404

Recall: 0.75438596

### **Best params**

- Using the GridSearch params, we got the best results across all evaluation metrics.

In [None]:
# Summary of the three XGBoost-feature runs (baseline, grid search, random
# search). The percentages are transcribed by hand from the reports printed above.
xgb_table = pd.DataFrame(
    [
        ['train_acc',     '93.39%', '99.36%', '91.47%'],
        ['train_f1',      '91.17%', '99.21%', '88.30%'],
        ['train_roc_auc', '91.88%', '99.21%', '89.53%'],
        ['train_recall',  '83.77%', '98.43%', '79.06%'],
        ['test_acc',      '88.14%', '95.76%', '88.14%'],
        ['test_f1',       '86.00%', '95.41%', '86.00%'],
        ['test_roc_auc',  '87.72%', '95.61%', '87.72%'],
        ['test_recall',   '75.44%', '91.23%', '75.44%'],
    ],
    columns=['eval', 'xgb_xgb_base', 'xgb_xgb_gs', 'xgb_xgb_rs'],
)
xgb_table

Unnamed: 0,eval,xgb_xgb_base,xgb_xgb_gs,xgb_xgb_rs
0,train_acc,93.39%,99.36%,91.47%
1,train_f1,91.17%,99.21%,88.30%
2,train_roc_auc,91.88%,99.21%,89.53%
3,train_recall,83.77%,98.43%,79.06%
4,test_acc,88.14%,95.76%,88.14%
5,test_f1,86.00%,95.41%,86.00%
6,test_roc_auc,87.72%,95.61%,87.72%
7,test_recall,75.44%,91.23%,75.44%


## 2. Using RF selected features

### Base Model (with RF selected features)

In [None]:
# Baseline XGBClassifier (default settings) restricted to the RF-selected columns.
# NOTE: evaluation() keys its score dicts by title, and these titles are shared
# with the other baseline cells, so earlier baseline entries are overwritten.
base_xgboost_model_rf = XGBClassifier().fit(rf_train, y_train)

# Training-set report
evaluation(
    y_train,
    base_xgboost_model_rf.predict(rf_train),
    "Default Xgboost on train dataset",
)

# Held-out test-set report
y_pred_class = base_xgboost_model_rf.predict(rf_test)
evaluation(
    y_test,
    y_pred_class,
    "Default Xgboost on test dataset",
)

accuracy: 0.9232409381663113

classification report: 
              precision    recall  f1-score   support

           0       0.90      0.99      0.94       278
           1       0.98      0.83      0.90       191

    accuracy                           0.92       469
   macro avg       0.94      0.91      0.92       469
weighted avg       0.93      0.92      0.92       469


F1 score: 
0.8983050847457628

Confusion matrix: 
[[274   4]
 [ 32 159]]

ROC AUC: 0.90903612188783

Recall: 0.8324607329842932

accuracy: 0.8813559322033898

classification report: 
              precision    recall  f1-score   support

           0       0.83      0.97      0.89        61
           1       0.96      0.79      0.87        57

    accuracy                           0.88       118
   macro avg       0.89      0.88      0.88       118
weighted avg       0.89      0.88      0.88       118


F1 score: 
0.8653846153846154

Confusion matrix: 
[[59  2]
 [12 45]]

ROC AUC: 0.8783433994823123

Recall: 

### GridSearch params

In [None]:
###########################################################################
# Initializing GridSearch for Best Params
###########################################################################
# Hyper-parameter grid for the exhaustive search on the RF-selected columns
# (same values as the grid used for the XGBoost-selected columns).
xgb_param_grid = dict(
    max_depth=[1, 2, 5],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[50, 61, 72, 83, 94, 105, 116, 127, 138, 150],  # same as for rf model
    alpha=[0.005, 0.01, 0.05],
)

pprint(xgb_param_grid)

{'alpha': [0.005, 0.01, 0.05],
 'learning_rate': [0.01, 0.05, 0.1],
 'max_depth': [1, 2, 5],
 'n_estimators': [50, 61, 72, 83, 94, 105, 116, 127, 138, 150]}


In [None]:
###########################################################################
# Getting best params from GridSearch
###########################################################################
# Exhaustive search over xgb_param_grid on the RF-selected columns,
# ranking candidates by weighted F1.
xgb_grid_search_rf = GridSearchCV(
    estimator=XGBClassifier(random_state=4240),
    param_grid=xgb_param_grid,
    scoring='f1_weighted',
)

xgb_grid_result_rf = xgb_grid_search_rf.fit(rf_train, y_train)

# Winning parameter combination
xgb_grid_result_rf.best_params_

{'alpha': 0.005, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150}

In [None]:
###########################################################################
# Evaluation on train and test dataset using best params
###########################################################################
# Refit an XGBClassifier with the grid-search winner on the RF-selected
# columns, then report train and test metrics.
xgb_grid_rf = XGBClassifier(random_state=4240, **xgb_grid_result_rf.best_params_)
xgb_grid_rf.fit(rf_train, y_train)

# Training-set report
evaluation(
    y_train,
    xgb_grid_rf.predict(rf_train),
    "XGB on rf_train dataset",
)
print('\n')

# Held-out test-set report
y_pred_class = xgb_grid_rf.predict(rf_test)
evaluation(
    y_test,
    y_pred_class,
    "XGB on rf_test dataset",
)

accuracy: 0.9893390191897654

classification report: 
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       278
           1       1.00      0.97      0.99       191

    accuracy                           0.99       469
   macro avg       0.99      0.99      0.99       469
weighted avg       0.99      0.99      0.99       469


F1 score: 
0.986737400530504

Confusion matrix: 
[[278   0]
 [  5 186]]

ROC AUC: 0.9869109947643979

Recall: 0.9738219895287958



accuracy: 0.9491525423728814

classification report: 
              precision    recall  f1-score   support

           0       0.92      0.98      0.95        61
           1       0.98      0.91      0.95        57

    accuracy                           0.95       118
   macro avg       0.95      0.95      0.95       118
weighted avg       0.95      0.95      0.95       118


F1 score: 
0.9454545454545454

Confusion matrix: 
[[60  1]
 [ 5 52]]

ROC AUC: 0.9479436295657175

Recal

### RandomSearch params

In [None]:
###########################################################################
# Initializing RandomSearch for Best Params
###########################################################################
# Candidate values for the randomized search on the RF-selected columns
# (same value lists as the grid search).
xgb_param_rand = dict(
    max_depth=[1, 2, 5],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[50, 61, 72, 83, 94, 105, 116, 127, 138, 150],  # same as for rf model
    alpha=[0.005, 0.01, 0.05],
)

pprint(xgb_param_rand)

{'alpha': [0.005, 0.01, 0.05],
 'learning_rate': [0.01, 0.05, 0.1],
 'max_depth': [1, 2, 5],
 'n_estimators': [50, 61, 72, 83, 94, 105, 116, 127, 138, 150]}


In [None]:
###########################################################################
# Getting best params from RandomSearch
###########################################################################
###########################################################################
# Getting best params from RandomSearch
###########################################################################
# random_state pins which parameter combinations get sampled, so the reported
# best params (and everything derived from them) are reproducible on re-run.
# The same seed value is used as for the XGBClassifier instances.
xgb_rand_search_rf = RandomizedSearchCV(XGBClassifier(random_state=4240),
                               xgb_param_rand,
                               scoring='f1_weighted',
                               random_state=4240)

xgb_rand_result_rf = xgb_rand_search_rf.fit(rf_train, y_train)

# best params
xgb_rand_search_rf.best_params_

{'n_estimators': 83, 'max_depth': 5, 'learning_rate': 0.1, 'alpha': 0.005}

In [None]:
###########################################################################
# Evaluation on train and test dataset using best params
###########################################################################
# Refit an XGBClassifier with the randomized-search winner on the RF-selected
# columns, then report train and test metrics.
# NOTE: these titles match the RF grid-search cell's, so evaluation()
# overwrites the grid-search entries in its score dicts.
xgb_rand_rf = XGBClassifier(random_state=4240, **xgb_rand_result_rf.best_params_)
xgb_rand_rf.fit(rf_train, y_train)

# Training-set report
evaluation(
    y_train,
    xgb_rand_rf.predict(rf_train),
    "XGB on rf_train dataset",
)
print('\n')

# Held-out test-set report
y_pred_class = xgb_rand_rf.predict(rf_test)
evaluation(
    y_test,
    y_pred_class,
    "XGB on rf_test dataset",
)

accuracy: 0.9637526652452025

classification report: 
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       278
           1       0.99      0.92      0.95       191

    accuracy                           0.96       469
   macro avg       0.97      0.96      0.96       469
weighted avg       0.97      0.96      0.96       469


F1 score: 
0.9536784741144414

Confusion matrix: 
[[277   1]
 [ 16 175]]

ROC AUC: 0.9563166220949941

Recall: 0.9162303664921466



accuracy: 0.9067796610169492

classification report: 
              precision    recall  f1-score   support

           0       0.86      0.98      0.92        61
           1       0.98      0.82      0.90        57

    accuracy                           0.91       118
   macro avg       0.92      0.90      0.91       118
weighted avg       0.92      0.91      0.91       118


F1 score: 
0.8952380952380952

Confusion matrix: 
[[60  1]
 [10 47]]

ROC AUC: 0.9040839804429105

Reca

### **Best params**

- Using the GridSearch params, we got the best results across all evaluation metrics

In [None]:
# Summary of the three RF-feature runs (baseline, grid search, random search).
# The percentages are transcribed by hand from the reports printed above.
rf_table = pd.DataFrame(
    [
        ['train_acc',     '92.32%', '98.93%', '96.38%'],
        ['train_f1',      '89.83%', '98.67%', '95.37%'],
        ['train_roc_auc', '90.90%', '98.69%', '95.63%'],
        ['train_recall',  '83.25%', '97.38%', '91.62%'],
        ['test_acc',      '88.14%', '94.92%', '90.68%'],
        ['test_f1',       '86.54%', '94.55%', '89.52%'],
        ['test_roc_auc',  '87.83%', '94.79%', '90.41%'],
        ['test_recall',   '78.95%', '91.23%', '82.46%'],
    ],
    columns=['eval', 'xgb_rf_base', 'xgb_rf_gs', 'xgb_rf_rs'],
)
rf_table

Unnamed: 0,eval,xgb_rf_base,xgb_rf_gs,xgb_rf_rs
0,train_acc,92.32%,98.93%,96.38%
1,train_f1,89.83%,98.67%,95.37%
2,train_roc_auc,90.90%,98.69%,95.63%
3,train_recall,83.25%,97.38%,91.62%
4,test_acc,88.14%,94.92%,90.68%
5,test_f1,86.54%,94.55%,89.52%
6,test_roc_auc,87.83%,94.79%,90.41%
7,test_recall,78.95%,91.23%,82.46%


## 3. Using Chi2 + ANOVA selected features

### Base Model (with Chi2 + ANOVA selected features)

In [None]:
# Baseline XGBClassifier (default settings) restricted to the Chi2 + ANOVA
# selected columns.
# NOTE: evaluation() keys its score dicts by title, and these titles are shared
# with the other baseline cells, so earlier baseline entries are overwritten.
base_xgboost_model_ca = XGBClassifier().fit(chi_anova_train, y_train)

# Training-set report
evaluation(
    y_train,
    base_xgboost_model_ca.predict(chi_anova_train),
    "Default Xgboost on train dataset",
)

# Held-out test-set report
y_pred_class = base_xgboost_model_ca.predict(chi_anova_test)
evaluation(
    y_test,
    y_pred_class,
    "Default Xgboost on test dataset",
)

accuracy: 0.9275053304904051

classification report: 
              precision    recall  f1-score   support

           0       0.89      1.00      0.94       278
           1       0.99      0.83      0.90       191

    accuracy                           0.93       469
   macro avg       0.94      0.91      0.92       469
weighted avg       0.93      0.93      0.93       469


F1 score: 
0.9028571428571429

Confusion matrix: 
[[277   1]
 [ 33 158]]

ROC AUC: 0.9118140042939471

Recall: 0.8272251308900523

accuracy: 0.864406779661017

classification report: 
              precision    recall  f1-score   support

           0       0.81      0.97      0.88        61
           1       0.96      0.75      0.84        57

    accuracy                           0.86       118
   macro avg       0.88      0.86      0.86       118
weighted avg       0.88      0.86      0.86       118


F1 score: 
0.8431372549019609

Confusion matrix: 
[[59  2]
 [14 43]]

ROC AUC: 0.8607995398331896

Recall:

### GridSearch params

In [None]:
###########################################################################
# Initializing GridSearch for Best Params
###########################################################################
# Hyper-parameter grid for the exhaustive search on the Chi2 + ANOVA columns
# (same values as the grids used for the other feature sets).
xgb_param_grid = dict(
    max_depth=[1, 2, 5],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[50, 61, 72, 83, 94, 105, 116, 127, 138, 150],  # same as for rf model
    alpha=[0.005, 0.01, 0.05],
)

pprint(xgb_param_grid)

{'alpha': [0.005, 0.01, 0.05],
 'learning_rate': [0.01, 0.05, 0.1],
 'max_depth': [1, 2, 5],
 'n_estimators': [50, 61, 72, 83, 94, 105, 116, 127, 138, 150]}


In [None]:
###########################################################################
# Getting best params from GridSearch
###########################################################################
# Exhaustive search over xgb_param_grid on the Chi2 + ANOVA selected columns,
# ranking candidates by weighted F1.
xgb_grid_search_ca = GridSearchCV(
    estimator=XGBClassifier(random_state=4240),
    param_grid=xgb_param_grid,
    scoring='f1_weighted',
)

xgb_grid_result_ca = xgb_grid_search_ca.fit(chi_anova_train, y_train)

# Winning parameter combination
xgb_grid_result_ca.best_params_

{'alpha': 0.005, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150}

In [None]:
###########################################################################
# Evaluation on train and test dataset using best params
###########################################################################
###########################################################################
# Evaluation on train and test dataset using best params
###########################################################################
# Refit with the grid-search winner on the Chi2 + ANOVA selected columns.
xgb_grid_ca = XGBClassifier( **xgb_grid_result_ca.best_params_, random_state=4240).fit(chi_anova_train, y_train)

# Train Evaluation
# The title names the chi_anova split: evaluation() uses the title as a dict
# key, and the old "xgb_train" label stored this score under the key that
# belongs to the XGBoost-feature run.
evaluation(y_train, xgb_grid_ca.predict(chi_anova_train), "XGB on chi_anova_train dataset")
print('\n')
# Test evaluation
y_pred_class = xgb_grid_ca.predict(chi_anova_test)
evaluation(y_test, y_pred_class, "XGB on chi_anova_test dataset")

accuracy: 0.9893390191897654

classification report: 
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       278
           1       1.00      0.97      0.99       191

    accuracy                           0.99       469
   macro avg       0.99      0.99      0.99       469
weighted avg       0.99      0.99      0.99       469


F1 score: 
0.986737400530504

Confusion matrix: 
[[278   0]
 [  5 186]]

ROC AUC: 0.9869109947643979

Recall: 0.9738219895287958



accuracy: 0.940677966101695

classification report: 
              precision    recall  f1-score   support

           0       0.92      0.97      0.94        61
           1       0.96      0.91      0.94        57

    accuracy                           0.94       118
   macro avg       0.94      0.94      0.94       118
weighted avg       0.94      0.94      0.94       118


F1 score: 
0.9369369369369369

Confusion matrix: 
[[59  2]
 [ 5 52]]

ROC AUC: 0.9397469082542421

Recall

### RandomSearch params

In [None]:
###########################################################################
# Initializing RandomSearch for Best Params
###########################################################################
# Candidate values for the randomized search on the Chi2 + ANOVA columns
# (same value lists as the grid search).
xgb_param_rand = dict(
    max_depth=[1, 2, 5],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[50, 61, 72, 83, 94, 105, 116, 127, 138, 150],  # same as for rf model
    alpha=[0.005, 0.01, 0.05],
)

pprint(xgb_param_rand)

{'alpha': [0.005, 0.01, 0.05],
 'learning_rate': [0.01, 0.05, 0.1],
 'max_depth': [1, 2, 5],
 'n_estimators': [50, 61, 72, 83, 94, 105, 116, 127, 138, 150]}


In [None]:
###########################################################################
# Getting best params from RandomSearch
###########################################################################
###########################################################################
# Getting best params from RandomSearch
###########################################################################
# random_state pins which parameter combinations get sampled, so the reported
# best params (and everything derived from them) are reproducible on re-run.
# The same seed value is used as for the XGBClassifier instances.
xgb_rand_search_ca = RandomizedSearchCV(XGBClassifier(random_state=4240),
                               xgb_param_rand,
                               scoring='f1_weighted',
                               random_state=4240)

xgb_rand_result_ca = xgb_rand_search_ca.fit(chi_anova_train, y_train)

# best params
xgb_rand_search_ca.best_params_

{'n_estimators': 61, 'max_depth': 5, 'learning_rate': 0.1, 'alpha': 0.05}

In [None]:
###########################################################################
# Evaluation on train and test dataset using best params
###########################################################################
###########################################################################
# Evaluation on train and test dataset using best params
###########################################################################
# Refit with the randomized-search winner on the Chi2 + ANOVA selected columns.
xgb_rand_ca = XGBClassifier( **xgb_rand_result_ca.best_params_, random_state=4240).fit(chi_anova_train, y_train)

# Train Evaluation
# The title names the chi_anova split: evaluation() uses the title as a dict
# key, and the old "xgb_train" label stored this score under the key that
# belongs to the XGBoost-feature run.
evaluation(y_train, xgb_rand_ca.predict(chi_anova_train), "XGB on chi_anova_train dataset")
print('\n')
# Test evaluation
y_pred_class = xgb_rand_ca.predict(chi_anova_test)
evaluation(y_test, y_pred_class, "XGB on chi_anova_test dataset")

accuracy: 0.9530916844349681

classification report: 
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       278
           1       1.00      0.88      0.94       191

    accuracy                           0.95       469
   macro avg       0.96      0.94      0.95       469
weighted avg       0.96      0.95      0.95       469


F1 score: 
0.9388888888888889

Confusion matrix: 
[[278   0]
 [ 22 169]]

ROC AUC: 0.9424083769633508

Recall: 0.8848167539267016



accuracy: 0.8728813559322034

classification report: 
              precision    recall  f1-score   support

           0       0.83      0.95      0.89        61
           1       0.94      0.79      0.86        57

    accuracy                           0.87       118
   macro avg       0.88      0.87      0.87       118
weighted avg       0.88      0.87      0.87       118


F1 score: 
0.8571428571428572

Confusion matrix: 
[[58  3]
 [12 45]]

ROC AUC: 0.8701466781708369

Reca

### **Best params**

- Using the GridSearch params, we got the best results across all evaluation metrics

In [None]:
# Summary of the three Chi2 + ANOVA feature runs (baseline, grid search, random
# search). The percentages are transcribed by hand from the reports printed above.
chi_anova_table = pd.DataFrame({'eval':['train_acc', 'train_f1','train_roc_auc','train_recall',
                                  'test_acc', 'test_f1','test_roc_auc','test_recall'],
                          'xgb_ca_base': ['92.75%', '90.29%','91.18%','82.72%',
                                   '86.44%','84.31%','86.08%','75.44%'],
                          'xgb_ca_gs': ['98.93%','98.67%','98.69%','97.38%',
                                    '94.07%','93.69%','93.97%','91.23%'],
                          # test_recall is 45/57 = 0.78947..., which rounds to 78.95%
                          # (previously entered truncated as 78.94%).
                          'xgb_ca_rs': ['95.31%','93.89%','94.24%','88.48%',
                                    '87.29%','85.71%','87.01%','78.95%']})
chi_anova_table

Unnamed: 0,eval,xgb_ca_base,xgb_ca_gs,xgb_ca_rs
0,train_acc,92.75%,98.93%,95.31%
1,train_f1,90.29%,98.67%,93.89%
2,train_roc_auc,91.18%,98.69%,94.24%
3,train_recall,82.72%,97.38%,88.48%
4,test_acc,86.44%,94.07%,87.29%
5,test_f1,84.31%,93.69%,85.71%
6,test_roc_auc,86.08%,93.97%,87.01%
7,test_recall,75.44%,91.23%,78.94%


## Compiled Results

In [None]:
# Place the three summary tables side by side. The 'eval' label column is kept
# from xgb_table only; the other two tables contribute their last three
# (score) columns.
compiled_table = pd.concat(
    [
        xgb_table,
        rf_table.iloc[:, -3:],
        chi_anova_table.iloc[:, -3:],
    ],
    axis=1,
)
compiled_table

Unnamed: 0,eval,xgb_xgb_base,xgb_xgb_gs,xgb_xgb_rs,xgb_rf_base,xgb_rf_gs,xgb_rf_rs,xgb_ca_base,xgb_ca_gs,xgb_ca_rs
0,train_acc,93.39%,99.36%,91.47%,92.32%,98.93%,96.38%,92.75%,98.93%,95.31%
1,train_f1,91.17%,99.21%,88.30%,89.83%,98.67%,95.37%,90.29%,98.67%,93.89%
2,train_roc_auc,91.88%,99.21%,89.53%,90.90%,98.69%,95.63%,91.18%,98.69%,94.24%
3,train_recall,83.77%,98.43%,79.06%,83.25%,97.38%,91.62%,82.72%,97.38%,88.48%
4,test_acc,88.14%,95.76%,88.14%,88.14%,94.92%,90.68%,86.44%,94.07%,87.29%
5,test_f1,86.00%,95.41%,86.00%,86.54%,94.55%,89.52%,84.31%,93.69%,85.71%
6,test_roc_auc,87.72%,95.61%,87.72%,87.83%,94.79%,90.41%,86.08%,93.97%,87.01%
7,test_recall,75.44%,91.23%,75.44%,78.95%,91.23%,82.46%,75.44%,91.23%,78.94%
