In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, plot_confusion_matrix
from sklearn.metrics import accuracy_score,precision_score,recall_score,roc_auc_score
from sklearn.model_selection import GridSearchCV , RandomizedSearchCV
from sklearn.metrics import *
from sklearn import linear_model

from google.colab import drive
from pprint import pprint

# **Set-up**
- train and test data

In [10]:
drive.mount('/content/drive', force_remount=True)
X_train = pd.read_csv('/content/drive/My Drive/BT4012/processed dataset/v4/X_train.csv')
X_test = pd.read_csv('/content/drive/My Drive/BT4012/processed dataset/v4/X_test.csv')
y_train = pd.read_csv('/content/drive/My Drive/BT4012/processed dataset/v4/y_train.csv')
y_test = pd.read_csv('/content/drive/My Drive/BT4012/processed dataset/v4/y_test.csv')

# Drop unnamed columns
X_train = X_train.drop(columns=['Unnamed: 0'])
X_test = X_test.drop(columns=['Unnamed: 0'])
y_train = y_train.drop(columns=['Unnamed: 0'])
y_test = y_test.drop(columns=['Unnamed: 0'])

Mounted at /content/drive


# **Selected Features**
- XGBoost selected features: 21 features
- RF selected features: 14
- Chi2 + ANOVA selected features: 30 features

In [11]:
xgb_features = ['transactionAmount', 'No_Transactions', 'No_Orders', 'No_Payments',
       'paymentMethodType_card', 'paymentMethodProvider_Discover',
       'paymentMethodProvider_Maestro', 'paymentMethodProvider_Voyager',
       'customerBillingAddressRegion_Midwest',
       'customerBillingAddressRegion_Northeast',
       'customerBillingAddressRegion_South',
       'customerBillingAddressRegion_West', 'ipCountry_Hong Kong',
       'ipCountry_Switzerland', 'ipCountry_United States']

rf_features = ['transactionAmount', 'No_Transactions', 'No_Orders', 'No_Payments',
       'No_Accounts', 'paymentMethodProvider_Maestro',
       'paymentMethodProvider_Voyager', 'customerBillingAddressRegion_Midwest',
       'customerBillingAddressRegion_South',
       'customerBillingAddressRegion_West', 'ipCountry_Hong Kong',
       'ipCountry_Switzerland', 'ipCountry_United States']

chi_anova_features = ['customerBillingAddressRegion_Multiple',
 'ipCountry_Multiple',
 'customerBillingAddressRegion_South',
 'customerBillingAddressRegion_Midwest',
 'ipCountry_Hong Kong',
 'paymentMethodType_paypal',
 'paymentMethodProvider_Maestro',
 'customerBillingAddressRegion_Others',
 'ipCountry_Romania',
 'ipCountry_Morocco',
 'ipCountry_Italy',
 'ipCountry_United Kingdom',
 'ipCountry_China',
 'ipCountry_Taiwan',
 'paymentMethodProvider_Voyager',
 'paymentMethodProvider_VISA 13 digit',
 'ipCountry_United States',
 'customerBillingAddressRegion_Northeast',
 'ipCountry_Switzerland',
 'ipCountry_France',
 'ipCountry_Iran',
 'ipCountry_South Korea',
 'ipCountry_Armenia',
 'paymentMethodProvider_Diners Club / Carte Blanche',
 'paymentMethodProvider_JCB 15 digit',
 'No_Orders',
 'transactionAmount',
 'No_Payments',
 'No_Transactions',
 'No_Accounts']

In [12]:
# XGB Features
xgb_train = X_train[xgb_features]
xgb_test = X_test[xgb_features]

# RF Features
rf_train = X_train[rf_features]
rf_test = X_test[rf_features]

# Chi2 + ANOVA features
chi_anova_train = X_train[chi_anova_features]
chi_anova_test = X_test[chi_anova_features]

# **Helper Functions**
- Evaluation function

In [13]:
agg_accuracy_score_dict = {}
agg_f1_score_dict={}
agg_roc_auc_dict={}

train_accuracy_score_dict = {}
train_f1_score_dict={}
train_roc_auc_dict={}
train_recall_dict = {}

val_accuracy_score_dict = {}
val_f1_score_dict={}
val_roc_auc_dict={}

test_accuracy_score_dict = {}
test_f1_score_dict={}
test_roc_auc_dict={}
test_recall_dict = {}

def evaluation(true, pred, title):
    print("================== Evaluation on {} ==================".format(title))
    # accuracy 
    acc = accuracy_score(true, pred)
    print("accuracy: {}\n".format(acc))
    agg_accuracy_score_dict[title] = acc
    # f1 score
    print("classification report: \n{}\n".format(classification_report(true, pred)))
    print("F1 score: \n{}\n".format(f1_score(true, pred)))
    f1 = f1_score(true, pred)
    agg_f1_score_dict[title] = f1
    # confusion matrix
    print("Confusion matrix: \n{}\n".format(confusion_matrix(true, pred)))
    # roc auc result
    fpr, tpr, thresholds = roc_curve(true, pred)
    roc_auc = auc(fpr, tpr)
    print("ROC AUC: {}\n".format(roc_auc))
    agg_roc_auc_dict[title] = roc_auc

    recall = recall_score(true,pred)
    print("Recall: {}\n".format(recall))
    print("================== End of Evaluation on {} ==================".format(title))
    
    if "train dataset" in title:
        train_accuracy_score_dict[title] = acc
        train_f1_score_dict[title] = f1
        train_roc_auc_dict[title] = roc_auc
        train_recall_dict[title] = recall
    elif "test dataset" in title:
        test_accuracy_score_dict[title] = acc
        test_f1_score_dict[title] = f1
        test_roc_auc_dict[title] = roc_auc
        test_recall_dict[title] = recall
    else:
        val_accuracy_score_dict[title] = acc
        val_f1_score_dict[title] = f1
        val_roc_auc_dict[title] = roc_auc

## **Base Model**

In [14]:
# Fit a logistic regression model with the training set
log = linear_model.LogisticRegression(penalty = 'l2', fit_intercept=False, C=500)
log.fit(X_train, y_train)

# Evaluate results

# Train evaluation
evaluation(y_train, log.predict(X_train), "Log Reg Model on train dataset")
print('\n')
# Test evaluation
y_pred_class = log.predict(X_test)
evaluation(y_test, y_pred_class, "Log Reg Model on test dataset")

accuracy: 0.8294243070362474

classification report: 
              precision    recall  f1-score   support

           0       0.84      0.88      0.86       278
           1       0.82      0.75      0.78       191

    accuracy                           0.83       469
   macro avg       0.83      0.82      0.82       469
weighted avg       0.83      0.83      0.83       469


F1 score: 
0.7814207650273225

Confusion matrix: 
[[246  32]
 [ 48 143]]

ROC AUC: 0.8167915929036875

Recall: 0.7486910994764397



accuracy: 0.8050847457627118

classification report: 
              precision    recall  f1-score   support

           0       0.82      0.80      0.81        61
           1       0.79      0.81      0.80        57

    accuracy                           0.81       118
   macro avg       0.80      0.81      0.80       118
weighted avg       0.81      0.81      0.81       118


F1 score: 
0.8

Confusion matrix: 
[[49 12]
 [11 46]]

ROC AUC: 0.8051481161921196

Recall: 0.807017543

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


___
## **1. Using XGBoost selected features**
- Evaluation of results

In [15]:
# Fit a logistic regression model with the training set
xgb_log = linear_model.LogisticRegression(penalty = 'l2', fit_intercept=False, C=500)
xgb_log.fit(xgb_train, y_train)

# Evaluate results

# Train evaluation
evaluation(y_train, xgb_log.predict(xgb_train), "Log Reg Model on xgb_train dataset")
print('\n')
# Test evaluation
y_pred_class = xgb_log.predict(xgb_test)
evaluation(y_test, y_pred_class, "Log Reg Model on xgb_test dataset")

accuracy: 0.7953091684434968

classification report: 
              precision    recall  f1-score   support

           0       0.82      0.83      0.83       278
           1       0.75      0.74      0.75       191

    accuracy                           0.80       469
   macro avg       0.79      0.79      0.79       469
weighted avg       0.79      0.80      0.80       469


F1 score: 
0.7473684210526317

Confusion matrix: 
[[231  47]
 [ 49 142]]

ROC AUC: 0.78719537459038

Recall: 0.743455497382199



accuracy: 0.7457627118644068

classification report: 
              precision    recall  f1-score   support

           0       0.75      0.77      0.76        61
           1       0.75      0.72      0.73        57

    accuracy                           0.75       118
   macro avg       0.75      0.74      0.75       118
weighted avg       0.75      0.75      0.75       118


F1 score: 
0.7321428571428571

Confusion matrix: 
[[47 14]
 [16 41]]

ROC AUC: 0.7448950244463618

Recall:

  y = column_or_1d(y, warn=True)


In [16]:
###########################################################################
# Table of results
###########################################################################

train_accuracy_score = list(train_accuracy_score_dict.values())
train_f1_score = list(train_f1_score_dict.values())
train_roc_auc = list(train_roc_auc_dict.values())
train_recall = list(train_recall_dict.values()) 

test_accuracy_score = list(test_accuracy_score_dict.values())
test_f1_score = list(test_f1_score_dict.values())
test_roc_auc = list(test_roc_auc_dict.values())
test_recall = list(test_recall_dict.values()) 


xgb_table = pd.DataFrame({'eval':['train_acc', 'train_f1','train_roc_auc','train_recall',
                                  'test_acc', 'test_f1','test_roc_auc','test_recall'],
                          'base': [train_accuracy_score[0]*100,
                                   train_f1_score[0]*100,
                                   train_roc_auc[0]*100,
                                   train_recall[0]*100,
                                   test_accuracy_score[0]*100,
                                   test_f1_score[0]*100,
                                   test_roc_auc[0]*100,
                                   test_recall[0]*100],
                          'xgb_log': [train_accuracy_score[-1]*100,
                                   train_f1_score[-1]*100,
                                   train_roc_auc[-1]*100,
                                   train_recall[-1]*100,
                                   test_accuracy_score[-1]*100,
                                   test_f1_score[-1]*100,
                                   test_roc_auc[-1]*100,
                                   test_recall[-1]*100]})

# chi_anova_table = pd.DataFrame({'eval':['train_acc', 'train_f1','train_roc_auc','train_recall',
#                                   'test_acc', 'test_f1','test_roc_auc','test_recall'],
#                           'base': ['91.68%', '89.08%','90.36%','83.25%',
#                                    '85.59%','83.81%','85.32%','77.19%'],
#                           'chi_anova_svm_gs': ['99.79%','99.74%','99.74%','99.48%',
#                                     '96.61%','96.36%','96.49%','92.98%']})
xgb_table

Unnamed: 0,eval,base,xgb_log
0,train_acc,82.942431,79.530917
1,train_f1,78.142077,74.736842
2,train_roc_auc,81.679159,78.719537
3,train_recall,74.86911,74.34555
4,test_acc,80.508475,74.576271
5,test_f1,80.0,73.214286
6,test_roc_auc,80.514812,74.489502
7,test_recall,80.701754,71.929825


___
## **2. Using RF selected features**
- Evaluation of results

In [17]:
# Fit a logistic regression model with the training set
rf_log = linear_model.LogisticRegression(penalty = 'l2', fit_intercept=False, C=500)
rf_log.fit(rf_train, y_train)

# Evaluate results

# Train evaluation
evaluation(y_train, rf_log.predict(rf_train), "Log Reg Model on rf_train dataset")
print('\n')
# Test evaluation
y_pred_class = rf_log.predict(rf_test)
evaluation(y_test, y_pred_class, "Log Reg Model on rf_test dataset")

accuracy: 0.8187633262260128

classification report: 
              precision    recall  f1-score   support

           0       0.83      0.88      0.85       278
           1       0.80      0.73      0.77       191

    accuracy                           0.82       469
   macro avg       0.82      0.81      0.81       469
weighted avg       0.82      0.82      0.82       469


F1 score: 
0.7671232876712328

Confusion matrix: 
[[244  34]
 [ 51 140]]

ROC AUC: 0.8053410674601681

Recall: 0.7329842931937173



accuracy: 0.7711864406779662

classification report: 
              precision    recall  f1-score   support

           0       0.76      0.82      0.79        61
           1       0.79      0.72      0.75        57

    accuracy                           0.77       118
   macro avg       0.77      0.77      0.77       118
weighted avg       0.77      0.77      0.77       118


F1 score: 
0.7522935779816514

Confusion matrix: 
[[50 11]
 [16 41]]

ROC AUC: 0.769485188380788

Recal

  y = column_or_1d(y, warn=True)


In [18]:
###########################################################################
# Table of results
###########################################################################

train_accuracy_score = list(train_accuracy_score_dict.values())
train_f1_score = list(train_f1_score_dict.values())
train_roc_auc = list(train_roc_auc_dict.values())
train_recall = list(train_recall_dict.values()) 

test_accuracy_score = list(test_accuracy_score_dict.values())
test_f1_score = list(test_f1_score_dict.values())
test_roc_auc = list(test_roc_auc_dict.values())
test_recall = list(test_recall_dict.values()) 


rf_table = pd.DataFrame({'eval':['train_acc', 'train_f1','train_roc_auc','train_recall',
                                  'test_acc', 'test_f1','test_roc_auc','test_recall'],
                          'base': [train_accuracy_score[0]*100,
                                   train_f1_score[0]*100,
                                   train_roc_auc[0]*100,
                                   train_recall[0]*100,
                                   test_accuracy_score[0]*100,
                                   test_f1_score[0]*100,
                                   test_roc_auc[0]*100,
                                   test_recall[0]*100],
                          'rf_log': [train_accuracy_score[-1]*100,
                                   train_f1_score[-1]*100,
                                   train_roc_auc[-1]*100,
                                   train_recall[-1]*100,
                                   test_accuracy_score[-1]*100,
                                   test_f1_score[-1]*100,
                                   test_roc_auc[-1]*100,
                                   test_recall[-1]*100]})

# chi_anova_table = pd.DataFrame({'eval':['train_acc', 'train_f1','train_roc_auc','train_recall',
#                                   'test_acc', 'test_f1','test_roc_auc','test_recall'],
#                           'base': ['91.68%', '89.08%','90.36%','83.25%',
#                                    '85.59%','83.81%','85.32%','77.19%'],
#                           'chi_anova_svm_gs': ['99.79%','99.74%','99.74%','99.48%',
#                                     '96.61%','96.36%','96.49%','92.98%']})
rf_table

Unnamed: 0,eval,base,rf_log
0,train_acc,82.942431,81.876333
1,train_f1,78.142077,76.712329
2,train_roc_auc,81.679159,80.534107
3,train_recall,74.86911,73.298429
4,test_acc,80.508475,77.118644
5,test_f1,80.0,75.229358
6,test_roc_auc,80.514812,76.948519
7,test_recall,80.701754,71.929825


___
## **3. Using Chi2 + Anova selected features**
- Evaluation of results

In [19]:
# Fit a logistic regression model with the training set
chi_anova_log = linear_model.LogisticRegression(penalty = 'l2', fit_intercept=False, C=500)
chi_anova_log.fit(chi_anova_train, y_train)

# Evaluate results

# Train evaluation
evaluation(y_train, chi_anova_log.predict(chi_anova_train), "Log Reg Model on chi_anova_train dataset")
print('\n')
# Test evaluation
y_pred_class = chi_anova_log.predict(chi_anova_test)
evaluation(y_test, y_pred_class, "Log Reg Model on chi_anova_test dataset")

accuracy: 0.8230277185501066

classification report: 
              precision    recall  f1-score   support

           0       0.83      0.88      0.86       278
           1       0.81      0.74      0.77       191

    accuracy                           0.82       469
   macro avg       0.82      0.81      0.81       469
weighted avg       0.82      0.82      0.82       469


F1 score: 
0.7726027397260274

Confusion matrix: 
[[245  33]
 [ 50 141]]

ROC AUC: 0.8097574296583675

Recall: 0.7382198952879581



accuracy: 0.7966101694915254

classification report: 
              precision    recall  f1-score   support

           0       0.79      0.82      0.81        61
           1       0.80      0.77      0.79        57

    accuracy                           0.80       118
   macro avg       0.80      0.80      0.80       118
weighted avg       0.80      0.80      0.80       118


F1 score: 
0.7857142857142858

Confusion matrix: 
[[50 11]
 [13 44]]

ROC AUC: 0.7958009778544722

Reca

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [20]:
###########################################################################
# Table of results
###########################################################################

train_accuracy_score = list(train_accuracy_score_dict.values())
train_f1_score = list(train_f1_score_dict.values())
train_roc_auc = list(train_roc_auc_dict.values())
train_recall = list(train_recall_dict.values()) 

test_accuracy_score = list(test_accuracy_score_dict.values())
test_f1_score = list(test_f1_score_dict.values())
test_roc_auc = list(test_roc_auc_dict.values())
test_recall = list(test_recall_dict.values()) 


chi_anova_table = pd.DataFrame({'eval':['train_acc', 'train_f1','train_roc_auc','train_recall',
                                  'test_acc', 'test_f1','test_roc_auc','test_recall'],
                          'base': [train_accuracy_score[0]*100,
                                   train_f1_score[0]*100,
                                   train_roc_auc[0]*100,
                                   train_recall[0]*100,
                                   test_accuracy_score[0]*100,
                                   test_f1_score[0]*100,
                                   test_roc_auc[0]*100,
                                   test_recall[0]*100],
                          'chi_anova_log': [train_accuracy_score[-1]*100,
                                   train_f1_score[-1]*100,
                                   train_roc_auc[-1]*100,
                                   train_recall[-1]*100,
                                   test_accuracy_score[-1]*100,
                                   test_f1_score[-1]*100,
                                   test_roc_auc[-1]*100,
                                   test_recall[-1]*100]})

chi_anova_table

Unnamed: 0,eval,base,chi_anova_log
0,train_acc,82.942431,82.302772
1,train_f1,78.142077,77.260274
2,train_roc_auc,81.679159,80.975743
3,train_recall,74.86911,73.82199
4,test_acc,80.508475,79.661017
5,test_f1,80.0,78.571429
6,test_roc_auc,80.514812,79.580098
7,test_recall,80.701754,77.192982


___
# **Compiled Results**

In [21]:
compiled_table = pd.concat([xgb_table, rf_table[rf_table.columns[-1:]], chi_anova_table[chi_anova_table.columns[-1:]]], axis=1)
compiled_table

Unnamed: 0,eval,base,xgb_log,rf_log,chi_anova_log
0,train_acc,82.942431,79.530917,81.876333,82.302772
1,train_f1,78.142077,74.736842,76.712329,77.260274
2,train_roc_auc,81.679159,78.719537,80.534107,80.975743
3,train_recall,74.86911,74.34555,73.298429,73.82199
4,test_acc,80.508475,74.576271,77.118644,79.661017
5,test_f1,80.0,73.214286,75.229358,78.571429
6,test_roc_auc,80.514812,74.489502,76.948519,79.580098
7,test_recall,80.701754,71.929825,71.929825,77.192982
