# IPFS Data Analytics Competition Submission
## Model Building
### Eric VanMeerhaeghe, Michael Gagliano, Zachary Bergquist, James Liem

## Import Packages

In [2]:
import numpy as np
import pandas as pd

#Machine Learning Packages
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier


#Model Selection/Assessment Packages
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
from sklearn.metrics import classification_report, precision_recall_curve, confusion_matrix, auc
from sklearn.metrics import average_precision_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

#Other Packages
from imblearn.datasets import fetch_datasets
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows',500)

## Read in Data

In [2]:
train = pd.read_csv('DataReadyForModel.csv')
train.head(2)

Unnamed: 0,Cancelled,Default_Charge,NSF_Charge,Premium,Down,AmtFin,FinChg,APR,Payments_Rcvd,Exposure,Borrower_CreditScore,Borrower_EnrolledInEForms,Agent_EnrolledInEForms,Agent_IntentEForm,Agent_RegisteredForCancelListReport,Agent_RegisteredForEimpendingReport,Agent_EnrolledInCreditProgram,Borrower_RegisteredOnWeb,Borrower_RegisteredForEForms,Borrower_RegisteredForCancellationWarning,RecurringACH_TF,state_propensity_to_cancel,Agent_Loan_Cancellation_Rate,bad_borrower,Is_Bad_Zip,Borrower_Industry_Construction,Borrower_Industry_General,Borrower_Industry_Government,Borrower_Industry_Manufacturing,Borrower_Industry_Mining,Borrower_Industry_Real Estate,Borrower_Industry_Retail Trade,Borrower_Industry_Services,Borrower_Industry_Transportation,Borrower_Industry_Wholesale Trade
0,0,0.0,0.0,2078.0,737.0,1341.0,113.94,19.95,9,0.0,3,0,1,0,0,1,1,0,0,1,0,0.089109,0.121026,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0.0,0.0,12033.18,3167.0,8866.18,438.52,10.65,10,442.96,3,0,1,0,0,1,1,0,0,1,0,0.100257,0.107263,0,1,0,0,0,0,0,1,0,0,0,0


In [25]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140574 entries, 0 to 140573
Data columns (total 35 columns):
Cancelled                                    140574 non-null int64
Default_Charge                               140574 non-null float64
NSF_Charge                                   140574 non-null float64
Premium                                      140574 non-null float64
Down                                         140574 non-null float64
AmtFin                                       140574 non-null float64
FinChg                                       140574 non-null float64
APR                                          140574 non-null float64
Payments_Rcvd                                140574 non-null int64
Exposure                                     140574 non-null float64
Borrower_CreditScore                         140574 non-null int64
Borrower_EnrolledInEForms                    140574 non-null int64
Agent_EnrolledInEForms                       140574 non-null int64
Ag

In [3]:
test = pd.read_csv('TestDataFinal.csv')
test.head()

Unnamed: 0,Cancelled,Default_Charge,NSF_Charge,Premium,Down,AmtFin,FinChg,APR,Payments_Rcvd,Exposure,Borrower_CreditScore,Borrower_EnrolledInEForms,Agent_EnrolledInEForms,Agent_IntentEForm,Agent_RegisteredForCancelListReport,Agent_RegisteredForEimpendingReport,Agent_EnrolledInCreditProgram,Borrower_RegisteredOnWeb,Borrower_RegisteredForEForms,Borrower_RegisteredForCancellationWarning,RecurringACH_TF,state_propensity_to_cancel,Agent_Loan_Cancellation_Rate,bad_borrower,Is_Bad_Zip,Borrower_Industry_Construction,Borrower_Industry_General,Borrower_Industry_Government,Borrower_Industry_Manufacturing,Borrower_Industry_Mining,Borrower_Industry_Real Estate,Borrower_Industry_Retail Trade,Borrower_Industry_Services,Borrower_Industry_Transportation,Borrower_Industry_Wholesale Trade
0,0,0.0,0.0,40240.2,10060.05,30180.15,902.43,7.12,9,262.15,2,0,0,0,0,0,1,1,0,1,0,0.10058,0.118143,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0.0,0.0,21513.76,5378.44,16135.32,660.68,7.5,10,0.0,2,1,0,0,0,0,1,0,1,1,0,0.040516,0.125,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0.0,0.0,102593.0,20598.6,81994.4,1491.8,3.95,10,4099.72,3,1,0,0,0,0,1,1,1,1,0,0.10058,0.111111,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0.0,0.0,475.0,79.0,396.0,29.5,15.95,10,61.6,3,0,1,1,0,1,0,0,0,1,0,0.083719,0.069767,0,0,0,0,0,0,0,0,0,1,0,0
4,0,0.0,0.0,5177.21,1137.82,4039.39,93.11,5.0,10,673.23,1,0,1,1,1,1,1,0,0,1,0,0.10058,0.121951,0,0,0,0,0,0,0,1,0,0,0,0


In [27]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42172 entries, 0 to 42171
Data columns (total 35 columns):
Cancelled                                    42172 non-null int64
Default_Charge                               42172 non-null float64
NSF_Charge                                   42172 non-null float64
Premium                                      42172 non-null float64
Down                                         42172 non-null float64
AmtFin                                       42172 non-null float64
FinChg                                       42172 non-null float64
APR                                          42172 non-null float64
Payments_Rcvd                                42172 non-null int64
Exposure                                     42172 non-null float64
Borrower_CreditScore                         42172 non-null int64
Borrower_EnrolledInEForms                    42172 non-null int64
Agent_EnrolledInEForms                       42172 non-null int64
Agent_IntentEForm

## Section 1: Scaling the continuious/numeric features

In [4]:
train['Default_Charge']               = ((train['Default_Charge'] - train['Default_Charge'].mean())/(train['Default_Charge'].std()))
train['NSF_Charge']                   = ((train['NSF_Charge'] - train['NSF_Charge'].mean())/(train['NSF_Charge'].std()))
train['Premium']                      = ((train['Premium'] - train['Premium'].mean())/(train['Premium'].std()))
train['Down']                         = ((train['Down'] - train['Down'].mean())/(train['Down'].std()))
train['AmtFin']                       = ((train['AmtFin'] - train['AmtFin'].mean())/(train['AmtFin'].std()))
train['FinChg']                       = ((train['FinChg'] - train['FinChg'].mean())/(train['FinChg'].std()))
train['APR']                          = ((train['APR'] - train['APR'].mean())/(train['APR'].std()))
train['Payments_Rcvd']                = ((train['Payments_Rcvd'] - train['Payments_Rcvd'].mean())/(train['Payments_Rcvd'].std()))
train['Exposure']                     = ((train['Exposure'] - train['Exposure'].mean())/(train['Exposure'].std()))
train['state_propensity_to_cancel']   = ((train['state_propensity_to_cancel'] - train['state_propensity_to_cancel'].mean())/(train['state_propensity_to_cancel'].std()))
train['Agent_Loan_Cancellation_Rate'] = ((train['Agent_Loan_Cancellation_Rate'] - train['Agent_Loan_Cancellation_Rate'].mean())/(train['Agent_Loan_Cancellation_Rate'].std()))
train['Borrower_CreditScore']         = ((train['Borrower_CreditScore'] - train['Borrower_CreditScore'].mean())/(train['Borrower_CreditScore'].std()))

In [5]:
test['Default_Charge']               = ((test['Default_Charge'] - test['Default_Charge'].mean())/(test['Default_Charge'].std()))
test['NSF_Charge']                   = ((test['NSF_Charge'] - test['NSF_Charge'].mean())/(test['NSF_Charge'].std()))
test['Premium']                      = ((test['Premium'] - test['Premium'].mean())/(test['Premium'].std()))
test['Down']                         = ((test['Down'] - test['Down'].mean())/(test['Down'].std()))
test['AmtFin']                       = ((test['AmtFin'] - test['AmtFin'].mean())/(test['AmtFin'].std()))
test['FinChg']                       = ((test['FinChg'] - test['FinChg'].mean())/(test['FinChg'].std()))
test['APR']                          = ((test['APR'] - test['APR'].mean())/(test['APR'].std()))
test['Payments_Rcvd']                = ((test['Payments_Rcvd'] - test['Payments_Rcvd'].mean())/(test['Payments_Rcvd'].std()))
test['Exposure']                     = ((test['Exposure'] - test['Exposure'].mean())/(test['Exposure'].std()))
test['state_propensity_to_cancel']   = ((test['state_propensity_to_cancel'] - test['state_propensity_to_cancel'].mean())/(test['state_propensity_to_cancel'].std()))
test['Agent_Loan_Cancellation_Rate'] = ((test['Agent_Loan_Cancellation_Rate'] - test['Agent_Loan_Cancellation_Rate'].mean())/(test['Agent_Loan_Cancellation_Rate'].std()))
test['Borrower_CreditScore']         = ((test['Borrower_CreditScore'] - test['Borrower_CreditScore'].mean())/(test['Borrower_CreditScore'].std()))

## Section 2: Model Building

- We used a technique called SMOTE, which stands for Synthetic Minority Over-sampling Technique. This balances the number of cancelled loans so that it is a 50/50 split, instead of what it previously was, and it does this by creating synthetic data points for our models to learn from.

In [6]:
from imblearn.over_sampling import SMOTE

y_train = train['Cancelled']
y_test = test['Cancelled']
X_train = train.drop(['Cancelled'], axis=1)
X_test = test.drop(['Cancelled'], axis=1)


os = SMOTE(random_state=0)

columns = X_train.columns

os_train_data_X, os_train_data_y = os.fit_sample(X_train, y_train)
os_train_data_X = pd.DataFrame(data=os_train_data_X, columns=columns )
os_train_data_y = pd.DataFrame(data=os_train_data_y, columns=['y'])

In [7]:
# We will build a bunch of classifiers at first to identify the best possible model

classifiers = {
    "Logisitic Regression Classifier": LogisticRegression(n_jobs=3,random_state=0),
    "K-Nearest-Neighbors Classifier": KNeighborsClassifier(n_jobs=3),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(n_estimators=750, n_jobs=3, random_state=0),
    "Neural Network Classifier": MLPClassifier(random_state=0),
    "XGBoost Classifier": XGBClassifier(n_jobs=3),
    "Gradient Boosting Classifier": GradientBoostingClassifier(n_estimators=750, random_state=0),
    "AdaBoost Classifier": AdaBoostClassifier(n_estimators=100, base_estimator=RandomForestClassifier(),learning_rate=1)
}

In [8]:
# This is to make the output a little nicer and pop a little bit more
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

In [12]:
# Fitting each of the classifiers, printing out the confusion matrix, classification report
# and the recall, precision, and accuracy scores

for key, classifier in classifiers.items():
    classifier.fit(os_train_data_X, os_train_data_y)
    y_score = classifier.predict(X_test)
    print("="*125)
    print(color.BOLD + color.UNDERLINE + key + color.END, "\n")
    print(color.BOLD + "Confusion Matrix: \n" + color.END)
    print(confusion_matrix(y_test, y_score), "\n" )
    print(classification_report(y_test, y_score), "\n")
    acc = round(accuracy_score(y_test, y_score), 3)
    rec = round(recall_score(y_test, y_score), 3)
    prec = round(precision_score(y_test, y_score), 3)
    print(color.BOLD + f'Accuracy: {acc}')
    print(f'Recall: {rec}')
    print(f'Precision: {prec}' + color.END)
    print("="*125, '\n\n')
    

[1m[4mLogisitic Regression Classifier[0m 

[1mConfusion Matrix: 
[0m
[[34491  3888]
 [  469  3324]] 

              precision    recall  f1-score   support

           0       0.99      0.90      0.94     38379
           1       0.46      0.88      0.60      3793

   micro avg       0.90      0.90      0.90     42172
   macro avg       0.72      0.89      0.77     42172
weighted avg       0.94      0.90      0.91     42172
 

[1mAccuracy: 0.897
Recall: 0.876
Precision: 0.461[0m


[1m[4mK-Nearest-Neighbors Classifier[0m 

[1mConfusion Matrix: 
[0m
[[35587  2792]
 [  241  3552]] 

              precision    recall  f1-score   support

           0       0.99      0.93      0.96     38379
           1       0.56      0.94      0.70      3793

   micro avg       0.93      0.93      0.93     42172
   macro avg       0.78      0.93      0.83     42172
weighted avg       0.95      0.93      0.94     42172
 

[1mAccuracy: 0.928
Recall: 0.936
Precision: 0.56[0m


[1m[4mDecision

## Section 3: GridsearchCV to tune and optimize the hyperparameters of our 3 best models

### Section 3.1: Gridsearch on KNN model

In [13]:
knears_params = {"n_neighbors": list(range(2,7,1)), 'algorithm': ['auto'], 'n_jobs':[3]}

grid_knears = GridSearchCV(KNeighborsClassifier(), knears_params, scoring = 'recall')
grid_knears.fit(os_train_data_X, os_train_data_y)
# KNears best estimator
print(grid_knears.best_estimator_)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=3, n_neighbors=3, p=2,
           weights='uniform')


In [20]:
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=3, n_neighbors=3, p=2,
           weights='uniform')


knn.fit(os_train_data_X, os_train_data_y)

y_score = knn.predict(X_test)
print("="*125)
print(color.BOLD + color.UNDERLINE + 'KNN Classifier' + color.END, "\n")
print(color.BOLD + "Confusion Matrix: \n" + color.END)
print(confusion_matrix(y_test, y_score), "\n" )
print(classification_report(y_test, y_score), "\n")
acc = round(accuracy_score(y_test, y_score), 3)
rec = round(recall_score(y_test, y_score), 3)
prec = round(precision_score(y_test, y_score), 3)
print(color.BOLD + f'Accuracy: {acc}')
print(f'Recall: {rec}')
print(f'Precision: {prec}' + color.END)
print("="*125, '\n\n')

[1m[4mKNN Classifier[0m 

[1mConfusion Matrix: 
[0m
[[36308  2071]
 [  338  3455]] 

              precision    recall  f1-score   support

           0       0.99      0.95      0.97     38379
           1       0.63      0.91      0.74      3793

   micro avg       0.94      0.94      0.94     42172
   macro avg       0.81      0.93      0.85     42172
weighted avg       0.96      0.94      0.95     42172
 

[1mAccuracy: 0.943
Recall: 0.911
Precision: 0.625[0m




### Section 3.2: Gridsearch on MLPClassifier

- The hidden layer sizes were chosen because 34 was equal to the number of predictors in our data, and 68 is double that.

In [14]:
nn_params = {'activation': ['logistic', 'tanh', 'relu'], 'solver': ['adam'], 'learning_rate_init': [0.0001, 0.00005],
             'hidden_layer_sizes': [(34,34,34), (68,68,68)]}

grid_nn = GridSearchCV(MLPClassifier(), nn_params, scoring = 'recall', n_jobs = 3)
grid_nn.fit(os_train_data_X, os_train_data_y)
# NN Best Estimator
print(grid_nn.best_estimator_)

MLPClassifier(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(68, 68, 68), learning_rate='constant',
       learning_rate_init=0.0001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)


In [17]:
nn = MLPClassifier(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(68, 68, 68), learning_rate='constant',
       learning_rate_init=0.0001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)


nn.fit(os_train_data_X, os_train_data_y)

y_score = nn.predict(X_test)
print("="*125)
print(color.BOLD + color.UNDERLINE + 'MLP Classifier' + color.END, "\n")
print(color.BOLD + "Confusion Matrix: \n" + color.END)
print(confusion_matrix(y_test, y_score), "\n" )
print(classification_report(y_test, y_score), "\n")
acc = round(accuracy_score(y_test, y_score), 3)
rec = round(recall_score(y_test, y_score), 3)
prec = round(precision_score(y_test, y_score), 3)
print(color.BOLD + f'Accuracy: {acc}')
print(f'Recall: {rec}')
print(f'Precision: {prec}' + color.END)
print("="*125, '\n\n')

[1m[4mMLP Classifier[0m 

[1mConfusion Matrix: 
[0m
[[35371  3008]
 [  251  3542]] 

              precision    recall  f1-score   support

           0       0.99      0.92      0.96     38379
           1       0.54      0.93      0.68      3793

   micro avg       0.92      0.92      0.92     42172
   macro avg       0.77      0.93      0.82     42172
weighted avg       0.95      0.92      0.93     42172
 

[1mAccuracy: 0.923
Recall: 0.934
Precision: 0.541[0m




### Section 3.3: Gridsearch on Logistic Regression Model

In [24]:
lr_params = {'penalty': ['l1', 'l2'], 'C': [1,0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2,0.1], 'n_jobs': [3], 'random_state': [0]}

grid_lr = GridSearchCV(LogisticRegression(), lr_params, scoring = 'recall', n_jobs = 3)
grid_lr.fit(os_train_data_X, os_train_data_y)
# NN Best Estimator
print(grid_lr.best_estimator_)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=3,
          penalty='l2', random_state=0, solver='warn', tol=0.0001,
          verbose=0, warm_start=False)


In [25]:
lr = LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=3,
          penalty='l2', random_state=0, solver='warn', tol=0.0001,
          verbose=0, warm_start=False)

lr.fit(os_train_data_X, os_train_data_y)

y_score = lr.predict(X_test)
print("="*125)
print(color.BOLD + color.UNDERLINE + 'Logistic Regression Classifier' + color.END, "\n")
print(color.BOLD + "Confusion Matrix: \n" + color.END)
print(confusion_matrix(y_test, y_score), "\n" )
print(classification_report(y_test, y_score), "\n")
acc = round(accuracy_score(y_test, y_score), 3)
rec = round(recall_score(y_test, y_score), 3)
prec = round(precision_score(y_test, y_score), 3)
print(color.BOLD + f'Accuracy: {acc}')
print(f'Recall: {rec}')
print(f'Precision: {prec}' + color.END)
print("="*125, '\n\n')

[1m[4mLogistic Regression Classifier[0m 

[1mConfusion Matrix: 
[0m
[[34485  3894]
 [  470  3323]] 

              precision    recall  f1-score   support

           0       0.99      0.90      0.94     38379
           1       0.46      0.88      0.60      3793

   micro avg       0.90      0.90      0.90     42172
   macro avg       0.72      0.89      0.77     42172
weighted avg       0.94      0.90      0.91     42172
 

[1mAccuracy: 0.897
Recall: 0.876
Precision: 0.46[0m




# We Beleive that the MLPClassifier is our best model based on it's higher accuracy and recall scores when compared to the KNN and Logistic Regression Models.