In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

from matplotlib import pyplot as plt

from sklearn import model_selection as ms
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

import xgboost as xgb

import gzip

import copy

import import_ipynb
from My_Functions import null_cols, rmse

importing Jupyter notebook from My_Functions.ipynb


In [2]:
# Parser options shared by both inputs: gzip-compressed, comma-separated,
# double-quote quoted, with the column names on the first row.
gz_csv_options = dict(compression='gzip', header=0, sep=',', quotechar='"')

# Feature frame; it still contains 'total_pymnt' here (split off in the next cell).
acc_vred = pd.read_csv("acc_vred.csv.gz", **gz_csv_options)

# Target frame, later handed to train_test_split as y.
target_2_train = pd.read_csv("target_2_train_feateng_dwnsmpl.csv.gz", **gz_csv_options)

In [3]:
# Set 'total_pymnt' aside in its own frame, then remove it from the features.

# Deep copy, so the saved column is independent of acc_vred.
ttlpy_vred = acc_vred[['total_pymnt']].copy(deep=True)

# NOTE: acc_vred is rebound without the column, so running this cell a second
# time raises KeyError; re-run the loading cell first.
acc_vred = acc_vred.drop(columns='total_pymnt')

acc_vred

Unnamed: 0,annual_inc,fico_score,term,loan_amnt,home_ownership,int_rate,all_util,dti
0,0.008183,0.424628,0.0,0.036709,0.000000,0.276869,0.000000,0.00977
1,0.002681,0.360934,0.0,0.088608,0.000000,0.150312,0.000000,0.02428
2,0.003110,0.594480,0.0,0.367089,0.000000,0.221184,0.000000,0.03482
3,0.011819,0.212314,0.0,0.746835,0.666667,0.238707,0.462428,0.01900
4,0.007273,0.552017,0.0,0.402532,0.666667,0.084891,0.369942,0.01284
...,...,...,...,...,...,...,...,...
269355,0.005637,0.276008,1.0,0.594937,0.000000,0.291277,0.294798,0.01546
269356,0.004491,0.318471,1.0,0.351899,1.000000,0.221184,0.369942,0.01485
269357,0.011365,0.254777,1.0,0.797468,0.666667,0.338006,0.260116,0.03089
269358,0.008637,0.467091,1.0,0.908861,0.000000,0.357477,0.456647,0.02250


In [4]:
# Single seed reused by the train/test split, the CV splitter and the
# 'random_state' entry of every parameter grid below.
state = 0

# downsampled:
# hold out 20% of the rows as the final test set
xtrain, xtest, ytrain, ytest = ms.train_test_split(acc_vred,
                                                   target_2_train,
                                                   test_size=0.2,
                                                   random_state=state)

# Must flatten to fit: the targets come back as DataFrames, the estimators
# want 1-D label arrays. Flatten BOTH halves so the train and test targets
# have the same type/shape wherever they are scored (previously only ytrain
# was flattened and ytest stayed a DataFrame).
# Assumes target_2_train holds exactly one label column -- TODO confirm.
ytrain = ytrain.values.flatten()
ytest = ytest.values.flatten()


# using standard kfold split: 5 shuffled folds, seeded for reproducibility
n_folds = ms.KFold(n_splits=5, random_state=state, shuffle=True)

## Downsampled Data

In [6]:
xgb_clsfr = xgb.XGBClassifier()

train_scores = []
test_scores = []
train_rmse = []
test_rmse = []
best_par_list = []

objective_ = ['binary:logistic']
evalmetric = ['auc']
uselabelencoder = [False]
treemethod = ['gpu_hist']
predictor_ = ['gpu_predictor']
njobs = [-1]
randomstate = [state]
booster_ = ['gbtree']

nestimators = [50, 100, 150]
minchildweight = [1, 5, 10]
maxdepth = [3, 4, 5]
learningrate = [0.01, 0.1, 1]
gamma_ = [0.5, 1, 2, 5]
subsample_ = [0.8, 1]
colsamplebytree = [0.8, 1]


gparam_xgb = {'objective': objective_,
              'eval_metric': evalmetric,
              'use_label_encoder': uselabelencoder,
              'tree_method': treemethod,
              'predictor': predictor_,
              'n_jobs': njobs,
              'booster': booster_,
              'n_estimators': nestimators,
              'min_child_weight': minchildweight,
              'max_depth': maxdepth,
              'learning_rate': learningrate,
              'gamma': gamma_,
              'subsample': subsample_,
              'colsample_bytree': colsamplebytree,
              'random_state': randomstate}


gs_xgb = ms.GridSearchCV(xgb_clsfr, gparam_xgb, cv=n_folds, refit=True,
                          scoring='roc_auc', return_train_score=True, verbose=1)

%time gs_xgb.fit(xtrain, ytrain)

# setting up dataframe for results
train_scores.append(gs_xgb.best_estimator_.score(xtrain, ytrain))
test_scores.append(gs_xgb.best_estimator_.score(xtest, ytest))


# use rmse function from Self_Written_Functions_Sheet_Recover
train_rmse.append(rmse(gs_xgb, ytrain, xtrain))
test_rmse.append(rmse(gs_xgb, ytest, xtest))

# add the best parameters to the df
best_par_list.append(gs_xgb.best_params_)

# find the difference btwn the rmses
diff_rmse = np.subtract(train_rmse, test_rmse)

# create dataframe
list_results = [train_scores, test_scores, train_rmse, test_rmse, diff_rmse]
res_df = pd.DataFrame(list_results).T
res_df.columns = ['TrainScores', 'TestScores', 'TrainRMSE', 'TestRMSE', 'DiffRMSE']
best_par_df = pd.DataFrame(best_par_list)
res_df = pd.concat([res_df, best_par_df], axis=1, sort=False)

# confusion matrix for train set
print('\n')
print('confusion matrix for train set')
print(confusion_matrix(ytrain, gs_xgb.predict(xtrain)))

# confusion matrix for test set
print('\n')
print('confusion matrix for test set')
print(confusion_matrix(ytest, gs_xgb.predict(xtest)))

res_df

Fitting 5 folds for each of 1296 candidates, totalling 6480 fits
Wall time: 42min 38s


confusion matrix for train set
[[69213 38497]
 [34811 72967]]


confusion matrix for test set
[[17097  9873]
 [ 8857 18045]]


Unnamed: 0,TrainScores,TestScores,TrainRMSE,TestRMSE,DiffRMSE,booster,colsample_bytree,eval_metric,gamma,learning_rate,max_depth,min_child_weight,n_estimators,n_jobs,objective,predictor,random_state,subsample,tree_method,use_label_encoder
0,0.659805,0.652324,0.583263,0.589641,-0.006378,gbtree,0.8,auc,2,0.1,5,10,150,-1,binary:logistic,gpu_predictor,0,0.8,gpu_hist,False


In [9]:
xgb_clsfr = xgb.XGBClassifier()

train_scores = []
test_scores = []
train_rmse = []
test_rmse = []
best_par_list = []

objective_ = ['binary:logistic']
evalmetric = ['auc']
uselabelencoder = [False]
treemethod = ['gpu_hist']
predictor_ = ['gpu_predictor']
njobs = [-1]
randomstate = [state]
booster_ = ['gbtree']

nestimators = [125, 150, 200]
minchildweight = [8, 10, 15]
maxdepth = [4, 5, 6]
learningrate = [0.05, 0.1, 0.5]
gamma_ = [1.5, 2, 2.5]
subsample_ = [0.7, 0.8, 0.9]
colsamplebytree = [0.7, 0.8, 0.9]


gparam_xgb = {'objective': objective_,
                 'eval_metric': evalmetric,
                 'use_label_encoder': uselabelencoder,
                 'tree_method': treemethod,
                 'predictor': predictor_,
                 'n_jobs': njobs,
                 'booster': booster_,
                 'n_estimators': nestimators,
                 'min_child_weight': minchildweight,
                 'max_depth': maxdepth,
                 'learning_rate': learningrate,
                 'gamma': gamma_,
                 'subsample': subsample_,
                 'colsample_bytree': colsamplebytree,
                 'random_state': randomstate}


gs_xgb = ms.GridSearchCV(xgb_clsfr, gparam_xgb, cv=n_folds, refit=True,
                          scoring='roc_auc', return_train_score=True, verbose=1)

%time gs_xgb.fit(xtrain, ytrain)

# setting up dataframe for results
train_scores.append(gs_xgb.best_estimator_.score(xtrain, ytrain))
test_scores.append(gs_xgb.best_estimator_.score(xtest, ytest))


# use rmse function from Self_Written_Functions_Sheet_Recover
train_rmse.append(rmse(gs_xgb, ytrain, xtrain))
test_rmse.append(rmse(gs_xgb, ytest, xtest))

# add the best parameters to the df
best_par_list.append(gs_xgb.best_params_)

# find the difference btwn the rmses
diff_rmse = np.subtract(train_rmse, test_rmse)

# create dataframe
list_results = [train_scores, test_scores, train_rmse, test_rmse, diff_rmse]
res_df = pd.DataFrame(list_results).T
res_df.columns = ['TrainScores', 'TestScores', 'TrainRMSE', 'TestRMSE', 'DiffRMSE']
best_par_df = pd.DataFrame(best_par_list)
res_df = pd.concat([res_df, best_par_df], axis=1, sort=False)

# confusion matrix for train set
print('\n')
print('confusion matrix for train set')
print(confusion_matrix(ytrain, gs_xgb.predict(xtrain)))

# confusion matrix for test set
print('\n')
print('confusion matrix for test set')
print(confusion_matrix(ytest, gs_xgb.predict(xtest)))

res_df

Fitting 5 folds for each of 2187 candidates, totalling 10935 fits
Wall time: 1h 59min 43s


confusion matrix for train set
[[69227 38483]
 [34438 73340]]


confusion matrix for test set
[[17131  9839]
 [ 8779 18123]]


Unnamed: 0,TrainScores,TestScores,TrainRMSE,TestRMSE,DiffRMSE,booster,colsample_bytree,eval_metric,gamma,learning_rate,max_depth,min_child_weight,n_estimators,n_jobs,objective,predictor,random_state,subsample,tree_method,use_label_encoder
0,0.661601,0.654403,0.581721,0.587875,-0.006154,gbtree,0.7,auc,1.5,0.05,6,10,200,-1,binary:logistic,gpu_predictor,0,0.8,gpu_hist,False


In [12]:
xgb_clsfr = xgb.XGBClassifier()

train_scores = []
test_scores = []
train_rmse = []
test_rmse = []
best_par_list = []

objective_ = ['binary:logistic']
evalmetric = ['auc']
uselabelencoder = [False]
treemethod = ['gpu_hist']
predictor_ = ['gpu_predictor']
njobs = [-1]
randomstate = [state]
booster_ = ['gbtree']

nestimators = [175, 200, 300, 400]
minchildweight = [9, 10, 11, 12]
maxdepth = [5, 6, 7]
learningrate = [0.025, 0.05, 0.75]
gamma_ = [1.25, 1.5, 1.75]
subsample_ = [0.75, 0.8, 0.85]
colsamplebytree = [0.6, 0.7, 0.75]


gparam_xgb = {'objective': objective_,
                 'eval_metric': evalmetric,
                 'use_label_encoder': uselabelencoder,
                 'tree_method': treemethod,
                 'predictor': predictor_,
                 'n_jobs': njobs,
                 'booster': booster_,
                 'n_estimators': nestimators,
                 'min_child_weight': minchildweight,
                 'max_depth': maxdepth,
                 'learning_rate': learningrate,
                 'gamma': gamma_,
                 'subsample': subsample_,
                 'colsample_bytree': colsamplebytree,
                 'random_state': randomstate}


gs_xgb = ms.GridSearchCV(xgb_clsfr, gparam_xgb, cv=n_folds, refit=True,
                          scoring='roc_auc', return_train_score=True, verbose=1)

%time gs_xgb.fit(xtrain, ytrain)

# setting up dataframe for results
train_scores.append(gs_xgb.best_estimator_.score(xtrain, ytrain))
test_scores.append(gs_xgb.best_estimator_.score(xtest, ytest))


# use rmse function from Self_Written_Functions_Sheet_Recover
train_rmse.append(rmse(gs_xgb, ytrain, xtrain))
test_rmse.append(rmse(gs_xgb, ytest, xtest))

# add the best parameters to the df
best_par_list.append(gs_xgb.best_params_)

# find the difference btwn the rmses
diff_rmse = np.subtract(train_rmse, test_rmse)

# create dataframe
list_results = [train_scores, test_scores, train_rmse, test_rmse, diff_rmse]
res_df = pd.DataFrame(list_results).T
res_df.columns = ['TrainScores', 'TestScores', 'TrainRMSE', 'TestRMSE', 'DiffRMSE']
best_par_df = pd.DataFrame(best_par_list)
res_df = pd.concat([res_df, best_par_df], axis=1, sort=False)

# confusion matrix for train set
print('\n')
print('confusion matrix for train set')
print(confusion_matrix(ytrain, gs_xgb.predict(xtrain)))

# confusion matrix for test set
print('\n')
print('confusion matrix for test set')
print(confusion_matrix(ytest, gs_xgb.predict(xtest)))

res_df

Fitting 5 folds for each of 3888 candidates, totalling 19440 fits
Wall time: 8h 4min 55s


confusion matrix for train set
[[69503 38207]
 [33990 73788]]


confusion matrix for test set
[[17054  9916]
 [ 8799 18103]]


Unnamed: 0,TrainScores,TestScores,TrainRMSE,TestRMSE,DiffRMSE,booster,colsample_bytree,eval_metric,gamma,learning_rate,max_depth,min_child_weight,n_estimators,n_jobs,objective,predictor,random_state,subsample,tree_method,use_label_encoder
0,0.66496,0.652602,0.578826,0.589404,-0.010578,gbtree,0.6,auc,1.75,0.025,7,11,400,-1,binary:logistic,gpu_predictor,0,0.8,gpu_hist,False


In [13]:
xgb_clsfr = xgb.XGBClassifier()

train_scores = []
test_scores = []
train_rmse = []
test_rmse = []
best_par_list = []

objective_ = ['binary:logistic']
evalmetric = ['auc']
uselabelencoder = [False]
treemethod = ['gpu_hist']
predictor_ = ['gpu_predictor']
njobs = [-1]
randomstate = [state]
booster_ = ['gbtree']

nestimators = [350, 400, 500]
minchildweight = [10, 11]
maxdepth = [7, 8]
learningrate = [0.015, 0.025, 0.03, 0.075]
gamma_ = [1.6, 1.75, 1.9]
subsample_ = [0.775, 0.8, 0.825]
colsamplebytree = [0.5, 0.6, 0.65]


gparam_xgb = {'objective': objective_,
                 'eval_metric': evalmetric,
                 'use_label_encoder': uselabelencoder,
                 'tree_method': treemethod,
                 'predictor': predictor_,
                 'n_jobs': njobs,
                 'booster': booster_,
                 'n_estimators': nestimators,
                 'min_child_weight': minchildweight,
                 'max_depth': maxdepth,
                 'learning_rate': learningrate,
                 'gamma': gamma_,
                 'subsample': subsample_,
                 'colsample_bytree': colsamplebytree,
                 'random_state': randomstate}


gs_xgb = ms.GridSearchCV(xgb_clsfr, gparam_xgb, cv=n_folds, refit=True,
                          scoring='roc_auc', return_train_score=True, verbose=1)

%time gs_xgb.fit(xtrain, ytrain)

# setting up dataframe for results
train_scores.append(gs_xgb.best_estimator_.score(xtrain, ytrain))
test_scores.append(gs_xgb.best_estimator_.score(xtest, ytest))


# use rmse function from Self_Written_Functions_Sheet_Recover
train_rmse.append(rmse(gs_xgb, ytrain, xtrain))
test_rmse.append(rmse(gs_xgb, ytest, xtest))

# add the best parameters to the df
best_par_list.append(gs_xgb.best_params_)

# find the difference btwn the rmses
diff_rmse = np.subtract(train_rmse, test_rmse)

# create dataframe
list_results = [train_scores, test_scores, train_rmse, test_rmse, diff_rmse]
res_df = pd.DataFrame(list_results).T
res_df.columns = ['TrainScores', 'TestScores', 'TrainRMSE', 'TestRMSE', 'DiffRMSE']
best_par_df = pd.DataFrame(best_par_list)
res_df = pd.concat([res_df, best_par_df], axis=1, sort=False)

# confusion matrix for train set
print('\n')
print('confusion matrix for train set')
print(confusion_matrix(ytrain, gs_xgb.predict(xtrain)))

# confusion matrix for test set
print('\n')
print('confusion matrix for test set')
print(confusion_matrix(ytest, gs_xgb.predict(xtest)))

res_df

Fitting 5 folds for each of 1296 candidates, totalling 6480 fits
Wall time: 6h 33min 54s


confusion matrix for train set
[[69593 38117]
 [33923 73855]]


confusion matrix for test set
[[17079  9891]
 [ 8794 18108]]


Unnamed: 0,TrainScores,TestScores,TrainRMSE,TestRMSE,DiffRMSE,booster,colsample_bytree,eval_metric,gamma,learning_rate,max_depth,min_child_weight,n_estimators,n_jobs,objective,predictor,random_state,subsample,tree_method,use_label_encoder
0,0.665689,0.653159,0.578196,0.588932,-0.010735,gbtree,0.5,auc,1.75,0.025,7,11,500,-1,binary:logistic,gpu_predictor,0,0.8,gpu_hist,False


In [5]:
xgb_clsfr = xgb.XGBClassifier()

train_scores = []
test_scores = []
train_rmse = []
test_rmse = []
best_par_list = []

objective_ = ['binary:logistic']
evalmetric = ['auc']
uselabelencoder = [False]
treemethod = ['gpu_hist']
predictor_ = ['gpu_predictor']
njobs = [-1]
randomstate = [state]
booster_ = ['gbtree']

nestimators = [500, 750, 1000]
minchildweight = [11]
maxdepth = [7]
learningrate = [0.02, 0.025, 0.0275]
gamma_ = [1.7, 1.75, 1.8]
subsample_ = [0.79, 0.8, 0.81]
colsamplebytree = [0.3, 0.4, 0.5, 0.6]


gparam_xgb = {'objective': objective_,
                 'eval_metric': evalmetric,
                 'use_label_encoder': uselabelencoder,
                 'tree_method': treemethod,
                 'predictor': predictor_,
                 'n_jobs': njobs,
                 'booster': booster_,
                 'n_estimators': nestimators,
                 'min_child_weight': minchildweight,
                 'max_depth': maxdepth,
                 'learning_rate': learningrate,
                 'gamma': gamma_,
                 'subsample': subsample_,
                 'colsample_bytree': colsamplebytree,
                 'random_state': randomstate}


gs_xgb = ms.GridSearchCV(xgb_clsfr, gparam_xgb, cv=n_folds, refit=True,
                          scoring='roc_auc', return_train_score=True, verbose=1)

%time gs_xgb.fit(xtrain, ytrain)

# setting up dataframe for results
train_scores.append(gs_xgb.best_estimator_.score(xtrain, ytrain))
test_scores.append(gs_xgb.best_estimator_.score(xtest, ytest))


# use rmse function from Self_Written_Functions_Sheet_Recover
train_rmse.append(rmse(gs_xgb, ytrain, xtrain))
test_rmse.append(rmse(gs_xgb, ytest, xtest))

# add the best parameters to the df
best_par_list.append(gs_xgb.best_params_)

# find the difference btwn the rmses
diff_rmse = np.subtract(train_rmse, test_rmse)

# create dataframe
list_results = [train_scores, test_scores, train_rmse, test_rmse, diff_rmse]
res_df = pd.DataFrame(list_results).T
res_df.columns = ['TrainScores', 'TestScores', 'TrainRMSE', 'TestRMSE', 'DiffRMSE']
best_par_df = pd.DataFrame(best_par_list)
res_df = pd.concat([res_df, best_par_df], axis=1, sort=False)

# confusion matrix for train set
print('\n')
print('confusion matrix for train set')
print(confusion_matrix(ytrain, gs_xgb.predict(xtrain)))

# confusion matrix for test set
print('\n')
print('confusion matrix for test set')
print(confusion_matrix(ytest, gs_xgb.predict(xtest)))

res_df

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Wall time: 1h 49min 11s


confusion matrix for train set
[[69693 38017]
 [33853 73925]]


confusion matrix for test set
[[17100  9870]
 [ 8794 18108]]


Unnamed: 0,TrainScores,TestScores,TrainRMSE,TestRMSE,DiffRMSE,booster,colsample_bytree,eval_metric,gamma,learning_rate,max_depth,min_child_weight,n_estimators,n_jobs,objective,predictor,random_state,subsample,tree_method,use_label_encoder
0,0.666478,0.653549,0.577514,0.588601,-0.011087,gbtree,0.5,auc,1.8,0.02,7,11,750,-1,binary:logistic,gpu_predictor,0,0.81,gpu_hist,False
