In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

from matplotlib import pyplot as plt

from sklearn import model_selection as ms
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

import xgboost as xgb

import gzip

import copy

import import_ipynb
from My_Functions import null_cols, rmse

importing Jupyter notebook from My_Functions.ipynb


In [2]:
# Both inputs are gzip-compressed, comma-separated files with a header row,
# so they are parsed with one shared set of options.
gz_csv_opts = dict(compression='gzip', header=0, sep=',', quotechar='"')

# Feature table; it still contains 'total_pymnt' at this point.
acc_vred = pd.read_csv("acc_vred.csv.gz", **gz_csv_opts)

# Target table, paired with acc_vred in the train/test split further down.
target_2_train = pd.read_csv("target_2_train_feateng_dwnsmpl.csv.gz", **gz_csv_opts)

In [3]:
# Set 'total_pymnt' aside in its own single-column frame, then remove it from
# the feature table.
# NOTE: this cell is not idempotent - acc_vred is rebound without the column,
# so running the cell a second time raises a KeyError on 'total_pymnt'.
ttlpy_vred = acc_vred.loc[:, ['total_pymnt']].copy(deep=True)

acc_vred = acc_vred.drop(columns='total_pymnt')

# Display the remaining feature columns.
acc_vred

Unnamed: 0,annual_inc,fico_score,term,loan_amnt,home_ownership,int_rate,all_util,dti
0,0.008183,0.424628,0.0,0.036709,0.000000,0.276869,0.000000,0.00977
1,0.002681,0.360934,0.0,0.088608,0.000000,0.150312,0.000000,0.02428
2,0.003110,0.594480,0.0,0.367089,0.000000,0.221184,0.000000,0.03482
3,0.011819,0.212314,0.0,0.746835,0.666667,0.238707,0.462428,0.01900
4,0.007273,0.552017,0.0,0.402532,0.666667,0.084891,0.369942,0.01284
...,...,...,...,...,...,...,...,...
269355,0.005637,0.276008,1.0,0.594937,0.000000,0.291277,0.294798,0.01546
269356,0.004491,0.318471,1.0,0.351899,1.000000,0.221184,0.369942,0.01485
269357,0.011365,0.254777,1.0,0.797468,0.666667,0.338006,0.260116,0.03089
269358,0.008637,0.467091,1.0,0.908861,0.000000,0.357477,0.456647,0.02250


In [4]:
# One seed shared by the hold-out split, the CV folds and the model grids.
state = 0

# Standard shuffled 5-fold cross-validation used by every grid search below.
n_folds = ms.KFold(n_splits=5, shuffle=True, random_state=state)

# 80/20 hold-out split of the downsampled data.
xtrain, xtest, ytrain, ytest = ms.train_test_split(
    acc_vred,
    target_2_train,
    test_size=0.2,
    random_state=state,
)

# The training target must be 1-D to fit. Only ytrain is flattened here;
# ytest stays a DataFrame.
ytrain = ytrain.to_numpy().flatten()

## Downsampled Data

In [6]:
xgb_clsfr = xgb.XGBClassifier()

train_scores = []
test_scores = []
train_rmse = []
test_rmse = []
best_par_list = []

objective_ = ['binary:logistic']
evalmetric = ['auc']
uselabelencoder = [False]
treemethod = ['gpu_hist']
predictor_ = ['gpu_predictor']
njobs = [-1]
randomstate = [state]
booster_ = ['gbtree']

nestimators = [50, 100, 150, 200, 300]
minchildweight = [1, 5, 10]
maxdepth = [3, 4, 5, 6, 7]
learningrate = [0.01, 0.05, 0.1, 0.5, 1]
gamma_ = [0.5, 1, 1.5, 2, 5]
subsample_ = [0.6, 0.8, 1]
colsamplebytree = [0.6, 0.8, 1]


gparam_xgb = {'objective': objective_,
              'eval_metric': evalmetric,
              'use_label_encoder': uselabelencoder,
              'tree_method': treemethod,
              'predictor': predictor_,
              'n_jobs': njobs,
              'booster': booster_,
              'n_estimators': nestimators,
              'min_child_weight': minchildweight,
              'max_depth': maxdepth,
              'learning_rate': learningrate,
              'gamma': gamma_,
              'subsample': subsample_,
              'colsample_bytree': colsamplebytree,
              'random_state': randomstate}


gs_xgb = ms.GridSearchCV(xgb_clsfr, gparam_xgb, cv=n_folds, refit=True,
                          scoring='roc_auc', return_train_score=True, verbose=1)

%time gs_xgb.fit(xtrain, ytrain)

# setting up dataframe for results
train_scores.append(gs_xgb.best_estimator_.score(xtrain, ytrain))
test_scores.append(gs_xgb.best_estimator_.score(xtest, ytest))


# use rmse function from Self_Written_Functions_Sheet_Recover
train_rmse.append(rmse(gs_xgb, ytrain, xtrain))
test_rmse.append(rmse(gs_xgb, ytest, xtest))

# add the best parameters to the df
best_par_list.append(gs_xgb.best_params_)

# find the difference btwn the rmses
diff_rmse = np.subtract(train_rmse, test_rmse)

# create dataframe
list_results = [train_scores, test_scores, train_rmse, test_rmse, diff_rmse]
res_df = pd.DataFrame(list_results).T
res_df.columns = ['TrainScores', 'TestScores', 'TrainRMSE', 'TestRMSE', 'DiffRMSE']
best_par_df = pd.DataFrame(best_par_list)
res_df = pd.concat([res_df, best_par_df], axis=1, sort=False)

# confusion matrix for train set
print('\n')
print('confusion matrix for train set')
print(confusion_matrix(ytrain, gs_xgb.predict(xtrain)))

# confusion matrix for test set
print('\n')
print('confusion matrix for test set')
print(confusion_matrix(ytest, gs_xgb.predict(xtest)))

res_df

# to beat: 8779

Fitting 5 folds for each of 16875 candidates, totalling 84375 fits
Wall time: 16h 54min 11s


confusion matrix for train set
[[69571 38139]
 [33743 74035]]


confusion matrix for test set
[[17031  9939]
 [ 8780 18122]]


Unnamed: 0,TrainScores,TestScores,TrainRMSE,TestRMSE,DiffRMSE,booster,colsample_bytree,eval_metric,gamma,learning_rate,max_depth,min_child_weight,n_estimators,n_jobs,objective,predictor,random_state,subsample,tree_method,use_label_encoder
0,0.666422,0.652528,0.577562,0.589467,-0.011905,gbtree,0.6,auc,1,0.05,7,5,300,-1,binary:logistic,gpu_predictor,0,1,gpu_hist,False


In [None]:
log_clsfr = LogisticRegression(C=1e8, solver='liblinear',
                               class_weight='balanced',
                               max_iter=2000)

train_scores = []
test_scores = []
train_rmse = []
test_rmse = []
best_par_list = []

C_ = [1e8]
solver_ = ['liblinear']
classweight = ['balanced']
maxiter = [2000]
njobs = [-1]
randomstate = [state]



gparam_log = {'C': C_,
              'solver': solver_,
              'class_weight': classweight,
              'max_iter': maxiter,
              'n_jobs': njobs,
              'random_state': randomstate}


gs_log = ms.GridSearchCV(log_clsfr, gparam_log, cv=n_folds, refit=True,
                          scoring='roc_auc', return_train_score=True, verbose=1)

%time gs_log.fit(xtrain, ytrain)

# setting up dataframe for results
train_scores.append(gs_log.best_estimator_.score(xtrain, ytrain))
test_scores.append(gs_log.best_estimator_.score(xtest, ytest))


# use rmse function from Self_Written_Functions_Sheet_Recover
train_rmse.append(rmse(gs_log, ytrain, xtrain))
test_rmse.append(rmse(gs_log, ytest, xtest))

# add the best parameters to the df
best_par_list.append(gs_log.best_params_)

# find the difference btwn the rmses
diff_rmse = np.subtract(train_rmse, test_rmse)

# create dataframe
list_results = [train_scores, test_scores, train_rmse, test_rmse, diff_rmse]
res_df = pd.DataFrame(list_results).T
res_df.columns = ['TrainScores', 'TestScores', 'TrainRMSE', 'TestRMSE', 'DiffRMSE']
best_par_df = pd.DataFrame(best_par_list)
res_df = pd.concat([res_df, best_par_df], axis=1, sort=False)

# confusion matrix for train set
print('\n')
print('confusion matrix for train set')
print(confusion_matrix(ytrain, gs_log.predict(xtrain)))

# confusion matrix for test set
print('\n')
print('confusion matrix for test set')
print(confusion_matrix(ytest, gs_log.predict(xtest)))

res_df

# to beat: 8779