In [1]:
#Imports
import numpy as np
import pandas as pd
import scipy 
import matplotlib.pyplot as plt
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer

#Feature selection
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE

#Cross-validation
from sklearn.model_selection import train_test_split

#Estimators
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.linear_model import LogisticRegression

#Model metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import cross_val_score

In [2]:
#Data
rawData = pd.read_csv('cleaned_credit_data.csv')
rawData.head()

Unnamed: 0,limit,sex,education,marriage,age,pay_hist_sep,pay_hist_aug,pay_hist_jul,pay_hist_jun,pay_hist_may,...,bill_amt_jun,bill_amt_may,bill_amt_apr,pay_amt_sep,pay_amt_aug,pay_amt_jul,pay_amt_jun,pay_amt_may,pay_amt_apr,default
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [3]:
# Make copy for cleaned data
cleanData = rawData.copy()

In [4]:
#Preprocessing
#Recode education levels to combine 0,4,5,6 values as 0
cleanData.education = cleanData.education.replace(to_replace=[4,5,6], value=0)

In [5]:
#Discretize age
est = KBinsDiscretizer(n_bins=6, encode='ordinal', strategy='uniform')
t = est.fit_transform(cleanData[['age']])
cleanData['age'] = t
cleanData.describe()

Unnamed: 0,limit,sex,education,marriage,age,pay_hist_sep,pay_hist_aug,pay_hist_jul,pay_hist_jun,pay_hist_may,...,bill_amt_jun,bill_amt_may,bill_amt_apr,pay_amt_sep,pay_amt_aug,pay_amt_jul,pay_amt_jun,pay_amt_may,pay_amt_apr,default
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,167484.322667,1.603733,1.779867,1.551867,1.010167,-0.0167,-0.133767,-0.1662,-0.220667,-0.2662,...,43262.948967,40311.400967,38871.7604,5663.5805,5921.163,5225.6815,4826.076867,4799.387633,5215.502567,0.2212
std,129747.661567,0.489129,0.728486,0.52197,0.98649,1.123802,1.197186,1.196868,1.169139,1.133187,...,64332.856134,60797.15577,59554.107537,16563.280354,23040.87,17606.96147,15666.159744,15278.305679,17777.465775,0.415062
min,10000.0,1.0,0.0,0.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,50000.0,1.0,1.0,1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,2326.75,1763.0,1256.0,1000.0,833.0,390.0,296.0,252.5,117.75,0.0
50%,140000.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,19052.0,18104.5,17071.0,2100.0,2009.0,1800.0,1500.0,1500.0,1500.0,0.0
75%,240000.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,...,54506.0,50190.5,49198.25,5006.0,5000.0,4505.0,4013.25,4031.5,4000.0,0.0
max,1000000.0,2.0,3.0,3.0,5.0,8.0,8.0,8.0,8.0,8.0,...,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0,1.0


In [6]:
#Discretize credit limit
est2 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
u = est2.fit_transform(cleanData[['limit']])
cleanData['limit'] = u
cleanData.describe()

Unnamed: 0,limit,sex,education,marriage,age,pay_hist_sep,pay_hist_aug,pay_hist_jul,pay_hist_jun,pay_hist_may,...,bill_amt_jun,bill_amt_may,bill_amt_apr,pay_amt_sep,pay_amt_aug,pay_amt_jul,pay_amt_jun,pay_amt_may,pay_amt_apr,default
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,1.1267,1.603733,1.779867,1.551867,1.010167,-0.0167,-0.133767,-0.1662,-0.220667,-0.2662,...,43262.948967,40311.400967,38871.7604,5663.5805,5921.163,5225.6815,4826.076867,4799.387633,5215.502567,0.2212
std,1.248772,0.489129,0.728486,0.52197,0.98649,1.123802,1.197186,1.196868,1.169139,1.133187,...,64332.856134,60797.15577,59554.107537,16563.280354,23040.87,17606.96147,15666.159744,15278.305679,17777.465775,0.415062
min,0.0,1.0,0.0,0.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,1.0,1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,2326.75,1763.0,1256.0,1000.0,833.0,390.0,296.0,252.5,117.75,0.0
50%,1.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,19052.0,18104.5,17071.0,2100.0,2009.0,1800.0,1500.0,1500.0,1500.0,0.0
75%,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,...,54506.0,50190.5,49198.25,5006.0,5000.0,4505.0,4013.25,4031.5,4000.0,0.0
max,9.0,2.0,3.0,3.0,5.0,8.0,8.0,8.0,8.0,8.0,...,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0,1.0


In [7]:
#Define features for full data set (no features removed)
full_feat = cleanData.iloc[:, 0:23]
print('Summary of feature sample')
full_feat.head()

Summary of feature sample


Unnamed: 0,limit,sex,education,marriage,age,pay_hist_sep,pay_hist_aug,pay_hist_jul,pay_hist_jun,pay_hist_may,...,bill_amt_jul,bill_amt_jun,bill_amt_may,bill_amt_apr,pay_amt_sep,pay_amt_aug,pay_amt_jul,pay_amt_jun,pay_amt_may,pay_amt_apr
0,0.0,2,2,1,0.0,2,2,-1,-1,-2,...,689,0,0,0,0,689,0,0,0,0
1,1.0,2,2,2,0.0,-1,2,0,0,0,...,2682,3272,3455,3261,0,1000,1000,1000,0,2000
2,0.0,2,2,2,1.0,0,0,0,0,0,...,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000
3,0.0,2,2,1,1.0,0,0,0,0,0,...,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000
4,0.0,1,2,1,3.0,-1,0,-1,0,0,...,35835,20940,19146,19131,2000,36681,10000,9000,689,679


In [8]:
#Define dependent variable
depVar = cleanData['default']

In [9]:
#Cross validation
X_train, X_test, y_train, y_test = train_test_split(full_feat, depVar, test_size = 0.25, random_state=77)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(22500, 23) (22500,)
(7500, 23) (7500,)


In [40]:
#SVC full model fit
modelSVC = SVC()
modelSVC.fit(X_train, y_train)
print(cross_val_score(modelSVC, X_train, y_train))
svc_accuracy = modelSVC.score(X_train, y_train)
print(svc_accuracy)

[0.77777778 0.77844444 0.77844444 0.77844444 0.77844444]
0.7787555555555555


In [41]:
#RF full model fit
modelRF = RandomForestClassifier()
modelRF.fit(X_train, y_train)
print(cross_val_score(modelRF, X_train, y_train))
rf_accuracy = modelRF.score(X_train, y_train)
print(rf_accuracy)

[0.80755556 0.82088889 0.822      0.81866667 0.81977778]
0.9945333333333334


In [42]:
#KNN full model fit
modelKNN = KNeighborsClassifier()
modelKNN.fit(X_train, y_train)
print(cross_val_score(modelKNN, X_train, y_train))
knn_accuracy = modelKNN.score(X_train, y_train)
print(knn_accuracy)

[0.76355556 0.76244444 0.76044444 0.75511111 0.76088889]
0.8170222222222222


In [10]:
# Remove highly correlated features
cor_feat = set()
cor_mat = full_feat.corr()

for i in range(len(cor_mat.columns)):
    for j in range(i):
        if abs(cor_mat.iloc[i, j]) > 0.85:
              colname = cor_mat.columns[i]
              cor_feat.add(colname)
print(cor_feat)
uncor_feat = full_feat.drop(cor_feat, axis=1)
uncor_feat.head()

{'bill_amt_apr', 'bill_amt_aug', 'bill_amt_jun', 'bill_amt_may', 'bill_amt_jul'}


Unnamed: 0,limit,sex,education,marriage,age,pay_hist_sep,pay_hist_aug,pay_hist_jul,pay_hist_jun,pay_hist_may,pay_hist_apr,bill_amt_sep,pay_amt_sep,pay_amt_aug,pay_amt_jul,pay_amt_jun,pay_amt_may,pay_amt_apr
0,0.0,2,2,1,0.0,2,2,-1,-1,-2,-2,3913,0,689,0,0,0,0
1,1.0,2,2,2,0.0,-1,2,0,0,0,2,2682,0,1000,1000,1000,0,2000
2,0.0,2,2,2,1.0,0,0,0,0,0,0,29239,1518,1500,1000,1000,1000,5000
3,0.0,2,2,1,1.0,0,0,0,0,0,0,46990,2000,2019,1200,1100,1069,1000
4,0.0,1,2,1,3.0,-1,0,-1,0,0,0,8617,2000,36681,10000,9000,689,679


In [11]:
#Cross validation for uncorrelated dataset
X_train_uncor, X_test_uncor, y_train_uncor, y_test_uncor = train_test_split(uncor_feat, depVar, test_size = 0.25, random_state=77)
print(X_train_uncor.shape, y_train_uncor.shape)
print(X_test_uncor.shape, y_test_uncor.shape)

(22500, 18) (22500,)
(7500, 18) (7500,)


In [47]:
#Build models with uncorrelated dataset
#SVC model fit
uncor_modelSVC = SVC()
uncor_modelSVC.fit(X_train_uncor, y_train_uncor)
print(cross_val_score(uncor_modelSVC, X_train_uncor, y_train_uncor))
uncor_svc_accuracy = uncor_modelSVC.score(X_train_uncor, y_train_uncor)
print(uncor_svc_accuracy)

[0.77777778 0.77844444 0.77844444 0.77844444 0.77822222]
0.7786222222222222


In [49]:
#RF uncor model fit
uncor_modelRF = RandomForestClassifier()
uncor_modelRF.fit(X_train_uncor, y_train_uncor)
print(cross_val_score(uncor_modelRF, X_train_uncor, y_train_uncor))
uncor_rf_accuracy = uncor_modelRF.score(X_train_uncor, y_train_uncor)
print(uncor_rf_accuracy)

[0.81066667 0.81422222 0.82       0.81688889 0.82111111]
0.9944444444444445


In [50]:
#KNN uncor model fit
uncor_modelKNN = KNeighborsClassifier()
uncor_modelKNN.fit(X_train_uncor, y_train_uncor)
print(cross_val_score(uncor_modelKNN, X_train_uncor, y_train_uncor))
uncor_knn_accuracy = uncor_modelKNN.score(X_train_uncor, y_train_uncor)
print(uncor_knn_accuracy)

[0.76711111 0.76488889 0.75755556 0.75888889 0.75911111]
0.8186666666666667


In [12]:
# RFE dataset
modelTREE = DecisionTreeClassifier()
rfe = RFE(modelTREE, 12)
fit = rfe.fit(uncor_feat, depVar)
print("Num Features:",fit.n_features_)
print("Selected Features:",fit.support_)
print("Feature Ranking: ",fit.ranking_)

Num Features: 12
Selected Features: [ True False  True False  True  True  True False False False False  True
  True  True  True  True  True  True]
Feature Ranking:  [1 5 1 2 1 1 1 4 7 6 3 1 1 1 1 1 1 1]


In [13]:
rfe_feat = uncor_feat.iloc[:, [0,2,4,5,6,11,12,13,14,15,16,17]]
print(rfe_feat.head())

   limit  education  age  pay_hist_sep  pay_hist_aug  bill_amt_sep  \
0    0.0          2  0.0             2             2          3913   
1    1.0          2  0.0            -1             2          2682   
2    0.0          2  1.0             0             0         29239   
3    0.0          2  1.0             0             0         46990   
4    0.0          2  3.0            -1             0          8617   

   pay_amt_sep  pay_amt_aug  pay_amt_jul  pay_amt_jun  pay_amt_may  \
0            0          689            0            0            0   
1            0         1000         1000         1000            0   
2         1518         1500         1000         1000         1000   
3         2000         2019         1200         1100         1069   
4         2000        36681        10000         9000          689   

   pay_amt_apr  
0            0  
1         2000  
2         5000  
3         1000  
4          679  


In [14]:
#Cross validation for RFE dataset
X_train_rfe, X_test_rfe, y_train_rfe, y_test_rfe = train_test_split(rfe_feat, depVar, test_size = 0.25, random_state=77)
print(X_train_rfe.shape, y_train_rfe.shape)
print(X_test_rfe.shape, y_test_rfe.shape)

(22500, 12) (22500,)
(7500, 12) (7500,)


In [58]:
#Build models with RFE dataset
#SVC model fit
rfe_modelSVC = SVC()
rfe_modelSVC.fit(X_train_rfe, y_train_rfe)
print(cross_val_score(rfe_modelSVC, X_train_rfe, y_train_rfe))
rfe_svc_accuracy = rfe_modelSVC.score(X_train_rfe, y_train_rfe)
print(rfe_svc_accuracy)

[0.77777778 0.77844444 0.77844444 0.77844444 0.77822222]
0.7786222222222222


In [59]:
#RF model fit
rfe_modelRF = RandomForestClassifier()
rfe_modelRF.fit(X_train_rfe, y_train_rfe)
print(cross_val_score(rfe_modelRF, X_train_rfe, y_train_rfe))
rfe_rf_accuracy = rfe_modelRF.score(X_train_rfe, y_train_rfe)
print(rfe_rf_accuracy)

[0.81266667 0.81666667 0.82155556 0.81666667 0.81844444]
0.9922666666666666


In [64]:
#KNN model fit
rfe_modelKNN = KNeighborsClassifier()
rfe_modelKNN.fit(X_train_rfe, y_train_rfe)
print(cross_val_score(rfe_modelKNN, X_train_rfe, y_train_rfe))
rfe_knn_accuracy = rfe_modelKNN.score(X_train_rfe, y_train_rfe)
print(rfe_knn_accuracy)

[0.77244444 0.77288889 0.76088889 0.76844444 0.764      0.75777778
 0.76222222 0.76044444 0.75555556 0.76622222]
0.8177777777777778


In [72]:
#Tune KNN parameters
from sklearn.model_selection import GridSearchCV
rfe_knn2 = KNeighborsClassifier()
leaf_size = list(range(1,50))
n_neighbors = list(range(1, 30))
p=[1,2]
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)
knn_gscv = GridSearchCV(rfe_knn2, hyperparameters, cv=10)
best_model = knn_gscv.fit(X_train_rfe, y_train_rfe)

In [74]:
print('Best leaf_size:', best_model.best_estimator_.get_params()['leaf_size'])
print('Best p:', best_model.best_estimator_.get_params()['p'])
print('Best n_neighbors:', best_model.best_estimator_.get_params()['n_neighbors'])
print('Best accuracy:', best_model.best_score_)

Best leaf_size: 3
Best p: 1
Best n_neighbors: 18
Best accuracy: 0.7832888888888889


In [76]:
#Tune RF parameters
from sklearn.model_selection import RandomizedSearchCV
rfe_rf2 = RandomForestClassifier()

n_estimators = [100, 200, 300, 400, 500]
max_features = ['auto', 'sqrt', 'log2']
max_depth = [10, 20, 30, 40, 50]
max_depth.append(None)
min_samples_split = [2, 5, 10, 15, 20]
min_samples_leaf = [1, 2, 5, 10, 15]

grid_param = {'n_estimators':n_estimators, 
              'max_features':max_features, 
              'max_depth':max_depth, 
              'min_samples_split':min_samples_split, 
              'min_samples_leaf':min_samples_leaf}
rf_rscv = RandomizedSearchCV(rfe_rf2, grid_param, cv=10)
best_model_2 = rf_rscv.fit(X_train_rfe, y_train_rfe)

In [77]:
print('Best n_estimators:', best_model_2.best_estimator_.get_params()['n_estimators'])
print('Best max_features:', best_model_2.best_estimator_.get_params()['max_features'])
print('Best max_depth:', best_model_2.best_estimator_.get_params()['max_depth'])
print('Best min_samples_split:', best_model_2.best_estimator_.get_params()['min_samples_split'])
print('Best min_samples_leaf:', best_model_2.best_estimator_.get_params()['min_samples_leaf'])
print('Best accuracy:', best_model_2.best_score_)

Best n_estimators: 500
Best max_features: log2
Best max_depth: None
Best min_samples_split: 20
Best min_samples_leaf: 5
Best accuracy: 0.822088888888889


In [17]:
# Tune specific parameters
from sklearn.model_selection import RandomizedSearchCV
rfe_rf2 = RandomForestClassifier()

n_estimators = [500, 750, 1000, 1500, 2000, 2500]
max_features = ['log2']
min_samples_split = [20, 25, 50, 75, 100]
min_samples_leaf = [5]

grid_param2 = {'n_estimators':n_estimators, 
              'max_features':max_features, 
              'min_samples_split':min_samples_split, 
              'min_samples_leaf':min_samples_leaf}
rf_rscv2 = RandomizedSearchCV(rfe_rf2, grid_param2, cv=10)
best_model_3 = rf_rscv2.fit(X_train_rfe, y_train_rfe)

In [79]:
print('Best n_estimators:', best_model_3.best_estimator_.get_params()['n_estimators'])
print('Best max_features:', best_model_3.best_estimator_.get_params()['max_features'])
print('Best min_samples_split:', best_model_3.best_estimator_.get_params()['min_samples_split'])
print('Best min_samples_leaf:', best_model_3.best_estimator_.get_params()['min_samples_leaf'])
print('Best accuracy:', best_model_3.best_score_)

Best n_estimators: 2500
Best max_features: log2
Best min_samples_split: 25
Best min_samples_leaf: 5
Best accuracy: 0.8225777777777779


In [19]:
# Predict using RF model
rf_pred = best_model_3.predict(X_test_rfe)
print('Accuracy score:', accuracy_score(y_test_rfe, rf_pred))
print('Kappa score:', cohen_kappa_score(y_test_rfe, rf_pred))

Accuracy score: 0.8172
Kappa score: 0.35732768319207986


In [29]:
print('Number of customers predicted to default:', rf_pred.sum())
print('Percentage of customers predicted to default:', rf_pred.mean()*100)
print('Total number of customers:', len(rf_pred))

Number of customers predicted to default: 863
Percentage of customers predicted to default: 11.506666666666666
Total number of customers: 7500
