# Credit Card Risk Assessment
Dataset : https://www.kaggle.com/tmchls/credit-card-risk-assessment

In [1]:
# standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# configurations: seaborn theme plus generous pandas display limits
sns.set_style("darkgrid")
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, 2500)

In [3]:
# suppress harmless warnings
import warnings
warnings.filterwarnings("ignore")

In [4]:
# read the credit-default records from disk into a DataFrame
DATASET_CSV = "Credit_default_dataset.csv"
df = pd.read_csv(DATASET_CSV)

In [5]:
# peek at the first five records
df.head(n=5)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [6]:
# peek at the last five records
df.tail(n=5)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
29995,29996,220000.0,1,3,1,39,0,0,0,0,0,0,188948.0,192815.0,208365.0,88004.0,31237.0,15980.0,8500.0,20000.0,5003.0,3047.0,5000.0,1000.0,0
29996,29997,150000.0,1,3,2,43,-1,-1,-1,-1,0,0,1683.0,1828.0,3502.0,8979.0,5190.0,0.0,1837.0,3526.0,8998.0,129.0,0.0,0.0,0
29997,29998,30000.0,1,2,2,37,4,3,2,-1,0,0,3565.0,3356.0,2758.0,20878.0,20582.0,19357.0,0.0,0.0,22000.0,4200.0,2000.0,3100.0,1
29998,29999,80000.0,1,3,1,41,1,-1,0,0,0,-1,-1645.0,78379.0,76304.0,52774.0,11855.0,48944.0,85900.0,3409.0,1178.0,1926.0,52964.0,1804.0,1
29999,30000,50000.0,1,2,1,46,0,0,0,0,0,0,47929.0,48905.0,49764.0,36535.0,32428.0,15313.0,2078.0,1800.0,1430.0,1000.0,1000.0,1000.0,1


In [7]:
# shape of the loaded frame: row count first, then column count
print(f"# Rows : {df.shape[0]}")
print(f"# Cols : {df.shape[1]}")

# Rows : 30000
# Cols : 25


In [8]:
# metadata - features: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
ID                            30000 non-null int64
LIMIT_BAL                     30000 non-null float64
SEX                           30000 non-null int64
EDUCATION                     30000 non-null int64
MARRIAGE                      30000 non-null int64
AGE                           30000 non-null int64
PAY_0                         30000 non-null int64
PAY_2                         30000 non-null int64
PAY_3                         30000 non-null int64
PAY_4                         30000 non-null int64
PAY_5                         30000 non-null int64
PAY_6                         30000 non-null int64
BILL_AMT1                     30000 non-null float64
BILL_AMT2                     30000 non-null float64
BILL_AMT3                     30000 non-null float64
BILL_AMT4                     30000 non-null float64
BILL_AMT5                     30000 non-null float64
BILL_AMT6   

In [9]:
# count missing entries in every column
df.isna().sum()

ID                            0
LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default.payment.next.month    0
dtype: int64

- no missing values

- `ID` is a unique identifier. We can drop it.

In [10]:
# drop feature 'ID'
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell safe to re-run once 'ID' has already been removed.
df = df.drop(columns=["ID"], errors="ignore")

- `PAY_0` breaks the naming sequence of the other repayment-status features (`PAY_2` … `PAY_6`), so we rename it to `PAY_1`.

In [11]:
# rename feature 'PAY_0' to 'PAY_1' so it lines up with 'PAY_2' ... 'PAY_6'
df = df.rename(columns={"PAY_0": "PAY_1"})

In [12]:
# explore feature 'EDUCATION': how many records carry each category code
df['EDUCATION'].value_counts()

2    14030
1    10585
3     4917
5      280
4      123
6       51
0       14
Name: EDUCATION, dtype: int64

- `EDUCATION` is a categorical feature that depicts different levels of education for every person in the record
    - 0 : N/A
    - 1 : School
    - 2 : High School
    - 3 : College
    - 4 : University
    - 5 : N/A
    - 6 : N/A
    - Information for codes 0, 5 & 6 is unavailable. According to the data source, they belong to the University level; therefore, we will map 0, 5 & 6 to 4.

In [13]:
# map 0, 5 & 6 to 4 in feature 'EDUCATION'
# replace() touches only the listed codes; map() with a dict would silently
# turn any code missing from the dict into NaN (and the column into floats).
df['EDUCATION'] = df['EDUCATION'].replace({0: 4, 5: 4, 6: 4})

`MARRIAGE`

In [14]:
# explore feature 'MARRIAGE': how many records carry each category code
df['MARRIAGE'].value_counts()

2    15964
1    13659
3      323
0       54
Name: MARRIAGE, dtype: int64

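- code 0 is undocumented; we will map it to 3, which is the "others" category (in the original UCI data dictionary 1 = married, 2 = single, 3 = others)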

In [15]:
# map 0 to 3 in feature 'MARRIAGE'
# replace() leaves every other code as it is; map() with a dict would silently
# turn any code missing from the dict into NaN (and the column into floats).
df['MARRIAGE'] = df['MARRIAGE'].replace({0: 3})

In [16]:
# shape after the cleaning steps above: row count first, then column count
print(f"# Rows : {df.shape[0]}")
print(f"# Cols : {df.shape[1]}")

# Rows : 30000
# Cols : 24


In [17]:
# metadata - stats: count, mean, std, min, quartiles and max for each column
df.describe()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,167484.322667,1.603733,1.842267,1.557267,35.4855,-0.0167,-0.133767,-0.1662,-0.220667,-0.2662,-0.2911,51223.3309,49179.075167,47013.15,43262.948967,40311.400967,38871.7604,5663.5805,5921.163,5225.6815,4826.076867,4799.387633,5215.502567,0.2212
std,129747.661567,0.489129,0.744494,0.521405,9.217904,1.123802,1.197186,1.196868,1.169139,1.133187,1.149988,73635.860576,71173.768783,69349.39,64332.856134,60797.15577,59554.107537,16563.280354,23040.87,17606.96147,15666.159744,15278.305679,17777.465775,0.415062
min,10000.0,1.0,1.0,1.0,21.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-165580.0,-69777.0,-157264.0,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3558.75,2984.75,2666.25,2326.75,1763.0,1256.0,1000.0,833.0,390.0,296.0,252.5,117.75,0.0
50%,140000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,22381.5,21200.0,20088.5,19052.0,18104.5,17071.0,2100.0,2009.0,1800.0,1500.0,1500.0,1500.0,0.0
75%,240000.0,2.0,2.0,2.0,41.0,0.0,0.0,0.0,0.0,0.0,0.0,67091.0,64006.25,60164.75,54506.0,50190.5,49198.25,5006.0,5000.0,4505.0,4013.25,4031.5,4000.0,0.0
max,1000000.0,2.0,4.0,3.0,79.0,8.0,8.0,8.0,8.0,8.0,8.0,964511.0,983931.0,1664089.0,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0,1.0


- we can see that different features are at different scale ranges
- this needs to be treated with feature scaling

In [18]:
# split dependent and independent features

# dependent feature (the target column)
y = df['default.payment.next.month']
# independent features (every remaining column)
X = df.drop(columns=['default.payment.next.month'])

In [19]:
# feature scaler: StandardScaler with its default settings spelled out
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [20]:
# feature scaling: learn the per-column statistics, then apply them
# NOTE(review): the scaler is fitted on all rows before cross-validation - confirm this is intended
X = scaler.fit(X).transform(X)

In [21]:
# candidate values for each XGBoost hyper parameter explored by the search
params = dict(
    learning_rate=[0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    max_depth=[3, 4, 5, 6, 8, 10, 12, 15],
    min_child_weight=[1, 3, 5, 7],
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4],
    colsample_bytree=[0.3, 0.4, 0.5, 0.7],
)

In [22]:
# hyper parameter optimization with Randomized Search CV
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [27]:
# initialize RandomizedSearchCV object
# The base estimator is created in this cell because no `classifier` exists yet
# at this point of a top-to-bottom run (it would raise NameError otherwise).
# random_state pins which parameter combinations are sampled, so the search
# gives the same candidates on every run.
import xgboost
classifier = xgboost.XGBClassifier()
random_search = RandomizedSearchCV(
    classifier,
    param_distributions=params,
    n_iter=5,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    verbose=3,
    random_state=42,
)

In [31]:
# import datetime
from datetime import datetime

# UDF to calculate the time taken by RandomizedSearchCV for Hyper Parameter Optimization
def timer(start_time):
    """Start a stopwatch or report the time elapsed since it was started.

    Parameters
    ----------
    start_time : datetime or None
        Falsy value (e.g. ``None``) to start timing; otherwise the datetime
        previously returned by this function.

    Returns
    -------
    datetime or None
        The current time when called with a falsy ``start_time``; ``None``
        after printing the elapsed hours, minutes and seconds.
    """
    if not start_time:
        return datetime.now()

    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    hours, remainder = divmod(elapsed_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    print("\nTime Taken : %i hours %i minutes and %s seconds" % (hours, minutes, round(seconds, 2)))

In [32]:
# import XGBoost and create a classifier with the library's default settings
import xgboost
classifier = xgboost.XGBClassifier()

In [33]:
# run the randomized search on X, y and report how long it took
start_time = timer(None)  # start timer from now
random_search.fit(X,y)
timer(start_time)  # prints the elapsed hours, minutes and seconds

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:   37.2s remaining:   11.7s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   39.0s finished



Time Taken : 0 hours 0 minutes and 43.03 seconds


In [34]:
# find the best parameters: the winning estimator, shown with all of its settings
random_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.3, gamma=0.1,
              learning_rate=0.05, max_delta_step=0, max_depth=4,
              min_child_weight=5, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [35]:
# only the searched parameters of the winning candidate
random_search.best_params_

{'min_child_weight': 5,
 'max_depth': 4,
 'learning_rate': 0.05,
 'gamma': 0.1,
 'colsample_bytree': 0.3}

In [37]:
# initialize new object of XGB classifier from the tuned parameters
# Reading the values from the search keeps this cell in sync whenever the
# search is re-run, and avoids hard-coding constructor arguments copied from a
# printed repr (several of which are legacy arguments). random_state=0 is kept
# from the original call; everything else falls back to the library defaults.
classifier = xgboost.XGBClassifier(random_state=0, **random_search.best_params_)

## Model Evaluation

In [38]:
# 10-fold cross evaluation of the tuned classifier, scored by accuracy
from sklearn.model_selection import cross_val_score
score = cross_val_score(estimator=classifier, X=X, y=y, scoring="accuracy", cv=10)

In [39]:
# scores for all cross evaluation folds (one value per fold)
score

array([0.80033333, 0.807     , 0.80733333, 0.80766667, 0.816     ,
       0.82333333, 0.82766667, 0.821     , 0.82266667, 0.82066667])

In [40]:
# overall accuracy: average of the per-fold scores
np.mean(score)

0.8153666666666666