In [1]:
# Import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import scipy
import matplotlib.pyplot as plt

# Import estimators
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Import model metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import cross_val_score

# Import cross validation
from sklearn.cross_validation import train_test_split



In [2]:
#import data
rawData = pd.read_csv('default of credit card clients.csv', header=1)
rawData.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [3]:
#remove ID 
rawData.drop('ID', axis = 1, inplace=True)
#rename 'default payment next month' to DEFAULT
rawData.rename(columns={'default payment next month':'DEFAULT'}, inplace = True)
rawData.columns

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'DEFAULT'],
      dtype='object')

In [6]:
rawData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 24 columns):
LIMIT_BAL    30000 non-null int64
SEX          30000 non-null int64
EDUCATION    30000 non-null int64
MARRIAGE     30000 non-null int64
AGE          30000 non-null int64
PAY_0        30000 non-null int64
PAY_2        30000 non-null int64
PAY_3        30000 non-null int64
PAY_4        30000 non-null int64
PAY_5        30000 non-null int64
PAY_6        30000 non-null int64
BILL_AMT1    30000 non-null int64
BILL_AMT2    30000 non-null int64
BILL_AMT3    30000 non-null int64
BILL_AMT4    30000 non-null int64
BILL_AMT5    30000 non-null int64
BILL_AMT6    30000 non-null int64
PAY_AMT1     30000 non-null int64
PAY_AMT2     30000 non-null int64
PAY_AMT3     30000 non-null int64
PAY_AMT4     30000 non-null int64
PAY_AMT5     30000 non-null int64
PAY_AMT6     30000 non-null int64
DEFAULT      30000 non-null int64
dtypes: int64(24)
memory usage: 5.5 MB


In [4]:
#combine values of 'others' in education
x = {1:1, 2:2, 3:3, 0:4,5:4,6:4}
rawData.EDUCATION = rawData.EDUCATION.replace(x)
rawData.EDUCATION.describe()

count    30000.000000
mean         1.842267
std          0.744494
min          1.000000
25%          1.000000
50%          2.000000
75%          2.000000
max          4.000000
Name: EDUCATION, dtype: float64

In [8]:
#drop other value of in marriage
rawData = rawData[rawData.MARRIAGE != 0]

In [9]:
rawData.groupby('MARRIAGE')['MARRIAGE'].count()

MARRIAGE
1    13659
2    15964
3      323
Name: MARRIAGE, dtype: int64

In [5]:
rawData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 24 columns):
LIMIT_BAL    30000 non-null int64
SEX          30000 non-null int64
EDUCATION    30000 non-null int64
MARRIAGE     30000 non-null int64
AGE          30000 non-null int64
PAY_0        30000 non-null int64
PAY_2        30000 non-null int64
PAY_3        30000 non-null int64
PAY_4        30000 non-null int64
PAY_5        30000 non-null int64
PAY_6        30000 non-null int64
BILL_AMT1    30000 non-null int64
BILL_AMT2    30000 non-null int64
BILL_AMT3    30000 non-null int64
BILL_AMT4    30000 non-null int64
BILL_AMT5    30000 non-null int64
BILL_AMT6    30000 non-null int64
PAY_AMT1     30000 non-null int64
PAY_AMT2     30000 non-null int64
PAY_AMT3     30000 non-null int64
PAY_AMT4     30000 non-null int64
PAY_AMT5     30000 non-null int64
PAY_AMT6     30000 non-null int64
DEFAULT      30000 non-null int64
dtypes: int64(24)
memory usage: 5.5 MB


In [11]:
rawData.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [6]:
#remove limit balance values higher than 6k
rawData = rawData[rawData['LIMIT_BAL']< 600000]

In [7]:
rawData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29905 entries, 0 to 29999
Data columns (total 24 columns):
LIMIT_BAL    29905 non-null int64
SEX          29905 non-null int64
EDUCATION    29905 non-null int64
MARRIAGE     29905 non-null int64
AGE          29905 non-null int64
PAY_0        29905 non-null int64
PAY_2        29905 non-null int64
PAY_3        29905 non-null int64
PAY_4        29905 non-null int64
PAY_5        29905 non-null int64
PAY_6        29905 non-null int64
BILL_AMT1    29905 non-null int64
BILL_AMT2    29905 non-null int64
BILL_AMT3    29905 non-null int64
BILL_AMT4    29905 non-null int64
BILL_AMT5    29905 non-null int64
BILL_AMT6    29905 non-null int64
PAY_AMT1     29905 non-null int64
PAY_AMT2     29905 non-null int64
PAY_AMT3     29905 non-null int64
PAY_AMT4     29905 non-null int64
PAY_AMT5     29905 non-null int64
PAY_AMT6     29905 non-null int64
DEFAULT      29905 non-null int64
dtypes: int64(24)
memory usage: 5.7 MB


In [8]:
#features
features = rawData.iloc[:,0:23]
print('Summary of feature sample')
features.head()

Summary of feature sample


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,20000,2,2,1,24,2,2,-1,-1,-2,...,689,0,0,0,0,689,0,0,0,0
1,120000,2,2,2,26,-1,2,0,0,0,...,2682,3272,3455,3261,0,1000,1000,1000,0,2000
2,90000,2,2,2,34,0,0,0,0,0,...,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000
3,50000,2,2,1,37,0,0,0,0,0,...,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000
4,50000,1,2,1,57,-1,0,-1,0,0,...,35835,20940,19146,19131,2000,36681,10000,9000,689,679


In [9]:
#dependent variable
depVar = rawData['DEFAULT']
depVar.head()

0    1
1    1
2    0
3    0
4    0
Name: DEFAULT, dtype: int64

In [10]:
#Training Set (Feature Space: X Training)
X_train = (features[: 15000])
X_train.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,20000,2,2,1,24,2,2,-1,-1,-2,...,689,0,0,0,0,689,0,0,0,0
1,120000,2,2,2,26,-1,2,0,0,0,...,2682,3272,3455,3261,0,1000,1000,1000,0,2000
2,90000,2,2,2,34,0,0,0,0,0,...,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000
3,50000,2,2,1,37,0,0,0,0,0,...,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000
4,50000,1,2,1,57,-1,0,-1,0,0,...,35835,20940,19146,19131,2000,36681,10000,9000,689,679


In [11]:
#Dependent Variable Training Set (y Training)
y_train = depVar[: 15000]
y_train_count = len(y_train.index)
print('The number of observations in the Y training set are:',str(y_train_count))
y_train.head()

The number of observations in the Y training set are: 15000


0    1
1    1
2    0
3    0
4    0
Name: DEFAULT, dtype: int64

In [12]:
#Testing Set (X Testing)
X_test = features[-4500:]
X_test_count = len(X_test.index)
print('The number of observations in the feature testing set is:',str(X_test_count))
print(X_test.head())

The number of observations in the feature testing set is: 4500
       LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  PAY_4  \
25483     100000    2          2         1   28      1     -2     -2     -2   
25484      70000    2          3         2   31      2      0      0      0   
25485     250000    2          1         2   31      0      0      0      0   
25487     320000    2          2         1   31      0      0      0      0   
25488     180000    2          2         2   31      0      0      0      0   

       PAY_5    ...     BILL_AMT3  BILL_AMT4  BILL_AMT5  BILL_AMT6  PAY_AMT1  \
25483     -2    ...             0          0          0          0         0   
25484      0    ...         69835      66216      30000      30096      3200   
25485      0    ...         67448      68165      68787      70111      3500   
25487      0    ...          4529       5548       6450       7342      1100   
25488      0    ...         88542      92055      94038      9

In [13]:
#Ground Truth (y_test) 
y_test = depVar[-4500:]
y_test_count = len(y_test.index)
print('The number of observations in the Y training set are:',str(y_test_count))
y_test.head()

The number of observations in the Y training set are: 4500


25483    1
25484    1
25485    0
25487    0
25488    0
Name: DEFAULT, dtype: int64

In [14]:
#cross validation
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = .30, random_state = 0)
X_train.shape, X_test.shape

((10500, 23), (4500, 23))

In [15]:
# Models
modelRF = RandomForestClassifier()
modelKNN = KNeighborsClassifier()
modelSVC = SVC()
modelGBC = GradientBoostingClassifier()

In [16]:
print('Summary of feature sample')
features.head()

Summary of feature sample


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,20000,2,2,1,24,2,2,-1,-1,-2,...,689,0,0,0,0,689,0,0,0,0
1,120000,2,2,2,26,-1,2,0,0,0,...,2682,3272,3455,3261,0,1000,1000,1000,0,2000
2,90000,2,2,2,34,0,0,0,0,0,...,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000
3,50000,2,2,1,37,0,0,0,0,0,...,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000
4,50000,1,2,1,57,-1,0,-1,0,0,...,35835,20940,19146,19131,2000,36681,10000,9000,689,679


In [17]:
#dependent variable
depVar[0:5,]

0    1
1    1
2    0
3    0
4    0
Name: DEFAULT, dtype: int64

In [18]:
#fit models
modelRF.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [19]:
modelKNN.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [20]:
modelSVC.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [21]:
modelGBC.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [22]:
#model fitting and scoring RF
modelRF.fit(X_train, y_train)
print(cross_val_score(modelRF, X_train, y_train))
modelRF.score(X_train, y_train)

[0.78457143 0.78514286 0.79314286]


0.9780952380952381

In [23]:
#model fitting and scoring KNN
modelKNN.fit(X_train, y_train)
print(cross_val_score(modelKNN, X_train, y_train))
modelKNN.score(X_train, y_train)

[0.73257143 0.74028571 0.72971429]


0.805047619047619

In [24]:
#model fitting and scoring SVC
modelSVC.fit(X_train, y_train)
print(cross_val_score(modelSVC, X_train, y_train))
modelSVC.score(X_train, y_train)

[0.76828571 0.77028571 0.76971429]


0.9941904761904762

In [25]:
#model fitting and scoring GBC
modelGBC.fit(X_train, y_train)
print(cross_val_score(modelSVC, X_train, y_train))
modelGBC.score(X_train, y_train)

[0.76828571 0.77028571 0.76971429]


0.818

In [26]:
# Make Predictions RF
predictions = modelRF.predict(X_test)
predAccuracy = accuracy_score(y_test, predictions)
predKappa = cohen_kappa_score(y_test, predictions)
print('Accuracy: %.3f' % predAccuracy)
print('Kappa: %.3f' % predKappa)

Accuracy: 0.794
Kappa: 0.270


In [27]:
# Make Predictions KNN
predictions = modelKNN.predict(X_test)
predAccuracy = accuracy_score(y_test, predictions)
predKappa = cohen_kappa_score(y_test, predictions)
print('Accuracy: %.3f' % predAccuracy)
print('Kappa: %.3f' % predKappa)

Accuracy: 0.746
Kappa: 0.094


In [28]:
# Make Predictions SVC
predictions = modelSVC.predict(X_test)
predAccuracy = accuracy_score(y_test, predictions)
predKappa = cohen_kappa_score(y_test, predictions)
print('Accuracy: %.3f' % predAccuracy)
print('Kappa: %.3f' % predKappa)

Accuracy: 0.786
Kappa: 0.021


In [29]:
# Make Predictions GBC
predictions = modelGBC.predict(X_test)
predAccuracy = accuracy_score(y_test, predictions)
predKappa = cohen_kappa_score(y_test, predictions)
print('Accuracy: %.3f' % predAccuracy)
print('Kappa: %.3f' % predKappa)

Accuracy: 0.820
Kappa: 0.361
