In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv('credit-card-default.csv')

In [26]:
df.shape

(30000, 25)

In [3]:
pd.set_option('display.max_columns',30)

In [4]:
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,defaulted
0,1,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
ID           30000 non-null int64
LIMIT_BAL    30000 non-null int64
SEX          30000 non-null int64
EDUCATION    30000 non-null int64
MARRIAGE     30000 non-null int64
AGE          30000 non-null int64
PAY_0        30000 non-null int64
PAY_2        30000 non-null int64
PAY_3        30000 non-null int64
PAY_4        30000 non-null int64
PAY_5        30000 non-null int64
PAY_6        30000 non-null int64
BILL_AMT1    30000 non-null int64
BILL_AMT2    30000 non-null int64
BILL_AMT3    30000 non-null int64
BILL_AMT4    30000 non-null int64
BILL_AMT5    30000 non-null int64
BILL_AMT6    30000 non-null int64
PAY_AMT1     30000 non-null int64
PAY_AMT2     30000 non-null int64
PAY_AMT3     30000 non-null int64
PAY_AMT4     30000 non-null int64
PAY_AMT5     30000 non-null int64
PAY_AMT6     30000 non-null int64
defaulted    30000 non-null int64
dtypes: int64(25)
memory usage: 5.7 MB


In [6]:
from sklearn.model_selection import train_test_split


In [7]:
x = df.drop('defaulted', axis  =1)
y = df['defaulted']

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 100)

In [9]:
from sklearn.ensemble import RandomForestClassifier

In [10]:
rfc = RandomForestClassifier()

In [11]:
rfc.fit(x_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [12]:
predictions = rfc.predict(x_test)

In [13]:
predictions

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [14]:
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score

In [15]:
print('Classification Report', classification_report(y_test, predictions))

Classification Report               precision    recall  f1-score   support

           0       0.83      0.95      0.88      6927
           1       0.65      0.34      0.45      2073

   micro avg       0.81      0.81      0.81      9000
   macro avg       0.74      0.64      0.67      9000
weighted avg       0.79      0.81      0.78      9000



In [16]:
print('Confusion Matrix \n', confusion_matrix(y_test, predictions))

Confusion Matrix 
 [[6547  380]
 [1360  713]]


In [17]:
print('Accuracy Score', accuracy_score(y_test, predictions))

Accuracy Score 0.8066666666666666


In [18]:
# Hyperparameter Tuning

In [19]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [100, 250, 300, 500],
    'max_features': [5,10,15,20]
}
# create a based model
rf = RandomForestClassifier()

# GridSearch model
grid_search = GridSearchCV(estimator = rf,
                          param_grid= param_grid,
                          cv = 3,
                          n_jobs= -1,
                          verbose = 1)

In [20]:
grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed: 20.8min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [100, 250, 300, 500], 'max_features': [5, 10, 15, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [21]:
print('Accuracy', grid_search.best_score_, 'best parameteres', grid_search.best_params_)

Accuracy 0.8181904761904762 best parameteres {'max_features': 15, 'n_estimators': 500}


max_features: 10
n_estimators: 250a


In [22]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(bootstrap=True,
                             max_features=10,
                             n_estimators=250)

In [23]:
rfc.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=10, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [24]:
y_pred = rfc.predict(x_test)

In [25]:
y_pred

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [29]:
print('Classification Report \n', classification_report(y_test, y_pred))

Classification Report 
               precision    recall  f1-score   support

           0       0.84      0.95      0.89      6927
           1       0.69      0.39      0.50      2073

   micro avg       0.82      0.82      0.82      9000
   macro avg       0.76      0.67      0.69      9000
weighted avg       0.80      0.82      0.80      9000



In [28]:
print('Confusion Matrix \n', confusion_matrix(y_test, y_pred))

Confusion Matrix 
 [[6558  369]
 [1267  806]]


In [31]:
print('Accuracy is ', accuracy_score(y_test, y_pred))

Accuracy is  0.8182222222222222
