In [2]:
import numpy as np 
import pandas as pd
pd.set_option('display.max_rows', 800)
pd.set_option('display.max_columns', 500)

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# import all libraries and dependencies for machine learning
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV 

# Load the pre-processed data

In [3]:
df = pd.read_csv("loan_without_outliers.csv")

In [5]:
#remove the ids
df = df.drop(df.columns[0], axis=1)
df.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,2,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,2,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,2,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,1,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


# Model Building

In [7]:
# Train test split
X = df.drop(['not.fully.paid'], axis = 1)
y = df['not.fully.paid']

In [8]:
def random_forest(X,y,estimators,depth):
    
    # Train test split
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0)
    
    clf = RandomForestClassifier(n_estimators=estimators, max_depth=depth, random_state=0)
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print("Confusion Matrix : \n ",confusion_matrix(y_test, y_pred))
    print("\n Accuracy Score : \n ",accuracy_score(y_test,y_pred))
    print("\n Classification Report : \n",classification_report(y_test, y_pred))

## Random Forest

- n_estimators = 50
- max_depth    = 5

In [9]:
random_forest(X,y,50,5)

Confusion Matrix : 
  [[2106    0]
 [ 379    0]]

 Accuracy Score : 
  0.8474849094567405

 Classification Report : 
               precision    recall  f1-score   support

           0       0.85      1.00      0.92      2106
           1       0.00      0.00      0.00       379

    accuracy                           0.85      2485
   macro avg       0.42      0.50      0.46      2485
weighted avg       0.72      0.85      0.78      2485



  _warn_prf(average, modifier, msg_start, len(result))


## Random Forest

- n_estimators = 100
- max depth = 10


In [10]:
random_forest(X,y,100,10)

Confusion Matrix : 
  [[2104    2]
 [ 378    1]]

 Accuracy Score : 
  0.8470824949698189

 Classification Report : 
               precision    recall  f1-score   support

           0       0.85      1.00      0.92      2106
           1       0.33      0.00      0.01       379

    accuracy                           0.85      2485
   macro avg       0.59      0.50      0.46      2485
weighted avg       0.77      0.85      0.78      2485



# Using over-sampling techniques to balanced the classes

Here, I have imbalanced classes, the class 1 is much lower than class 0. Therefore, one solution is to increase instances by oversampling the minority class

In [11]:
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN

## Testing  Naive random over-sampling (RandomOverSampler) method to create syntetic data

In [12]:
ros = RandomOverSampler(random_state=0)
X_ros, y_ros = ros.fit_sample(X, y)

# Checking how many instances are in each class after balancing it
print(y_ros.value_counts())

1    7043
0    7043
Name: not.fully.paid, dtype: int64


In [13]:
random_forest(X_ros,y_ros,50,5)

Confusion Matrix : 
  [[1505  565]
 [ 942 1214]]

 Accuracy Score : 
  0.6433980123047799

 Classification Report : 
               precision    recall  f1-score   support

           0       0.62      0.73      0.67      2070
           1       0.68      0.56      0.62      2156

    accuracy                           0.64      4226
   macro avg       0.65      0.65      0.64      4226
weighted avg       0.65      0.64      0.64      4226



In [14]:
random_forest(X_ros,y_ros,100,10)

Confusion Matrix : 
  [[1656  414]
 [ 418 1738]]

 Accuracy Score : 
  0.8031235210601041

 Classification Report : 
               precision    recall  f1-score   support

           0       0.80      0.80      0.80      2070
           1       0.81      0.81      0.81      2156

    accuracy                           0.80      4226
   macro avg       0.80      0.80      0.80      4226
weighted avg       0.80      0.80      0.80      4226



## Testing Synthetic Minority Oversampling Technique (SMOTE) method to create syntetic data

In [15]:
X_smote, y_smote = SMOTE(random_state = 0).fit_resample(X, y)

In [16]:
random_forest(X_smote,y_smote,50,5)

Confusion Matrix : 
  [[1480  590]
 [ 602 1554]]

 Accuracy Score : 
  0.7179365830572646

 Classification Report : 
               precision    recall  f1-score   support

           0       0.71      0.71      0.71      2070
           1       0.72      0.72      0.72      2156

    accuracy                           0.72      4226
   macro avg       0.72      0.72      0.72      4226
weighted avg       0.72      0.72      0.72      4226



In [17]:
random_forest(X_smote,y_smote,100,10)

Confusion Matrix : 
  [[1604  466]
 [ 466 1690]]

 Accuracy Score : 
  0.779460482725982

 Classification Report : 
               precision    recall  f1-score   support

           0       0.77      0.77      0.77      2070
           1       0.78      0.78      0.78      2156

    accuracy                           0.78      4226
   macro avg       0.78      0.78      0.78      4226
weighted avg       0.78      0.78      0.78      4226



## Testing Adaptive Synthetic (ADASYN) method to create syntetic data

In [18]:
X_ADASYN, y_ADASYN = ADASYN(random_state = 0).fit_resample(X, y)

In [19]:
random_forest(X_ADASYN,y_ADASYN,50,5)

Confusion Matrix : 
  [[1422  665]
 [ 580 1520]]

 Accuracy Score : 
  0.7026510628134702

 Classification Report : 
               precision    recall  f1-score   support

           0       0.71      0.68      0.70      2087
           1       0.70      0.72      0.71      2100

    accuracy                           0.70      4187
   macro avg       0.70      0.70      0.70      4187
weighted avg       0.70      0.70      0.70      4187



In [20]:
random_forest(X_ADASYN,y_ADASYN,100,10)

Confusion Matrix : 
  [[1599  488]
 [ 539 1561]]

 Accuracy Score : 
  0.7547169811320755

 Classification Report : 
               precision    recall  f1-score   support

           0       0.75      0.77      0.76      2087
           1       0.76      0.74      0.75      2100

    accuracy                           0.75      4187
   macro avg       0.75      0.75      0.75      4187
weighted avg       0.75      0.75      0.75      4187



# Defining the best parameters to the classifier through Grid Search method

As we could see the best performance with balanced data was with  Naive random over-sampling (RandomOverSampler) method, therefore, I will use it to find the parameters

In [21]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X_ros,y_ros,test_size=0.3,random_state=0)

## Grid Search
- https://towardsdatascience.com/how-to-tune-a-decision-tree-f03721801680

In [23]:
rfclassifier = RandomForestClassifier()

# defining parameter range 
param_grid = { 'max_depth'   : [10,20,40],
               'n_estimators' : [100,200,300]
             }  
  
grid = GridSearchCV(rfclassifier, param_grid, cv = 10) 
  
# fitting the model for grid search 
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [10, 20, 40],
                         'n_estimators': [100, 200, 300]})

In [24]:
# Best parameter after hyper parameter tuning 
print(grid.best_params_) 
  
# Moel Parameters 
print(grid.best_estimator_)

{'max_depth': 40, 'n_estimators': 300}
RandomForestClassifier(max_depth=40, n_estimators=300)


In [25]:
# Prediction using best parameters
grid_predictions = grid.predict(X_test) 
  
# print classification report 
print("Confusion Matrix : \n ",confusion_matrix(y_test, grid_predictions))
print("\n Accuracy Score : \n ",accuracy_score(y_test,grid_predictions))
print("\n Classification Report : \n",classification_report(y_test, grid_predictions))

Confusion Matrix : 
  [[2005   65]
 [  32 2124]]

 Accuracy Score : 
  0.9770468528159015

 Classification Report : 
               precision    recall  f1-score   support

           0       0.98      0.97      0.98      2070
           1       0.97      0.99      0.98      2156

    accuracy                           0.98      4226
   macro avg       0.98      0.98      0.98      4226
weighted avg       0.98      0.98      0.98      4226

