### Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle as cPickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
import numpy as np
import joblib

### Loading dataset

In [2]:
dataset_path ='/kaggle/input/lending-club/accepted_2007_to_2018Q4.csv.gz'
df_accepted = pd.read_csv(dataset_path,low_memory=False,usecols=['loan_amnt', 'term', 'int_rate', 'installment', 'grade', 'sub_grade',
                         'annual_inc', 'verification_status', 'loan_status','dti', 'earliest_cr_line', 'open_acc', 'pub_rec', 'revol_bal',
                          'revol_util', 'total_acc'])  

df_accepted.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,annual_inc,verification_status,loan_status,dti,earliest_cr_line,open_acc,pub_rec,revol_bal,revol_util,total_acc
0,3600.0,36 months,13.99,123.03,C,C4,55000.0,Not Verified,Fully Paid,5.91,Aug-2003,7.0,0.0,2765.0,29.7,13.0
1,24700.0,36 months,11.99,820.28,C,C1,65000.0,Not Verified,Fully Paid,16.06,Dec-1999,22.0,0.0,21470.0,19.2,38.0
2,20000.0,60 months,10.78,432.66,B,B4,63000.0,Not Verified,Fully Paid,10.78,Aug-2000,6.0,0.0,7869.0,56.2,18.0
3,35000.0,60 months,14.85,829.9,C,C5,110000.0,Source Verified,Current,17.06,Sep-2008,13.0,0.0,7802.0,11.6,17.0
4,10400.0,60 months,22.45,289.91,F,F1,104433.0,Source Verified,Fully Paid,25.37,Jun-1998,12.0,0.0,21929.0,64.5,35.0


In [3]:
df_accepted.dtypes

loan_amnt              float64
term                    object
int_rate               float64
installment            float64
grade                   object
sub_grade               object
annual_inc             float64
verification_status     object
loan_status             object
dti                    float64
earliest_cr_line        object
open_acc               float64
pub_rec                float64
revol_bal              float64
revol_util             float64
total_acc              float64
dtype: object

### Changing loan_status value to 'Paid' and 'Default'

In [4]:
df_accepted["loan_status"].value_counts(dropna = True)

Fully Paid                                             1076751
Current                                                 878317
Charged Off                                             268559
Late (31-120 days)                                       21467
In Grace Period                                           8436
Late (16-30 days)                                         4349
Does not meet the credit policy. Status:Fully Paid        1988
Does not meet the credit policy. Status:Charged Off        761
Default                                                     40
Name: loan_status, dtype: int64

In [5]:
replace_status = {"Fully Paid":"Paid",
                  "Current": "Paid",
                  "Charged Off": "Default",
                  "Does not meet the credit policy. Status:Charged Off":"Default",
                  "Does not meet the credit policy. Status:Charged Off":"Default",
                  "Does not meet the credit policy. Status:Fully Paid":"Paid",
                  "Late (31-120 days)":"Default",
                  "Late (16-30 days)":"Default",
                  "In Grace Period":"Default",
                  "Default":"Default"
                  }

In [6]:
df_accepted["loan_status"] = df_accepted["loan_status"].replace(replace_status)

In [7]:
df_accepted["loan_status"].value_counts(dropna= True)

Paid       1957056
Default     303612
Name: loan_status, dtype: int64

In [8]:
print(' The Lenght of the Data:',df_accepted.shape)

 The Lenght of the Data: (2260701, 16)


### Categorical and numerical values

In [9]:
categorical=[feature for feature in df_accepted.columns if df_accepted[feature].dtype=='object']
numerical=[feature for feature in df_accepted.columns if feature not in categorical]

In [10]:
print("Categorical columns:",categorical)
print("Numerical columns:",numerical)

Categorical columns: ['term', 'grade', 'sub_grade', 'verification_status', 'loan_status', 'earliest_cr_line']
Numerical columns: ['loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc']


### Dataset cleaning and preprocessing 

In [11]:
df_accepted = df_accepted.dropna(subset=['term'])
df_accepted = df_accepted.dropna(subset=['dti'])
df_accepted = df_accepted.dropna(subset=['earliest_cr_line'])
df_accepted['revol_util']=df_accepted['revol_util'].fillna(df_accepted['revol_util'].median())

In [12]:
df_accepted.isnull().sum()

loan_amnt              0
term                   0
int_rate               0
installment            0
grade                  0
sub_grade              0
annual_inc             0
verification_status    0
loan_status            0
dti                    0
earliest_cr_line       0
open_acc               0
pub_rec                0
revol_bal              0
revol_util             0
total_acc              0
dtype: int64

In [13]:
df_accepted['earliest_cr_line'] = df_accepted.earliest_cr_line.str.split('-', expand=True)[1]
df_accepted['earliest_cr_line']= df_accepted['earliest_cr_line'].astype('int64')

In [14]:
print("Before:",df_accepted['term'].unique())
df_accepted['term'] = df_accepted['term'].map(lambda x: x.lstrip(' ').rstrip(' months'))
print("After:",df_accepted['term'].unique())
df_accepted['term']= df_accepted['term'].astype('int64')

Before: [' 36 months' ' 60 months']
After: ['36' '60']


In [15]:
df_accepted['grade']= df_accepted['grade'].map({'A':0,'B':1, 'C':2, 'D':3, 'E':4, 'F':5, 'G':6})

df_accepted['sub_grade']= df_accepted['sub_grade'].map({'A1': 0, 'A2': 1, 'A3': 2, 'A4': 3, 'A5': 4, 
                                      'B1': 5, 'B2': 6, 'B3': 7, 'B4': 8, 'B5': 9, 
                                      'C1': 10, 'C2': 11, 'C3': 12, 'C4': 13, 'C5': 14, 
                                      'D1': 15, 'D2': 16, 'D3': 17, 'D4': 18, 'D5': 19, 
                                      'E1': 20, 'E2': 21, 'E3': 22, 'E4': 23, 'E5': 24, 
                                      'F1': 25, 'F2': 26, 'F3': 27, 'F4': 28, 'F5': 29, 
                                      'G1': 30, 'G2': 31, 'G3': 32, 'G4': 33, 'G5': 34})


df_accepted['loan_status']= df_accepted['loan_status'].map({'Paid':0, 'Default':1})

df_accepted['verification_status']= df_accepted['verification_status'].map({'Not Verified':0, 'Source Verified':1, 'Verified': 2})

In [16]:
df_accepted.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2258928 entries, 0 to 2260698
Data columns (total 16 columns):
 #   Column               Dtype  
---  ------               -----  
 0   loan_amnt            float64
 1   term                 int64  
 2   int_rate             float64
 3   installment          float64
 4   grade                int64  
 5   sub_grade            int64  
 6   annual_inc           float64
 7   verification_status  int64  
 8   loan_status          int64  
 9   dti                  float64
 10  earliest_cr_line     int64  
 11  open_acc             float64
 12  pub_rec              float64
 13  revol_bal            float64
 14  revol_util           float64
 15  total_acc            float64
dtypes: float64(10), int64(6)
memory usage: 293.0 MB


### Data splitting

In [17]:
X=df_accepted.drop('loan_status', axis=1)
y=df_accepted.loan_status

In [18]:
X.columns

Index(['loan_amnt', 'term', 'int_rate', 'installment', 'grade', 'sub_grade',
       'annual_inc', 'verification_status', 'dti', 'earliest_cr_line',
       'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc'],
      dtype='object')

### Undersampling

In [19]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0, replacement=True)

X, y = rus.fit_resample(X, y)

### Data splitting for test and train subsets

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(485600, 15)
(121400, 15)
(485600,)
(121400,)


In [21]:
X_train= np.array(X_train).astype(np.float32)
X_test=np.array(X_test).astype(np.float32)
y_train=np.array(y_train).astype(np.float32)
y_test=np.array(y_test).astype(np.float32)

### Function for getting the results 

In [22]:
def print_score(true, pred, train=True):
    if train:
        clf_report = pd.DataFrame(classification_report(true, pred, output_dict=True))
        print("Train Result:")
        print(f"Accuracy Score: {accuracy_score(true, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(true, pred)}\n")
        
    elif train==False:
        clf_report = pd.DataFrame(classification_report(true, pred, output_dict=True))
        print("Test Result:")        
        print(f"Accuracy Score: {accuracy_score(true, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(true, pred)}\n")

### Model training

In [23]:
rf = RandomForestClassifier(n_estimators= 100,max_depth = 110,min_samples_leaf= 2)
clf_rf = rf.fit(X_train, y_train)

y_train_pred = clf_rf.predict(X_train)
y_test_pred = clf_rf.predict(X_test)

print_score(y_train, y_train_pred, train=True)
print_score(y_test, y_test_pred, train=False)

Train Result:
Accuracy Score: 99.47%
_______________________________________________
CLASSIFICATION REPORT:
                     0.0            1.0  accuracy      macro avg  \
precision       0.993460       0.995962  0.994708       0.994711   
recall          0.995972       0.993443  0.994708       0.994708   
f1-score        0.994714       0.994701  0.994708       0.994708   
support    242800.000000  242800.000000  0.994708  485600.000000   

            weighted avg  
precision       0.994711  
recall          0.994708  
f1-score        0.994708  
support    485600.000000  
_______________________________________________
Confusion Matrix: 
 [[241822    978]
 [  1592 241208]]

Test Result:
Accuracy Score: 68.09%
_______________________________________________
CLASSIFICATION REPORT:
                    0.0           1.0  accuracy      macro avg   weighted avg
precision      0.686158      0.676020  0.680947       0.681089       0.681089
recall         0.666952      0.694942  0.680947  

In [24]:
rf = RandomForestClassifier(max_depth=10)
clf_rf = rf.fit(X_train, y_train)

y_train_pred = clf_rf.predict(X_train)
y_test_pred = clf_rf.predict(X_test)

print_score(y_train, y_train_pred, train=True)
print_score(y_test, y_test_pred, train=False)

Train Result:
Accuracy Score: 66.30%
_______________________________________________
CLASSIFICATION REPORT:
                     0.0            1.0  accuracy      macro avg  \
precision       0.679135       0.649560  0.663017       0.664347   
recall          0.618027       0.708007  0.663017       0.663017   
f1-score        0.647142       0.677525  0.663017       0.662333   
support    242800.000000  242800.000000  0.663017  485600.000000   

            weighted avg  
precision       0.664347  
recall          0.663017  
f1-score        0.662333  
support    485600.000000  
_______________________________________________
Confusion Matrix: 
 [[150057  92743]
 [ 70896 171904]]

Test Result:
Accuracy Score: 65.15%
_______________________________________________
CLASSIFICATION REPORT:
                    0.0           1.0  accuracy      macro avg   weighted avg
precision      0.667000      0.638673  0.651524       0.652836       0.652836
recall         0.605189      0.697858  0.651524  

### Predict probabilities

In [25]:
probabilities = clf_rf.predict_proba(X_test[62].reshape(1, -1))

for i, probs in enumerate(probabilities):
    class_0_prob = probs[0]
    class_1_prob = probs[1]
    print(f"Sample {i+1}: Class 0 probability: {class_0_prob*100:.2f}%, Class 1 probability: {class_1_prob*100:.2f}%")

Sample 1: Class 0 probability: 33.04%, Class 1 probability: 66.96%


### Saving model

In [26]:
import os
import zipfile

In [27]:
with open('/kaggle/working/random_forest_model.pkl', 'wb') as f:
    cPickle.dump(clf_rf, f)
print(f"Random Forest: {np.round(os.path.getsize('/kaggle/working/random_forest_model.pkl') / 1024 / 1024, 2) } MB")

Random Forest: 11.7 MB


In [28]:
with zipfile.ZipFile('random_forest_model.zip', 'w') as zipf:
    zipf.write('random_forest_model.pkl')