In [53]:
#import the necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,precision_score,accuracy_score,recall_score,roc_auc_score


In [4]:
data = pd.read_csv('loan_data.csv')
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
4,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y


In [5]:
data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,381.0,381.0,381.0,370.0,351.0
mean,3579.845144,1277.275381,104.986877,340.864865,0.837607
std,1419.813818,2340.818114,28.358464,68.549257,0.369338
min,150.0,0.0,9.0,12.0,0.0
25%,2600.0,0.0,90.0,360.0,1.0
50%,3333.0,983.0,110.0,360.0,1.0
75%,4288.0,2016.0,127.0,360.0,1.0
max,9703.0,33837.0,150.0,480.0,1.0


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            381 non-null    object 
 1   Gender             376 non-null    object 
 2   Married            381 non-null    object 
 3   Dependents         373 non-null    object 
 4   Education          381 non-null    object 
 5   Self_Employed      360 non-null    object 
 6   ApplicantIncome    381 non-null    int64  
 7   CoapplicantIncome  381 non-null    float64
 8   LoanAmount         381 non-null    float64
 9   Loan_Amount_Term   370 non-null    float64
 10  Credit_History     351 non-null    float64
 11  Property_Area      381 non-null    object 
 12  Loan_Status        381 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 38.8+ KB


In [7]:
data.isnull().sum()

Loan_ID               0
Gender                5
Married               0
Dependents            8
Education             0
Self_Employed        21
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     11
Credit_History       30
Property_Area         0
Loan_Status           0
dtype: int64

In [12]:
data.Gender.value_counts()

Gender
Male      291
Female     85
Name: count, dtype: int64

In [13]:
data.Dependents.value_counts()

Dependents
0     234
2      59
1      52
3+     28
Name: count, dtype: int64

In [14]:
data.Self_Employed.value_counts()

Self_Employed
No     325
Yes     35
Name: count, dtype: int64

In [17]:
data.Loan_Amount_Term.describe()

count    370.000000
mean     340.864865
std       68.549257
min       12.000000
25%      360.000000
50%      360.000000
75%      360.000000
max      480.000000
Name: Loan_Amount_Term, dtype: float64

In [18]:
data.Credit_History.describe()

count    351.000000
mean       0.837607
std        0.369338
min        0.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: Credit_History, dtype: float64

In [8]:
data.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [26]:
X = data[['Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area']]
y = data['Loan_Status']

In [27]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [29]:
numerical_features = [x for x in X.columns if X.dtypes[x] != 'object']
categorical_features = [x for x in X.columns if X.dtypes[x] == 'object']

In [24]:
numerical_features

['ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History']

In [30]:
categorical_features

['Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'Property_Area']

In [35]:
categorical_pipeline = Pipeline(
    steps=[
        ('imputation_mode',SimpleImputer(strategy='most_frequent')),
        ('encoder',OneHotEncoder(drop='first',sparse_output=False,dtype=np.int32))
        ]
)

numerical_pipeline = Pipeline(
    steps=[
        ('imputation_mean',SimpleImputer(strategy='mean')),
        ('scaler',StandardScaler())
    ]
)

preprocessor = ColumnTransformer([
    ('numerical_pipeline',numerical_pipeline,numerical_features),
    ('categorical_pipeline',categorical_pipeline,categorical_features)
])

In [51]:
preprocessor

In [37]:
scaled_X_train = preprocessor.fit_transform(X_train)
scaled_X_test = preprocessor.transform(X_test)

In [45]:
scaled_X_train[0]

array([-0.22563329,  0.53499125, -0.85688716,  0.27280857,  0.44991256,
        1.        ,  1.        ,  0.        ,  1.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  1.        ])

In [52]:
models = {
    'Logistic Regression':LogisticRegression(),
    'Decision Tree':DecisionTreeClassifier(),
    'Random Forest':RandomForestClassifier()
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(scaled_X_train,y_train)

    y_train_pred = model.predict(scaled_X_train)
    y_test_pred = model.predict(scaled_X_test)

    #training set performance
    model_train_accuracy = accuracy_score(y_train,y_train_pred)
    #model_train_f1 = f1_score(y_train,y_train_pred,average='weighted')
    #model_train_precision = precision_score(y_train,y_train_pred)
    #model_train_recall =recall_score(y_train,y_train_pred,average="binary", pos_label="neg")
    #model_train_roc_auc_score =roc_auc_score(y_train,y_train_pred)

    #test set performance
    model_test_accuracy = accuracy_score(y_test,y_test_pred)
    #model_test_f1 = f1_score(y_test,y_test_pred,average='weighted')
    #model_test_precision = precision_score(y_test,y_test_pred)
    #model_test_recall =recall_score(y_test,y_test_pred,average="binary", pos_label="neg")
    #model_test_roc_auc_score =roc_auc_score(y_test,y_test_pred)

    print(list(models.keys())[i])
    print('\n')

    print('Model Performance for training set')
    print('Accuracy : {.2f}',format(model_train_accuracy))
    #print('f1_score : {.2f}',format(model_train_f1))
    #print('Precision : {.2f}',format(model_train_precision))
    #print('Recall : {.2f}',format(model_train_recall))
    #print('roc_auc_score : {.2f}',format(model_train_roc_auc_score))


    print('-'*50)

    print('Model Test Performance')
    print('Accuracy : {.2f}',format(model_test_accuracy))
    #print('f1 : {.2f}',format(model_test_f1))
    #print('Precision : {.2f}',format(model_test_precision))
    #print('Recall : {.2f}',format(model_test_recall))
    #print('roc_auc_score : {.2f}',format(model_test_roc_auc_score))

    print('='*50)
    print('\n')

Logistic Regression


Model Performance for training set
Accuracy : {.2f} 0.8486842105263158
--------------------------------------------------
Model Test Performance
Accuracy : {.2f} 0.8441558441558441


Decision Tree


Model Performance for training set
Accuracy : {.2f} 1.0
--------------------------------------------------
Model Test Performance
Accuracy : {.2f} 0.7662337662337663


Random Forest


Model Performance for training set
Accuracy : {.2f} 1.0
--------------------------------------------------
Model Test Performance
Accuracy : {.2f} 0.8051948051948052




In [None]:
RandomForestClassifier()

In [56]:
param_grid = {'n_estimators':[50,100,150,200,300],
              'criterion':['gini','entropy','log_loss'],
              'max_depth':[1,3,5,8,None],
              'min_samples_split':[0.1,0.3,0.5,0.7,0.9],
              'min_samples_leaf':[1,2,3,4,5]
}
grid = GridSearchCV(RandomForestClassifier(),param_grid=param_grid,cv=5,verbose=True,n_jobs=-1,scoring='accuracy')

grid.fit(scaled_X_train,y_train)

Fitting 5 folds for each of 1875 candidates, totalling 9375 fits


In [57]:
grid.best_params_

{'criterion': 'log_loss',
 'max_depth': None,
 'min_samples_leaf': 4,
 'min_samples_split': 0.1,
 'n_estimators': 50}

In [59]:
best_model = RandomForestClassifier(criterion='log_loss',max_depth=None,min_samples_leaf=4,min_samples_split=0.1,
n_estimators=50)

In [60]:
best_model.fit(scaled_X_train,y_train)
y_pred = best_model.predict(scaled_X_test)
print('accuracy : ',accuracy_score(y_test,y_pred))

accuracy :  0.8441558441558441
