In [36]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
df=pd.read_csv("loandataset.csv")
df_copy=df.copy()

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [4]:
#statistical exploration
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [5]:
#number of missing values in each column
df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [6]:
# droping missing value
df=df.dropna()

In [7]:
df.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

## Unique Values

In [8]:
print("Married:",df["Married"].unique())
print("Gender:",df["Gender"].unique())
print("Dependents:",df["Dependents"].unique())
print("Education:",df["Education"].unique(),)
print("Self_Employed:",df["Self_Employed"].unique())
print("Property_Area:",df["Property_Area"].unique())
print("Loan_Status:",df["Loan_Status"].unique())


Married: ['Yes' 'No']
Gender: ['Male' 'Female']
Dependents: ['1' '0' '2' '3+']
Education: ['Graduate' 'Not Graduate']
Self_Employed: ['No' 'Yes']
Property_Area: ['Rural' 'Urban' 'Semiurban']
Loan_Status: ['N' 'Y']


In [9]:
## applying Label Encoding and One Hot Encoding
categorical_cols=["Married","Gender","Education","Self_Employed", "Loan_Status"]
LE=LabelEncoder()

In [10]:
df[categorical_cols]=df[categorical_cols].apply(lambda col:LE.fit_transform(col))
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,Rural,0
2,LP001005,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,Urban,1
3,LP001006,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,Urban,1
4,LP001008,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,Urban,1
5,LP001011,1,1,2,0,1,5417,4196.0,267.0,360.0,1.0,Urban,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,0,0,0,0,0,2900,0.0,71.0,360.0,1.0,Rural,1
610,LP002979,1,1,3+,0,0,4106,0.0,40.0,180.0,1.0,Rural,1
611,LP002983,1,1,1,0,0,8072,240.0,253.0,360.0,1.0,Urban,1
612,LP002984,1,1,2,0,0,7583,0.0,187.0,360.0,1.0,Urban,1


In [11]:
x=df.drop(columns=["Loan_ID","Loan_Status"], axis=1)
y=df["Loan_Status"]

In [12]:
x_encoded=pd.get_dummies(x,columns=["Dependents","Property_Area"])
x_encoded

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,1,0,0,1,0,0
2,1,1,0,1,3000,0.0,66.0,360.0,1.0,1,0,0,0,0,0,1
3,1,1,1,0,2583,2358.0,120.0,360.0,1.0,1,0,0,0,0,0,1
4,1,0,0,0,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,0,1
5,1,1,0,1,5417,4196.0,267.0,360.0,1.0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,2900,0.0,71.0,360.0,1.0,1,0,0,0,1,0,0
610,1,1,0,0,4106,0.0,40.0,180.0,1.0,0,0,0,1,1,0,0
611,1,1,0,0,8072,240.0,253.0,360.0,1.0,0,1,0,0,0,0,1
612,1,1,0,0,7583,0.0,187.0,360.0,1.0,0,0,1,0,0,0,1


In [13]:
x_train, x_test, y_train, y_test=train_test_split(x_encoded, y, test_size=0.3, random_state=42)
model=LogisticRegression()
model.fit(x_train, y_train)

In [14]:
y_testpred =model.predict(x_test)
y_trainpred =model.predict(x_train)

In [15]:
print(confusion_matrix(y_train, y_trainpred))
print(classification_report(y_train, y_trainpred))

[[ 51  53]
 [ 12 220]]
              precision    recall  f1-score   support

           0       0.81      0.49      0.61       104
           1       0.81      0.95      0.87       232

    accuracy                           0.81       336
   macro avg       0.81      0.72      0.74       336
weighted avg       0.81      0.81      0.79       336



In [16]:
print(confusion_matrix(y_test, y_testpred))
print(classification_report(y_test, y_testpred))

[[15 29]
 [ 1 99]]
              precision    recall  f1-score   support

           0       0.94      0.34      0.50        44
           1       0.77      0.99      0.87       100

    accuracy                           0.79       144
   macro avg       0.86      0.67      0.68       144
weighted avg       0.82      0.79      0.76       144



## Random Forest Classifer

In [17]:
RFC=RandomForestClassifier(random_state=42, criterion="gini")
RFC.fit(x_train, y_train)

In [18]:
y_pred_test_RFC=RFC.predict(x_test)
y_pred_train_RFC=RFC.predict(x_train)

In [19]:
print(confusion_matrix(y_test, y_pred_test_RFC))
print(classification_report(y_test, y_pred_test_RFC))

[[18 26]
 [ 2 98]]
              precision    recall  f1-score   support

           0       0.90      0.41      0.56        44
           1       0.79      0.98      0.87       100

    accuracy                           0.81       144
   macro avg       0.85      0.69      0.72       144
weighted avg       0.82      0.81      0.78       144



In [20]:
print(confusion_matrix(y_train, y_pred_train_RFC))
print(classification_report(y_train, y_pred_train_RFC))

[[104   0]
 [  0 232]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       104
           1       1.00      1.00      1.00       232

    accuracy                           1.00       336
   macro avg       1.00      1.00      1.00       336
weighted avg       1.00      1.00      1.00       336



In [21]:
## Hyperparameter tunning
param={
    "max_depth":[2,5,7,10],
    "min_samples_leaf":[2,5,7,10],
    "min_samples_split":[10,15,20,25]
}

In [22]:
grid_RFC=GridSearchCV(estimator=RFC, param_grid=param, cv=5, scoring="accuracy")
grid_RFC.fit(x_train, y_train)

In [23]:
grid_RFC.best_estimator_

In [24]:
## 2nd Hyperparameter tunning
param2={
    "max_depth":range(5,15),
    "min_samples_leaf":[2],
    "min_samples_split":range(20,30)
}

In [25]:
grid_RFC2=GridSearchCV(estimator=RFC, param_grid=param2, cv=5, scoring="accuracy")
grid_RFC2.fit(x_train, y_train)

In [26]:
grid_RFC2.best_estimator_

In [27]:
y_pred_test_RFC_grid_RFC2=grid_RFC2.predict(x_test)
y_pred_train_RFC_grid_RFC2=grid_RFC2.predict(x_train)

In [28]:
print(confusion_matrix(y_test, y_pred_test_RFC_grid_RFC2))
print(classification_report(y_test, y_pred_test_RFC_grid_RFC2))

[[15 29]
 [ 1 99]]
              precision    recall  f1-score   support

           0       0.94      0.34      0.50        44
           1       0.77      0.99      0.87       100

    accuracy                           0.79       144
   macro avg       0.86      0.67      0.68       144
weighted avg       0.82      0.79      0.76       144



In [29]:
print(confusion_matrix(y_train, y_pred_train_RFC_grid_RFC2))
print(classification_report(y_train, y_pred_train_RFC_grid_RFC2))

[[ 50  54]
 [  7 225]]
              precision    recall  f1-score   support

           0       0.88      0.48      0.62       104
           1       0.81      0.97      0.88       232

    accuracy                           0.82       336
   macro avg       0.84      0.73      0.75       336
weighted avg       0.83      0.82      0.80       336



## Gradient Boosting Classifier

In [37]:
gbc=GradientBoostingClassifier(random_state=42)
gbc.fit(x_train, y_train)

In [38]:
y_pred_test_gbc=gbc.predict(x_test)
y_pred_train_gbc=gbc.predict(x_train)

In [39]:
print(confusion_matrix(y_test, y_pred_test_gbc))
print(classification_report(y_test, y_pred_test_gbc))

[[15 29]
 [ 1 99]]
              precision    recall  f1-score   support

           0       0.94      0.34      0.50        44
           1       0.77      0.99      0.87       100

    accuracy                           0.79       144
   macro avg       0.86      0.67      0.68       144
weighted avg       0.82      0.79      0.76       144



In [40]:
print(confusion_matrix(y_train, y_pred_train_gbc))
print(classification_report(y_train, y_pred_train_gbc))

[[ 75  29]
 [  1 231]]
              precision    recall  f1-score   support

           0       0.99      0.72      0.83       104
           1       0.89      1.00      0.94       232

    accuracy                           0.91       336
   macro avg       0.94      0.86      0.89       336
weighted avg       0.92      0.91      0.91       336



In [41]:
## Hyperparameter tuning for Gradient Boosting
param={
    "max_depth":[2,5,10,15],
    "min_samples_leaf":[2,5,10,15],
    "n_estimators":[10,15,20,25],
    "learning_rate":[0.1,0.2,0.3,0.4]
}

In [43]:
grid_gbc=GridSearchCV(estimator=gbc, param_grid=param, cv=5, scoring="accuracy")
grid_gbc.fit(x_train, y_train)

In [47]:
grid_gbc.best_estimator_

In [50]:
## 2nd time Hyperparameter tuning for Gradient Boosting
param2={
    "max_depth":range(5,10),
    "min_samples_leaf":range(10,15),
    "n_estimators":range(10,20),
    "learning_rate":[0.4,0.5,0.6,0.7,0.8,0.9]
}

In [51]:
grid_gbc2=GridSearchCV(estimator=gbc, param_grid=param2, cv=5, scoring="accuracy")
grid_gbc2.fit(x_train, y_train)

In [52]:
grid_gbc2.best_estimator_

In [53]:
y_pred_test_grid_gbc2=grid_gbc.predict(x_test)
y_pred_train_grid_gbc2=grid_gbc.predict(x_train)

In [54]:
print(confusion_matrix(y_test, y_pred_test_grid_gbc2))
print(classification_report(y_test, y_pred_test_grid_gbc2))

[[ 14  30]
 [  0 100]]
              precision    recall  f1-score   support

           0       1.00      0.32      0.48        44
           1       0.77      1.00      0.87       100

    accuracy                           0.79       144
   macro avg       0.88      0.66      0.68       144
weighted avg       0.84      0.79      0.75       144



In [55]:
print(confusion_matrix(y_train, y_pred_train_grid_gbc2))
print(classification_report(y_train, y_pred_train_grid_gbc2))

[[ 49  55]
 [  3 229]]
              precision    recall  f1-score   support

           0       0.94      0.47      0.63       104
           1       0.81      0.99      0.89       232

    accuracy                           0.83       336
   macro avg       0.87      0.73      0.76       336
weighted avg       0.85      0.83      0.81       336

