# Problem Statement:Loan Status Prediction

# Import the libraries

In [94]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the dataset

In [95]:
df =pd.read_csv('loan_prediction.csv')
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


# Descriptive statistics

In [96]:
df.shape

(614, 13)

In [97]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [98]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [99]:
df.isna().sum() # credit history column shows na values

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

# handle the missing values

In [100]:
df = df.drop('Loan_ID',axis=1)

In [101]:
#We drop missing rows of the column which are less than 5%
df.isnull().sum() * 100 /len(df)

Gender               2.117264
Married              0.488599
Dependents           2.442997
Education            0.000000
Self_Employed        5.211726
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           3.583062
Loan_Amount_Term     2.280130
Credit_History       8.143322
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

In [102]:
#based on above o/p we drop rows of columns :Gender,Dependents,LoanAmount,Loan_Amount_Term
columns = ['Gender','Dependents','LoanAmount','Loan_Amount_Term']
df = df.dropna(subset=columns) #dropna to drop the rows

In [103]:
#check
df.isnull().sum() * 100 /len(df)

Gender               0.000000
Married              0.000000
Dependents           0.000000
Education            0.000000
Self_Employed        5.424955
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           0.000000
Loan_Amount_Term     0.000000
Credit_History       8.679928
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

In [104]:
df['Self_Employed'].unique()

array(['No', 'Yes', nan], dtype=object)

In [105]:
df['Self_Employed'].mode() # since using mode ,we get that highest frequency of rows is "No" for 0

0    No
Name: Self_Employed, dtype: object

In [106]:
df['Self_Employed'].mode()[0] # since using mode ,we get that most frequent value in column is "No" at [0]

'No'

In [107]:
# fill the missing values for rows of the columns which are more than 5%
df['Self_Employed']=df['Self_Employed'].fillna(df['Self_Employed'].mode()[0]) # using the same mode value here to fill missing values

In [108]:
df['Credit_History'].unique()

array([ 1.,  0., nan])

In [109]:
df['Credit_History'].mode()

0    1.0
Name: Credit_History, dtype: float64

In [110]:
df['Credit_History'].mode()[0]

1.0

In [111]:
df['Credit_History']=df['Credit_History'].fillna(df['Credit_History'].mode()[0])

In [112]:
#check
df.isnull().sum() * 100 /len(df)

Gender               0.0
Married              0.0
Dependents           0.0
Education            0.0
Self_Employed        0.0
ApplicantIncome      0.0
CoapplicantIncome    0.0
LoanAmount           0.0
Loan_Amount_Term     0.0
Credit_History       0.0
Property_Area        0.0
Loan_Status          0.0
dtype: float64

# handling the categorical columns

In [113]:
#handling the categorical columns
df.sample(5) # gives random 5 samples

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
478,Male,Yes,1,Graduate,Yes,16667,2250.0,86.0,360.0,1.0,Semiurban,Y
496,Male,Yes,0,Not Graduate,No,2600,1700.0,107.0,360.0,1.0,Rural,Y
598,Male,Yes,0,Graduate,Yes,9963,0.0,180.0,360.0,1.0,Rural,Y
468,Female,Yes,2,Not Graduate,No,210,2917.0,98.0,360.0,1.0,Semiurban,Y
182,Male,Yes,0,Graduate,No,4600,0.0,73.0,180.0,1.0,Semiurban,Y


In [114]:
# we replace 3+ in 'dependents' column with 4
df['Dependents']=df['Dependents'].replace(to_replace="3+",value='4')

In [115]:
df['Dependents'].unique() #check

array(['1', '0', '2', '4'], dtype=object)

In [116]:
df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [117]:
df['Married'].unique()

array(['Yes', 'No'], dtype=object)

In [118]:
df['Education'].unique()

array(['Graduate', 'Not Graduate'], dtype=object)

In [119]:
df['Gender'].isna().sum()

0

In [120]:
df['Property_Area'].unique()

array(['Rural', 'Urban', 'Semiurban'], dtype=object)

In [121]:
df['Loan_Status'].unique()

array(['N', 'Y'], dtype=object)

In [122]:
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0}).astype('int')
df['Married'] = df['Married'].map({'Yes':1,'No':0}).astype('int')
df['Education']=df['Education'].map({'Graduate':1,'Not Graduate':0}).astype('int')
df['Property_Area'] = df['Property_Area'].map({'Semiurban': 2, 'Urban': 1, 'Rural': 0}).astype('int')
df['Loan_Status']=df['Loan_Status'].map({'Y':1,'N':0}).astype('int')
df['Self_Employed'] = df['Self_Employed'].map({'Yes':1,'No':0}).astype('int')

In [123]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,1,1
3,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,1,1
4,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,1,1
5,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,1,1


# Feature scaling

In [138]:
X= df.drop(['Loan_Status'],axis=1)
y=df['Loan_Status']

In [139]:
#We will include only those columns which are not in same range
cols= ['ApplicantIncome',	'CoapplicantIncome'	,'LoanAmount'	,'Loan_Amount_Term']
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X[cols]=sc.fit_transform(X[cols])

In [140]:
X

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,1,1,1,1,0,-0.128694,-0.049699,-0.214368,0.279961,1.0,0
2,1,1,0,1,1,-0.394296,-0.545638,-0.952675,0.279961,1.0,1
3,1,1,0,0,0,-0.464262,0.229842,-0.309634,0.279961,1.0,1
4,1,0,0,1,0,0.109057,-0.545638,-0.059562,0.279961,1.0,1
5,1,1,2,1,1,0.011239,0.834309,1.440866,0.279961,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,1,0,-0.411075,-0.545638,-0.893134,0.279961,1.0,0
610,1,1,4,1,0,-0.208727,-0.545638,-1.262287,-2.468292,1.0,0
611,1,1,1,1,0,0.456706,-0.466709,1.274152,0.279961,1.0,1
612,1,1,2,1,0,0.374659,-0.545638,0.488213,0.279961,1.0,1


# Train test split (K-fold Cross Validation)

In [141]:
#performing train-test split using K-fold Cross Validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [144]:
model_df={}
def model_val(model,X,y):
  X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
  model.fit(X_train,y_train)
  y_pred=model.predict(X_test)
  print(f"{model} accuracy is {accuracy_score(y_test,y_pred)}")

  score = cross_val_score(model,X,y,cv=5)
  print(f"{model} average cross val score is {np.mean(score)}")
  model_df[model]=round(np.mean(score)*100,2) # model score percentage with precision - 2 decimal

# Logistic Regression

In [145]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model_val(model,X,y)


LogisticRegression() accuracy is 0.8018018018018018
LogisticRegression() average cross val score is 0.8047829647829647


In [151]:
model_df

{LogisticRegression(): 80.48,
 SVC(): 79.39,
 DecisionTreeClassifier(): 71.8,
 RandomForestClassifier(): 78.67,
 GradientBoostingClassifier(): 77.76}

# SVC

In [147]:
from sklearn import svm
model = svm.SVC()
model_val(model,X,y)



SVC() accuracy is 0.7927927927927928
SVC() average cross val score is 0.7938902538902539


In [148]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model_val(model,X,y)

DecisionTreeClassifier() accuracy is 0.7297297297297297
DecisionTreeClassifier() average cross val score is 0.7179852579852579


In [149]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model_val(model,X,y)

RandomForestClassifier() accuracy is 0.7657657657657657
RandomForestClassifier() average cross val score is 0.7866830466830466


In [150]:
from sklearn.ensemble import GradientBoostingClassifier
model =GradientBoostingClassifier()
model_val(model,X,y)

GradientBoostingClassifier() accuracy is 0.7927927927927928
GradientBoostingClassifier() average cross val score is 0.7776085176085176


In [152]:
model_df

{LogisticRegression(): 80.48,
 SVC(): 79.39,
 DecisionTreeClassifier(): 71.8,
 RandomForestClassifier(): 78.67,
 GradientBoostingClassifier(): 77.76}

# Hyperparameter tuning

In [155]:
# In ML, There are 2 types of parameters :
#one is "model parameters" --> These are fixed parameters which model will learn during training phase from the data eg. y=mx+c , so m and c are model parameters learnt during training phase
#second is "Hyperparameters" --> These are adjustable parameters,that must be tuned to obtain model with optimal performance.
#ML models can have many hypermeters and finding best combination of parameters can be treated as search problem.
#2 best strategies for hyperparameter tuning 1. Grid searchCv,2. RandomisedsearchCv
# 1. Grid searchCv -->computationally very expensive,as it will go through all the parameters.
#2. RandomisedsearchCv --> Better option than gridsearch,as it goes through only fixed number of hyperparameter settings.It moves within a grid in a random fashion to find the best set of hyperparameters

In [156]:
from sklearn.model_selection import RandomizedSearchCV

# Logistic regression

In [157]:
#lets tune hyperparameter of logistic regression,it has many parameters but we are going to tune 'C' and 'Solver' parameters

log_reg_grid={"C":np.linspace(-4,4,20),
              "solver":['liblinear']}


In [158]:
rs_log_reg=RandomizedSearchCV(LogisticRegression(),param_distributions=log_reg_grid,n_iter=20,cv=5,verbose=True)

In [159]:
rs_log_reg.fit(X,y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


50 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1160, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterErr

In [160]:
rs_log_reg.best_score_

0.8047829647829647

In [161]:
rs_log_reg.best_params_

{'solver': 'liblinear', 'C': 0.21052631578947345}

#SVC

In [163]:
svc_grid={'C':[0.25,0.5,0.75,1.0],"kernel":["linear"]}

In [165]:
rs_svc=RandomizedSearchCV(svm.SVC(),param_distributions=svc_grid,cv=5,n_iter=20,verbose=True)

In [166]:
rs_svc.fit(X,y)



Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [167]:
rs_svc.best_score_

0.8066011466011467

In [168]:
rs_svc.best_params_

{'kernel': 'linear', 'C': 0.25}

# Random Forest

In [176]:
rf_grid={'n_estimators':np.arange(10,1000,10),'max_features':['auto','sqrt'],'max_depth':[None,3,5,10,20,30],'min_samples_split':[2,5,20,50,100],'min_samples_leaf':[1,2,5,10]}

In [178]:
rs_rf = RandomizedSearchCV(RandomForestClassifier(), param_distributions=rf_grid, cv=5,n_iter=20,verbose=True)

In [179]:
rs_rf.fit(X,y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [180]:
rs_rf.best_score_

0.8066175266175266

In [181]:
rs_rf.best_params_

{'n_estimators': 450,
 'min_samples_split': 50,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 20}

In [None]:
LogisticRegression score Before Hyperparameter Tuning: 80.48
LogisticRegression score after Hyperparameter Tuning: 80.18 (ie 0.8018018018018018)

------------------------------------------------------
SVC score Before Hyperparameter Tuning: 79.39
SVC score after Hyperparameter Tuning: 80.66 (ie 0.8066011466011467)

--------------------------------------------------------
RandomForestClassifier score Before Hyperparameter Tuning: 78.67
RandomForestClassifier score after Hyperparameter Tuning: 80.66 (ie.0.8066175266175266)

In [182]:
model_df

{LogisticRegression(): 80.48,
 SVC(): 79.39,
 DecisionTreeClassifier(): 71.8,
 RandomForestClassifier(): 78.67,
 GradientBoostingClassifier(): 77.76}

# Save the model :with best score model - randomForestClassifier

In [185]:
#train the best model on entire dataset (random forest)
X = df.drop('Loan_Status',axis=1)
y = df['Loan_Status']

In [186]:
# picking the best score parameters of random forest
rf = RandomForestClassifier(n_estimators= 450,
 min_samples_split=50,
 min_samples_leaf= 1,
 max_features= 'sqrt',
 max_depth= 20)

In [187]:
#train the model
rf.fit(X,y)

# Load the best model

In [188]:
import joblib
joblib.dump(rf,'loan_status_predict')

['loan_status_predict']

In [189]:
#load the saved model and can use for prediction
model = joblib.load('loan_status_predict')

# Perform prediction on unseen data

In [190]:
import pandas as pd
df = pd.DataFrame({
    'Gender':1,
    'Married':1,
    'Dependents':2,
    'Education':0,
    'Self_Employed':0,
    'ApplicantIncome':2889,
    'CoapplicantIncome':0.0,
    'LoanAmount':45,
    'Loan_Amount_Term':180,
    'Credit_History':0,
    'Property_Area':1
},index=[0])

In [191]:
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,2,0,0,2889,0.0,45,180,0,1


In [193]:
model.predict(df) #output is '0' means loan is not approved

array([0])

In [194]:
#display
result = model.predict(df)
if result==1:
  print("loan approved")
else:
  print("loan not approved")

loan not approved
