# Home Loan Prediction

# Problem Statement

Dream Housing Finance company deals in all kinds of home loans. They have presence across all urban, semi urban and rural areas. Customer first applies for home loan and after that company validates the customer eligibility for loan.

Company wants to automate the loan eligibility process (real time) based on customer detail provided while filling online application form. These details are Gender, Marital Status, Education, Number of Dependents, Income, Loan Amount, Credit History and others. To automate this process, they have provided a dataset to identify the customers segments that are eligible for loan amount so that they can specifically target these customers. 



# Data Dictionary

#### Variable	Description
Loan_ID         	Unique Loan ID

Gender	            Male/ Female

Married	            Applicant married (Y/N)

Dependents	        Number of dependents

Education	        Applicant Education (Graduate/ Under Graduate
)
Self_Employed	    Self employed (Y/N)

ApplicantIncome	    Applicant income

CoapplicantIncome	Coapplicant income

LoanAmount	        Loan amount in thousands

Loan_Amount_Term	Term of loan in months

Credit_History	    credit history meets guidelines

Property_Area	    Urban/ Semi Urban/ Rural

Loan_Status	        (Target) Loan approved (Y/N)1


# Import Packages

In [40]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O 
import os
import matplotlib.pyplot as plt#visualization
from PIL import  Image
%matplotlib inline
import seaborn as sns#visualization
import itertools
import warnings
warnings.filterwarnings("ignore")
import io
import plotly.offline as py#visualization
py.init_notebook_mode(connected=True)#visualization
import plotly.graph_objs as go#visualization
import plotly.tools as tls#visualization
import plotly.figure_factory as ff#visualization

# Data Import

In [41]:
Bank = pd.read_csv (r'C:\Users\kajal\Downloads\train_ctrUa4K.csv')

In [42]:
Bank.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [43]:
Bank.shape

(614, 13)

##### Data Type & Conversion

In [44]:
Bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


###### Identifying & Treatment Missing Value

In [45]:
# count the number of NaN values in each column
print(Bank.isnull().sum())

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


#### Replacing Missing value by Mode

In [46]:
Bank.Married.value_counts(dropna=False)

Yes    398
No     213
NaN      3
Name: Married, dtype: int64

In [47]:
for column in ['Gender','Married','Dependents','Self_Employed','LoanAmount','Loan_Amount_Term','Credit_History']:
    Bank[column].fillna(Bank[column].mode()[0], inplace=True)

In [48]:
Bank.Married.value_counts(dropna=False)

Yes    401
No     213
Name: Married, dtype: int64

In [49]:
# count the number of NaN values in each column
print(Bank.isnull().sum())

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


In [50]:
Bank.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,120.0,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


# Data Visualization

#### Bad Rate Analysis

In [51]:
Bank.Loan_Status.value_counts()

Y    422
N    192
Name: Loan_Status, dtype: int64

In [13]:
import plotly.express as px

fig = px.pie(Bank,names='Loan_Status',color='Loan_Status',
             color_discrete_map={'Yes':'red',
                                 '`':'cyan'})
fig.show()

In [52]:
Bank_No= Bank[Bank["Loan_Status"] == "No"]

In [53]:
Bank_No.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status


# Breaking Data into Two Part

Quantitative Variable

Qualitative Variable

In [54]:
Bank=Bank.drop('Loan_ID',axis=1)

In [55]:
Bank.select_dtypes(include=[np.number]).columns.tolist()

['ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History']

In [56]:
# Qualitative Variable
Quantitative_Variable = Bank[Bank.select_dtypes(include=[np.number]).columns.tolist()]
Quantitative_Variable.head(3)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,5849,0.0,120.0,360.0,1.0
1,4583,1508.0,128.0,360.0,1.0
2,3000,0.0,66.0,360.0,1.0


In [57]:
Qualitative_Variable = Bank[Bank.select_dtypes(include=['object']).columns.tolist()]
Qualitative_Variable.head(3)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,Urban,Y
1,Male,Yes,1,Graduate,No,Rural,N
2,Male,Yes,0,Graduate,Yes,Urban,Y


## Converting qualitative Variable into number

In [58]:
from sklearn.preprocessing import LabelEncoder
Qualitative_Variable=Qualitative_Variable.apply(LabelEncoder().fit_transform) # label in ascending order
Qualitative_Variable.head(3)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status
0,1,0,0,0,0,2,1
1,1,1,1,0,0,0,0
2,1,1,0,0,1,2,1


# Combining the data set

In [59]:
Final_combined = pd.concat([Qualitative_Variable, Quantitative_Variable],axis=1)
Final_combined.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,1,0,0,0,0,2,1,5849,0.0,120.0,360.0,1.0
1,1,1,1,0,0,0,0,4583,1508.0,128.0,360.0,1.0
2,1,1,0,0,1,2,1,3000,0.0,66.0,360.0,1.0
3,1,1,0,1,0,2,1,2583,2358.0,120.0,360.0,1.0
4,1,0,0,0,0,2,1,6000,0.0,141.0,360.0,1.0


# Data Partition

In [60]:
from sklearn.model_selection import train_test_split

# define our input variable (X) & output variable
X = Final_combined.drop('Loan_Status', axis = 1)
Y = Final_combined[['Loan_Status']]
# Split X and y into X_
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=56)

# Model building

# Hypothesis
H0 :- There is no log linear relationship between Loan_Status and all independent Variable
Vs

H1 :- There is log linear relationship between Loan_Status and all independent Variable
Alpha = 0.05

In [61]:
# Needed to run the logistic regression
import statsmodels.formula.api as smf
import statsmodels.api as sm
X_1 = sm.add_constant(X_train)  ## give intercept coefficient in the model
result=sm.Logit(y_train,X_1).fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.473455
         Iterations 7
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.253     
Dependent Variable: Loan_Status      AIC:              488.9329  
Date:               2021-11-18 15:57 BIC:              539.2902  
No. Observations:   491              Log-Likelihood:   -232.47   
Df Model:           11               LL-Null:          -311.36   
Df Residuals:       479              LLR p-value:      3.7999e-28
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     7.0000                                       
-----------------------------------------------------------------
                   Coef.  Std.Err.    z    P>|z|   [0.025  0.975]
-----------------------------------------------------------------
const             -2.6362   0.8802 -2.9950 0.0027 -4.3613 -0.9111
Gender            -0.0449   0.3148 -0.1428 0.8865 -0.6620  0.5721


## Backward method

In [62]:
cols = list(X.columns) # all column present  in x  
pmax = 1
while (len(cols)>0):  # count of variable should be greater than zero
    p= []             # empty list
    X_1 = X_train[cols]  # all column we are assign in x_1
    X_1 = sm.add_constant(X_1)# adding a column with value 1
    output=sm.Logit(y_train,X_1).fit() # Regression model
    p = pd.Series(output.pvalues.values[1:],index = cols) # to get p-values for all variable only     
    pmax = max(p)  # select a max P-value 
    feature_with_p_max = p.idxmax()   # idmax is used to display the variable name which has max P-value
    if(pmax>0.05):
        cols.remove(feature_with_p_max)
    else:
        break
selected_features_BE = cols

Optimization terminated successfully.
         Current function value: 0.473455
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.473456
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.473457
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.473459
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.473480
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.473549
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.473676
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.475003
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.476628
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.478307
  

In [64]:
output.summary2()

0,1,2,3
Model:,Logit,Pseudo R-squared:,0.246
Dependent Variable:,Loan_Status,AIC:,475.6976
Date:,2021-11-18 15:57,BIC:,488.287
No. Observations:,491,Log-Likelihood:,-234.85
Df Model:,2,LL-Null:,-311.36
Df Residuals:,488,LLR p-value:,5.9076e-34
Converged:,1.0000,Scale:,1.0
No. Iterations:,7.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,-3.0733,0.4977,-6.1748,0.0000,-4.0489,-2.0978
Married,0.6115,0.2362,2.5890,0.0096,0.1486,1.0744
Credit_History,3.9497,0.4807,8.2160,0.0000,3.0075,4.8919


#### There is log linear relationship between Loan_Status and Married,Credit_History

# Predictions on Train Dataset

In [63]:
train=pd.concat([X_train,y_train],axis=1)
train.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
542,0,0,1,0,0,1,3652,0.0,95.0,360.0,1.0,1
203,1,1,1,1,0,2,3500,1083.0,135.0,360.0,1.0,1
355,0,0,0,0,0,2,3813,0.0,116.0,180.0,1.0,1
195,1,1,1,0,0,1,3125,2583.0,170.0,360.0,1.0,0
458,1,0,2,0,0,0,4354,0.0,136.0,360.0,1.0,1


In [65]:
train['Probability']=output.predict(X_1)
train.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Probability
542,0,0,1,0,0,1,3652,0.0,95.0,360.0,1.0,1,0.706073
203,1,1,1,1,0,2,3500,1083.0,135.0,360.0,1.0,1,0.81576
355,0,0,0,0,0,2,3813,0.0,116.0,180.0,1.0,1,0.706073
195,1,1,1,0,0,1,3125,2583.0,170.0,360.0,1.0,0,0.81576
458,1,0,2,0,0,0,4354,0.0,136.0,360.0,1.0,1,0.706073


In [66]:
train['Predicted']=np.where(train['Probability'] > 0.7,1,0) 
train.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Probability,Predicted
542,0,0,1,0,0,1,3652,0.0,95.0,360.0,1.0,1,0.706073,1
203,1,1,1,1,0,2,3500,1083.0,135.0,360.0,1.0,1,0.81576,1
355,0,0,0,0,0,2,3813,0.0,116.0,180.0,1.0,1,0.706073,1
195,1,1,1,0,0,1,3125,2583.0,170.0,360.0,1.0,0,0.81576,1
458,1,0,2,0,0,0,4354,0.0,136.0,360.0,1.0,1,0.706073,1


# Model Performance Metrics

In [67]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(train['Predicted'], train['Loan_Status'])
print(matrix)

[[ 70   5]
 [ 92 324]]


In [68]:
Accuracy_Train=((550+142)/(784)*100)
print(Accuracy_Train)

88.26530612244898


##### Accuracy of Model is 88% , So we say Model good fit data

In [31]:
from sklearn.metrics import classification_report
print(classification_report(train['Predicted'],train['Loan_Status']))

              precision    recall  f1-score   support

           0       0.43      0.93      0.59        75
           1       0.98      0.78      0.87       416

    accuracy                           0.80       491
   macro avg       0.71      0.86      0.73       491
weighted avg       0.90      0.80      0.83       491



#### Accuracy of Bad Customer Capture by Model is 59% ( Sensitivity )

Accuracy of Good Customer Capture by Model is 93%

Accuracy of Predicted Bad Customer And often Correct is 98%

Accuracy of Predicted Good Customer And often Correct is 43%

# Test Data Import

In [32]:
BankTestData = pd.read_csv (r'C:\Users\kajal\Downloads\test_lAUu6dG.csv')

In [33]:
BankTestData.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [34]:
BankTestData.shape

(367, 12)

In [35]:
BankTestData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            367 non-null    object 
 1   Gender             356 non-null    object 
 2   Married            367 non-null    object 
 3   Dependents         357 non-null    object 
 4   Education          367 non-null    object 
 5   Self_Employed      344 non-null    object 
 6   ApplicantIncome    367 non-null    int64  
 7   CoapplicantIncome  367 non-null    int64  
 8   LoanAmount         362 non-null    float64
 9   Loan_Amount_Term   361 non-null    float64
 10  Credit_History     338 non-null    float64
 11  Property_Area      367 non-null    object 
dtypes: float64(3), int64(2), object(7)
memory usage: 34.5+ KB


In [36]:
test=pd.concat([X_test,y_test],axis=1)
test.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
348,1,1,0,0,0,1,6333,4583.0,259.0,360.0,1.0,1
538,1,1,0,1,0,0,2917,536.0,66.0,360.0,1.0,0
430,0,0,1,0,1,1,8624,0.0,150.0,360.0,1.0,1
536,1,1,0,0,0,2,6133,3906.0,324.0,360.0,1.0,1
354,0,1,0,0,0,1,2423,505.0,130.0,360.0,1.0,1


In [37]:
X_test = sm.add_constant(X_test)
X_test.head()

Unnamed: 0,const,Gender,Married,Dependents,Education,Self_Employed,Property_Area,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
348,1.0,1,1,0,0,0,1,6333,4583.0,259.0,360.0,1.0
538,1.0,1,1,0,1,0,0,2917,536.0,66.0,360.0,1.0
430,1.0,0,0,1,0,1,1,8624,0.0,150.0,360.0,1.0
536,1.0,1,1,0,0,0,2,6133,3906.0,324.0,360.0,1.0
354,1.0,0,1,0,0,0,1,2423,505.0,130.0,360.0,1.0


In [38]:
abc=X_test[['const', 'Married','ApplicantIncome','LoanAmount','Credit_History']]
abc.head(2)

Unnamed: 0,const,Married,ApplicantIncome,LoanAmount,Credit_History
348,1.0,1,6333,259.0,1.0
538,1.0,1,2917,66.0,1.0


# Model Performance Metrics on Test data

In [None]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(test['Predicted'],test['Loan_Status'])
print(matrix)

In [None]:
Accuracy_test=((150+10)/(197)*100)
Accuracy_test

In [None]:
from sklearn.metrics import classification_report
print(classification_report(test['Predicted'],test['Loan_Status']))

## Finish