In [51]:
!pip install matplotlib seaborn --upgrade --quiet
!pip install scikit-learn



# Model Preparation for Loan Approval

### Importing all the libraries to be used

In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

### Reading the csv file using pandas

In [53]:
df1 = pd.read_csv("/Users/vanshtrivedi/Downloads/archive 3/loan_approval.csv")

df1

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [54]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [55]:
df1.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


#### Now we find the unique values in every field so that the missing values can be handled.

In [56]:
print(df1['Gender'].unique())

['Male' 'Female' nan]


In [57]:
print(df1['Married'].unique())

['No' 'Yes' nan]


In [58]:
print(df1['Dependents'].unique())

['0' '1' '2' '3+' nan]


In [59]:
print(df1['Education'].unique())

['Graduate' 'Not Graduate']


In [60]:
print(df1['Self_Employed'].unique())

['No' 'Yes' nan]


In [61]:
print(df1['Credit_History'].unique())

[ 1.  0. nan]


In [62]:
print(df1['Property_Area'].unique())

['Urban' 'Rural' 'Semiurban']


In [63]:
print(df1['Loan_Status'].unique())

['Y' 'N']


#### Now we replace the object type values with 0's and 1's and dropping the null values

In [64]:
df1['Married'] = df1['Married'].replace({'No': 0, 'Yes': 1})
df1['Married'] = df1['Married'].fillna(df1['Married'].mean())

In [65]:
df1['Education'] = df1['Education'].replace({'Not Graduate': 0, 'Graduate': 1})

In [66]:
df1['Gender'] = df1['Gender'].replace({'Female': 0, 'Male': 1})
df1['Gender'] = df1['Gender'].fillna(df1['Gender'].mean())

In [67]:
df1['Self_Employed'] = df1['Self_Employed'].replace({'No': 0, 'Yes': 1})
df1['Self_Employed'] = df1['Self_Employed'].fillna(df1['Self_Employed'].mean())

In [68]:
df1['Dependents'] = df1['Dependents'].replace({'0': 0, '1': 1, '2': 2, '3+': 3})
df1['Dependents'] = df1['Dependents'].fillna(df1['Dependents'].mean())

In [69]:
df1['Credit_History'] = df1['Credit_History'].fillna(df1['Credit_History'].mean())

In [70]:
df1['Property_Area'] = df1['Property_Area'].replace({'Rural': 0, 'Urban': 1, 'Semiurban':2})

In [71]:
df1['Loan_Status'] = df1['Loan_Status'].replace({'N': 0, 'Y': 1})

In [72]:
df1['LoanAmount'] = df1['LoanAmount'].fillna(df1['LoanAmount'].mean())

In [73]:
df1['Loan_Amount_Term'] = df1['Loan_Amount_Term'].fillna(df1['Loan_Amount_Term'].mean())

#### Checking for the Null values 

In [74]:
df1.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

#### Dropping the field "Loan_Status " and "Loan_ID". 

In [75]:
X = df1.drop(['Loan_Status','Loan_ID'], axis = 1)
X
y = df1['Loan_Status']
y

0      1
1      0
2      1
3      1
4      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 614, dtype: int64

### Now splitting then the data into training and testing set using "train_test_split" function provide library scikit-learn ,here "test_size=.2" represent that our 20% of data will be used for testing and remaining 80% will be used for training.

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = .2)
X_train

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
133,1.000000,1.0,0.000000,1,1.0,3459,0.0,25.000000,120.0,1.0,2
0,1.000000,0.0,0.000000,1,0.0,5849,0.0,146.412162,360.0,1.0,1
256,1.000000,0.0,0.000000,0,0.0,6045,0.0,115.000000,360.0,0.0,0
477,0.813644,1.0,2.000000,1,0.0,2873,1872.0,132.000000,360.0,0.0,2
199,1.000000,0.0,0.000000,1,1.0,11000,0.0,83.000000,360.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...
263,0.000000,0.0,0.000000,1,0.0,7200,0.0,120.000000,360.0,1.0,0
346,1.000000,1.0,0.762938,0,0.0,3523,3230.0,152.000000,360.0,0.0,0
239,1.000000,1.0,1.000000,1,0.0,3315,0.0,96.000000,360.0,1.0,2
131,1.000000,0.0,0.000000,1,0.0,2014,1929.0,74.000000,360.0,1.0,1


### Now we create a instance of logistic regression classifier ,where "LogisticRegression()"is the constructor for the logistic regression classifier.

In [77]:
logReg = LogisticRegression()

### Now we train a logistic regression classifier using a training dataset,where logReg is an instance of a logistic regression classifier, created above using LogisticRegression() and .fit(X_train, y_train) is used to train the logistic regression model with the training data

In [78]:
logReg.fit(X_train, y_train)

### Now we check the accuracy of our model

In [79]:
logReg.score(X_test, y_test)

0.8536585365853658

In [80]:
y_hat = logReg.predict(X_test)

In [81]:
accuracy_score(y_test, y_hat)

0.8536585365853658