In [1]:
#We start by importing pandas to load help us load and manipulate the csv file
import pandas as pd

In [2]:
#Loading the customer details in csv file
customer_data_df = pd.read_csv('./resource/loan_data.csv')

## The Customer details/data set contains the following variables that we can use to make decisions on whether to give them loan or not
> Loan_ID : Unique Loan ID

> Gender : Male/ Female

> Married : Applicant married (Y/N)

> Dependents : Number of dependents

> Education : Applicant Education (Graduate/ Under Graduate)
> Self_Employed : Self employed (Y/N)

> ApplicantIncome : Applicant income

> CoapplicantIncome : Coapplicant income

> LoanAmount : Loan amount in thousands of dollars

> Loan_Amount_Term : Term of loan in months

> Credit_History : Credit history meets guidelines yes or no

> Property_Area : Urban/ Semi Urban/ Rural
> Loan_Status : Loan approved (Y/N) this is the target variable

## Let us check the first 5 and last 5 rows of the dataset

In [3]:
#View the first five row of the dataset
customer_data_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
4,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y


In [4]:
#Check the last five rows of the dataset
customer_data_df.tail()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
376,LP002953,Male,Yes,3+,Graduate,No,5703,0.0,128.0,360.0,1.0,Urban,Y
377,LP002974,Male,Yes,0,Graduate,No,3232,1950.0,108.0,360.0,1.0,Rural,Y
378,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
379,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
380,LP002990,Female,No,0,Graduate,Yes,4583,0.0,133.0,360.0,0.0,Semiurban,N


## We now need to fine the shape of our dataset
This is the number of rows and number of columns

In [5]:
#use shape attribute to get the shape of dataframe
customer_data_df.shape

(381, 13)

In [6]:
print("Number of Rows", customer_data_df.shape[0])
print("Number of Columns", customer_data_df.shape[1])

Number of Rows 381
Number of Columns 13


## We now need to get info about our dataset.

>The total number of rows
 Total number of columns
Datatypes of each column
and memory requirement

In [7]:
customer_data_df.info

<bound method DataFrame.info of       Loan_ID  Gender Married Dependents     Education Self_Employed  \
0    LP001003    Male     Yes          1      Graduate            No   
1    LP001005    Male     Yes          0      Graduate           Yes   
2    LP001006    Male     Yes          0  Not Graduate            No   
3    LP001008    Male      No          0      Graduate            No   
4    LP001013    Male     Yes          0  Not Graduate            No   
..        ...     ...     ...        ...           ...           ...   
376  LP002953    Male     Yes         3+      Graduate            No   
377  LP002974    Male     Yes          0      Graduate            No   
378  LP002978  Female      No          0      Graduate            No   
379  LP002979    Male     Yes         3+      Graduate            No   
380  LP002990  Female      No          0      Graduate           Yes   

     ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0               4583          

## Check null values

In [8]:
customer_data_df.isnull().sum()

Loan_ID               0
Gender                5
Married               0
Dependents            8
Education             0
Self_Employed        21
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     11
Credit_History       30
Property_Area         0
Loan_Status           0
dtype: int64

In [9]:
#Check the percentage of missing value pr column
customer_data_df.isnull().sum()*100 / len(customer_data_df)

Loan_ID              0.000000
Gender               1.312336
Married              0.000000
Dependents           2.099738
Education            0.000000
Self_Employed        5.511811
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           0.000000
Loan_Amount_Term     2.887139
Credit_History       7.874016
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

## Handling missing values

In [10]:
customer_data_df = customer_data_df.drop('Loan_ID', axis=1)

In [11]:
customer_data_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
4,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y


### Remove missing value for columns with percentage missing value of below 5%

In [12]:
columns = ['Gender','Dependents','LoanAmount','Loan_Amount_Term']

In [13]:
customer_data_df = customer_data_df.dropna(subset=columns)

In [14]:
#Check the percentage of missing value pr column
customer_data_df.isnull().sum()*100 / len(customer_data_df)

Gender               0.000000
Married              0.000000
Dependents           0.000000
Education            0.000000
Self_Employed        5.586592
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           0.000000
Loan_Amount_Term     0.000000
Credit_History       8.379888
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

### For the remaining two columns(Self_Employed and Credit_History) we will feel the missing values with No

In [15]:
customer_data_df['Self_Employed'].mode()

0    No
Name: Self_Employed, dtype: object

In [16]:
customer_data_df['Self_Employed'] = customer_data_df['Self_Employed'].fillna(customer_data_df['Self_Employed'].mode()[0])

In [17]:
#Check the percentage of missing value pr column
customer_data_df.isnull().sum()*100 / len(customer_data_df)

Gender               0.000000
Married              0.000000
Dependents           0.000000
Education            0.000000
Self_Employed        0.000000
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           0.000000
Loan_Amount_Term     0.000000
Credit_History       8.379888
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

In [18]:
customer_data_df['Self_Employed'].unique()

array(['No', 'Yes'], dtype=object)

In [19]:
customer_data_df['Credit_History'].unique()

array([ 1., nan,  0.])

In [20]:
#Most frequent value
customer_data_df['Credit_History'].mode()[0]

1.0

In [21]:
customer_data_df['Credit_History'] = customer_data_df['Credit_History'].fillna(customer_data_df['Credit_History'].mode()[0])

In [22]:
#Check the percentage of missing value pr column
customer_data_df.isnull().sum()*100 / len(customer_data_df)

Gender               0.0
Married              0.0
Dependents           0.0
Education            0.0
Self_Employed        0.0
ApplicantIncome      0.0
CoapplicantIncome    0.0
LoanAmount           0.0
Loan_Amount_Term     0.0
Credit_History       0.0
Property_Area        0.0
Loan_Status          0.0
dtype: float64

## Next is handling categorical columns

In [23]:
customer_data_df.sample(10)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
325,Male,Yes,3+,Graduate,No,3466,3428.0,150.0,360.0,1.0,Rural,Y
41,Male,Yes,2,Graduate,No,2708,1167.0,97.0,360.0,1.0,Semiurban,Y
288,Male,Yes,0,Graduate,No,3948,1733.0,149.0,360.0,0.0,Rural,N
362,Male,No,0,Graduate,No,3229,2739.0,110.0,360.0,1.0,Urban,Y
70,Male,Yes,2,Not Graduate,No,4288,3263.0,133.0,180.0,1.0,Urban,Y
331,Male,Yes,3+,Graduate,Yes,5677,1424.0,100.0,360.0,1.0,Rural,Y
80,Male,Yes,2,Graduate,No,2957,0.0,81.0,360.0,1.0,Semiurban,Y
230,Male,No,0,Graduate,No,2500,0.0,96.0,480.0,1.0,Semiurban,N
201,Male,Yes,2,Graduate,No,3717,0.0,120.0,360.0,1.0,Semiurban,Y
295,Female,No,1,Not Graduate,No,5191,0.0,132.0,360.0,1.0,Semiurban,Y


In [24]:
customer_data_df['Dependents'] = customer_data_df['Dependents'].replace(to_replace="3+", value=4)

In [25]:
customer_data_df['Dependents'].unique()

array(['1', '0', '2', 4], dtype=object)

## Assigning numerical values to columns with non-numerical values

*This columns include:*
- Gender	
- Married		
- Education	
- Self_Employed
- Property_Area

In [26]:
customer_data_df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [27]:
customer_data_df['Gender'] = customer_data_df['Gender'].map({'Male':1, 'Female':0}).astype('int')

In [28]:
# Check unique values
customer_data_df['Loan_Status'].unique()

array(['N', 'Y'], dtype=object)

In [29]:
customer_data_df['Married'] = customer_data_df['Married'].map({'Yes':1, 'No':0}).astype('int')
customer_data_df['Education'] = customer_data_df['Education'].map({'Graduate':1, 'Not Graduate':0}).astype('int')
customer_data_df['Self_Employed'] = customer_data_df['Self_Employed'].map({'Yes':1, 'No':0}).astype('int')
customer_data_df['Property_Area'] = customer_data_df['Property_Area'].map({'Rural':0, 'Semiurban':2, 'Urban':1}).astype('int')
customer_data_df['Loan_Status'] = customer_data_df['Loan_Status'].map({'Y':1, 'N':0}).astype('int')

In [30]:
customer_data_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
1,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,1,1
2,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,1,1
3,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,1,1
4,1,1,0,0,0,2333,1516.0,95.0,360.0,1.0,1,1


## Next step is to store feature Matrix in X and Response in vector y

In [31]:
X = customer_data_df.drop('Loan_Status', axis=1)

In [32]:
y = customer_data_df['Loan_Status']
y

0      0
1      1
2      1
3      1
4      1
      ..
376    1
377    1
378    1
379    1
380    0
Name: Loan_Status, Length: 358, dtype: int64

## Feature Scaling

> This allows us to put our features in the same scale
> using `head()` you we find that columns like Applicantincome and LoanAmount are not in the same scale

In [33]:
customer_data_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
1,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,1,1
2,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,1,1
3,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,1,1
4,1,1,0,0,0,2333,1516.0,95.0,360.0,1.0,1,1


In [34]:
cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']

In [35]:
#perform feature scaling
from sklearn.preprocessing import StandardScaler
st = StandardScaler()
X[cols] = st.fit_transform(X[cols])

In [36]:
X

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,1,1,0,0.711630,0.092069,0.805980,0.285826,1.0,0
1,1,1,0,1,1,-0.398856,-0.539332,-1.350425,0.285826,1.0,1
2,1,1,0,0,0,-0.691384,0.447965,0.527735,0.285826,1.0,1
3,1,0,0,1,0,1.705666,-0.539332,1.258130,0.285826,1.0,1
4,1,1,0,0,0,-0.866761,0.095418,-0.341784,0.285826,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...
376,1,1,4,1,0,1.497318,-0.539332,0.805980,0.285826,1.0,1
377,1,1,0,1,0,-0.236106,0.277135,0.110366,0.285826,1.0,0
378,0,0,0,1,0,-0.469007,-0.539332,-1.176522,0.285826,1.0,0
379,1,1,4,1,0,0.377011,-0.539332,-2.254724,-2.337910,1.0,0


## Next is to split out dataset into the training and test data sets them we apply K-Fod Cross Validation

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import numpy as np

In [38]:
model_df = {}
def model_val(model, X, y):
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20, random_state=42)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print(f"{model} accuracy is {accuracy_score(y_test,y_pred)}")

    score = cross_val_score(model,X,y,cv=5)
    print(f"{model} Average Cross Validation score is {np.mean(score)}")
    model_df[model] = round(np.mean(score)*100,2)

## Using Regression 

In [39]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model_val(model,X,y)

LogisticRegression() accuracy is 0.8472222222222222
LogisticRegression() Average Cross Validation score is 0.8354068857589985


In [40]:
model_df

{LogisticRegression(): 83.54}

## SVC

In [41]:
from sklearn import svm
model = svm.SVC()
model_val(model,X,y)

SVC() accuracy is 0.8472222222222222
SVC() Average Cross Validation score is 0.8296948356807512


## Decision Classifier

In [42]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model_val(model,X,y)

DecisionTreeClassifier() accuracy is 0.8333333333333334
DecisionTreeClassifier() Average Cross Validation score is 0.7820422535211267


## Random Forest Classifier

In [43]:
from sklearn.ensemble import RandomForestClassifier
model =RandomForestClassifier()
model_val(model,X,y)

RandomForestClassifier() accuracy is 0.8472222222222222
RandomForestClassifier() Average Cross Validation score is 0.8324726134585289


## Gradient Boosting

In [44]:
from sklearn.ensemble import GradientBoostingClassifier
model =GradientBoostingClassifier()
model_val(model,X,y)

GradientBoostingClassifier() accuracy is 0.8611111111111112
GradientBoostingClassifier() Average Cross Validation score is 0.8240219092331769


Hyperparameter tuning

In [45]:
from sklearn.model_selection import RandomizedSearchCV

Logistic Regression

In [46]:
log_reg_grid = {"C":np.logspace(-4,4,20),
                "solver":['liblinear']}

In [51]:
rs_log_reg = RandomizedSearchCV(LogisticRegression(),
                   param_distributions=log_reg_grid,
                   n_iter=20, cv=5, verbose=True)

In [52]:
#check best scorers_log_reg.fit(X,y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [53]:
rs_log_reg.best_score_

0.8354068857589985

In [54]:
rs_log_reg.best_params_

{'solver': 'liblinear', 'C': 1.623776739188721}

SVC

In [67]:
svc_grid ={'C':[0.25,0.50,0.75,1],"kernel":["linear"]}

In [68]:
rs_svc = RandomizedSearchCV(svm.SVC(),
                            param_distributions=svc_grid,
                            cv=5,
                            n_iter=20,
                            verbose=True)

In [69]:
rs_svc.fit(X,y)



Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [70]:
rs_svc.best_score_

0.840962441314554

In [71]:
rs_svc.best_params_

{'kernel': 'linear', 'C': 0.25}

Randon Forest Classifier

In [72]:
RandomForestClassifier()

In [73]:
rf_grid={'n_estimators':np.arange(10,1000,10),
  'max_features':['auto','sqrt'],
 'max_depth':[None,3,5,10,20,30],
 'min_samples_split':[2,5,20,50,100],
 'min_samples_leaf':[1,2,5,10]
 }

In [74]:
rs_rf=RandomizedSearchCV(RandomForestClassifier(),
                  param_distributions=rf_grid,
                   cv=5,
                   n_iter=20,
                  verbose=True)

In [75]:
rs_rf.fit(X,y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


65 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
65 fits failed with the following error:
Traceback (most recent call last):
  File "/home/wasan/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/wasan/.local/lib/python3.10/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/home/wasan/.local/lib/python3.10/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/home/wasan/.local/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    r

In [76]:
rs_rf.best_score_

0.840962441314554

In [77]:

rs_rf.best_params_

{'n_estimators': 620,
 'min_samples_split': 5,
 'min_samples_leaf': 5,
 'max_features': 'sqrt',
 'max_depth': 5}

Save Model

In [79]:
X = customer_data_df.drop('Loan_Status',axis=1)
y = customer_data_df['Loan_Status']

In [80]:
rf = RandomForestClassifier(n_estimators=270,
 min_samples_split=5,
 min_samples_leaf=5,
 max_features='sqrt',
 max_depth=5)

In [81]:
rf.fit(X,y)

In [82]:
import joblib

In [83]:
joblib.dump(rf,'loan_status_predict')

['loan_status_predict']

In [84]:
model = joblib.load('loan_status_predict')

In [85]:
import pandas as pd
df = pd.DataFrame({
    'Gender':1,
    'Married':1,
    'Dependents':2,
    'Education':0,
    'Self_Employed':0,
    'ApplicantIncome':2889,
    'CoapplicantIncome':0.0,
    'LoanAmount':45,
    'Loan_Amount_Term':180,
    'Credit_History':0,
    'Property_Area':1
},index=[0])

In [86]:
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,2,0,0,2889,0.0,45,180,0,1


In [87]:
result = model.predict(df)

In [89]:
result

array([0])

In [90]:
if result==1:
    print("Loan Approved")
else:
    print("Loan Not Approved")

Loan Not Approved


In [92]:
from tkinter import *
import joblib
import pandas as pd

In [93]:
def show_entry():

    p1 = float(e1.get())
    p2 = float(e2.get())
    p3 = float(e3.get())
    p4 = float(e4.get())
    p5 = float(e5.get())
    p6 = float(e6.get())
    p7 = float(e7.get())
    p8 = float(e8.get())
    p9 = float(e9.get())
    p10 = float(e10.get())
    p11 = float(e11.get())

    model = joblib.load('loan_status_predict')
    df = pd.DataFrame({
    'Gender':p1,
    'Married':p2,
    'Dependents':p3,
    'Education':p4,
    'Self_Employed':p5,
    'ApplicantIncome':p6,
    'CoapplicantIncome':p7,
    'LoanAmount':p8,
    'Loan_Amount_Term':p9,
    'Credit_History':p10,
    'Property_Area':p11
},index=[0])
    result = model.predict(df)

    if result == 1:
        Label(master, text="Loan approved").grid(row=31)
    else:
        Label(master, text="Loan Not Approved").grid(row=31)


master =Tk()
master.title("Loan Status Prediction Using Machine Learning")
label = Label(master,text = "Loan Status Prediction",bg = "black",
               fg = "white").grid(row=0,columnspan=2)

Label(master,text = "Gender [1:Male ,0:Female]").grid(row=1)
Label(master,text = "Married [1:Yes,0:No]").grid(row=2)
Label(master,text = "Dependents [1,2,3,4]").grid(row=3)
Label(master,text = "Education").grid(row=4)
Label(master,text = "Self_Employed").grid(row=5)
Label(master,text = "ApplicantIncome").grid(row=6)
Label(master,text = "CoapplicantIncome").grid(row=7)
Label(master,text = "LoanAmount").grid(row=8)
Label(master,text = "Loan_Amount_Term").grid(row=9)
Label(master,text = "Credit_History").grid(row=10)
Label(master,text = "Property_Area").grid(row=11)


e1 = Entry(master)
e2 = Entry(master)
e3 = Entry(master)
e4 = Entry(master)
e5 = Entry(master)
e6 = Entry(master)
e7 = Entry(master)
e8 = Entry(master)
e9 = Entry(master)
e10 = Entry(master)
e11 = Entry(master)


e1.grid(row=1,column=1)
e2.grid(row=2,column=1)
e3.grid(row=3,column=1)
e4.grid(row=4,column=1)
e5.grid(row=5,column=1)
e6.grid(row=6,column=1)
e7.grid(row=7,column=1)
e8.grid(row=8,column=1)
e9.grid(row=9,column=1)
e10.grid(row=10,column=1)
e11.grid(row=11,column=1)

Button(master,text="Predict",command=show_entry).grid()

mainloop()


Exception in Tkinter callback
Traceback (most recent call last):
  File "/usr/lib/python3.10/tkinter/__init__.py", line 1921, in __call__
    return self.func(*args)
  File "/tmp/ipykernel_3336/22229994.py", line 3, in show_entry
    p1 = float(e1.get())
ValueError: could not convert string to float: ''
