# Loan status prediction
made by: Szabolcs Füle

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

import plotly.express as px

In [39]:
# The CSV is fetched from the project's GitHub repository.
# To work offline, point data_url at a local copy such as 'data.csv'.
data_url = 'https://raw.githubusercontent.com/szabolcsfule/loan_status_prediction/master/data.csv'
df = pd.read_csv(data_url)

### Exploratory Data Analysis

In [3]:
# Preview the first six rows of the freshly loaded data.
df.head(6)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y


In [4]:
# Summarise column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [5]:
# Count the missing values in each column.
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [6]:
# Discard every row that has at least one missing value. The frame is rebound
# to the same name, so the unfiltered data is only recoverable by re-running
# the load cell.
df = df.dropna()

In [7]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

(480, 13)

In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,480.0,480.0,480.0,480.0,480.0
mean,5364.23125,1581.093583,144.735417,342.05,0.854167
std,5668.251251,2617.692267,80.508164,65.212401,0.353307
min,150.0,0.0,9.0,36.0,0.0
25%,2898.75,0.0,100.0,360.0,1.0
50%,3859.0,1084.5,128.0,360.0,1.0
75%,5852.5,2253.25,170.0,360.0,1.0
max,81000.0,33837.0,600.0,480.0,1.0


### Data visualisation

Count of females and males

In [9]:
# Tally applicants per gender as a tidy two-column frame: (gender, count).
# rename_axis + reset_index(name=...) produces the same column names on every
# pandas version. Renaming the default "index"/"Gender" columns breaks on
# pandas >= 2.0, where value_counts() already names its result "count" and
# the index after the source column.
gender_counts = df['Gender'].value_counts()
df_gender = gender_counts.rename_axis('gender').reset_index(name='count')
df_gender

Unnamed: 0,gender,count
0,Male,394
1,Female,86


In [10]:
# Pie chart of the gender split; each category gets a fixed slice colour.
gender_colours = {'Male':'lightblue','Female':'pink'}
fig = px.pie(
    df_gender,
    names='gender',
    values='count',
    color='gender',
    color_discrete_map=gender_colours,
).update_layout(title_text='Count of females and males')
fig.show()

Count of graduated and not graduated customers

In [11]:
# Tally applicants per education level as a tidy frame: (education, count).
# rename_axis + reset_index(name=...) is used instead of renaming the default
# "index"/"Education" columns, because pandas >= 2.0 names the value_counts()
# result "count" and the old rename would yield two "count" columns.
graduate_counts = df['Education'].value_counts()
df_graduate = graduate_counts.rename_axis('education').reset_index(name='count')
df_graduate

Unnamed: 0,education,count
0,Graduate,383
1,Not Graduate,97


In [12]:
# Bar chart of applicant counts per education level, with the value printed on each bar.
fig = px.bar(
    df_graduate,
    x="education",
    y="count",
    color="education",
    text_auto=True,
).update_layout(title_text='Count of graduated and not graduated customers')
fig.show()

Count of married and not married customers

In [13]:
# Tally applicants per marital status as a tidy frame: (married, count).
# rename_axis + reset_index(name=...) is used instead of renaming the default
# "index"/"Married" columns, because pandas >= 2.0 names the value_counts()
# result "count" and the old rename would yield two "count" columns.
married_counts = df['Married'].value_counts()
df_married = married_counts.rename_axis('married').reset_index(name='count')
df_married

Unnamed: 0,married,count
0,Yes,311
1,No,169


In [14]:
# Bar chart of applicant counts per marital status, with the value printed on each bar.
fig = px.bar(
    df_married,
    x="married",
    y="count",
    color="married",
    text_auto=True,
).update_layout(title_text='Count of married and not married customers')
fig.show()

Where do the customers live?

In [15]:
# Tally applicants per property area as a tidy frame: (area, count).
# rename_axis + reset_index(name=...) is used instead of renaming the default
# "index"/"Property_Area" columns, because pandas >= 2.0 names the
# value_counts() result "count" and the old rename would yield two "count"
# columns.
area_counts = df['Property_Area'].value_counts()
df_area = area_counts.rename_axis('area').reset_index(name='count')
df_area

Unnamed: 0,area,count
0,Semiurban,191
1,Urban,150
2,Rural,139


In [16]:
# Horizontal bar chart (count on x, area on y) of where the applicants' property is located.
fig = px.bar(
    df_area,
    x="count",
    y="area",
    color="area",
    text_auto=True,
).update_layout(title_text='Where do the customers live')
fig.show()

Count of self-employed customers

In [17]:
# Tally applicants per self-employment flag as a tidy frame: (Self_Employed, count).
# rename_axis + reset_index(name=...) is used instead of renaming the default
# "index"/"Self_Employed" columns, because pandas >= 2.0 names the
# value_counts() result "count" and the old rename would yield two "count"
# columns.
employeed_counts = df['Self_Employed'].value_counts()
df_employeed = employeed_counts.rename_axis('Self_Employed').reset_index(name='count')
df_employeed

Unnamed: 0,Self_Employed,count
0,No,414
1,Yes,66


In [18]:
# Bar chart of applicant counts per self-employment flag, with the value printed on each bar.
fig = px.bar(df_employeed, x="Self_Employed", y="count", color="Self_Employed", text_auto=True)
# Title spelling corrected ("self-employed"); it is shown to the reader on the chart.
fig.update_layout(title_text='Count of self-employed customers')
fig.show()

Distribution of applicant incomes

In [19]:
# Histogram of applicant income split by gender, with a marginal box plot per group.
gender_colours = {'Male':'lightblue','Female':'pink'}
fig = px.histogram(
    df,
    x="ApplicantIncome",
    color=df["Gender"],
    nbins=5,
    marginal="box",
    text_auto=True,
    color_discrete_map=gender_colours,
)
fig.update_layout(bargap=0.05, title_text='Distribution of applicant incomes')
fig.show()

Distribution of coapplicant incomes

In [20]:
# Histogram of co-applicant income split by gender, with a marginal box plot per group.
gender_colours = {'Male':'lightblue','Female':'pink'}
fig = px.histogram(
    df,
    x="CoapplicantIncome",
    color=df["Gender"],
    nbins=10,
    marginal="box",
    text_auto=True,
    color_discrete_map=gender_colours,
)
fig.update_layout(bargap=0.05, title_text='Distribution of coapplicant incomes')
fig.show()

Distribution of loan amount

In [21]:
# Histogram of the requested loan amount split by self-employment flag,
# with a marginal box plot per group.
fig = px.histogram(df, x="LoanAmount", color='Self_Employed', marginal="box", nbins=10, text_auto=True)
fig.update_layout(bargap=0.05)
# Title spelling corrected ("amount"); it is shown to the reader on the chart.
fig.update_layout(title_text='Distribution of loan amount')
fig.show()

How many customers got a loan

In [22]:
# Tally applications per loan decision as a tidy frame: (Got Loan, count).
# rename_axis + reset_index(name=...) is used instead of renaming the default
# "index"/"Loan_Status" columns, because pandas >= 2.0 names the
# value_counts() result "count" and the old rename would yield two "count"
# columns.
loan_counts = df['Loan_Status'].value_counts()
df_loan = loan_counts.rename_axis('Got Loan').reset_index(name='count')
df_loan

Unnamed: 0,Got Loan,count
0,Y,332
1,N,148


In [23]:
# Pie chart of the loan decisions.
# The colour map has to be keyed by the values actually stored in 'Got Loan',
# which are 'Y' and 'N' at this point in the notebook (not 'Yes'/'No'),
# otherwise the mapping is never applied. 'lightred' is not a CSS named
# colour, so 'lightcoral' is used for the 'N' slice.
fig = px.pie(df_loan, values='count', names='Got Loan', color='Got Loan',
             color_discrete_map={'Y':'lightgreen','N':'lightcoral'})
fig.update_layout(title_text='How many customers got loan')
fig.show()

### Data Preprocessing

In [24]:
# Convert the categorical columns to integer codes.
# NOTE(review): '3+' dependents is coded as 4 rather than 3; kept unchanged
# from the original mapping - confirm this is intentional.
category_codes = {
    'Married': {'No': 0, 'Yes': 1},
    'Gender': {'Male': 1, 'Female': 0},
    'Self_Employed': {'No': 0, 'Yes': 1},
    'Property_Area': {'Rural': 0, 'Semiurban': 1, 'Urban': 2},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
    'Dependents': {'0': 0, '1': 1, '2': 2, '3+': 4},
    'Loan_Status': {'N': 0, 'Y': 1},
}

# replace() returns a new frame, so nothing is mutated in place and the cell
# stays safe to re-run. The explicit astype guarantees integer columns rather
# than depending on replace() silently downcasting object columns, which
# recent pandas versions deprecate; it also fails loudly if any value was
# left unmapped.
df = df.replace(category_codes).astype({column: 'int64' for column in category_codes})

In [25]:
# Remove the Loan_ID column before modelling (it appears to be a per-row
# identifier). `columns=` already selects the axis, so the redundant `axis=1`
# is dropped; errors='ignore' keeps the cell re-runnable once the column is gone.
df = df.drop(columns=['Loan_ID'], errors='ignore')

In [26]:
# Display the frame after encoding and dropping Loan_ID.
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,2,1
3,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,2,1
4,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,2,1
5,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,1,0,2900,0.0,71.0,360.0,1.0,0,1
610,1,1,4,1,0,4106,0.0,40.0,180.0,1.0,0,1
611,1,1,1,1,0,8072,240.0,253.0,360.0,1.0,2,1
612,1,1,2,1,0,7583,0.0,187.0,360.0,1.0,2,1


In [27]:
# List the dtype of every remaining column.
df.dtypes

Gender                 int64
Married                int64
Dependents             int64
Education              int64
Self_Employed          int64
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area          int64
Loan_Status            int64
dtype: object

Correlation between columns

In [28]:
# Heatmap of the pairwise correlations between all columns, annotated with the values.
corr_matrix = df.corr()
fig = px.imshow(
    corr_matrix,
    aspect="auto",
    text_auto=True,
    color_continuous_scale='RdBu_r',
).update_layout(title_text='Correlation between columns')
fig.show()

In [29]:
# Separate the label column (Loan_Status) from the predictor columns.
target = df['Loan_Status']
features = df.drop(columns=['Loan_Status'])

In [30]:
# Hold out 10% of the rows for testing. Stratifying on the target keeps the
# class balance the same in both splits; the fixed seed makes the split repeatable.
(features_train, features_test,
 target_train, target_test) = train_test_split(
    features,
    target,
    test_size=0.1,
    stratify=target,
    random_state=2,
)

In [31]:
# Shapes of the full, training and test feature matrices, in that order.
print(features.shape, features_train.shape, features_test.shape)

(480, 11) (432, 11) (48, 11)


### Train the model

In [32]:
# Support vector classifier with a linear kernel.
# NOTE(review): the features are passed in unscaled; SVMs are usually
# sensitive to feature scale - consider standardising first and confirm the effect.
model = svm.SVC(kernel='linear')

In [33]:
# Train the Support Vector Machine model on the training split.
model.fit(features_train, target_train)

### Model evaluation

In [34]:
# Score the model on the data it was trained on.
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the true
# labels are passed first. Accuracy itself is the same either way; the order
# matters once other, asymmetric metrics are swapped in.
features_train_prediction = model.predict(features_train)
training_data_accuray = accuracy_score(target_train, features_train_prediction)

In [35]:
# Report the fraction of training rows classified correctly.
print(f'Accuracy on training data: {training_data_accuray}')

Accuracy on training data: 0.7986111111111112


In [36]:
# Score the model on the held-out test split.
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the true
# labels are passed first. Accuracy itself is the same either way; the order
# matters once other, asymmetric metrics are swapped in.
features_test_prediction = model.predict(features_test)
test_data_accuray = accuracy_score(target_test, features_test_prediction)

In [37]:
# Report the fraction of held-out test rows classified correctly.
print(f'Accuracy on test data: {test_data_accuray}')

Accuracy on test data: 0.8333333333333334


### Building a predictive system

In [38]:
# One hand-written applicant, with values in the training column order:
# Gender, Married, Dependents, Education, Self_Employed, ApplicantIncome,
# CoapplicantIncome, LoanAmount, Loan_Amount_Term, Credit_History, Property_Area.
# These values match the dataset row with index 5, whose Loan_Status is Y (encoded 1).
input_data = (1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,2)

# Wrap the single row in a DataFrame that carries the column names the model
# was fitted with. Predicting on a bare NumPy array makes scikit-learn warn
# "X does not have valid feature names, but SVC was fitted with feature names".
input_frame = pd.DataFrame([input_data], columns=model.feature_names_in_)

# prediction (1 = loan granted, 0 = loan refused, per the Loan_Status encoding)
prediction = model.predict(input_frame)
print('Expected: Yes')
print(f'Prediction on custom data: {prediction[0]}')

Expected: Yes
Prediction on custom data: 1



X does not have valid feature names, but SVC was fitted with feature names

