Import Dependencies

In [591]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection

In [592]:
data = pd.read_csv(r'loan_approval_dataset.csv')
df = data.copy()
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [593]:
df.isna().sum() #No null values

loan_id                      0
 no_of_dependents            0
 education                   0
 self_employed               0
 income_annum                0
 loan_amount                 0
 loan_term                   0
 cibil_score                 0
 residential_assets_value    0
 commercial_assets_value     0
 luxury_assets_value         0
 bank_asset_value            0
 loan_status                 0
dtype: int64

In [594]:
df.duplicated().sum() #No duplicate records

np.int64(0)

In [595]:
#Will convert education and self_employed columns to binary
df[' education'] = df[' education'].replace({
    ' Graduate': 1,
    ' Not Graduate' :0
})

df[' self_employed'] = df[' self_employed'].replace({
    ' Yes': 1,
    ' No' :0
})

#Convert df column data types
df[[' education', ' self_employed']] = df[[' education', ' self_employed']].apply(pd.to_numeric)
df.head()

  df[' education'] = df[' education'].replace({
  df[' self_employed'] = df[' self_employed'].replace({


Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,1,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,0,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,1,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,1,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,0,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [596]:
df[' loan_status'].value_counts()

 loan_status
Approved    2656
Rejected    1613
Name: count, dtype: int64

In [597]:
#separate data and lables
X = df.drop(columns=' loan_status', axis=1)
Y = df[' loan_status']

Training and Test data

In [598]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size= 0.2, stratify=Y, random_state=1)
print(X.shape, X_train.shape, X_test.shape) #shows that data is split

(4269, 12) (3415, 12) (854, 12)


Model Training - Logistic Regression

In [599]:
model = LogisticRegression(max_iter=1000)

#training the logistic regression model
model.fit(X_train, Y_train)


In [600]:
#accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print(training_data_accuracy)

0.8076134699853587


In [601]:
#accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy on test data: ', test_data_accuracy)

Accuracy on test data:  0.8114754098360656


Making a predictive system

In [603]:
#sample approve data
#input_data = (1,2, 1, 0,9600000,29900000,12,778,2400000,17600000,22700000,8000000)

#sample reject data
#input_data = (6,0, 1, 1,4800000,13500000,10,319,6800000,8300000,13700000,5100000)

#sample reject data BUT failed test
input_data = (2,0, 0, 1,4100000,12200000,8,417,2700000,2200000,8800000,3300000)

#changing the input_dat to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

#reshape the np array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped) #use our model to predict our sample data
print(prediction)

if (prediction[0] == ' Approved'):
    print('APPROVED')
else:
    print('REJECTED')

[' Approved']
APPROVED


