# Importing Libraries and Dataset





In [75]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from google.colab import drive
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [49]:
# Mount Google Drive at /content/drive so the dataset stored there can be read by path.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [61]:
# Location of the loan dataset inside the mounted Google Drive.
DATA_PATH = '/content/drive/My Drive/loan_data.csv'
df = pd.read_csv(DATA_PATH)

In [62]:
# Preview the first rows to check the column names and value formats.
print(df.head())

   person_age person_gender person_education  person_income  person_emp_exp  \
0        22.0        female           Master        71948.0               0   
1        21.0        female      High School        12282.0               0   
2        25.0        female      High School        12438.0               3   
3        23.0        female         Bachelor        79753.0               0   
4        24.0          male           Master        66135.0               1   

  person_home_ownership  loan_amnt loan_intent  loan_int_rate  \
0                  RENT    35000.0    PERSONAL          16.02   
1                   OWN     1000.0   EDUCATION          11.14   
2              MORTGAGE     5500.0     MEDICAL          12.87   
3                  RENT    35000.0     MEDICAL          15.23   
4                  RENT    35000.0     MEDICAL          14.27   

   loan_percent_income  cb_person_cred_hist_length  credit_score  \
0                 0.49                         3.0           561  

# Shape of the Dataset

In [63]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(45000, 14)

# Unique values of target variable

In [64]:
# Distinct labels present in the target column.
df['loan_status'].unique()

array([1, 0])

In [65]:
# Per-column dtype and non-null count; used to spot missing values and non-numeric columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   person_age                      45000 non-null  float64
 1   person_gender                   45000 non-null  object 
 2   person_education                45000 non-null  object 
 3   person_income                   45000 non-null  float64
 4   person_emp_exp                  45000 non-null  int64  
 5   person_home_ownership           45000 non-null  object 
 6   loan_amnt                       45000 non-null  float64
 7   loan_intent                     45000 non-null  object 
 8   loan_int_rate                   45000 non-null  float64
 9   loan_percent_income             45000 non-null  float64
 10  cb_person_cred_hist_length      45000 non-null  float64
 11  credit_score                    45000 non-null  int64  
 12  previous_loan_defaults_on_file  

The dataset contains 14 columns and 45,000 entries; each column represents a finance-related attribute with its own specification. The output variable of the dataset is loan_status, which is either 0 or 1: 0 denotes that a person is not eligible for a loan, and 1 denotes that the person is eligible.


# Pre-processing of Dataset

In [66]:
# Remove every row that has at least one missing value. The result is
# reassigned rather than using inplace=True, so the cell reads as an explicit
# transformation of df.
df = df.dropna()

In [68]:
# Columns removed before modelling. The first four are string-valued (dtype
# `object` in the df.info() output above) and are dropped rather than encoded.
# NOTE(review): the dtype of previous_loan_defaults_on_file is cut off in the
# df.info() output — presumably non-numeric as well; confirm.
NON_NUMERIC_COLUMNS = [
    'person_gender',
    'person_education',
    'person_home_ownership',
    'loan_intent',
    'previous_loan_defaults_on_file',
]
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the columns are already gone.
df = df.drop(columns=NON_NUMERIC_COLUMNS, errors='ignore')
print(df.head())


   person_age  person_income  person_emp_exp  loan_amnt  loan_int_rate  \
0        22.0        71948.0               0    35000.0          16.02   
1        21.0        12282.0               0     1000.0          11.14   
2        25.0        12438.0               3     5500.0          12.87   
3        23.0        79753.0               0    35000.0          15.23   
4        24.0        66135.0               1    35000.0          14.27   

   loan_percent_income  cb_person_cred_hist_length  credit_score  loan_status  
0                 0.49                         3.0           561            1  
1                 0.08                         2.0           504            0  
2                 0.44                         3.0           635            1  
3                 0.44                         2.0           675            1  
4                 0.53                         4.0           586            1  


In [69]:
# Separate the label column (Y) from the remaining feature columns (X).
Y = df['loan_status']
X = df.drop(columns='loan_status')


In [70]:
# Class balance of the target. The counts (35000 vs 10000 in the recorded output)
# show the labels are skewed, which matters when reading plain accuracy.
Y.value_counts()

Unnamed: 0_level_0,count
loan_status,Unnamed: 1_level_1
0,35000
1,10000


# Normalization of the dataset

In [72]:
# Rescale every feature column with min-max scaling.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split performed in a later cell, so the test rows influence the
# scaling applied to the training rows. Consider fitting on X_train only.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

# Encoding the dataset

In [73]:
# Encode the target labels as integers. loan_status is already 0/1 (see the
# unique() output above), so the label values themselves do not change.
encoder = LabelEncoder()
encoder.fit(Y)
Y = encoder.transform(Y)

# Split the dataset into training and testing sets.

In [74]:
# Hold out 15% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
TEST_FRACTION = 0.15
SPLIT_SEED = 48
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

# Using the imported Logistic Regression model.

In [78]:
# Baseline: scikit-learn's LogisticRegression, constructed with no arguments
# (library defaults) and fitted on the training split.
model1=LogisticRegression()
model1.fit(X_train,Y_train)


In [84]:
# Predict on the held-out split and express the accuracy as a percentage.
Ypred1 = model1.predict(X_test)
accuracy1 = 100 * accuracy_score(Y_test, Ypred1)

# Implementing a Logistic Regression model from scratch


In [80]:
class LogitRegression():
  """Binary logistic regression trained with full-batch gradient descent.

  Parameters
  ----------
  learning_rate : float, default 0.01
      Step size applied to each gradient update.
  iterations : int, default 1000
      Number of full-batch gradient steps performed by ``fit``.

  Attributes set by ``fit``
  -------------------------
  W : numpy.ndarray of shape (n_features,)
      Learned weights.
  b : float
      Learned intercept.
  X, Y : numpy.ndarray
      Training features (m, n) and flattened labels (m,).
  m, n : int
      Number of training rows and feature columns.
  """

  def __init__(self,learning_rate=0.01,iterations=1000):
    self.learning_rate = learning_rate
    self.iterations = iterations

  @staticmethod
  def _sigmoid(z):
    """Return 1 / (1 + exp(-z)) without overflowing.

    Only exponentials of non-positive numbers are evaluated, so very large
    |z| cannot overflow; the naive form overflows for very negative z.
    """
    z = np.asarray(z, dtype=float)
    e = np.exp(-np.abs(z))
    return np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))

  def fit(self,X,Y):
    """Learn ``W`` and ``b`` from the training data.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Feature matrix.
    Y : array-like with m elements
        Binary labels (0/1). A 1-D array, a column vector or a row vector
        are all accepted; the labels are flattened to shape (m,).

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If the number of labels differs from the number of rows in ``X``.
    """
    self.X = np.asarray(X, dtype=float)
    # Flatten so that (m,), (m, 1) and (1, m) labels all behave the same and
    # cannot broadcast into an (m, m) matrix during the update.
    self.Y = np.asarray(Y, dtype=float).reshape(-1)
    self.m, self.n = self.X.shape
    if self.Y.shape[0] != self.m:
      raise ValueError(
        "X has %d rows but Y has %d labels" % (self.m, self.Y.shape[0])
      )

    # Start from the all-zero model.
    self.W = np.zeros(self.n)
    self.b = 0

    for _ in range(self.iterations):
      self.update_weights()

    return self

  def update_weights(self):
    """Perform one full-batch gradient-descent step on ``W`` and ``b``."""
    A = self._sigmoid(self.X.dot(self.W) + self.b)
    # Prediction error per training row, always as a flat vector of length m.
    error = np.reshape(A - np.reshape(self.Y, -1), self.m)
    dW = np.dot(self.X.T, error / self.m)
    db = np.sum(error) / self.m

    self.W = self.W - self.learning_rate * dW
    self.b = self.b - self.learning_rate * db

    return self

  def predict(self,X):
    """Return the predicted class (0 or 1) for each row of ``X``.

    A row is assigned class 1 when its predicted probability is strictly
    greater than 0.5, otherwise class 0.
    """
    Z = self._sigmoid(np.asarray(X, dtype=float).dot(self.W) + self.b)
    return np.where(Z > 0.5, 1, 0)

In [82]:
# Train the from-scratch implementation: 1000 gradient-descent updates with a
# step size of 0.01, each update using the whole training split.
model2 = LogitRegression(learning_rate=0.01, iterations=1000)
model2.fit(X_train, Y_train)

<__main__.LogitRegression at 0x7b6a025e6290>

In [85]:
# Score the from-scratch model with the same metric function used for the
# scikit-learn model, so the two percentages are computed identically and are
# directly comparable. This replaces a hand-written counting loop.
Ypred2 = model2.predict(X_test)
accuracy2 = accuracy_score(Y_test, Ypred2) * 100
print("Accuracy given by sklearn model :",accuracy1)
print("Accuracy given by model made from scratch:",accuracy2)


Accuracy given by sklearn model : 82.34074074074074
Accuracy given by model made from scratch: 77.77777777777779


# Final Analysis

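We obtained an accuracy of about 82.34% with the scikit-learn model and about 77.78% with the model built from scratch. Although both models are based on the same concept of logistic regression, we can observe that the scikit-learn model gives a somewhat more accurate result. Note that 77.78% equals the share of class 0 in the full dataset (35,000 of 45,000), so the from-scratch model may be predicting the majority class for almost every sample; this should be checked (for example with a confusion matrix) before drawing conclusions.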


