In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings

In [58]:
# Load the loan dataset from a CSV file located in the notebook's working directory.
data=pd.read_csv('loan_data.csv')

In [59]:
# Re-wrap the loaded table under the name used by the rest of the notebook.
# NOTE(review): pd.read_csv already returns a DataFrame, so this wrap looks redundant.
data_df=pd.DataFrame(data)

In [60]:
# Shape of the DataFrame as a (rows, columns) tuple.
data_df.shape

(45000, 14)

In [61]:
# Print each column's name, non-null count and dtype.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   person_age                      45000 non-null  float64
 1   person_gender                   45000 non-null  object 
 2   person_education                45000 non-null  object 
 3   person_income                   45000 non-null  float64
 4   person_emp_exp                  45000 non-null  int64  
 5   person_home_ownership           45000 non-null  object 
 6   loan_amnt                       45000 non-null  float64
 7   loan_intent                     45000 non-null  object 
 8   loan_int_rate                   45000 non-null  float64
 9   loan_percent_income             45000 non-null  float64
 10  cb_person_cred_hist_length      45000 non-null  float64
 11  credit_score                    45000 non-null  int64  
 12  previous_loan_defaults_on_file  

In [62]:
# Distinct values present in the target column `loan_status`.
data_df['loan_status'].unique()

array([1, 0])

**DESCRIPTION**

The dataset contains 45,000 rows and 14 columns that describe the age, income, qualifications, loan amount and other attributes (such as the interest rate) of 45,000 people; these attributes will help us build a model. The last column, `loan_status`, is the target variable. It takes the values 0 and 1, which, as described here, indicate that a person can and cannot take a loan, respectively (confirm this encoding against the dataset's data dictionary).


**PREPROCESSING OF DATA**

In [64]:
# Drop the columns treated as irrelevant for this model; only the remaining
# columns and the `loan_status` target are kept.
data_df = data_df.drop(
    columns=[
        'person_age',
        'person_gender',
        'loan_intent',
        'person_home_ownership',
        'person_education',
        'previous_loan_defaults_on_file',
        'person_emp_exp',
    ]
)

In [68]:
# Preview the first rows of the columns that remain in data_df.
data_df.head()

Unnamed: 0,person_income,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,loan_status
0,71948.0,35000.0,16.02,0.49,3.0,561,1
1,12282.0,1000.0,11.14,0.08,2.0,504,0
2,12438.0,5500.0,12.87,0.44,3.0,635,1
3,79753.0,35000.0,15.23,0.44,2.0,675,1
4,66135.0,35000.0,14.27,0.53,4.0,586,1


In [69]:
# Drop rows that contain missing values.
# Rebinding the name instead of using inplace=True avoids mutating a frame that
# earlier cells may already have displayed, and keeps the cell safe to re-run.
data_df = data_df.dropna()

In [70]:
# Define the target variable (Y) and the feature matrix (X).
Y = data_df['loan_status']
X = data_df.drop(columns='loan_status')

# Class counts of the target, shown as the cell's output.
Y.value_counts()

Unnamed: 0_level_0,count
loan_status,Unnamed: 1_level_1
0,35000
1,10000


In [71]:
# Normalisation of the data: rescale every feature column with MinMaxScaler
# (default feature range is [0, 1]). fit_transform returns an array by default,
# so X is no longer a DataFrame after this cell.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test rows influence the scaling parameters (data leakage). Consider splitting
# first and fitting the scaler on the training rows only.
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
X=scaler.fit_transform(X)

**ENCODING AND SPLITTING OF DATA**

In [72]:
from sklearn.preprocessing import LabelEncoder

# Encode the target labels as consecutive integers. fit_transform returns a NumPy
# array, so Y stops being a pandas Series here.
# NOTE(review): loan_status already holds only 0 and 1, so the values themselves
# should be unchanged by this step.
encoder=LabelEncoder()
Y=encoder.fit_transform(Y)

In [74]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

**USING SCIKIT-LEARN'S LOGISTIC REGRESSION**

In [86]:
# Fit scikit-learn's LogisticRegression, with its default settings, on the training split.
model1=LogisticRegression()
model1.fit(X_train,Y_train)

In [89]:
# Evaluating the model: predict on the held-out test split and score it.
# accuracy_score returns the fraction of correct predictions (a value between
# 0 and 1), not a percentage.
Y_pred1 = model1.predict(X_test)
accuracy1 = accuracy_score(Y_test, Y_pred1)

**USING LOGISTIC REGRESSION FROM SCRATCH**

In [91]:
class LogitRegression():
  """Binary logistic regression trained with full-batch gradient descent.

  Parameters
  ----------
  learning_rate : float
      Step size applied to every gradient update.
  iterations : int
      Number of gradient-descent updates performed by ``fit``.

  After ``fit`` the instance exposes ``W`` (weights, one per feature) and
  ``b`` (intercept), plus ``m``/``n`` (row/column counts of the training
  data) and references to the training data in ``X``/``Y``.
  """

  def __init__(self,learning_rate,iterations):
    self.learning_rate = learning_rate
    self.iterations = iterations

  @staticmethod
  def _sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)) element-wise.

    The two branches only ever exponentiate a non-positive number, so large
    magnitude logits cannot overflow ``np.exp``.
    """
    z = np.asarray(z, dtype=float)
    out = np.empty_like(z)
    non_negative = z >= 0
    out[non_negative] = 1.0 / (1.0 + np.exp(-z[non_negative]))
    exp_z = np.exp(z[~non_negative])
    out[~non_negative] = exp_z / (1.0 + exp_z)
    return out

  def fit(self,X,Y):
    """Learn weights and intercept from ``X`` (m rows, n features) and labels ``Y``.

    Weights start at zero and ``iterations`` gradient updates are applied.
    Returns ``self`` so calls can be chained.
    """
    self.m, self.n = X.shape
    self.W = np.zeros(self.n)
    self.b = 0
    self.X = X
    self.Y = Y

    for _ in range(self.iterations):
      self.update_weights()
    return self

  def update_weights(self):
    """Apply one gradient-descent step to ``W`` and ``b``; returns ``self``."""
    # Predicted probabilities for the current parameters.
    A = self._sigmoid(self.X.dot(self.W) + self.b)

    # Residuals; the reshape flattens them to length m so that labels given
    # as a column vector are handled as well as a flat vector.
    tmp = A - self.Y.T
    tmp = np.reshape(tmp, self.m)

    # Gradients of the mean log-loss with respect to weights and intercept.
    dW = np.dot(self.X.T, tmp / self.m)
    db = np.sum(tmp) / self.m

    self.W = self.W - self.learning_rate * dW
    self.b = self.b - self.learning_rate * db

    return self

  def predict(self,X):
    """Return 0/1 labels for the rows of ``X``.

    A row is labelled 1 only when its predicted probability is strictly
    greater than 0.5.
    """
    Z = self._sigmoid(X.dot(self.W) + self.b)
    return np.where(Z > 0.5, 1, 0)

  @staticmethod
  def accuracy_score(Y_test,Y_pred2):
    """Return the fraction of positions where ``Y_pred2`` equals ``Y_test``.

    This is a static helper, callable as ``LogitRegression.accuracy_score(...)``
    or on an instance. Being a class attribute, it does not replace any
    module-level function that is also named ``accuracy_score``.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    y_true = np.asarray(Y_test).ravel()
    y_pred = np.asarray(Y_pred2).ravel()
    if y_true.shape != y_pred.shape:
      raise ValueError("Y_test and Y_pred2 must have the same number of elements")
    if y_pred.size == 0:
      raise ValueError("cannot compute accuracy of empty predictions")
    return float(np.mean(y_true == y_pred))

#TRAINING
# Fit the from-scratch model on the same training split that model1 used.
model2 = LogitRegression(learning_rate=0.01, iterations=1000)
model2.fit(X_train, Y_train)

#PREDICTIONS
Y_pred2 = model2.predict(X_test)
#calculating accuracy
# accuracy_score returns a fraction between 0 and 1.
accuracy2 = accuracy_score(Y_test, Y_pred2)

#COMPARING ACCURACY OF BOTH MODELS
# Both accuracies are fractions, so both are scaled to percent here; previously
# only accuracy2 was scaled, which made the two printed numbers incomparable.
print("accuracy of model1 is", accuracy1 * 100)
print("accuracy of model2 is", accuracy2 * 100)


accuracy of model1 is 82.47777777777779
accuracy of model2 is 77.66666666666666


**ANALYSIS**

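The accuracy of scikit-learn's model is 82.47%, while that of our from-scratch model is 77.66%. At first glance both results look satisfactory, with scikit-learn's model (model1) being somewhat more accurate than ours. Note, however, that the classes are imbalanced (35,000 rows with `loan_status` 0 versus 10,000 with `loan_status` 1), so a model that always predicts the majority class would already score about 78%. Our model's accuracy is therefore roughly at that baseline, and accuracy alone is not enough to judge either model.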