# Importing Libraries and Dataset

In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [55]:
# Mount Google Drive at /content/drive so files stored there can be read.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [56]:
import pandas as pd

# Location of the loan dataset on the mounted Drive.
filepath = "/content/drive/My Drive/loan_data.csv"

# Read the CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=filepath)

In [57]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   person_age                      45000 non-null  float64
 1   person_gender                   45000 non-null  object 
 2   person_education                45000 non-null  object 
 3   person_income                   45000 non-null  float64
 4   person_emp_exp                  45000 non-null  int64  
 5   person_home_ownership           45000 non-null  object 
 6   loan_amnt                       45000 non-null  float64
 7   loan_intent                     45000 non-null  object 
 8   loan_int_rate                   45000 non-null  float64
 9   loan_percent_income             45000 non-null  float64
 10  cb_person_cred_hist_length      45000 non-null  float64
 11  credit_score                    45000 non-null  int64  
 12  previous_loan_defaults_on_file  

# Shape Of the Dataset

In [58]:
# (rows, columns) of the loaded frame.
df.shape

(45000, 14)

# Unique values of Target Variable

In [59]:
# Distinct labels present in the target column.
df['loan_status'].unique()

array([1, 0])

## Description of Dataset

The dataset contains 45,000 entries and 14 columns. The columns describe various attributes of individuals and their loan-related characteristics. The target variable is `loan_status`, which is 0 if the person is eligible for a loan and 1 if the person is not (this label meaning should be verified against the dataset's data dictionary).

# Preprocessing Dataset

In [60]:
# Discard every row that contains at least one missing value.
df = df.dropna()

In [61]:
# Drop the columns that are not used as model inputs; the columns that
# remain are all numeric.
df = df.drop(
    [
        'person_gender',
        'person_home_ownership',
        'loan_intent',
        'previous_loan_defaults_on_file',
        'person_education',
    ],
    axis="columns",
)

In [62]:
# Preview the first five rows after the column drop.
df.head()

Unnamed: 0,person_age,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,loan_status
0,22.0,71948.0,0,35000.0,16.02,0.49,3.0,561,1
1,21.0,12282.0,0,1000.0,11.14,0.08,2.0,504,0
2,25.0,12438.0,3,5500.0,12.87,0.44,3.0,635,1
3,23.0,79753.0,0,35000.0,15.23,0.44,2.0,675,1
4,24.0,66135.0,1,35000.0,14.27,0.53,4.0,586,1


# Defining features and target variable

In [63]:
# Target: the loan_status column. Features: every other remaining column.
y = df['loan_status']
X = df.drop('loan_status', axis=1)

In [64]:
# Class balance of the target (number of rows per label).
y.value_counts()

Unnamed: 0_level_0,count
loan_status,Unnamed: 1_level_1
0,35000
1,10000


# Normalisation of features

In [65]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every feature column with MinMaxScaler (default range [0, 1]).
# After this cell X is a NumPy array, no longer a DataFrame.
# NOTE(review): the scaler is fitted on ALL rows, before the train/test split
# is made, so the held-out rows influence the min/max applied to the training
# data (data leakage). Consider splitting first and fitting on the training
# rows only.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Encoding

In [66]:
from sklearn.preprocessing import LabelEncoder
# Encode the target labels as integers.
# NOTE(review): loan_status was shown above to contain only 0 and 1, so the
# encoding presumably leaves the values unchanged; what does change is the
# container, since fit_transform returns a NumPy array rather than a pandas
# Series. Confirm before removing this step.
encoder=LabelEncoder()
y=encoder.fit_transform(y)

# Splitting the dataset into training and testing sets

In [67]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Using Scikit-learn's Logistic Regression

In [68]:
# Fit scikit-learn's logistic regression, with its default settings, on the
# training split.
model_1 = LogisticRegression()
model_1.fit(X=X_train, y=y_train)

In [69]:
# Score the scikit-learn model on the held-out split. accuracy1 is a
# fraction; it is multiplied by 100 when reported.
y_pred1 = model_1.predict(X=X_test)
accuracy1 = accuracy_score(y_true=y_test, y_pred=y_pred1)

# Implementing Logistic Regression from Scratch

In [84]:
class LogitRegression():
  """Binary logistic regression trained with full-batch gradient descent.

  Parameters
  ----------
  learning_rate : float
      Step size applied to every gradient update.
  iterations : int
      Number of gradient-descent updates performed by ``fit``.

  Attributes (set by ``fit``)
  ---------------------------
  W : numpy.ndarray of shape (n,)
      Feature weights, initialised to zeros.
  b : float
      Bias term, initialised to 0.
  X, Y : numpy.ndarray
      Training features (m, n) and flattened training labels (m,).
  m, n : int
      Number of training rows and number of features.
  """

  def __init__(self,learning_rate,iterations):

    self.learning_rate = learning_rate
    self.iterations = iterations

  @staticmethod
  def _sigmoid(z):
    """Return 1 / (1 + exp(-z)) element-wise without overflowing.

    exp(-|z|) always lies in (0, 1], so neither branch below can overflow,
    unlike the direct formula, which overflows for large negative z.
    For z >= 0 the result is bit-identical to the direct formula; for
    z < 0 it agrees up to floating-point rounding.
    """
    z = np.asarray(z, dtype=float)
    e = np.exp(-np.abs(z))
    return np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))

  def fit(self,X,Y):
    """Learn ``W`` and ``b`` from the training data.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Training features.
    Y : array-like with m elements
        Binary labels (0/1). Row or column vectors are flattened.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If X is not 2-D, is empty, or the number of labels differs from
        the number of rows in X.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float).reshape(-1)
    if X.ndim != 2:
      raise ValueError("X must be a 2-D array of shape (m, n)")
    if X.shape[0] == 0:
      raise ValueError("X must contain at least one row")
    if Y.shape[0] != X.shape[0]:
      raise ValueError("X and Y must contain the same number of rows")

    self.m, self.n = X.shape
    self.W = np.zeros(self.n)
    self.b = 0
    self.X = X
    self.Y = Y

    for _ in range(self.iterations):
      self.update_weights()
    return self

  def update_weights(self):
    """Apply one gradient-descent step of the log-loss to ``W`` and ``b``.

    Returns
    -------
    self
    """
    A = self._sigmoid(self.X.dot(self.W) + self.b)
    # Prediction error per training row; drives both gradients.
    residual = np.reshape(A - self.Y, self.m)
    dW = np.dot(self.X.T, residual / self.m)
    db = np.sum(residual) / self.m

    self.W = self.W - self.learning_rate * dW
    self.b = self.b - self.learning_rate * db

    return self

  def predict(self,X):
    """Predict class labels for X.

    Parameters
    ----------
    X : array-like of shape (k, n)
        Samples to classify.

    Returns
    -------
    numpy.ndarray of shape (k,)
        1 where the estimated probability exceeds 0.5, otherwise 0.
    """
    X = np.asarray(X, dtype=float)
    Z = self._sigmoid(X.dot(self.W) + self.b)
    Y = np.where(Z > 0.5, 1, 0)

    return Y

In [85]:
# Train the from-scratch model: 1000 gradient-descent updates with a
# learning rate of 0.01.
model_2 = LogitRegression(iterations=1000, learning_rate=0.01)
model_2.fit(X=X_train, Y=y_train)

<__main__.LogitRegression at 0x7a9faa07bb80>

In [86]:
# Predict on the held-out split with the from-scratch model.
y_pred2 = model_2.predict(X_test)

# Count the matching labels with one vectorised comparison instead of a
# Python loop. np.asarray makes the comparison positional, so it does not
# depend on the container type of y_test.
correctly_classified = int(np.sum(np.asarray(y_test) == y_pred2))

# Accuracy as a percentage (same arithmetic as before: matches / total * 100).
accuracy2 = (correctly_classified / np.size(y_pred2)) * 100

In [87]:
# Report both test-set accuracies as percentages; accuracy1 is a fraction,
# accuracy2 is already a percentage.
for caption, score in (
    ("Accuracy on test set by model_1 (sklearn model) :  ", accuracy1*100),
    ("Accuracy on test set by model_2 (our model)   :  ", accuracy2),
):
    print(caption, score)

Accuracy on test set by model_1 (sklearn model) :   82.47777777777779
Accuracy on test set by model_2 (our model)   :   77.66666666666666


# Final Analysis

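The Logistic Regression model from Scikit-learn reached a test accuracy of about 82.48% (82.47777777777779), while the custom Logistic Regression model implemented from scratch reached about 77.67% (77.66666666666666). Both models performed in a similar range, but the Scikit-learn implementation is more accurate than the custom one. Note that 35,000 of the 45,000 samples (about 77.8%) belong to class 0, so the custom model's accuracy is close to what always predicting the majority class would achieve; with a learning rate of 0.01 and only 1,000 iterations, plain gradient descent has probably not converged yet, and more iterations or a larger learning rate may narrow the gap.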