In [17]:
from google.colab import files
uploaded = files.upload()

Saving loan_data.csv to loan_data (1).csv


In [209]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [210]:
import io
for filename in uploaded.keys():
    df = pd.read_csv(io.BytesIO(uploaded[filename]))
print(df.head())

   person_age person_gender person_education  person_income  person_emp_exp  \
0          22        female           Master          71948               0   
1          21        female      High School          12282               0   
2          25        female      High School          12438               3   
3          23        female         Bachelor          79753               0   
4          24          male           Master          66135               1   

  person_home_ownership  loan_amnt loan_intent  loan_int_rate  \
0                  RENT      35000    PERSONAL          16.02   
1                   OWN       1000   EDUCATION          11.14   
2              MORTGAGE       5500     MEDICAL          12.87   
3                  RENT      35000     MEDICAL          15.23   
4                  RENT      35000     MEDICAL          14.27   

   loan_percent_income  cb_person_cred_hist_length  credit_score  \
0                 0.49                           3           561  

In [211]:
print(df.shape)

(9999, 14)


In [212]:
print(df['loan_status'].unique())

[1 0]


The dataset consists of 9999 entries with personal information about each person such as their age, gender, education, income, employment experience and home ownership. We also have information about their loan request such as loan amount, loan intent, loan interest rate and loan percent income. The dataset also provides insights on the credit background of each individual through information such as credit history length, credit score and previous loan defaults on file. Finally, we have the loan status which tells us whether the loan was approved or not, which is also the output we would like to predict in this assignment.

In [213]:
df = df.drop(columns=['person_age', 'person_gender', 'person_education', 'person_home_ownership', 'loan_intent'])

In [214]:
print(df.isna().sum()) #none of the columns have NaN values hence no need to perform removing action

person_income                     0
person_emp_exp                    0
loan_amnt                         0
loan_int_rate                     0
loan_percent_income               0
cb_person_cred_hist_length        0
credit_score                      0
previous_loan_defaults_on_file    0
loan_status                       0
dtype: int64


In [215]:
# previous_loan_defaults_on_file also impacts loan_status
# hence convert it to binary integer values for analysis
df['previous_loan_defaults_on_file'] = df['previous_loan_defaults_on_file'].map({'Yes': 1, 'No': -1})

In [216]:
# Shuffle the rows before splitting to ensure no bias and get fair sets for testing and training
df_shuffled = df.sample(frac=1, random_state=69).reset_index(drop=True)

# Using 80% for training and 20% for testing
split_index = (int)(9999 * 0.8)

# Split into training and testing sets
train_data = df_shuffled[:split_index]
test_data = df_shuffled[split_index:]

x_train = train_data.drop('loan_status', axis=1)
y_train = train_data['loan_status']

x_test = test_data.drop('loan_status', axis=1)
y_test = test_data['loan_status']

In [221]:
# Logistic Regression using scikit learn

model = LogisticRegression(max_iter = 8000)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [222]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of imported model: {accuracy}")

Accuracy of imported model: 0.889


In [219]:
# Logistic Regression manually

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def cost_function(input, output, weights, bias):
    n = len(output)

    z = np.dot(input, weights) + bias

    cost = (-1 / n) * np.sum(output * np.log(sigmoid(z)) + (1 - output) * np.log(1 - sigmoid(z)))

    return cost

def gradient_descent(input, output, weights, bias, learning_rate, iterations):
    n = len(output)

    for i in range(iterations):

        z = np.dot(input, weights) + bias

        dw = (1 / n) * np.dot(input.T, (sigmoid(z) - output))
        db = (1 / n) * np.sum(sigmoid(z) - output)

        weights -= learning_rate * dw
        bias -= learning_rate * db

    return weights, bias

def logistic_regression(input, output):

    weights = np.zeros(input.shape[1])
    bias = 0

    weights, bias = gradient_descent(input, output, weights, bias, 0.01, 2500)

    return weights, bias

def predict(input, weights, bias):
    z = np.dot(input, weights) + bias
    predictions = sigmoid(z)
    return [1 if i >= 0.5 else 0 for i in predictions]

# Searched how to standardize the training set (was causing overflow without this and also improved accuracy)
train_mean = np.mean(x_train, axis = 0)
train_std = np.std(x_train, axis = 0)
x_train = (x_train - train_mean) / train_std
x_test = (x_test - train_mean) / train_std

weights, bias = logistic_regression(x_train, y_train)

y_pred = predict(x_test, weights, bias)


In [220]:
accuracy = np.mean(y_pred == y_test)
print(f"Accuracy of manual model: {accuracy}")

Accuracy of manual model: 0.886


Accuracy of imported model: 0.8825  
Accuracy of manual model: 0.8860  
Earlier (before standardizing the dataset in manual model), I was getting overflow and an accuracy of 0.747. On searching about the issue, I found that the overflow was due to some very large negative values being elements of z. If an element in z is a very large negative number, exp(-z) will become a very large value and cause overflow error in np.exp(-z).  
I think accuracy of my model is comparable to the imported model because the imported scikit learn model doesn't standardize the inputs, but my model's accuracy was somehow improved by standardization by a significant amount.