In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Import the data.
# Build the path from its components so os.path.join actually does the joining
# (a single-argument join is a no-op) and the separator is not hard-coded.
file_path = os.path.join("Resources", "lending_data.csv")
lending_df = pd.read_csv(file_path)

# Preview the first rows as a quick sanity check of the loaded columns.
lending_df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


#### Prediction: The Logistic Regression Model will be the more effective model in predicting whether a loan will be approved.

This data seems like a great fit for the Logistic Regression Model, which is generally most effective when the features are continuous variables and the target is binary. In this instance, we have seven features that are all numeric, and the prediction is whether or not one's loan would be approved, which is a yes-or-no binary variable. The Random Forest Model may still be effective, but it would be more advantageous (compared with Logistic Regression) if the data had more qualitative variables, and potentially ones that were more likely to be noisy on the surface level.

In [3]:
# Define the target and the features:
# the target is the "loan_status" column; the features are every other column.
y = lending_df["loan_status"]
X = lending_df.drop("loan_status", axis=1)

In [4]:
# Split the data into X_train, X_test, y_train, y_test.
# random_state is pinned so the shuffle (and therefore the split) is the same
# on every run; no test_size is passed, so scikit-learn's default proportion
# is used.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [5]:
# Standardize the data.
# StandardScaler comes from the notebook's top import cell rather than being
# imported mid-notebook.
# The scaler is fitted on the training split only, and that same fitted scaler
# is then applied to both splits, so the test split never influences the
# scaling parameters.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# Create a Logistic Regression Classifier.
# LogisticRegression comes from the notebook's top import cell rather than
# being imported mid-notebook. No arguments are passed, so the estimator's
# default settings are used.
classifier = LogisticRegression()

In [7]:
# Train a Logistic Regression model and print the model score.
# The model is fitted on the scaled training features only.
classifier.fit(X_train_scaled, y_train)

# Score the fitted model on each split, then report both values rounded to
# four decimal places.
train_accuracy = classifier.score(X_train_scaled, y_train)
test_accuracy = classifier.score(X_test_scaled, y_test)

print(f"Training Data Accuracy: {round(train_accuracy, 4)}")
print(f"Testing Data Accuracy: {round(test_accuracy, 4)}")

Training Data Accuracy: 0.9943
Testing Data Accuracy: 0.9937


In [None]:
# Train a Random Forest Classifier model and print the model score