In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame.
# The path is relative, so it is resolved against the kernel's current working directory.
data_file = Path("Resources/lending_data.csv")
df = pd.read_csv(data_file)

In [3]:
# Features (X) are every column except loan_status; labels (y) are the loan_status column itself
X = df.drop(columns="loan_status")
y = df.loc[:, "loan_status"]

In [4]:
# Check the balance of the labels variable (y) using the value_counts function.
# The result lists how many rows carry each loan_status value, which shows how imbalanced the classes are.
y.value_counts()

0    75036
1     2500
Name: loan_status, dtype: int64

In [5]:
# Split the data into training and testing datasets using train_test_split.
# random_state=1 makes the shuffle reproducible; no test_size is passed, so
# scikit-learn's default split proportion applies.
# NOTE(review): the split is not stratified on y. Given the class imbalance shown by
# value_counts above, consider whether stratify=y is wanted — confirm before changing,
# because it would alter every downstream metric.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [6]:
# Fit a logistic regression model using the training data (X_train and y_train).
# This is the baseline model: it is trained on the original (not resampled) training split.
# Only random_state is set; every other hyperparameter is left at its default.
model = LogisticRegression(random_state=1)
model.fit(X_train, y_train)

In [7]:
# Save the predictions on the testing data labels using the testing feature data (X_test) and the fitted model.
# y_pred is compared against y_test by the metric cells that follow.
y_pred = model.predict(X_test)

In [8]:
# Calculate the balanced accuracy score of the model.
# Balanced accuracy (the mean of per-class recall) is used instead of plain accuracy so the
# large majority class does not dominate the score.
# NOTE(review): the variable name accuracy_score matches sklearn.metrics.accuracy_score.
# That function is not imported in this notebook, but importing it later would clash with this name.
accuracy_score = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced Accuracy Score: {accuracy_score}")

Balanced Accuracy Score: 0.9520479254722232


In [9]:
# Tabulate actual (rows) versus predicted (columns) labels for the baseline model on the test set
confusion_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", confusion_mat, sep="\n")

Confusion Matrix:
[[18663   102]
 [   56   563]]


In [10]:
# Show per-class precision, recall and F1 for the baseline model's test-set predictions
classification_rep = classification_report(y_test, y_pred)
print("Classification Report:", classification_rep, sep="\n")

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.85      0.91      0.88       619

    accuracy                           0.99     19384
   macro avg       0.92      0.95      0.94     19384
weighted avg       0.99      0.99      0.99     19384



In [11]:
# Answer the question: How well does the logistic regression model predict both the 0 (healthy loan) and 1 (high-risk loan) labels?
print("The logistic regression model predicts both '0' (healthy loan) and '1' (high-risk loan) labels with moderate accuracy.")

The logistic regression model predicts both '0' (healthy loan) and '1' (high-risk loan) labels with moderate accuracy.


In [12]:
# Use the RandomOverSampler to resample the data.
# Only the training split is resampled; X_test / y_test are not touched, so the later
# evaluation still runs against the original test set.
# random_state=1 makes the random resampling reproducible.
oversampler = RandomOverSampler(random_state=1)
X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

In [13]:
# Fit a logistic regression model using the resampled data (X_resampled and y_resampled).
# Same estimator settings as the baseline model (random_state=1, other defaults), so the
# only difference between the two models is the training data.
resampled_model = LogisticRegression(random_state=1)
resampled_model.fit(X_resampled, y_resampled)

In [14]:
# Save the predictions on the testing data labels using the testing feature data (X_test) and the fitted resampled model.
# The same X_test as for the baseline model is used, so the two sets of predictions are directly comparable.
resampled_y_pred = resampled_model.predict(X_test)

In [15]:
# Calculate the balanced accuracy score of the resampled model.
# Uses the same metric (balanced_accuracy_score) and the same y_test as the baseline model,
# so the two scores can be compared directly.
resampled_accuracy_score = balanced_accuracy_score(y_test, resampled_y_pred)
print(f"Resampled Balanced Accuracy Score: {resampled_accuracy_score}")

Resampled Balanced Accuracy Score: 0.9936781215845847


In [16]:
# Tabulate actual (rows) versus predicted (columns) labels for the resampled model on the test set
resampled_confusion_mat = confusion_matrix(y_test, resampled_y_pred)
print("Resampled Confusion Matrix:", resampled_confusion_mat, sep="\n")

Resampled Confusion Matrix:
[[18649   116]
 [    4   615]]


In [17]:
# Show per-class precision, recall and F1 for the resampled model's test-set predictions
resampled_classification_rep = classification_report(y_test, resampled_y_pred)
print("Resampled Classification Report:", resampled_classification_rep, sep="\n")

Resampled Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.84      0.99      0.91       619

    accuracy                           0.99     19384
   macro avg       0.92      0.99      0.95     19384
weighted avg       0.99      0.99      0.99     19384



In [18]:
# Answer the question: How well does the logistic regression model with resampled data predict both the 0 (healthy loan) and 1 (high-risk loan) labels?
print("The logistic regression model with resampled data predicts both '0' (healthy loan) and '1' (high-risk loan) labels with improved accuracy compared to the original model.")

The logistic regression model with resampled data predicts both '0' (healthy loan) and '1' (high-risk loan) labels with improved accuracy compared to the original model.
