In [1]:
pip install numpy pandas scikit-learn

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [3]:
# Locate the lending data: prefer the `Resources` folder named in the step
# description, and fall back to the notebook's working directory so a copy
# stored next to the notebook keeps working.
lending_data_path = Path("Resources") / "lending_data.csv"
if not lending_data_path.exists():
    lending_data_path = Path("lending_data.csv")

# Read the CSV file into a Pandas DataFrame
df = pd.read_csv(lending_data_path)

# Review the DataFrame
df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


### Step 2: Create the labels set (`y`) from the `loan_status` column, and then create the features (`X`) DataFrame from the remaining columns.

In [5]:
# Split the DataFrame into the target labels and the feature columns

# Labels (y): the loan_status column on its own
y = df.loc[:, "loan_status"]

# Features (X): every column except loan_status
X = df.drop("loan_status", axis="columns")

# Preview both pieces; sep="\n" puts each heading and preview on its own line
print("Labels (y):", y.head(), "\nFeatures (X):", X.head(), sep="\n")

Labels (y):
0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

Features (X):
   loan_size  interest_rate  borrower_income  debt_to_income  num_of_accounts  \
0    10700.0          7.672            52800        0.431818                5   
1     8400.0          6.692            43600        0.311927                3   
2     9000.0          6.963            46100        0.349241                3   
3    10700.0          7.664            52700        0.430740                5   
4    10800.0          7.698            53000        0.433962                5   

   derogatory_marks  total_debt  
0                 1       22800  
1                 0       13600  
2                 0       16100  
3                 1       22700  
4                 1       23000  


In [6]:
# Review the y variable Series
print("Basic info of y:")
# Series.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
y.info()

Basic info of y:
<class 'pandas.core.series.Series'>
RangeIndex: 77536 entries, 0 to 77535
Series name: loan_status
Non-Null Count  Dtype
--------------  -----
77536 non-null  int64
dtypes: int64(1)
memory usage: 605.9 KB
None


In [7]:
# Display the first few rows
# The heading starts with a newline escape ("\n") to leave a blank line above it.
print("\nFirst 5 Rows of y:")
print(y.head())

n\First 5 Rows of y:
0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64


In [8]:
# Review the X variable DataFrame
print("Basic Info of X:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
X.info()

Basic Info of X:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77536 entries, 0 to 77535
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   loan_size         77536 non-null  float64
 1   interest_rate     77536 non-null  float64
 2   borrower_income   77536 non-null  int64  
 3   debt_to_income    77536 non-null  float64
 4   num_of_accounts   77536 non-null  int64  
 5   derogatory_marks  77536 non-null  int64  
 6   total_debt        77536 non-null  int64  
dtypes: float64(3), int64(4)
memory usage: 4.1 MB
None


In [9]:
# Show the heading and the first five feature rows, one per line
print("\nFirst 5 Rows of X:", X.head(), sep="\n")



First 5 Rows of X:
   loan_size  interest_rate  borrower_income  debt_to_income  num_of_accounts  \
0    10700.0          7.672            52800        0.431818                5   
1     8400.0          6.692            43600        0.311927                3   
2     9000.0          6.963            46100        0.349241                3   
3    10700.0          7.664            52700        0.430740                5   
4    10800.0          7.698            53000        0.433962                5   

   derogatory_marks  total_debt  
0                 1       22800  
1                 0       13600  
2                 0       16100  
3                 1       22700  
4                 1       23000  


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [10]:
# Import the train_test_split function
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
# random_state=1 makes the split reproducible; stratify=y keeps the
# loan_status class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

# Report the shape of each resulting dataset
for label, part in (
    ("Training Features Shape:", X_train),
    ("Testing Features Shape:", X_test),
    ("Training Labels Shape:", y_train),
    ("Testing Labels Shape:", y_test),
):
    print(label, part.shape)



Training Features Shape: (62028, 7)
Testing Features Shape: (15508, 7)
Training Labels Shape: (62028,)
Testing Labels Shape: (15508,)


---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [12]:
# Bring in the LogisticRegression estimator from scikit-learn
from sklearn.linear_model import LogisticRegression

# Build the classifier: random_state=1 for reproducibility, and a raised
# iteration cap (max_iter=1000) for the solver
logreg = LogisticRegression(
    random_state=1,
    max_iter=1000,
)

# Train the classifier on the training partition
logreg.fit(X=X_train, y=y_train)



In [13]:
# Report the fitted parameters: one coefficient per feature, then the intercept
for label, value in (
    ("Model Coefficients:", logreg.coef_),
    ("Model Intercept:", logreg.intercept_),
):
    print(label, value)

Model Coefficients: [[ 5.11470070e-03 -1.59054179e-03 -1.24648879e-03  2.90282724e-01
  -3.03820078e-01  1.64798196e+00  2.21120676e-04]]
Model Intercept: [-5.15116384e-08]


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [15]:
# Predict loan_status for the held-out test features
y_pred = logreg.predict(X_test)

# Preview the first ten predicted labels
print("First 10 Predictions:", y_pred[:10])

# Pair each true label with its prediction and write the table to a CSV file.
# y_test.values drops the original row index so both columns align by position.
predictions_df = pd.DataFrame(dict(Actual=y_test.values, Predicted=y_pred))
predictions_df.to_csv("predictions.csv", index=False)

print("Predictions saved to 'predictions.csv'")


First 10 Predictions: [0 0 0 1 0 0 0 0 0 0]
Predictions saved to 'predictions.csv'


### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [16]:
from sklearn.metrics import confusion_matrix, classification_report

In [17]:
# Build the confusion matrix (rows: actual labels, columns: predicted labels)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[14948    60]
 [   33   467]]


In [18]:
# Summarise per-class precision, recall and F1-score on the test set
class_report = classification_report(y_true=y_test, y_pred=y_pred)
print("\nClassification Report:\n", class_report)


Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     15008
           1       0.89      0.93      0.91       500

    accuracy                           0.99     15508
   macro avg       0.94      0.97      0.95     15508
weighted avg       0.99      0.99      0.99     15508



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

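**Answer:** The classification report and confusion matrix show that the logistic regression model predicts both labels well, with an overall accuracy of 0.99. For the `0` (healthy loan) label, precision, recall, and F1-score are all 1.00. For the `1` (high-risk loan) label the metrics are somewhat lower: precision is 0.89, recall is 0.93, and the F1-score is 0.91. When one class has lower metrics like this, the model has more difficulty predicting it well; this is not surprising here, because the test set is heavily imbalanced (15,008 healthy loans versus 500 high-risk loans). The confusion matrix identifies the misclassifications: 60 healthy loans were flagged as high-risk, and 33 high-risk loans were classified as healthy. Overall, to help improve performance on the high-risk class, techniques such as balancing the dataset, feature engineering, or using a different model can be applied.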

---