## Importing the Packages

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
sns.set()

## Importing the Data

In [2]:
loan_data = pd.read_csv("loan-data-preprocessed.csv")

In [3]:
loan_data

Unnamed: 0,id,loan_amnt_USD,loan_amnt_EUR,funded_amnt_USD,funded_amnt_EUR,int_rate,installment_USD,installment_EUR,total_pymnt_USD,total_pymnt_EUR,exchange_rate,issue_data,loan_status,term_months,sub_grade,verification_status,state_address
0,373332.0,9950.0,9038.082814,1000.0,908.350032,0.1825,360.97,327.887111,1072.82,974.496081,1.100897,10.0,1.0,36.0,21.0,0.0,1.0
1,575239.0,12000.0,10900.200379,12000.0,10900.200379,0.2099,324.58,294.832253,959.75,871.788943,1.100897,10.0,1.0,60.0,25.0,1.0,2.0
2,707689.0,10000.0,8924.299805,10000.0,8924.299805,0.1366,340.13,303.542209,3726.25,3325.417215,1.120536,2.0,1.0,36.0,13.0,1.0,0.0
3,709828.0,27200.0,24707.120859,27200.0,24707.120859,0.2899,553.87,503.107832,41913.62,38072.238051,1.100897,10.0,1.0,60.0,6.0,0.0,4.0
4,849994.0,11400.0,10526.076489,11400.0,10526.076489,0.2899,376.09,347.258957,3753.60,3465.849185,1.083025,3.0,0.0,36.0,10.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,68603178.0,14000.0,12804.119629,14000.0,12804.119629,0.2899,421.61,385.596063,41913.62,38333.357469,1.093398,12.0,1.0,36.0,1.0,0.0,1.0
9996,68604253.0,20000.0,18291.599470,20000.0,18291.599470,0.2899,631.26,577.337754,0.00,0.000000,1.093398,12.0,1.0,36.0,6.0,0.0,2.0
9997,68614880.0,5600.0,5121.647852,5600.0,5121.647852,0.2899,180.18,164.789020,0.00,0.000000,1.093398,12.0,1.0,36.0,8.0,1.0,1.0
9998,68615915.0,4000.0,3658.319894,4000.0,3658.319894,0.2899,131.87,120.605661,0.00,0.000000,1.093398,12.0,1.0,36.0,10.0,1.0,2.0


loan status 
0 = '', 'Charged Off', 'Default', 'Late (31-120 days)'
1 = 'Current', 'Fully Paid', 'In Grace Period', 'Issued','Late (16-30 days)'

## Declare the dependent and the independent variables

In [4]:
# Independent features
X = loan_data.drop('loan_status', axis=1)

#Dependent feature
y = loan_data['loan_status']

In [6]:
#Standardization
from sklearn.preprocessing import StandardScaler 
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Build and train the logistic regression model

In [9]:
# Train the logistic regression model
logit = LogisticRegression()
logit.fit(X_train, y_train)

LogisticRegression()

In [11]:
# Predicting the model 
y_pred = logit.predict(X_test)

## Evaluate the models 

In [15]:
from sklearn.metrics import classification_report, roc_auc_score, plot_confusion_matrix, plot_precision_recall_curve

print("The accuracy of logit model is:", accuracy_score(y_test, y_pred))

print(classification_report(y_test, y_pred))


The accuracy of logit model is: 0.9325
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       134
         1.0       0.93      1.00      0.97      1866

    accuracy                           0.93      2000
   macro avg       0.47      0.50      0.48      2000
weighted avg       0.87      0.93      0.90      2000



In [16]:
# Calculate and print the confusion matrix
confusion_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", confusion_mat)

Confusion Matrix:
 [[   0  134]
 [   1 1865]]


##  Use the model to estimate the probability of default

In [18]:
# Predict default probabilities for the test set
y_pred_prob = logit.predict_proba(X_test)[:, 1]

In [22]:
# Predict probabilities for all accounts
probabilities = logit.predict_proba(X)[:, 1]

# Add the probabilities to the DataFrame
loan_data['default_probability'] = probabilities

# Print the DataFrame with default probabilities
print(loan_data[['id', 'default_probability']])

              id  default_probability
0       373332.0             0.453406
1       575239.0             0.215866
2       707689.0             0.447863
3       709828.0             0.746766
4       849994.0             0.497751
...          ...                  ...
9995  68603178.0             0.995883
9996  68604253.0             0.955043
9997  68614880.0             0.947227
9998  68615915.0             0.937932
9999  68616519.0             0.958072

[10000 rows x 2 columns]
