In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import statsmodels.api as sm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the loan data from the CSV file (path is relative to the working directory)
loan = pd.read_csv(
    '../.Database/merged_cleaned_loan.csv'
)

# Tabulate how many rows carry each home_ownership value
loan['home_ownership'].value_counts()

home_ownership
MORTGAGE    126680
RENT        109239
OWN          21638
OTHER          176
NONE            43
ANY              1
Name: count, dtype: int64

In [3]:
# The counts above show a single observation with home_ownership == 'ANY'.
# A one-row category causes issues in the regression, so that row is removed:
# look up the index labels of the matching rows and drop them.
loan = loan.drop(index=loan.index[loan['home_ownership'].eq('ANY')])

In [4]:
# Build the predictor frame X by removing, in a single pass:
#   1. the non-numerical / date-related columns,
#   2. annual_inc (log_annual_inc is used in its place),
#   3. the outcome variables.
X = loan.drop(columns=[
    # 1. non-numerical / date-related columns
    'addr_state', 'emp_title', 'issue_d', 'loan_status', 'issue_y', 'issue_m', 'title',
    # 2. replaced by log_annual_inc
    'annual_inc',
    # 3. outcome variables
    'loan_status_grouped', 'grade', 'sub_grade', 'int_rate',
])

# The outcome variable y is the grouped loan status
y = loan['loan_status_grouped']

def transform_emp_length(emp_length):
    """Collapse a raw employment-length label into a coarser tenure bucket.

    Parameters
    ----------
    emp_length : object
        A raw label such as "< 1 year", "3 years" or "10+ years".

    Returns
    -------
    str or None
        "0-1 year", "2-5 years", "6-10 years" or "> 10 years" for the
        recognised labels; None for anything else (for example a missing value).
    """
    # (bucket label, raw labels that belong to it), checked in order
    buckets = (
        ("0-1 year", ("< 1 year", "1 year")),
        ("2-5 years", ("2 years", "3 years", "4 years", "5 years")),
        ("6-10 years", ("6 years", "7 years", "8 years", "9 years", "10 years")),
        ("> 10 years", ("10+ years",)),
    )
    for bucket_label, raw_labels in buckets:
        if emp_length in raw_labels:
            return bucket_label
    # No bucket matched
    return None
    
# Replace each raw emp_length label with its tenure bucket (element-wise)
X['emp_length'] = X['emp_length'].map(transform_emp_length)

In [5]:
# Columns to one-hot encode, and the float64 columns that are standardised after the split
categorical_columns = ['home_ownership', 'term', 'purpose', 'emp_length']
numerical_columns = X.select_dtypes(include=['float64']).columns

# One-hot encode the categorical columns before splitting (dtype=int gives 0/1 values),
# then remove one dummy per categorical variable to avoid the dummy variable trap.
# That leaves K-1 dummies for K groups; each removed dummy is the base group, and the
# coefficients of the remaining dummies are interpreted relative to it.
# NOTE(review): rows where emp_length is missing get 0 in every emp_length dummy
# (get_dummies is called without dummy_na), so they look identical to the
# 'emp_length_0-1 year' base group — confirm this is intended.
X_encoded = (
    pd.get_dummies(X, columns=categorical_columns, dtype=int)
    .drop(columns=[
        'home_ownership_MORTGAGE',
        'term_ 36 months',
        'purpose_car',
        'emp_length_0-1 year',
    ])
)

In [6]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

# Give the test frame exactly the training frame's columns
# (the dummies were created before the split, so the structures should already match)
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Standardise the numerical columns: the scaler is fitted on the training rows only,
# and the same fitted transform is then applied to both splits
scaler = StandardScaler().fit(X_train[numerical_columns])
X_train[numerical_columns] = scaler.transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

In [7]:
# Drop rows that have a NaN in any predictor, then re-select the target values
# by index label so each y stays matched to its surviving X rows.
X_train = X_train.dropna()
y_train = y_train.loc[X_train.index]  # Ensure y_train matches X_train

# Clean the test split the same way: a row with a NaN predictor would otherwise
# produce a NaN predicted probability when the fitted model is evaluated on X_test.
X_test = X_test.dropna()
y_test = y_test.loc[X_test.index]  # Ensure y_test matches X_test

In [8]:
# Checking for highly collinear predictors with variance inflation factors
# (kept commented out; uncomment to recompute the VIF table)
#from statsmodels.stats.outliers_influence import variance_inflation_factor
#vif_data = pd.DataFrame()
#vif_data["feature"] = X_train.columns
#vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
#print(vif_data)

# Remove the highly collinear predictors from both the training and the test frame,
# so the two frames keep identical columns
X_train, X_test = (
    frame.drop(columns=['funded_amnt', 'loan_amnt', 'total_pymnt', 'installment'])
    for frame in (X_train, X_test)
)

In [9]:
# The data has about 5 times as many repaid loans (0) as defaulted loans (1).
# Such class imbalance can skew predictions greatly, so the minority class is
# oversampled with SMOTE (Synthetic Minority Over-sampling Technique).
# Only the training split is resampled here; X_test / y_test are left untouched.
# NOTE(review): SMOTE creates synthetic rows by interpolating between neighbours, so the
# 0/1 dummy columns may take fractional values in the resampled X_train — confirm this is
# acceptable, or consider a categorical-aware variant such as SMOTENC.

from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [10]:
# Using statsmodels library to run Logit Regression

# Add an intercept ('const') column to the independent variables.
# has_constant='add' forces the column to be added to both frames. With the default
# ('skip'), add_constant silently returns the data unchanged when it already contains a
# non-zero constant column, which could leave the train and test design matrices with
# different columns.
X_train_with_const = sm.add_constant(X_train, has_constant='add')
X_test_with_const = sm.add_constant(X_test, has_constant='add')

# Set up the logistic regression of y_train on the training design matrix
logit_model = sm.Logit(y_train, X_train_with_const)

# Fit the model (allowing up to 1000 iterations) and obtain the result object
result = logit_model.fit(maxiter=1000)

Optimization terminated successfully.
         Current function value: 0.636582
         Iterations 6


In [11]:
# Print the summary of the logistic regression results: fit statistics followed by a
# table with each term's coefficient, standard error, z statistic, p-value and 95% interval
print(result.summary())

                            Logit Regression Results                           
Dep. Variable:     loan_status_grouped   No. Observations:               322574
Model:                           Logit   Df Residuals:                   322542
Method:                            MLE   Df Model:                           31
Date:                 Sat, 26 Oct 2024   Pseudo R-squ.:                 0.08161
Time:                         17:37:38   Log-Likelihood:            -2.0534e+05
converged:                        True   LL-Null:                   -2.2359e+05
Covariance Type:             nonrobust   LLR p-value:                     0.000
                                   coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------
const                            1.7777      0.024     72.691      0.000       1.730       1.826
delinq_2yrs                      0.0858      0.004     22.244      0.

In [12]:
# Print odds ratios: exponentiating each fitted logit coefficient gives the multiplicative
# change in the odds of the outcome coded 1 (above 1 raises the odds, below 1 lowers them).
# For the dummy variables the ratio is relative to the dropped base group.
# NOTE(review): the columns in numerical_columns were standardised before fitting, so their
# ratios are presumably per one standard deviation rather than per raw unit — confirm.
odds_ratios = np.exp(result.params)
print('\nOdds Ratios\n',odds_ratios)


Odds Ratios
 const                           5.915979
delinq_2yrs                     1.089556
dti                             1.387065
inq_last_6mths                  1.139564
mths_since_last_delinq          1.022018
pub_rec                         1.016456
total_acc                       0.825195
acc_now_delinq                  1.005455
log_annual_inc                  0.847572
Effective Federal Funds Rate    0.956983
state_unemployment              0.879857
home_ownership_NONE             0.149754
home_ownership_OTHER            0.357781
home_ownership_OWN              0.689036
home_ownership_RENT             1.056945
term_ 60 months                 2.179576
purpose_credit_card             0.140782
purpose_debt_consolidation      0.171083
purpose_educational             0.057970
purpose_home_improvement        0.140426
purpose_house                   0.071415
purpose_major_purchase          0.083889
purpose_medical                 0.113280
purpose_moving                  0.108793
pu