<h2> Predictive Modelling </h2>

In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
import sys

In [41]:
df = pd.read_csv('loan_payments_versions/loan_payments_transformed.csv')
df = df.drop(['id', 'member_id'], axis=1)

good_loan_statuses = [
    "Fully Paid",
    "Does not meet the credit policy. Status:Fully Paid",
]
bad_loan_statuses = [
    "Charged Off",
    "Does not meet the credit policy. Status:Charged Off",
]

historical_df = df[
    df["loan_status"].isin(good_loan_statuses + bad_loan_statuses)
].copy()
historical_df["loan_status"] = historical_df["loan_status"].apply(
    lambda x: 1 if x in good_loan_statuses else 0
)
leaky_columns = [
    'last_payment_date',
    'last_payment_amount',
    'last_credit_pull_date',
    'recoveries',
    'collection_recovery_fee',
    'total_payment',
    'total_rec_prncp',
    'total_rec_int',
    'total_rec_late_fee',
]

historical_df = historical_df.drop(columns=leaky_columns)
categorical_cols = historical_df.select_dtypes(include='object').columns.tolist()
print(categorical_cols)

['term', 'grade', 'sub_grade', 'employment_length', 'home_ownership', 'verification_status', 'issue_date', 'payment_plan', 'purpose', 'earliest_credit_line', 'application_type']


In [42]:
historical_df_encoded = pd.get_dummies(historical_df,columns=categorical_cols, drop_first=True)

In [43]:
X = historical_df_encoded.drop('loan_status', axis=1)
Y = historical_df_encoded['loan_status']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA()
X_pca = pca.fit_transform(X_scaled)

explained_variance = np.cumsum(pca.explained_variance_ratio_)
n_components = np.argmax(explained_variance >= 0.95) + 1

print(f"Number of components that explain at least 95% of variance: {n_components}")

Number of components that explain at least 95% of variance: 633


In [44]:
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)

X_train, X_test, y_train, y_test = train_test_split(X_pca, Y, test_size=0.3, random_state=42, stratify=Y)

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
    "LightGBM": LGBMClassifier(random_state=42)
}

for name, model in models.items():
    print(f"--- Training and Evaluating {name} ---")
    
    model.fit(X_train, y_train)
    
    predictions = model.predict(X_test)
    
    print(f"\nAccuracy Score: {accuracy_score(y_test, predictions):.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))
    print("====================================================\n")


--- Training and Evaluating Logistic Regression ---

Accuracy Score: 0.8846

Confusion Matrix:
[[ 619 1066]
 [  72 8101]]

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.37      0.52      1685
           1       0.88      0.99      0.93      8173

    accuracy                           0.88      9858
   macro avg       0.89      0.68      0.73      9858
weighted avg       0.89      0.88      0.86      9858


--- Training and Evaluating Random Forest ---

Accuracy Score: 0.8284

Confusion Matrix:
[[  29 1656]
 [  36 8137]]

Classification Report:
              precision    recall  f1-score   support

           0       0.45      0.02      0.03      1685
           1       0.83      1.00      0.91      8173

    accuracy                           0.83      9858
   macro avg       0.64      0.51      0.47      9858
weighted avg       0.77      0.83      0.76      9858


--- Training and Evaluating LightGBM ---
[LightGBM] [Info]

In [45]:
#Addressing Class Imbalance because of which the models have a very low recall score

X_train, X_test, y_train, y_test = train_test_split(X_pca, Y, test_size=0.3, random_state=42, stratify=Y)

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced'),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, class_weight='balanced'),
    "LightGBM": LGBMClassifier(random_state=42, class_weight='balanced')
}

for name, model in models.items():
    print(f"--- Training and Evaluating {name} (Balanced) ---")
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(f"\nAccuracy Score: {accuracy_score(y_test, predictions):.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))
    print("====================================================\n")

--- Training and Evaluating Logistic Regression (Balanced) ---

Accuracy Score: 0.8842

Confusion Matrix:
[[1349  336]
 [ 806 7367]]

Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.80      0.70      1685
           1       0.96      0.90      0.93      8173

    accuracy                           0.88      9858
   macro avg       0.79      0.85      0.82      9858
weighted avg       0.90      0.88      0.89      9858


--- Training and Evaluating Random Forest (Balanced) ---

Accuracy Score: 0.8295

Confusion Matrix:
[[  12 1673]
 [   8 8165]]

Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.01      0.01      1685
           1       0.83      1.00      0.91      8173

    accuracy                           0.83      9858
   macro avg       0.71      0.50      0.46      9858
weighted avg       0.79      0.83      0.75      9858


--- Training and Evaluating LightGBM

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, Y, test_size=0.3, random_state=42, stratify=Y)

# --- Applying SMOTE ---
print("Shape of X_train before SMOTE:", X_train.shape)
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print("Shape of X_train after SMOTE:", X_train_smote.shape)

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    "LightGBM": LGBMClassifier(random_state=42)
}

for name, model in models.items():
    print(f"\n--- Training and Evaluating {name} (with SMOTE) ---")
    # Train on the SMOTE-resampled training data
    model.fit(X_train_smote, y_train_smote)
    
    # Evaluate on the original, untouched test data
    predictions = model.predict(X_test)
    
    print(f"\nAccuracy Score: {accuracy_score(y_test, predictions):.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))
    print("====================================================\n")

Shape of X_train before SMOTE: (23002, 633)
Shape of X_train after SMOTE: (38142, 633)

--- Training and Evaluating Logistic Regression (with SMOTE) ---

Accuracy Score: 0.9066

Confusion Matrix:
[[1337  348]
 [ 573 7600]]

Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.79      0.74      1685
           1       0.96      0.93      0.94      8173

    accuracy                           0.91      9858
   macro avg       0.83      0.86      0.84      9858
weighted avg       0.91      0.91      0.91      9858



--- Training and Evaluating Random Forest (with SMOTE) ---

Accuracy Score: 0.8039

Confusion Matrix:
[[ 225 1460]
 [ 473 7700]]

Classification Report:
              precision    recall  f1-score   support

           0       0.32      0.13      0.19      1685
           1       0.84      0.94      0.89      8173

    accuracy                           0.80      9858
   macro avg       0.58      0.54      0.54      9858

In [None]:
# Parameter grids for each model
param_grid_lr = {
    'penalty': ['elasticnet'],
    'C': [0.1, 1, 10],
    'l1_ratio': [0.25, 0.5, 0.75],
    'solver': ['saga'],
    'max_iter': [2000]
}

param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5]
}

param_grid_lgbm = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'num_leaves': [31, 50]
}

# Models and grids to iterate over
tuned_models = {
    "Logistic Regression (ElasticNet)": (LogisticRegression(random_state=42), param_grid_lr),
    "Random Forest": (RandomForestClassifier(random_state=42, n_jobs=-1), param_grid_rf),
    "LightGBM": (LGBMClassifier(random_state=42), param_grid_lgbm)
}

for name, (model, params) in tuned_models.items():
    print(f"\n--- Tuning {name} ---")
    # Using F1 score for the minority class (0) as the scoring metric
    grid_search = GridSearchCV(model, params, cv=3, scoring='f1', n_jobs=-1, verbose=1)
    grid_search.fit(X_train_smote, y_train_smote)
    
    print(f"\nBest Parameters for {name}: {grid_search.best_params_}")
    
    best_model = grid_search.best_estimator_
    predictions = best_model.predict(X_test)
    
    print(f"\nResults for Tuned {name}:")
    print(f"Accuracy Score: {accuracy_score(y_test, predictions):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))
    print("----------------------------------------------------\n")


--- Tuning Logistic Regression (ElasticNet) ---
Fitting 3 folds for each of 9 candidates, totalling 27 fits





Best Parameters for Logistic Regression (ElasticNet): {'C': 10, 'l1_ratio': 0.25, 'max_iter': 2000, 'penalty': 'elasticnet', 'solver': 'saga'}

Results for Tuned Logistic Regression (ElasticNet):
Accuracy Score: 0.7505

Classification Report:
              precision    recall  f1-score   support

           0       0.37      0.69      0.49      1685
           1       0.92      0.76      0.84      8173

    accuracy                           0.75      9858
   macro avg       0.65      0.73      0.66      9858
weighted avg       0.83      0.75      0.78      9858

----------------------------------------------------


--- Tuning Random Forest ---
Fitting 3 folds for each of 12 candidates, totalling 36 fits

Best Parameters for Random Forest: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}

Results for Tuned Random Forest:
Accuracy Score: 0.8091

Classification Report:
              precision    recall  f1-score   support

           0       0.34      0.13      0.18    

In [50]:
# Parameter grids for each model
param_grid_lr = {
    'penalty': ['elasticnet'],
    'C': [0.1, 1, 10],
    'l1_ratio': [0.25, 0.5, 0.75],
    'solver': ['saga'],
    'max_iter': [5000]
}

param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5]
}

param_grid_lgbm = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'num_leaves': [31, 50]
}

# Models and grids to iterate over
tuned_models = {
    "Logistic Regression (ElasticNet)": (LogisticRegression(random_state=42), param_grid_lr),
    "Random Forest": (RandomForestClassifier(random_state=42, n_jobs=-1), param_grid_rf),
    "LightGBM": (LGBMClassifier(random_state=42), param_grid_lgbm)
}

for name, (model, params) in tuned_models.items():
    print(f"\n--- Tuning {name} ---")
    # Using F1 score for the minority class (0) as the scoring metric
    grid_search = GridSearchCV(model, params, cv=3, scoring='f1_macro', n_jobs=-1, verbose=1)
    grid_search.fit(X_train_smote, y_train_smote)
    
    print(f"\nBest Parameters for {name}: {grid_search.best_params_}")
    
    best_model = grid_search.best_estimator_
    predictions = best_model.predict(X_test)
    
    print(f"\nResults for Tuned {name}:")
    print(f"Accuracy Score: {accuracy_score(y_test, predictions):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))
    print("----------------------------------------------------\n")


--- Tuning Logistic Regression (ElasticNet) ---
Fitting 3 folds for each of 9 candidates, totalling 27 fits





Best Parameters for Logistic Regression (ElasticNet): {'C': 10, 'l1_ratio': 0.75, 'max_iter': 5000, 'penalty': 'elasticnet', 'solver': 'saga'}

Results for Tuned Logistic Regression (ElasticNet):
Accuracy Score: 0.8322

Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.75      0.60      1685
           1       0.94      0.85      0.89      8173

    accuracy                           0.83      9858
   macro avg       0.72      0.80      0.75      9858
weighted avg       0.87      0.83      0.84      9858

----------------------------------------------------


--- Tuning Random Forest ---
Fitting 3 folds for each of 12 candidates, totalling 36 fits

Best Parameters for Random Forest: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}

Results for Tuned Random Forest:
Accuracy Score: 0.8091

Classification Report:
              precision    recall  f1-score   support

           0       0.34      0.13      0.18    

In [51]:
class Logger(object):
    def __init__(self, filename="model_evaluation_results.txt"):
        self.terminal = sys.stdout
        self.log = open(filename, "w")

    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        self.terminal.flush()
        self.log.flush()

sys.stdout = Logger()
sys.stdout.log.close()
sys.stdout = sys.stdout.terminal
print("\nProcess finished. All results saved to 'model_evaluation_results.txt'")


Process finished. All results saved to 'model_evaluation_results.txt'
