In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report,
    roc_auc_score, roc_curve, auc,
    ConfusionMatrixDisplay
)
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
rom sklearn.model_selection import RandomizedSearchCV

import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC

In [3]:
# Reading the dataset
# Setting low_memory=False to avoid DtypeWarning for columns with mixed data types
data = pd.read_csv("data_clean.csv", low_memory=False)
data.head()

Unnamed: 0.1,Unnamed: 0,loan_amnt,term,int_rate,annual_inc,dti,delinq_2yrs,fico_range_high,inq_last_6mths,open_acc,...,renewable_energy,small_business,vacation,wedding,MORTGAGE,OTHER,OWN,RENT,DirectPay,Y
0,0,3600.0,36,13.99,55000.0,5.91,0.0,679.0,1.0,7.0,...,False,False,False,False,True,False,False,False,False,False
1,1,24700.0,36,11.99,65000.0,16.06,1.0,719.0,4.0,22.0,...,False,True,False,False,True,False,False,False,False,False
2,2,20000.0,60,10.78,63000.0,10.78,0.0,699.0,0.0,6.0,...,False,False,False,False,True,False,False,False,False,False
3,4,10400.0,60,22.45,104433.0,25.37,1.0,699.0,3.0,12.0,...,False,False,False,False,True,False,False,False,False,False
4,5,11950.0,36,13.44,34000.0,10.2,0.0,694.0,0.0,5.0,...,False,False,False,False,False,False,False,True,False,False


In [4]:
w_p = data.loan_condition_int.value_counts()[0] / data.shape[0]
w_n = data.loan_condition_int.value_counts()[1] / data.shape[0]

print(f"Weight of positive values {w_p}")
print(f"Weight of negative values {w_n}")

Weight of positive values 0.8196420376319412
Weight of negative values 0.18035796236805873


In [5]:
train, test = train_test_split(data, test_size=0.2, random_state=42)

print(train.shape)
print(test.shape)

(1743, 86)
(436, 86)


In [6]:
print(train[train['dti'] <= 50].shape)
print(train.shape)

(1743, 86)
(1743, 86)


In [7]:
print(train.shape)
train = train[train['annual_inc'] <= 250000]
train = train[train['dti'] <= 50]
train = train[train['open_acc'] <= 40]
train = train[train['total_acc'] <= 80]
train = train[train['revol_util'] <= 120]
train = train[train['revol_bal'] <= 250000]
print(train.shape)

(1743, 86)
(1721, 86)


In [18]:
X_train, y_train = train.drop(
    'loan_condition_int', axis=1), train.loan_condition_int
X_test, y_test = test.drop(
    'loan_condition_int', axis=1), test.loan_condition_int

In [19]:
X_train = X_train.replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)

In [20]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [21]:
def print_score(true, pred, train=True):
    if train:
        clf_report = pd.DataFrame(
            classification_report(true, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(true, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(true, pred)}\n")

    elif train == False:
        clf_report = pd.DataFrame(
            classification_report(true, pred, output_dict=True))
        print("Test Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(true, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(true, pred)}\n")

In [22]:
X_train = np.array(X_train).astype(np.float32)
X_test = np.array(X_test).astype(np.float32)
y_train = np.array(y_train).astype(np.float32)
y_test = np.array(y_test).astype(np.float32)

In [23]:
def evaluate_nn(true, pred, train=True):
    if train:
        clf_report = pd.DataFrame(
            classification_report(true, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(true, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(true, pred)}\n")

    elif train == False:
        clf_report = pd.DataFrame(
            classification_report(true, pred, output_dict=True))
        print("Test Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(true, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(true, pred)}\n")


def plot_learning_evolution(r):
    plt.figure(figsize=(12, 8))

    plt.subplot(2, 2, 1)
    plt.plot(r.history['loss'], label='Loss')
    plt.plot(r.history['val_loss'], label='val_Loss')
    plt.title('Loss evolution during trainig')
    plt.legend()

    plt.subplot(2, 2, 2)
    plt.plot(r.history['AUC'], label='AUC')
    plt.plot(r.history['val_AUC'], label='val_AUC')
    plt.title('AUC score evolution during trainig')
    plt.legend()


def nn_model(num_columns, num_labels, hidden_units, dropout_rates, learning_rate):
    inp = tf.keras.layers.Input(shape=(num_columns, ))
    x = BatchNormalization()(inp)
    x = Dropout(dropout_rates[0])(x)
    for i in range(len(hidden_units)):
        x = Dense(hidden_units[i], activation='relu')(x)
        x = BatchNormalization()(x)
        x = Dropout(dropout_rates[i + 1])(x)
    x = Dense(num_labels, activation='sigmoid')(x)

    model = Model(inputs=inp, outputs=x)
    model.compile(optimizer=Adam(learning_rate),
                  loss='binary_crossentropy', metrics=[AUC(name='AUC')])
    return model

# XGBoost

In [24]:
# param_grid = dict(
#     n_estimators=stats.randint(10, 500),
#     max_depth=stats.randint(1, 10),
#     learning_rate=stats.uniform(0, 1)
# )

xgb_clf = XGBClassifier(use_label_encoder=False)
# xgb_cv = RandomizedSearchCV(
#     xgb_clf, param_grid, cv=3, n_iter=60,
#     scoring='roc_auc', n_jobs=-1, verbose=1
# )
# xgb_cv.fit(X_train, y_train)

# best_params = xgb_cv.best_params_
# best_params['tree_method'] = 'gpu_hist'
# # best_params = {'n_estimators': 50, 'tree_method': 'gpu_hist'}
# print(f"Best Parameters: {best_params}")

# xgb_clf = XGBClassifier(**best_params)
xgb_clf.fit(X_train, y_train)

y_train_pred = xgb_clf.predict(X_train)
y_test_pred = xgb_clf.predict(X_test)

print_score(y_train, y_train_pred, train=True)
print_score(y_test, y_test_pred, train=False)

Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
              0.0    1.0  accuracy  macro avg  weighted avg
precision     1.0    1.0       1.0        1.0           1.0
recall        1.0    1.0       1.0        1.0           1.0
f1-score      1.0    1.0       1.0        1.0           1.0
support    1401.0  320.0       1.0     1721.0        1721.0
_______________________________________________
Confusion Matrix: 
 [[1401    0]
 [   0  320]]

Test Result:
Accuracy Score: 97.71%
_______________________________________________
CLASSIFICATION REPORT:
                  0.0        1.0  accuracy   macro avg  weighted avg
precision    0.981030   0.955224  0.977064    0.968127      0.976827
recall       0.991781   0.901408  0.977064    0.946595      0.977064
f1-score     0.986376   0.927536  0.977064    0.956956      0.976794
support    365.000000  71.000000  0.977064  436.000000    436.000000
___________________________________________

# Random Forest

In [26]:
# param_grid = dict(
#     n_estimators=stats.randint(100, 1500),
#     max_depth=stats.randint(10, 100),
#     min_samples_split=stats.randint(1, 10),
#     min_samples_leaf=stats.randint(1, 10),
# )

rf_clf = RandomForestClassifier(n_estimators=100)
# rf_cv = RandomizedSearchCV(
#     rf_clf, param_grid, cv=3, n_iter=60,
#     scoring='roc_auc', n_jobs=-1, verbose=1
# )
# rf_cv.fit(X_train, y_train)
# best_params = rf_cv.best_params_
# print(f"Best Parameters: {best_params}")
# rf_clf = RandomForestClassifier(**best_params)
rf_clf.fit(X_train, y_train)

y_train_pred = rf_clf.predict(X_train)
y_test_pred = rf_clf.predict(X_test)

print_score(y_train, y_train_pred, train=True)
print_score(y_test, y_test_pred, train=False)

Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
              0.0    1.0  accuracy  macro avg  weighted avg
precision     1.0    1.0       1.0        1.0           1.0
recall        1.0    1.0       1.0        1.0           1.0
f1-score      1.0    1.0       1.0        1.0           1.0
support    1401.0  320.0       1.0     1721.0        1721.0
_______________________________________________
Confusion Matrix: 
 [[1401    0]
 [   0  320]]

Test Result:
Accuracy Score: 97.02%
_______________________________________________
CLASSIFICATION REPORT:
                  0.0        1.0  accuracy   macro avg  weighted avg
precision    0.970588   0.967742  0.970183    0.969165      0.970125
recall       0.994521   0.845070  0.970183    0.919795      0.970183
f1-score     0.982409   0.902256  0.970183    0.942332      0.969356
support    365.000000  71.000000  0.970183  436.000000    436.000000
___________________________________________

# Logistic Regression

In [27]:
lr_clf = LogisticRegression()

# Train the model
lr_clf.fit(X_train, y_train)

# Predict on train and test data
y_train_pred = lr_clf.predict(X_train)
y_test_pred = lr_clf.predict(X_test)

# Print scores using your defined print_score function
print_score(y_train, y_train_pred, train=True)
print_score(y_test, y_test_pred, train=False)

Train Result:
Accuracy Score: 96.86%
_______________________________________________
CLASSIFICATION REPORT:
                   0.0         1.0  accuracy    macro avg  weighted avg
precision     0.972632    0.949324  0.968623     0.960978      0.968298
recall        0.989293    0.878125  0.968623     0.933709      0.968623
f1-score      0.980892    0.912338  0.968623     0.946615      0.968145
support    1401.000000  320.000000  0.968623  1721.000000   1721.000000
_______________________________________________
Confusion Matrix: 
 [[1386   15]
 [  39  281]]

Test Result:
Accuracy Score: 96.10%
_______________________________________________
CLASSIFICATION REPORT:
                  0.0        1.0  accuracy   macro avg  weighted avg
precision    0.975410   0.885714  0.961009    0.930562      0.960803
recall       0.978082   0.873239  0.961009    0.925661      0.961009
f1-score     0.976744   0.879433  0.961009    0.928088      0.960898
support    365.000000  71.000000  0.961009  436.00000

# Light GBM

In [40]:
import random
from scipy.stats import randint, uniform

In [43]:
# Define hyperparameter grid (consider adjusting values)
param_grid = {
    'n_estimators': randint(100, 1500),  # Number of trees (use randint from SciPy)
    'learning_rate': uniform(0.01, 0.5),  # Learning rate
    'num_leaves': randint(31, 255),  # Number of leaves in each tree
    'min_data_in_leaf': randint(20, 500),  # Minimum data per leaf
    'feature_fraction': uniform(0.6, 1.0),  # Feature fraction for bagging
}

# Try importing SciPy for random number generation (optional)
try:
    from scipy.stats import randint, uniform
except ImportError:
    print("Warning: SciPy not found. Using random module for less control over random number generation.")
    from random import randint, uniform  # Fallback to random module

# Create LGBMClassifier instance
lgb_clf = LGBMClassifier()

# Randomized search for hyperparameter tuning
lgb_cv = RandomizedSearchCV(
    lgb_clf, param_grid, cv=3, n_iter=60,
    scoring='roc_auc', n_jobs=-1, verbose=1
)

try:
    lgb_cv.fit(X_train, y_train)
except ValueError as e:
    print("Error fitting LightGBM:", e)
    # Handle potential errors during fitting (e.g., data type issues)

# Get best parameters
best_params = lgb_cv.best_params_
print(f"Best Parameters: {best_params}")

# Create LightGBM model with best params
lgb_clf = LGBMClassifier(**best_params)

# Train the model
try:
    lgb_clf.fit(X_train, y_train)
except ValueError as e:
    print("Error training LightGBM:", e)
    # Handle potential errors during training (e.g., data type issues)

# Predict on train and test data
y_train_pred = lgb_clf.predict(X_train)
y_test_pred = lgb_clf.predict(X_test)

# Print scores using your defined print_score function
print_score(y_train, y_train_pred, train=True)
print_score(y_test, y_test_pred, train=False)

Fitting 3 folds for each of 60 candidates, totalling 180 fits


105 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
105 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/lightgbm/sklearn.py", line 1142, in fit
    super().fit(
  File "/usr/local/lib/python3.10/dist-packages/lightgbm/sklearn.py", line 842, in fit
    self._Booster = train(
  File "/usr/local/lib/python3.10/dist-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/usr/local/lib/python3.10/dist-packages/lig

[LightGBM] [Info] Number of positive: 320, number of negative: 1401
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000946 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2427
[LightGBM] [Info] Number of data points in the train set: 1721, number of used features: 52
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.185938 -> initscore=-1.476621
[LightGBM] [Info] Start training from score -1.476621
Best Parameters: {'feature_fraction': 0.7213014440431132, 'learning_rate': 0.32127425193033426, 'min_data_in_leaf': 34, 'n_estimators': 492, 'num_leaves': 43}
[LightGBM] [Info] Number of positive: 320, number of negative: 1401
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000463 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2427
[LightGBM] [Info] Numbe

# Stacking Model

In [51]:
from sklearn.ensemble import StackingClassifier
import numpy as np

# Define base models (already trained)
estimators = [
    ('lr', lr_clf),  # Replace with your trained Logistic Regression model
    ('rf', rf_clf),  # Replace with your trained Random Forest model
    ('xgb', xgb_clf),  # Assuming XGBoost is trained
    ('lgb', lgb_clf),  # Assuming LightGBM is trained
]

# Define the meta-model (final layer)
meta_clf = LogisticRegression()  # You can choose a different classifier

# Create the StackingClassifier object
stack_clf = StackingClassifier(estimators=estimators, final_estimator=meta_clf)

def check_data_shapes(X, y):
  """
  Checks if the number of samples in X (features) and y (target) are consistent.
  Prints a warning if shapes are inconsistent.
  """
  if X.shape[0] != y.shape[0]:
    print("Warning: Inconsistent shapes between features (X) and target variable (y).")

def create_stacked_data(X_train, estimators):
  """
  Creates a stacked representation of predictions from base models on the training data.

  Args:
      X_train: Training data for base models.
      estimators: List of tuples containing base model name and trained model.

  Returns:
      A 2D array where each row represents stacked predictions for a sample in X_train.
  """
  stacked_predictions = []
  for estimator_name, model in estimators:
    # Get predictions for the current model (assuming predict_proba for probabilities)
    predictions = model.predict_proba(X_train)[:, 1]  # Extract probabilities for class 1
    stacked_predictions.append(predictions.reshape(-1, 1))  # Reshape to a column vector

  # Concatenate predictions from all models horizontally
  return np.hstack(stacked_predictions)

# Check data shapes before stacking (optional but recommended)
check_data_shapes(X_train, y_train)

# Create stacked training data
stack_X_train = create_stacked_data(X_train, estimators)

# Ensure shapes are consistent before training
check_data_shapes(stack_X_train, y_train)  # This check should now pass

# Train the Stacking model
stack_clf.fit(stack_X_train, y_train)

# Predict using the Stacking model on both training and test data
stack_train_pred_proba = np.vstack([model.predict_proba(X_train)[:, 1] for model_name, model in estimators]).T
stack_train_pred = stack_clf.predict(stack_train_pred_proba)
stack_test_pred_proba = np.vstack([model.predict_proba(X_test)[:, 1] for model_name, model in estimators]).T
stack_test_pred = stack_clf.predict(stack_test_pred_proba)

# Print scores using your defined print_score function
print_score(y_train, stack_train_pred, train=True)
print_score(y_test, stack_test_pred, train=False)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Number of positive: 256, number of negative: 1120
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000090 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 808
[LightGBM] [Info] Number of data points in the train set: 1376, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.186047 -> initscore=-1.475907
[LightGBM] [Info] Start training from score -1.475907
[LightGBM] [Info] Number of positive: 256, number of negative: 1121
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000116 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 807
[LightGBM] [Info] Number of data points in the train set: 1377, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.185911 -> initscore=-1.476799
[LightGBM] [I

# ANN

In [55]:
# building the model

model = Sequential()
model.add(Dense(units=78, activation='relu'))
model.add(Dense(units=39, activation='relu'))
model.add(Dense(units=19, activation='relu'))
model.add(Dense(units=8, activation='relu'))
model.add(Dense(units=4, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=['accuracy'])

In [56]:
model.fit(x=X_train,
          y=y_train,
          epochs=50,
          batch_size=512,
          validation_data=(X_test, y_test), verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7d85e007dc90>

In [63]:
# Predict on train and test data
y_train_pred_proba = model.predict(X_train)
y_test_pred_proba = model.predict(X_test)

# Convert probabilities to class labels (0 or 1) based on a threshold (e.g., 0.5)
y_train_pred = (y_train_pred_proba > 0.5).astype(int)
y_test_pred = (y_test_pred_proba > 0.5).astype(int)

# Print scores using your defined print_score function
print_score(y_train, y_train_pred, train=True)
print_score(y_test, y_test_pred, train=False)

Train Result:
Accuracy Score: 98.95%
_______________________________________________
CLASSIFICATION REPORT:
                   0.0         1.0  accuracy    macro avg  weighted avg
precision     0.995699    0.963190  0.989541     0.979445      0.989654
recall        0.991435    0.981250  0.989541     0.986342      0.989541
f1-score      0.993562    0.972136  0.989541     0.982849      0.989578
support    1401.000000  320.000000  0.989541  1721.000000   1721.000000
_______________________________________________
Confusion Matrix: 
 [[1389   12]
 [   6  314]]

Test Result:
Accuracy Score: 95.87%
_______________________________________________
CLASSIFICATION REPORT:
                  0.0        1.0  accuracy   macro avg  weighted avg
precision    0.975342   0.873239  0.958716    0.924291      0.958716
recall       0.975342   0.873239  0.958716    0.924291      0.958716
f1-score     0.975342   0.873239  0.958716    0.924291      0.958716
support    365.000000  71.000000  0.958716  436.00000