In [1]:
from DeepClassifier import DeepClassifier, create_classifier, predefined_split
from tensorflow.keras import optimizers
import tensorflow
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.inspection import permutation_importance
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import os


# Set TensorFlow's native minimum log level to '1'.
# NOTE(review): this runs after `tensorflow` has already been imported above;
# confirm the setting still takes effect, otherwise it has to precede the import.
os.environ.update({'TF_CPP_MIN_LOG_LEVEL': '1'})

# Read the raw credit-risk table from the working directory.
df = pd.read_csv(filepath_or_buffer='credit_risk_dataset.csv')

# Last expression of the cell: render the loaded frame.
df

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.10,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0,0.11,N,30
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0,0.15,N,19
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,1,0.46,N,28
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0,0.10,N,26


In [2]:
# Class balance of the target: number of rows per `loan_status` value.
df.loc[:, "loan_status"].value_counts()

0    25473
1     7108
Name: loan_status, dtype: int64

In [3]:
# Count the rows flagged as defaults (loan_status == 1) and report their share
# of all rows, truncated to a whole percent.
n_defaults = (df['loan_status'] == 1).sum()
print(f"{int(100 * n_defaults / df.shape[0])}% of Rows are Defaults")

21% of Rows are Defaults


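The dataset is imbalanced: defaults make up only about 22% of the rows (roughly one default for every 3.6 non-defaults). Defaults are therefore the minority class, and they are also the most important class to predict correctly.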

Model performance will be assessed with the macro-averaged F1 score, which gives both classes equal weight and is therefore robust in this kind of imbalanced scenario.

# Impute NaN and Drop Duplicates

In [4]:
# Replace missing values in the two numeric columns that contain NaN with the
# median of the respective column.
# The frame is reassigned instead of calling `fillna(..., inplace=True)` on a
# column selection: that chained in-place form is deprecated and does not
# modify `df` once pandas Copy-on-Write is active.
# NOTE(review): the medians are computed on all rows, i.e. before any
# train/validation split — confirm this is acceptable for the evaluation.
median_fill = {
    "person_emp_length": df["person_emp_length"].median(),
    "loan_int_rate": df["loan_int_rate"].median(),
}
df = df.fillna(value=median_fill)

# Remove fully duplicated rows (the original index labels are kept).
df = df.drop_duplicates()

# Sanity check: show any rows that still contain a null (expected to be empty).
df[df.isnull().any(axis=1)]

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length


In [5]:
# Names of the columns stored with the generic `object` dtype; these are the
# ones that get one-hot encoded next.
categorical_columns = df.select_dtypes(include='object').columns

categorical_columns

Index(['person_home_ownership', 'loan_intent', 'loan_grade',
       'cb_person_default_on_file'],
      dtype='object')

In [6]:
# One-hot encode every categorical column in a single call instead of
# rebuilding the whole frame with `pd.concat` once per column.
# The result has the same layout as the per-column loop: the untouched columns
# keep their order, followed by the dummy columns of each encoded column in
# the order of `categorical_columns`, each named "<column>_<value>".
# `drop_first=True` omits the first level of every encoded column.
df = pd.get_dummies(df, columns=list(categorical_columns), drop_first=True)

df

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_OTHER,person_home_ownership_OWN,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_Y
0,22,59000,123.0,35000,16.02,1,0.59,3,0,0,...,0,1,0,0,0,1,0,0,0,1
1,21,9600,5.0,1000,11.14,0,0.10,2,0,1,...,0,0,0,1,0,0,0,0,0,0
2,25,9600,1.0,5500,12.87,1,0.57,3,0,0,...,1,0,0,0,1,0,0,0,0,0
3,23,65500,4.0,35000,15.23,1,0.53,2,0,0,...,1,0,0,0,1,0,0,0,0,0
4,24,54400,8.0,35000,14.27,1,0.55,4,0,0,...,1,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,1.0,5800,13.16,0,0.11,30,0,0,...,0,1,0,0,1,0,0,0,0,0
32577,54,120000,4.0,17625,7.49,0,0.15,19,0,0,...,0,1,0,0,0,0,0,0,0,0
32578,65,76000,3.0,35000,10.99,1,0.46,28,0,0,...,0,0,0,1,0,0,0,0,0,0
32579,56,150000,5.0,15000,11.48,0,0.10,26,0,0,...,0,1,0,1,0,0,0,0,0,0


In [7]:
# Separate the feature columns from the target column.
X = df.drop(columns='loan_status')
Y = df['loan_status']

# Standardise every feature column (this includes the one-hot columns).
# NOTE(review): the scaler is fitted on all rows, before the cross-validation
# split performed later — consider whether this leakage matters here.
scaler = StandardScaler().fit(X)

# Keep the original row labels on the scaled frame. Without `index=X.index`
# the new frame would get a fresh 0..n-1 index while `Y` keeps the labels left
# over after `drop_duplicates`, so X and Y would no longer be aligned by label.
X = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

In [None]:
# Base estimator handed to the grid search.
# The architecture given here is only a starting value: the grid below sets
# "model__architecture" for every candidate it evaluates.
# NOTE(review): `train_ratio`, `val_ratio`, `batch_size_custom` and the
# `callbacks__*` / `fit__*` / `model__*` prefixes are interpreted by
# DeepClassifier, which is not visible in this notebook — confirm their
# semantics there.
classifcator = DeepClassifier(
    model=create_classifier,
    loss="binary_crossentropy",
    model__architecture={
        "Layers": ["LSTM", "Dense"],
        "ActivationFunctions": ["relu", "sigmoid"],
        "Neurons": [50, 1],
    },
    # Early stopping on the validation loss, restoring the best weights.
    callbacks=EarlyStopping,
    callbacks__monitor="val_loss",
    callbacks__min_delta=0.0001,
    callbacks__patience=20,
    callbacks__verbose=2,
    callbacks__restore_best_weights=True,
    verbose=2,
    epochs=100,
    train_ratio=0.8,
    val_ratio=0.2,
    batch_size_custom=32,
    fit__shuffle=True,
    optimizer=optimizers.Adam,
)


# Search space: 5 learning rates x 1 loss x 4 dense architectures = 20 candidates.
param_grid = {
    "optimizer__learning_rate": [0.00001, 0.0001, 0.001, 0.01, 0.1],
    "loss":["binary_crossentropy"],
    "model__architecture": [
        {
            "Layers": ["Dense", "Dense"],
            "ActivationFunctions": ["relu", "sigmoid"],
            "Neurons": [20, 1],
        },
        {
            "Layers": ["Dense", "Dense"],
            "ActivationFunctions": ["relu", "sigmoid"],
            "Neurons": [50, 1],
        },
        {
            "Layers": ["Dense", "Dense", "Dense"],
            "ActivationFunctions": ["relu", "relu", "sigmoid"],
            "Neurons": [50, 25, 1],
        },
        {
            "Layers": ["Dense", "Dense", "Dense", "Dense"],
            "ActivationFunctions": ["relu", "relu", "relu", "sigmoid"],
            "Neurons": [60, 30, 15, 1],
        },
    ],
}

# 3-fold grid search scored on macro-averaged F1; the best candidate is refit
# on everything passed to `fit` (refit=True).
clf = GridSearchCV(
    refit=True,
    estimator=classifcator,
    param_grid=param_grid,
    # Train all the models in parallel
    n_jobs=-1,
    cv=3,
    #cv=predefined_split(
    #    X,
    #    batch_size_custom=32,
    #    train_ratio=0.8,
    #    val_ratio=0.2,
    #),
    verbose=3,
    scoring='f1_macro',
)


# Add a trailing axis of length 1 so X has shape (rows, features, 1).
# The guard keeps the cell re-runnable: after the first run X is already an
# ndarray, and calling `.values` on it again would raise AttributeError.
if isinstance(X, pd.DataFrame):
    X = X.values.reshape((X.shape[0], X.shape[1], 1))

clf.fit(X, Y)

# Collect the cross-validation results, best-ranked candidate first.
grid_search_cv = (
    pd.DataFrame.from_dict(clf.cv_results_)
    .sort_values(by=["rank_test_score"], ascending=True)
    .head(100)
)

# Index the table by rank and persist it so the results survive the kernel.
grid_search_cv.index = grid_search_cv.rank_test_score
grid_search_cv.to_csv("GridSearchResults.csv")


# Read the saved file back in as the cell's displayed result.
pd.read_csv('GridSearchResults.csv')

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Epoch 1/100
Epoch 1/100
Epoch 1/100
Epoch 1/100
Epoch 1/100
Epoch 1/100
Epoch 1/100
Epoch 1/100
Epoch 1/100
Epoch 1/100
Epoch 1/100
Epoch 1/100
541/541 - 3s - loss: 0.6212 - val_loss: 0.5986 - 3s/epoch - 5ms/step
Epoch 2/100
541/541 - 3s - loss: 0.7027 - val_loss: 0.5849 - 3s/epoch - 5ms/step
Epoch 2/100
541/541 - 3s - loss: 0.8157 - val_loss: 0.9070 - 3s/epoch - 5ms/step
Epoch 2/100
541/541 - 3s - loss: 0.8908 - val_loss: 0.9802 - 3s/epoch - 5ms/step
Epoch 2/100
541/541 - 3s - loss: 0.4183 - val_loss: 0.3630 - 3s/epoch - 5ms/step
Epoch 2/100
541/541 - 3s - loss: 0.5693 - val_loss: 0.5260 - 3s/epoch - 5ms/step
Epoch 2/100
541/541 - 3s - loss: 0.3249 - val_loss: 0.2878 - 3s/epoch - 5ms/step
Epoch 2/100
541/541 - 3s - loss: 0.4288 - val_loss: 0.3522 - 3s/epoch - 5ms/step
Epoch 2/100
541/541 - 3s - loss: 0.3242 - val_loss: 0.3111 - 3s/epoch - 5ms/step
Epoch 2/100
541/541 - 3s - loss: 0.3298 - val_loss: 0.2974 - 3s/epoch - 5ms/st

541/541 - 2s - loss: 0.3400 - val_loss: 0.3790 - 2s/epoch - 3ms/step
Epoch 10/100
541/541 - 2s - loss: 0.2470 - val_loss: 0.2499 - 2s/epoch - 3ms/step
Epoch 10/100
541/541 - 2s - loss: 0.2809 - val_loss: 0.3032 - 2s/epoch - 3ms/step
Epoch 10/100
541/541 - 2s - loss: 0.2476 - val_loss: 0.2851 - 2s/epoch - 3ms/step
Epoch 10/100
541/541 - 2s - loss: 0.2707 - val_loss: 0.2854 - 2s/epoch - 3ms/step
Epoch 10/100
541/541 - 2s - loss: 0.3561 - val_loss: 0.3902 - 2s/epoch - 3ms/step
Epoch 10/100
541/541 - 2s - loss: 0.2590 - val_loss: 0.2688 - 2s/epoch - 4ms/step
Epoch 10/100
541/541 - 2s - loss: 0.4993 - val_loss: 0.4982 - 2s/epoch - 3ms/step
Epoch 11/100
541/541 - 2s - loss: 0.5912 - val_loss: 0.6204 - 2s/epoch - 3ms/step
Epoch 11/100
541/541 - 2s - loss: 0.3460 - val_loss: 0.3630 - 2s/epoch - 3ms/step
Epoch 11/100
541/541 - 2s - loss: 0.2683 - val_loss: 0.2752 - 2s/epoch - 3ms/step
Epoch 11/100
541/541 - 2s - loss: 0.6349 - val_loss: 0.7374 - 2s/epoch - 3ms/step
Epoch 11/100
541/541 - 2s - l

541/541 - 2s - loss: 0.2406 - val_loss: 0.2914 - 2s/epoch - 3ms/step
Epoch 18/100
541/541 - 2s - loss: 0.3341 - val_loss: 0.3635 - 2s/epoch - 3ms/step
Epoch 18/100
541/541 - 2s - loss: 0.4393 - val_loss: 0.4430 - 2s/epoch - 3ms/step
Epoch 19/100
541/541 - 2s - loss: 0.4810 - val_loss: 0.4851 - 2s/epoch - 3ms/step
Epoch 19/100
541/541 - 2s - loss: 0.2572 - val_loss: 0.2773 - 2s/epoch - 3ms/step
Epoch 19/100
541/541 - 2s - loss: 0.3143 - val_loss: 0.3379 - 2s/epoch - 3ms/step
Epoch 19/100
541/541 - 2s - loss: 0.5086 - val_loss: 0.5843 - 2s/epoch - 3ms/step
Epoch 19/100
541/541 - 2s - loss: 0.2305 - val_loss: 0.2299 - 2s/epoch - 3ms/step
Epoch 19/100
541/541 - 2s - loss: 0.2370 - val_loss: 0.4581 - 2s/epoch - 3ms/step
Epoch 19/100
541/541 - 2s - loss: 0.3152 - val_loss: 0.3576 - 2s/epoch - 3ms/step
Epoch 19/100
541/541 - 2s - loss: 0.2659 - val_loss: 0.2955 - 2s/epoch - 3ms/step
Epoch 19/100
541/541 - 2s - loss: 0.2557 - val_loss: 0.2605 - 2s/epoch - 3ms/step
Epoch 19/100
541/541 - 2s - l

541/541 - 2s - loss: 0.4046 - val_loss: 0.4087 - 2s/epoch - 3ms/step
Epoch 27/100
541/541 - 2s - loss: 0.2501 - val_loss: 0.2774 - 2s/epoch - 3ms/step
Epoch 27/100
541/541 - 2s - loss: 0.4195 - val_loss: 0.4156 - 2s/epoch - 3ms/step
Epoch 27/100
541/541 - 2s - loss: 0.3003 - val_loss: 0.3230 - 2s/epoch - 3ms/step
Epoch 27/100
541/541 - 2s - loss: 0.4405 - val_loss: 0.4893 - 2s/epoch - 3ms/step
Epoch 27/100
541/541 - 2s - loss: 0.2232 - val_loss: 0.2665 - 2s/epoch - 3ms/step
Epoch 27/100
541/541 - 2s - loss: 0.3043 - val_loss: 0.3402 - 2s/epoch - 3ms/step
Epoch 27/100
541/541 - 2s - loss: 0.2593 - val_loss: 0.2955 - 2s/epoch - 3ms/step
Epoch 27/100
541/541 - 2s - loss: 0.2489 - val_loss: 0.2543 - 2s/epoch - 3ms/step
Epoch 27/100
541/541 - 2s - loss: 0.3071 - val_loss: 0.2738 - 2s/epoch - 3ms/step
Epoch 3/100
541/541 - 2s - loss: 0.3209 - val_loss: 0.3467 - 2s/epoch - 3ms/step
Epoch 27/100
541/541 - 2s - loss: 0.2336 - val_loss: 0.3168 - 2s/epoch - 3ms/step
Epoch 27/100
541/541 - 2s - lo

In [None]:
# Number of rows in the evaluation slice: 20% of the rows in X, truncated.
val_rows = int(0.2 * X.shape[0])

# Take the trailing `val_rows` rows of the features and of the target.
# NOTE(review): `clf` was fitted with refit=True on all of X, which includes
# these rows — confirm that DeepClassifier keeps this tail out of training.
val_X = X[-val_rows:, :, :]
val_y = Y.iloc[-val_rows:]

# Predict with the best candidate found by the grid search.
y_pred = clf.best_estimator_.predict(val_X)

In [None]:
# Turn every model output into a hard 0/1 label using a 0.5 threshold.
predictions = [(value > 0.5).astype(int) for value in y_pred]

# `labels=[0, 1]` pins the matrix to 2x2 so the four-way unpacking below still
# works if the evaluated slice happens to contain only one class.
tn, fp, fn, tp = confusion_matrix(val_y, predictions, labels=[0, 1]).ravel()

print(f"True Negative: {tn}\nFalse Positive: {fp}\nFalse Negative: {fn}\nTrue Positive: {tp}")

print("\n")

# Report on the thresholded labels — the same ones used for the confusion
# matrix. Passing the raw `y_pred` here would be inconsistent with the matrix
# above and fails outright when `predict` returns continuous scores.
print(classification_report(val_y, predictions, labels=[0, 1], target_names = ['Non-Default', 'Default']))