In [1]:
from DeepClassifier import DeepClassifier, create_classifier
from tensorflow.keras import optimizers
import tensorflow
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.inspection import permutation_importance
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import os
import warnings
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
# Silence DeprecationWarning messages for the rest of the notebook.
warnings.filterwarnings("ignore", category=DeprecationWarning)


# NOTE(review): this is set after `import tensorflow` has already run in this cell;
# presumably it is meant to reduce TensorFlow's native log output -- confirm it still
# takes effect when set this late, or move it above the TensorFlow imports.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

# Load the raw dataset from a CSV file in the working directory.
df = pd.read_csv('credit_risk_dataset.csv')

# Display the loaded frame.
df

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.10,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0,0.11,N,30
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0,0.15,N,19
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,1,0.46,N,28
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0,0.10,N,26


In [2]:
# Class balance of the target: number of rows for each loan_status value.
df.loc[:, "loan_status"].value_counts()

0    25473
1     7108
Name: loan_status, dtype: int64

In [3]:
# Share of rows with loan_status == 1, truncated to a whole percentage.
n_defaults = df[df['loan_status'] == 1].shape[0]
n_rows = df.shape[0]
print(f"{int(100*n_defaults / n_rows)}% of Rows are Defaults")

21% of Rows are Defaults


The dataset is imbalanced: defaults make up roughly one in five rows (about 1:3.6 against non-defaults). Defaults are therefore the minority class, and they are also the most important class to detect.

Model performance will be assessed with the macro-averaged F1 score, which gives both classes equal weight and is therefore robust to this kind of class imbalance.

# Impute NaN and Drop Duplicates

In [4]:
# Impute missing values with the column median. The result is assigned back to the
# column: calling fillna(..., inplace=True) on `df[col]` is chained assignment, which
# emits a FutureWarning on recent pandas 2.x and leaves `df` unchanged under copy-on-write.
for column in ["person_emp_length", "loan_int_rate"]:
    df[column] = df[column].fillna(df[column].median())

# Drop exact duplicate rows (rebinding the name instead of mutating in place).
df = df.drop_duplicates()

# Sanity check: rows that still contain a missing value (expected to be empty).
df[df.isnull().any(axis=1)]

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length


In [5]:
# Columns stored with the generic `object` dtype are treated as categorical.
is_object_dtype = df.dtypes == 'object'
categorical_columns = df.columns[is_object_dtype]

categorical_columns

Index(['person_home_ownership', 'loan_intent', 'loan_grade',
       'cb_person_default_on_file'],
      dtype='object')

In [6]:
# One-hot encode every categorical column in a single call instead of rebuilding the
# frame once per column. For each encoded column the first level is dropped
# (drop_first=True) and the new columns are named "<column>_<level>"; they are appended
# after the untouched columns, in the order of `categorical_columns`.
# dtype is pinned to uint8 (the get_dummies default before pandas 2.0; bool afterwards)
# so the 0/1 encoding is identical on every pandas version.
df = pd.get_dummies(df, columns=list(categorical_columns), drop_first=True, dtype='uint8')

df

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_OTHER,person_home_ownership_OWN,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_Y
0,22,59000,123.0,35000,16.02,1,0.59,3,0,0,...,0,1,0,0,0,1,0,0,0,1
1,21,9600,5.0,1000,11.14,0,0.10,2,0,1,...,0,0,0,1,0,0,0,0,0,0
2,25,9600,1.0,5500,12.87,1,0.57,3,0,0,...,1,0,0,0,1,0,0,0,0,0
3,23,65500,4.0,35000,15.23,1,0.53,2,0,0,...,1,0,0,0,1,0,0,0,0,0
4,24,54400,8.0,35000,14.27,1,0.55,4,0,0,...,1,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,1.0,5800,13.16,0,0.11,30,0,0,...,0,1,0,0,1,0,0,0,0,0
32577,54,120000,4.0,17625,7.49,0,0.15,19,0,0,...,0,1,0,0,0,0,0,0,0,0
32578,65,76000,3.0,35000,10.99,1,0.46,28,0,0,...,0,0,0,1,0,0,0,0,0,0
32579,56,150000,5.0,15000,11.48,0,0.10,26,0,0,...,0,1,0,1,0,0,0,0,0,0


In [7]:
# Features: every column except the target; loan_percent_income is left out as well.
X = df.drop(columns=['loan_status', 'loan_percent_income'])

# Target column (1 = default, 0 = non-default).
Y = df['loan_status']

# Every column that is not a one-hot dummy is a numerical feature to be scaled.
# Dummies are uint8 on pandas < 2.0 and bool on pandas >= 2.0, so both dtypes are
# excluded; comparing against 'uint8' alone would classify bool dummies as numerical.
numerical_columns = X.select_dtypes(exclude=['uint8', 'bool']).columns

In [8]:
# Preview only the numerical feature columns (the ones that will be scaled).
X[numerical_columns]

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,cb_person_cred_hist_length
0,22,59000,123.0,35000,16.02,3
1,21,9600,5.0,1000,11.14,2
2,25,9600,1.0,5500,12.87,3
3,23,65500,4.0,35000,15.23,2
4,24,54400,8.0,35000,14.27,4
...,...,...,...,...,...,...
32576,57,53000,1.0,5800,13.16,30
32577,54,120000,4.0,17625,7.49,19
32578,65,76000,3.0,35000,10.99,28
32579,56,150000,5.0,15000,11.48,26


In [9]:
# Fit a StandardScaler on the numerical columns only, so the one-hot encoded
# columns are left untouched by the scaling.
# NOTE(review): the scaler is fit on every row of X, before the train/test split that
# happens later in the notebook, so test-set statistics influence the scaling.
scaler = StandardScaler().fit(X.loc[:,numerical_columns])

In [10]:
# Replace the numerical columns with their standardised values. Whole-column assignment
# (X[cols] = ...) is used instead of X.loc[:, cols] = ... because the latter writes the
# float results into the existing integer columns, which recent pandas reports as an
# incompatible-dtype assignment (FutureWarning, slated to become an error).
# NOTE: not idempotent -- re-running this cell scales the already-scaled values again.
X[numerical_columns] = scaler.transform(X[numerical_columns])

In [11]:
# Display the feature matrix after scaling the numerical columns.
X

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,cb_person_cred_hist_length,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_EDUCATION,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_Y
0,-0.904471,-0.114354,28.904903,4.018287,1.623527,-0.692614,0,0,1,0,...,0,1,0,0,0,1,0,0,0,1
1,-1.061852,-0.910941,0.056504,-1.359219,0.040655,-0.938982,0,1,0,1,...,0,0,0,1,0,0,0,0,0,0
2,-0.432327,-0.910941,-0.921408,-0.647491,0.601796,-0.692614,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
3,-0.747089,-0.009540,-0.187974,4.018287,1.367283,-0.938982,0,0,1,0,...,1,0,0,0,1,0,0,0,0,0
4,-0.589708,-0.188530,0.789938,4.018287,1.055898,-0.446246,0,0,1,0,...,1,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,4.603869,-0.211106,-0.921408,-0.600042,0.695860,5.959324,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
32577,4.131726,0.869285,-0.187974,1.270223,-1.143256,3.249275,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
32578,5.862918,0.159775,-0.432452,4.018287,-0.007999,5.466588,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
32579,4.446488,1.353041,0.056504,0.855048,0.150937,4.973852,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0


In [None]:
# Base estimator handed to the grid search further down. DeepClassifier and
# create_classifier come from the project-local DeepClassifier module, so the exact
# meaning of each keyword is defined there -- the notes below are assumptions to confirm.
classifcator = DeepClassifier(
    model=create_classifier,
    loss="binary_crossentropy",
    # NOTE(review): param_grid below supplies its own "model__architecture" for every
    # candidate, so this LSTM-based default is presumably never the one that is trained.
    model__architecture={
        "Layers": ["LSTM", "Dense"],
        "ActivationFunctions": ["relu", "sigmoid"],
        "Neurons": [50, 1],
    },
    # The callbacks__* keywords are presumably forwarded to the EarlyStopping
    # constructor (callback class passed uninstantiated) -- TODO confirm.
    callbacks=EarlyStopping,
    callbacks__monitor="val_loss",
    callbacks__min_delta=0.00001,
    callbacks__patience=20,
    callbacks__verbose=0,
    callbacks__restore_best_weights=True,
    verbose=0,
    epochs=200,
    # train_ratio / val_ratio / batch_size_custom are DeepClassifier-specific options;
    # presumably an 80/20 train/validation split inside fit -- verify in DeepClassifier.
    train_ratio=0.8,
    val_ratio=0.2,
    batch_size_custom=32*8,
    fit__shuffle=True,
    # Optimizer class passed uninstantiated; its learning rate is set by the grid below.
    optimizer=optimizers.Adam,
    balance_class_weights=False,
)


def _dense_architecture(*hidden_units):
    """Describe a stack of ReLU Dense layers followed by one sigmoid output unit.

    Each positional argument is the width of one hidden layer, in order.
    """
    n_hidden = len(hidden_units)
    return {
        "Layers": ["Dense"] * (n_hidden + 1),
        "ActivationFunctions": ["relu"] * n_hidden + ["sigmoid"],
        "Neurons": [*hidden_units, 1],
    }


# Hyper-parameter grid: 2 class-weight settings x 3 learning rates x 1 loss
# x 4 architectures = 24 candidates.
param_grid = {
    "balance_class_weights": [True, False],
    "optimizer__learning_rate": [0.00001, 0.0001, 0.001],
    "loss": ["binary_crossentropy"],
    "model__architecture": [
        _dense_architecture(50),
        _dense_architecture(100),
        _dense_architecture(100, 50),
        _dense_architecture(200, 100, 50),
    ],
}

# 3-fold cross-validated grid search scored on macro-averaged F1.
# n_jobs=-1 runs the fits in parallel on every available core; refit=True retrains
# the best parameter combination on the whole training set once the search is done.
clf = GridSearchCV(
    estimator=classifcator,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=3,
    refit=True,
    n_jobs=-1,
    verbose=10,
)

# 80% train / 20% test. The split is stratified on the target so the minority
# (default) class keeps the same share in both parts of this imbalanced dataset.
# The training part is split again later so that 20% of it is used for Keras validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=7, stratify=Y
)

# Add a trailing axis of length 1: (rows, features) -> (rows, features, 1).
X_train = X_train.values.reshape((X_train.shape[0], X_train.shape[1], 1))

# Run the grid search on the training part only.
clf.fit(X_train, y_train)

# Rank the candidates (best first), keep at most the top 100 and index them by rank;
# the rank is kept as a regular column too.
grid_search_cv = (
    pd.DataFrame(clf.cv_results_)
    .sort_values("rank_test_score")
    .head(100)
    .set_index("rank_test_score", drop=False)
)
grid_search_cv.to_csv("GridSearchResults.csv")


# Read the saved ranking back from disk to display it.
pd.read_csv('GridSearchResults.csv')

Fitting 3 folds for each of 24 candidates, totalling 72 fits


In [None]:
# Give the test features the same (rows, features, 1) layout used for training,
# then predict with the refitted best model.
n_test_rows, n_test_features = X_test.shape
X_test = X_test.values.reshape((n_test_rows, n_test_features, 1))

y_pred = clf.best_estimator_.predict(X_test)

In [None]:
# Confusion matrix on the held-out test set, flattened row by row into its four cells.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

print(f"True Negative: {tn}\nFalse Positive: {fp}\nFalse Negative: {fn}\nTrue Positive: {tp}")

print("\n")

# Per-class precision / recall / F1 together with the averaged scores.
report = classification_report(y_test, y_pred, target_names = ['Non-Default', 'Default'])
print(report)

In [None]:
# Show the refitted best model between two 90-character rules.
rule = '-' * 90
print('Best Estimator')
print(rule)
print(clf.best_estimator_)
print(rule)

In [None]:
# Back to a 2-D (rows, features) array for the explanation steps that follow.
n_rows, n_features = X_test.shape[0], X_test.shape[1]
X_test = X_test.reshape((n_rows, n_features))

In [None]:
from sklearn.metrics import f1_score
import numpy as np
import shap
import warnings
# Suppress every warning category from here on (broader than the DeprecationWarning
# filter installed in the first cell).
warnings.filterwarnings('ignore') 




def f(X):
    """Prediction wrapper for the best grid-search model, taking 2-D input.

    X is a 2-D array of shape (rows, features). It is reshaped to
    (rows, features, 1) before being passed to ``clf.best_estimator_.predict``,
    and the predictions are returned as a flat 1-D array.
    """
    n_rows, n_features = X.shape[0], X.shape[1]
    model_input = X.reshape((n_rows, n_features, 1))
    return clf.best_estimator_.predict(model_input).flatten()

In [None]:
# KernelExplainer with the first 100 test rows as background data; SHAP values are
# then estimated for test rows 300-349 with nsamples=500.
background = X_test[:100,:]
rows_to_explain = X_test[300:350,:]
explainer = shap.KernelExplainer(f, background)
shap_values = explainer.shap_values(rows_to_explain, nsamples=500)



In [None]:
# Interactive force plot for the 50 explained rows, labelled with the feature names.
explained_rows = pd.DataFrame(X_test[300:350,:], columns=X.columns)
plot = shap.force_plot(
    explainer.expected_value,
    shap_values,
    explained_rows,
    matplotlib=False,
    show=False,
)

In [None]:
# Render the force plot object built in the previous cell.
plot

In [None]:
# Force plot for a single observation. shap_values was computed for X_test[300:350],
# so shap_values[0] explains X_test[300]; the feature values shown next to it must come
# from that same row, otherwise the explanation is paired with the wrong observation.
row_features = pd.DataFrame(X_test[300:301,:], columns=X.columns)
plot = shap.force_plot(explainer.expected_value, shap_values[0], row_features, matplotlib=True, show=False)
# Save the matplotlib rendering, close it, then reload the image to show it enlarged.
plt.savefig('uni_row_plot.png')
plt.close()
plt.figure(figsize = (20,4))
img = mpimg.imread('uni_row_plot.png')
plt.imshow(img)
plt.show()


In [None]:
import numpy as np
from sklearn.utils import class_weight
# Balanced class weights for the training labels, shown as {class_label: weight}.
classes = np.unique(y_train)
weights = class_weight.compute_class_weight(class_weight='balanced',
                                            classes=classes,
                                            y=y_train)
# dict() accepts a single iterable of pairs; dict(classes, weights) raises TypeError,
# so the labels and weights are zipped together first.
dict(zip(classes, weights))