# Folders

In [None]:
# Input/output locations used throughout the notebook.
# Every value ends with a trailing slash because file names are appended
# to it with plain string concatenation.
proc_data_folder = '/data/preprocessed/'
syn_data_folder = '/data/synthetic/'
# Backward-compatible alias: the original (misspelled) name is still
# referenced by the data-loading cell, so it must remain defined.
syn_data_floder = syn_data_folder
# Currently the same location as proc_data_folder; kept as a separate
# name so the shock data can be relocated independently.
proc_shock_data_folder = '/data/preprocessed/'
artifacts_folder = '/artifacts/'

# Configuration

In [3]:
# Name of the label column passed to the scoring helper.
target = 'loan_condition_int'

# Columns parsed as dates when reading the synthetic CSVs, and columns
# removed from every dataset right after loading.
date_columns = ['issue_d']
drop_columns = ['issue_d']

# Row labels of every result table: one label per synthetic-data variant.
# 'without' is the plain synthetic file; the numeric labels match the
# suffix of the corresponding 'synth_lending_club_out<label>.csv' file.
columns_names = ['without', '0.01', '0.03', '0.05', '0.07', '0.1', '0.5', '1.0']

# Synthetic CSV file names, in the same order as `columns_names`.
# Built from the labels so the two lists cannot get out of sync.
synthetic_data_names = ['synth_lending_club.csv'] + [
    f'synth_lending_club_out{label}.csv' for label in columns_names[1:]
]

# Parquet files holding the real data splits.
real_data_names = {
    'base_train_data': 'sample_base_train_data_lending_club.parquet',
    'shock_test_data': 'shock_data_lending_club.parquet',
    'base_test_data': 'base_test_data_lending_club.parquet',
}

# Columns kept from the real datasets (21 features followed by the target).
# NOTE: `columns_name` (this list) is distinct from `columns_names` above.
columns_name = [
    'grade',
    'term',
    'sub_grade',
    'int_rate',
    'annual_inc',
    'acc_open_past_24mths',
    'dti',
    'unemployment_rate',
    'total_bc_limit',
    'installment',
    'fico_range_high',
    'home_ownership',
    'total_acc',
    'revol_bal',
    'addr_state',
    'fico_range_low',
    'avg_cur_bal',
    'mo_sin_old_rev_tl_op',
    'mths_since_recent_bc',
    'funded_amnt',
    'federal_funds_rate',
    'loan_condition_int',
]

# Load Libs

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder

import torch
import torch.nn as nn
import torch.nn.functional as F

import lightgbm as lgb
from sklearn.ensemble import HistGradientBoostingClassifier
from ngboost import NGBClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, ClassifierMixin
from pytorch_tabnet.tab_model import TabNetClassifier
import rtdl
import inspect

from src.calculations import (compute_auc_scores, compute_uplift, culc_uplift_by_monthes)
from src.utilities import preprocess_for_model, fill_missing_with_mode
from src.models import TabNetWrapper, NGBoostWrapper, FTTransformerWrapper

from xgboost import XGBClassifier

# Versions

* xgboost: 3.0.4
* lightgbm: 4.6.0
* ngboost: 0.5.6
* pytorch-tabnet: 4.1.0
* sdv: 1.26.0
* rtdl: 0.0.13

# Load Data

In [None]:
# Seed and sample size shared by every dataset so all frames have the
# same number of rows and the sampling is reproducible.
random_state = 100
n_samples = 8000


def _sample_rows(frame):
    """Return the fixed-size, seeded row sample used for every dataset."""
    return frame.sample(n=n_samples, random_state=random_state)


def _load_real_data(folder, file_name):
    """Read a real-data parquet file, drop `drop_columns`, keep `columns_name`, and sample it."""
    frame = pd.read_parquet(folder + file_name).drop(columns=drop_columns)
    return _sample_rows(frame[columns_name])


def _load_synthetic_data(file_name):
    """Read a synthetic CSV (first column is the index), drop `drop_columns`, and sample it.

    Unlike the real data, the synthetic frames are not re-ordered/subset
    with `columns_name`; they keep the column layout stored in the CSV.
    """
    frame = pd.read_csv(syn_data_floder + file_name, parse_dates=date_columns, index_col=0)
    return _sample_rows(frame.drop(columns=drop_columns))


train_data = _load_real_data(proc_data_folder, real_data_names['base_train_data'])
shock_test_data = _load_real_data(proc_shock_data_folder, real_data_names['shock_test_data'])
base_test_data = _load_real_data(proc_data_folder, real_data_names['base_test_data'])

# zip() silently truncates to the shorter list, which would drop variants
# without any error; fail loudly instead.
if len(columns_names) != len(synthetic_data_names):
    raise ValueError(
        f"columns_names ({len(columns_names)}) and synthetic_data_names "
        f"({len(synthetic_data_names)}) must have the same length"
    )

# Every variant (including 'without') is loaded the same way; the former
# if/else on the variant label had two identical branches.
synthetic_data_dict = {
    col_name: _load_synthetic_data(data_name)
    for col_name, data_name in zip(columns_names, synthetic_data_names)
}

train_data.shape, shock_test_data.shape, base_test_data.shape

((8000, 22), (8000, 22), (8000, 22))

In [None]:
# Sanity check: print the shape of every sampled synthetic dataset.
for variant, frame in synthetic_data_dict.items():
    print(variant, frame.shape)

without (8000, 22)
0.01 (8000, 22)
0.03 (8000, 22)
0.05 (8000, 22)
0.07 (8000, 22)
0.1 (8000, 22)
0.5 (8000, 22)
1.0 (8000, 22)


In [None]:
# Apply fill_missing_with_mode to each real split, each one independently.
train_data = fill_missing_with_mode(train_data)
shock_test_data = fill_missing_with_mode(shock_test_data)
base_test_data = fill_missing_with_mode(base_test_data)

# Same treatment for every synthetic variant; entries are replaced in
# place so the dictionary object and its key order stay the same.
for variant in list(synthetic_data_dict):
    synthetic_data_dict[variant] = fill_missing_with_mode(synthetic_data_dict[variant])

# ML Modeling

In [None]:
# Registry of the classifiers to benchmark, keyed by the model name that is
# also passed to preprocess_for_model and used in the output file names.
models = {
    # --- Gradient-boosted trees -------------------------------------------
    "lightgbm": lgb.LGBMClassifier(
        objective="binary",
        metric="auc",
        max_depth=2,
        num_leaves=4,
        min_data_in_leaf=80,
        learning_rate=0.01,
        n_estimators=1500,
        feature_fraction=0.6,
        bagging_fraction=0.8,
        bagging_freq=1,
        reg_alpha=1.0,
        reg_lambda=1.0,
        min_gain_to_split=0.1,
        verbose=-1,
        random_state=42,
    ),
    "xgb": XGBClassifier(
        objective="binary:logistic",
        eval_metric="auc",
        max_depth=3,
        min_child_weight=5,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.7,
        learning_rate=0.01,
        n_estimators=1000,
        reg_alpha=1.0,
        reg_lambda=1.0,
        verbosity=0,
        random_state=42,
        enable_categorical=True,
    ),
    "hgb": HistGradientBoostingClassifier(
        max_depth=3,
        min_samples_leaf=30,
        max_iter=300,
        learning_rate=0.03,
        max_bins=255,
        l2_regularization=1.0,
        random_state=42,
        early_stopping=True,
        scoring='roc_auc',
    ),
    # Project wrapper constructed with its default settings.
    "ngboost": NGBoostWrapper(),
    # --- Neural tabular models (project wrappers) -------------------------
    "tabnet": TabNetWrapper(
        n_d=24,
        n_a=24,
        n_steps=4,
        gamma=1.3,
        lambda_sparse=1e-2,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=0.015),
        mask_type="entmax",
        verbose=0,
        seed=42,
    ),
    "fttransformer": FTTransformerWrapper(
        d_token=64,
        n_blocks=3,
        ffn_d_hidden=128,
        attention_dropout=0.2,
        residual_dropout=0.1,
        lr=1e-3,
        batch_size=512,
        n_epochs=200,
        patience=20,
        verbose=1,
    ),
}

In [None]:
# Raw score dictionaries and their DataFrame views, both keyed by model name.
model_scores = {}
model_scores_data = {}

for name, model in models.items():
    print(f"=== Processing and execution {name} ===")

    # Model-specific preprocessing of the real splits and all synthetic variants.
    (prepared_train,
     prepared_base_test,
     prepared_shock_test,
     prepared_synthetic) = preprocess_for_model(
        train_data, base_test_data, shock_test_data, synthetic_data_dict, name
    )

    model_scores[name] = compute_auc_scores(
        prepared_train,
        prepared_base_test,
        prepared_shock_test,
        prepared_synthetic,
        target,
        model,
    )

    # One row per synthetic-data variant, labelled with columns_names.
    model_scores_data[name] = pd.DataFrame(model_scores[name], index=columns_names)

In [None]:
# Display the LightGBM AUC table (rows indexed by columns_names).
model_scores_data['lightgbm']

Unnamed: 0,train_auc_base_A,auc_base_A,train_auc_shock_A,auc_shock_A,train_auc_base_B,auc_base_B,train_auc_shock_B,auc_shock_B
without,0.774434,0.68219,0.774434,0.695545,0.775191,0.673604,0.775191,0.688614
0.01,0.774434,0.68219,0.774434,0.695545,0.773506,0.678685,0.773506,0.688353
0.03,0.774434,0.68219,0.774434,0.695545,0.775272,0.678712,0.775272,0.684582
0.05,0.774434,0.68219,0.774434,0.695545,0.777317,0.676036,0.777317,0.686452
0.07,0.774434,0.68219,0.774434,0.695545,0.775016,0.676098,0.775016,0.685906
0.1,0.774434,0.68219,0.774434,0.695545,0.774309,0.676902,0.774309,0.687897
0.5,0.774434,0.68219,0.774434,0.695545,0.770654,0.677735,0.770654,0.688646
1.0,0.774434,0.68219,0.774434,0.695545,0.771301,0.673887,0.771301,0.68841


In [None]:
# Display the XGBoost AUC table (rows indexed by columns_names).
model_scores_data['xgb']

Unnamed: 0,train_auc_base_A,auc_base_A,train_auc_shock_A,auc_shock_A,train_auc_base_B,auc_base_B,train_auc_shock_B,auc_shock_B
without,0.798533,0.681664,0.798533,0.694111,0.788231,0.672797,0.788231,0.689916
0.01,0.798533,0.681664,0.798533,0.694111,0.787272,0.679026,0.787272,0.686963
0.03,0.798533,0.681664,0.798533,0.694111,0.788781,0.678099,0.788781,0.68544
0.05,0.798533,0.681664,0.798533,0.694111,0.790139,0.675616,0.790139,0.685959
0.07,0.798533,0.681664,0.798533,0.694111,0.787921,0.675256,0.787921,0.686127
0.1,0.798533,0.681664,0.798533,0.694111,0.786695,0.676094,0.786695,0.685961
0.5,0.798533,0.681664,0.798533,0.694111,0.784364,0.676826,0.784364,0.687775
1.0,0.798533,0.681664,0.798533,0.694111,0.784871,0.673354,0.784871,0.68884


In [None]:
# Display the HistGradientBoosting AUC table (rows indexed by columns_names).
model_scores_data['hgb']

Unnamed: 0,train_auc_base_A,auc_base_A,train_auc_shock_A,auc_shock_A,train_auc_base_B,auc_base_B,train_auc_shock_B,auc_shock_B
without,0.766495,0.676365,0.766495,0.691396,0.783997,0.670723,0.783997,0.687843
0.01,0.766495,0.676365,0.766495,0.691396,0.77571,0.674937,0.77571,0.683878
0.03,0.766495,0.676365,0.766495,0.691396,0.785236,0.676677,0.785236,0.687071
0.05,0.766495,0.676365,0.766495,0.691396,0.771912,0.671441,0.771912,0.683424
0.07,0.766495,0.676365,0.766495,0.691396,0.78491,0.673967,0.78491,0.683892
0.1,0.766495,0.676365,0.766495,0.691396,0.784552,0.676437,0.784552,0.687042
0.5,0.766495,0.676365,0.766495,0.691396,0.737244,0.663369,0.737244,0.672658
1.0,0.766495,0.676365,0.766495,0.691396,0.776316,0.670689,0.776316,0.687452


In [None]:
# Display the NGBoost AUC table (rows indexed by columns_names).
model_scores_data['ngboost']

Unnamed: 0,train_auc_base_A,auc_base_A,train_auc_shock_A,auc_shock_A,train_auc_base_B,auc_base_B,train_auc_shock_B,auc_shock_B
without,0.798928,0.673843,0.798928,0.686273,0.785554,0.669507,0.785554,0.686724
0.01,0.798928,0.673843,0.798928,0.686273,0.782588,0.677045,0.782588,0.68556
0.03,0.798928,0.673843,0.798928,0.686273,0.787526,0.676069,0.787526,0.68396
0.05,0.798928,0.673843,0.798928,0.686273,0.787801,0.672776,0.787801,0.682662
0.07,0.798928,0.673843,0.798928,0.686273,0.785793,0.675066,0.785793,0.682822
0.1,0.798928,0.673843,0.798928,0.686273,0.787416,0.674383,0.787416,0.685575
0.5,0.798928,0.673843,0.798928,0.686273,0.783327,0.674965,0.783327,0.685409
1.0,0.798928,0.673843,0.798928,0.686273,0.780936,0.671806,0.780936,0.687101


In [None]:
# Display the TabNet AUC table (rows indexed by columns_names).
model_scores_data['tabnet']

Unnamed: 0,train_auc_base_A,auc_base_A,train_auc_shock_A,auc_shock_A,train_auc_base_B,auc_base_B,train_auc_shock_B,auc_shock_B
without,0.820722,0.629296,0.820722,0.631789,0.811734,0.6419,0.811734,0.630727
0.01,0.820722,0.629296,0.820722,0.631789,0.810616,0.657711,0.810616,0.597223
0.03,0.820722,0.629296,0.820722,0.631789,0.815634,0.647451,0.815634,0.641034
0.05,0.820722,0.629296,0.820722,0.631789,0.833427,0.644102,0.833427,0.598452
0.07,0.820722,0.629296,0.820722,0.631789,0.801093,0.648102,0.801093,0.648535
0.1,0.820722,0.629296,0.820722,0.631789,0.780559,0.663295,0.780559,0.648731
0.5,0.820722,0.629296,0.820722,0.631789,0.781529,0.659062,0.781529,0.629468
1.0,0.820722,0.629296,0.820722,0.631789,0.803906,0.657808,0.803906,0.663115


In [None]:
# Display the FT-Transformer AUC table (rows indexed by columns_names).
model_scores_data['fttransformer']

Unnamed: 0,train_auc_base_A,auc_base_A,train_auc_shock_A,auc_shock_A,train_auc_base_B,auc_base_B,train_auc_shock_B,auc_shock_B
without,0.642313,0.632768,0.642313,0.63994,0.724232,0.655448,0.724232,0.664643
0.01,0.642313,0.632768,0.642313,0.63994,0.715924,0.656094,0.715924,0.678968
0.03,0.642313,0.632768,0.642313,0.63994,0.708393,0.644729,0.708393,0.654162
0.05,0.642313,0.632768,0.642313,0.63994,0.700181,0.638377,0.700181,0.641982
0.07,0.642313,0.632768,0.642313,0.63994,0.714246,0.654669,0.714246,0.665565
0.1,0.642313,0.632768,0.642313,0.63994,0.716336,0.650522,0.716336,0.665658
0.5,0.642313,0.632768,0.642313,0.63994,0.70029,0.632482,0.70029,0.635274
1.0,0.642313,0.632768,0.642313,0.63994,0.718111,0.661932,0.718111,0.684826


In [None]:
# Uplift results per model, computed from the AUC scores gathered above
# together with the (mode-filled) real train / base-test / shock-test frames.
model_scores_uplift = {}

for model_name, auc_scores in model_scores.items():
    print(f"=== Calculation of Stabilization Uplift for the model {model_name} ===")
    model_scores_uplift[model_name] = compute_uplift(
        auc_scores, train_data, base_test_data, shock_test_data
    )

=== Calculation of Stabilization Uplift for the model lightgbm ===


Uplift Calculations: 100%|██████████| 8/8 [00:00<00:00, 23613.25it/s]

=== Calculation of Stabilization Uplift for the model xgb ===



Uplift Calculations: 100%|██████████| 8/8 [00:00<00:00, 23014.01it/s]


=== Calculation of Stabilization Uplift for the model hgb ===


Uplift Calculations: 100%|██████████| 8/8 [00:00<00:00, 33723.05it/s]


=== Calculation of Stabilization Uplift for the model ngboost ===


Uplift Calculations: 100%|██████████| 8/8 [00:00<00:00, 33222.21it/s]


=== Calculation of Stabilization Uplift for the model tabnet ===


Uplift Calculations: 100%|██████████| 8/8 [00:00<00:00, 33387.49it/s]


=== Calculation of Stabilization Uplift for the model fttransformer ===


Uplift Calculations: 100%|██████████| 8/8 [00:00<00:00, 33554.43it/s]


In [None]:
# Columns reported in the final per-model tables, in display order.
selected_columns = [
    'auc_base_A', 'auc_shock_A', 'auc_base_B', 'auc_shock_B',
    'dist_shift', 'score_A', 'score_B', 'difference_uplift', 'uplift_score'
]

# Final tables keyed by model name: one row per entry of columns_names,
# restricted to selected_columns and rounded to 4 decimals.
model_results_data = {}

for model_name, uplift_scores in model_scores_uplift.items():
    print(f"=== Generate the final table for the model {model_name} ===")
    full_table = pd.DataFrame(uplift_scores, index=columns_names)
    model_results_data[model_name] = full_table[selected_columns].round(4)

=== Generate the final table for the model lightgbm ===
=== Generate the final table for the model xgb ===
=== Generate the final table for the model hgb ===
=== Generate the final table for the model ngboost ===
=== Generate the final table for the model tabnet ===
=== Generate the final table for the model fttransformer ===


In [None]:
# Display the final rounded results table for LightGBM.
model_results_data['lightgbm']

Unnamed: 0,auc_base_A,auc_shock_A,auc_base_B,auc_shock_B,dist_shift,score_A,score_B,difference_uplift,uplift_score
without,0.6822,0.6955,0.6736,0.6886,0.1201,0.988,0.9865,0.0,0.0
0.01,0.6822,0.6955,0.6787,0.6884,0.1201,0.988,0.9913,0.0033,0.0
0.03,0.6822,0.6955,0.6787,0.6846,0.1201,0.988,0.9947,0.0067,0.0
0.05,0.6822,0.6955,0.676,0.6865,0.1201,0.988,0.9906,0.0026,0.0
0.07,0.6822,0.6955,0.6761,0.6859,0.1201,0.988,0.9912,0.0032,0.0
0.1,0.6822,0.6955,0.6769,0.6879,0.1201,0.988,0.9901,0.0021,0.0
0.5,0.6822,0.6955,0.6777,0.6886,0.1201,0.988,0.9902,0.0022,0.0
1.0,0.6822,0.6955,0.6739,0.6884,0.1201,0.988,0.987,0.0,0.0


In [None]:
# Display the final rounded results table for XGBoost.
model_results_data['xgb']

Unnamed: 0,auc_base_A,auc_shock_A,auc_base_B,auc_shock_B,dist_shift,score_A,score_B,difference_uplift,uplift_score
without,0.6817,0.6941,0.6728,0.6899,0.1201,0.9888,0.9846,0.0,0.0
0.01,0.6817,0.6941,0.679,0.687,0.1201,0.9888,0.9929,0.0041,0.0
0.03,0.6817,0.6941,0.6781,0.6854,0.1201,0.9888,0.9934,0.0046,0.0
0.05,0.6817,0.6941,0.6756,0.686,0.1201,0.9888,0.9907,0.0019,0.0
0.07,0.6817,0.6941,0.6753,0.6861,0.1201,0.9888,0.9902,0.0014,0.0
0.1,0.6817,0.6941,0.6761,0.686,0.1201,0.9888,0.9911,0.0023,0.0
0.5,0.6817,0.6941,0.6768,0.6878,0.1201,0.9888,0.9902,0.0013,0.0
1.0,0.6817,0.6941,0.6734,0.6888,0.1201,0.9888,0.9861,0.0,0.0


In [None]:
# Display the final rounded results table for HistGradientBoosting.
model_results_data['hgb']

Unnamed: 0,auc_base_A,auc_shock_A,auc_base_B,auc_shock_B,dist_shift,score_A,score_B,difference_uplift,uplift_score
without,0.6764,0.6914,0.6707,0.6878,0.1201,0.9865,0.9846,0.0,0.0
0.01,0.6764,0.6914,0.6749,0.6839,0.1201,0.9865,0.992,0.0055,0.0
0.03,0.6764,0.6914,0.6767,0.6871,0.1201,0.9865,0.9907,0.0042,0.0
0.05,0.6764,0.6914,0.6714,0.6834,0.1201,0.9865,0.9892,0.0027,0.0
0.07,0.6764,0.6914,0.674,0.6839,0.1201,0.9865,0.9911,0.0046,0.0
0.1,0.6764,0.6914,0.6764,0.687,0.1201,0.9865,0.9905,0.004,0.0
0.5,0.6764,0.6914,0.6634,0.6727,0.1201,0.9865,0.9917,0.0052,0.0
1.0,0.6764,0.6914,0.6707,0.6875,0.1201,0.9865,0.9849,0.0,0.0


In [None]:
# Display the final rounded results table for NGBoost.
model_results_data['ngboost']

Unnamed: 0,auc_base_A,auc_shock_A,auc_base_B,auc_shock_B,dist_shift,score_A,score_B,difference_uplift,uplift_score
without,0.6738,0.6863,0.6695,0.6867,0.1201,0.9888,0.9845,0.0,0.0
0.01,0.6738,0.6863,0.677,0.6856,0.1201,0.9888,0.9924,0.0035,0.018
0.03,0.6738,0.6863,0.6761,0.684,0.1201,0.9888,0.9929,0.0041,0.0
0.05,0.6738,0.6863,0.6728,0.6827,0.1201,0.9888,0.9911,0.0023,0.0
0.07,0.6738,0.6863,0.6751,0.6828,0.1201,0.9888,0.993,0.0042,0.0
0.1,0.6738,0.6863,0.6744,0.6856,0.1201,0.9888,0.9899,0.0011,0.0
0.5,0.6738,0.6863,0.675,0.6854,0.1201,0.9888,0.9906,0.0018,0.0
1.0,0.6738,0.6863,0.6718,0.6871,0.1201,0.9888,0.9863,0.0,0.0


In [None]:
# Display the final rounded results table for TabNet.
model_results_data['tabnet']

Unnamed: 0,auc_base_A,auc_shock_A,auc_base_B,auc_shock_B,dist_shift,score_A,score_B,difference_uplift,uplift_score
without,0.6293,0.6318,0.6419,0.6307,0.1201,0.9978,0.99,0.0,0.0131
0.01,0.6293,0.6318,0.6577,0.5972,0.1201,0.9978,0.9457,0.0,0.0
0.03,0.6293,0.6318,0.6475,0.641,0.1201,0.9978,0.9942,0.0,0.288
0.05,0.6293,0.6318,0.6441,0.5985,0.1201,0.9978,0.959,0.0,0.0
0.07,0.6293,0.6318,0.6481,0.6485,0.1201,0.9978,0.9996,0.0019,0.4809
0.1,0.6293,0.6318,0.6633,0.6487,0.1201,0.9978,0.9869,0.0,0.182
0.5,0.6293,0.6318,0.6591,0.6295,0.1201,0.9978,0.9734,0.0,0.001
1.0,0.6293,0.6318,0.6578,0.6631,0.1201,0.9978,0.9952,0.0,0.6237


In [None]:
# Display the final rounded results table for FT-Transformer.
model_results_data['fttransformer']

Unnamed: 0,auc_base_A,auc_shock_A,auc_base_B,auc_shock_B,dist_shift,score_A,score_B,difference_uplift,uplift_score
without,0.6328,0.6399,0.6554,0.6646,0.1201,0.9936,0.9917,0.0,0.6971
0.01,0.6328,0.6399,0.6561,0.679,0.1201,0.9936,0.9795,0.0,0.8861
0.03,0.6328,0.6399,0.6447,0.6542,0.1201,0.9936,0.9915,0.0,0.6198
0.05,0.6328,0.6399,0.6384,0.642,0.1201,0.9936,0.9968,0.0032,0.1671
0.07,0.6328,0.6399,0.6547,0.6656,0.1201,0.9936,0.9902,0.0,0.7289
0.1,0.6328,0.6399,0.6505,0.6657,0.1201,0.9936,0.9864,0.0,0.7896
0.5,0.6328,0.6399,0.6325,0.6353,0.1201,0.9936,0.9975,0.0039,0.0
1.0,0.6328,0.6399,0.6619,0.6848,0.1201,0.9936,0.9794,0.0,0.8884


# Save Data

In [None]:
# Write each model's final table to '<artifacts_folder><model>_open_data_results.csv',
# keeping the variant labels as the CSV index column.
for model_name, results_table in model_results_data.items():
    output_path = artifacts_folder + model_name + '_open_data_results.csv'
    results_table.to_csv(output_path, index=True)