# Folders

In [None]:
# Locations of the input datasets and of the output artifacts.
# All values end with '/' because file names are appended by plain string
# concatenation in the cells below.
proc_data_folder = '/data/preprocessed/'
syn_data_folder = '/data/synthetic/'
# Backward-compatible alias: the original (misspelled) name is still what the
# loading cell reads, so it must keep existing.
syn_data_floder = syn_data_folder
# The shock dataset currently lives next to the other preprocessed data.
proc_shock_data_folder = proc_data_folder
artifacts_folder = '/artifacts/'

# Configuration

In [4]:
# Name of the label column; it is also the last entry of `columns_name`.
target = 'loan_condition_int'

# Columns parsed as dates when the synthetic CSV files are read.
date_columns = ['issue_d']

# Columns removed from every dataset right after loading.
drop_columns = ['issue_d']

# Row labels of the result tables, one per synthetic training set.
# NOTE: `columns_names` (these labels) and `columns_name` (the column list
# further down) differ by a single letter - take care not to mix them up.
# Presumably the numeric labels are the level encoded in the matching
# '..._out<level>.csv' file name - TODO confirm what the level measures.
columns_names = ['without', '0.01', '0.03', '0.05', '0.07', '0.1', '0.5', '1.0']

# Synthetic file names are derived from the labels above so that the two
# lists cannot drift apart (they are zipped together when loading).
synthetic_data_names = ['synth_lending_club.csv' if label == 'without'
                        else f'synth_lending_club_out{label}.csv'
                        for label in columns_names]

# Parquet file names of the real datasets, keyed by their role.
real_data_names = {
    'base_train_data': 'sample_base_train_data_lending_club.parquet',
    'shock_test_data': 'shock_data_lending_club.parquet',
    'base_test_data': 'base_test_data_lending_club.parquet',
}

# Columns kept from the real datasets: 21 feature columns plus the target.
columns_name = [
    'grade',
    'term',
    'sub_grade',
    'int_rate',
    'annual_inc',
    'acc_open_past_24mths',
    'dti',
    'unemployment_rate',
    'total_bc_limit',
    'installment',
    'fico_range_high',
    'home_ownership',
    'total_acc',
    'revol_bal',
    'addr_state',
    'fico_range_low',
    'avg_cur_bal',
    'mo_sin_old_rev_tl_op',
    'mths_since_recent_bc',
    'funded_amnt',
    'federal_funds_rate',
    target,
]

# Load Libs

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

from src.calculations import (compute_auc_scores, compute_uplift, culc_uplift_by_monthes)

Versions
---
*   numpy: 2.0.2
*   pandas: 2.2.2
*   scikit-learn: 1.6.1
*   catboost: 1.2.8
*   sdv: 1.25.0

# Load Data

In [7]:
random_state = 100  # seed shared by every .sample() call and by the model
n_samples = 4000    # number of rows drawn from each real and synthetic dataset


def _load_real_sample(folder, file_name):
    """Read one real parquet dataset and return a reproducible row sample.

    The columns in `drop_columns` are removed, the frame is restricted to
    `columns_name`, and `n_samples` rows are drawn with `random_state`.
    """
    frame = pd.read_parquet(folder + file_name)
    frame = frame.drop(columns=drop_columns)
    return frame[columns_name].sample(n=n_samples, random_state=random_state)


def _load_synthetic_sample(file_name):
    """Read one synthetic CSV and return a reproducible row sample.

    The first CSV column is used as the index. Unlike the real datasets, the
    frame is not restricted to `columns_name`; only `drop_columns` are removed.
    """
    frame = pd.read_csv(syn_data_floder + file_name,
                        parse_dates=date_columns,
                        index_col=0)
    frame = frame.drop(columns=drop_columns)
    return frame.sample(n=n_samples, random_state=random_state)


train_data = _load_real_sample(proc_data_folder, real_data_names['base_train_data'])
shock_test_data = _load_real_sample(proc_shock_data_folder, real_data_names['shock_test_data'])
base_test_data = _load_real_sample(proc_data_folder, real_data_names['base_test_data'])

# zip() silently truncates to the shorter input, so fail loudly if the
# labels and the file names ever get out of step.
assert len(columns_names) == len(synthetic_data_names), \
    'columns_names and synthetic_data_names must have the same length'

# Every synthetic variant (including 'without') is loaded the same way.
synthetic_data_dict = {
    col_name: _load_synthetic_sample(data_name)
    for col_name, data_name in zip(columns_names, synthetic_data_names)
}

train_data.shape, shock_test_data.shape, base_test_data.shape

((4000, 22), (4000, 22), (4000, 22))

In [8]:
# Sanity check: every synthetic variant should have the same shape.
for variant, frame in synthetic_data_dict.items():
    print(variant, frame.shape)

without (4000, 22)
0.01 (4000, 22)
0.03 (4000, 22)
0.05 (4000, 22)
0.07 (4000, 22)
0.1 (4000, 22)
0.5 (4000, 22)
1.0 (4000, 22)


# ML Modeling

In [9]:
# Columns handed to CatBoost as categorical features.
categorical_features = ['grade', 'term', 'sub_grade', 'home_ownership', 'addr_state']

# Seeded with the same random_state used for sampling; training logs are silenced.
model = CatBoostClassifier(random_seed=random_state,
                           cat_features=categorical_features,
                           verbose=False)

# compute_auc_scores comes from src.calculations and is not visible here;
# presumably it fits `model` on the real and on each synthetic training set
# and scores it on the base/shock test sets - TODO confirm.
scores_dict = compute_auc_scores(
    train_data, base_test_data, shock_test_data,
    synthetic_data_dict, target, model,
)

AUC Calculations: 100%|██████████| 8/8 [03:35<00:00, 26.95s/it]


In [10]:
# Tabulate the AUC scores with one row per label in columns_names.
scores_data = pd.DataFrame(data=scores_dict, index=columns_names)
scores_data

Unnamed: 0,train_auc_base_A,auc_base_A,train_auc_shock_A,auc_shock_A,train_auc_base_B,auc_base_B,train_auc_shock_B,auc_shock_B
without,0.94204,0.679308,0.94204,0.677569,0.916752,0.673116,0.916752,0.678757
0.01,0.94204,0.679308,0.94204,0.677569,0.912654,0.67939,0.912654,0.682076
0.03,0.94204,0.679308,0.94204,0.677569,0.912005,0.678488,0.912005,0.679599
0.05,0.94204,0.679308,0.94204,0.677569,0.917569,0.673369,0.917569,0.678009
0.07,0.94204,0.679308,0.94204,0.677569,0.910843,0.675053,0.910843,0.678508
0.1,0.94204,0.679308,0.94204,0.677569,0.910358,0.673066,0.910358,0.672291
0.5,0.94204,0.679308,0.94204,0.677569,0.911006,0.675779,0.911006,0.667175
1.0,0.94204,0.679308,0.94204,0.677569,0.913825,0.680041,0.913825,0.676423


In [11]:
# Show only the test-set AUC columns (the train_* columns are left out).
test_auc_columns = ['auc_base_A', 'auc_shock_A', 'auc_base_B', 'auc_shock_B']
display(scores_data.loc[:, test_auc_columns])

Unnamed: 0,auc_base_A,auc_shock_A,auc_base_B,auc_shock_B
without,0.679308,0.677569,0.673116,0.678757
0.01,0.679308,0.677569,0.67939,0.682076
0.03,0.679308,0.677569,0.678488,0.679599
0.05,0.679308,0.677569,0.673369,0.678009
0.07,0.679308,0.677569,0.675053,0.678508
0.1,0.679308,0.677569,0.673066,0.672291
0.5,0.679308,0.677569,0.675779,0.667175
1.0,0.679308,0.677569,0.680041,0.676423


In [13]:
# NOTE(review): this rebinds scores_dict to the value returned by
# compute_uplift, so re-running the cell feeds the previous result back in -
# confirm that compute_uplift tolerates that.
scores_dict = compute_uplift(
    scores_dict,
    train_data,
    base_test_data,
    shock_test_data,
)

Uplift Calculations: 100%|██████████| 8/8 [00:00<00:00, 11263.66it/s]


In [14]:
# Final table built from scores_dict, one row per label in columns_names.
results = pd.DataFrame(data=scores_dict, index=columns_names)

# Columns shown in the summary view, rounded to 4 decimals for display only.
summary_columns = [
    'auc_base_A', 'auc_shock_A', 'auc_base_B', 'auc_shock_B',
    'dist_shift', 'score_A', 'score_B',
    'difference_uplift', 'uplift_score',
]
results.loc[:, summary_columns].round(4)

Unnamed: 0,auc_base_A,auc_shock_A,auc_base_B,auc_shock_B,dist_shift,score_A,score_B,difference_uplift,uplift_score
without,0.6793,0.6776,0.6731,0.6788,0.1193,0.9984,0.9949,0.0,0.0
0.01,0.6793,0.6776,0.6794,0.6821,0.1193,0.9984,0.9976,0.0,0.168
0.03,0.6793,0.6776,0.6785,0.6796,0.1193,0.9984,0.999,0.0006,0.0578
0.05,0.6793,0.6776,0.6734,0.678,0.1193,0.9984,0.9958,0.0,0.0
0.07,0.6793,0.6776,0.6751,0.6785,0.1193,0.9984,0.9969,0.0,0.0
0.1,0.6793,0.6776,0.6731,0.6723,0.1193,0.9984,0.9993,0.0009,0.0
0.5,0.6793,0.6776,0.6758,0.6672,0.1193,0.9984,0.9923,0.0,0.0
1.0,0.6793,0.6776,0.68,0.6764,0.1193,0.9984,0.9967,0.0,0.0


In [15]:
# Narrow view of just the two uplift columns, rounded for display.
uplift_columns = ['difference_uplift', 'uplift_score']
results.loc[:, uplift_columns].round(4)

Unnamed: 0,difference_uplift,uplift_score
without,0.0,0.0
0.01,0.0,0.168
0.03,0.0006,0.0578
0.05,0.0,0.0
0.07,0.0,0.0
0.1,0.0009,0.0
0.5,0.0,0.0
1.0,0.0,0.0


# Save Data

In [16]:
# Persist the full (unrounded) results table, keeping the row labels as the index column.
results_path = artifacts_folder + 'exA_catboost_results.csv'
results.to_csv(results_path, index=True)