# Folders

In [None]:
# Directory holding the preprocessed real-data parquet files.
proc_data_folder = '/data/preprocessed/'

# Directory holding the synthetic CSV files.
# NOTE(review): the variable name is misspelt ("floder"); it is kept as-is
# because the data-loading cell refers to it under this exact name.
syn_data_floder = '/data/synthetic/'

# The shock test set is read from the same directory as the base data.
proc_shock_data_folder = proc_data_folder

# Directory the result tables are written to.
artifacts_folder = '/artifacts/'

# Configuration

In [4]:
# Name of the label column passed to the scoring helpers.
target = 'loan_condition_int'

# Columns parsed as dates when the synthetic CSV files are read.
date_columns = ['issue_d']

# Columns dropped from every frame right after loading (currently none).
drop_columns = []

# Row labels of the result tables, one per synthetic data set.
# 'without' refers to the plain synthetic file; every other label is the
# value embedded in the `_out<label>` suffix of the corresponding file name.
columns_names = ['without', '0.01', '0.03', '0.05', '0.07', '0.1', '0.5', '1.0']

# Synthetic CSV file names, in the same order as `columns_names`.
synthetic_data_names = [
    'synth_lending_club.csv' if level == 'without'
    else f'synth_lending_club_out{level}.csv'
    for level in columns_names
]

# Parquet file names of the real data sets, keyed by their role.
real_data_names = dict(
    base_train_data='sample_base_train_data_lending_club.parquet',
    shock_test_data='shock_data_lending_club.parquet',
    base_test_data='base_test_data_lending_club.parquet',
)

# Columns kept from the real data sets (features, issue date and the target).
# NOTE(review): easy to confuse with `columns_names` above (result-table
# labels); both names are used by later cells, so neither is renamed here.
columns_name = [
    'grade',
    'term',
    'sub_grade',
    'int_rate',
    'annual_inc',
    'acc_open_past_24mths',
    'dti',
    'unemployment_rate',
    'total_bc_limit',
    'installment',
    'fico_range_high',
    'home_ownership',
    'total_acc',
    'revol_bal',
    'addr_state',
    'fico_range_low',
    'avg_cur_bal',
    'mo_sin_old_rev_tl_op',
    'mths_since_recent_bc',
    'funded_amnt',
    'issue_d',
    'federal_funds_rate',
    'loan_condition_int',
]

# Load Libs

In [6]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

from tabpfn import TabPFNClassifier

import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks/stabilization_uprift_edits')

from src.calculations import (compute_auc_scores, compute_uplift, culc_uplift_by_monthes)

Versions
---
*   numpy: 2.0.2
*   pandas: 2.2.2
*   scikit-learn: 1.6.1
*   tabpfn: 2.1.2
*   sdv: 1.25.0

# Load Data

In [None]:
# Seed and sample size shared by every `.sample(...)` call in this cell.
random_state = 100
n_samples = 4000


def _sample_real_data(folder, file_name):
  """Read one real-data parquet file and return a fixed-size random sample.

  The frame is read from `folder + file_name`, the columns listed in
  `drop_columns` are removed, only the columns in `columns_name` are kept,
  and `n_samples` rows are drawn with the seed `random_state`.
  """
  frame = pd.read_parquet(folder + file_name).drop(columns=drop_columns)
  return frame[columns_name].sample(n=n_samples, random_state=random_state)


train_data = _sample_real_data(proc_data_folder, real_data_names['base_train_data'])
shock_test_data = _sample_real_data(proc_shock_data_folder, real_data_names['shock_test_data'])
base_test_data = _sample_real_data(proc_data_folder, real_data_names['base_test_data'])

# `zip` stops silently at the shorter input, so make a label/file mismatch loud.
assert len(columns_names) == len(synthetic_data_names), \
    'columns_names and synthetic_data_names must have the same length'

# One sampled synthetic training frame per label in `columns_names`.
# Every file is loaded the same way, so no special case is needed for the
# 'without' label.
synthetic_data_dict = dict()

for col_name, data_name in zip(columns_names, synthetic_data_names):
  synthetic_train_data = pd.read_csv(syn_data_floder + data_name, parse_dates=date_columns, index_col=0)
  synthetic_train_data = synthetic_train_data.drop(columns=drop_columns)
  synthetic_data_dict[col_name] = synthetic_train_data.sample(n=n_samples, random_state=random_state)

train_data.shape, shock_test_data.shape, base_test_data.shape

((4000, 23), (4000, 23), (4000, 23))

In [8]:
# Sanity check: print the shape of every sampled synthetic frame next to its label.
for label in synthetic_data_dict:
  print(label, synthetic_data_dict[label].shape)

without (4000, 23)
0.01 (4000, 23)
0.03 (4000, 23)
0.05 (4000, 23)
0.07 (4000, 23)
0.1 (4000, 23)
0.5 (4000, 23)
1.0 (4000, 23)


# ML Modeling

In [None]:
# TabPFN classifier with its pretraining-limit check switched off.
model = TabPFNClassifier(ignore_pretraining_limits=True)

# AUC scores for the real training data and for every synthetic data set,
# evaluated against the base and the shock test sets.
# NOTE(review): exact semantics of the *_A / *_B columns live in
# src.calculations.compute_auc_scores — confirm there.
scores_dict = compute_auc_scores(
    train_data,
    base_test_data,
    shock_test_data,
    synthetic_data_dict,
    target,
    model,
)

In [10]:
# Tabulate the scores with one row per synthetic data set label.
scores_data = pd.DataFrame(data=scores_dict, index=columns_names)
scores_data

Unnamed: 0,train_auc_base_A,auc_base_A,train_auc_shock_A,auc_shock_A,train_auc_base_B,auc_base_B,train_auc_shock_B,auc_shock_B
without,0.788976,0.689471,0.788976,0.683368,0.803322,0.681724,0.803322,0.677294
0.01,0.788976,0.689471,0.788976,0.683368,0.808252,0.687341,0.808252,0.685848
0.03,0.788976,0.689471,0.788976,0.683368,0.821747,0.677442,0.821747,0.67481
0.05,0.788976,0.689471,0.788976,0.683368,0.810142,0.683513,0.810142,0.680431
0.07,0.788976,0.689471,0.788976,0.683368,0.809423,0.683044,0.809423,0.673051
0.1,0.788976,0.689471,0.788976,0.683368,0.801836,0.682455,0.801836,0.67748
0.5,0.788976,0.689471,0.788976,0.683368,0.799112,0.676178,0.799112,0.679273
1.0,0.788976,0.689471,0.788976,0.683368,0.815856,0.67929,0.815856,0.682815


In [11]:
# Show only the test-set AUC columns (training AUC columns are left out).
display(
    scores_data.loc[:, ['auc_base_A', 'auc_shock_A', 'auc_base_B', 'auc_shock_B']]
)

Unnamed: 0,auc_base_A,auc_shock_A,auc_base_B,auc_shock_B
without,0.689471,0.683368,0.681724,0.677294
0.01,0.689471,0.683368,0.687341,0.685848
0.03,0.689471,0.683368,0.677442,0.67481
0.05,0.689471,0.683368,0.683513,0.680431
0.07,0.689471,0.683368,0.683044,0.673051
0.1,0.689471,0.683368,0.682455,0.67748
0.5,0.689471,0.683368,0.676178,0.679273
1.0,0.689471,0.683368,0.67929,0.682815


In [12]:
# Extend the score dictionary with the uplift metrics.
# NOTE: `scores_dict` is rebound to the helper's return value, so this cell
# is meant to run once, right after the AUC computation.
scores_dict = compute_uplift(
    scores_dict,
    train_data,
    base_test_data,
    shock_test_data,
)

Uplift Calculations: 100%|██████████| 8/8 [00:00<00:00, 10115.90it/s]


In [13]:
# Final table: AUC scores plus uplift metrics, one row per synthetic data set label.
results = pd.DataFrame(data=scores_dict, index=columns_names)

# Display the reporting columns rounded to four decimals.
results.loc[:, [
    'auc_base_A',
    'auc_shock_A',
    'auc_base_B',
    'auc_shock_B',
    'dist_shift',
    'score_A',
    'score_B',
    'difference_uplift',
    'uplift_score',
]].round(4)

Unnamed: 0,auc_base_A,auc_shock_A,auc_base_B,auc_shock_B,dist_shift,score_A,score_B,difference_uplift,uplift_score
without,0.6895,0.6834,0.6817,0.6773,0.1193,0.9945,0.996,0.0015,0.0
0.01,0.6895,0.6834,0.6873,0.6858,0.1193,0.9945,0.9987,0.0041,0.0582
0.03,0.6895,0.6834,0.6774,0.6748,0.1193,0.9945,0.9976,0.0031,0.0
0.05,0.6895,0.6834,0.6835,0.6804,0.1193,0.9945,0.9972,0.0027,0.0
0.07,0.6895,0.6834,0.683,0.6731,0.1193,0.9945,0.991,0.0,0.0
0.1,0.6895,0.6834,0.6825,0.6775,0.1193,0.9945,0.9955,0.001,0.0
0.5,0.6895,0.6834,0.6762,0.6793,0.1193,0.9945,0.9972,0.0027,0.0
1.0,0.6895,0.6834,0.6793,0.6828,0.1193,0.9945,0.9968,0.0023,0.0


In [15]:
# Uplift metrics only, rounded to four decimals.
results.loc[:, ['difference_uplift', 'uplift_score']].round(4)

Unnamed: 0,difference_uplift,uplift_score
without,0.0015,0.0
0.01,0.0041,0.0582
0.03,0.0031,0.0
0.05,0.0027,0.0
0.07,0.0,0.0
0.1,0.001,0.0
0.5,0.0027,0.0
1.0,0.0023,0.0


# Save Data

In [16]:
# Persist the full results table; the row labels are written as the first CSV column.
results.to_csv(
    f'{artifacts_folder}exA_tabpfn_results.csv',
    index=True,
)