In [1]:
import os
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from evaluation.metrics import evaluate
from tqdm import tqdm
import torch
from util import ThresholdClassifier

# Single Step With Prior

In [8]:
from sklearn.metrics import f1_score


def eval_single_step(model_name, with_prior=True):
    model_path = os.path.join('../storage', "creditcard", model_name)
    if not os.path.exists(model_path):
        return None

    test_set = pd.read_csv(os.path.join(model_path, 'test.csv'))
    dev_set = pd.read_csv(os.path.join(model_path, 'val.csv'))
    if not with_prior:
        if 'fst_step_pred' in test_set.columns:
            test_set['pred'] = test_set['fst_step_pred']
            dev_set['pred'] = dev_set['fst_step_pred']
        else:
            # if not within the range of [0, 1], then use logit to transform
            if test_set['fst_step_scores'].max() > 1 or test_set['fst_step_scores'].min() < 0:
                import math
                def sigmoid(x):
                    return 1 / (1 + math.exp(-x))
                test_set['fst_step_scores'] = test_set['fst_step_scores'].apply(sigmoid)

            if dev_set['fst_step_scores'].max() > 1 or dev_set['fst_step_scores'].min() < 0:
                dev_set['fst_step_scores'] = dev_set['fst_step_scores'].apply(sigmoid)
    
    test_set['pred_proba'] = test_set['fst_step_scores']
    dev_set['pred_proba'] = dev_set['fst_step_scores']
    eval_dict = evaluate(test_set, with_prior=with_prior, dev_set=dev_set)
    result_df = pd.DataFrame(eval_dict, index=[0])
    # no column limit
    pd.set_option('display.max_columns', None)
    # print(result_df)

    return eval_dict

In [11]:
all_results = []
for fold in [1, 2, 3]:
    model_name = [
        f'rf_benchmark_fold{fold}', f'xgb_benchmark_fold{fold}', f'lgbm_benchmark_fold{fold}',
        f'deepsad_benchmark_fold{fold}', f'deepisolationforest_benchmark_fold{fold}', f'feawad_benchmark_fold{fold}', f'slad_benchmark_fold{fold}',
        f'sour_100_binary_fold{fold}', f'tabtransformer_benchmark_fold{fold}',
        f'lambdamart_100_binary_1000_trees_fold{fold}', f'lambdamart_100_binary_10000_trees_fold{fold}', f'lambdamart_pabce_10000_fold{fold}'
    ]
    proposed_model_names = [
        f'rankformer_100_binary_softmax_fold{fold}', f'rankformer_100_binary_lambdaloss_fold{fold}',
        f'pariskranker_100_binary_graph_fold{fold}'
    ]
    for proposed_model_name in proposed_model_names:
        model_path = os.path.join('../storage', "creditcard", proposed_model_name)
        if not os.path.isdir(model_path):
            continue
        subfolders = [f"{proposed_model_name}/{f.name}" for f in os.scandir(model_path) if f.is_dir()]
        model_name.extend(subfolders)

    for name in tqdm(model_name):
        eval_dict = eval_single_step(name, with_prior=True)
        if eval_dict is not None:
            eval_dict["model"] = name.rsplit('_fold', 1)[0]  # Strip fold suffix
            eval_dict["fold"] = fold
            all_results.append(eval_dict)

# Combine and compute stats
df_all = pd.DataFrame(all_results)
grouped = df_all.groupby("model").agg(['mean'])

# Flatten column names
grouped.columns = ['_'.join(col).strip() for col in grouped.columns.values]
grouped = grouped.sort_values(by='f1_score_mean', ascending=False)

grouped

100%|██████████| 15/15 [00:04<00:00,  3.69it/s]
100%|██████████| 15/15 [00:04<00:00,  3.74it/s]
100%|██████████| 15/15 [00:04<00:00,  3.74it/s]


Unnamed: 0_level_0,f1_score_mean,PnL_mean,auc_score_mean,precision_mean,sensitivity_mean,specificity_mean,fold_mean
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
pariskranker_100_binary_graph,0.987033,-31368.39,0.9998,0.9743,0.9743,0.999767,2.0
rankformer_100_binary_softmax,0.982,-43821.78,0.999933,0.9644,0.9644,0.999633,2.0
deepsad_benchmark,0.976667,-52499.95,0.999867,0.953867,0.953867,0.999533,2.0
xgb_benchmark,0.9755,-58149.26,0.9999,0.951567,0.951567,0.9995,2.0
lgbm_benchmark,0.9729,-63659.04,0.9999,0.9463,0.9463,0.9995,2.0
lambdamart_100_binary_10000_trees,0.969367,-72817.16,0.9999,0.9393,0.9393,0.999367,2.0
lambdamart_100_binary_1000_trees,0.969367,-72757.35,0.9999,0.9393,0.9393,0.999367,2.0
rf_benchmark,0.968167,-74770.6,0.999833,0.937,0.937,0.999367,2.0
lambdamart_pabce_10000,0.966667,-83895.95,0.999067,0.934033,0.934033,0.999333,2.0
tabtransformer_benchmark,0.959333,-88705.93,0.9971,0.919467,0.919467,0.9992,2.0


In [12]:
all_results = []
for fold in [1, 2, 3]:
    model_name = [
        f'rf_benchmark_fold{fold}', f'xgb_benchmark_fold{fold}', f'lgbm_benchmark_fold{fold}',
        f'deepsad_benchmark_fold{fold}', f'deepisolationforest_benchmark_fold{fold}', f'feawad_benchmark_fold{fold}', f'slad_benchmark_fold{fold}',
        f'sour_100_binary_fold{fold}', f'tabtransformer_benchmark_fold{fold}',
        f'lambdamart_100_binary_1000_trees_fold{fold}', f'lambdamart_100_binary_10000_trees_fold{fold}', f'lambdamart_pabce_10000_fold{fold}'
    ]
    proposed_model_names = [
        f'rankformer_100_binary_softmax_fold{fold}', f'rankformer_100_binary_lambdaloss_fold{fold}',
        f'pariskranker_100_binary_graph_fold{fold}'
    ]
    for proposed_model_name in proposed_model_names:
        model_path = os.path.join('../storage', "creditcard", proposed_model_name)
        if not os.path.isdir(model_path):
            continue
        subfolders = [f"{proposed_model_name}/{f.name}" for f in os.scandir(model_path) if f.is_dir()]
        model_name.extend(subfolders)

    for name in tqdm(model_name):
        eval_dict = eval_single_step(name, with_prior=False)
        if eval_dict is not None:
            eval_dict["model"] = name.rsplit('_fold', 1)[0]  # Strip fold suffix
            eval_dict["fold"] = fold
            all_results.append(eval_dict)

# Combine and compute stats
df_all = pd.DataFrame(all_results)
grouped = df_all.groupby("model").agg(['mean'])

# Flatten column names
grouped.columns = ['_'.join(col).strip() for col in grouped.columns.values]
grouped = grouped.sort_values(by='f1_score_mean', ascending=False)

grouped

100%|██████████| 15/15 [00:12<00:00,  1.22it/s]
100%|██████████| 15/15 [00:12<00:00,  1.21it/s]
100%|██████████| 15/15 [00:12<00:00,  1.22it/s]


Unnamed: 0_level_0,f1_score_mean,PnL_mean,auc_score_mean,precision_mean,sensitivity_mean,specificity_mean,fold_mean
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
pariskranker_100_binary_graph,0.985633,-35332.36,0.9998,0.970767,0.972567,0.999733,2.0
rankformer_100_binary_softmax,0.981133,-45342.84,0.999933,0.958067,0.9673,0.999567,2.0
lgbm_benchmark,0.972233,-66277.72,0.9999,0.968167,0.922933,0.9997,2.0
xgb_benchmark,0.9718,-67644.6,0.9999,0.975833,0.914767,0.999767,2.0
lambdamart_100_binary_10000_trees,0.969867,-72533.09,0.9999,0.9565,0.9247,0.9996,2.0
lambdamart_100_binary_1000_trees,0.9679,-77118.5,0.9999,0.941933,0.9317,0.9994,2.0
rf_benchmark,0.964867,-85672.36,0.999833,0.968333,0.896067,0.999733,2.0
tabtransformer_benchmark,0.959333,-89030.83,0.9971,0.900733,0.939267,0.998967,2.0
rankformer_100_binary_lambdaloss,0.9533,-104037.6,0.999733,0.8883,0.928767,0.9988,2.0
feawad_benchmark,0.696467,-657491.5,0.992133,0.305767,0.830733,0.962867,2.0
