In [1]:
import pandas as pd
import os, sys, json
# Parent of the current working directory; the results and metadata paths
# used below are resolved relative to it.
root_dir = os.path.dirname(os.getcwd())
# Guard the append so that re-running this cell does not stack duplicate
# entries on sys.path.
if root_dir not in sys.path:
    sys.path.append(root_dir)

# Datasets

In [37]:
# Dataset identifiers evaluated in this notebook, one per line.  None of the
# names contains whitespace, so splitting the text block on whitespace yields
# exactly one tuple entry per line, in the order written.
DATASETS = tuple(
    """
    openml__Australian__146818
    openml__LED-display-domain-7digit__125921
    openml__MiceProtein__146800
    openml__acute-inflammations__10089
    openml__analcatdata_authorship__3549
    openml__analcatdata_boxing1__3540
    openml__analcatdata_chlamydia__3739
    openml__analcatdata_dmft__3560
    openml__anneal__2867
    openml__autos__9
    openml__balance-scale__11
    openml__blood-transfusion-service-center__10101
    openml__blood-transfusion-service-center__145836
    openml__breast-cancer__145799
    openml__breast-w__15
    openml__colic__25
    openml__colic__27
    openml__credit-approval__29
    openml__cylinder-bands__14954
    openml__dermatology__35
    openml__diabetes__37
    openml__dresses-sales__125920
    openml__ecoli__145977
    openml__eucalyptus__2079
    openml__fertility__9984
    openml__fri_c0_100_5__3620
    openml__fri_c3_100_5__3779
    openml__glass__40
    openml__hayes-roth__146063
    openml__heart-c__48
    openml__heart-h__50
    openml__hill-valley__145847
    openml__ilpd__9971
    openml__ionosphere__145984
    openml__iris__59
    openml__irish__3543
    openml__kc2__3913
    openml__labor__4
    openml__lung-cancer__146024
    openml__lymph__10
    openml__monks-problems-2__146065
    openml__pc1__3918
    openml__postoperative-patient-data__146210
    openml__profb__3561
    openml__qsar-biodeg__9957
    openml__rabe_266__3647
    openml__socmob__3797
    openml__sonar__39
    openml__synthetic_control__3512
    openml__tae__47
    openml__tic-tac-toe__49
    openml__transplant__3748
    openml__vehicle__53
    openml__visualizing_environmental__3602
    openml__visualizing_livestock__3731
    openml__wdbc__9946
    openml__yeast__145793
    """.split()
)
# Load the cleaned metadataset CSV; the baseline accuracies are looked up in
# this table later in the notebook.
metadataset_path = f"{root_dir}/TabZilla/tutorials/metadataset_clean.csv"
metadataset_df = pd.read_csv(metadataset_path)

def read_json(file):
    """Load and return the parsed contents of a JSON file.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The decoded JSON document (dict, list, or scalar).

    Raises:
        FileNotFoundError: If ``file`` does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Pin the encoding so that decoding does not depend on the platform's
    # locale default (JSON interchange text is UTF-8).
    with open(file, "r", encoding="utf-8") as f:
        return json.load(f)
    
# Result store filled by the cells below:
#   eval[method][dataset] -> {'median_acc': ..., 'mean_acc': ...}
# NOTE: this name shadows the builtin eval() for the rest of the notebook.
eval = dict()

In [25]:
# Collect the TabFlex results: tabflex_results[dataset][trial] holds the parsed
# JSON of that trial.  Trials whose file is missing are reported and skipped.
tabflex_results = {}
for dataset in DATASETS:
    result_dir = f"{root_dir}/results/TabFlexModel/{dataset}"
    trials = tabflex_results[dataset] = {}
    for i in range(30):
        # Trial 0 is read from the "default" file name, every later trial
        # from the "random_<i>" file name.
        if i == 0:
            file_name = "default_trial0_results.json"
        else:
            file_name = f"random_{i}_s0_trial{i}_results.json"
        try:
            trials[i] = read_json(f"{result_dir}/{file_name}")
        except FileNotFoundError:
            print(f"TabFlexModel: {dataset} trial {i} not found")
            

In [38]:
# Aggregate the TabFlex test accuracy per dataset over every loaded trial:
# the 'Accuracy' entries of all trials are pooled, then summarised.
eval['tabflex'] = {}
for dataset in tabflex_results:
    accs = []
    eval['tabflex'][dataset] = {}
    for i in tabflex_results[dataset]:
        try:
            accs.extend(tabflex_results[dataset][i]['scorers']['test']['Accuracy'])
        except (KeyError, TypeError) as exc:
            # The trial file was loaded but lacks a usable accuracy entry.
            # Report the actual cause instead of a misleading "not found",
            # and do not swallow unrelated errors such as KeyboardInterrupt.
            print(f"TabFlexModel: {dataset} trial {i} has no usable test accuracy ({exc!r})")
    # An explicit float dtype keeps the empty case (no usable trial) well
    # defined: median and mean are then NaN.
    acc_series = pd.Series(accs, dtype=float)
    eval['tabflex'][dataset]['median_acc'] = acc_series.median()
    eval['tabflex'][dataset]['mean_acc'] = acc_series.mean()
    

In [47]:
def get_baselines(method, eval, datasets=None, metadata=None):
    """Store a baseline's default-hyperparameter test accuracy per dataset.

    For every dataset, the rows of the metadata table whose ``alg_name``
    equals ``method`` and whose ``hparam_source`` equals ``"default"`` are
    selected, and the median and mean of their ``Accuracy__test`` values are
    written to ``eval[method][dataset]`` under ``'median_acc'`` and
    ``'mean_acc'``.  A dataset without matching rows gets NaN for both.

    Args:
        method: Value to match in the ``alg_name`` column.
        eval: Result dictionary, updated in place (``eval[method]`` is reset
            to a fresh dict).  The parameter name shadows the builtin inside
            this function; it is kept so existing callers keep working.
        datasets: Iterable of dataset names to evaluate.  Defaults to the
            notebook-level ``DATASETS``.
        metadata: DataFrame with at least the columns ``alg_name``,
            ``hparam_source``, ``dataset_name`` and ``Accuracy__test``.
            Defaults to the notebook-level ``metadataset_df``.

    Returns:
        None.
    """
    if datasets is None:
        datasets = DATASETS
    if metadata is None:
        metadata = metadataset_df

    eval[method] = {}

    # The method / hparam filter does not depend on the dataset, so apply it
    # once instead of re-scanning the full table for every dataset.
    method_rows = metadata.loc[
        (metadata["alg_name"] == method)
        & (metadata["hparam_source"] == "default"),
        ["dataset_name", "Accuracy__test"],
    ]

    for dataset in datasets:
        accuracy = method_rows.loc[
            method_rows["dataset_name"] == dataset, "Accuracy__test"
        ]
        eval[method][dataset] = {
            'median_acc': accuracy.median(),
            'mean_acc': accuracy.mean(),
        }
    

In [50]:
# Baseline algorithm names; get_baselines matches each one against the
# metadataset's ``alg_name`` column.
baselines = [
    'LinearModel', 'KNN', 'SVM', 'DecisionTree', 'RandomForest',
    'XGBoost', 'CatBoost', 'LightGBM',
    'MLP', 'TabNet', 'VIME', 'TabTransformer', 'NODE', 'DeepGBM',
    'STG', 'NAM', 'DeepFM', 'SAINT', 'DANet', 'TabPFNModel',
    'rtdl_MLP', 'rtdl_ResNet', 'rtdl_FTTransformer',
]

In [55]:
# Inspect which columns the metadataset table provides.
metadataset_df.columns

Index(['dataset_fold_id', 'dataset_name', 'target_type', 'alg_name',
       'hparam_source', 'Log Loss__train', 'Log Loss__val', 'Log Loss__test',
       'AUC__train', 'AUC__val', 'AUC__test', 'Accuracy__train',
       'Accuracy__val', 'Accuracy__test', 'F1__train', 'F1__val', 'F1__test',
       'training_time', 'eval-time__train', 'eval-time__val',
       'eval-time__test'],
      dtype='object')

In [51]:
# Fill eval[<baseline>] for every baseline method.
for baseline in baselines:
    try:
        get_baselines(baseline, eval)
    except Exception as exc:
        # Keep going with the remaining baselines, but surface the cause.
        # A bare ``except:`` would hide it and also swallow KeyboardInterrupt.
        print(f"Error in {baseline}: {exc!r}")

In [52]:
# Print one pipe-delimited row per method: the average, over datasets, of the
# method's per-dataset mean accuracy.
methods = [*baselines, 'tabflex']

for method in methods:
    # After transposing, rows are datasets and columns are the stored statistics.
    result = pd.DataFrame(eval[method]).transpose()
    overall_acc = result['mean_acc'].mean()
    print(f"| {method} | {overall_acc} |")

| LinearModel | 0.7736275450142834 |
| KNN | 0.7397585454534903 |
| SVM | 0.7477843851667558 |
| DecisionTree | 0.7719035212915039 |
| RandomForest | 0.8065440737065069 |
| XGBoost | 0.8094498466827325 |
| CatBoost | 0.8146179830353774 |
| LightGBM | 0.7802465667858238 |
| MLP | 0.7212519755562558 |
| TabNet | 0.6878234239482723 |
| VIME | 0.6055174415744894 |
| TabTransformer | 0.6899508926823906 |
| NODE | 0.79062151711632 |
| DeepGBM | nan |
| STG | 0.6734659013628531 |
| NAM | 0.7089848212067189 |
| DeepFM | 0.8165445403160158 |
| SAINT | 0.7988066966670798 |
| DANet | 0.7585700208317423 |
| TabPFNModel | 0.8285329708589384 |
| rtdl_MLP | 0.7475208433391759 |
| rtdl_ResNet | 0.7949493862728517 |
| rtdl_FTTransformer | 0.7790287773954705 |
| tabflex | 0.7897420889634574 |


In [45]:
# Mean over datasets of TabFlex's per-dataset mean accuracy.  It is built
# directly from eval['tabflex'] rather than from the loop variable `result`:
# that frame is already transposed (datasets as rows), so
# `result.T['mean_acc']` raises KeyError when the notebook is run top to bottom.
pd.DataFrame(eval['tabflex']).T['mean_acc'].mean()

0.7897420889634574