In [1]:
from pathlib import Path
import pandas as pd
import os, json
from analysis_utils import get_tuned_alg_perf
# Parent directory of the current working directory; get_dataset_stat builds
# its "<root_dir>/results/..." paths from it.
root_dir = os.path.dirname(os.getcwd())

In [2]:
def read_json(file):
    """Load and return the parsed contents of a JSON file.

    Args:
        file: Path (str or path-like) of the JSON file to read.

    Returns:
        The decoded JSON value (dict, list, str, number, bool or None).

    Raises:
        FileNotFoundError: If ``file`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # locale encoding, which can mis-decode non-ASCII content.
    with open(file, 'r', encoding='utf-8') as f:
        return json.load(f)

In [3]:
# Folder that holds the metadataset CSVs, relative to the working directory.
metadata_folder = Path("../TabZilla")

# Per-fold results table that the new model results are appended to below.
# NOTE(review): the rendered head() output shows an 'Unnamed: 0' column, which
# looks like a saved index being read back as data - confirm whether
# index_col=0 is wanted here.
metadataset_csv = metadata_folder.joinpath("tutorials/metadataset_clean.csv")
metadataset_df = pd.read_csv(metadataset_csv)

In [4]:
# Preview the first rows to check which columns the loaded metadataset has.
metadataset_df.head()

Unnamed: 0,dataset_fold_id,dataset_name,target_type,alg_name,hparam_source,Log Loss__train,Log Loss__val,Log Loss__test,AUC__train,AUC__val,...,Accuracy__train,Accuracy__val,Accuracy__test,F1__train,F1__val,F1__test,training_time,eval-time__train,eval-time__val,eval-time__test
0,openml__APSFailure__168868__fold_0,openml__APSFailure__168868,binary,CatBoost,default,0.018501,0.02688,0.020194,0.990573,0.98676,...,0.994523,0.991053,0.993421,0.994523,0.991053,0.993421,4.039096,0.185535,0.028121,0.034166
1,openml__APSFailure__168868__fold_0,openml__APSFailure__168868,binary,CatBoost,random_1_s0,0.011968,0.024195,0.017324,0.995348,0.988589,...,0.997533,0.991974,0.994605,0.997533,0.991974,0.994605,3.624039,0.1299,0.020647,0.017871
2,openml__APSFailure__168868__fold_0,openml__APSFailure__168868,binary,CatBoost,random_10_s0,0.022589,0.029668,0.023737,0.990179,0.985547,...,0.992566,0.989605,0.990921,0.992566,0.989605,0.990921,2.494685,0.074099,0.021543,0.01958
3,openml__APSFailure__168868__fold_0,openml__APSFailure__168868,binary,CatBoost,random_11_s0,0.028515,0.034531,0.028539,0.987448,0.985578,...,0.990641,0.986974,0.990132,0.990641,0.986974,0.990132,2.532277,0.061874,0.014086,0.013869
4,openml__APSFailure__168868__fold_0,openml__APSFailure__168868,binary,CatBoost,random_12_s0,0.015119,0.02557,0.019329,0.993757,0.989098,...,0.996382,0.991447,0.993684,0.996382,0.991447,0.993684,4.22003,0.121606,0.034948,0.033927


In [5]:


# Number of folds read from each result file.
N_FOLDS = 10

# Name of the per-dataset result file that is read for every model.
DEFAULT_RESULT_FILE = 'default_trial0_results.json'

# Sub-directories (under ../results/<model>/<dataset>/) that may hold the
# result file, in the order they are tried. Dict order also fixes the order in
# which models are processed.
RESULT_SUBDIRS = {
    'TabPFNModel': ['3000'],
    'TabFlexModel': ['1000000000000000', '100000'],
    'TabFastModel': ['1000000000000000', '100000'],
}

METRICS = ('Log Loss', 'AUC', 'Accuracy', 'F1')
SPLITS = ('train', 'val', 'test')


def _load_default_result(model, dataset):
    """Return the parsed default-trial result of ``model`` on ``dataset``.

    The candidate sub-directories in RESULT_SUBDIRS[model] are tried in order;
    only a missing file triggers the fallback, so corrupt JSON or other errors
    are not silently masked.

    Raises:
        FileNotFoundError: If none of the candidate files exists.
    """
    last_error = None
    for subdir in RESULT_SUBDIRS[model]:
        try:
            return read_json(f'../results/{model}/{dataset}/{subdir}/{DEFAULT_RESULT_FILE}')
        except FileNotFoundError as err:
            last_error = err
    raise last_error


def _fold_record(model, dataset, result, fold_id):
    """Build one metadataset-style row for a single fold of ``result``.

    Raises:
        KeyError, IndexError, TypeError: If the result lacks an expected entry
            for this fold.
    """
    scorers = result['scorers']
    timers = result['timers']
    record = {
        'dataset_fold_id': f'{dataset}__fold_{fold_id}',
        'dataset_name': dataset,
        'target_type': result['dataset']['target_type'],
        'alg_name': model,
        'hparam_source': result['hparam_source'],
    }
    # Column order: all splits of one metric, then the next metric.
    for metric in METRICS:
        for split in SPLITS:
            record[f'{metric}__{split}'] = scorers[split][metric][fold_id]
    # Fold 0 deliberately reports the fold-1 train timer.
    # NOTE(review): presumably the first fold's timing includes one-off
    # start-up cost - confirm.
    train_idx = 1 if fold_id == 0 else fold_id
    record['training_time'] = timers['train'][train_idx]
    # 'time__*' and 'eval-time__*' for val/test are filled from the same timers.
    record['time__val'] = timers['val'][fold_id]
    record['time__test'] = timers['test'][fold_id]
    record['eval-time__train'] = timers['train-eval'][fold_id]
    record['eval-time__val'] = timers['val'][fold_id]
    record['eval-time__test'] = timers['test'][fold_id]
    return record


fold_records = []

for model in RESULT_SUBDIRS:
    model_dir = f'../results/{model}'
    # Sorted so the row order of the output is reproducible across machines.
    for dataset in sorted(os.listdir(model_dir)):
        # Ignore stray files (e.g. editor/OS artefacts) next to dataset folders.
        if not os.path.isdir(f'{model_dir}/{dataset}'):
            continue
        result = _load_default_result(model, dataset)

        for fold_id in range(N_FOLDS):
            try:
                fold_records.append(_fold_record(model, dataset, result, fold_id))
            except (KeyError, IndexError, TypeError) as e:
                # Best effort: report the incomplete fold and keep going.
                print(e)
                print(f'Error with {dataset} fold {fold_id}')

new_df = pd.DataFrame(fold_records)

In [6]:
# Stack the newly collected per-fold rows underneath the baseline metadataset
# rows (row-wise concatenation; original indices are kept).
updated_df = pd.concat(objs=[metadataset_df, new_df])

In [7]:
def get_dataset_stat(dataset):
    """Return basic size statistics of ``dataset`` from its TabFlexModel result.

    The result file is looked up under
    ``<root_dir>/results/TabFlexModel/<dataset>/100000`` when that folder
    exists, otherwise under ``.../1000000000000000``. Within the chosen folder
    the file ``default_trial0_results.json`` is taken either directly or from
    the first (alphabetically) sub-directory that contains it.

    Args:
        dataset: Name of the dataset's result folder.

    Returns:
        dict with keys ``num_classes``, ``num_features`` and ``num_instances``
        copied from the result's ``dataset`` section.

    Raises:
        FileNotFoundError: If the dataset folder is missing or no result file
            can be found in it.
    """
    result_file = 'default_trial0_results.json'
    dataset_dir = f"{root_dir}/results/TabFlexModel/{dataset}"
    if '100000' in os.listdir(dataset_dir):
        result_dir = f"{dataset_dir}/100000"
    else:
        result_dir = f"{dataset_dir}/1000000000000000"

    # Direct file first, then one level of sub-directories in sorted order so
    # the choice does not depend on the arbitrary order of os.listdir.
    candidates = [f"{result_dir}/{result_file}"]
    for item in sorted(os.listdir(result_dir)):
        if os.path.isdir(f"{result_dir}/{item}"):
            candidates.append(f"{result_dir}/{item}/{result_file}")

    for candidate in candidates:
        if os.path.isfile(candidate):
            dataset_info = read_json(candidate)['dataset']
            return {
                'num_classes': dataset_info['num_classes'],
                'num_features': dataset_info['num_features'],
                'num_instances': dataset_info['num_instances'],
            }

    # Previously this case surfaced as an UnboundLocalError on `result`.
    raise FileNotFoundError(f"No {result_file} found under {result_dir}")

In [8]:
# Thresholds of the model-selection rule.
FAST_MIN_INSTANCES = 50000
FAST_MAX_FEATURES = 100
FLEX_MIN_FEATURE_RATIO = 0.2
FLEX_MIN_INSTANCES = 3000


def _select_alg_name(data_stat):
    """Return which model's row represents 'Ours' for a dataset.

    Args:
        data_stat: dict with 'num_instances' and 'num_features'.

    Returns:
        'TabFastModel' for datasets with many instances and few features,
        'TabFlexModel' for datasets with many features (or a high
        feature-to-instance ratio on at least FLEX_MIN_INSTANCES instances),
        otherwise 'TabPFNModel'.
    """
    num_instances = data_stat['num_instances']
    num_features = data_stat['num_features']
    if num_instances >= FAST_MIN_INSTANCES and num_features <= FAST_MAX_FEATURES:
        return 'TabFastModel'
    if num_features > FAST_MAX_FEATURES or (
        (num_features / num_instances >= FLEX_MIN_FEATURE_RATIO)
        and num_instances >= FLEX_MIN_INSTANCES
    ):
        return 'TabFlexModel'
    return 'TabPFNModel'


new_rows = []
# Dataset statistics are identical for every fold of a dataset, so read the
# result file once per dataset instead of once per fold.
dataset_stat_cache = {}

# Iterate over every fold for which a TabFlexModel result exists.
dataset_fold_ids = updated_df[updated_df.alg_name == 'TabFlexModel'].dataset_fold_id.unique()
for dataset_fold_id in dataset_fold_ids:
    fold_rows = updated_df[updated_df.dataset_fold_id == dataset_fold_id]
    # The TabFlexModel row is guaranteed to exist by construction of the loop.
    dataset_name = fold_rows[fold_rows.alg_name == 'TabFlexModel'].iloc[-1].dataset_name
    if dataset_name not in dataset_stat_cache:
        dataset_stat_cache[dataset_name] = get_dataset_stat(dataset_name)

    chosen_alg = _select_alg_name(dataset_stat_cache[dataset_name])
    chosen_rows = fold_rows[fold_rows.alg_name == chosen_alg]
    if chosen_rows.empty:
        # Only the selected model's row is required; report and skip the fold
        # instead of failing with an IndexError from .iloc[-1].
        print(f'No {chosen_alg} result for {dataset_fold_id}; skipping')
        continue

    # Take the last matching row, i.e. the most recently appended result.
    new_row = chosen_rows.iloc[-1].copy()
    new_row['alg_name'] = 'Ours'
    new_rows.append(new_row)

In [9]:
# Rows for the combined 'Ours' method, one per selected fold.
new_df = pd.DataFrame(new_rows)

# Drop any 'Ours' rows left from an earlier run of this cell before appending,
# so re-running the cell does not duplicate them in updated_df or in the CSV.
without_ours_df = updated_df[updated_df.alg_name != 'Ours']
updated_df = pd.concat([without_ours_df, new_df], axis=0)

# Write the extended metadataset next to the original one (index not saved).
updated_df.to_csv(metadata_folder / "tutorials/metadataset_new.csv", index=False)