In [18]:
import os
import toml
import json

def load_dataset_config(config_path):
    """Read a benchmark config file listing one dataset name per line.

    Args:
        config_path: Path to a plain-text file with one dataset name per line.

    Returns:
        A list of dataset names in file order, with surrounding whitespace
        stripped. Blank / whitespace-only lines are skipped so that a trailing
        newline or spacer line does not produce an empty dataset name.

    Raises:
        OSError: If ``config_path`` cannot be opened.
    """
    with open(config_path, "r") as f:
        stripped_lines = (line.strip() for line in f)
        # An empty name would later be joined into paths such as
        # "<root>//info.json", so drop it here.
        return [name for name in stripped_lines if name]

# Load benchmark datasets
# Each config file is read by load_dataset_config() as a plain-text list with
# one dataset name per line; the three lists drive the generation loops below.
kdd_config_path = "/data/Blob_EastUS/v-zhenxu2/projects/TabFM/configs/kdd16"
tabr_config_path = "/data/Blob_EastUS/v-zhenxu2/projects/TabFM/configs/default8"
tree_config_path = "/data/Blob_EastUS/v-zhenxu2/projects/TabFM/configs/tree36"
kdd_datasets = load_dataset_config(kdd_config_path)
tabr_datasets = load_dataset_config(tabr_config_path)
tree_datasets = load_dataset_config(tree_config_path)

# Root directory with one sub-directory per dataset; "<root>/<dataset>" is
# written into every generated config as the data path.
dataset_root_dir = "/data/Blob_EastUS/xumeng/data/tabular/tabr/resample"
# Root of the existing experiment configs that get_config_path() searches for
# a template ("<root>/<model>/<dataset>/default[2]-evaluation/0.toml").
# NOTE(review): both roots are absolute, machine-specific paths — confirm they
# exist before re-running this notebook elsewhere.
tabr_config_root_dir = "/home/xumengwen/repos/tabular-dl-tabr/exp"

def get_config_path(root_dir, model, dataset):
    """Locate the default-evaluation config of ``model`` for ``dataset``.

    Looks for ``<root_dir>/<model>/<dataset>/default-evaluation/0.toml`` first
    and then for the same file under ``default2-evaluation``.

    Returns:
        The first candidate path that exists, or ``None`` when neither does.
    """
    for evaluation_dir in ('default-evaluation', 'default2-evaluation'):
        candidate = os.path.join(root_dir, model, dataset, evaluation_dir, '0.toml')
        if os.path.exists(candidate):
            return candidate
    return None

# Config for tree models and TabR with default hyperparameters

In [15]:
# Generate default-hyperparameter configs for the tree models and TabR.
# For every (dataset, model) pair an existing 0.toml is used as a template:
# the dataset's own config if present, otherwise the config of an example
# dataset of the same task type. The template is patched as text and written
# to <target_config_root_dir>/<model>/<dataset>/default-evaluation/0.toml.
error_cnt = 0
reg_default_example_dataset = 'black-friday'
cls_default_example_dataset = 'adult'
tabr_model_with_default_hyper = ['lightgbm_', 'catboost_', 'xgboost_', 'tabr']
target_config_root_dir = "/data/Blob_WestJP/xumeng/projects/TabR/exp/default_all_baseline_1017"
for dataset in tabr_datasets + tree_datasets:
    # Task type is inferred from the dataset *name* in this cell.
    # NOTE(review): the other cells read task_type from info.json instead —
    # confirm that every regression dataset here has 'regression' in its name.
    dataset_type = 'reg' if 'regression' in dataset else 'cls'
    for model in tabr_model_with_default_hyper:
        config_path = get_config_path(tabr_config_root_dir, model, dataset)
        config_dataset = dataset
        if config_path is None:
            # No dataset-specific config: fall back to an example dataset.
            config_dataset = reg_default_example_dataset if dataset_type == 'reg' else cls_default_example_dataset
            config_path = get_config_path(tabr_config_root_dir, model, config_dataset)
        if config_path is None:
            # Neither the dataset nor the fallback has a template. Count it and
            # skip; previously this fell through to open(None) and raised
            # TypeError.
            error_cnt += 1
            print(f"skip: no template config for model={model!r}, dataset={dataset!r}")
            continue
        with open(config_path, 'r') as f:
            content = f.read()
        content = content.replace("cache = true", "cache = false")
        # Point the template's data path at the resampled copy of this dataset.
        content = content.replace(f'path = ":data/{config_dataset}"', f'path = "{dataset_root_dir}/{dataset}"')
        # Fix typo bug in tabr
        if model == 'tabr':
            content = content.replace("mixer_dropout", "context_dropout")
        target_config_path = os.path.join(target_config_root_dir, model, dataset, 'default-evaluation', '0.toml')
        # Save config into target path
        os.makedirs(os.path.dirname(target_config_path), exist_ok=True)
        with open(target_config_path, 'w') as f:
            f.write(content)
print("error:", error_cnt)

error: 0


# Config for nn baseline with default hyperparameters

In [16]:
# Generate default-hyperparameter configs for the NN baselines.
# For every (dataset, model) pair an existing 0.toml is loaded as a template
# (the dataset's own config if present, otherwise the 'covtype' one), its
# [data] section is overridden, and the result is written to
# <target_config_root_dir>/<model>/<dataset>/default-evaluation/0.toml.
error_cnt = 0
default_example_dataset = 'covtype'
models = ['saint', 'ft_transformer', 'ffn'] # ffn means MLP
target_config_root_dir = "/data/Blob_WestJP/xumeng/projects/TabR/exp/default_all_baseline_1017"
for dataset in tabr_datasets + tree_datasets:
    # Use a context manager so the info.json handle is closed deterministically.
    with open(f"{dataset_root_dir}/{dataset}/info.json") as info_file:
        info = json.load(info_file)
    dataset_type = info['task_type']
    for model in models:
        config_path = get_config_path(tabr_config_root_dir, model, dataset)
        config_dataset = dataset
        if config_path is None:
            # No dataset-specific config: fall back to the example dataset.
            config_dataset = default_example_dataset
            config_path = get_config_path(tabr_config_root_dir, model, config_dataset)
        if config_path is None:
            # Neither the dataset nor the fallback has a template. Count it and
            # skip; previously this fell through to toml.load(None) and raised.
            error_cnt += 1
            print(f"skip: no template config for model={model!r}, dataset={dataset!r}")
            continue
        config = toml.load(config_path)
        config['data']['cache'] = False
        config['data']['path'] = f"{dataset_root_dir}/{dataset}"
        config['data']['num_policy'] = "quantile"
        config['data']['cat_policy'] = "ordinal"
        # Regression targets are standardized; other task types get the
        # '__null__' placeholder string.
        if dataset_type == 'regression':
            config['data']['y_policy'] = 'standard'
        else:
            config['data']['y_policy'] = '__null__'
        # Save config into target path
        target_config_path = os.path.join(target_config_root_dir, model, dataset, 'default-evaluation', '0.toml')
        os.makedirs(os.path.dirname(target_config_path), exist_ok=True)
        with open(target_config_path, 'w') as f:
            toml.dump(config, f)
print("error:", error_cnt)

error: 0


# Baseline with default hyperparameters for KDD benchmark

In [20]:
# Generate default-hyperparameter configs for all baselines on the KDD
# benchmark. For every (dataset, model) pair an existing 0.toml is loaded as a
# template (the dataset's own config if present, otherwise the 'covtype' one),
# patched, and written to
# <target_config_root_dir>/<model>/<dataset>/default-evaluation/0.toml.
error_cnt = 0
default_example_dataset = 'covtype'
models = ['lightgbm_', 'catboost_', 'xgboost_', 'tabr', 'saint', 'ft_transformer', 'ffn'] # ffn means MLP

target_config_root_dir = "/data/Blob_WestJP/xumeng/projects/TabR/exp/default_all_baseline_kdd_1018"
for dataset in kdd_datasets:
    # Use a context manager so the info.json handle is closed deterministically.
    with open(f"{dataset_root_dir}/{dataset}/info.json") as info_file:
        info = json.load(info_file)
    dataset_type = info['task_type']
    for model in models:
        config_path = get_config_path(tabr_config_root_dir, model, dataset)
        config_dataset = dataset
        if config_path is None:
            # No dataset-specific config: fall back to the example dataset.
            config_dataset = default_example_dataset
            config_path = get_config_path(tabr_config_root_dir, model, config_dataset)
        if config_path is None:
            # Neither the dataset nor the fallback has a template. Count it and
            # skip; previously this fell through to toml.load(None) and raised.
            error_cnt += 1
            print(f"skip: no template config for model={model!r}, dataset={dataset!r}")
            continue
        config = toml.load(config_path)
        config['data']['cache'] = False
        config['data']['path'] = f"{dataset_root_dir}/{dataset}"
        # Fix typo bug in tabr: rename 'mixer_dropout' to 'context_dropout'.
        # Guarded so a template that lacks the misspelled key (e.g. one that is
        # already fixed) does not raise KeyError.
        if model == 'tabr' and 'mixer_dropout' in config['model']:
            config['model']['context_dropout'] = config['model'].pop('mixer_dropout')
        # Set feature processing policy
        if model in ['tabr', 'ffn', 'saint', 'ft_transformer']:
            config['data']['num_policy'] = "quantile"
            config['data']['cat_policy'] = "ordinal"
        elif model in ['lightgbm_', 'xgboost_']:
            config['data']['num_policy'] = '__null__'
            config['data']['cat_policy'] = "one-hot"
        elif model in ['catboost_']:
            config['data']['num_policy'] = '__null__'
            config['data']['cat_policy'] = '__null__'
        # Set label processing policy
        if dataset_type == 'regression':
            config['data']['y_policy'] = 'standard'
        else:
            config['data']['y_policy'] = '__null__'
        # Save config into target path
        target_config_path = os.path.join(target_config_root_dir, model, dataset, 'default-evaluation', '0.toml')
        os.makedirs(os.path.dirname(target_config_path), exist_ok=True)
        with open(target_config_path, 'w') as f:
            toml.dump(config, f)
print("error:", error_cnt)

error: 0
