### Baselines

This code builds a simple bagging ensemble based on a K-fold train/validation split. Use it for quick idea testing with GPT-5-assisted tree generation.

In [1]:
import optuna

import os
import re
import json
import numpy as np
import pandas as pd
import openml
import smolagents
from pathlib import Path
from huggingface_hub import login
import prompting
import tree_agent
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor


from task import metric_func_by_task, get_task_variables, add_tabpfn_baseline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the benchmark suite and keep only its small datasets.
# The result, `dataset_name_to_task_id`, maps dataset name -> OpenML task id
# for every task whose dataset reports fewer than 2,500 instances.
tabarena_version = "tabarena-v0.1"
benchmark_suite = openml.study.get_suite(tabarena_version)
task_ids = benchmark_suite.tasks

dataset_name_to_task_id = {}
for task_id in task_ids:
    task = openml.tasks.get_task(task_id)
    dataset = task.get_dataset()
    n_samples = dataset.qualities["NumberOfInstances"]

    # Skip anything that is not strictly below the size threshold.
    if not (n_samples < 2_500):
        continue

    dataset_name_to_task_id[dataset.name] = task_id
    # One summary line per retained dataset: name, row count, task id.
    print(dataset.name, int(n_samples), task_id)

airfoil_self_noise 1503 363612
anneal 898 363614
Another-Dataset-on-used-Fiat-500 1538 363615
blood-transfusion-service-center 748 363621
concrete_compressive_strength 1030 363625
credit-g 1000 363626
diabetes 768 363629
Fitness_Club 1500 363671
hazelnut-spread-contaminant-detection 2400 363674
healthcare_insurance_expenses 1338 363675
Is-this-a-good-customer 1723 363682
Marketing_Campaign 2240 363684
maternal_health_risk 1014 363685
qsar-biodeg 1054 363696
QSAR_fish_toxicity 907 363698
website_phishing 1353 363707
MIC 1699 363711


### Tuning utils

In [3]:
def preprocess_for_tree(data):
    """Ordinal-encode categorical features so tree models can consume them.

    A column is treated as categorical when ``data['categorical_indicator']``
    flags it, or when its dtype is object, string or pandas ``category``.
    Every other column is passed through unchanged.

    Args:
        data: dict holding at least ``'X_train'`` and ``'X_test'`` (DataFrames,
            expected to share the same columns) and ``'categorical_indicator'``
            (one truthy/falsy flag per column, in column order).

    Returns:
        A new dict equal to ``data`` except that ``'X_train'`` and ``'X_test'``
        hold the ``ColumnTransformer`` output: the passthrough columns first,
        followed by the ordinal-encoded categorical columns.
    """
    categorical_indicator = data['categorical_indicator']

    # NOTE(review): the encoder is fitted on train and test rows together, so
    # categories that occur only in the test split get their own code instead
    # of `unknown_value=-1` -- confirm this is intended for the benchmark.
    X = pd.concat([data['X_train'], data['X_test']])

    def _is_categorical_dtype(dtype):
        # The previous identity check (`dtype is np.dtype("O")`) only caught
        # plain object columns; `category` and string-dtype columns that the
        # indicator missed were passed to the tree un-encoded.
        return (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )

    categorical_features = [
        col
        for col, is_cat in zip(X.columns, categorical_indicator)
        if is_cat or _is_categorical_dtype(X[col].dtype)
    ]
    numeric_features = [col for col in X.columns if col not in categorical_features]

    # Only add the encoder when there is something to encode.
    transformers = [("num", "passthrough", numeric_features)]
    if categorical_features:
        transformers.append(
            (
                "cat",
                OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
                categorical_features,
            )
        )

    preprocessor = ColumnTransformer(transformers)
    preprocessor.fit(X)

    return data | {
        'X_train': preprocessor.transform(data['X_train']),
        'X_test': preprocessor.transform(data['X_test']),
    }

def tree_search_space(trial, task_type):
    """Sample one decision-tree hyperparameter configuration from ``trial``.

    Args:
        trial: object exposing ``suggest_int``, ``suggest_categorical`` and
            ``suggest_float`` (an Optuna trial in this notebook).
        task_type: ``"regression"`` selects the regression split criteria;
            any other value selects the classification criteria.

    Returns:
        dict of keyword arguments for the tree constructor. ``"ccp_alpha"`` is
        present only when the sampled ``use_ccp_alpha`` flag is true; the flag
        itself is never part of the returned dict.
    """
    if task_type == "regression":
        criteria = ["squared_error", "friedman_mse"]
    else:
        criteria = ["gini", "entropy", "log_loss"]

    # The suggest_* calls are issued in a fixed order on purpose: a seeded
    # sampler draws values sequentially, so reordering would change results.
    depth = trial.suggest_int("max_depth", 1, 100)
    split_size = trial.suggest_int("min_samples_split", 2, 20)
    leaf_size = trial.suggest_int("min_samples_leaf", 1, 20)
    criterion = trial.suggest_categorical("criterion", criteria)

    params = {
        "max_depth": depth,
        "min_samples_split": split_size,
        "min_samples_leaf": leaf_size,
        "criterion": criterion,
    }

    prune = trial.suggest_categorical("use_ccp_alpha", [False, True])
    if prune:
        params["ccp_alpha"] = trial.suggest_float("ccp_alpha", 1e-5, 0.1, log=True)

    return params

def get_pred_fn(model, task_type):
    """Return the prediction callable of ``model`` that suits ``task_type``.

    * ``'regression'`` -> ``model.predict``
    * ``'binary'``     -> a function returning column 1 of ``model.predict_proba``
    * anything else    -> ``model.predict_proba``

    Attributes are looked up lazily, so a model without ``predict_proba`` is
    fine as long as ``task_type`` is ``'regression'``.
    """
    if task_type == 'regression':
        return model.predict

    if task_type != 'binary':
        return model.predict_proba

    def second_column_proba(x):
        # Keep only the probabilities stored at column index 1.
        return model.predict_proba(x)[:, 1]

    return second_column_proba

def tune(model_fn, get_params_fn, X_train, y_train, X_val, y_val, task_type,
         n_trials=None, random_state=None):
    """Tune hyperparameters with Optuna and return the best configuration.

    Each trial samples parameters with ``get_params_fn(trial, task_type)``,
    fits ``model_fn(random_state=..., **params)`` on the training split and
    scores it on the validation split with ``metric_func_by_task[task_type]``.

    Args:
        model_fn: estimator class/factory accepting ``random_state`` plus the
            sampled parameters as keyword arguments.
        get_params_fn: callable ``(trial, task_type) -> dict`` of model kwargs.
        X_train, y_train: data the candidate models are fitted on.
        X_val, y_val: data the candidate models are scored on.
        task_type: key into ``metric_func_by_task``; also selects the
            prediction function via ``get_pred_fn``.
        n_trials: number of Optuna trials; ``None`` uses the notebook-level
            ``N_TRIALS``.
        random_state: seed passed to ``model_fn``; ``None`` uses the
            notebook-level ``RANDOM_STATE``.

    Returns:
        The parameter dict (exactly as produced by ``get_params_fn``) of the
        best trial.
    """
    # Fall back to the notebook globals so existing callers keep working.
    n_trials = N_TRIALS if n_trials is None else n_trials
    random_state = RANDOM_STATE if random_state is None else random_state

    # The study always maximises: the metric is negated for 'regression' and
    # 'multiclass' and used as-is for every other task type.
    sign = -1 if task_type in ('regression', 'multiclass') else 1
    metric_fn = metric_func_by_task[task_type]

    def objective(trial):
        params = get_params_fn(trial, task_type)
        # Keep the exact model kwargs on the trial itself. `study.best_params`
        # is not usable here because it would also contain helper flags
        # sampled by `get_params_fn` that are not model arguments.
        trial.set_user_attr("params", params)
        model = model_fn(random_state=random_state, **params)
        model.fit(X_train, y_train)
        pred_fn = get_pred_fn(model, task_type)
        return sign * metric_fn(y_val, pred_fn(X_val))

    # Lower the verbosity *before* creating the study; otherwise the
    # "A new study created in memory" message of the first study is logged.
    optuna.logging.set_verbosity(optuna.logging.ERROR)
    sampler = optuna.samplers.TPESampler(seed=0)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    return study.best_trial.user_attrs["params"]

### Actual run

In [4]:
RANDOM_STATE = 42
N_TRIALS = 200
N_REPEATS = 5  # repeats evaluated per dataset (fold 0 of each repeat)

# ---------------------------------------------------------------------------
# TabArena reference scores.
# Loaded and filtered once, before the dataset loop: nothing here depends on
# the dataset, so repeating the download for every dataset was wasted work.
# ---------------------------------------------------------------------------
df = pd.read_parquet(
    "https://tabarena.s3.us-west-2.amazonaws.com/results/df_results_leaderboard.parquet"
)

tabarena_baselines = [
    "TABPFNV2 (default)",
    "TABPFNV2 (tuned)",
    "XGB (default)",
    "XGB (tuned)",
]

# fold ids -- see https://github.com/autogluon/tabrepo/issues/209 for why this formula
# NOTE(review): the literal 5 below presumably mirrors N_REPEATS -- confirm
# before changing either value.
d, m = df.fold.divmod(10)

df_fold0_5repeats = df[(d == 0) & (m < 5)]
# .copy() gives an independent frame, so the in-place conversion below does not
# write into a slice of `df` (chained assignment / SettingWithCopyWarning).
df_filtered = df_fold0_5repeats[
    df_fold0_5repeats["method"].isin(tabarena_baselines)
].copy()

# Convert 1-AUC back to AUC for binary classification
# other errors are logloss and RMSE
mask = (df_filtered["problem_type"] == "binary") & (
    df_filtered["metric"] == "roc_auc"
)
df_filtered.loc[mask, "metric_error"] = 1 - df_filtered.loc[mask, "metric_error"]

Path('results').mkdir(exist_ok=True)

# ---------------------------------------------------------------------------
# Default vs. tuned decision tree on every selected dataset.
# ---------------------------------------------------------------------------
for dataset_name in dataset_name_to_task_id:
    print('\n\n')
    print('=' * 100)
    print(f'Running for {dataset_name}')

    default_tree_scores = []
    tuned_tree_scores = []

    for repeat_index in range(N_REPEATS):
        print("Beginning repeat", repeat_index)
        task = openml.tasks.get_task(dataset_name_to_task_id[dataset_name])
        data = get_task_variables(task, fold=0, repeat=repeat_index)
        data = preprocess_for_tree(data)
        task_type = data['task_type']

        # Hold out 20% of the training split for hyperparameter selection;
        # the split is stratified only for multiclass tasks.
        X_train, X_val, y_train, y_val = train_test_split(
            data["X_train"],
            data["y_train"],
            test_size=0.2,
            random_state=RANDOM_STATE,
            stratify=data['y_train'] if task_type == 'multiclass' else None,
        )
        X_test, y_test = data["X_test"], data["y_test"]

        dt_cls = (
            DecisionTreeClassifier
            if task_type != "regression"
            else DecisionTreeRegressor
        )

        # Default decision tree
        model = dt_cls(random_state=RANDOM_STATE)
        model.fit(X_train, y_train)
        pred_fn = get_pred_fn(model, task_type)

        # Scores are stored as plain floats so json.dump below never has to
        # deal with NumPy scalar types.
        default_tree_scores.append(
            float(metric_func_by_task[task_type](y_test, pred_fn(X_test)))
        )

        # Tuned decision tree
        params = tune(
            dt_cls, tree_search_space, X_train, y_train, X_val, y_val, task_type
        )

        model = dt_cls(random_state=RANDOM_STATE, **params)
        model.fit(X_train, y_train)
        pred_fn = get_pred_fn(model, task_type)
        tuned_tree_scores.append(
            float(metric_func_by_task[task_type](y_test, pred_fn(X_test)))
        )
        print(f'Default Tree score: {default_tree_scores[-1]:.4f}' )
        print(f'Tuned Tree score: {tuned_tree_scores[-1]:.4f}' )

    # Reference scores of the TabArena baselines for this dataset.
    baseline_errors = {
        method: df_filtered[
            (df_filtered["dataset"] == dataset_name) &
            (df_filtered["method"] == method)
        ]["metric_error"].tolist()
        for method in tabarena_baselines
    }

    results_path = f"results/baseline_scores.{dataset_name}.json"

    with open(results_path, "w") as f:
        json.dump(
            {
                'CART (tuned)': tuned_tree_scores,
                'CART (default)': default_tree_scores,
            }
            | baseline_errors,
            f,
            indent=4,
        )




Running for Fitness_Club
Beginning repeat 0
Inferred task type: binary


[I 2025-09-23 20:11:05,318] A new study created in memory with name: no-name-2e35a93a-febc-4352-b7d2-aabfbd2c9997
Best trial: 49. Best value: 0.822385: 100%|██████████| 200/200 [00:05<00:00, 39.72it/s]


Default Tree score: 0.6155
Tuned Tree score: 0.7949
Beginning repeat 1
Inferred task type: binary


Best trial: 21. Best value: 0.820499: 100%|██████████| 200/200 [00:04<00:00, 40.21it/s]


Default Tree score: 0.5921
Tuned Tree score: 0.8121
Beginning repeat 2
Inferred task type: binary


Best trial: 35. Best value: 0.805556: 100%|██████████| 200/200 [00:04<00:00, 43.61it/s]


Default Tree score: 0.6204
Tuned Tree score: 0.7672
Beginning repeat 3
Inferred task type: binary


Best trial: 12. Best value: 0.845362: 100%|██████████| 200/200 [00:05<00:00, 37.78it/s]


Default Tree score: 0.6127
Tuned Tree score: 0.8231
Beginning repeat 4
Inferred task type: binary


Best trial: 134. Best value: 0.812624: 100%|██████████| 200/200 [00:05<00:00, 37.34it/s]


Default Tree score: 0.6337
Tuned Tree score: 0.8030



Running for hazelnut-spread-contaminant-detection
Beginning repeat 0
Inferred task type: binary


Best trial: 105. Best value: 0.893601: 100%|██████████| 200/200 [00:11<00:00, 17.79it/s]


Default Tree score: 0.7975
Tuned Tree score: 0.8738
Beginning repeat 1
Inferred task type: binary


Best trial: 2. Best value: 0.914689: 100%|██████████| 200/200 [00:11<00:00, 17.83it/s]


Default Tree score: 0.8087
Tuned Tree score: 0.8957
Beginning repeat 2
Inferred task type: binary


Best trial: 2. Best value: 0.891565: 100%|██████████| 200/200 [00:10<00:00, 18.77it/s]


Default Tree score: 0.8300
Tuned Tree score: 0.8883
Beginning repeat 3
Inferred task type: binary


Best trial: 10. Best value: 0.881207: 100%|██████████| 200/200 [00:10<00:00, 18.45it/s]


Default Tree score: 0.8125
Tuned Tree score: 0.9057
Beginning repeat 4
Inferred task type: binary


Best trial: 64. Best value: 0.877976: 100%|██████████| 200/200 [00:11<00:00, 16.69it/s]


Default Tree score: 0.8387
Tuned Tree score: 0.9095



Running for healthcare_insurance_expenses
Beginning repeat 0
Inferred task type: regression


Best trial: 142. Best value: -5710.29: 100%|██████████| 200/200 [00:04<00:00, 45.55it/s]


Default Tree score: 6459.0024
Tuned Tree score: 4237.3702
Beginning repeat 1
Inferred task type: regression


Best trial: 78. Best value: -5368.88: 100%|██████████| 200/200 [00:04<00:00, 41.42it/s]


Default Tree score: 6005.9153
Tuned Tree score: 4502.4752
Beginning repeat 2
Inferred task type: regression


Best trial: 187. Best value: -4919.23: 100%|██████████| 200/200 [00:04<00:00, 42.73it/s]


Default Tree score: 6963.9839
Tuned Tree score: 4706.8975
Beginning repeat 3
Inferred task type: regression


Best trial: 22. Best value: -4436.9: 100%|██████████| 200/200 [00:04<00:00, 45.76it/s]


Default Tree score: 5919.5303
Tuned Tree score: 4505.4430
Beginning repeat 4
Inferred task type: regression


Best trial: 34. Best value: -4219.69: 100%|██████████| 200/200 [00:04<00:00, 46.68it/s]


Default Tree score: 6556.1025
Tuned Tree score: 4519.1239



Running for Is-this-a-good-customer
Beginning repeat 0
Inferred task type: binary


Best trial: 133. Best value: 0.770035: 100%|██████████| 200/200 [00:05<00:00, 39.47it/s]


Default Tree score: 0.5372
Tuned Tree score: 0.7002
Beginning repeat 1
Inferred task type: binary


Best trial: 135. Best value: 0.770651: 100%|██████████| 200/200 [00:04<00:00, 41.71it/s]


Default Tree score: 0.5356
Tuned Tree score: 0.6938
Beginning repeat 2
Inferred task type: binary


Best trial: 167. Best value: 0.710583: 100%|██████████| 200/200 [00:04<00:00, 44.04it/s]


Default Tree score: 0.5642
Tuned Tree score: 0.6919
Beginning repeat 3
Inferred task type: binary


Best trial: 14. Best value: 0.739348: 100%|██████████| 200/200 [00:05<00:00, 38.11it/s]


Default Tree score: 0.5381
Tuned Tree score: 0.6892
Beginning repeat 4
Inferred task type: binary


Best trial: 89. Best value: 0.74227: 100%|██████████| 200/200 [00:04<00:00, 40.26it/s]


Default Tree score: 0.5705
Tuned Tree score: 0.7159



Running for Marketing_Campaign
Beginning repeat 0




Inferred task type: binary


Best trial: 35. Best value: 0.822091: 100%|██████████| 200/200 [00:06<00:00, 30.39it/s]


Default Tree score: 0.6668
Tuned Tree score: 0.8280
Beginning repeat 1
Inferred task type: binary


Best trial: 118. Best value: 0.840608: 100%|██████████| 200/200 [00:05<00:00, 33.83it/s]


Default Tree score: 0.6397
Tuned Tree score: 0.8017
Beginning repeat 2
Inferred task type: binary


Best trial: 71. Best value: 0.865484: 100%|██████████| 200/200 [00:06<00:00, 29.96it/s]


Default Tree score: 0.6907
Tuned Tree score: 0.7581
Beginning repeat 3
Inferred task type: binary


Best trial: 103. Best value: 0.804992: 100%|██████████| 200/200 [00:06<00:00, 30.65it/s]


Default Tree score: 0.6910
Tuned Tree score: 0.8091
Beginning repeat 4
Inferred task type: binary


Best trial: 2. Best value: 0.829008: 100%|██████████| 200/200 [00:05<00:00, 33.44it/s]


Default Tree score: 0.6695
Tuned Tree score: 0.7977



Running for maternal_health_risk
Beginning repeat 0
Inferred task type: multiclass


Best trial: 6. Best value: -0.580748: 100%|██████████| 200/200 [00:04<00:00, 47.35it/s]


Default Tree score: 4.4932
Tuned Tree score: 0.7766
Beginning repeat 1
Inferred task type: multiclass


Best trial: 197. Best value: -0.590829: 100%|██████████| 200/200 [00:04<00:00, 47.77it/s]


Default Tree score: 4.4031
Tuned Tree score: 1.0981
Beginning repeat 2
Inferred task type: multiclass


Best trial: 114. Best value: -0.644587: 100%|██████████| 200/200 [00:04<00:00, 40.73it/s]


Default Tree score: 4.7034
Tuned Tree score: 0.7582
Beginning repeat 3
Inferred task type: multiclass


Best trial: 158. Best value: -0.589546: 100%|██████████| 200/200 [00:04<00:00, 45.07it/s]


Default Tree score: 7.2904
Tuned Tree score: 0.8266
Beginning repeat 4
Inferred task type: multiclass


Best trial: 196. Best value: -0.728742: 100%|██████████| 200/200 [00:04<00:00, 40.54it/s]


Default Tree score: 6.2996
Tuned Tree score: 0.7859



Running for qsar-biodeg
Beginning repeat 0




Inferred task type: binary


Best trial: 81. Best value: 0.906686: 100%|██████████| 200/200 [00:05<00:00, 36.33it/s]


Default Tree score: 0.7907
Tuned Tree score: 0.8293
Beginning repeat 1
Inferred task type: binary


Best trial: 28. Best value: 0.836941: 100%|██████████| 200/200 [00:05<00:00, 37.79it/s]


Default Tree score: 0.7865
Tuned Tree score: 0.8771
Beginning repeat 2
Inferred task type: binary


Best trial: 56. Best value: 0.87013: 100%|██████████| 200/200 [00:05<00:00, 35.07it/s]


Default Tree score: 0.7566
Tuned Tree score: 0.8221
Beginning repeat 3
Inferred task type: binary


Best trial: 177. Best value: 0.852152: 100%|██████████| 200/200 [00:05<00:00, 37.36it/s]


Default Tree score: 0.7988
Tuned Tree score: 0.8617
Beginning repeat 4
Inferred task type: binary


Best trial: 129. Best value: 0.896185: 100%|██████████| 200/200 [00:05<00:00, 37.44it/s]


Default Tree score: 0.7716
Tuned Tree score: 0.8706



Running for QSAR_fish_toxicity
Beginning repeat 0
Inferred task type: regression


Best trial: 28. Best value: -0.9978: 100%|██████████| 200/200 [00:04<00:00, 46.77it/s] 


Default Tree score: 1.3428
Tuned Tree score: 1.0760
Beginning repeat 1
Inferred task type: regression


Best trial: 149. Best value: -0.869975: 100%|██████████| 200/200 [00:04<00:00, 40.47it/s]


Default Tree score: 1.3149
Tuned Tree score: 0.9920
Beginning repeat 2
Inferred task type: regression


Best trial: 24. Best value: -0.772139: 100%|██████████| 200/200 [00:04<00:00, 41.21it/s]


Default Tree score: 1.3057
Tuned Tree score: 1.0459
Beginning repeat 3
Inferred task type: regression


Best trial: 36. Best value: -0.944208: 100%|██████████| 200/200 [00:04<00:00, 41.91it/s]


Default Tree score: 1.2574
Tuned Tree score: 1.0000
Beginning repeat 4
Inferred task type: regression


Best trial: 17. Best value: -0.986348: 100%|██████████| 200/200 [00:04<00:00, 48.38it/s]


Default Tree score: 1.2760
Tuned Tree score: 1.1026



Running for website_phishing
Beginning repeat 0
Inferred task type: multiclass


Best trial: 46. Best value: -0.3175: 100%|██████████| 200/200 [00:04<00:00, 41.34it/s] 


Default Tree score: 3.3068
Tuned Tree score: 0.5010
Beginning repeat 1
Inferred task type: multiclass


Best trial: 165. Best value: -0.285883: 100%|██████████| 200/200 [00:04<00:00, 41.44it/s]


Default Tree score: 3.4757
Tuned Tree score: 0.8606
Beginning repeat 2
Inferred task type: multiclass


Best trial: 36. Best value: -0.478194: 100%|██████████| 200/200 [00:04<00:00, 42.55it/s]


Default Tree score: 3.9436
Tuned Tree score: 0.9088
Beginning repeat 3
Inferred task type: multiclass


Best trial: 107. Best value: -0.264111: 100%|██████████| 200/200 [00:04<00:00, 41.86it/s]


Default Tree score: 4.2707
Tuned Tree score: 0.5931
Beginning repeat 4
Inferred task type: multiclass


Best trial: 15. Best value: -0.506636: 100%|██████████| 200/200 [00:04<00:00, 41.82it/s]


Default Tree score: 4.1170
Tuned Tree score: 0.5041



Running for MIC
Beginning repeat 0




Inferred task type: multiclass


Best trial: 21. Best value: -0.597862: 100%|██████████| 200/200 [00:06<00:00, 28.94it/s]


Default Tree score: 7.5011
Tuned Tree score: 0.5233
Beginning repeat 1
Inferred task type: multiclass


Best trial: 0. Best value: -0.568099: 100%|██████████| 200/200 [00:06<00:00, 29.51it/s]


Default Tree score: 6.6112
Tuned Tree score: 0.5395
Beginning repeat 2
Inferred task type: multiclass


Best trial: 15. Best value: -0.529511: 100%|██████████| 200/200 [00:06<00:00, 29.42it/s]


Default Tree score: 6.1026
Tuned Tree score: 0.5088
Beginning repeat 3
Inferred task type: multiclass


Best trial: 21. Best value: -0.461798: 100%|██████████| 200/200 [00:06<00:00, 29.48it/s]


Default Tree score: 7.0562
Tuned Tree score: 0.5742
Beginning repeat 4
Inferred task type: multiclass


Best trial: 21. Best value: -0.430744: 100%|██████████| 200/200 [00:06<00:00, 28.90it/s]


Default Tree score: 7.1833
Tuned Tree score: 0.5433
