In [1]:
import json
import os
import sys
from copy import deepcopy
from pathlib import Path

import pandas as pd

# Path to the root of the repository, resolved relative to the current working
# directory (i.e. the notebook is expected to run from a direct subdirectory).
REPO_DIR = os.path.abspath('..')  # path to the root of the repository
# Must happen before `import lib`: presumably `lib` lives in the repository
# root and is only importable once that root is on sys.path -- TODO confirm.
sys.path.append(REPO_DIR)
# NOTE(review): looks like `lib` reads PROJECT_DIR to locate project
# directories -- confirm against lib's environment handling.
os.environ["PROJECT_DIR"] = REPO_DIR
import lib

# Lift pandas' display limits so result tables are rendered in full.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
# Dataset identifiers. They double as the first path component of every run
# directory under the output directory and as keys of DATASET_NAMES.
CALIFORNIA = 'california_housing'
ADULT = 'adult'
HELENA = 'helena'
JANNIS = 'jannis'
HIGGS = 'higgs_small'
ALOI = 'aloi'
EPSILON = 'epsilon'
YEAR = 'year'
COVTYPE = 'covtype'
YAHOO = 'yahoo'
MICROSOFT = 'microsoft'

# Human-readable dataset names used in the result tables. The insertion order
# of this mapping also defines the order of ALL_DATASETS.
DATASET_NAMES = {
    CALIFORNIA: 'California Housing',
    ADULT: 'Adult',
    HELENA: 'Helena',
    JANNIS: 'Jannis',
    HIGGS: 'Higgs Small',
    ALOI: 'ALOI',
    EPSILON: 'Epsilon',
    YEAR: 'Year',
    COVTYPE: 'Covertype',
    YAHOO: 'Yahoo',
    MICROSOFT: 'Microsoft',
}
ALL_DATASETS = list(DATASET_NAMES)

# Datasets whose stored task type is regression.
REGRESSION_DATASETS = [
    name
    for name in ALL_DATASETS
    if lib.load_dataset_info(name)['task_type'] == lib.REGRESSION
]

# Per-dataset descriptive columns carried through the result tables.
DETAILS = ['task_type', 'n_objects', 'n_features']
# Data splits for which scores may be reported.
PARTS = ['test', 'val', 'train']


def format_scores(df, precision):
    """Prepare the score columns of a result table for display.

    Each row is processed independently:

    * if the row's ``task_type`` equals ``lib.REGRESSION``, every present
      ``<part>_best`` / ``<part>_score`` value (for each part in ``PARTS``)
      is multiplied by -1 (presumably because regression scores are stored
      negated -- TODO confirm);
    * afterwards every ``float`` value in the row is rounded to ``precision``
      decimal places.

    Args:
        df: table with a ``task_type`` column.
        precision: number of decimals passed to ``round``.

    Returns:
        The result of ``df.apply`` over rows with the transformation above.
    """
    sign_flipped_keys = [
        f'{part}_{suffix}' for part in PARTS for suffix in ('best', 'score')
    ]

    def format_row(row):
        if row['task_type'] == lib.REGRESSION:
            for key in sign_flipped_keys:
                if key in row:
                    row[key] *= -1
        # Snapshot the items first: the row is modified while iterating.
        for name, value in list(row.items()):
            if isinstance(value, float):
                row[name] = round(value, precision)
        return row

    return df.apply(format_row, axis=1)


def load_record(output):
    """Build one result-table record from a run's output directory.

    The directory is expected to be located under ``lib.env.OUTPUT_DIR`` as
    ``<dataset>/<algorithm>/<experiment>/<suffix>`` and to contain a
    ``stats.json`` file with a ``dataset`` entry and a ``metrics`` entry.

    Args:
        output: path (str or Path) of the run's output directory.

    Returns:
        ``None`` if the directory does not exist, if ``stats.json`` is missing
        (a warning is printed in that case) or if the stats have no
        ``metrics`` entry. Otherwise a dict with the keys ``dataset``,
        ``task_type``, ``n_objects``, ``n_features``, ``algorithm``, ``s`` and
        one ``<part>_score`` entry for every part of ``PARTS`` present in the
        metrics.

    Raises:
        ValueError: if ``output`` is not inside ``lib.env.OUTPUT_DIR`` or has
            fewer than four path components below it.
        KeyError: if the dataset directory name is not in ``DATASET_NAMES``.
    """
    output = Path(output)
    if not output.exists():
        return None
    path = output / 'stats.json'
    if not path.exists():
        print(f'WARNING! This path does not exist: {path}')
        return None
    stats = lib.load_json(path)
    metrics = stats.get('metrics')
    if metrics is None:
        return None

    # Dataset metadata is looked up by the dataset name recorded in the stats.
    info = lib.load_dataset_info(Path(stats['dataset']).name)

    # Split the location into exactly four pieces. maxsplit must be 3 (not 4):
    # a larger value can produce five pieces and break the unpacking for runs
    # nested one level deeper; any extra levels now stay in `suffix`.
    # as_posix() guarantees '/' separators regardless of the platform.
    relative_location = output.relative_to(lib.env.OUTPUT_DIR).as_posix()
    dataset, algorithm, experiment, suffix = relative_location.split('/', 3)

    r = {
        'dataset': DATASET_NAMES[dataset],
        'task_type': info['task_type'],
        'n_objects': info['size'],
        'n_features': info['n_num_features'] + info['n_cat_features'],
        'algorithm': algorithm + f' | {experiment}',
        's': suffix
    }
    for x in PARTS:
        if x in metrics:
            r[f'{x}_score'] = metrics[x]['score']
    return r


def sort(df, by):
    """Order a result table by dataset size and then by the given columns.

    Args:
        df: table with an ``n_objects`` column and the columns named in ``by``.
        by: a column name or a list of column names.

    Returns:
        A new DataFrame sorted by ``n_objects`` ascending, then by each column
        of ``by``: descending when the column name contains ``'score'``,
        ascending otherwise. The index is reset to 0..n-1.
    """
    extra_keys = [by] if isinstance(by, str) else by
    sort_columns = ['n_objects'] + extra_keys
    directions = [True] + ['score' not in key for key in extra_keys]
    ordered = df.sort_values(sort_columns, ascending=directions)
    return ordered.reset_index(drop=True)


def make_df(outputs_info):
    """Assemble the per-run result table.

    Args:
        outputs_info: iterable of ``(output, algorithm_name)`` pairs, where
            ``output`` is a run directory accepted by ``load_record`` and
            ``algorithm_name`` is either ``None`` or a label that replaces the
            record's default ``algorithm`` value.

    Returns:
        A DataFrame with one row per run for which ``load_record`` returned a
        non-empty record. Missing values are replaced by 0.0 and the rows are
        ordered with ``sort(..., 'val_score')``.
    """
    records = []
    for output, algorithm_name in outputs_info:
        record = load_record(output)
        if record:
            if algorithm_name is not None:
                record['algorithm'] = algorithm_name
            records.append(record)
    table = pd.DataFrame(records).fillna(0.0)
    return sort(table, 'val_score').reset_index(drop=True)


def collect_outputs(experiment_dir, filter_info=None):
    """Return the run sub-directories of an experiment directory, ordered by name.

    Args:
        experiment_dir: a ``Path`` is used as is; any other value (e.g. a str)
            is joined onto ``lib.env.OUTPUT_DIR``.
        filter_info: selects which sub-directories are kept:

            * ``None`` -- keep all of them;
            * ``int`` N -- keep those named ``'0'`` ... ``'N-1'``;
            * ``list`` of ``str`` -- keep those whose name is in the list;
            * callable -- predicate called with each sub-directory ``Path``.

    Returns:
        A list of ``Path`` objects sorted by directory name (plain string
        order, so ``'10'`` comes before ``'2'``). The list is empty when the
        experiment directory does not exist.
    """
    if isinstance(filter_info, int):
        filter_info = [str(x) for x in range(filter_info)]
    if isinstance(filter_info, list):
        assert all(isinstance(x, str) for x in filter_info)
        allowed_names = frozenset(filter_info)

        def filter_fn(path):
            return path.name in allowed_names
    elif callable(filter_info):
        filter_fn = filter_info
    else:
        assert filter_info is None

        def filter_fn(path):
            return True

    if not isinstance(experiment_dir, Path):
        experiment_dir = lib.env.OUTPUT_DIR / experiment_dir
    if not experiment_dir.exists():
        return []

    # iterdir() yields entries in arbitrary, filesystem-dependent order; sort
    # by name so that the collected runs come back in a reproducible order.
    subdirectories = sorted(
        (entry for entry in experiment_dir.iterdir() if entry.is_dir()),
        key=lambda entry: entry.name,
    )
    return [entry for entry in subdirectories if filter_fn(entry)]


def aggregate(df):
    """Collapse per-run rows into one row per (dataset, algorithm) pair.

    Args:
        df: per-run table with the columns ``dataset``, ``algorithm``,
            ``task_type``, ``n_objects``, ``n_features``, ``test_score`` and
            ``val_score``; ``train_score`` is optional.

    Returns:
        A DataFrame with ``dataset`` and ``algorithm`` as regular columns,
        the first value of each descriptive column, the mean (``*_score``)
        and standard deviation (``*_std``) of the test and validation scores,
        the number of non-missing test scores (``count``) and, when
        ``train_score`` is present in ``df``, the same statistics for it.
        Missing values (e.g. the std of a single run) are replaced by 0.0.
    """
    named_aggregations = {
        column: (column, 'first')
        for column in ('task_type', 'n_objects', 'n_features')
    }
    for part in ('test', 'val'):
        named_aggregations[f'{part}_score'] = (f'{part}_score', 'mean')
        named_aggregations[f'{part}_std'] = (f'{part}_score', 'std')
    named_aggregations['count'] = ('test_score', 'count')
    if 'train_score' in df.columns:
        named_aggregations['train_score'] = ('train_score', 'mean')
        named_aggregations['train_std'] = ('train_score', 'std')

    summary = df.groupby(['dataset', 'algorithm']).agg(**named_aggregations)
    summary['count'] = summary['count'].astype(int)
    return summary.reset_index().fillna(0.0)

In [3]:
# Names of the ensemble output directories to keep; passed to collect_outputs
# as a list filter. Presumably '0_4' denotes an ensemble of the runs 0..4 --
# TODO confirm.
ENSEMBLES_3_5 = ['0_4', '5_9', '10_14']
# Number of single-model runs to keep: collect_outputs turns an int N into the
# directory names '0' ... 'N-1'.
N_SEEDS = 15
all_datasets = deepcopy(ALL_DATASETS)
# Dataset subset referenced by the (commented-out) ablation experiments.
datasets_for_ablations = set(all_datasets) - {ADULT, HIGGS, EPSILON, YAHOO}
# True: label rows with the explicit algorithm names from the experiment list;
# False: keep the default '<algorithm> | <experiment>' labels of load_record.
pretty = True
# Which kinds of runs to collect. Alternative settings are kept commented out
# for quick switching.
# single_models, ensembles = True, False
# single_models, ensembles = False, True
single_models, ensembles = True, True

# Collect (output directory, display name) pairs for every selected run.
# Each entry of the experiment list below is a tuple:
#   (experiment path relative to the dataset directory,
#    filter for single-model runs,
#    filter for ensemble runs,
#    display name of the algorithm,
#    datasets the experiment is collected for)
outputs_info = []
for experiment, output_filter, ensemble_output_filter, algorithm_name, datasets in [
    # Baseline NNs
    # MLP results for "Adult" are missing because of the issue: https://github.com/yandex-research/rtdl/issues/2
    ('mlp/tuned', N_SEEDS, ENSEMBLES_3_5, 'MLP', set(all_datasets) - {ADULT}),
    # !!! TUTORIAL !!! Uncomment the following line for completing the tutorial:
    # ('mlp/tuned_reproduced', N_SEEDS, ENSEMBLES_3_5, 'MLP | reproduced', [CALIFORNIA]),
    ('resnet/tuned', N_SEEDS, ENSEMBLES_3_5, 'ResNet', all_datasets),
    ('snn/tuned', N_SEEDS, ENSEMBLES_3_5, 'SNN', all_datasets),
    # NODE results for "Adult" are missing because of the issue: https://github.com/yandex-research/rtdl/issues/2
    ('node/tuned', N_SEEDS, ENSEMBLES_3_5, 'NODE', set(all_datasets) - {ADULT, HELENA, ALOI}),
    # For Helena and ALOI the 'node/default' experiment is used under the same 'NODE' label.
    ('node/default', N_SEEDS, ENSEMBLES_3_5, 'NODE', {HELENA, ALOI}),
    ('autoint/tuned', N_SEEDS, ENSEMBLES_3_5, 'AutoInt', all_datasets),
    ('dcn2/tuned', N_SEEDS, ENSEMBLES_3_5, 'DCN V2', all_datasets),
    ('tabnet/tuned', N_SEEDS, ENSEMBLES_3_5, 'TabNet', all_datasets),
    ('grownet/tuned', N_SEEDS, ENSEMBLES_3_5, 'GrowNet', all_datasets),

    # FT-Transformer
    ('ft_transformer/tuned', N_SEEDS, ENSEMBLES_3_5, 'FT-Transformer', all_datasets),
    ('ft_transformer/default', N_SEEDS, ENSEMBLES_3_5, 'FT-Transformer | default', all_datasets),

    # GBDTs
    ('catboost/tuned', N_SEEDS, ENSEMBLES_3_5, 'CatBoost', all_datasets),
    ('xgboost/tuned', N_SEEDS, ENSEMBLES_3_5, 'XGBoost', all_datasets),
    ('lightgbm_/tuned', N_SEEDS, ENSEMBLES_3_5, 'LightGBM', all_datasets),
    ('catboost/default', N_SEEDS, ENSEMBLES_3_5, 'CatBoost | default', all_datasets),
    ('xgboost/default', N_SEEDS, ENSEMBLES_3_5, 'XGBoost | default', all_datasets),

    # Ablation (if you want to uncomment the following experiments, then comment all experiments above)
#     ('autoint/tuned', N_SEEDS, [], 'AutoInt', datasets_for_ablations),
#     ('ft_transformer/tuned_nobias', N_SEEDS, [], 'FT-Transformer | no bias', datasets_for_ablations),
#     ('ft_transformer/tuned', N_SEEDS, [], 'FT-Transformer', datasets_for_ablations),
]:
    # With `pretty` enabled, ensembles get an '(ensemble) ' prefix; otherwise
    # both names are None and make_df keeps load_record's default label.
    if pretty and algorithm_name is not None:
        ensemble_algorithm_name = '(ensemble) ' + algorithm_name
    else:
        algorithm_name = None
        ensemble_algorithm_name = None
    for dataset in datasets:
        if single_models:
            for output in collect_outputs(dataset + '/' + experiment, output_filter):
                outputs_info.append((output, algorithm_name))
        if ensembles:
            # Ensemble runs live next to the experiment, in '<experiment>_ensemble'.
            for output in collect_outputs(dataset + '/' + experiment + '_ensemble', ensemble_output_filter):
                # The prefixed name is only needed to tell ensembles apart from
                # single models when both are shown in the same table.
                outputs_info.append((output, ensemble_algorithm_name if single_models else algorithm_name))

# Build the per-run table, collapse it to one row per (dataset, algorithm),
# order the rows by dataset size and test score, and round scores to 4
# decimals for display.
DF = make_df(outputs_info)
DF = aggregate(DF)
DF = sort(DF, 'test_score')
DF = format_scores(DF, 4)
# Alternative, more compact index without the dataset details:
# DF = DF.set_index(['dataset', 'algorithm']).drop(columns=DETAILS)
DF = DF.set_index(['dataset'] + DETAILS + ['algorithm'])
# aggregate() only produces the train_* columns when at least one run reported
# a train score, so display only the columns that actually exist instead of
# failing with a KeyError.
DF[[
    column
    for column in ['test_score', 'test_std', 'val_score', 'val_std', 'train_score', 'train_std', 'count']
    if column in DF.columns
]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,test_score,test_std,val_score,val_std,train_score,train_std,count
dataset,task_type,n_objects,n_features,algorithm,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
California Housing,regression,20640,8,(ensemble) CatBoost,0.4234,0.0011,0.4371,0.0012,0.0748,0.0033,3
California Housing,regression,20640,8,(ensemble) LightGBM,0.4268,0.0015,0.4418,0.0008,0.1343,0.0041,3
California Housing,regression,20640,8,(ensemble) CatBoost | default,0.4281,0.0001,0.4447,0.0006,0.29,0.0008,3
California Housing,regression,20640,8,CatBoost | default,0.4303,0.0008,0.4468,0.0013,0.2918,0.0007,15
California Housing,regression,20640,8,CatBoost,0.4308,0.0016,0.4445,0.0023,0.0778,0.0149,15
California Housing,regression,20640,8,(ensemble) XGBoost,0.4314,0.0004,0.4457,0.0005,0.1501,0.006,3
California Housing,regression,20640,8,LightGBM,0.4322,0.0036,0.4471,0.0036,0.1375,0.0213,15
California Housing,regression,20640,8,XGBoost,0.4334,0.0017,0.4478,0.0023,0.1512,0.0153,15
California Housing,regression,20640,8,(ensemble) FT-Transformer,0.4497,0.0004,0.4576,0.0024,0.3626,0.0034,3
California Housing,regression,20640,8,(ensemble) FT-Transformer | default,0.4551,0.0031,0.4611,0.0051,0.3576,0.0144,3
