# Benchmark Results Summary

This notebook consolidates all benchmark results into three tables:
1. **Debias Methods** - Selection bias correction methods
2. **PU Methods** - Positive-Unlabeled learning methods
3. **Debias+PU Methods** - Combined methods (placeholder)

In [6]:
import pandas as pd
import yaml
from pathlib import Path
from IPython.display import display

In [None]:
# Method families reported in the summary tables.
# Selection-bias correction (debias) methods.
DEBIAS_MODELS = [
    "naive",
    "ips",
    "dr",
    "mtips",
    "mtdr",
    "sdr",
    "sdr2",
    "ome_ips",
    "ome_dr",
    "co_teaching",
    "cvib",
    "codis",
    "kmeidtm",
    "labelwave",
    "eps_softmax",
    "robust_dividemix",
]

# Positive-Unlabeled (PU) learning methods.
PU_MODELS = [
    "bpr",
    "ubpr",
    "cubpr",
    "nnpu",
    "upu",
    "pu_naive",
    "uprl",
    "wmf",
    "rmf",
    "ncrmf",
]

# Combined Debias+PU methods; intentionally empty until such models exist.
DEBIAS_PU_MODELS = []

# Datasets (column groups) and metrics (columns within each group).
DATASETS = ["hs", "saferlhf", "ufb"]
METRICS = ["AUROC", "NLL", "MAE", "RMSE"]

# Root of the cached results, given relative to the working directory.
RESULTS_DIR = Path("../results/cache")

In [None]:
def load_all_results(cache_dir: Path) -> dict:
    """
    Load all ``<model>/<dataset>/performance.yaml`` files from the cache directory.

    The model and dataset names are taken from the two directory levels above
    each file. Dataset directories whose name contains ``'debug'`` are skipped.
    Files that cannot be read or parsed are reported with a warning and left
    out. The same applies to files whose top-level YAML value is not a mapping
    (e.g. an empty file, which ``yaml.safe_load`` returns as ``None``), so
    every value in the returned dict is itself a dict.

    Args:
        cache_dir: Root directory that contains one sub-directory per model.

    Returns:
        dict: {(model_name, dataset_name): {metric: value, ...}}
    """
    results = {}

    # Sorted so that the load order (and the order of any warnings) does not
    # depend on the file system's directory listing order.
    for perf_file in sorted(cache_dir.glob('*/*/performance.yaml')):
        model = perf_file.parent.parent.name
        dataset = perf_file.parent.name

        # Skip debug directories
        if 'debug' in dataset:
            continue

        try:
            # Binary mode lets PyYAML detect the encoding itself (UTF-8 unless
            # a BOM says otherwise) instead of relying on the locale default.
            with open(perf_file, 'rb') as f:
                data = yaml.safe_load(f)
        except Exception as e:
            # Deliberately broad: one unreadable file must not abort the
            # whole summary; it is reported and skipped.
            print(f"Warning: Failed to load {perf_file}: {e}")
            continue

        if not isinstance(data, dict):
            # Empty or malformed result file: storing it would break the
            # per-metric lookups done when the tables are built.
            print(
                f"Warning: Skipping {perf_file}: expected a mapping, "
                f"got {type(data).__name__}"
            )
            continue

        results[(model, dataset)] = data

    return results


def build_results_table(results: dict, models: list, datasets: list, metrics: list) -> pd.DataFrame:
    """
    Build a results table for a specific set of models with MultiIndex columns.

    Each cell holds the value stored under the ``"<metric> on test"`` key of
    ``results[(model, dataset)]``, rounded to 4 decimals. A cell is left
    missing (``None``) when the (model, dataset) pair has no entry, the entry
    is not a mapping (e.g. ``None`` from an empty result file), the metric key
    is absent, or the stored value cannot be rounded (non-numeric).

    Args:
        results: Dictionary of results from load_all_results
        models: List of model names to include
        datasets: List of dataset names
        metrics: List of metric names

    Returns:
        DataFrame with models as rows and (dataset, metric) MultiIndex columns
    """
    # Create MultiIndex columns
    columns = pd.MultiIndex.from_product([datasets, metrics], names=['Dataset', 'Metric'])

    def _cell(model, dataset, metric):
        """Return the rounded metric value for one cell, or None if unavailable."""
        # Anything without a ``get`` method (None, a list, ...) counts as "no results".
        lookup = getattr(results.get((model, dataset)), 'get', None)
        if lookup is None:
            return None
        value = lookup(f"{metric} on test", None)
        if value is None:
            return None
        try:
            return round(value, 4)
        except TypeError:
            # Non-numeric value in the results file: show it as missing
            # rather than failing the whole table.
            return None

    # Iterating the column index yields (dataset, metric) tuples in column
    # order, so every row is aligned with ``columns`` by construction.
    data = [
        [_cell(model, dataset, metric) for dataset, metric in columns]
        for model in models
    ]

    df = pd.DataFrame(data, index=models, columns=columns)
    df.index.name = 'Model'

    return df


def highlight_best(df: pd.DataFrame, lower_is_better: list = ['NLL', 'MAE', 'RMSE']):
    """
    Highlight the best value in each column.
    For metrics in lower_is_better, highlight the minimum; otherwise highlight the maximum.
    """
    def highlight_col(s):
        metric = s.name[1] if isinstance(s.name, tuple) else s.name
        if metric in lower_is_better:
            is_best = s == s.min()
        else:
            is_best = s == s.max()
        return ['font-weight: bold' if v else '' for v in is_best]
    
    # Apply styles with centered Dataset header
    styled = df.style.format(precision=4, na_rep='-').apply(highlight_col, axis=0)
    # Center the top-level (Dataset) header
    styled = styled.set_table_styles([
        {'selector': 'th.col_heading.level0', 'props': [('text-align', 'center')]},
    ])
    return styled


# Read every cached result once and report what was found.
all_results = load_all_results(cache_dir=RESULTS_DIR)
print(f"Loaded {len(all_results)} result files")
# Keys are (model, dataset) pairs; collect the distinct names of each part.
print(f"Models found: {set(model for model, _ in all_results)}")
print(f"Datasets found: {set(dataset for _, dataset in all_results)}")

## Table 1: Debias Methods

In [None]:
# Debias methods: one row per model, best value per column in bold.
debias_table = build_results_table(
    results=all_results,
    models=DEBIAS_MODELS,
    datasets=DATASETS,
    metrics=METRICS,
)
display(highlight_best(df=debias_table))

## Table 2: PU Methods

In [None]:
# PU methods: one row per model, best value per column in bold.
pu_table = build_results_table(
    results=all_results,
    models=PU_MODELS,
    datasets=DATASETS,
    metrics=METRICS,
)
display(highlight_best(df=pu_table))

## Table 3: Debias+PU Methods (Placeholder)

In [None]:
if not DEBIAS_PU_MODELS:
    print("No Debias+PU models defined yet. Add model names to DEBIAS_PU_MODELS list when available.")
    # Nothing to report yet: show an empty frame that already has the
    # (Dataset, Metric) column layout of the other tables.
    columns = pd.MultiIndex.from_product(
        [DATASETS, METRICS],
        names=['Dataset', 'Metric'],
    )
    debias_pu_table = pd.DataFrame(columns=columns)
    debias_pu_table.index.name = 'Model'
    display(debias_pu_table)
else:
    # Combined methods are available: build and style the table like the others.
    debias_pu_table = build_results_table(
        results=all_results,
        models=DEBIAS_PU_MODELS,
        datasets=DATASETS,
        metrics=METRICS,
    )
    display(highlight_best(df=debias_pu_table))