# Benchmark Results Summary

This notebook consolidates all benchmark results into three tables:
1. **Debias Methods** - Selection bias correction methods
2. **PU Methods** - Positive-Unlabeled learning methods
3. **Debias+PU Methods** - Combined methods (placeholder)

In [None]:
import pandas as pd
import yaml
from pathlib import Path
from IPython.display import display

In [None]:
# Model groups: each list selects the rows of one results table.

# Selection bias correction methods
DEBIAS_MODELS = [
    "naive",
    "ips",
    "dr",
    "mtips",
    "mtdr",
    "sdr",
    "sdr2",
    "ome_ips",
    "ome_dr",
    "co_teaching",
    "cvib",
    "codis",
    "kmeidtm",
    "labelwave",
    "eps_softmax",
    "robust_dividemix",
]

# Positive-Unlabeled learning methods
PU_MODELS = [
    "bpr",
    "ubpr",
    "cubpr",
    "nnpu",
    "upu",
    "pu_naive",
    "uprl",
    "wmf",
    "rmf",
    "ncrmf",
]

# Combined Debias+PU methods: intentionally empty until such models exist
DEBIAS_PU_MODELS = []

# Dataset names and metric names; every (dataset, metric) pair becomes
# one table column named "<dataset>_<metric>"
DATASETS = ["hs", "saferlhf", "ufb"]
METRICS = ["AUROC", "NLL", "NDCG", "Recall"]

# Root of the cached results, relative to the current working directory
RESULTS_DIR = Path("../results/cache")

In [None]:
def load_all_results(cache_dir: Path) -> dict:
    """
    Load every ``<model>/<dataset>/performance.yaml`` file under ``cache_dir``.

    The model and dataset names are taken from the two directory levels
    directly above each file. Dataset directories whose name contains
    ``debug`` are skipped. Files that cannot be read or parsed, and files
    whose top-level YAML value is not a mapping (for example an empty file,
    which parses to ``None``), are reported with a printed warning and left
    out of the result instead of being stored.

    Args:
        cache_dir: Directory that contains one sub-directory per model.

    Returns:
        dict: {(model_name, dataset_name): {metric: value, ...}}
    """
    results = {}

    # Sorted so the order of warnings and of the returned keys does not
    # depend on the order in which the filesystem enumerates entries.
    for perf_file in sorted(cache_dir.glob('*/*/performance.yaml')):
        dataset = perf_file.parent.name

        # Skip debug directories
        if 'debug' in dataset:
            continue

        model = perf_file.parent.parent.name

        # Only the read/parse step is guarded; a broken file must not stop
        # the remaining files from loading.
        try:
            with open(perf_file, encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
            print(f"Warning: Failed to load {perf_file}: {e}")
            continue

        # Consumers look metrics up with ``.get`` on each entry, so anything
        # that is not a mapping (empty file -> None, bare list/scalar) would
        # only fail later with a less helpful error.
        if not isinstance(data, dict):
            print(
                f"Warning: Skipping {perf_file}: expected a mapping of "
                f"metrics, got {type(data).__name__}"
            )
            continue

        results[(model, dataset)] = data

    return results


def build_results_table(results: dict, models: list, datasets: list, metrics: list) -> pd.DataFrame:
    """
    Build a results table for a specific set of models.

    Each cell holds the value stored under ``"<metric> on test"`` for the
    ``(model, dataset)`` entry of ``results``, rounded to 4 decimal places.
    Cells are left empty (``None``/NaN) when the entry is missing, is
    ``None``, or does not contain that metric.

    Args:
        results: Dictionary of results from load_all_results, keyed by
            ``(model_name, dataset_name)``
        models: List of model names to include (one row each, in this order)
        datasets: List of dataset names
        metrics: List of metric names

    Returns:
        DataFrame indexed by 'Model' with one ``<dataset>_<metric>`` column
        per dataset/metric pair. An empty ``models`` list yields an empty
        table that still has all columns.
    """
    # One column per (dataset, metric) pair, grouped by dataset.
    columns = [f"{dataset}_{metric}" for dataset in datasets for metric in metrics]

    rows = []
    for model in models:
        row = {'Model': model}
        for dataset in datasets:
            # A missing or None entry is treated as "no metrics recorded".
            entry = results.get((model, dataset)) or {}
            for metric in metrics:
                # Metrics in yaml are like "AUROC on test"
                value = entry.get(f"{metric} on test")
                row[f"{dataset}_{metric}"] = round(value, 4) if value is not None else None
        rows.append(row)

    # Naming the columns explicitly keeps 'Model' and the metric columns
    # present even when there are no rows, so set_index cannot fail.
    df = pd.DataFrame(rows, columns=['Model', *columns])
    return df.set_index('Model')


# Read every cached result once, then summarise what was found.
all_results = load_all_results(RESULTS_DIR)

# Keys are (model, dataset) pairs; collect the distinct names on each side.
models_found = {model for model, _ in all_results}
datasets_found = {dataset for _, dataset in all_results}

print(f"Loaded {len(all_results)} result files")
print(f"Models found: {models_found}")
print(f"Datasets found: {datasets_found}")

## Table 1: Debias Methods

In [None]:
debias_table = build_results_table(all_results, DEBIAS_MODELS, DATASETS, METRICS)

# Bold the best value in each column. NLL is assumed to be a negative
# log-likelihood (a loss, lower is better), so its columns are highlighted by
# minimum; every other metric column is highlighted by maximum.
nll_columns = [col for col in debias_table.columns if col.endswith('_NLL')]
score_columns = [col for col in debias_table.columns if not col.endswith('_NLL')]

debias_styler = debias_table.style.format(precision=4, na_rep='-')
if score_columns:
    debias_styler = debias_styler.highlight_max(subset=score_columns, axis=0, props='font-weight: bold')
if nll_columns:
    debias_styler = debias_styler.highlight_min(subset=nll_columns, axis=0, props='font-weight: bold')
display(debias_styler)

## Table 2: PU Methods

In [None]:
pu_table = build_results_table(all_results, PU_MODELS, DATASETS, METRICS)

# Bold the best value in each column. NLL is assumed to be a negative
# log-likelihood (a loss, lower is better), so its columns are highlighted by
# minimum; every other metric column is highlighted by maximum.
nll_columns = [col for col in pu_table.columns if col.endswith('_NLL')]
score_columns = [col for col in pu_table.columns if not col.endswith('_NLL')]

pu_styler = pu_table.style.format(precision=4, na_rep='-')
if score_columns:
    pu_styler = pu_styler.highlight_max(subset=score_columns, axis=0, props='font-weight: bold')
if nll_columns:
    pu_styler = pu_styler.highlight_min(subset=nll_columns, axis=0, props='font-weight: bold')
display(pu_styler)

## Table 3: Debias+PU Methods (Placeholder)

In [None]:
if DEBIAS_PU_MODELS:
    debias_pu_table = build_results_table(all_results, DEBIAS_PU_MODELS, DATASETS, METRICS)

    # Bold the best value in each column. NLL is assumed to be a negative
    # log-likelihood (a loss, lower is better), so its columns are highlighted
    # by minimum; every other metric column is highlighted by maximum.
    nll_columns = [col for col in debias_pu_table.columns if col.endswith('_NLL')]
    score_columns = [col for col in debias_pu_table.columns if not col.endswith('_NLL')]

    debias_pu_styler = debias_pu_table.style.format(precision=4, na_rep='-')
    if score_columns:
        debias_pu_styler = debias_pu_styler.highlight_max(subset=score_columns, axis=0, props='font-weight: bold')
    if nll_columns:
        debias_pu_styler = debias_pu_styler.highlight_min(subset=nll_columns, axis=0, props='font-weight: bold')
    display(debias_pu_styler)
else:
    print("No Debias+PU models defined yet. Add model names to DEBIAS_PU_MODELS list when available.")
    # Create empty placeholder table with the same column layout as the
    # other tables, so the optional CSV export still has something to write
    columns = [f"{d}_{m}" for d in DATASETS for m in METRICS]
    debias_pu_table = pd.DataFrame(columns=columns)
    debias_pu_table.index.name = 'Model'
    display(debias_pu_table)

## Export to CSV (Optional)

In [None]:
# Uncomment to export tables to CSV
# debias_table.to_csv('debias_results.csv')
# pu_table.to_csv('pu_results.csv')
# debias_pu_table.to_csv('debias_pu_results.csv')
# print("Tables exported to CSV files.")