# Optuna Results Summary

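This notebook summarizes Optuna-tuned results read from `results/tune/<run>/best/*/best_*.json`, where `<run>` is any run folder directly under `results/tune` whose name starts with one of the prefixes in `RUN_PREFIXES`.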

It builds tables for:
1. **Debias Methods**
2. **PU Methods**
3. **Debias+PU Methods**

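Metrics shown (test): **MAE**, **RMSE**, **R2**, **AUC**. Within each table, the best value in every dataset/metric column is shown in bold (lowest for MAE and RMSE, highest for R2 and AUC); `-` marks a missing result.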

In [6]:
import json
import warnings
from pathlib import Path

import pandas as pd
from IPython.display import display

In [12]:
# --- Model groups: one results table is rendered per group ---
# NOTE(review): the recorded run output lists a 'pu_naive' model that belongs
# to none of these groups, so it never shows up in a table -- confirm whether
# it should be added to one of them.
DEBIAS_MODELS = ['naive', 'ips', 'dr', 'mtips', 'mtdr', 'sdr2']

PU_MODELS = [
    'bpr',
    'ubpr',
    'cubpr',
    'nnpu',
    'upu',
    'uprl',
    'rmf',
    'ncrmf',
]

DEBIAS_PU_MODELS = ['counterif', 'recrec_i', 'recrec_f']

# --- Datasets and metrics ---
DATASETS = ['hs', 'saferlhf', 'ufb']

# Short metric label (table column) -> key used inside the result JSON files.
METRIC_KEY_MAP = {
    'MAE': 'MAE on test',
    'RMSE': 'RMSE on test',
    'R2': 'R2 on test',
    'AUC': 'AUROC on test',
}
# Column order of the tables follows the mapping's insertion order.
METRICS = list(METRIC_KEY_MAP)

# --- Input location ---
# Root folder of the tuning runs, given relative to this notebook.
TUNE_DIR = Path('../results/tune')

# A run folder is read only if its name starts with one of these prefixes
# (the default matches the folders written by the Optuna tuner).
RUN_PREFIXES = ['benchmarks_optuna']

In [13]:
def _iter_best_json_files(tune_dir: Path, *, run_prefixes: list[str] | None = None):
    if not tune_dir.exists():
        return
    for run_dir in tune_dir.iterdir():
        if not run_dir.is_dir():
            continue
        if run_prefixes and not any(run_dir.name.startswith(p) for p in run_prefixes):
            continue
        yield from run_dir.glob('best/*/best_*.json')


def _score_record(rec: dict) -> tuple[float, float, float, float]:
    m = rec.get('metrics', {}) or {}
    rmse = float(m.get('RMSE on test', float('inf')))
    mae = float(m.get('MAE on test', float('inf')))
    r2 = float(m.get('R2 on test', float('-inf')))
    auc = float(m.get('AUROC on test', float('-inf')))
    return (rmse, mae, -r2, -auc)


def load_optuna_best_results(tune_dir: Path, *, run_prefixes: list[str] | None = None) -> dict:
    """Return ``{(model, dataset, alpha): record}`` built from the ``best_*`` JSON files.

    Every file yielded by ``_iter_best_json_files`` is parsed. When several
    files describe the same ``(model, data_name, alpha)`` combination, the one
    with the smallest ``_score_record`` key is kept; on a tie the file seen
    first wins.

    Files are skipped (never fatal) when they cannot be read or parsed, when
    the top-level JSON value is not an object, when ``model`` or ``data_name``
    is empty, or when ``alpha`` is missing, not numeric or NaN. Unreadable
    files and non-object payloads additionally emit a warning so that broken
    results do not disappear silently.

    Args:
        tune_dir: Root directory containing the tuning run folders.
        run_prefixes: Optional run-folder name prefixes, forwarded to
            ``_iter_best_json_files``.

    Returns:
        Mapping from ``(model, data_name, alpha)`` to a record with the keys
        ``'model'``, ``'data_name'``, ``'alpha'``, ``'metrics'``,
        ``'best_params'`` and ``'path'``.
    """
    best: dict[tuple[str, str, float], dict] = {}
    for path in _iter_best_json_files(tune_dir, run_prefixes=run_prefixes):
        try:
            payload = json.loads(path.read_text(encoding='utf-8'))
        except (OSError, ValueError) as exc:
            # ValueError covers both invalid JSON and undecodable bytes.
            warnings.warn(f"Skipping unreadable result file {path}: {exc}", stacklevel=2)
            continue
        if not isinstance(payload, dict):
            # e.g. a top-level JSON list: there are no fields to read.
            warnings.warn(f"Skipping {path}: top-level JSON value is not an object", stacklevel=2)
            continue

        model = str(payload.get('model', '')).strip()
        data_name = str(payload.get('data_name', '')).strip()
        if not model or not data_name:
            continue

        try:
            alpha = float(payload.get('alpha'))
        except (TypeError, ValueError):
            continue
        if alpha != alpha:
            # A NaN alpha never compares equal to anything, so such a key
            # could not be looked up again.
            continue

        metrics = payload.get('best_metrics')
        if not isinstance(metrics, dict):
            metrics = {}

        rec = {
            'model': model,
            'data_name': data_name,
            'alpha': alpha,
            'metrics': metrics,
            'best_params': payload.get('best_params', {}) or {},
            'path': str(path),
        }

        key = (model, data_name, alpha)
        if key not in best or _score_record(rec) < _score_record(best[key]):
            best[key] = rec

    return best


def build_results_table(best: dict, models: list[str], datasets: list[str], metrics: list[str], *, alpha: float) -> pd.DataFrame:
    """Arrange the best records of one alpha into a model-by-(dataset, metric) table.

    Args:
        best: Mapping ``(model, dataset, alpha) -> record``; each record may
            carry a ``'metrics'`` mapping.
        models: Row labels, in display order.
        datasets: First level of the column index.
        metrics: Second level of the column index; each entry must be a key of
            ``METRIC_KEY_MAP``.
        alpha: Alpha value of the records to show (converted with ``float``
            for the lookup).

    Returns:
        DataFrame indexed by model (index name ``'Model'``) with a
        ``('Dataset', 'Metric')`` column MultiIndex. Numeric values are rounded
        to 4 decimals; missing or non-numeric values are ``None``.
    """

    def _cell(metric_values: dict, metric_name: str):
        # Translate the short column label into the key used by the JSON files.
        raw = metric_values.get(METRIC_KEY_MAP[metric_name], None)
        return round(float(raw), 4) if isinstance(raw, (int, float)) else None

    rows = []
    for model_name in models:
        cells = []
        for dataset_name in datasets:
            record = best.get((model_name, dataset_name, float(alpha))) or {}
            metric_values = record.get('metrics', {}) or {}
            cells.extend(_cell(metric_values, name) for name in metrics)
        rows.append(cells)

    header = pd.MultiIndex.from_product([datasets, metrics], names=['Dataset', 'Metric'])
    table = pd.DataFrame(rows, index=models, columns=header)
    table.index.name = 'Model'
    return table


def highlight_best(df: pd.DataFrame, *, lower_is_better: list[str] = ['MAE', 'RMSE']):
    """Return a Styler of ``df`` with the best value of every column in bold.

    Args:
        df: Results table; a column label is either a plain metric name or a
            tuple whose second element is the metric name.
        lower_is_better: Metric names for which the column minimum is the
            best value; for every other metric the maximum is the best.

    Returns:
        A pandas Styler showing 4 decimals, ``-`` for missing values, and
        centered top-level column headings. All cells equal to the column's
        best value are bold.
    """

    def _bold_extremes(column):
        label = column.name
        metric = label[1] if isinstance(label, tuple) else label
        target = column.min() if metric in lower_is_better else column.max()
        return ['font-weight: bold' if hit else '' for hit in column.eq(target)]

    centered_headings = [
        {'selector': 'th.col_heading.level0', 'props': [('text-align', 'center')]},
    ]
    return (
        df.style
        .format(precision=4, na_rep='-')
        .apply(_bold_extremes, axis=0)
        .set_table_styles(centered_headings)
    )

In [14]:
# Load the best record per (model, dataset, alpha) and report what was found.
best = load_optuna_best_results(TUNE_DIR, run_prefixes=RUN_PREFIXES)
# Distinct alpha values; one group of tables is rendered per alpha.
alphas = sorted({alpha for _model, _dataset, alpha in best})

print(f"Loaded {len(best)} best records")
print(f"Models found: {sorted({model for model, _dataset, _alpha in best})}")
print(f"Datasets found: {sorted({dataset for _model, dataset, _alpha in best})}")
print(f"Alphas found: {alphas}")

Loaded 22 best records
Models found: ['bpr', 'counterif', 'cubpr', 'dr', 'ips', 'mtdr', 'naive', 'ncrmf', 'nnpu', 'pu_naive', 'recrec_i', 'rmf', 'sdr2', 'ubpr', 'uprl', 'upu']
Datasets found: ['hs', 'saferlhf']
Alphas found: [0.5]


In [15]:
# Render the three result tables (debias, PU, debias+PU) once per alpha value.
for alpha in alphas:
    print(f"\n==================== alpha={alpha} ====================")

    # Each section: print a heading, build the table for the model group,
    # then display it with the best value per column in bold.
    print("\nDebias Methods")
    df_debias = build_results_table(best, DEBIAS_MODELS, DATASETS, METRICS, alpha=alpha)
    display(highlight_best(df_debias))

    print("\nPU Methods")
    df_pu = build_results_table(best, PU_MODELS, DATASETS, METRICS, alpha=alpha)
    display(highlight_best(df_pu))

    print("\nDebias+PU Methods")
    df_debias_pu = build_results_table(best, DEBIAS_PU_MODELS, DATASETS, METRICS, alpha=alpha)
    # lower_is_better is passed explicitly here; the value equals the default.
    display(highlight_best(df_debias_pu, lower_is_better=['MAE', 'RMSE']))



Debias Methods


Dataset,hs,hs,hs,hs,saferlhf,saferlhf,saferlhf,saferlhf,ufb,ufb,ufb,ufb
Metric,MAE,RMSE,R2,AUC,MAE,RMSE,R2,AUC,MAE,RMSE,R2,AUC
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
naive,0.5011,0.5211,-0.2565,0.7117,0.2457,0.3859,0.4002,0.9572,-,-,-,-
ips,0.5019,0.5114,-0.2102,0.7102,-,-,-,-,-,-,-,-
dr,0.3085,0.3952,0.2771,0.7993,-,-,-,-,-,-,-,-
mtips,-,-,-,-,-,-,-,-,-,-,-,-
mtdr,0.2875,0.4105,0.2203,0.7851,-,-,-,-,-,-,-,-
sdr2,0.3764,0.4386,0.1096,0.7002,-,-,-,-,-,-,-,-



PU Methods


Dataset,hs,hs,hs,hs,saferlhf,saferlhf,saferlhf,saferlhf,ufb,ufb,ufb,ufb
Metric,MAE,RMSE,R2,AUC,MAE,RMSE,R2,AUC,MAE,RMSE,R2,AUC
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
bpr,0.348,0.4196,0.1852,0.741,-,-,-,-,-,-,-,-
ubpr,0.3186,0.4013,0.2546,0.7823,0.1327,0.2705,0.7053,0.9651,-,-,-,-
cubpr,0.3237,0.4007,0.2569,0.7833,0.1327,0.2705,0.7053,0.9651,-,-,-,-
nnpu,0.4192,0.4481,0.0706,0.7146,0.2084,0.3376,0.5411,0.8860,-,-,-,-
upu,0.3823,0.4343,0.1271,0.7083,-,-,-,-,-,-,-,-
uprl,0.3417,0.4119,0.2148,0.7602,0.1178,0.2612,0.7252,0.9702,-,-,-,-
rmf,0.312,0.3926,0.2867,0.8032,-,-,-,-,-,-,-,-
ncrmf,0.312,0.3926,0.2867,0.8032,-,-,-,-,-,-,-,-



Debias+PU Methods


Dataset,hs,hs,hs,hs,saferlhf,saferlhf,saferlhf,saferlhf,ufb,ufb,ufb,ufb
Metric,MAE,RMSE,R2,AUC,MAE,RMSE,R2,AUC,MAE,RMSE,R2,AUC
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
counterif,0.3870,0.4304,0.1426,0.7396,-,-,-,-,-,-,-,-
recrec_i,0.3471,0.4217,0.1769,0.7387,-,-,-,-,-,-,-,-
recrec_f,-,-,-,-,-,-,-,-,-,-,-,-


In [None]:
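# Tuning plan (notes):
# 1. First tune lr, l2reg and batch size on the naive model.
# 2. Apply those settings directly to the other baselines.
# 3. If a baseline behaves abnormally, tune its baseline-specific hyperparameters.
# Target: R2.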