# Slope vs no-slope holdout checklist (all in notebooks)

**Objective 1.** Summarize non-LOO 10yr and 1yr holdout AUC; confirm that pool-30 and pool-5 give similar results.

**Objective 2.** Batch-0 diagnostic: predict 0–10K with pool 1–30 vs pool 10–39, same AUC eval → see `batch0_pool1_30_vs_10_39_auc.ipynb`.

## Setup

## (Optional) Generate missing result CSVs

Run the cells below to create any missing CSVs. Order: 10yr runs first (they write the π files); then 1yr runs (they read those π files).

In [None]:
import subprocess
import sys
from pathlib import Path

# Project directory and the folder holding the holdout-AUC result files.
CLAUDE_DIR = Path('/Users/sarahurbut/aladynoulli2/claudefile').resolve()
RESULTS_DIR = CLAUDE_DIR.joinpath('results_holdout_auc')

# Result CSVs, keyed by horizon (10yr / 1yr) and pool (pool-5 / pool-30).
CSV_10YR_POOL5 = RESULTS_DIR.joinpath('holdout_auc_slope_1phase_vs_noslope.csv')
CSV_10YR_POOL30 = RESULTS_DIR.joinpath('holdout_auc_slope_1phase_pool30_vs_noslope.csv')
CSV_1YR_POOL5 = RESULTS_DIR.joinpath('holdout_auc_1yr_slope_1phase_vs_noslope.csv')
CSV_1YR_POOL30 = RESULTS_DIR.joinpath('holdout_auc_1yr_slope_1phase_pool30_vs_noslope.csv')

def run_if_missing(csv_path, cmd, description, cwd=None):
    """Run `cmd` to produce `csv_path`, unless that file already exists.

    Parameters
    ----------
    csv_path : pathlib.Path
        File the command is expected to create. If it already exists the
        command is skipped.
    cmd : list of str
        Argument list handed to ``subprocess.run``.
    description : str
        Human-readable label printed before the command is launched.
    cwd : path-like, optional
        Working directory for the command. Defaults to ``CLAUDE_DIR``.

    Raises
    ------
    subprocess.CalledProcessError
        If the command exits with a non-zero status (``check=True``).
    """
    if csv_path.exists():
        print(f'Already exists: {csv_path.name}')
        return
    print(f'Running: {description}')
    workdir = CLAUDE_DIR if cwd is None else cwd
    subprocess.run(cmd, cwd=str(workdir), check=True)
    # A zero exit status does not prove the expected file was written, so
    # only report success when the file is actually there.
    if csv_path.exists():
        print(f'  -> {csv_path.name}')
    else:
        print(f'  WARNING: command succeeded but {csv_path.name} was not created')

# Launch the helper scripts with the interpreter that runs this kernel, so they
# use the same environment as the notebook instead of whichever `python`
# happens to be first on PATH.
PYTHON = sys.executable
CMD_10YR_POOL5 = [PYTHON, 'slope_holdout_auc.py', '--single_phase']
CMD_10YR_POOL30 = [PYTHON, 'slope_holdout_auc.py', '--single_phase', '--single_phase_wide']

# 10yr first (writes pi .pt files used by 1yr)
run_if_missing(CSV_10YR_POOL5, CMD_10YR_POOL5,
               'slope_holdout_auc.py --single_phase (pool-5 10yr + pi)')
run_if_missing(CSV_10YR_POOL30, CMD_10YR_POOL30,
               'slope_holdout_auc.py --single_phase --single_phase_wide (pool-30 10yr + pi)')
# Ensure pool-30 no-slope pi exists (older runs may have saved it elsewhere).
# The 10yr CSV can already be present while the pi file is not, in which case
# run_if_missing skipped the script above; re-run it here for the pi file.
pi_noslope_pool30 = RESULTS_DIR / 'pi_noslope_holdout_1phase_pool30.pt'
if not CSV_1YR_POOL30.exists() and not pi_noslope_pool30.exists():
    print('Running pool-30 10yr to create pi_noslope_holdout_1phase_pool30.pt...')
    subprocess.run(CMD_10YR_POOL30, cwd=str(CLAUDE_DIR), check=True)
# 1yr (reads pi from above)
run_if_missing(CSV_1YR_POOL5, [PYTHON, 'slope_holdout_auc_1yr.py', '--single_phase'],
               'slope_holdout_auc_1yr.py --single_phase (1yr pool-5)')
run_if_missing(CSV_1YR_POOL30, [PYTHON, 'slope_holdout_auc_1yr.py', '--single_phase_wide'],
               'slope_holdout_auc_1yr.py --single_phase_wide (1yr pool-30)')
print('Done.')

Already exists: holdout_auc_slope_1phase_vs_noslope.csv
Already exists: holdout_auc_slope_1phase_pool30_vs_noslope.csv
Already exists: holdout_auc_1yr_slope_1phase_vs_noslope.csv
Running: slope_holdout_auc_1yr.py --single_phase_wide (1yr pool-30)
1-YEAR AT ENROLLMENT AUC: SLOPE_1PHASE_POOL30 vs NO-SLOPE (from saved pi)


Traceback (most recent call last):
  File "/Users/sarahurbut/aladynoulli2/claudefile/slope_holdout_auc_1yr.py", line 151, in <module>
    main()
  File "/Users/sarahurbut/aladynoulli2/claudefile/slope_holdout_auc_1yr.py", line 66, in main
    pi_noslope = torch.load(pi_noslope_path, weights_only=False)
  File "/opt/miniconda3/envs/new_env_pyro2/lib/python3.9/site-packages/torch/serialization.py", line 1319, in load
    with _open_file_like(f, "rb") as opened_file:
  File "/opt/miniconda3/envs/new_env_pyro2/lib/python3.9/site-packages/torch/serialization.py", line 659, in _open_file_like
    return _open_file(name_or_buffer, mode)
  File "/opt/miniconda3/envs/new_env_pyro2/lib/python3.9/site-packages/torch/serialization.py", line 640, in __init__
    super().__init__(open(name, mode))
FileNotFoundError: [Errno 2] No such file or directory: '/Users/sarahurbut/aladynoulli2/claudefile/results_holdout_auc/pi_noslope_holdout_1phase_pool30.pt'


CalledProcessError: Command '['python', 'slope_holdout_auc_1yr.py', '--single_phase_wide']' returned non-zero exit status 1.

In [None]:
from pathlib import Path
import pandas as pd

# Re-declared here so the analysis cells run without the optional generation cell.
CLAUDE_DIR = Path('/Users/sarahurbut/aladynoulli2/claudefile').resolve()
RESULTS_DIR = CLAUDE_DIR.joinpath('results_holdout_auc')

# 10-year horizon results
CSV_10YR_POOL30 = RESULTS_DIR.joinpath('holdout_auc_slope_1phase_pool30_vs_noslope.csv')
CSV_10YR_POOL5 = RESULTS_DIR.joinpath('holdout_auc_slope_1phase_vs_noslope.csv')
# 1-year horizon results
CSV_1YR_POOL5 = RESULTS_DIR.joinpath('holdout_auc_1yr_slope_1phase_vs_noslope.csv')
CSV_1YR_POOL30 = RESULTS_DIR.joinpath('holdout_auc_1yr_slope_1phase_pool30_vs_noslope.csv')

(1yr pool-30 is created by the 'Generate missing result CSVs' cell above if missing.)

In [None]:
# No-op: 1yr pool-30 is generated by the 'Generate missing result CSVs' cell above when missing.

## Objective 1a: 10yr — pool-30 vs pool-5 comparison

In [None]:
def _auc_by_disease(frame, model):
    """Return the 'auc' values of the rows whose 'model' equals `model`, indexed by 'disease'."""
    return frame.loc[frame['model'] == model].set_index('disease')['auc']


d30 = pd.read_csv(CSV_10YR_POOL30)
d5 = pd.read_csv(CSV_10YR_POOL5)

# Keep only the rows whose horizon label mentions '10yr'.
d30_10 = d30.loc[d30['horizon'].str.contains('10yr')].copy()
d5_10 = d5.loc[d5['horizon'].str.contains('10yr')].copy()

print('10yr: pool-30 vs pool-5 (confirm similar)')
for horizon in d30_10['horizon'].unique():
    h30 = d30_10.loc[d30_10['horizon'] == horizon]
    h5 = d5_10.loc[d5_10['horizon'] == horizon]
    s30 = _auc_by_disease(h30, 'slope_1phase_pool30')
    s5 = _auc_by_disease(h5, 'slope_1phase')
    n30 = _auc_by_disease(h30, 'noslope')
    n5 = _auc_by_disease(h5, 'noslope')
    # Both mean differences are restricted to diseases that have a slope AUC in both pools.
    common = s30.index.intersection(s5.index)
    slope_gap = s30.sub(s5).reindex(common).dropna().mean()
    noslope_gap = n30.sub(n5).reindex(common).dropna().mean()
    print(f'  {horizon}: slope mean AUC diff (pool30-pool5) = {slope_gap:.4f}; noslope = {noslope_gap:.4f}')

print('\nPer-disease 10yr static (pool-30): slope vs noslope')
static_30 = d30_10.loc[d30_10['horizon'] == 'static_10yr']
slope = _auc_by_disease(static_30, 'slope_1phase_pool30')
noslope = _auc_by_disease(static_30, 'noslope')
common = slope.index.intersection(noslope.index)
# Last top-level expression, so Jupyter renders the table.
pd.DataFrame({'slope_auc': slope, 'noslope_auc': noslope, 'diff': slope - noslope}).reindex(common).round(4)

## Objective 1b: 1yr — pool-30 vs pool-5 comparison

In [None]:
def _split_slope_noslope(frame):
    """Split `frame` into (slope, noslope) 'auc' Series indexed by 'disease'.

    The label 'noslope' contains the substring 'slope', so a bare
    ``str.contains('slope')`` selects both models. The no-slope rows are
    therefore excluded explicitly from the slope selection.
    """
    is_noslope = frame['model'] == 'noslope'
    is_slope = frame['model'].str.contains('slope') & ~is_noslope
    slope_auc = frame[is_slope].set_index('disease')['auc']
    noslope_auc = frame[is_noslope].set_index('disease')['auc']
    return slope_auc, noslope_auc


# Jupyter only renders the last *top-level* expression of a cell, so the table
# is stored here and named at the end of the cell. It stays None (nothing is
# displayed) when the pool-30 CSV is missing.
table_1yr = None

if not CSV_1YR_POOL30.exists():
    print('Run the optional cell above to generate holdout_auc_1yr_slope_1phase_pool30_vs_noslope.csv')
else:
    d30_1yr = pd.read_csv(CSV_1YR_POOL30)
    d5_1yr  = pd.read_csv(CSV_1YR_POOL5)

    print('1yr AUC: pool-30 vs pool-5 (slope vs noslope)')
    for horizon in d30_1yr['horizon'].unique():
        h30 = d30_1yr[d30_1yr['horizon'] == horizon]
        h5  = d5_1yr[d5_1yr['horizon'] == horizon]
        s30, n30 = _split_slope_noslope(h30)
        s5, n5 = _split_slope_noslope(h5)
        # Both mean differences are restricted to diseases that have a slope AUC in both pools.
        common = s30.index.intersection(s5.index)
        print(f'  {horizon}: slope pool30−pool5 mean diff = {(s30 - s5).reindex(common).dropna().mean():.4f}; noslope = {(n30 - n5).reindex(common).dropna().mean():.4f}')

    print('\nPer-disease 1yr static: slope vs noslope (pool-30)')
    h = d30_1yr[d30_1yr['horizon'] == 'static_1yr']
    slope, noslope = _split_slope_noslope(h)
    common = slope.index.intersection(noslope.index)
    table_1yr = pd.DataFrame({'slope_auc': slope, 'noslope_auc': noslope, 'diff': slope - noslope}).reindex(common).round(4)

table_1yr

## Objective 2: Batch-0 diagnostic

Predict batch 0 (0–10K) with pool 1–30 vs pool 10–39, same static 10yr AUC evaluation. See **`batch0_pool1_30_vs_10_39_auc.ipynb`**.