# Slope LOO vs No-Slope LOO: Master Comparison (400k)

**Purpose:** Compare slope LOO-like and no-slope LOO prediction performance on the full 400k cohort using the same evaluation protocol (no pretrained_delta; fair comparison).

**Contents:**
1. **AUC comparison** — Static/dynamic 10yr and 1yr AUC by disease (from CSV + no-slope eval).
2. **Predictions at time of prediction** — How 1-year risk at enrollment differs between slope and no-slope (distributions, scatter).
3. **Individual trajectories** — Example patients: predicted risk over age (30–81) for slope vs no-slope.
4. **Gamma** — Slope model's genetic effects: level vs slope (PRS × signature), and relation to prediction gains.

In [None]:
# Setup: paths and imports
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch

# Project folder for this analysis; slope LOO-like outputs are read from / written to RESULTS_LOO
CLAUDE = Path('/Users/sarahurbut/aladynoulli2/claudefile')
RESULTS_LOO = CLAUDE / 'results_slope_loo_like'
# Prepend both script folders to sys.path so modules there can be imported directly
# (presumably where fig5utils lives — confirm)
sys.path.insert(0, str(CLAUDE))
sys.path.insert(0, '/Users/sarahurbut/aladynoulli2/pyScripts/')

# Data paths (adjust if your Dropbox path differs)
# NOTE(review): these are absolute, machine-specific paths; the notebook will not run elsewhere unedited
DATA_DIR = Path('/Users/sarahurbut/Library/CloudStorage/Dropbox-Personal/data_for_running/')
PCE_PATH = '/Users/sarahurbut/Library/CloudStorage/Dropbox-Personal/pce_prevent_full.csv'
NOSLOPE_PI_PATH = Path('/Users/sarahurbut/Library/CloudStorage/Dropbox/enrollment_predictions_nokappa_v3_loo_all40/pi_enroll_fixedphi_sex_FULL.pt')
# Slope LOO-like π (from running slope_loo_like_400k_eval.py); try both naming conventions
SLOPE_PI_PATHS = [RESULTS_LOO / 'pi_slope_loo_like_400k.pt', RESULTS_LOO / 'pi_slope_loo_like_400000.pt']
SLOPE_AUC_CSV = RESULTS_LOO / 'loo_like_auc_400000.csv'  # or loo_like_auc_400k.csv (only this name is tried)
# Cache file for the no-slope AUC table: read if it exists, otherwise written by the no-slope AUC cell
NOSLOPE_AUC_CSV = RESULTS_LOO / 'no_slope_loo_auc_400000.csv'
# Every tensor / table loaded below is truncated to its first N_PATIENTS rows
N_PATIENTS = 400_000

---
## 1. AUC comparison (400k LOO)

Load slope LOO-like AUC from CSV. For no-slope, either load a pre-computed CSV or run the same 4 evaluations from the no-slope π tensor (takes a few minutes).

In [None]:
# Read the pre-computed slope LOO-like AUC table, then report the row count per horizon
df_slope = pd.read_csv(SLOPE_AUC_CSV)
print('Slope LOO-like AUC:')
rows_per_horizon = df_slope.groupby('horizon').size()
print(rows_per_horizon)
df_slope.head(10)

In [None]:
# No-slope LOO AUC: evaluate from π and cache to CSV when no cached table exists,
# otherwise read the cached table.
if not NOSLOPE_AUC_CSV.exists():
    print('Computing no-slope LOO AUC from π (this may take several minutes)...')
    from fig5utils import (
        evaluate_major_diseases_wsex_with_bootstrap_from_pi,
        evaluate_major_diseases_wsex_with_bootstrap_dynamic_from_pi,
    )
    # Inputs, each truncated to the first N_PATIENTS rows
    Y = torch.load(DATA_DIR / 'Y_tensor.pt', weights_only=False)[:N_PATIENTS]
    E = torch.load(DATA_DIR / 'E_enrollment_full.pt', weights_only=False)[:N_PATIENTS]
    essentials = torch.load(DATA_DIR / 'model_essentials.pt', weights_only=False)
    disease_names = essentials['disease_names']
    pce_df = pd.read_csv(PCE_PATH).iloc[:N_PATIENTS].reset_index(drop=True)
    # Supply the capitalised 'Sex' / lowercase 'age' columns when only the other spelling exists
    if 'Sex' not in pce_df.columns and 'sex' in pce_df.columns:
        pce_df['Sex'] = pce_df['sex'].map({0: 'Female', 1: 'Male'}).fillna('Unknown')
    if 'age' not in pce_df.columns and 'Age' in pce_df.columns:
        pce_df['age'] = pce_df['Age']
    pi_noslope = torch.load(NOSLOPE_PI_PATH, weights_only=False)[:N_PATIENTS]
    n_boot = 100  # reduce to 50 for faster run if needed

    # (horizon label, follow-up years, evaluator) — static and dynamic variants share one call signature
    static_eval = evaluate_major_diseases_wsex_with_bootstrap_from_pi
    dynamic_eval = evaluate_major_diseases_wsex_with_bootstrap_dynamic_from_pi
    eval_specs = [
        ('static_10yr', 10, static_eval),
        ('dynamic_10yr', 10, dynamic_eval),
        ('static_1yr', 1, static_eval),
        ('dynamic_1yr', 1, dynamic_eval),
    ]

    all_rows = []
    for horizon, follow_yr, evaluate in eval_specs:
        res = evaluate(
            pi=pi_noslope, Y_100k=Y, E_100k=E, disease_names=disease_names,
            pce_df=pce_df, n_bootstraps=n_boot, follow_up_duration_years=follow_yr)
        for dg, m in res.items():
            # Non-dict results still produce a row, just without metric columns
            metrics = m if isinstance(m, dict) else {}
            all_rows.append({'model': 'no_slope_loo', 'horizon': horizon, 'disease': dg, **metrics})
    df_noslope = pd.DataFrame(all_rows)
    RESULTS_LOO.mkdir(parents=True, exist_ok=True)
    df_noslope.to_csv(NOSLOPE_AUC_CSV, index=False)
    print('Saved', NOSLOPE_AUC_CSV)
else:
    df_noslope = pd.read_csv(NOSLOPE_AUC_CSV)
    print('Loaded no-slope LOO AUC from', NOSLOPE_AUC_CSV)
df_noslope.head(10)

In [None]:
# Align the two AUC tables on (horizon, disease), keep pairs present in both, rank by the gap
key_cols = ['horizon', 'disease']
slope_auc = df_slope.set_index(key_cols)['auc'].rename('slope_auc')
noslope_auc = df_noslope.set_index(key_cols)['auc'].rename('noslope_auc')
compare = (
    pd.DataFrame({'slope_auc': slope_auc, 'noslope_auc': noslope_auc})
    .dropna()
    .assign(auc_diff=lambda tbl: tbl['slope_auc'] - tbl['noslope_auc'])
    .sort_values('auc_diff', ascending=False)
)
print('Slope − No-slope AUC difference (positive = slope better):')
compare.head(15)

In [None]:
# Bar chart: slope vs no-slope AUC per horizon (one figure per horizon)
for h in ['static_10yr', 'dynamic_10yr', 'static_1yr', 'dynamic_1yr']:
    # Skip horizons that are absent from the merged table (first index level is the horizon)
    if h not in compare.index.get_level_values(0):
        continue
    # Ascending order places the highest slope AUC at the top of the horizontal bars
    sub = compare.loc[h].sort_values('slope_auc', ascending=True)
    # Figure height grows with the number of diseases so labels stay readable
    fig, ax = plt.subplots(figsize=(8, max(6, len(sub) * 0.35)))
    y = np.arange(len(sub))
    # Paired bars: no-slope just below, slope just above each disease's tick
    ax.barh(y - 0.2, sub['noslope_auc'], height=0.35, label='No-slope LOO', color='#3498db', alpha=0.85)
    ax.barh(y + 0.2, sub['slope_auc'], height=0.35, label='Slope LOO-like', color='#e74c3c', alpha=0.85)
    ax.set_yticks(y)
    ax.set_yticklabels(sub.index)
    ax.set_xlabel('AUC')
    ax.set_title(f'Slope vs No-slope LOO (400k) — {h}')
    ax.legend(loc='lower right')
    ax.set_xlim(0.45, 1.0)
    # Reference line at AUC = 0.5
    ax.axvline(0.5, color='gray', ls='--', alpha=0.5)
    plt.tight_layout()
    plt.show()

---
## 2. Predictions at time of prediction

Load both π tensors (slope LOO-like and no-slope LOO). We compare **1-year risk at enrollment** for key diseases: distribution and slope vs no-slope scatter.  
*If slope π was not saved (e.g. only CSV was kept), run `slope_loo_like_400k_eval.py` once to generate `pi_slope_loo_like_400k.pt`; trajectory and scatter sections will be skipped if slope π is missing.*

In [None]:
# Load π tensors: no-slope is required; slope is taken from the first candidate file that exists
pi_noslope = torch.load(NOSLOPE_PI_PATH, weights_only=False)[:N_PATIENTS]
slope_pi_file = next((cand for cand in SLOPE_PI_PATHS if cand.exists()), None)
if slope_pi_file is None:
    pi_slope = None
    print('Slope π not found; trajectory and scatter plots will use no-slope only or be skipped.')
else:
    pi_slope = torch.load(slope_pi_file, weights_only=False)[:N_PATIENTS]
    print('Slope π:', slope_pi_file.name, pi_slope.shape)
print('No-slope π:', pi_noslope.shape)

# Disease names and the per-patient table holding enrollment age
essentials = torch.load(DATA_DIR / 'model_essentials.pt', weights_only=False)
disease_names = essentials['disease_names']
pce_df = pd.read_csv(PCE_PATH).iloc[:N_PATIENTS].reset_index(drop=True)
# Supply lowercase 'age' when the file only has 'Age'
if 'age' not in pce_df.columns and 'Age' in pce_df.columns:
    pce_df['age'] = pce_df['Age']

def get_disease_indices(group_name):
    """Look up the disease indices that make up a major-disease group.

    Each pattern of the group is matched as a case-insensitive substring against
    the module-level ``disease_names``. Indices are returned in order of first
    match (pattern order, then name order), without duplicates, and limited to
    those below ``pi_noslope.shape[1]``. Unknown groups give an empty list.
    (Intended to mirror the grouping in fig5utils — not verifiable from here.)
    """
    groups = {
        'ASCVD': ['Myocardial infarction', 'Coronary atherosclerosis', 'Other acute and subacute', 'Unstable angina', 'Angina pectoris', 'Other chronic ischemic'],
        'Diabetes': ['Type 2 diabetes'],
        'Heart_Failure': ['Congestive heart failure', 'Heart failure NOS'],
    }
    patterns = groups.get(group_name)
    if patterns is None:
        return []
    # A dict keeps first-seen order and drops repeats
    hits = {}
    for pattern in patterns:
        needle = pattern.lower()
        for pos, candidate in enumerate(disease_names):
            if needle in candidate.lower():
                hits.setdefault(pos, None)
    return [pos for pos in hits if pos < pi_noslope.shape[1]]

def risk_1yr_at_enrollment(pi, pce_df, disease_indices):
    """Composite 1-year risk at each patient's enrollment time point.

    For row ``i`` the time index is ``t = int(age_i - 30)`` (``int`` truncates
    toward zero) and the risk is ``1 - prod_d (1 - pi[i, d, t])`` over
    ``disease_indices``, with each probability clipped to ``[1e-8, 1 - 1e-8]``.

    Args:
        pi: array or torch tensor indexed as ``pi[patient, disease, time]``.
        pce_df: DataFrame with an ``'age'`` column (enrollment age), row-aligned with ``pi``.
        disease_indices: list of disease indices to combine.

    Returns:
        float64 array of length ``min(len(pce_df), pi.shape[0])``. Entries are NaN
        when ``disease_indices`` is empty, the age is missing, or ``t`` falls
        outside the time axis of ``pi``.
    """
    n_rows = min(len(pce_df), pi.shape[0])
    risks = np.full(n_rows, np.nan)
    if not disease_indices:
        return risks
    # Pull the age column once; per-row DataFrame.iloc lookups are very slow at cohort scale
    ages = pce_df['age'].to_numpy(dtype=float, na_value=np.nan)[:n_rows]
    n_time = pi.shape[2]
    for i, age in enumerate(ages):
        # A missing age cannot be converted to a time index (int(NaN) raises); leave NaN
        if np.isnan(age):
            continue
        t = int(age - 30)
        if t < 0 or t >= n_time:
            continue
        p = pi[i, disease_indices, t]
        if isinstance(p, torch.Tensor):
            p = p.detach().cpu().numpy()
        risks[i] = 1 - np.prod(1 - np.clip(p, 1e-8, 1 - 1e-8))
    return risks

In [None]:
# ASCVD: 1-year risk at enrollment, summarised over patients whose no-slope risk is defined
idx_ascvd = get_disease_indices('ASCVD')
r_noslope_ascvd = risk_1yr_at_enrollment(pi_noslope, pce_df, idx_ascvd)
valid = ~np.isnan(r_noslope_ascvd)
noslope_defined = r_noslope_ascvd[valid]
print('No-slope ASCVD 1yr risk at enrollment: mean = %.4f, median = %.4f' % (np.nanmean(noslope_defined), np.nanmedian(noslope_defined)))

if pi_slope is not None:
    r_slope_ascvd = risk_1yr_at_enrollment(pi_slope, pce_df, idx_ascvd)
    # Same patient mask as above so the two summaries cover the same rows
    slope_defined = r_slope_ascvd[valid]
    print('Slope ASCVD 1yr risk at enrollment: mean = %.4f, median = %.4f' % (np.nanmean(slope_defined), np.nanmedian(slope_defined)))

In [None]:
# Plot: distribution of 1yr risk at enrollment (ASCVD) and slope vs no-slope hexbin
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# Left: overlaid density histograms on a shared 0–0.3 range
axes[0].hist(r_noslope_ascvd[valid], bins=80, alpha=0.7, density=True, label='No-slope', color='#3498db', range=(0, 0.3))
if pi_slope is not None:
    axes[0].hist(r_slope_ascvd[valid], bins=80, alpha=0.6, density=True, label='Slope', color='#e74c3c', range=(0, 0.3))
axes[0].set_xlabel('1-year ASCVD risk at enrollment')
axes[0].set_ylabel('Density')
axes[0].set_title('Distribution of predicted risk at time of prediction')
axes[0].legend()

# Right: hexbin of slope vs no-slope risk on a subsample (for speed)
if pi_slope is not None:
    valid_idx = np.flatnonzero(valid)
    n_plot = min(50_000, valid_idx.size)
    # Draw a seeded random subsample: taking the first n_plot rows would show only the
    # start of the cohort rather than a sample of it; the fixed seed keeps the figure reproducible.
    rng = np.random.default_rng(0)
    idx_plot = np.sort(rng.choice(valid_idx, size=n_plot, replace=False))
    s_n = r_noslope_ascvd[idx_plot]
    s_s = r_slope_ascvd[idx_plot]
    hb = axes[1].hexbin(s_n, s_s, gridsize=60, cmap='YlOrRd', mincnt=1, extent=(0, 0.25, 0, 0.25))
    # Identity line: points above it have higher slope-model risk
    axes[1].plot([0, 0.25], [0, 0.25], 'k--', alpha=0.8, label='y=x')
    axes[1].set_xlabel('No-slope 1yr ASCVD risk')
    axes[1].set_ylabel('Slope 1yr ASCVD risk')
    axes[1].set_title('Slope vs No-slope at enrollment (n=%s)' % len(idx_plot))
    axes[1].legend()
    plt.colorbar(hb, ax=axes[1], label='Count')
else:
    axes[1].text(0.5, 0.5, 'Slope π not loaded;\nscatter requires slope LOO-like π.', ha='center', va='center', transform=axes[1].transAxes)
plt.tight_layout()
plt.show()

---
## 3. Individual trajectories

Example patients: predicted **1-year risk over age (30–81)** for slope vs no-slope for ASCVD and Diabetes. We pick a few patients (e.g. one who had an ASCVD event, one who had Diabetes, one with no events) and plot risk trajectories.

In [None]:
# Inputs for picking example patients (one ASCVD event, one Diabetes event, one with neither)
idx_dm = get_disease_indices('Diabetes')  # idx_ascvd comes from the earlier ASCVD cell
Y_full = torch.load(DATA_DIR / 'Y_tensor.pt', weights_only=False)[:N_PATIENTS]
ages = np.arange(30, 82)  # x-axis for trajectories: ages 30..81 (52 points)

def trajectory_1yr(pi, i, disease_indices):
    """Composite 1-year risk at every time point for patient ``i``.

    At each time index ``t`` the risk is ``1 - prod_d (1 - pi[i, d, t])`` over
    ``disease_indices``, with each probability clipped to ``[1e-8, 1 - 1e-8]``.

    Args:
        pi: array or torch tensor indexed as ``pi[patient, disease, time]``.
        i: patient row index.
        disease_indices: list of disease indices to combine.

    Returns:
        float64 array of length ``pi.shape[2]``; all NaN when ``disease_indices``
        is empty.
    """
    # Size the result from pi's own time axis so both branches return the same length
    n_time = pi.shape[2]
    if not disease_indices:
        return np.full(n_time, np.nan)
    p = pi[i, disease_indices, :]  # (len(disease_indices), n_time)
    if isinstance(p, torch.Tensor):
        p = p.detach().cpu().numpy()
    # Product over the disease axis gives the probability of none of the diseases at each time point
    none_prob = np.prod(1 - np.clip(p, 1e-8, 1 - 1e-8), axis=0)
    return (1 - none_prob).astype(float)

# Flag patients with an ASCVD / Diabetes record at or after their enrollment time index,
# then choose one example patient from each category.
def _event_on_or_after(patient, t_start, disease_idx):
    """True if any in-range disease in disease_idx has a positive Y_full sum from t_start onward."""
    return any(
        d < Y_full.shape[1] and Y_full[patient, d, t_start:].sum() > 0
        for d in disease_idx
    )


def _middle_or(candidates, fallback):
    """Middle element of candidates, or fallback when candidates is empty."""
    return candidates[len(candidates) // 2] if len(candidates) else fallback


t_enroll = (pce_df['age'].values - 30).astype(int)
n_rows = len(pce_df)
had_ascvd = np.zeros(n_rows, dtype=bool)
had_dm = np.zeros(n_rows, dtype=bool)
last_usable_t = Y_full.shape[2] - 1
for i in range(min(n_rows, Y_full.shape[0])):
    te = t_enroll[i]
    # Only patients whose enrollment index lies inside the time axis (excluding its last point) are scanned
    if 0 <= te < last_usable_t:
        had_ascvd[i] = _event_on_or_after(i, te, idx_ascvd)
        had_dm[i] = _event_on_or_after(i, te, idx_dm)

idx_ascvd_event = np.where(had_ascvd)[0]
idx_dm_event = np.where(had_dm & ~had_ascvd)[0]  # Diabetes without ASCVD
idx_no_event = np.where(~had_ascvd & ~had_dm)[0]
# One patient per category: the middle of each list, with fixed fallback rows if a list is empty
i_ascvd = _middle_or(idx_ascvd_event, 0)
i_dm = _middle_or(idx_dm_event, 100)
i_none = _middle_or(idx_no_event, 500)
example_indices = [i_ascvd, i_dm, i_none]
labels = ['ASCVD event (after enroll)', 'Diabetes event (after enroll)', 'No ASCVD/Diabetes']

In [None]:
# Plot individual trajectories: slope vs no-slope over age.
# Rows = example patients; columns = ASCVD (left) and Diabetes (right).
if pi_slope is None:
    print('Slope π not loaded; skipping trajectory plots.')
else:
    # One entry per column: the disease indices to combine and the y-axis label
    panels = [(idx_ascvd, '1-yr ASCVD risk'), (idx_dm, '1-yr Diabetes risk')]
    fig, axes = plt.subplots(3, 2, figsize=(12, 10))
    for row, (i, lab) in enumerate(zip(example_indices, labels)):
        age_enroll = int(pce_df.iloc[i]['age'])
        for col, (disease_idx, ylabel) in enumerate(panels):
            ax = axes[row, col]
            tr_n = trajectory_1yr(pi_noslope, i, disease_idx)
            tr_s = trajectory_1yr(pi_slope, i, disease_idx)
            ax.plot(ages, tr_n, '-', color='#3498db', lw=2, label='No-slope')
            ax.plot(ages, tr_s, '-', color='#e74c3c', lw=2, label='Slope')
            if col == 0:
                # Only the left panel labels the enrollment line and carries the patient title
                ax.axvline(age_enroll, color='gray', ls='--', alpha=0.7, label='Enrollment')
                ax.set_title('Patient %s (%s)' % (i, lab))
            else:
                ax.axvline(age_enroll, color='gray', ls='--', alpha=0.7)
            ax.set_ylabel(ylabel)
            ax.legend(loc='upper right', fontsize=8)
            # Cap the y-axis at 0.5; skip when no finite value exists (all-NaN trajectories
            # occur when a disease group matched nothing, and NaN axis limits raise)
            both = np.concatenate([tr_n, tr_s])
            both = both[np.isfinite(both)]
            if both.size:
                ax.set_ylim(0, min(0.5, both.max() * 1.1))
    for ax in axes[-1, :]:
        ax.set_xlabel('Age')
    plt.suptitle('Individual risk trajectories: Slope vs No-slope LOO', y=1.02)
    plt.tight_layout()
    plt.show()

---
## 4. Gamma: genetic effects in the slope model

The slope model adds **gamma_slope** (effect of PRS on signature *over time*) on top of **gamma_level**. We load pooled slope checkpoints (single-phase LOO) and visualize:
- **Heatmap:** gamma_slope (PRS × disease signature) — which PRS drive time-varying risk.
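- **Bars:** The largest |gamma_slope| entries across all PRS × signature pairs (top 18), colored by sign — signatures relevant to key diseases (ASCVD, Diabetes, Heart Failure, etc.) can be read off from these.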

In [None]:
# Pool gamma_level and gamma_slope across a sample of slope single-phase checkpoints (element-wise mean)
SLOPE_CKPT_LOO = Path('/Users/sarahurbut/Library/CloudStorage/Dropbox/slope_model_nokappa_v3_single_phase/')
BATCH_SIZE_CKPT = 10000
# Subset of batches for a faster run; use list(range(40)) for the full pool
batches_to_load = list(range(0, 40, 10))  # e.g. 0, 10, 20, 30
gamma_levels, gamma_slopes = [], []
for b in batches_to_load:
    # Checkpoint files are named by the patient range they cover
    start = b * BATCH_SIZE_CKPT
    stop = (b + 1) * BATCH_SIZE_CKPT
    p = SLOPE_CKPT_LOO / f'slope_model_batch_{start}_{stop}.pt'
    if p.exists():
        ck = torch.load(p, weights_only=False)
        for key, pool in (('gamma_level', gamma_levels), ('gamma_slope', gamma_slopes)):
            pool.append(ck[key].detach().cpu().numpy())
    else:
        print('Missing', p.name)
if gamma_slopes:
    gamma_level = np.mean(gamma_levels, axis=0)   # (n_features, K) e.g. (47, 21)
    gamma_slope = np.mean(gamma_slopes, axis=0)
    print('Pooled gamma_level:', gamma_level.shape, 'gamma_slope:', gamma_slope.shape)
else:
    print('No slope checkpoints found; gamma section will be skipped.')

In [None]:
# Gamma heatmap: PRS × disease signatures (gamma_slope); skipped when no checkpoints were loaded
if gamma_slopes:
    prs_df = pd.read_csv('/Users/sarahurbut/aladynoulli2/prs_names.csv', header=None)
    PRS_NAMES = prs_df.iloc[:, 0].astype(str).tolist()
    # Rows shown: at most 36, and never more than there are gamma rows or PRS names,
    # so every plotted row has a name and tick labels match tick positions
    n_prs = min(36, gamma_slope.shape[0], len(PRS_NAMES))
    gs_prs = gamma_slope[:n_prs, :20]  # PRS x 20 disease signatures (exclude health)
    # Use the columns actually present rather than assuming 20
    n_sig = gs_prs.shape[1]
    # Symmetric colour scale clipped at the 95th percentile of |gamma_slope|
    v = np.percentile(np.abs(gs_prs), 95)
    fig, ax = plt.subplots(figsize=(14, 10))
    im = ax.imshow(gs_prs, aspect='auto', cmap='RdBu_r', vmin=-v, vmax=v)
    ax.set_yticks(range(n_prs))
    ax.set_yticklabels(PRS_NAMES[:n_prs], fontsize=9)
    ax.set_xticks(range(n_sig))
    ax.set_xticklabels([f'Sig{k}' for k in range(n_sig)], fontsize=8)
    ax.set_xlabel('Disease signature')
    ax.set_ylabel('PRS')
    ax.set_title('gamma_slope: PRS × Disease Signature (red = risk ↑ with age, blue = ↓)')
    plt.colorbar(im, ax=ax, shrink=0.6)
    plt.tight_layout()
    plt.show()

In [None]:
# Bar chart of the 18 largest |gamma_slope| entries over every PRS × signature pair
if gamma_slopes:
    # Long-format table: one record per (PRS, signature) cell of the heatmap matrix
    flat = [
        {'PRS': PRS_NAMES[prs_idx], 'Sig': sig, 'slope': gs_prs[prs_idx, sig]}
        for prs_idx in range(gs_prs.shape[0])
        for sig in range(gs_prs.shape[1])
    ]
    flat_df = pd.DataFrame(flat)
    flat_df['abs_slope'] = flat_df['slope'].abs()
    top = flat_df.nlargest(18, 'abs_slope')
    bar_labels = [f"{prs} (Sig{sig})" for prs, sig in zip(top['PRS'], top['Sig'])]
    # Red for positive slopes, blue otherwise
    colors = ['#e74c3c' if s > 0 else '#3498db' for s in top['slope']]
    y_pos = np.arange(len(top))
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.barh(y_pos, top['slope'], color=colors, height=0.7)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(bar_labels, fontsize=9)
    ax.set_xlabel('gamma_slope')
    ax.set_title('Top PRS × signature slopes (red = risk ↑ with age, blue = ↓)')
    ax.axvline(0, color='black', linewidth=0.5)
    # Largest magnitude at the top
    ax.invert_yaxis()
    plt.tight_layout()
    plt.show()

---
**Summary**

- **AUC:** Slope vs no-slope LOO compared across static/dynamic 10yr and 1yr; no-slope AUC is computed once and cached in `no_slope_loo_auc_400000.csv` if missing.
- **Trajectories and scatter** require slope π from `slope_loo_like_400k_eval.py` (saved as `pi_slope_loo_like_400k.pt` or `pi_slope_loo_like_400000.pt`).
- **Gamma** uses slope single-phase checkpoints (Dropbox); increase `batches_to_load` (e.g. `list(range(40))`) for full LOO pool.

Run all cells top-to-bottom, then export the notebook to HTML/PDF (e.g. with `jupyter nbconvert`) for the full report.