# Results Story: Baselines, CBAM, KD

This notebook provides a decision-focused analysis of finished experiment artifacts.

**Run order:**
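1. Ensure completed artifacts exist in `results/` (including `results/orchestration/manifest_status.csv`) and that `experiments/manifest_repro_v1.json` is present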
2. Open and run this notebook top-to-bottom

**Dependencies:** local CSV/JSON artifacts only (no dataset download, no training).


In [None]:
from pathlib import Path
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Locate the repository root: the nearest directory, starting at the current
# working directory and walking upwards, that contains a results/ folder.
# Checking every ancestor (not just the direct parent) lets the notebook run
# from nested folders as well; the CWD and its parent are still tried first.
CWD = Path.cwd()
ROOT = next(
    (candidate for candidate in (CWD, *CWD.parents) if (candidate / 'results').exists()),
    None,
)
if ROOT is None:
    raise FileNotFoundError('Could not locate repo root containing results/.')

# Artifact locations, all relative to the discovered root.
MANIFEST_PATH = ROOT / 'experiments' / 'manifest_repro_v1.json'
LEADERBOARD_PATH = ROOT / 'results' / 'leaderboard.csv'
RUN_STEPS_PATH = ROOT / 'results' / 'run_steps.csv'
STATUS_PATH = ROOT / 'results' / 'orchestration' / 'manifest_status.csv'

# Global figure defaults used by every plot in this notebook.
plt.rcParams['figure.dpi'] = 120
plt.rcParams['axes.grid'] = True


## A) Campaign Completeness Audit


In [None]:
# Load the experiment manifest plus the two result tables used by the audit.
manifest = json.loads(MANIFEST_PATH.read_text(encoding='utf-8'))
leaderboard = pd.read_csv(LEADERBOARD_PATH)
status = pd.read_csv(STATUS_PATH)

seed = int(manifest['seed'])

# Expected run counts are derived from the manifest grid:
# phase 1 = plain baseline models + every CBAM model x reduction x kernel combo,
# KD      = every student x alpha x temperature combo.
phase1_expected = (
    len(manifest['baseline']['models'])
    + len(manifest['baseline']['cbam_models'])
    * len(manifest['baseline']['cbam_reduction_values'])
    * len(manifest['baseline']['cbam_sa_kernel_values'])
)
kd_expected = (
    len(manifest['kd']['students'])
    * len(manifest['kd']['alphas'])
    * len(manifest['kd']['temperatures'])
)

# na=False: rows with a missing run_name are treated as "no match" instead of
# producing a NaN mask, which boolean indexing rejects.
run_status = status[status['run_name'].str.startswith(('p1_', 'p2_'), na=False)].copy()
# A stable sort keeps file order among rows with identical timestamps, so
# keep='last' deterministically selects the most recently written record.
latest_status = (
    run_status
    .sort_values('ts_utc', kind='mergesort')
    .drop_duplicates('run_name', keep='last')
)
completed = latest_status[latest_status['status'].isin(['ok', 'skipped_valid'])]

summary = pd.DataFrame(
    [
        {'stage': 'phase1', 'expected': phase1_expected, 'completed': int((completed['run_name'].str.startswith('p1_')).sum())},
        {'stage': 'phase2_kd', 'expected': kd_expected, 'completed': int((completed['run_name'].str.startswith('p2_kd_')).sum())},
    ]
)
summary['remaining'] = summary['expected'] - summary['completed']
display(summary)

# Fail fast (and say which stage is off) when the campaign is not complete.
incomplete = summary[summary['remaining'] != 0]
assert incomplete.empty, (
    'Campaign is not fully complete.\n' + incomplete.to_string(index=False)
)


## B) Phase-1 Snapshot (Teacher + Baselines + Best CBAM)


In [None]:
# Phase-1 rows of the leaderboard (run names prefixed with 'p1_').
phase1 = leaderboard[leaderboard['run_name'].str.startswith('p1_')].copy()
phase1_small = phase1[
    [
        'run_name', 'model', 'params', 'best_val_f1', 'best_val_acc', 'test_f1', 'test_acc',
        'cbam_reduction', 'cbam_sa_kernel'
    ]
].sort_values('test_f1', ascending=False)
display(phase1_small.head(12))

# Reference run names follow the manifest seed instead of a hard-coded 42.
# NOTE(review): assumes run names embed the manifest seed as 'seed<N>' - confirm.
teacher_run = f'p1_crnn_seed{seed}'
tiny_base_run = f'p1_tinycnn_seed{seed}'

# Validate up front so a missing run gives a readable message rather than an
# IndexError from .iloc[0] on an empty selection.
cbam_runs = phase1[phase1['model'] == 'tinycnn_cbam'].sort_values('test_f1', ascending=False)
assert not cbam_runs.empty, 'No tinycnn_cbam runs found among phase-1 leaderboard rows.'
missing_runs = [rn for rn in (teacher_run, tiny_base_run) if rn not in set(phase1['run_name'])]
assert not missing_runs, f'Phase-1 reference runs missing from leaderboard: {missing_runs}'

best_cbam = cbam_runs.iloc[0]
teacher = phase1[phase1['run_name'] == teacher_run].iloc[0]
tiny_base = phase1[phase1['run_name'] == tiny_base_run].iloc[0]

print('teacher:', teacher['run_name'], f"test_f1={teacher['test_f1']:.4f}", f"test_acc={teacher['test_acc']:.4f}")
print('tiny base:', tiny_base['run_name'], f"test_f1={tiny_base['test_f1']:.4f}", f"test_acc={tiny_base['test_acc']:.4f}")
print('best cbam:', best_cbam['run_name'], f"test_f1={best_cbam['test_f1']:.4f}", f"test_acc={best_cbam['test_acc']:.4f}")


## C) KD vs Non-KD Comparison Tables


In [None]:
# KD rows of the leaderboard (run names prefixed with 'p2_kd_').
kd = leaderboard[leaderboard['run_name'].str.startswith('p2_kd_')].copy()

teacher_f1 = float(teacher['test_f1'])

# Non-KD baselines each KD student is compared against. The seed comes from
# the manifest rather than being hard-coded.
# NOTE(review): 'rr8_sk3' is taken as the fixed CBAM baseline configuration,
# and run names are assumed to embed the manifest seed as 'seed<N>' - confirm.
baseline_run_names = {
    'tinycnn': f'p1_tinycnn_seed{seed}',
    'tinycnn_cbam': f'p1_tinycnn_cbam_rr8_sk3_seed{seed}',
}
missing_baselines = [
    rn for rn in baseline_run_names.values() if rn not in set(phase1['run_name'])
]
assert not missing_baselines, f'Baseline runs missing from leaderboard: {missing_baselines}'

base_rows = {
    student: phase1[phase1['run_name'] == run_name].iloc[0]
    for student, run_name in baseline_run_names.items()
}

def attach_comparison(df, base_row, teacher_score=None):
    """Add baseline/teacher comparison columns to a table of KD runs.

    Parameters
    ----------
    df : DataFrame with at least 'run_name', 'test_f1', 'test_acc', 'alpha'
        and 'tau' columns. It is not modified; a copy is used.
    base_row : mapping/Series providing the baseline 'test_f1' and 'test_acc'.
    teacher_score : optional teacher test F1. When omitted, the notebook-level
        ``teacher_f1`` value is used, which keeps existing two-argument calls
        working unchanged.

    Returns
    -------
    DataFrame with columns run_name, test_f1, delta_vs_baseline, test_acc,
    delta_acc, teacher_gap_closed, alpha, tau, sorted by test_f1 descending.
    ``teacher_gap_closed`` is the share of the teacher-minus-baseline F1 gap
    recovered by each run; it is NaN when that gap is exactly zero.
    """
    out = df.copy()
    base_f1 = float(base_row['test_f1'])
    base_acc = float(base_row['test_acc'])
    # The global is only looked up when no explicit teacher score was passed.
    reference_f1 = float(teacher_f1 if teacher_score is None else teacher_score)
    gap = reference_f1 - base_f1

    out['delta_vs_baseline'] = out['test_f1'] - base_f1
    out['delta_acc'] = out['test_acc'] - base_acc
    # A zero gap has no meaningful "fraction closed"; report NaN, not +/-inf.
    out['teacher_gap_closed'] = (out['delta_vs_baseline'] / gap) if gap != 0 else np.nan
    return out[[
        'run_name', 'test_f1', 'delta_vs_baseline', 'test_acc', 'delta_acc', 'teacher_gap_closed', 'alpha', 'tau'
    ]].sort_values('test_f1', ascending=False)

# Split KD runs per student. The trailing 'a' in the first prefix keeps the
# plain-tinycnn selection from also matching 'p2_kd_tinycnn_cbam_...' names.
kd_tiny = kd[kd['run_name'].str.startswith('p2_kd_tinycnn_a')]
kd_cbam = kd[kd['run_name'].str.startswith('p2_kd_tinycnn_cbam_')]

tiny_table = attach_comparison(kd_tiny, base_rows['tinycnn'])
cbam_table = attach_comparison(kd_cbam, base_rows['tinycnn_cbam'])

print('KD table: tinycnn')
display(tiny_table)
print('KD table: tinycnn_cbam')
display(cbam_table)

# The expected total comes from the manifest grid (kd_expected) rather than a
# hard-coded count, so the check stays valid if the grid changes.
kd_total = len(tiny_table) + len(cbam_table)
assert kd_total == kd_expected, f'Expected {kd_expected} KD runs in total, found {kd_total}.'


## D) Hyperparameter View (`alpha x tau`)


In [None]:
def plot_heatmap(df, title):
    """Draw an annotated alpha-by-tau heatmap of mean test_f1, then show the grid.

    ``df`` must provide 'alpha', 'tau' and 'test_f1' columns. Rows of the
    heatmap are alpha values and columns are tau values, both in ascending
    order. After the figure is shown, the underlying pivot table is displayed.
    """
    grid = (
        df.pivot_table(index='alpha', columns='tau', values='test_f1', aggfunc='mean')
        .sort_index()
        .sort_index(axis=1)
    )
    cells = grid.values

    fig, ax = plt.subplots(figsize=(5, 4))
    image = ax.imshow(cells, cmap='YlGnBu', aspect='auto')
    ax.set_title(title)
    ax.set_xlabel('tau')
    ax.set_ylabel('alpha')

    # One tick per grid column/row, labelled with the compact numeric value.
    tau_labels = [f'{tau_value:g}' for tau_value in grid.columns]
    alpha_labels = [f'{alpha_value:g}' for alpha_value in grid.index]
    ax.set_xticks(np.arange(len(tau_labels)))
    ax.set_xticklabels(tau_labels)
    ax.set_yticks(np.arange(len(alpha_labels)))
    ax.set_yticklabels(alpha_labels)

    # Write the score inside each cell (x is the column, y is the row).
    for (row, col), score in np.ndenumerate(cells):
        ax.text(col, row, f"{score:.4f}", ha='center', va='center', fontsize=8)

    fig.colorbar(image, ax=ax).set_label('test_f1')
    plt.tight_layout()
    plt.show()

    display(grid)

# One hyperparameter surface per KD student family.
heatmap_jobs = [
    (kd_tiny, 'KD Hyperparameter Surface (tinycnn)'),
    (kd_cbam, 'KD Hyperparameter Surface (tinycnn_cbam)'),
]
for heatmap_runs, heatmap_title in heatmap_jobs:
    plot_heatmap(heatmap_runs, heatmap_title)


## E) Training Dynamics

Compare epoch-level trajectories for representative runs:
- TinyCNN baseline vs best TinyCNN KD
- TinyCNN_CBAM baseline vs best TinyCNN_CBAM KD


In [None]:
# Representative runs for the trajectory plots. Baseline names are taken from
# base_rows (the baselines the KD tables were compared against) so the two
# sections cannot drift apart; the best KD run is the top row of each table,
# which is sorted by test_f1 descending.
selected_runs = {
    'tiny_baseline': base_rows['tinycnn']['run_name'],
    'tiny_best_kd': tiny_table.iloc[0]['run_name'],
    'cbam_baseline': base_rows['tinycnn_cbam']['run_name'],
    'cbam_best_kd': cbam_table.iloc[0]['run_name'],
}

def load_epoch_metrics(run_name):
    """Read the per-epoch metrics CSV recorded for ``run_name``.

    The CSV location is taken from the run's 'epoch_metrics_csv' leaderboard
    entry, resolved against ROOT. The returned frame carries an extra
    'run_name' column set to ``run_name``.

    Raises FileNotFoundError when the referenced CSV does not exist.
    """
    matches = leaderboard.loc[leaderboard['run_name'] == run_name]
    metrics_path = ROOT / matches.iloc[0]['epoch_metrics_csv']
    if metrics_path.exists():
        history = pd.read_csv(metrics_path)
        history['run_name'] = run_name
        return history
    raise FileNotFoundError(f'Missing epoch metrics: {metrics_path}')

# Load every selected run once, then draw one panel per validation metric.
epoch_dfs = list(map(load_epoch_metrics, selected_runs.values()))

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = (
    (axes[0], 'val_f1_macro', 'Validation F1 by Epoch'),
    (axes[1], 'val_loss', 'Validation Loss by Epoch'),
)
for ax, metric, panel_title in panels:
    for history in epoch_dfs:
        # Each frame carries its run name in every row; use it as the legend label.
        ax.plot(history['epoch'], history[metric], label=str(history['run_name'].iloc[0]))
    ax.set_title(panel_title)
    ax.set_xlabel('epoch')
    ax.set_ylabel(metric)
    ax.legend(fontsize=7)

plt.tight_layout()
plt.show()


## F) Error Profile (Confusion Matrix): Baseline vs Best-KD


In [None]:
def load_confusion(run_name):
    """Read the test confusion matrix recorded for ``run_name``.

    The CSV location is taken from the run's 'test_cm_csv' leaderboard entry,
    resolved against ROOT. Rows and columns are reordered to LABELS so that
    matrices from different runs line up cell by cell.

    Raises FileNotFoundError when the referenced CSV does not exist (the same
    check load_epoch_metrics performs for its file).
    """
    row = leaderboard[leaderboard['run_name'] == run_name].iloc[0]
    p = ROOT / row['test_cm_csv']
    if not p.exists():
        raise FileNotFoundError(f'Missing confusion matrix: {p}')
    cm = pd.read_csv(p, index_col=0)
    cm = cm.loc[LABELS, LABELS]
    return cm

# Class order used for every confusion matrix in this section.
LABELS = ['quiet', 'breathe', 'snore']

# (student label, baseline run, best KD run). Baseline names come from
# base_rows so they match the baselines used for the KD comparison tables.
compare_pairs = [
    ('tinycnn', base_rows['tinycnn']['run_name'], tiny_table.iloc[0]['run_name']),
    ('tinycnn_cbam', base_rows['tinycnn_cbam']['run_name'], cbam_table.iloc[0]['run_name']),
]

for name, base_run, kd_run in compare_pairs:
    cm_base = load_confusion(base_run)
    cm_kd = load_confusion(kd_run)
    diff = cm_kd - cm_base

    # Symmetric colour limits centre the diverging colormap on zero, so
    # "no change" is always the neutral colour. The floor of 1 avoids a
    # degenerate range when both matrices are identical.
    diff_limit = max(float(np.abs(diff.values).max()), 1.0)

    fig, axes = plt.subplots(1, 3, figsize=(12, 3.5))
    panels = [
        (axes[0], cm_base, f'{name} baseline', {'cmap': 'Blues'}, 'd'),
        (axes[1], cm_kd, f'{name} best KD', {'cmap': 'Blues'}, 'd'),
        # NOTE: a positive difference is an improvement on the diagonal but
        # additional errors off the diagonal; read the colours accordingly.
        (
            axes[2], diff, f'{name} KD - baseline',
            {'cmap': 'RdYlGn', 'vmin': -diff_limit, 'vmax': diff_limit}, '+d',
        ),
    ]
    for ax, mat, title, imshow_kwargs, number_format in panels:
        im = ax.imshow(mat.values, **imshow_kwargs)
        ax.set_title(title)
        ax.set_xticks(np.arange(len(LABELS)))
        ax.set_yticks(np.arange(len(LABELS)))
        ax.set_xticklabels(LABELS, rotation=30)
        ax.set_yticklabels(LABELS)
        # Annotate each cell; differences are shown with an explicit sign.
        for i in range(mat.shape[0]):
            for j in range(mat.shape[1]):
                ax.text(j, i, format(int(mat.values[i, j]), number_format), ha='center', va='center', fontsize=8)
        fig.colorbar(im, ax=ax, fraction=0.046)

    plt.tight_layout()
    plt.show()


## G) Final Takeaways


In [None]:
# Top row of each KD table (tables are sorted by test_f1 descending).
# NOTE: this rebinds best_cbam, which section B used for the best phase-1 CBAM
# run; from here on it refers to the best CBAM *KD* run.
best_tiny = tiny_table.iloc[0]
best_cbam = cbam_table.iloc[0]

# Report completed/expected counts from the audit table instead of printing
# the expected count against a hard-coded total.
progress = summary.set_index('stage')
phase1_progress = progress.loc['phase1']
kd_progress = progress.loc['phase2_kd']

# Derive the "which student benefits more" statement from the data rather
# than asserting it unconditionally.
gap_tiny = float(best_tiny['teacher_gap_closed'])
gap_cbam = float(best_cbam['teacher_gap_closed'])
if gap_cbam > gap_tiny:
    stronger_on = 'CBAM student variants'
elif gap_cbam < gap_tiny:
    stronger_on = 'plain TinyCNN student variants'
else:
    stronger_on = 'neither student family (tie or undefined teacher gap)'

takeaways = [
    f"Phase-1 and KD campaigns are complete: {int(phase1_progress['completed'])}/{int(phase1_progress['expected'])} and {int(kd_progress['completed'])}/{int(kd_progress['expected'])} expected runs matched.",
    f"Best TinyCNN KD run: {best_tiny['run_name']} (test_f1={best_tiny['test_f1']:.4f}, delta={best_tiny['delta_vs_baseline']:+.4f}).",
    f"Best TinyCNN_CBAM KD run: {best_cbam['run_name']} (test_f1={best_cbam['test_f1']:.4f}, delta={best_cbam['delta_vs_baseline']:+.4f}).",
    f"KD benefit is stronger on {stronger_on} (teacher-gap closure of best setting: tinycnn={gap_tiny:.1%}, tinycnn_cbam={gap_cbam:.1%}).",
    "Not every KD setting helps equally; the alpha/tau surface shows clear sensitivity.",
    "Use leaderboard + confusion matrices jointly: global gains can still hide class-specific tradeoffs.",
]

for i, line in enumerate(takeaways, 1):
    print(f"{i}. {line}")
