In [None]:
# Cell 0: Mount Drive, download data from Kaggle
import os, json

# Mount Drive for saving outputs
from google.colab import drive
drive.mount('/content/drive')
os.makedirs('/content/drive/MyDrive/wunderfund', exist_ok=True)

# Install pinned kaggle + set credentials
!pip install -q kaggle==1.6.14 --force-reinstall
os.makedirs('/root/.kaggle', exist_ok=True)
with open('/root/.kaggle/kaggle.json', 'w') as f:
    json.dump({"username": "vincentvdo6", "key": "KGAT_17c43012d9e77edf2c183a25acb1489b"}, f)
os.chmod('/root/.kaggle/kaggle.json', 0o600)

# Download + unzip dataset
os.makedirs('/content/data', exist_ok=True)
!kaggle datasets download -d vincentvdo6/wunderfund-predictorium -p /content/data/ --force
!unzip -o -q /content/data/wunderfund-predictorium.zip -d /content/data/
!ls /content/data/*.parquet

In [None]:
# Cell 1: Setup — clone repo, link data
import os, shutil, subprocess
REPO = "/content/competition_package"
REPO_URL = "https://github.com/vincentvdo6/competition_package.git"
DATA_DIR = "/content/data"

# Start from a fresh clone every time so the cell is safe to re-run.
os.chdir("/content")
shutil.rmtree(REPO, ignore_errors=True)
# check=True: stop here with a clear error if the clone fails, instead of
# continuing with a missing or partial repository.
subprocess.run(["git", "clone", REPO_URL, REPO], check=True)
os.chdir(REPO)
os.makedirs("datasets", exist_ok=True)
os.makedirs("logs", exist_ok=True)

# Link data from Kaggle download (replace any existing entry, like `ln -sf`)
for split in ("train", "valid"):
    link_path = f"datasets/{split}.parquet"
    if os.path.lexists(link_path):
        os.remove(link_path)
    os.symlink(f"{DATA_DIR}/{split}.parquet", link_path)

# Verify — os.path.exists follows symlinks, so a dangling link fails here
assert os.path.exists("datasets/train.parquet"), "train.parquet not found!"
assert os.path.exists("datasets/valid.parquet"), "valid.parquet not found!"
print("Commit:", subprocess.check_output(["git", "rev-parse", "--short", "HEAD"], text=True).strip())
print(f"GPU: {os.popen('nvidia-smi --query-gpu=name --format=csv,noheader').read().strip()}")
print("Ready!")

In [None]:
# Cell 2: Treatment — aux heads, 3 seeds
# Expected: ~5-7 min per seed (35 epochs max, early stopping)
import os, subprocess
os.chdir("/content/competition_package")

# Seeds whose training process exited with a non-zero code
treatment_failed = []

for seed in [42, 43, 44]:
    print(f"\n{'='*60}")
    print(f'Training gru_aux_heads_v1 seed {seed}')
    print(f"{'='*60}", flush=True)
    cmd = ['python', '-u', 'scripts/train.py',
           '--config', 'configs/gru_aux_heads_v1.yaml',
           '--seed', str(seed), '--device', 'cuda']
    # The context manager closes the pipe and waits for the process on exit,
    # even if streaming is interrupted. stderr is merged into stdout so all
    # training output is echoed line by line in order.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                          text=True, bufsize=1) as p:
        for line in p.stdout:
            print(line, end='')
    rc = p.returncode
    if rc != 0:
        print(f'ERROR: seed {seed} failed with exit code {rc}')
        treatment_failed.append(seed)

# A failed seed does not abort the remaining seeds, but the final message
# must not claim success when some runs produced no usable checkpoint.
if treatment_failed:
    print(f'\nTreatment training finished with FAILED seeds: {treatment_failed}')
else:
    print('\nTreatment training done!')

In [None]:
# Cell 3: Control — pearson_combined loss (current p1), 3 seeds
# Same seeds for direct comparison
import os, subprocess
os.chdir("/content/competition_package")

# Seeds whose training process exited with a non-zero code
control_failed = []

for seed in [42, 43, 44]:
    print(f"\n{'='*60}")
    print(f'Training gru_pearson_v1 (control) seed {seed}')
    print(f"{'='*60}", flush=True)
    cmd = ['python', '-u', 'scripts/train.py',
           '--config', 'configs/gru_pearson_v1.yaml',
           '--seed', str(seed), '--device', 'cuda']
    # The context manager closes the pipe and waits for the process on exit,
    # even if streaming is interrupted. stderr is merged into stdout so all
    # training output is echoed line by line in order.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                          text=True, bufsize=1) as p:
        for line in p.stdout:
            print(line, end='')
    rc = p.returncode
    if rc != 0:
        print(f'ERROR: seed {seed} failed with exit code {rc}')
        control_failed.append(seed)

# A failed seed does not abort the remaining seeds, but the final message
# must not claim success when some runs produced no usable checkpoint.
if control_failed:
    print(f'\nControl training finished with FAILED seeds: {control_failed}')
else:
    print('\nControl training done!')

In [None]:
# Cell 4: Evaluate all 6 models — treatment vs control
import os, torch, glob
os.chdir("/content/competition_package")

# Kill test criteria: mean delta >= +0.0010 AND at least 2 positive seeds
MIN_MEAN_DELTA = 0.0010
MIN_POSITIVE_SEEDS = 2


def load_best_scores(config_name):
    """Print and return the best validation score of each final checkpoint.

    Scans logs/<config_name>_seed*.pt, skipping periodic '_epoch' checkpoints
    and checkpoints that carry no 'best_score'.

    Args:
        config_name: checkpoint file-name prefix, e.g. 'gru_pearson_v1'.

    Returns:
        dict mapping the run suffix of the file name (the part after
        '<config_name>_', without the '.pt' extension) to the checkpoint's
        'best_score' as a float. The suffix is what identifies the same run
        (seed) across the two arms.
    """
    prefix = f'{config_name}_'
    scores = {}
    for pt in sorted(glob.glob(f'logs/{config_name}_seed*.pt')):
        if '_epoch' in pt:
            continue  # Skip periodic checkpoints
        # weights_only=False unpickles arbitrary objects: only load checkpoints
        # produced by this project's own training runs.
        ckpt = torch.load(pt, map_location='cpu', weights_only=False)
        name = os.path.basename(pt)
        best = ckpt.get('best_score')
        if best is None:
            # Do not silently count a missing score as 0.0 in the comparison.
            print(f'  {name}: WARNING: no best_score in checkpoint, skipped')
            continue
        score = float(best)
        epoch = ckpt.get('best_epoch', 0)
        scores[name[len(prefix):-len('.pt')]] = score
        print(f'  {name}: val={score:.4f} (best epoch {epoch})')
    return scores


print('--- Treatment (aux heads) ---')
treatment_by_run = load_best_scores('gru_aux_heads_v1')

print('\n--- Control (pearson_combined / p1) ---')
control_by_run = load_best_scores('gru_pearson_v1')

# Pair runs by their file-name suffix (seed), not by list position: if one arm
# is missing a seed, positional pairing would compare different seeds.
paired_runs = sorted(set(treatment_by_run) & set(control_by_run))
unpaired_runs = sorted(set(treatment_by_run) ^ set(control_by_run))
if unpaired_runs:
    print(f'\nWARNING: runs present in only one arm, excluded from comparison: {unpaired_runs}')

treatment_scores = [treatment_by_run[run] for run in paired_runs]
control_scores = [control_by_run[run] for run in paired_runs]

if paired_runs:
    n_pairs = len(paired_runs)
    t_mean = sum(treatment_scores) / n_pairs
    c_mean = sum(control_scores) / n_pairs
    deltas = [t - c for t, c in zip(treatment_scores, control_scores)]
    mean_delta = sum(deltas) / n_pairs
    n_positive = sum(1 for d in deltas if d > 0)

    print(f'\n{"="*60}')
    print(f'KILL TEST RESULTS')
    print(f'{"="*60}')
    print(f'Paired runs:    {paired_runs}')
    print(f'Treatment mean: {t_mean:.4f}')
    print(f'Control mean:   {c_mean:.4f}')
    print(f'Mean delta:     {mean_delta:+.4f}')
    print(f'Per-seed deltas: {["{:+.4f}".format(d) for d in deltas]}')
    print(f'Positive seeds: {n_positive}/{n_pairs}')
    print()

    pass_mean = mean_delta >= MIN_MEAN_DELTA
    pass_count = n_positive >= MIN_POSITIVE_SEEDS
    if pass_mean and pass_count:
        print('PASS! Proceed to seed expansion.')
    elif pass_mean:
        print(f'MARGINAL: mean delta OK ({mean_delta:+.4f}) but only {n_positive}/{n_pairs} positive.')
    elif pass_count:
        print(f'MARGINAL: {n_positive}/{n_pairs} positive but mean delta too small ({mean_delta:+.4f}).')
    else:
        print(f'FAIL: mean delta {mean_delta:+.4f} < {MIN_MEAN_DELTA:.4f} and {n_positive}/{n_pairs} positive.')
else:
    print('ERROR: Missing checkpoints! Check training output above.')

In [None]:
# Cell 5: Training curves — compare treatment vs control
import json, glob, os
os.chdir("/content/competition_package")

for pattern, label in [
    ('logs/training_history_gru_aux_heads_v1*.json', 'Treatment'),
    ('logs/training_history_gru_pearson_v1*.json', 'Control'),
]:
    print(f'\n--- {label} ---')
    for hist_file in sorted(glob.glob(pattern)):
        with open(hist_file) as f:
            hist = json.load(f)
        name = os.path.basename(hist_file).replace('training_history_', '').replace('.json', '')

        # One entry per validated epoch; each entry provides 'avg', 't0' and 't1'.
        val_scores = hist.get('val_scores') or []
        if not val_scores:
            # A run that stopped before its first validation has nothing to
            # summarise; report it instead of crashing on max([]).
            print(f'  {name}: no validation scores recorded')
            continue

        scores = [s['avg'] for s in val_scores]
        t0_scores = [s['t0'] for s in val_scores]
        t1_scores = [s['t1'] for s in val_scores]
        best_avg = max(scores)
        best_idx = scores.index(best_avg)  # first epoch reaching the best average
        print(f'  {name}:')
        print(f'    Epochs: {len(scores)}, Best avg: {best_avg:.4f} at epoch {best_idx+1}')
        print(f'    Best t0: {t0_scores[best_idx]:.4f}, Best t1: {t1_scores[best_idx]:.4f}')
        # The denominator is clamped to 1e-8 to avoid dividing by zero.
        print(f'    t0/t1 ratio: {t0_scores[best_idx]/max(t1_scores[best_idx], 1e-8):.2f}')
        print(f'    Last 5 avg: {["{:.4f}".format(s) for s in scores[-5:]]}')

In [None]:
# Cell 6: Strip checkpoints + zip + save to Drive
# ONLY RUN AFTER REVIEWING KILL TEST RESULTS
import os, torch, glob, shutil
os.chdir("/content/competition_package")

SLIM_DIR = 'logs/slim'
ARCHIVE_BASE = '/content/aux_heads_kill_test'
DRIVE_ROOT = '/content/drive/MyDrive'
DRIVE_DIR = f'{DRIVE_ROOT}/wunderfund'

# Final checkpoints of both arms; periodic '_epoch' checkpoints are skipped.
checkpoints = [
    pt for pt in sorted(
        glob.glob('logs/gru_aux_heads_v1_*.pt') +
        glob.glob('logs/gru_pearson_v1_*.pt')
    )
    if '_epoch' not in pt
]
# Fail before touching Drive: an empty archive would overwrite the previously
# saved aux_heads_kill_test.zip with nothing.
if not checkpoints:
    raise RuntimeError('No final checkpoints found in logs/. Check training output above.')
if not os.path.isdir(DRIVE_ROOT):
    raise RuntimeError('Google Drive is not mounted at /content/drive. Re-run Cell 0 first.')

os.makedirs(SLIM_DIR, exist_ok=True)

for pt in checkpoints:
    # weights_only=False unpickles arbitrary objects: only load checkpoints
    # produced by this project's own training runs.
    ckpt = torch.load(pt, map_location='cpu', weights_only=False)
    # Keep only the model weights, config and best score; everything else
    # stored in the checkpoint is dropped.
    slim = {
        'model_state_dict': ckpt['model_state_dict'],
        'config': ckpt.get('config', {}),
        'best_score': ckpt.get('best_score', None),
    }
    out = f'{SLIM_DIR}/{os.path.basename(pt)}'
    torch.save(slim, out)
    orig = os.path.getsize(pt) / 1e6
    new = os.path.getsize(out) / 1e6
    print(f'{os.path.basename(pt)}: {orig:.1f}MB -> {new:.1f}MB')

# Copy normalizers
for npz in sorted(
    glob.glob('logs/normalizer_gru_aux_heads*.npz') +
    glob.glob('logs/normalizer_gru_pearson_v1*.npz')
):
    shutil.copy(npz, f'{SLIM_DIR}/{os.path.basename(npz)}')
    print(f'Copied {os.path.basename(npz)}')

slim_files = sorted(os.listdir(SLIM_DIR))
print(f'\n--- logs/slim/ contents ({len(slim_files)} files) ---')
total_mb = 0
for fname in slim_files:
    sz = os.path.getsize(f'{SLIM_DIR}/{fname}') / 1e6
    total_mb += sz
    print(f'  {fname}: {sz:.1f}MB')
print(f'  Total: {total_mb:.1f}MB')

# Zip for download
shutil.make_archive(ARCHIVE_BASE, 'zip',
                    '/content/competition_package/logs/slim')
sz = os.path.getsize(f'{ARCHIVE_BASE}.zip') / 1e6
print(f'\naux_heads_kill_test.zip: {sz:.1f}MB')

# Save to Drive
os.makedirs(DRIVE_DIR, exist_ok=True)
shutil.copy(f'{ARCHIVE_BASE}.zip',
            f'{DRIVE_DIR}/aux_heads_kill_test.zip')
print('Saved to Drive: MyDrive/wunderfund/aux_heads_kill_test.zip')