# Aggressive Phase: K-Fold + Simpler Models + Revised FeatDrop

**Batch 3**: K-fold ensemble (5 folds x 1 seed) — ~2.5 hours
**Batch 4**: Simpler models (h=32/1L, h=64/1L) + Revised FeatDrop (C, D) — ~5 hours

K-fold merges train and val, then splits the merged data by `seq_ix`, so each fold's model sees val-like data during training.
This directly addresses the train/val distribution shift (AUC=0.959).

**Kill gates** (an experiment passes only if its mean score exceeds the threshold):
- K-fold: average held-out-fold val score > 0.2650 (a lower bar, since folds mix train+val)
- Simpler/FeatDrop: mean val over 3 seeds > 0.2709

In [None]:
# Cell 1: Mount Drive, download data, clone repo
#
# One-time Colab environment setup. After this cell the working directory is
# the freshly cloned repo, with datasets/train.parquet and
# datasets/valid.parquet symlinked to the downloaded Kaggle files.
# Lines starting with "!" are IPython shell escapes, not Python.
import os, json, subprocess

# Mount Google Drive and make sure the output folder exists
# (Cell 4 writes its checkpoints to this same path).
from google.colab import drive
drive.mount('/content/drive')
os.makedirs('/content/drive/MyDrive/wunderfund', exist_ok=True)

# Install a pinned Kaggle CLI and write its credentials file.
# NOTE: "FILL_IN" is a placeholder -- replace it with a real API key before
# running, and never commit the real key.
!pip install -q kaggle==1.6.14 --force-reinstall
os.makedirs('/root/.kaggle', exist_ok=True)
with open('/root/.kaggle/kaggle.json', 'w') as f:
    json.dump({"username": "vincentvdo6", "key": "FILL_IN"}, f)
# Restrict the credentials file to owner read/write.
os.chmod('/root/.kaggle/kaggle.json', 0o600)

# Download and unpack the competition dataset, then list the parquet files
# as a visual sanity check.
os.makedirs('/content/data', exist_ok=True)
!kaggle datasets download -d vincentvdo6/wunderfund-predictorium -p /content/data/ --force
!unzip -o -q /content/data/wunderfund-predictorium.zip -d /content/data/
!ls /content/data/*.parquet

# Start from a clean clone every time: any previous checkout is deleted first
# (check=False because the directory may not exist yet).
REPO = "/content/competition_package"
os.chdir("/content")
subprocess.run(["rm", "-rf", REPO], check=False)
subprocess.run(["git", "clone", "https://github.com/vincentvdo6/competition_package.git", REPO], check=True)
os.chdir(REPO)
os.makedirs("datasets", exist_ok=True)
os.makedirs("logs", exist_ok=True)

# Symlink the downloaded data into the repo's datasets/ folder
# (-f replaces an existing link).
subprocess.run(["ln", "-sf", "/content/data/train.parquet", "datasets/train.parquet"], check=True)
subprocess.run(["ln", "-sf", "/content/data/valid.parquet", "datasets/valid.parquet"], check=True)

# os.path.exists() follows symlinks, so these fail if the download/unzip above
# did not actually produce the parquet files (the "!" commands do not raise).
assert os.path.exists("datasets/train.parquet")
assert os.path.exists("datasets/valid.parquet")

# Record the exact code version and GPU model in the cell output.
commit = subprocess.check_output(["git", "rev-parse", "--short", "HEAD"], text=True).strip()
print(f"Commit: {commit}")
print(f"GPU: {subprocess.check_output(['nvidia-smi', '--query-gpu=name', '--format=csv,noheader'], text=True).strip()}")
print("Ready!")

In [None]:
# Cell 2: Batch 3 — K-Fold Ensemble (5 folds x 1 seed)
# ~2.5 hours: 5 folds x ~30 min each
#
# Trains one model per fold with scripts/train_kfold.py, skipping folds whose
# checkpoint already exists so the cell can be re-run after a disconnect.
# It then reads best_score / best_epoch from each fold checkpoint and compares
# the mean against KFOLD_GATE. The verdict is only PASS/FAIL when every fold
# trained cleanly and produced a usable score; otherwise it is INCOMPLETE.
import subprocess, sys, os, time, torch
os.chdir("/content/competition_package")

CONFIG = "configs/gru_kfold_v1.yaml"
N_FOLDS = 5
SEED = 42
KFOLD_GATE = 0.2650  # Lower bar: folds mix train+val distributions


def _run_and_stream(cmd):
    """Run *cmd*, echo its combined stdout/stderr line by line, return its exit code."""
    # The context manager closes the pipe and reaps the child even if the
    # cell is interrupted while output is being streamed.
    with subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
    ) as proc:
        for line in proc.stdout:
            print(line, end="", flush=True)
    return proc.returncode


print(f"K-FOLD TRAINING: {N_FOLDS} folds, seed {SEED}")
print(f"Kill gate: avg fold val > {KFOLD_GATE}")
print(f"{'='*70}\n")

batch_start = time.time()
failed_folds = []  # folds whose training process exited non-zero in this run
for fold in range(N_FOLDS):
    ckpt_path = f"logs/gru_kfold_v1_fold{fold}_seed{SEED}.pt"
    if os.path.exists(ckpt_path):
        print(f"  fold {fold}: exists -- skip")
        continue
    print(f"\n{'#'*70}")
    print(f"# FOLD {fold}/{N_FOLDS}")
    print(f"{'#'*70}\n", flush=True)
    t0 = time.time()
    rc = _run_and_stream(
        [sys.executable, "-u", "scripts/train_kfold.py",
         "--config", CONFIG,
         "--fold", str(fold),
         "--n-folds", str(N_FOLDS),
         "--seed", str(SEED),
         "--device", "cuda",
         "--eval-original-val"]
    )
    elapsed = time.time() - t0
    if rc != 0:
        failed_folds.append(fold)
        print(f"  ERROR: fold {fold} failed (rc={rc})")
    else:
        print(f"  Done in {elapsed:.0f}s ({elapsed/60:.1f}min)")

# Evaluate all folds
print(f"\n{'='*70}")
print(f"K-FOLD RESULTS")
print(f"{'='*70}")
print(f"  {'Fold':<6} {'Fold Val':>10} {'Epoch':>8}")
print(f"  {'-'*30}")

fold_scores = []
unusable_folds = set(failed_folds)
for fold in range(N_FOLDS):
    pt = f"logs/gru_kfold_v1_fold{fold}_seed{SEED}.pt"
    if not os.path.exists(pt):
        unusable_folds.add(fold)
        print(f"  fold {fold}: MISSING")
        continue
    # weights_only=False unpickles arbitrary objects; only acceptable because
    # these files are expected to come from this notebook's own training runs.
    ckpt = torch.load(pt, map_location="cpu", weights_only=False)
    raw_score = ckpt.get("best_score")
    if raw_score is None:
        # Do not average in a made-up 0.0 for a checkpoint with no score.
        unusable_folds.add(fold)
        print(f"  fold {fold}: NO best_score in checkpoint")
        continue
    score = float(raw_score)
    epoch = ckpt.get("best_epoch", "?")
    fold_scores.append(score)
    # !s keeps the alignment spec valid even if best_epoch is stored as None.
    print(f"  fold {fold:<5} {score:>10.4f} {epoch!s:>8}")

if fold_scores:
    mean_val = sum(fold_scores) / len(fold_scores)
    complete = not unusable_folds
    passed = complete and mean_val > KFOLD_GATE
    if not complete:
        verdict = "INCOMPLETE"
    else:
        verdict = "PASS" if passed else "FAIL"
    print(f"\n  Mean fold val: {mean_val:.4f}  |  Gate ({KFOLD_GATE}): {verdict}")
    if not complete:
        print(f"  WARNING: mean covers {len(fold_scores)}/{N_FOLDS} folds; "
              f"failed or unusable folds: {sorted(unusable_folds)}")
else:
    print("\n  No usable fold checkpoints -- gate not evaluated")

batch_elapsed = time.time() - batch_start
print(f"\nBatch 3 complete ({batch_elapsed/60:.0f} min)")

In [None]:
# Cell 3: Batch 4 — Simpler Models + Revised FeatDrop
# ~5 hours: 4 configs x 3 seeds x ~30 min each
# NOTE(review): 12 runs at ~30 min each is ~6 h; the ~5 h figure presumably
# assumes the smaller models finish faster -- confirm.
#
# For every config in BATCH_4, trains one model per seed with scripts/train.py,
# skipping seeds whose checkpoint already exists so the cell can be re-run.
# It then prints each seed's best_score next to the baseline for that seed and
# checks the mean against GATE. An experiment is only PASS/FAIL when every
# seed trained cleanly and produced a usable score; otherwise it is INCOMPLETE.
import subprocess, sys, os, time, torch
os.chdir("/content/competition_package")

BATCH_4 = [
    "configs/gru_h32_1L.yaml",
    "configs/gru_h64_1L.yaml",
    "configs/gru_featdrop_C.yaml",
    "configs/gru_featdrop_D.yaml",
]
SEEDS = [42, 43, 44]
BASELINE_VALS = {42: 0.2649, 43: 0.2737, 44: 0.2690}
GATE = 0.2709
RESULTS = {}


def _run_and_stream(cmd):
    """Run *cmd*, echo its combined stdout/stderr line by line, return its exit code."""
    # The context manager closes the pipe and reaps the child even if the
    # cell is interrupted while output is being streamed.
    with subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
    ) as proc:
        for line in proc.stdout:
            print(line, end="", flush=True)
    return proc.returncode


batch_start = time.time()
for config_path in BATCH_4:
    exp_name = os.path.splitext(os.path.basename(config_path))[0]
    print(f"\n{'#'*70}")
    print(f"# EXPERIMENT: {exp_name}")
    print(f"{'#'*70}\n", flush=True)

    failed_seeds = []  # seeds whose training process exited non-zero in this run
    for seed in SEEDS:
        ckpt_path = f"logs/{exp_name}_seed{seed}.pt"
        if os.path.exists(ckpt_path):
            print(f"  seed {seed}: exists -- skip")
            continue
        print(f"\n  --- {exp_name} seed {seed} ---", flush=True)
        t0 = time.time()
        rc = _run_and_stream(
            [sys.executable, "-u", "scripts/train.py",
             "--config", config_path,
             "--seed", str(seed), "--device", "cuda"]
        )
        elapsed = time.time() - t0
        if rc != 0:
            failed_seeds.append(seed)
            print(f"  ERROR: seed {seed} failed (rc={rc})")
        else:
            print(f"  Done in {elapsed:.0f}s ({elapsed/60:.1f}min)")

    scores = []
    unusable_seeds = set(failed_seeds)
    print(f"\n  {'Seed':<6} {'Val':>10} {'Base':>10} {'Delta':>10}")
    print(f"  {'-'*40}")
    for seed in SEEDS:
        pt = f"logs/{exp_name}_seed{seed}.pt"
        if not os.path.exists(pt):
            unusable_seeds.add(seed)
            print(f"  s{seed}: MISSING")
            continue
        # weights_only=False unpickles arbitrary objects; only acceptable because
        # these files are expected to come from this notebook's own training runs.
        ckpt = torch.load(pt, map_location="cpu", weights_only=False)
        raw_score = ckpt.get("best_score")
        if raw_score is None:
            # Do not average in a made-up 0.0 for a checkpoint with no score.
            unusable_seeds.add(seed)
            print(f"  s{seed}: NO best_score in checkpoint")
            continue
        score = float(raw_score)
        epoch = ckpt.get("best_epoch", "?")
        baseline = BASELINE_VALS.get(seed, 0)
        scores.append(score)
        print(f"  s{seed:<5} {score:>10.4f} {baseline:>10.4f} {score - baseline:>+10.4f}  (ep {epoch})")

    if scores:
        mean_val = sum(scores) / len(scores)
        complete = not unusable_seeds
        passed = complete and mean_val > GATE
        RESULTS[exp_name] = {
            'mean_val': mean_val,
            'passed': passed,
            'scores': scores,
            'complete': complete,
        }
        if not complete:
            verdict = "INCOMPLETE"
        else:
            verdict = "PASS" if passed else "FAIL"
        print(f"\n  Mean: {mean_val:.4f}  |  Gate ({GATE}): {verdict}")
        if not complete:
            print(f"  WARNING: mean covers {len(scores)}/{len(SEEDS)} seeds; "
                  f"failed or unusable seeds: {sorted(unusable_seeds)}")
    else:
        print(f"\n  No usable checkpoints for {exp_name} -- gate not evaluated")
    print()

batch_elapsed = time.time() - batch_start
print(f"\n{'='*70}")
print(f"BATCH 4 COMPLETE ({batch_elapsed/60:.0f} min)")
print(f"{'='*70}")
for name, r in RESULTS.items():
    if not r['complete']:
        status = 'INCOMPLETE'
    else:
        status = 'PASS' if r['passed'] else 'FAIL'
    print(f"  {name:<30} mean={r['mean_val']:.4f}  [{status}]")

In [None]:
# Cell 4: Summary + Save ALL checkpoints to Drive
#
# Self-contained recap: re-reads every checkpoint from logs/ (so it works
# after a kernel restart), prints the gate verdicts, copies a slimmed-down
# version of each final checkpoint to Drive, and zips the originals.
import os, torch, glob, zipfile, json
os.chdir("/content/competition_package")

DRIVE_DIR = "/content/drive/MyDrive/wunderfund"
bar = "=" * 70


def _load(path):
    """Load a checkpoint dict onto the CPU."""
    return torch.load(path, map_location="cpu", weights_only=False)


print(bar)
print("AGGRESSIVE PHASE RESULTS")
print(bar)

# K-fold results
print(f"\n--- K-FOLD (5 folds x seed 42) ---")
kfold_scores = []
for fold_idx in range(5):
    fold_path = f"logs/gru_kfold_v1_fold{fold_idx}_seed42.pt"
    if not os.path.exists(fold_path):
        continue
    fold_ckpt = _load(fold_path)
    fold_score = float(fold_ckpt.get("best_score", 0))
    kfold_scores.append(fold_score)
    print(f"  fold {fold_idx}: {fold_score:.4f} (ep {fold_ckpt.get('best_epoch', '?')})")
if kfold_scores:
    km = sum(kfold_scores) / len(kfold_scores)
    kfold_verdict = 'PASS' if km > 0.2650 else 'FAIL'
    print(f"  Mean: {km:.4f}  |  Gate (0.2650): {kfold_verdict}")

# Batch 4 results
print(f"\n--- SIMPLER MODELS + FEATDROP C/D ---")
all_exps = ["gru_h32_1L", "gru_h64_1L", "gru_featdrop_C", "gru_featdrop_D"]
for exp_name in all_exps:
    seed_paths = [f"logs/{exp_name}_seed{seed}.pt" for seed in (42, 43, 44)]
    scores = [
        float(_load(path).get("best_score", 0))
        for path in seed_paths
        if os.path.exists(path)
    ]
    if not scores:
        print(f"  {exp_name:<30} MISSING")
        continue
    mean_val = sum(scores) / len(scores)
    verdict = 'PASS' if mean_val > 0.2709 else 'FAIL'
    print(f"  {exp_name:<30} mean={mean_val:.4f}  [{verdict}]")

patterns = ["logs/gru_kfold_v1_fold*_seed*.pt"] + [f"logs/{e}_seed*.pt" for e in all_exps]


def _final_checkpoints():
    """Yield matching checkpoint paths in pattern order, skipping per-epoch snapshots."""
    for pattern in patterns:
        for path in sorted(glob.glob(pattern)):
            if '_epoch' not in path:
                yield path


# Save all checkpoints to Drive
print(f"\n{bar}")
print("Saving checkpoints to Drive...")
saved = 0
for src in _final_checkpoints():
    full = _load(src)
    # Keep only the weights and the metadata needed to rebuild/score the model.
    slim = {
        "model_state_dict": full["model_state_dict"],
        "config": full.get("config", {}),
        "best_score": full.get("best_score", None),
        "best_epoch": full.get("best_epoch", None),
    }
    torch.save(slim, os.path.join(DRIVE_DIR, os.path.basename(src)))
    saved += 1
print(f"Saved {saved} checkpoints to Drive")

# Zip for download (the original, un-slimmed files, stored flat by basename)
zip_name = "/content/aggressive_phase_checkpoints.zip"
with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as archive:
    for src in _final_checkpoints():
        archive.write(src, os.path.basename(src))
sz = os.path.getsize(zip_name) / 1e6
print(f"Download zip: {zip_name} ({sz:.1f}MB)")
print("\nDone! Review results and decide what to submit.")