# K-Fold Multi-Seed Training

Train k-fold models for seeds 43 and 44 on folds 0, 1, 3 and 4 (8 models), plus one probe run on fold 2 with seed 43, for 9 models in total.

**Why**: Mixing vanilla (10 seeds) + kfold (4 folds, seed 42) hit PB 0.2895. More kfold models = more data diversity.

**Time**: ~4.5 hours (9 x ~30 min on T4)

**Checkpoints**: `logs/gru_kfold_v1_fold{F}_seed{S}.pt` (`F` = fold index, `S` = seed)

In [None]:
# Cell 1: Mount Drive, download data, clone repo
#
# Prepares the Colab runtime: mounts Google Drive, installs a pinned Kaggle
# CLI, writes Kaggle credentials, downloads and unzips the dataset, makes a
# fresh clone of the repository, and symlinks the parquet files into the
# repo's datasets/ directory.
import os, json, subprocess

from google.colab import drive
drive.mount('/content/drive')
# Drive folder used later in this notebook as the checkpoint destination.
os.makedirs('/content/drive/MyDrive/wunderfund', exist_ok=True)

# Pin the Kaggle CLI version and force a reinstall over whatever is present.
!pip install -q kaggle==1.6.14 --force-reinstall
os.makedirs('/root/.kaggle', exist_ok=True)
# NOTE: "FILL_IN" is a placeholder -- replace it with a real Kaggle API key
# before running this cell.
# NOTE(review): avoid saving/committing the notebook with a real key in it.
with open('/root/.kaggle/kaggle.json', 'w') as f:
    json.dump({"username": "vincentvdo6", "key": "FILL_IN"}, f)
# Restrict the credentials file to owner read/write only.
os.chmod('/root/.kaggle/kaggle.json', 0o600)

os.makedirs('/content/data', exist_ok=True)
# `!` shell commands do not raise on failure; the asserts further down are
# what actually stop the cell if the download or unzip did not work.
!kaggle datasets download -d vincentvdo6/wunderfund-predictorium -p /content/data/ --force
!unzip -o -q /content/data/wunderfund-predictorium.zip -d /content/data/
# Visual check: list the extracted parquet files.
!ls /content/data/*.parquet

REPO = "/content/competition_package"
os.chdir("/content")
# Remove any previous clone so the clone below starts from a clean directory
# (check=False: the exit status of rm is ignored).
subprocess.run(["rm", "-rf", REPO], check=False)
subprocess.run(["git", "clone", "https://github.com/vincentvdo6/competition_package.git", REPO], check=True)
# From here on, relative paths (datasets/, logs/) are inside the clone.
os.chdir(REPO)
os.makedirs("datasets", exist_ok=True)
os.makedirs("logs", exist_ok=True)

# Symlink the downloaded parquet files into the repo instead of copying them.
subprocess.run(["ln", "-sf", "/content/data/train.parquet", "datasets/train.parquet"], check=True)
subprocess.run(["ln", "-sf", "/content/data/valid.parquet", "datasets/valid.parquet"], check=True)

# os.path.exists() is False for a dangling symlink, so these also fail when
# the link target was never created.
assert os.path.exists("datasets/train.parquet")
assert os.path.exists("datasets/valid.parquet")

# Print the checked-out commit and the GPU name for the run log.
commit = subprocess.check_output(["git", "rev-parse", "--short", "HEAD"], text=True).strip()
print(f"Commit: {commit}")
print(f"GPU: {subprocess.check_output(['nvidia-smi', '--query-gpu=name', '--format=csv,noheader'], text=True).strip()}")
print("Ready!")

In [None]:
# Cell 2: Train 9 kfold models (seeds 43+44 on folds 0/1/3/4 + fold2 probe)
#
# Runs scripts/train_kfold.py once per (fold, seed) job, streaming its output
# into the notebook, then prints a summary table read back from the saved
# checkpoints. Jobs whose checkpoint already exists are skipped, so the cell
# can be re-run to resume an interrupted batch.
import subprocess, sys, os, time, torch
os.chdir("/content/competition_package")

CONFIG = "configs/gru_kfold_v1.yaml"
N_FOLDS = 5

# 8 main models + 1 fold2 probe
JOBS = [(fold, seed) for seed in [43, 44] for fold in [0, 1, 3, 4]]
JOBS.append((2, 43))  # fold 2 probe

print(f"KFOLD MULTI-SEED: {len(JOBS)} models")
print(f"Jobs: {JOBS}")
print(f"{'='*70}\n")

# (fold, seed) -> non-zero exit code. Kept so the summary can flag a job that
# failed even if it left a checkpoint file behind.
failed = {}

batch_start = time.time()
for i, (fold, seed) in enumerate(JOBS):
    ckpt = f"logs/gru_kfold_v1_fold{fold}_seed{seed}.pt"
    if os.path.exists(ckpt):
        print(f"  [{i+1}/{len(JOBS)}] fold {fold} seed {seed}: exists -- skip")
        continue
    tag = "PROBE" if fold == 2 else "MAIN"
    print(f"\n{'#'*70}")
    print(f"# [{i+1}/{len(JOBS)}] {tag}: fold {fold}, seed {seed}")
    print(f"{'#'*70}\n", flush=True)
    t0 = time.time()
    # The context manager closes the stdout pipe and waits for the child,
    # even if streaming is interrupted. stderr is merged into stdout so the
    # log appears in order.
    with subprocess.Popen(
        [sys.executable, "-u", "scripts/train_kfold.py",
         "--config", CONFIG,
         "--fold", str(fold),
         "--n-folds", str(N_FOLDS),
         "--seed", str(seed),
         "--device", "cuda"],
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
    ) as proc:
        for line in proc.stdout:
            print(line, end="", flush=True)
    rc = proc.returncode
    elapsed = time.time() - t0
    if rc != 0:
        # Best effort: record the failure and carry on with the next job.
        failed[(fold, seed)] = rc
        print(f"  ERROR: fold {fold} seed {seed} failed (rc={rc})")
    else:
        print(f"  Done in {elapsed:.0f}s ({elapsed/60:.1f}min)")

# Summary
print(f"\n{'='*70}")
print("KFOLD MULTI-SEED RESULTS")
print(f"{'='*70}")
print(f"  {'Fold':<6} {'Seed':<6} {'Val':>10} {'Epoch':>8}")
print(f"  {'-'*35}")

for fold, seed in JOBS:
    pt = f"logs/gru_kfold_v1_fold{fold}_seed{seed}.pt"
    note = f" [FAILED rc={failed[(fold, seed)]}]" if (fold, seed) in failed else ""
    if not os.path.exists(pt):
        print(f"  fold {fold:<5} s{seed:<5} MISSING{note}")
        continue
    # weights_only=False unpickles arbitrary objects; these files are assumed
    # to have been written by the training script run above.
    ckpt = torch.load(pt, map_location="cpu", weights_only=False)
    # A stored None would crash float() / the ">8" format spec, so fall back
    # to the same placeholders used when the key is absent.
    best = ckpt.get("best_score", 0)
    score = 0.0 if best is None else float(best)
    epoch = ckpt.get("best_epoch", "?")
    if epoch is None:
        epoch = "?"
    tag = " (PROBE)" if fold == 2 else ""
    print(f"  fold {fold:<5} s{seed:<5} {score:>10.4f} {epoch:>8}{tag}{note}")

batch_elapsed = time.time() - batch_start
print(f"\nTotal time: {batch_elapsed/60:.0f} min ({batch_elapsed/3600:.1f} hours)")

In [None]:
# Cell 3: Save all kfold checkpoints to Drive + zip for download
#
# For every final kfold checkpoint under logs/, writes a reduced copy (model
# weights, config, best score/epoch) to Google Drive, then bundles the
# original checkpoint files into a single zip under /content.
import os, torch, glob, zipfile
os.chdir("/content/competition_package")

DRIVE_DIR = "/content/drive/MyDrive/wunderfund"
os.makedirs(DRIVE_DIR, exist_ok=True)

# Find all kfold checkpoints (all seeds, all folds); files with '_epoch' in
# their path are excluded.
pattern = "logs/gru_kfold_v1_fold*_seed*.pt"
pts = sorted(p for p in glob.glob(pattern) if '_epoch' not in p)

print(f"Found {len(pts)} kfold checkpoints")
print(f"{'='*70}")

# Save slim checkpoints to Drive
saved = 0
for pt in pts:
    # weights_only=False unpickles arbitrary objects; these files are assumed
    # to come from this notebook's own training run.
    ckpt = torch.load(pt, map_location="cpu", weights_only=False)
    best_score = ckpt.get("best_score", None)
    best_epoch = ckpt.get("best_epoch", None)
    slim = {
        "model_state_dict": ckpt["model_state_dict"],
        "config": ckpt.get("config", {}),
        "best_score": best_score,
        "best_epoch": best_epoch,
    }
    dst = f"{DRIVE_DIR}/{os.path.basename(pt)}"
    torch.save(slim, dst)
    saved += 1
    # best_score may be absent or None; float(None) would raise, so display
    # 0 / "?" in that case instead of aborting the remaining saves.
    score = 0.0 if best_score is None else float(best_score)
    epoch = "?" if best_epoch is None else best_epoch
    print(f"  {os.path.basename(pt)}: val={score:.4f} ep={epoch}")

print(f"\nSaved {saved} checkpoints to Drive")

# Zip for download. Note: this archives the original files from logs/, not
# the slim copies written to Drive above.
zip_name = "/content/kfold_multiseed_checkpoints.zip"
with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zf:
    for pt in pts:
        zf.write(pt, os.path.basename(pt))
sz = os.path.getsize(zip_name) / 1e6
print(f"\nDownload zip: {zip_name} ({sz:.1f}MB)")
print("Done!")