# Seed Expansion + Parity Swap Discovery
**Goal**: Train new vanilla GRU seeds (s65-s84), then run per-target swap discovery to find the best 2-slot submission candidates.

**Requires in Drive**: `MyDrive/wunderfund/vanilla_seeds.zip` (existing s42-s64 checkpoints)

**Recipe**: gru_parity_v1 (vanilla h=64, 3L, raw32, MSE) — same as s42-s64.

**Pipeline**:
1. Download data from Kaggle
2. Clone repo + restore existing checkpoints from Drive
3. Train new seeds s65-s84
4. Evaluate all seeds (read the best validation score stored in each checkpoint), rank by val score
5. Cache predictions + run swap discovery
6. Strip + zip + save to Drive

In [None]:
# Cell 0: Mount Drive, download data from Kaggle
#
# Mounts Google Drive, writes Kaggle API credentials to /root/.kaggle, and
# downloads + unzips the competition dataset into /content/data.
# Lines starting with "!" are IPython shell magics, so this cell only runs
# inside a notebook (Colab) kernel, not as a plain Python script.
import os, json

from google.colab import drive
drive.mount('/content/drive')
# Drive folder that the later cells read checkpoints from and save results to.
os.makedirs('/content/drive/MyDrive/wunderfund', exist_ok=True)

# Install a pinned version (1.6.14) of the Kaggle CLI.
!pip install -q kaggle==1.6.14 --force-reinstall
os.makedirs('/root/.kaggle', exist_ok=True)
# NOTE: "FILL_IN" is a placeholder and is written to kaggle.json verbatim;
# replace it with a real Kaggle API key before running this cell.
with open('/root/.kaggle/kaggle.json', 'w') as f:
    json.dump({"username": "vincentvdo6", "key": "FILL_IN"}, f)
# Restrict the credentials file to owner read/write only.
os.chmod('/root/.kaggle/kaggle.json', 0o600)

os.makedirs('/content/data', exist_ok=True)
# Download the dataset archive, extract it in place, then list the parquet
# files as a visual check that the download succeeded.
!kaggle datasets download -d vincentvdo6/wunderfund-predictorium -p /content/data/ --force
!unzip -o -q /content/data/wunderfund-predictorium.zip -d /content/data/
!ls /content/data/*.parquet

In [None]:
# Cell 1: Clone repo, symlink data, restore existing checkpoints
#
# Produces a fresh clone of the competition repo under /content, links the
# downloaded parquet files into datasets/, and unpacks previously trained
# checkpoints from Drive into logs/vanilla_all/ when the archive exists.
import os, subprocess, shutil
REPO = "/content/competition_package"

# Remove any earlier clone first so `git clone` always targets an empty path.
os.chdir("/content")
subprocess.run(["rm", "-rf", REPO], check=False)
subprocess.run(
    ["git", "clone", "https://github.com/vincentvdo6/competition_package.git", REPO],
    check=True,
)
os.chdir(REPO)
for folder in ("datasets", "logs", "logs/vanilla_all"):
    os.makedirs(folder, exist_ok=True)

# Link both splits first, then verify both, so the order of side effects
# matches a straight-line script.
splits = ("train", "valid")
for split in splits:
    subprocess.run(
        ["ln", "-sf", f"/content/data/{split}.parquet", f"datasets/{split}.parquet"],
        check=True,
    )
for split in splits:
    assert os.path.exists(f"datasets/{split}.parquet"), f"{split}.parquet not found!"

# Restore existing checkpoints (s42-s64) from Drive
vanilla_zip = "/content/drive/MyDrive/wunderfund/vanilla_seeds.zip"
if not os.path.exists(vanilla_zip):
    print("WARNING: vanilla_seeds.zip not found in Drive — no existing checkpoints restored")
else:
    subprocess.run(["unzip", "-o", "-q", vanilla_zip, "-d", "logs/vanilla_all/"], check=True)
    # Count final checkpoints only; per-epoch snapshots carry "_epoch" in the name.
    n = sum(
        1
        for f in os.listdir("logs/vanilla_all")
        if f.endswith(".pt") and "_epoch" not in f
    )
    print(f"Restored {n} existing checkpoints from Drive")

commit = subprocess.check_output(["git", "rev-parse", "--short", "HEAD"], text=True).strip()
print(f"Commit: {commit}")
gpu_name = subprocess.check_output(
    ['nvidia-smi', '--query-gpu=name', '--format=csv,noheader'], text=True
).strip()
print(f"GPU: {gpu_name}")
print("Ready!")

In [None]:
# Cell 2: Train new seeds s65-s84
#
# Runs scripts/train.py once per seed, streaming the trainer's output into the
# notebook. Seeds whose checkpoint already exists in logs/vanilla_all/ are
# skipped, so the cell can be re-run after an interruption. A failing seed
# does not stop the loop; all problems are summarised at the end.
import os, subprocess, sys
os.chdir("/content/competition_package")

CONFIG = "configs/gru_parity_v1.yaml"
SEEDS = list(range(65, 85))  # s65-s84, 20 new seeds

print("=== SEED EXPANSION s65-s84 ===")
print(f"Config: {CONFIG}")
print(f"Seeds: {SEEDS[0]}-{SEEDS[-1]} ({len(SEEDS)} seeds)")
print("=" * 60, flush=True)

failed_seeds = []   # trainer exited with a non-zero return code
missing_ckpts = []  # trainer exited 0 but no checkpoint was found to move

for seed in SEEDS:
    ckpt = f"logs/vanilla_all/gru_parity_v1_seed{seed}.pt"
    if os.path.exists(ckpt):
        print(f"seed {seed}: already trained — skip")
        continue
    print(f"\n{'='*60}")
    print(f"Training gru_parity_v1 seed {seed}")
    print(f"{'='*60}", flush=True)
    # The context manager closes the stdout pipe and waits for the child on
    # exit, so proc.returncode is populated once the block is left.
    with subprocess.Popen(
        [sys.executable, "-u", "scripts/train.py",
         "--config", CONFIG,
         "--seed", str(seed), "--device", "cuda"],
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
    ) as proc:
        for line in proc.stdout:
            print(line, end="", flush=True)
    if proc.returncode != 0:
        print(f"ERROR: seed {seed} failed with rc={proc.returncode}")
        failed_seeds.append(seed)
        continue
    # Move checkpoint to vanilla_all/
    src = f"logs/gru_parity_v1_seed{seed}.pt"
    if os.path.exists(src):
        os.rename(src, ckpt)
    else:
        # Without this warning the seed would silently be absent from every
        # later cell, which only looks in logs/vanilla_all/.
        print(f"WARNING: seed {seed} finished but {src} was not found — checkpoint not moved")
        missing_ckpts.append(seed)

if failed_seeds or missing_ckpts:
    print(f"\nFinished with problems — failed seeds: {failed_seeds}, "
          f"missing checkpoints: {missing_ckpts}")
else:
    print("\nAll seeds done!")

In [None]:
# Cell 3: Evaluate all seeds (s42-s84), rank by val score
#
# Reads the "best_score" / "best_epoch" fields stored in every final
# checkpoint under logs/vanilla_all/ (files with "_epoch" in the name are
# skipped), prints a table sorted by descending score, then summary means.
import os, glob, torch
os.chdir("/content/competition_package")


def _mean(values):
    """Return the arithmetic mean of a non-empty sequence of numbers."""
    return sum(values) / len(values)


results = []  # one (seed, score, epoch) tuple per checkpoint
for pt in sorted(glob.glob("logs/vanilla_all/gru_parity_v1_seed*.pt")):
    basename = os.path.basename(pt)
    if '_epoch' in basename:
        continue
    # weights_only=False unpickles arbitrary objects — only load checkpoints
    # that come from a trusted source.
    ckpt = torch.load(pt, map_location="cpu", weights_only=False)
    # Slimmed checkpoints can store best_score=None when the field was absent
    # in the original file; treat that the same as a missing key (score 0).
    best = ckpt.get("best_score")
    score = float(best) if best is not None else 0.0
    epoch = ckpt.get("best_epoch", "N/A")
    seed = int(basename.split("seed")[1].split(".")[0])
    results.append((seed, score, epoch))

# Highest validation score first.
results.sort(key=lambda x: -x[1])

print(f"{'Rank':<6} {'Seed':<6} {'Val Score':>10} {'Epoch':>6}")
print("-" * 35)
for i, (seed, score, epoch) in enumerate(results, 1):
    tag = " NEW" if seed >= 65 else ""
    marker = "***" if score >= 0.2737 else ("**" if score >= 0.2700 else "")
    print(f"{i:<6} s{seed:<5} {score:>10.4f} {str(epoch):>6}  {marker}{tag}")

scores = [s for _, s, _ in results]
new_scores = [s for seed, s, _ in results if seed >= 65]
print(f"\nTotal seeds: {len(results)}")
if scores:
    print(f"Mean val (all): {_mean(scores):.4f}")
    print(f"Mean val (new s65-s84): {_mean(new_scores):.4f}" if new_scores else "No new seeds found")
    # Average over the seeds actually present: dividing by a fixed 5 or 10
    # would understate the mean when fewer checkpoints are available.
    print(f"Top-5 mean: {_mean(scores[:5]):.4f}")
    print(f"Top-10 mean: {_mean(scores[:10]):.4f}")
    print(f"\nTop seeds for per-target ensemble:")
    print(f"  s{[s for s,_,_ in results[:10]]}")
else:
    print("No checkpoints found in logs/vanilla_all — nothing to rank")
print(f"\n*** = val >= 0.2737  ** = val >= 0.2700")

In [None]:
# Cell 4: Cache predictions + run per-target swap discovery
#
# Step 1 caches validation predictions for every final checkpoint via
# scripts/greedy_vanilla_ensemble.py; step 2 feeds the cached seeds and the
# anchor ensemble to scripts/discover_parity_swaps.py. Both steps stream the
# child's output and abort the cell on a non-zero exit code.
import os, subprocess, sys, glob, re
os.chdir("/content/competition_package")


def _stream(cmd):
    """Run cmd, echo its merged stdout/stderr line by line, return the exit code."""
    child = subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
    )
    for out_line in child.stdout:
        print(out_line, end="", flush=True)
    return child.wait()


CACHE_DIR = "cache/all_seeds_valid_preds"
SESSION_TAG = "feb22-b1"
SLOT_A = f"{SESSION_TAG}-t1swap-a-onnx.zip"
SLOT_B = f"{SESSION_TAG}-t0swap-b-onnx.zip"
REPORT = f"logs/parity_swap_discovery_{SESSION_TAG}.json"

# Anchor ensemble (current best): the three lists are index-aligned, one
# entry per anchor seed.
ANCHOR_SEEDS = [43, 44, 45, 46, 50, 54, 55, 57, 58, 59, 60, 61, 63, 64]
ANCHOR_W0    = [1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0]
ANCHOR_W1    = [1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0.25, 0, 1.75]

# Step 1: cache
all_ckpts = [
    path
    for path in sorted(glob.glob("logs/vanilla_all/gru_parity_v1_seed*.pt"))
    if '_epoch' not in path
]
print(f"Caching {len(all_ckpts)} checkpoints...")
cache_cmd = [sys.executable, "scripts/greedy_vanilla_ensemble.py", "cache", "--checkpoints"]
cache_cmd += all_ckpts
cache_cmd += ["--data", "datasets/valid.parquet", "--cache-dir", CACHE_DIR]
assert _stream(cache_cmd) == 0, "Cache step failed"

# Step 2: discover swaps
seed_pattern = re.compile(r'seed(\d+)')
cached = sorted(
    int(seed_pattern.search(npz).group(1))
    for npz in glob.glob(f"{CACHE_DIR}/gru_parity_v1_seed*.npz")
)
print(f"\nCached seeds ({len(cached)}): {cached}")
os.makedirs("submissions/ready", exist_ok=True)

discover_cmd = [
    sys.executable, "scripts/discover_parity_swaps.py",
    "--cache-dir", CACHE_DIR,
    "--data", "datasets/valid.parquet",
]
# Each multi-value flag is followed by its values rendered as strings.
for flag, values in (
    ("--required-seeds", cached),
    ("--anchor-seeds", ANCHOR_SEEDS),
    ("--anchor-w0", ANCHOR_W0),
    ("--anchor-w1", ANCHOR_W1),
):
    discover_cmd.append(flag)
    discover_cmd.extend(str(v) for v in values)
discover_cmd += [
    "--output-report", REPORT,
    "--output-dir", "submissions/ready",
    "--slot-a-name", SLOT_A,
    "--slot-b-name", SLOT_B,
]
assert _stream(discover_cmd) == 0, "Discovery step failed"

In [None]:
# Cell 5: Strip checkpoints + zip + save to Drive
#
# Writes a slim copy of every final checkpoint (model weights, config and
# best score/epoch only) to logs/slim/, zips that folder over the
# vanilla_seeds.zip on Drive, then copies the submission zips and the swap
# discovery report to Drive and prints a short summary of the report.
import os, torch, glob, shutil, json
os.chdir("/content/competition_package")
os.makedirs("logs/slim", exist_ok=True)

n_slim = 0  # number of checkpoints slimmed in this run
for pt in sorted(glob.glob("logs/vanilla_all/gru_parity_v1_seed*.pt")):
    basename = os.path.basename(pt)
    if '_epoch' in basename:
        continue
    # weights_only=False unpickles arbitrary objects — only load checkpoints
    # that come from a trusted source.
    ckpt = torch.load(pt, map_location="cpu", weights_only=False)
    slim = {
        "model_state_dict": ckpt["model_state_dict"],
        "config": ckpt.get("config", {}),
        "best_score": ckpt.get("best_score", None),
        "best_epoch": ckpt.get("best_epoch", None),
    }
    out = f"logs/slim/{basename}"
    torch.save(slim, out)
    orig = os.path.getsize(pt) / 1e6
    new = os.path.getsize(out) / 1e6
    print(f"{basename}: {orig:.1f}MB -> {new:.1f}MB")
    n_slim += 1

if n_slim:
    # Zip all seeds (overwrites previous vanilla_seeds.zip with expanded set)
    shutil.make_archive("/content/vanilla_seeds", "zip",
                        "/content/competition_package/logs/slim")
    sz = os.path.getsize("/content/vanilla_seeds.zip") / 1e6
    print(f"\nvanilla_seeds.zip: {sz:.1f}MB")
    shutil.copy("/content/vanilla_seeds.zip",
                "/content/drive/MyDrive/wunderfund/vanilla_seeds.zip")
    print("Saved to Drive: MyDrive/wunderfund/vanilla_seeds.zip")
else:
    # Never replace the existing Drive archive when this run produced no
    # slim checkpoints — that would discard the stored seeds.
    print("\nWARNING: no checkpoints found in logs/vanilla_all — "
          "vanilla_seeds.zip on Drive left untouched")

# Copy submission zips and report to Drive
SESSION_TAG = "feb22-b1"
for fname in [f"{SESSION_TAG}-t1swap-a-onnx.zip", f"{SESSION_TAG}-t0swap-b-onnx.zip"]:
    src = f"submissions/ready/{fname}"
    if os.path.exists(src):
        shutil.copy(src, f"/content/drive/MyDrive/wunderfund/{fname}")
        print(f"Saved: {fname}")

report = f"logs/parity_swap_discovery_{SESSION_TAG}.json"
if os.path.exists(report):
    shutil.copy(report, f"/content/drive/MyDrive/wunderfund/parity_swap_discovery_{SESSION_TAG}.json")
    # Context manager so the report file handle is closed deterministically.
    with open(report) as fh:
        data = json.load(fh)
    print(f"\nAnchor score: {data['anchor']['score']['avg']:.4f}")
    a = data['selected']['slot_a_t1']
    b = data['selected']['slot_b_t0']
    print(f"Slot A (t1): replace s{a['replace_seed']} -> s{a['candidate_seed']}  delta={a['delta_avg']:+.5f}  gate={'PASS' if a['keep_gate'] else 'FAIL'}")
    print(f"Slot B (t0): replace s{b['replace_seed']} -> s{b['candidate_seed']}  delta={b['delta_avg']:+.5f}  gate={'PASS' if b['keep_gate'] else 'FAIL'}")