In [None]:
# Cell 0: Mount Drive, download data from Kaggle
import os, json

from google.colab import drive
drive.mount('/content/drive')
os.makedirs('/content/drive/MyDrive/wunderfund', exist_ok=True)

!pip install -q kaggle==1.6.14 --force-reinstall
os.makedirs('/root/.kaggle', exist_ok=True)
with open('/root/.kaggle/kaggle.json', 'w') as f:
    json.dump({"username": "vincentvdo6", "key": "KGAT_17c43012d9e77edf2c183a25acb1489b"}, f)
os.chmod('/root/.kaggle/kaggle.json', 0o600)

os.makedirs('/content/data', exist_ok=True)
!kaggle datasets download -d vincentvdo6/wunderfund-predictorium -p /content/data/ --force
!unzip -o -q /content/data/wunderfund-predictorium.zip -d /content/data/
!ls /content/data/*.parquet

In [None]:
# Cell 1: Setup — clone repo, link data
import os, shutil, subprocess
REPO = "/content/competition_package"
REPO_URL = "https://github.com/vincentvdo6/competition_package.git"
DATA_DIR = "/content/data"

os.chdir("/content")
# Always start from a fresh checkout so re-running this cell is idempotent.
shutil.rmtree(REPO, ignore_errors=True)
# check=True makes a failed clone raise here instead of surfacing later as a
# confusing chdir / missing-file error.
subprocess.run(["git", "clone", REPO_URL, REPO], check=True)
os.chdir(REPO)
os.makedirs("datasets", exist_ok=True)
os.makedirs("logs", exist_ok=True)

# Symlink the downloaded parquet files into the repo's datasets/ directory.
for split in ("train", "valid"):
    link_path = f"datasets/{split}.parquet"
    # Replace any existing entry (including a dangling link), like `ln -sf` does.
    if os.path.lexists(link_path):
        os.remove(link_path)
    os.symlink(f"{DATA_DIR}/{split}.parquet", link_path)
    # os.path.exists follows the link, so this also catches a missing target file.
    assert os.path.exists(link_path), f"{split}.parquet not found!"

print("Commit:", subprocess.check_output(["git", "rev-parse", "--short", "HEAD"], text=True).strip())
# Best-effort GPU report: prints an empty name when nvidia-smi yields no output.
print(f"GPU: {os.popen('nvidia-smi --query-gpu=name --format=csv,noheader').read().strip()}")
print("Ready!")

In [None]:
# Cell 2: Train microstructure seeds s42-s46 (pre-committed contiguous block)
# Confirmation rule: mean paired delta >= +0.003 AND at least 4/5 seeds positive
import os, subprocess
os.chdir("/content/competition_package")

# Seeds whose training process exited with a non-zero status.
micro_failed_seeds = []

for seed in range(42, 47):
    print(f"\n{'='*60}")
    print(f"Training gru_microstructure_v1 seed {seed}")
    print(f"{'='*60}")
    # Argument list (no shell) so the exit status is that of train.py itself.
    result = subprocess.run([
        "python", "-u", "scripts/train.py",
        "--config", "configs/gru_microstructure_v1.yaml",
        "--seed", str(seed), "--device", "cuda",
    ])
    if result.returncode != 0:
        # Keep going so one bad seed does not waste the remaining runs,
        # but remember it so the cell does not report success.
        print(f"Training FAILED for seed {seed} (exit code {result.returncode})")
        micro_failed_seeds.append(seed)

if micro_failed_seeds:
    # Stop "Run All" here: the paired comparison is meaningless with missing seeds.
    raise RuntimeError(f"gru_microstructure_v1 training failed for seeds: {micro_failed_seeds}")

print("\nAll 5 microstructure seeds complete!")

In [None]:
# Cell 3: Train tightwd_v2 control seeds s42-s46 (paired comparison)
import os, subprocess
os.chdir("/content/competition_package")

# Seeds whose training process exited with a non-zero status.
control_failed_seeds = []

for seed in range(42, 47):
    print(f"\n{'='*60}")
    print(f"Training gru_derived_tightwd_v2 seed {seed} (control)")
    print(f"{'='*60}")
    # Argument list (no shell) so the exit status is that of train.py itself.
    result = subprocess.run([
        "python", "-u", "scripts/train.py",
        "--config", "configs/gru_derived_tightwd_v2.yaml",
        "--seed", str(seed), "--device", "cuda",
    ])
    if result.returncode != 0:
        # Keep going so one bad seed does not waste the remaining runs,
        # but remember it so the cell does not report success.
        print(f"Training FAILED for seed {seed} (exit code {result.returncode})")
        control_failed_seeds.append(seed)

if control_failed_seeds:
    # Stop "Run All" here: the paired comparison is meaningless with missing seeds.
    raise RuntimeError(f"gru_derived_tightwd_v2 training failed for seeds: {control_failed_seeds}")

print("\nAll 5 tightwd_v2 control seeds complete!")

In [None]:
# Cell 4: Paired comparison — microstructure vs tightwd_v2 per seed
#
# Reads the validation score stored in each checkpoint under logs/, pairs the
# two configurations by seed, and applies the pre-committed confirmation rule.
# A seed without a usable score for BOTH configurations is reported as missing
# and blocks the verdict, rather than being counted as a score of 0 (which
# would fabricate a large positive or negative delta).
import os, glob, math, torch
os.chdir("/content/competition_package")

micro_scores = {}
tw2_scores = {}

for pt in sorted(glob.glob("logs/*.pt")):
    name = os.path.basename(pt)

    # Identify the run from the filename first so unrelated checkpoints are
    # never loaded.
    bucket, matched_seed = None, None
    for seed in range(42, 47):
        if f"microstructure_v1_seed{seed}" in name:
            bucket, matched_seed = micro_scores, seed
            break
        if f"tightwd_v2_seed{seed}" in name:
            bucket, matched_seed = tw2_scores, seed
            break
    if bucket is None:
        continue

    # NOTE: weights_only=False unpickles arbitrary objects; only acceptable
    # because these checkpoints were produced by this notebook's own training.
    try:
        ckpt = torch.load(pt, map_location="cpu", weights_only=False)
    except TypeError:
        # Presumably a torch version that does not accept weights_only.
        ckpt = torch.load(pt, map_location="cpu")

    raw_score = ckpt.get("best_score", ckpt.get("val_score"))
    try:
        # float() also accepts numpy scalars and single-element tensors.
        score = float(raw_score)
    except (TypeError, ValueError, RuntimeError):
        score = float("nan")
    if not math.isfinite(score):
        print(f"WARNING: {name}: no usable best_score/val_score ({raw_score!r}); seed treated as missing")
        continue

    bucket[matched_seed] = score

print(f"{'Seed':>6} {'Micro':>8} {'TW2':>8} {'Delta':>8}")
print("-" * 35)

deltas = []
missing_seeds = []
positive = 0
for seed in range(42, 47):
    m = micro_scores.get(seed)
    t = tw2_scores.get(seed)
    if m is None or t is None:
        missing_seeds.append(seed)
        m_txt = f"{'n/a':>8}" if m is None else f"{m:>8.4f}"
        t_txt = f"{'n/a':>8}" if t is None else f"{t:>8.4f}"
        print(f"{seed:>6} {m_txt} {t_txt} {'n/a':>8} MISSING")
        continue
    d = m - t
    deltas.append(d)
    if d > 0:
        positive += 1
    marker = " +" if d > 0 else " -"
    print(f"{seed:>6} {m:>8.4f} {t:>8.4f} {d:>+8.4f}{marker}")

# Mean over the seeds that actually have a pair; NaN when there are none.
mean_delta = sum(deltas) / len(deltas) if deltas else float("nan")
print(f"\n{'='*60}")
print(f"Mean paired delta: {mean_delta:+.4f}")
print(f"Positive seeds: {positive}/5")
print(f"\nConfirmation rule: mean delta >= +0.003 AND >= 4/5 positive")

if missing_seeds:
    # The rule was pre-committed for all five seeds; do not decide on partial data.
    print(f"\n  INCOMPLETE: no paired score for seed(s) {missing_seeds}")
    print(f"  Re-run the missing trainings before applying the confirmation rule")
elif mean_delta >= 0.003 and positive >= 4:
    print(f"\n  CONFIRMED: Microstructure features HELP (+{mean_delta:.4f} mean delta)")
    print(f"  Next: Replace weakest champion GRU with micro at same seed")
else:
    print(f"\n  REJECTED: Signal too weak or inconsistent")
    print(f"  Microstructure features are noise, not signal")
print(f"{'='*60}")

In [None]:
# Cell 5: Strip checkpoints + zip + save to Drive
import os, torch, glob, shutil

SLIM_DIR = "logs/slim"
ZIP_PATH = "/content/micro_expansion.zip"
DRIVE_ZIP_PATH = "/content/drive/MyDrive/wunderfund/micro_expansion.zip"

os.chdir("/content/competition_package")
os.makedirs(SLIM_DIR, exist_ok=True)

# Re-save each checkpoint in logs/ keeping only the model weights, the config
# and the best score, and report the size before and after.
for src_path in sorted(glob.glob("logs/*.pt")):
    fname = os.path.basename(src_path)
    dst_path = f"{SLIM_DIR}/{fname}"

    try:
        full_ckpt = torch.load(src_path, map_location="cpu", weights_only=False)
    except TypeError:
        # Retry without the keyword (presumably for torch versions lacking weights_only).
        full_ckpt = torch.load(src_path, map_location="cpu")

    torch.save(
        {
            "model_state_dict": full_ckpt["model_state_dict"],
            "config": full_ckpt.get("config", {}),
            "best_score": full_ckpt.get("best_score", None),
        },
        dst_path,
    )

    size_before_mb = os.path.getsize(src_path) / 1e6
    size_after_mb = os.path.getsize(dst_path) / 1e6
    print(f"{fname}: {size_before_mb:.1f}MB -> {size_after_mb:.1f}MB")

# The normalizer archives are copied unchanged next to the slim checkpoints.
for npz_path in sorted(glob.glob("logs/normalizer_*.npz")):
    npz_name = os.path.basename(npz_path)
    shutil.copy(npz_path, f"{SLIM_DIR}/{npz_name}")
    print(f"Copied {npz_name}")

# Bundle the slim directory into /content/micro_expansion.zip.
shutil.make_archive(
    "/content/micro_expansion",
    "zip",
    "/content/competition_package/logs/slim",
)
zip_size_mb = os.path.getsize(ZIP_PATH) / 1e6
print(f"\nmicro_expansion.zip: {zip_size_mb:.1f}MB")

# Copy the archive into the mounted Drive folder.
shutil.copy(ZIP_PATH, DRIVE_ZIP_PATH)
print("Saved to Drive: MyDrive/wunderfund/micro_expansion.zip")