In [None]:
# CELL 0 - Mount Drive and install PyG + deps from saved wheels

from google.colab import drive
drive.mount('/content/drive')

WHEEL_DIR = "/content/drive/MyDrive/PyG_wheels_torch29_cu126"

import torch
print("torch version before installing wheels:", torch.__version__)

# üëâ This assumes Colab still has torch==2.9.0+cu126.
# If torch version changes in the future, you'll want a new wheel set.
!pip install {WHEEL_DIR}/*.whl

import torch_geometric
print("torch:", torch.__version__)
print("torch_geometric:", torch_geometric.__version__)

from torch_geometric.datasets import QM9
print("QM9 import OK")


Mounted at /content/drive
torch version before installing wheels: 2.9.0+cu126
Processing ./drive/MyDrive/PyG_wheels_torch29_cu126/aiohappyeyeballs-2.6.1-py3-none-any.whl
Processing ./drive/MyDrive/PyG_wheels_torch29_cu126/aiohttp-3.13.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl
Processing ./drive/MyDrive/PyG_wheels_torch29_cu126/aiosignal-1.4.0-py3-none-any.whl
Processing ./drive/MyDrive/PyG_wheels_torch29_cu126/attrs-25.4.0-py3-none-any.whl
Processing ./drive/MyDrive/PyG_wheels_torch29_cu126/certifi-2025.11.12-py3-none-any.whl
Processing ./drive/MyDrive/PyG_wheels_torch29_cu126/charset_normalizer-3.4.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl
Processing ./drive/MyDrive/PyG_wheels_torch29_cu126/frozenlist-1.8.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl
Processing ./drive/MyDrive/PyG_wheels_torch29_cu126/fsspec-2025.10.0-py3-none-any.whl
Processing ./drive/MyDrive/PyG_wheels

KeyboardInterrupt: 

In [None]:
# CELL 1 - Clone or refresh QM9_project repo

%cd /content
!test -d QM9_project || git clone https://github.com/zeugirdoR/QM9_project.git
%cd QM9_project
!git pull
!ls


/content
Cloning into 'QM9_project'...
remote: Enumerating objects: 40, done.[K
remote: Counting objects: 100% (40/40), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 40 (delta 7), reused 36 (delta 6), pack-reused 0 (from 0)[K
Receiving objects: 100% (40/40), 24.75 KiB | 6.19 MiB/s, done.
Resolving deltas: 100% (7/7), done.
/content/QM9_project
Already up to date.
bootstrapwheels.py  how2git.txt			 README.md
data		    models			 requirements.txt
env		    newColabenv.py		 setup_pyg_wheels.py
eval_qm9_legacy.py  orig_train_qm9_baseline.py	 train_qm9_baseline.py
fastboot.sh	    qm9_project_bootstrap.ipynb


In [None]:
# CELL - V20-AGAA-MICRO MODEL (PyG-friendly + motor_strength)
%%writefile models/v20_agaa_micro.py
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch_geometric.utils import to_dense_batch


def safe_norm(x, dim=-1, eps=1e-6):
    return torch.sqrt(torch.sum(x ** 2, dim=dim) + eps)


def coulomb_potential(pos, eps=1e-6):
    # pos: (B, N, 3)
    delta = pos.unsqueeze(2) - pos.unsqueeze(1)   # (B, N, N, 3)
    dist  = safe_norm(delta, dim=-1)             # (B, N, N)
    inv_dist = torch.where(dist > 1e-4, 1.0 / dist, torch.zeros_like(dist))
    return inv_dist.unsqueeze(-1)                # (B, N, N, 1)


class GaussianRBF(nn.Module):
    def __init__(self, n_rbf: int = 20, cutoff: float = 5.0):
        super().__init__()
        self.n_rbf = n_rbf
        centers = torch.linspace(0.0, cutoff, n_rbf)
        widths  = torch.full((n_rbf,), cutoff / n_rbf)
        self.register_buffer("centers", centers)
        self.register_buffer("widths", widths)

    def forward(self, dist: torch.Tensor) -> torch.Tensor:
        """
        dist: (B, N, N)
        returns: (B, N, N, n_rbf)
        """
        diff = dist.unsqueeze(-1) - self.centers.view(1, 1, 1, -1)
        widths = torch.clamp(self.widths, min=1e-3)
        return torch.exp(- (diff ** 2) / (2.0 * widths.view(1, 1, 1, -1) ** 2))


class MotorAttention(nn.Module):
    """
    Multi-head 'motor' attention in 6D screw space, gated by Coulomb + RBF.
    """
    def __init__(self, d_model: int, n_heads: int, n_rbf: int = 20):
        super().__init__()
        self.n_heads   = n_heads
        self.dim_motor = 6

        self.q_screw = nn.Linear(d_model, n_heads * self.dim_motor, bias=False)
        self.k_screw = nn.Linear(d_model, n_heads * self.dim_motor, bias=False)

        self.coulomb_proj = nn.Linear(1, n_heads, bias=False)
        self.rbf_gate  = nn.Linear(n_rbf, n_heads, bias=True)
        self.rbf_bias  = nn.Linear(n_rbf, n_heads, bias=False)

        self.v_proj   = nn.Linear(d_model, d_model, bias=False)
        self.out_proj = nn.Linear(d_model, d_model, bias=False)

        self.metric = nn.Parameter(torch.tensor([1., 1., 1., 1., 1., 1.]))
        self.scale  = nn.Parameter(torch.tensor(1.0))

    def forward(
        self,
        x: torch.Tensor,         # (B, N, d_model)
        rbf_feat: torch.Tensor,  # (B, N, N, n_rbf)
        pos: torch.Tensor,       # (B, N, 3)
        mask: torch.Tensor = None,   # (B, N) bool or 0/1
        motor_strength: float = 1.0, # 0 = motors silent
    ):
        B, N, _ = x.shape
        x_f32 = x.float()

        # screw-space Q,K
        q = self.q_screw(x_f32).view(B, N, self.n_heads, self.dim_motor).permute(0, 2, 1, 3)
        k = self.k_screw(x_f32).view(B, N, self.n_heads, self.dim_motor).permute(0, 2, 1, 3)

        metric = self.metric.view(1, 1, 1, self.dim_motor)
        q_m = q * metric
        k_m = k * metric

        motor_score = torch.einsum("bhnc,bhmc->bhnm", q_m, k_m)
        motor_score = motor_score * self.scale / math.sqrt(self.dim_motor)

        coul = coulomb_potential(pos).float()        # (B, N, N, 1)
        coul = self.coulomb_proj(coul)               # (B, N, N, H)

        rbf = rbf_feat.float()
        gate = self.rbf_gate(rbf) + self.rbf_bias(rbf)   # (B, N, N, H)

        geo = coul + gate                            # (B, N, N, H)
        geo = geo.permute(0, 3, 1, 2)                # (B, H, N, N)

        # curriculum knob
        scores = geo + motor_strength * motor_score  # (B, H, N, N)

        # motor activity scalar (for volume penalty + monitoring)
        sig_motor = (motor_score * gate.permute(0, 3, 1, 2)).abs().mean()

        if mask is not None:
            attn_mask = mask.unsqueeze(1).unsqueeze(2)  # (B,1,1,N)
            scores = scores.masked_fill(attn_mask == 0, float("-inf"))

        attn = torch.softmax(scores, dim=-1)         # (B, H, N, N)

        v = self.v_proj(x_f32)                       # (B, N, d_model)
        d_head = x_f32.size(-1) // self.n_heads
        v = v.view(B, N, self.n_heads, d_head).permute(0, 2, 1, 3)

        out = torch.matmul(attn, v)                  # (B, H, N, d_head)
        out = out.permute(0, 2, 1, 3).reshape(B, N, -1)

        out = self.out_proj(out).to(x.dtype)
        return out, sig_motor


class V20_Block_Motor(nn.Module):
    def __init__(self, d_model: int, n_heads: int, n_rbf: int = 20):
        super().__init__()
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.attn  = MotorAttention(d_model, n_heads, n_rbf=n_rbf)
        self.ffn   = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.SiLU(),
            nn.Linear(4 * d_model, d_model),
        )

    def forward(self, x, rbf_feat, pos, mask=None, motor_strength: float = 1.0):
        h = self.norm1(x)
        attn_out, sig = self.attn(h, rbf_feat, pos, mask, motor_strength=motor_strength)
        x = x + attn_out

        h = self.norm2(x)
        x = x + self.ffn(h)
        return x, sig


class V20_AGAA_Motor(nn.Module):
    """
    PyG-friendly variant:
      - takes batched node tensors (z, pos, batch_idx)
      - internally pads to (B, N, ¬∑) with to_dense_batch
      - returns (pred, sig_motor) with pred shape (B, 1)
    """
    def __init__(self, num_layers=7, d_model=192, n_heads=16, max_z=100, n_rbf=20):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_rbf   = n_rbf

        self.rbf   = GaussianRBF(n_rbf=n_rbf, cutoff=5.0)
        self.emb_z = nn.Embedding(max_z, d_model)
        self.emb_geo  = nn.Linear(4, d_model)
        self.emb_fuse = nn.Linear(2 * d_model, d_model)

        self.layers = nn.ModuleList(
            [V20_Block_Motor(d_model, n_heads, n_rbf=n_rbf) for _ in range(num_layers)]
        )
        self.norm_final = nn.LayerNorm(d_model)
        self.head       = nn.Linear(d_model, 1)

    def forward(self, z, pos, batch_idx=None, motor_strength: float = 1.0):
        """
        z         : (M,)      atomic numbers for all nodes
        pos       : (M, 3)    positions for all nodes
        batch_idx : (M,)      graph index for each node (PyG's batch)
        motor_strength: curriculum knob
        """
        if batch_idx is None:
            # Single graph case: treat as batch size 1
            pos_dense = pos.unsqueeze(0)          # (1, N, 3)
            z_dense   = z.unsqueeze(0)            # (1, N)
            mask = torch.ones_like(z_dense, dtype=torch.bool, device=z_dense.device)
        else:
            pos_dense, mask = to_dense_batch(pos, batch_idx)  # (B, N, 3), (B, N)
            z_dense,  _     = to_dense_batch(z,   batch_idx)  # (B, N)

        mask_float = mask.float()                 # (B, N)
        z_clamped = z_dense.clamp(min=0).long()   # (B, N)

        # Atom type emb
        h_z = self.emb_z(z_clamped)              # (B, N, d_model)

        # Pairwise distances in each molecule
        delta = pos_dense.unsqueeze(2) - pos_dense.unsqueeze(1)  # (B, N, N, 3)
        dist  = safe_norm(delta, dim=-1)                         # (B, N, N)

        rbf_feat = self.rbf(dist)                               # (B, N, N, n_rbf)

        # Per-atom geometric summaries (mean/min/max/std of local distances)
        neigh_mask = mask_float.unsqueeze(1) * mask_float.unsqueeze(2)  # (B, N, N)
        dist_masked = dist * neigh_mask
        valid_counts = neigh_mask.sum(dim=-1).clamp(min=1.0)            # (B, N)

        mean_d = dist_masked.sum(dim=-1) / valid_counts                 # (B, N)
        min_d  = torch.where(
            neigh_mask.bool(),
            dist,
            torch.full_like(dist, 1e6)
        ).min(dim=-1).values                                           # (B, N)
        max_d  = (dist * neigh_mask).max(dim=-1).values                # (B, N)
        std_d  = torch.sqrt(
            torch.clamp((dist_masked ** 2).sum(dim=-1) / valid_counts - mean_d ** 2, min=0.0)
        )                                                              # (B, N)

        geo_feat = torch.stack([mean_d, min_d, max_d, std_d], dim=-1)  # (B, N, 4)
        h_geo = self.emb_geo(geo_feat)

        h = torch.cat([h_z, h_geo], dim=-1)                            # (B, N, 2*d_model)
        h = self.emb_fuse(h)                                           # (B, N, d_model)

        sigs = []
        for layer in self.layers:
            h, sig = layer(h, rbf_feat, pos_dense, mask, motor_strength=motor_strength)
            sigs.append(sig)

        h = self.norm_final(h)

        # mask-based pooling
        mask_f = mask_float.unsqueeze(-1)                               # (B, N, 1)
        h_pool = (h * mask_f).sum(dim=1) / mask_f.sum(dim=1).clamp(min=1.0)  # (B, d_model)

        pred = self.head(h_pool)                                       # (B, 1)
        sig_motor = torch.stack(sigs).mean()
        return pred, sig_motor


def build_v20_agaa_micro():
    return V20_AGAA_Motor(num_layers=7, d_model=192, n_heads=16, max_z=100, n_rbf=20)


Overwriting models/v20_agaa_micro.py


In [None]:
# CELL - Reload model and re-check params
import importlib
import models.v20_agaa_micro as v20

importlib.reload(v20)

model = v20.V20_AGAA_Motor(
    num_layers=7,
    d_model=192,
    n_heads=16,
    max_z=100,
    n_rbf=20,
).to("cuda")

total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Trainable params:", total_params)


Trainable params: 2950034


In [None]:
# CELL - Curriculum training for V20-AGAA-Motor (sparse ‚Üí dense handled inside model)

import time
import torch
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# üëâ Choose which QM9 property to learn.
#    Use the same index you used in the warm test.
TARGET_IDX = 12   # adjust if you trained on a different QM9 column


def _extract_target(batch):
    """
    Extract a 1D target vector (batch of scalars) from batch.y.

    Handles:
      - (B, 19): full QM9 target vector per graph  -> pick column TARGET_IDX
      - (B, 1) : already scalar                    -> squeeze to (B,)
      - (B,)   : already scalar                    -> keep as is
    """
    y = batch.y
    if y.dim() == 2:
        if y.size(1) == 19:
            y = y[:, TARGET_IDX]
        elif y.size(1) == 1:
            y = y.squeeze(-1)
        else:
            y = y[:, 0]  # fallback
    else:
        y = y.view(-1)
    return y


def train_one_epoch(
    model,
    loader,
    optimizer,
    device,
    motor_strength: float,
    lambda_motor_reg: float = 0.0,
    scale_to_meV: float = 1.0,
):
    model.train()
    total_mae = 0.0
    total_mot = 0.0
    n_batches = 0

    for batch in loader:
        batch = batch.to(device)

        z         = batch.z          # (total_nodes,)
        pos       = batch.pos        # (total_nodes, 3)
        batch_idx = batch.batch      # (total_nodes,)
        y         = _extract_target(batch)  # (B,)

        optimizer.zero_grad()

        # Let V20_AGAA_Motor do to_dense_batch internally:
        pred, sig_motor = model(z, pos, batch_idx, motor_strength=motor_strength)

        # Make prediction 1D to match y
        pred = pred.view(-1)

        if pred.numel() != y.numel():
            raise RuntimeError(
                f"Shape mismatch in train_one_epoch: pred {pred.shape}, y {y.shape}"
            )

        loss_data = F.l1_loss(pred, y)
        # approximate "volume" penalty via motor activity
        loss = loss_data + lambda_motor_reg * motor_strength * sig_motor

        loss.backward()
        optimizer.step()

        mae_meV = loss_data.item() * scale_to_meV
        total_mae += mae_meV
        total_mot += sig_motor.item()
        n_batches += 1

    return total_mae / max(n_batches, 1), total_mot / max(n_batches, 1)


@torch.no_grad()
def eval_epoch(
    model,
    loader,
    device,
    motor_strength: float,
    scale_to_meV: float = 1.0,
):
    model.eval()
    total_mae = 0.0
    total_mot = 0.0
    n_batches = 0

    for batch in loader:
        batch = batch.to(device)

        z         = batch.z
        pos       = batch.pos
        batch_idx = batch.batch
        y         = _extract_target(batch)  # (B,)

        pred, sig_motor = model(z, pos, batch_idx, motor_strength=motor_strength)
        pred = pred.view(-1)

        if pred.numel() != y.numel():
            raise RuntimeError(
                f"Shape mismatch in eval_epoch: pred {pred.shape}, y {y.shape}"
            )

        loss_data = F.l1_loss(pred, y)
        mae_meV = loss_data.item() * scale_to_meV

        total_mae += mae_meV
        total_mot += sig_motor.item()
        n_batches += 1

    return total_mae / max(n_batches, 1), total_mot / max(n_batches, 1)


def run_curriculum_training(
    model,
    train_loader,
    val_loader,
    test_loader,
    device,
    epochs=200,
    motor_unlock_meV=10.0,      # when train MAE < this, start ramp
    motor_ramp_epochs=50,
    motor_max_strength=1.0,
    lambda_motor_reg=1e-3,      # volume penalty weight
    scale_to_meV=1.0,
):
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)

    best_val = float("inf")
    best_epoch = -1
    curriculum_unlocked = False
    unlock_epoch = None

    print("Ep | motors | train | val   | test  | ms | best")
    print("---+--------+-------+-------+-------+----+-----")

    for ep in range(1, epochs + 1):
        # decide motor_strength for this epoch
        if not curriculum_unlocked:
            motor_strength = 0.0
        else:
            t = max(0, ep - unlock_epoch)
            motor_strength = motor_max_strength * min(1.0, t / max(motor_ramp_epochs, 1))

        t0 = time.time()

        train_mae, train_mot = train_one_epoch(
            model, train_loader, optimizer, device,
            motor_strength=motor_strength,
            lambda_motor_reg=lambda_motor_reg,
            scale_to_meV=scale_to_meV,
        )
        val_mae,   val_mot   = eval_epoch(
            model, val_loader, device,
            motor_strength=motor_strength,
            scale_to_meV=scale_to_meV,
        )
        test_mae,  test_mot  = eval_epoch(
            model, test_loader, device,
            motor_strength=motor_strength,
            scale_to_meV=scale_to_meV,
        )
        dt = time.time() - t0  # you can add a 'sec' column later if you want

        # unlock motors when scalar-only fit is good enough
        if (not curriculum_unlocked) and (train_mae <= motor_unlock_meV):
            curriculum_unlocked = True
            unlock_epoch = ep

        # best mark based on val
        is_best = val_mae < best_val
        if is_best:
            best_val = val_mae
            best_epoch = ep
        star = "*" if is_best else " "

        # motors column = val_mot (avg motor activity on val)
        # ms column      = motor_strength for this epoch
        print(
            f"{ep:3d} | {val_mot:6.3f} | "
            f"{train_mae:5.1f} | {val_mae:5.1f} | {test_mae:5.1f} | "
            f"{motor_strength:3.2f} | {star}"
        )

    print(f"\nBest val: {best_val:.3f} meV at epoch {best_epoch}")


In [None]:
# CELL - Launch curriculum training

run_curriculum_training(
    model,
    train_loader,
    val_loader,
    test_loader,
    device,
    epochs=200,
    motor_unlock_meV=10.0,   # when train drops below ~10 meV
    motor_ramp_epochs=60,    # slowly turn on motors over 60 epochs
    motor_max_strength=1.0,
    lambda_motor_reg=1e-3,   # approximate volume penalty
    scale_to_meV=1.0,
)


Ep | motors | train | val   | test  | ms | best
---+--------+-------+-------+-------+----+-----
  1 |  0.050 |  44.2 |  19.0 |   6.4 | 0.00 | *
  2 |  0.048 |   6.6 |   3.2 |   2.0 | 0.00 | *
  3 |  0.599 |   2.8 |   1.7 |   1.6 | 0.02 | *
  4 |  4.252 |   1.9 |   1.8 |   1.3 | 0.03 |  
  5 |  6.033 |   1.3 |   1.0 |   1.2 | 0.05 | *
  6 |  6.346 |   1.1 |   0.7 |   1.0 | 0.07 | *
  7 |  7.229 |   0.9 |   0.8 |   1.1 | 0.08 |  
  8 |  6.963 |   0.8 |   0.8 |   0.9 | 0.10 |  
  9 |  6.117 |   1.3 |   1.4 |   1.2 | 0.12 |  
 10 |  6.138 |   0.9 |   0.7 |   1.3 | 0.13 | *
 11 |  5.784 |   0.6 |   0.5 |   1.1 | 0.15 | *
 12 |  5.525 |   0.7 |   0.5 |   1.1 | 0.17 | *
 13 |  5.350 |   0.6 |   0.5 |   1.1 | 0.18 | *
 14 |  5.610 |   0.8 |   0.5 |   0.9 | 0.20 | *
 15 |  5.500 |   0.5 |   0.5 |   0.9 | 0.22 |  
 16 |  5.603 |   1.3 |   0.6 |   0.8 | 0.23 |  
 17 |  5.250 |   0.6 |   0.4 |   1.0 | 0.25 | *
 18 |  4.780 |   0.4 |   0.4 |   0.8 | 0.27 |  
 19 |  4.389 |   0.4 |   0.4 |   0.8 | 0

In [1]:
batch = next(iter(test_loader)).to(device)
with torch.no_grad():
    pred, _ = model(batch.z, batch.pos, batch.batch, motor_strength=0.0)
    pred = pred.view(-1)
    y    = _extract_target(batch)
    mae_eV  = (pred - y).abs().mean().item()
    mae_meV = mae_eV * 1000
print("Raw MAE:", mae_eV, "eV   =   ", mae_meV, "meV")


NameError: name 'test_loader' is not defined

In [2]:
# CELL - Snapshot current RB3m curriculum run (config + model + env + git)

import os, json, time, sys, subprocess
import torch

try:
    import torch_geometric
    pyg_version = torch_geometric.__version__
except Exception:
    pyg_version = None

# ---- 1) Create a unique run directory on Drive ----
RUN_ROOT = "/content/drive/MyDrive/GAHEAD_runs"
run_name = f"RB3m_curriculum_U0_{time.strftime('%Y%m%d_%H%M%S')}"
RUN_DIR  = os.path.join(RUN_ROOT, run_name)
os.makedirs(RUN_DIR, exist_ok=True)
print("üìÅ Snapshot directory:", RUN_DIR)

# ---- 2) Save model weights ----
ckpt_path = os.path.join(RUN_DIR, "RB3m_curriculum_ep24.pt")
torch.save(model.state_dict(), ckpt_path)
print("‚úÖ Saved model state_dict to:", ckpt_path)

# ---- 3) Try to grab basic optimizer info (if optimizer exists) ----
opt_info = None
if "optimizer" in globals():
    opt = optimizer
    try:
        opt_info = {
            "type": opt.__class__.__name__,
            "lr": opt.param_groups[0].get("lr", None),
            "weight_decay": opt.param_groups[0].get("weight_decay", None),
            "betas": tuple(opt.param_groups[0].get("betas", (None, None))),
            "eps": opt.param_groups[0].get("eps", None),
        }
    except Exception:
        opt_info = {"type": opt.__class__.__name__}

# ---- 4) Build a config dict for reproducibility ----
# Fill in anything you know exactly from your launch cell (edit if needed).
cfg = {
    "description": "RB3m (‚âà2.95M params) geometric transformer with motor curriculum on QM9 U0",
    "model": {
        "name": "V20_AGAA_Motor",
        "num_layers": 7,
        "d_model": 192,
        "n_heads": 16,
        "max_z": 100,
        "n_rbf": 20,
        "trainable_params": sum(p.numel() for p in model.parameters() if p.requires_grad),
    },
    "data": {
        "dataset": "QM9",
        "target": "U0",
        "target_index": 7,
        "root": "qm9_data",
        "train_size": len(train_loader.dataset) if 'train_loader' in globals() else None,
        "val_size": len(val_loader.dataset) if 'val_loader' in globals() else None,
        "test_size": len(test_loader.dataset) if 'test_loader' in globals() else None,
        "batch_train": train_loader.batch_size if 'train_loader' in globals() else None,
        "batch_eval":  test_loader.batch_size if 'test_loader' in globals() else None,
    },
    "curriculum": {
        "motor_unlock_meV": globals().get("MOTOR_UNLOCK_MEV", None),
        "motor_ramp_epochs": globals().get("MOTOR_RAMP_EPOCHS", None),
        "motor_max_strength": globals().get("MOTOR_MAX_STRENGTH", None),
        "lambda_motor_reg": globals().get("LAMBDA_MOTOR_REG", None),
        "scale_to_meV": globals().get("SCALE_TO_MEV", None),
        "epochs_completed": 24,  # update if you ran more
    },
    "optimizer": opt_info,
    "seeds": {
        "SEED": globals().get("SEED", None),
        "PYTHONHASHSEED": os.environ.get("PYTHONHASHSEED"),
    },
    "environment": {
        "python": sys.version,
        "torch": torch.__version__,
        "torch_geometric": pyg_version,
        "cwd": os.getcwd(),
        "device": str(next(model.parameters()).device),
    },
}

cfg_path = os.path.join(RUN_DIR, "config.json")
with open(cfg_path, "w") as f:
    json.dump(cfg, f, indent=2)
print("‚úÖ Saved config to:", cfg_path)

# ---- 5) Save metrics history if we have it in memory ----
# If your training loop kept a list `METRICS_HISTORY`, we snapshot it;
# otherwise we just skip politely.
try:
    METRICS_HISTORY  # just to see if it exists
    mh_path = os.path.join(RUN_DIR, "metrics_history.json")
    import copy
    with open(mh_path, "w") as f:
        json.dump(copy.deepcopy(METRICS_HISTORY), f, indent=2)
    print("‚úÖ Saved METRICS_HISTORY to:", mh_path)
except NameError:
    print("‚ÑπÔ∏è No METRICS_HISTORY variable found; skipping metrics JSON.")

# ---- 6) Capture git commit & diff for /content/QM9_project ----
try:
    qm9_dir = "/content/QM9_project"
    commit = subprocess.check_output(
        ["bash", "-lc", f"cd {qm9_dir} && git rev-parse HEAD"]
    ).decode().strip()
    with open(os.path.join(RUN_DIR, "git_commit.txt"), "w") as f:
        f.write(commit + "\n")

    diff = subprocess.check_output(
        ["bash", "-lc", f"cd {qm9_dir} && git diff"],
        timeout=10,
    ).decode()
    with open(os.path.join(RUN_DIR, "git_diff.patch"), "w") as f:
        f.write(diff)

    print("‚úÖ Saved git_commit.txt and git_diff.patch.")
except Exception as e:
    print("‚ö†Ô∏è Could not capture git info:", repr(e))

print("\n‚ú® Snapshot complete. This run is now reproducible from", RUN_DIR)


üìÅ Snapshot directory: /content/drive/MyDrive/GAHEAD_runs/RB3m_curriculum_U0_20251127_033245


NameError: name 'model' is not defined

In [3]:
# CELL - Robust snapshot of RB3m curriculum run (no crash if model is missing)

import os, json, time, sys, subprocess
import torch

try:
    import torch_geometric
    pyg_version = torch_geometric.__version__
except Exception:
    pyg_version = None

# Reuse the directory you already saw:
RUN_DIR = "/content/drive/MyDrive/GAHEAD_runs/RB3m_curriculum_U0_20251127_033245"
os.makedirs(RUN_DIR, exist_ok=True)
print("üìÅ Snapshot directory:", RUN_DIR)

# ---- 1) Save model weights if model is in memory ----
if "model" in globals():
    ckpt_path = os.path.join(RUN_DIR, "RB3m_curriculum_epXX.pt")  # rename XX if you know the epoch
    torch.save(model.state_dict(), ckpt_path)
    print("‚úÖ Saved model.state_dict() to:", ckpt_path)
else:
    print("‚ö† 'model' not found in this runtime; skipping direct weight save.")
    print("   If you have a checkpoint .pt file from training, you can copy it into this folder manually.")

# ---- 2) Try to grab basic optimizer info (if present) ----
opt_info = None
if "optimizer" in globals():
    opt = optimizer
    try:
        opt_info = {
            "type": opt.__class__.__name__,
            "lr": opt.param_groups[0].get("lr", None),
            "weight_decay": opt.param_groups[0].get("weight_decay", None),
            "betas": tuple(opt.param_groups[0].get("betas", (None, None))),
            "eps": opt.param_groups[0].get("eps", None),
        }
    except Exception:
        opt_info = {"type": opt.__class__.__name__}

# ---- 3) Build a config dict with whatever is available ----
def _len_or_none(obj_name):
    return len(globals()[obj_name].dataset) if obj_name in globals() else None

def _bs_or_none(obj_name):
    return globals()[obj_name].batch_size if obj_name in globals() else None

trainable_params = None
if "model" in globals():
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

cfg = {
    "description": "RB3m (~2.95M params) geometric transformer with motor curriculum on QM9 U0",
    "model": {
        "name": "V20_AGAA_Motor",
        "num_layers": 7,
        "d_model": 192,
        "n_heads": 16,
        "max_z": 100,
        "n_rbf": 20,
        "trainable_params": trainable_params,
    },
    "data": {
        "dataset": "QM9",
        "target": "U0",
        "target_index": 7,
        "root": "qm9_data",
        "train_size": _len_or_none("train_loader"),
        "val_size":   _len_or_none("val_loader"),
        "test_size":  _len_or_none("test_loader"),
        "batch_train": _bs_or_none("train_loader"),
        "batch_eval":  _bs_or_none("test_loader"),
    },
    "curriculum": {
        "motor_unlock_meV": globals().get("MOTOR_UNLOCK_MEV", None),
        "motor_ramp_epochs": globals().get("MOTOR_RAMP_EPOCHS", None),
        "motor_max_strength": globals().get("MOTOR_MAX_STRENGTH", None),
        "lambda_motor_reg": globals().get("LAMBDA_MOTOR_REG", None),
        "scale_to_meV": globals().get("SCALE_TO_MEV", None),
        "epochs_completed": 24,  # adjust if you know exact
    },
    "optimizer": opt_info,
    "seeds": {
        "SEED": globals().get("SEED", None),
        "PYTHONHASHSEED": os.environ.get("PYTHONHASHSEED"),
    },
    "environment": {
        "python": sys.version,
        "torch": torch.__version__,
        "torch_geometric": pyg_version,
        "cwd": os.getcwd(),
        "device": str(torch.device("cuda" if torch.cuda.is_available() else "cpu")),
    },
}

cfg_path = os.path.join(RUN_DIR, "config.json")
with open(cfg_path, "w") as f:
    json.dump(cfg, f, indent=2)
print("‚úÖ Saved config to:", cfg_path)

# ---- 4) Save metrics history if it exists ----
if "METRICS_HISTORY" in globals():
    import copy
    mh_path = os.path.join(RUN_DIR, "metrics_history.json")
    with open(mh_path, "w") as f:
        json.dump(copy.deepcopy(METRICS_HISTORY), f, indent=2)
    print("‚úÖ Saved METRICS_HISTORY to:", mh_path)
else:
    print("‚ÑπÔ∏è No METRICS_HISTORY in this runtime; skipping metrics JSON.")

# ---- 5) Capture git commit & diff for /content/QM9_project ----
try:
    qm9_dir = "/content/QM9_project"
    commit = subprocess.check_output(
        ["bash", "-lc", f"cd {qm9_dir} && git rev-parse HEAD"]
    ).decode().strip()
    with open(os.path.join(RUN_DIR, "git_commit.txt"), "w") as f:
        f.write(commit + "\n")

    diff = subprocess.check_output(
        ["bash", "-lc", f"cd {qm9_dir} && git diff"],
        timeout=10,
    ).decode()
    with open(os.path.join(RUN_DIR, "git_diff.patch"), "w") as f:
        f.write(diff)

    print("‚úÖ Saved git_commit.txt and git_diff.patch.")
except Exception as e:
    print("‚ö†Ô∏è Could not capture git info:", repr(e))

print("\n‚ú® Snapshot complete (as much as this runtime can see).")


üìÅ Snapshot directory: /content/drive/MyDrive/GAHEAD_runs/RB3m_curriculum_U0_20251127_033245
‚ö† 'model' not found in this runtime; skipping direct weight save.
   If you have a checkpoint .pt file from training, you can copy it into this folder manually.
‚úÖ Saved config to: /content/drive/MyDrive/GAHEAD_runs/RB3m_curriculum_U0_20251127_033245/config.json
‚ÑπÔ∏è No METRICS_HISTORY in this runtime; skipping metrics JSON.
‚ö†Ô∏è Could not capture git info: CalledProcessError(1, ['bash', '-lc', 'cd /content/QM9_project && git rev-parse HEAD'])

‚ú® Snapshot complete (as much as this runtime can see).
