In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%%capture
!pip install torch-geometric

# 3.1 Encoder (temporal per player)

In [14]:
# ============================================================
# üöÄ FULL PIPELINE: From raw tracking ‚Üí temporal embeddings
# ============================================================
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm
import torch
import torch.nn as nn

# ============================================================
# 1Ô∏è‚É£ Normalization
# ============================================================
def normalize_field_direction(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    mask_left = df["play_direction"].str.lower() == "left"

    # Flip x
    for col in ["x", "ball_land_x"]:
        if col in df.columns:
            df.loc[mask_left, col] = 120 - df.loc[mask_left, col]

    # Flip angles
    for ang_col in ["o", "dir"]:
        if ang_col in df.columns:
            df.loc[mask_left, ang_col] = (df.loc[mask_left, ang_col] + 180) % 360

    # Center y around midline (26.65)
    for col in ["y", "ball_land_y"]:
        if col in df.columns:
            df[col] = df[col] - 26.65

    return df

# ============================================================
# 2Ô∏è‚É£ Build per-player kinematics & contextual features
# ============================================================
def build_player_kinematics_features(df_input: pd.DataFrame, K: int = 10) -> pd.DataFrame:
    df = df_input.copy()
    df["abs_yardline_norm"] = df["absolute_yardline_number"] / 120.0
    df = df.sort_values(["game_id", "play_id", "nfl_id", "frame_id"])

    # Compute velocity
    df["vx"] = df.groupby(["game_id","play_id","nfl_id"])["x"].diff().fillna(0)
    df["vy"] = df.groupby(["game_id","play_id","nfl_id"])["y"].diff().fillna(0)

    # Select last K frames per player
    lastK = df.groupby(["game_id","play_id","nfl_id"], group_keys=False).apply(lambda g: g.tail(K))

    # Compute goal features at throw frame (first of last K)
    throw_frame = df.groupby(["game_id","play_id","nfl_id"], group_keys=False).apply(lambda g: g.tail(K).head(1))
    throw_frame["dx_land"] = throw_frame["ball_land_x"] - throw_frame["x"]
    throw_frame["dy_land"] = throw_frame["ball_land_y"] - throw_frame["y"]
    throw_frame["dist_land"] = np.hypot(throw_frame["dx_land"], throw_frame["dy_land"])
    throw_frame["az_land"] = np.degrees(np.arctan2(throw_frame["dy_land"], throw_frame["dx_land"]))
    goal_feats = throw_frame[["game_id","play_id","nfl_id","dx_land","dy_land","dist_land","az_land"]]

    # Aggregate kinematics
    kin_feats = (
        lastK.groupby(["game_id","play_id","nfl_id"])
        .agg({
            "x":["mean","std"], "y":["mean","std"],
            "vx":["mean","std"], "vy":["mean","std"],
            "s":["mean","std"], "a":["mean","std"],
            "dir":["mean"], "o":["mean"], "frame_id":["max"]
        })
        .reset_index()
    )
    kin_feats.columns = ["_".join(col).rstrip("_") for col in kin_feats.columns]
    feats = kin_feats.merge(goal_feats, on=["game_id","play_id","nfl_id"], how="left")

    # Add roster one-hots
    roster_cols = ["player_side","player_role","player_position"]
    roster = df[["game_id","play_id","nfl_id"] + roster_cols].drop_duplicates(["game_id","play_id","nfl_id"])
    feats = feats.merge(roster, on=["game_id","play_id","nfl_id"], how="left")
    feats = pd.get_dummies(feats, columns=roster_cols, prefix=roster_cols)

    # Game context
    context = (
        df.groupby(["game_id","play_id","nfl_id"], as_index=False)
        .agg(abs_yardline_norm=("abs_yardline_norm","mean"), total_frames=("frame_id","max"))
    )
    feats = feats.merge(context, on=["game_id","play_id","nfl_id"], how="left")
    feats["time_index_norm"] = feats["frame_id_max"] / feats["total_frames"]
    feats.drop(columns=["frame_id_max","total_frames"], inplace=True)
    return feats

# ============================================================
# 4Ô∏è‚É£ Temporal Encoder (GRU / Transformer)
# ============================================================
class TemporalTransformer(nn.Module):
    def __init__(self, in_dim=8, d_model=128, n_heads=4, n_layers=2, dropout=0.1):
        super().__init__()

        self.input_proj = nn.Linear(in_dim, d_model)

        self.pos_emb = nn.Parameter(torch.randn(1, 60, d_model))   # K up to 60

        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_model*4,
            dropout=dropout,
            batch_first=True,
            norm_first=True
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_layers)

    def forward(self, x):
        # x : (P, K, F)
        P,K,F = x.shape

        x = self.input_proj(x)

        # add positional embeddings: truncate or expand
        pos = self.pos_emb[:, :K, :]

        x = x + pos

        out = self.encoder(x)       # (P,K,D)

        # take last token
        return out[:, -1, :]        # (P,D)

# 3.2 Interaction block (multi-agent)

In [4]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

def build_interaction_graphs(df_input: pd.DataFrame, K: int = 6):
    """
    Build K-NN interaction graphs per play at the throw frame.
    Each graph connects every player to K nearest neighbors
    with edge features [dx, dy, dvx, dvy, ally_flag].

    Returns
    -------
    dict[(game_id, play_id)] = {
        "nodes": pd.DataFrame of player features,
        "edges": pd.DataFrame of edge features
    }
    """

    df = df_input.copy()

    # ------------------------------------------------------------------
    # üß≠ 1Ô∏è‚É£ Ensure velocity columns exist
    # ------------------------------------------------------------------
    if "vx" not in df.columns or "vy" not in df.columns:
        df = df.sort_values(["game_id", "play_id", "nfl_id", "frame_id"]).copy()
        df["vx"] = df.groupby(["game_id", "play_id", "nfl_id"])["x"].diff().fillna(0)
        df["vy"] = df.groupby(["game_id", "play_id", "nfl_id"])["y"].diff().fillna(0)

    # ------------------------------------------------------------------
    # üïê 2Ô∏è‚É£ Extract throw frame (last frame of input for each player)
    # ------------------------------------------------------------------
    throw_frame = (
        df.groupby(["game_id", "play_id", "nfl_id"], group_keys=False)
          .apply(lambda g: g.tail(1))
          .reset_index(drop=True)
    )

    # ------------------------------------------------------------------
    # üß© 3Ô∏è‚É£ Build graph per play
    # ------------------------------------------------------------------
    graphs = {}
    plays = throw_frame.groupby(["game_id", "play_id"])

    for (gid, pid), play_df in tqdm(plays, desc="Building KNN graphs per play"):

        # node features (one per player)
        nodes = play_df[
            ["nfl_id", "x", "y", "vx", "vy", "player_side", "player_role"]
        ].reset_index(drop=True)

        coords = nodes[["x", "y"]].values

        if len(nodes) < 2:
            continue  # skip incomplete plays

        # Fit KNN (K+1 to include self, drop self edge later)
        nbrs = NearestNeighbors(
            n_neighbors=min(K + 1, len(nodes)),
            algorithm="ball_tree"
        ).fit(coords)
        distances, indices = nbrs.kneighbors(coords)

        edge_records = []
        for i, nbr_idxs in enumerate(indices):
            for j in nbr_idxs[1:]:  # skip self
                src = nodes.iloc[i]
                dst = nodes.iloc[j]
                dx  = dst["x"]  - src["x"]
                dy  = dst["y"]  - src["y"]
                dvx = dst["vx"] - src["vx"]
                dvy = dst["vy"] - src["vy"]

                # NEW FEATURES
                dist = np.sqrt(dx*dx + dy*dy + 1e-6)        # distance magnitude
                dv   = np.sqrt(dvx*dvx + dvy*dvy + 1e-6)    # relative speed magnitude
                bearing = np.arctan2(dy, dx)
                cos_bear = np.cos(bearing)
                sin_bear = np.sin(bearing)

                edge_records.append({
                    "src_id": src["nfl_id"],
                    "dst_id": dst["nfl_id"],
                    "dx": dx,
                    "dy": dy,
                    "dvx": dvx,
                    "dvy": dvy,

                    # NEW
                    "dist": dist,
                    "dv": dv,
                    "cos_bear": cos_bear,
                    "sin_bear": sin_bear,

                    "ally_flag": 1 if src["player_side"] == dst["player_side"] else 0
                })


        edges = pd.DataFrame(edge_records)
        graphs[(gid, pid)] = {"nodes": nodes, "edges": edges}

    return graphs

In [15]:
class SpatialTransformer(nn.Module):
    def __init__(self, d_model=128, n_heads=4, n_layers=2, dropout=0.1):
        super().__init__()

        layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_model*4,
            dropout=dropout,
            batch_first=True,
            norm_first=True
        )
        self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers)

        # project edge_attr ‚Üí attention bias
        self.edge_proj = nn.Linear(9, d_model)

    def forward(self, h_nodes, edge_index, edge_attr):
        """
        h_nodes    : (P,D)
        edge_index : (2,E)
        edge_attr  : (E,9)
        We convert edges ‚Üí full (P,P,d_model) bias matrix.
        """

        P = h_nodes.size(0)
        device = h_nodes.device

        # build full pairwise bias matrix
        bias = torch.zeros(P, P, h_nodes.size(1), device=device)

        src, dst = edge_index
        e = self.edge_proj(edge_attr)         # (E,D)
        bias[src, dst] = e                    # direct fill-in

        # convert to additive attention bias
        # flatten into (1,P,P,D)
        bias = bias.unsqueeze(0)

        # Transformer encoder supports "src_mask" but not arbitrary bias.
        # So we fold bias into embeddings:
        h = h_nodes.unsqueeze(0)              # (1,P,D)
        h = h + bias.mean(dim=2)              # aggregate bias per node

        h = self.encoder(h)                   # (1,P,D)

        return h.squeeze(0)

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATv2Conv
from sklearn.neighbors import NearestNeighbors
import numpy as np

# 3.3 Role-specific adapters

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# ============================================================
# üß© 3.3 Role-Specific Adapter Module
# ============================================================

class RoleSpecificAdapters(nn.Module):
    """
    Each player_role (e.g., Targeted WR, Coverage DB, Passer, Other)
    gets its own small MLP adapter that reshapes the 128-D context
    embedding into a role-specialized representation.

    Input : h_context  ‚Üí (N_players, embed_dim)
            role_ids   ‚Üí (N_players,)  integers 0..N_roles-1
    Output: h_adapted  ‚Üí (N_players, embed_dim)
    """
    def __init__(self, embed_dim=128, hidden_dim=128, role_names=None):
        super().__init__()
        if role_names is None:
            role_names = ["Targeted Receiver", "Defensive Coverage", "Passer", "Other Route Runner"]
        self.role_names = role_names
        self.n_roles = len(role_names)

        # small MLP adapter per role
        self.adapters = nn.ModuleDict({
            name: nn.Sequential(
                nn.Linear(embed_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, embed_dim),
                nn.LayerNorm(embed_dim)
            )
            for name in role_names
        })

        # shared fallback (for unseen / undefined roles)
        self.default_adapter = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, embed_dim),
            nn.LayerNorm(embed_dim)
        )

    def forward(self, h_context, role_ids, role_mapping):
        """
        h_context : (N, D)
        role_ids  : list/series of textual roles matching role_mapping keys
        role_mapping : {role_name : idx}
        """
        outputs = []
        for i, r in enumerate(role_ids):
            role_name = None
            # reverse-map index to string
            if isinstance(r, (int, np.integer)):
                # find key by index
                for k,v in role_mapping.items():
                    if v == r:
                        role_name = k
                        break
            else:
                role_name = r

            if role_name in self.adapters:
                out = self.adapters[role_name](h_context[i])
            else:
                out = self.default_adapter(h_context[i])
            outputs.append(out)

        return torch.stack(outputs, dim=0)

# 3.4 Two-stream decoder (direct multi-horizon residuals)

In [8]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

# ----------------------------
# Helpers: time embeddings
# ----------------------------
class TimeEmbedding(nn.Module):
    """
    Sinusoidal + learned projection for tau in [0,1].
    """
    def __init__(self, emb_dim=64, n_freq=8):
        super().__init__()
        self.n_freq = n_freq
        self.proj = nn.Linear(2*n_freq, emb_dim)

    def forward(self, tau):  # tau: (P, T) in [0,1]
        P, T = tau.shape
        device = tau.device
        # [P, T, 2*n_freq]
        freqs = torch.arange(self.n_freq, device=device).float()  # 0..n-1
        ang = tau.unsqueeze(-1) * (2.0 * np.pi * (freqs + 1.0))   # avoid 0 freq
        sin = torch.sin(ang)
        cos = torch.cos(ang)
        feats = torch.cat([sin, cos], dim=-1)                     # (P, T, 2*n_freq)
        return self.proj(feats)                                   # (P, T, emb_dim)


# ----------------------------
# Two-stream decoder
# ----------------------------
class TwoStreamDecoder(nn.Module):
    """
    Stream A (Goal-drift): drives motion toward (ball_land_x, ball_land_y).
    Stream B (Interaction correction): local evasive/collision adjustments from context.

    Inputs per play:
      h_role: (P, D)     role-specific embeddings from 3.3
      goal_feat: (P, G)  per-player goal features [dx0, dy0, dist0, ux, uy]
      tau_seq: (P, T)    time-to-land values in [0,1] per player
      horizon: (P,) long per-player num_frames_output (1..N_max)
    Outputs:
      dxy: (P, T, 2)     residuals Œîx,Œîy relative to last input frame
      mask: (P, T)       1 within horizon, 0 after
    """
    def __init__(self, d_model=128, time_dim=64, goal_dim=5, hidden=256, N_max=30):
        super().__init__()
        self.goal_dim = goal_dim
        self.N_max = N_max
        self.time_emb = TimeEmbedding(emb_dim=time_dim, n_freq=8)

        inA = d_model + time_dim + goal_dim
        inB = d_model + time_dim

        # Stream A: smooth drift toward goal
        self.streamA = nn.Sequential(
            nn.Linear(inA, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 2)  # Œîx, Œîy per step
        )

        # Stream B: local interaction correction
        self.streamB = nn.Sequential(
            nn.Linear(inB, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 2)
        )

    def forward(self, h_role, goal_feat, tau_seq, horizon):
        """
        h_role:   (P, D)
        goal_feat:(P, 5)  = [dx0, dy0, dist0, ux, uy]
        tau_seq:  (P, T)
        horizon:  (P,)
        """
        P, D = h_role.shape
        T = tau_seq.shape[1]

        # ‚úÖ Allow variable horizon lengths (for curriculum training)
        if T != self.N_max:
            # ensure consistent device, dtype, and leading dimension P
            pad_len = max(self.N_max - T, 0)
            if pad_len > 0:
                pad = torch.ones(
                    (P, pad_len),
                    dtype=tau_seq.dtype,
                    device=tau_seq.device
                )
                tau_seq = torch.cat([tau_seq, pad], dim=1)
            elif T > self.N_max:
                tau_seq = tau_seq[:, :self.N_max]
            T = self.N_max


        # time embedding
        t_emb = self.time_emb(tau_seq)             # (P, T, time_dim)

        # expand static inputs across time
        h_rep = h_role.unsqueeze(1).expand(P, T, D)        # (P, T, D)
        g_rep = goal_feat.unsqueeze(1).expand(P, T, goal_feat.size(1))  # (P, T, 5)

        # stream A: goal drift
        a_in = torch.cat([h_rep, t_emb, g_rep], dim=-1)    # (P, T, D+time_dim+5)
        dA = self.streamA(a_in)                            # (P, T, 2)

        # stream B: interaction correction
        b_in = torch.cat([h_rep, t_emb], dim=-1)           # (P, T, D+time_dim)
        dB = self.streamB(b_in)                            # (P, T, 2)

        dxy = dA + dB                                      # (P, T, 2)

        # horizon mask: 1..H_i active
        device = h_role.device
        t_idx = torch.arange(T, device=device).unsqueeze(0).expand(P, T)  # 0..T-1
        mask = (t_idx < horizon.unsqueeze(1)).float()                     # (P, T)

        return dxy, mask


# ----------------------------
# Per-play tensor builder
# ----------------------------
def prepare_play_decoder_inputs(play_graph_nodes: pd.DataFrame,
                                df_in_norm: pd.DataFrame,
                                game_id: int, play_id: int,
                                N_max: int = 30,
                                device: str = "cpu"):
    """
    Builds inputs for TwoStreamDecoder from one play.

    Returns:
      x0y0:      (P, 2) last input frame positions (for later reconstruction)
      h0_goal:   (P, 5) [dx0, dy0, dist0, ux, uy]
      tau_seq:   (P, N_max) tau = t / num_frames_output (clipped to 1)
      horizon:   (P,)  long
      (Plus convenience dict with ball_land per play)
    """
    nodes = play_graph_nodes.reset_index(drop=True).copy()  # needs columns: nfl_id,x,y,player_role,etc.

    # last input position per player is already in nodes['x','y'] from 1.3 throw-frame snapshot
    x0y0 = torch.tensor(nodes[["x", "y"]].to_numpy(), dtype=torch.float32, device=device)  # (P,2)

    # ball landing (per play) from df_in_norm (any row of this play has same ball_land)
    play_rows = df_in_norm[(df_in_norm.game_id == game_id) & (df_in_norm.play_id == play_id)]
    bx = float(play_rows["ball_land_x"].iloc[-1])
    by = float(play_rows["ball_land_y"].iloc[-1])

    # goal vector at throw
    dx0 = torch.tensor((bx - nodes["x"]).to_numpy(), dtype=torch.float32, device=device)
    dy0 = torch.tensor((by - nodes["y"]).to_numpy(), dtype=torch.float32, device=device)
    dist0 = torch.sqrt(dx0**2 + dy0**2) + 1e-6
    ux = dx0 / dist0
    uy = dy0 / dist0
    h0_goal = torch.stack([dx0, dy0, dist0, ux, uy], dim=-1)  # (P,5)

    # per-player horizon from input table
    # num_frames_output is per (game,play,nfl). Take the last input row per player.
    horizon_np = (
        play_rows.sort_values(["nfl_id","frame_id"])
                 .groupby("nfl_id")["num_frames_output"]
                 .last()
                 .reindex(nodes["nfl_id"])
                 .fillna(0).to_numpy(dtype=np.int64)
    )
    horizon = torch.tensor(np.minimum(horizon_np, N_max), dtype=torch.long, device=device)  # (P,)

    # tau sequence per player (P, T)
    T = N_max
    t_grid = torch.arange(1, T+1, device=device).float().unsqueeze(0).expand(len(nodes), T)  # 1..T
    denom = torch.clamp(horizon.unsqueeze(1).float(), min=1.0)
    tau_seq = torch.clamp(t_grid / denom, max=1.0)  # (P,T) in [0,1]

    meta = dict(ball_land=(bx, by))
    return x0y0, h0_goal, tau_seq, horizon, meta


# ----------------------------
# Example usage for one play
# ----------------------------
# Given from earlier steps:
# - graphs : dict[(gid,pid)] ‚Üí {"nodes": df_nodes, "edges": df_edges}
# - embeds after 3.2 & 3.3: h_role for this play (torch, shape (P, 128))
# - df_in_norm: normalized inputs for the week

def run_decoder_for_one_play(graphs, df_in_norm, h_role_dict_for_play,
                             game_id, play_id, N_max=30, device="cpu"):
    """
    h_role_dict_for_play: map nfl_id -> 128-d torch tensor for that play,
                          or a stacked tensor aligned with graphs[(gid,pid)]["nodes"]
    """
    play_nodes = graphs[(game_id, play_id)]["nodes"]  # DataFrame with nfl_id,x,y,...

    # Ensure h_role is aligned with node order
    if isinstance(h_role_dict_for_play, torch.Tensor):
        h_role = h_role_dict_for_play.to(device)  # assume already aligned (P,128)
    else:
        rows = []
        for nid in play_nodes["nfl_id"].tolist():
            rows.append(h_role_dict_for_play[nid].unsqueeze(0))
        h_role = torch.cat(rows, dim=0).to(device)  # (P,128)

    x0y0, goal_feat, tau_seq, horizon, meta = prepare_play_decoder_inputs(
        play_nodes, df_in_norm, game_id, play_id, N_max=N_max, device=device
    )

    decoder = TwoStreamDecoder(d_model=h_role.size(1), time_dim=64, goal_dim=5, hidden=256, N_max=N_max).to(device)
    dxy, mask = decoder(h_role, goal_feat, tau_seq, horizon)   # (P,T,2), (P,T)

    # reconstruct absolute positions if desired
    # cumulative residuals from last input pos
    cum_dxy = torch.cumsum(dxy, dim=1)                         # (P,T,2)
    xy_pred = x0y0.unsqueeze(1) + cum_dxy                      # (P,T,2)
    # apply mask (zero out invalid timesteps)
    xy_pred = xy_pred * mask.unsqueeze(-1)

    return dict(
        dxy=dxy, mask=mask, xy_pred=xy_pred, x0y0=x0y0, goal_feat=goal_feat,
        tau_seq=tau_seq, horizon=horizon, ball_land=meta["ball_land"]
    )

# 3.5 Huber Loss

In [9]:
class TemporalHuber(nn.Module):
    def __init__(self, delta=0.5, time_decay=0.03):
        super().__init__()
        self.delta = delta
        self.time_decay = time_decay

    def forward(self, pred, target, mask):
        err = pred - target
        abs_err = torch.abs(err)

        huber = torch.where(
            abs_err <= self.delta,
            0.5 * err * err,
            self.delta * (abs_err - 0.5 * self.delta)
        )

        if self.time_decay > 0:
            L = pred.size(1)
            t = torch.arange(L, device=pred.device).float()
            weight = torch.exp(-self.time_decay * t).view(1, L, 1)
            huber = huber * weight
            mask = mask.unsqueeze(-1) * weight

        return (huber * mask).sum() / (mask.sum() + 1e-8)

In [10]:
def normalize_output_like_input(df_out: pd.DataFrame, df_in: pd.DataFrame) -> pd.DataFrame:
    """Flip output x for left plays and center y, using play_direction from input."""
    df_out = df_out.copy()
    dir_map = (df_in[["game_id","play_id","play_direction"]]
               .drop_duplicates()
               .assign(is_left=lambda d: d["play_direction"].str.lower()=="left")
               .drop(columns="play_direction"))
    df_out = df_out.merge(dir_map, on=["game_id","play_id"], how="left")
    df_out.loc[df_out["is_left"]==True, "x"] = 120 - df_out.loc[df_out["is_left"]==True, "x"]
    df_out["y"] = df_out["y"] - 26.65
    return df_out.drop(columns=["is_left"])

In [11]:
import pandas as pd

data_path = "/content/drive/MyDrive/NFL Big Data Bowl 2026/nfl-big-data-bowl-2026-prediction/train"
weeks = ["w01", "w02" , "w03", "w04", "w05", "w06" , "w07", "w08", "w09", "w10", "w11" , "w12", "w13", "w14", "w15", "w16", "w17", "w18"]

df_in_list, df_out_list = [], []

for w in weeks:
    f_in = f"{data_path}/input_2023_{w}.csv"
    f_out = f"{data_path}/output_2023_{w}.csv"
    print(f"üìÑ Loading week {w} ...")
    df_in_list.append(pd.read_csv(f_in))
    df_out_list.append(pd.read_csv(f_out))

df_in_raw  = pd.concat(df_in_list, ignore_index=True)
df_out_raw = pd.concat(df_out_list, ignore_index=True)

print(f"‚úÖ Combined input shape:  {df_in_raw.shape}")
print(f"‚úÖ Combined output shape: {df_out_raw.shape}")

keys = ["game_id","play_id","nfl_id"]
common = (
    df_out_raw[keys].drop_duplicates()
    .merge(df_in_raw[keys].drop_duplicates(), on=keys, how="inner")
)
df_in_raw  = df_in_raw.merge(common, on=keys, how="inner")
df_out_raw = df_out_raw.merge(common, on=keys, how="inner")

df_in_norm  = normalize_field_direction(df_in_raw)
df_out_norm = normalize_output_like_input(df_out_raw, df_in_raw)

df_in_norm = df_in_norm.sort_values(["game_id", "play_id", "nfl_id", "frame_id"])
df_in_norm["vx"] = df_in_norm.groupby(["game_id","play_id","nfl_id"])["x"].diff().fillna(0)
df_in_norm["vy"] = df_in_norm.groupby(["game_id","play_id","nfl_id"])["y"].diff().fillna(0)

graphs = build_interaction_graphs(df_in_norm, K=6)
# /content/drive/MyDrive/NFL Big Data Bowl 2026/nfl-big-data-bowl-2026-prediction/train/input_2023_w01.csv
print("‚úÖ Graphs built successfully:", len(graphs))

üìÑ Loading week w01 ...
üìÑ Loading week w02 ...
üìÑ Loading week w03 ...
üìÑ Loading week w04 ...
üìÑ Loading week w05 ...
üìÑ Loading week w06 ...
üìÑ Loading week w07 ...
üìÑ Loading week w08 ...
üìÑ Loading week w09 ...
üìÑ Loading week w10 ...
üìÑ Loading week w11 ...
üìÑ Loading week w12 ...
üìÑ Loading week w13 ...
üìÑ Loading week w14 ...
üìÑ Loading week w15 ...
üìÑ Loading week w16 ...
üìÑ Loading week w17 ...
üìÑ Loading week w18 ...
‚úÖ Combined input shape:  (4880579, 23)
‚úÖ Combined output shape: (562936, 6)


  .apply(lambda g: g.tail(1))
Building KNN graphs per play: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14108/14108 [01:05<00:00, 213.83it/s]


‚úÖ Graphs built successfully: 12966


# 4 - Training recipe (stable, within budget)

In [12]:
def precompute_graph_tensors(graphs, df_out_norm, df_in_norm, N_max=30):
    graphs_fast = {}

    for (gid, pid), g in graphs.items():

        nodes = g["nodes"].sort_values("nfl_id").reset_index(drop=True)
        edges = g["edges"]

        player_ids = nodes["nfl_id"].tolist()
        P = len(player_ids)

        # -------- node xy --------
        node_xy = torch.tensor(nodes[["x","y"]].values, dtype=torch.float32)

        # -------- roles ----------
        roles = nodes["player_role"].tolist()

        # -------- edges ----------
        id_new = {nid:i for i,nid in enumerate(player_ids)}
        edges = edges[edges["src_id"].isin(id_new) & edges["dst_id"].isin(id_new)]

        src_idx = edges["src_id"].map(id_new).to_numpy()
        dst_idx = edges["dst_id"].map(id_new).to_numpy()

        edge_index = torch.tensor([src_idx, dst_idx], dtype=torch.long)
        edge_attr  = torch.tensor(
            edges[["dx","dy","dvx","dvy","dist","dv","cos_bear","sin_bear","ally_flag"]]
            .to_numpy(np.float32)
        )

        # -------- global context --------
        global_ctx = torch.tensor([
            nodes["x"].mean(),
            nodes["y"].mean(),
            0.42
        ], dtype=torch.float32)

        # -------- PRECOMPUTE XY_TRUE --------
        df_out = df_out_norm[(df_out_norm.game_id==gid)&(df_out_norm.play_id==pid)]
        df_out = df_out.sort_values(["nfl_id","frame_id"])
        T = N_max
        xy_true = np.zeros((P, N_max, 2), dtype=np.float32)

        for i,nid in enumerate(player_ids):
            # extract all future frames
            sub = df_out[df_out["nfl_id"]==nid][["x","y"]].to_numpy(np.float32)

            # pad or trim to N_max
            if sub.shape[0] < N_max:
                pad = np.repeat(sub[-1:], N_max - sub.shape[0], axis=0)
                sub = np.vstack([sub, pad])
            else:
                sub = sub[:N_max]

            xy_true[i] = sub

        xy_true = torch.tensor(xy_true, dtype=torch.float32)

        # -------- PRECOMPUTE DECODER INPUTS (goal_feat, tau_seq, horizon) --------
        x0y0, goal_feat, tau_seq, horizon, meta = prepare_play_decoder_inputs(
            nodes, df_in_norm, gid, pid, N_max=N_max, device="cpu"
        )
        # x0y0 should match node_xy; we keep node_xy as-is

        graphs_fast[(gid,pid)] = {
            "player_ids": player_ids,
            "node_xy": node_xy,              # (P,2)
            "roles": roles,                  # list len P
            "edge_index": edge_index,        # (2,E)
            "edge_attr": edge_attr,          # (E,9)
            "global_ctx": global_ctx,        # (3,)
            "xy_true": xy_true,              # (P,T,2)
            "goal_feat": goal_feat,          # (P,5)
            "tau_seq": tau_seq,              # (P,N_max)
            "horizon": horizon,              # (P,)
        }

    return graphs_fast

graphs_fast = precompute_graph_tensors(graphs, df_out_norm, df_in_norm, N_max=30)


  edge_index = torch.tensor([src_idx, dst_idx], dtype=torch.long)


In [17]:
# =========================
# üîß Utilities / Seeding
# =========================
import os, random, math
import numpy as np
import torch
import time
from torch import nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import SequentialLR, LinearLR, CosineAnnealingLR
from contextlib import nullcontext

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def set_seed(seed: int):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed); torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# =========================================================
# üì¶ Dataset that batches plays (variable players/frames)
# =========================================================
class PlayDataset(Dataset):
    def __init__(self, df_in_norm, df_out_norm, graphs, K=10, features=("x","y","vx","vy","s","a","dir","o")):
        self.df_in = df_in_norm
        self.df_out = df_out_norm
        self.graphs = graphs
        self.K = K
        self.features = list(features)
        # index as list of (gid, pid)
        self.index = list(graphs.keys())

    def __len__(self):
        return len(self.index)

    def _build_hist_with_temporal_dropout(self, gid, pid, player_ids, drop_last=0):
        g = self.df_in[(self.df_in.game_id==gid) & (self.df_in.play_id==pid)]

        P = len(player_ids)
        F = len(self.features)
        K = self.K

        x_hist = torch.zeros((P, K, F), dtype=torch.float32)

        for pi, nid in enumerate(player_ids):
            gp = g[g["nfl_id"] == nid].sort_values("frame_id").tail(K)
            arr = gp[self.features].to_numpy(np.float32)

            if arr.shape[0] < K:
                pad = np.zeros((K - arr.shape[0], F), dtype=np.float32)
                arr = np.vstack([pad, arr])

            if drop_last > 0:
                arr[-drop_last:] = 0.0

            x_hist[pi] = torch.from_numpy(arr)

        return x_hist

    def __getitem__(self, idx):
        gid, pid = self.index[idx]
        drop_last = np.random.randint(0, 4)
        graph = self.graphs[(gid, pid)]
        player_ids = graph["player_ids"]
        x_hist = self._build_hist_with_temporal_dropout(gid, pid, player_ids, drop_last)

        return {
            "gid": gid,
            "pid": pid,
            "x_hist": x_hist,         # (P,K,F)
            "player_ids": player_ids, # list length P
            "graph": graph            # already precomputed graphs_fast entry
        }


def play_collate(batch):
    """
    batch: list of length B, each item is a dict from __getitem__.
    We pack all players across all plays into one big batch dimension P_tot.
    """
    # For clarity
    x_hist_list = []
    node_xy_list = []
    edge_index_list = []
    edge_attr_list = []
    goal_feat_list = []
    tau_seq_list = []
    horizon_list = []
    xy_true_list = []
    roles_all = []
    global_ctx_list = []
    play_ids = []  # (gid,pid) per play
    play_ptr = []  # boundaries in node dimension

    offset = 0
    for item in batch:
        g = item["graph"]
        P_i = len(g["player_ids"])

        # history
        x_hist_list.append(item["x_hist"])              # (P_i,K,F)

        # graph stuff
        node_xy_list.append(g["node_xy"])               # (P_i,2)
        edge_index_i = g["edge_index"] + offset         # (2,E_i) reindexed
        edge_index_list.append(edge_index_i)
        edge_attr_list.append(g["edge_attr"])           # (E_i,9)

        # decoder inputs
        goal_feat_list.append(g["goal_feat"])           # (P_i,5)
        tau_seq_list.append(g["tau_seq"])               # (P_i,N_max)
        horizon_list.append(g["horizon"])               # (P_i,)

        # targets
        xy_true_list.append(g["xy_true"])               # (P_i,T,2)

        # roles and global ctx
        roles_all.extend(g["roles"])                    # flat list len P_tot
        global_ctx_list.append(g["global_ctx"])         # (3,)

        play_ids.append((item["gid"], item["pid"]))
        play_ptr.append(offset)

        offset += P_i

    # concat everything
    x_hist   = torch.cat(x_hist_list, dim=0)            # (P_tot,K,F)
    node_xy  = torch.cat(node_xy_list, dim=0)           # (P_tot,2)
    edge_index = torch.cat(edge_index_list, dim=1)      # (2,E_tot)
    edge_attr  = torch.cat(edge_attr_list, dim=0)       # (E_tot,9)
    goal_feat  = torch.cat(goal_feat_list, dim=0)       # (P_tot,5)
    tau_seq    = torch.cat(tau_seq_list, dim=0)         # (P_tot,N_max)
    horizon    = torch.cat(horizon_list, dim=0)         # (P_tot,)
    xy_true    = torch.cat(xy_true_list, dim=0)         # (P_tot,T,2)
    global_ctx_batch = torch.stack(global_ctx_list, dim=0)  # (B,3)
    # simple aggregate: mean ctx across plays (or you could ignore)
    global_ctx = global_ctx_batch.mean(dim=0)           # (3,)

    return {
        "x_hist": x_hist,                   # (P_tot,K,F)
        "node_xy": node_xy,                 # (P_tot,2)
        "edge_index": edge_index,           # (2,E_tot)
        "edge_attr": edge_attr,             # (E_tot,9)
        "goal_feat": goal_feat,             # (P_tot,5)
        "tau_seq": tau_seq,                 # (P_tot,N_max)
        "horizon": horizon,                 # (P_tot,)
        "xy_true": xy_true,                 # (P_tot,T,2)
        "roles": roles_all,                 # list len P_tot
        "global_ctx": global_ctx,           # (3,)
        "play_ids": play_ids,               # list of (gid,pid)
        "play_ptr": play_ptr,               # list of starting indices per play
    }

# =========================================================
# üß† End-to-end model wrapper (enc + GAT + role adapters + dec)
# =========================================================
class End2EndModel(nn.Module):
    def __init__(self, in_dim=8, embed_dim=128, d_model=128, time_dim=64, goal_dim=5,
                 hidden=256, N_max=30, gat_edge_dim=9, gat_heads=4, gat_layers=2,
                 role_names=("Targeted Receiver", "Defensive Coverage", "Passer", "Other Route Runner")):
        super().__init__()
        # Keep param count ~3‚Äì5M by modest dims
        self.encoder = TemporalTransformer(in_dim=in_dim, d_model=embed_dim)
        self.spatial = SpatialTransformer(d_model=embed_dim)
        self.adapters = RoleSpecificAdapters(embed_dim=embed_dim, hidden_dim=embed_dim,
                                             role_names=list(role_names))
        self.decoder = TwoStreamDecoder(d_model=embed_dim, time_dim=time_dim,
                                        goal_dim=goal_dim, hidden=hidden, N_max=N_max)
        self.role_names = list(role_names)
        self.role_map = {r:i for i,r in enumerate(self.role_names)}
        self.N_max = N_max

    def forward_batch(self, batch, N_max_curr):
        """
        batch: dict from play_collate
        Processes all players from all plays in one shot.
        """
        # unpack & move to device
        x_hist   = batch["x_hist"].to(DEVICE).float()          # (P_tot,K,F)
        node_xy  = batch["node_xy"].to(DEVICE).float()         # (P_tot,2)
        edge_index = batch["edge_index"].to(DEVICE).long()     # (2,E_tot)
        edge_attr  = batch["edge_attr"].to(DEVICE).float()     # (E_tot,9)
        goal_feat  = batch["goal_feat"].to(DEVICE).float()     # (P_tot,5)
        tau_seq    = batch["tau_seq"][:, :N_max_curr].to(DEVICE).float()  # (P_tot,T)
        horizon    = torch.clamp(batch["horizon"], max=N_max_curr).to(DEVICE).long()  # (P_tot,)
        global_ctx = batch["global_ctx"].to(DEVICE).float()    # (3,)
        roles      = batch["roles"]                            # list len P_tot

        h_nodes = self.encoder(x_hist)            # (P,D)
        h_ctx = self.spatial(h_nodes, edge_index, edge_attr)

        # role adapters
        h_role = self.adapters(h_ctx, roles, self.role_map)    # (P_tot,D)

        # decoder
        dxy, mask = self.decoder(h_role, goal_feat, tau_seq, horizon)  # (P_tot,T,2),(P_tot,T)

        xy_pred = node_xy.unsqueeze(1) + torch.cumsum(dxy, dim=1)
        xy_pred = xy_pred * mask.unsqueeze(-1)

        return xy_pred, mask

# =========================================================
# üöÄ Full-data training version (no validation split)
# =========================================================

def train_loop_full_data(df_in_norm, df_out_norm, graphs_fast,
                         K=10, N_max=30, batch_size=12, num_workers=0,
                         lr=2e-4, wd=1e-4,
                         warmup_epochs=2, total_epochs=70,
                         pretrain_epochs=20):

    # ====================================================
    # üì¶ Dataset: use ALL plays for training
    # ====================================================
    ds_full = PlayDataset(df_in_norm, df_out_norm, graphs_fast, K=K)
    dl_train = DataLoader(ds_full, batch_size=batch_size, shuffle=True,
                          num_workers=num_workers, collate_fn=play_collate, drop_last=False)

    print(f"üìä Dataset: {len(ds_full)} plays for training (no validation split)")

    # ====================================================
    # ‚öôÔ∏è Model + optimizer + schedulers
    # ====================================================
    model = End2EndModel(in_dim=8, embed_dim=128, d_model=128,
                         time_dim=64, goal_dim=5, hidden=256,
                         N_max=N_max).to(DEVICE)

    criterion = TemporalHuber(delta=0.5, time_decay=0.03)
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=wd)

    cos = CosineAnnealingLR(optimizer, T_max=total_epochs - warmup_epochs, eta_min=0.0)
    warm = LinearLR(optimizer, start_factor=1e-3, end_factor=1.0, total_iters=warmup_epochs)
    scheduler = SequentialLR(optimizer, schedulers=[warm, cos], milestones=[warmup_epochs])

    scaler = torch.amp.GradScaler('cuda', enabled=(DEVICE=="cuda"))
    loss_history = []
    # ====================================================
    # üèãÔ∏è Training loop (full data)
    # ====================================================
    for epoch in range(1, total_epochs + 1):
        start_time = time.time()
        model.train()
        N_curr = 10 if epoch <= pretrain_epochs else N_max
        epoch_loss, n_plays = 0.0, 0

        # timing accumulators
        t_forward = 0.0
        t_build_xy = 0.0
        t_loss = 0.0
        t_backward = 0.0
        t_batch = 0.0

        amp_ctx = torch.amp.autocast('cuda') if DEVICE == "cuda" else nullcontext()
        with amp_ctx:
            for batch in dl_train:
                batch_start = time.time()
                optimizer.zero_grad(set_to_none=True)

                # 1) Forward (batched)
                t0 = time.time()
                xy_pred, mask = model.forward_batch(batch, N_max_curr=N_curr)
                t_forward += time.time() - t0

                # 2) Build batched xy_true
                t1 = time.time()
                xy_true = batch["xy_true"][:, :N_curr].to(DEVICE)   # (P_tot,T,2)
                t_build_xy += time.time() - t1

                # 3) Loss
                t2 = time.time()
                T_match = min(xy_pred.shape[1], xy_true.shape[1])
                xy_pred_ = xy_pred[:, :T_match, :]
                xy_true_ = xy_true[:, :T_match, :]
                mask_    = mask[:, :T_match]

                loss_val = criterion(xy_pred_, xy_true_, mask_)
                n_plays += len(batch["play_ids"])   # count plays
                t_loss += time.time() - t2

                # 4) Backward + optimizer
                t3 = time.time()
                scaler.scale(loss_val).backward()
                scaler.step(optimizer)
                scaler.update()
                t_backward += time.time() - t3

                t_batch += time.time() - batch_start
                epoch_loss += float(loss_val.detach())

        scheduler.step()
        train_loss = epoch_loss / max(1, n_plays)
        loss_history.append(train_loss)

        print(f"Epoch {epoch:03d} | N_max={N_curr:02d} | LR={scheduler.get_last_lr()[0]:.6f} | "
              f"TrainLoss={train_loss:.4f}")
        print(f"Epoch {epoch:03d} completed in {time.time() - start_time:.2f} s")
        print(f"  forward_one_play:   {t_forward:.2f} s")
        print(f"  build_xy_true:      {t_build_xy:.2f} s")
        print(f"  loss compute:        {t_loss:.2f} s")
        print(f"  backward+optim:      {t_backward:.2f} s")
        print(f"  batch loop total:    {t_batch:.2f} s\n")


    return model, loss_history

for seed in [3, 4, 5]:
    set_seed(seed)
    model, _ = train_loop_full_data(
        df_in_norm, df_out_norm, graphs_fast,
        K=10, N_max=30, batch_size=16,
        lr=1e-4, wd=1e-4,
        warmup_epochs=3, total_epochs=40, pretrain_epochs=3
    )
    torch.save(model.state_dict(), f"/content/final_model_seed{seed}.pt")

üìä Dataset: 12966 plays for training (no validation split)




Epoch 001 | N_max=10 | LR=0.000033 | TrainLoss=0.7815
Epoch 001 completed in 263.91 s
  forward_one_play:   47.93 s
  build_xy_true:      0.03 s
  loss compute:        1.04 s
  backward+optim:      68.69 s
  batch loop total:    117.97 s

Epoch 002 | N_max=10 | LR=0.000067 | TrainLoss=0.2483
Epoch 002 completed in 263.84 s
  forward_one_play:   47.71 s
  build_xy_true:      0.03 s
  loss compute:        1.04 s
  backward+optim:      69.14 s
  batch loop total:    118.21 s

Epoch 003 | N_max=10 | LR=0.000100 | TrainLoss=0.1212
Epoch 003 completed in 259.77 s
  forward_one_play:   47.16 s
  build_xy_true:      0.03 s
  loss compute:        1.04 s
  backward+optim:      68.21 s
  batch loop total:    116.72 s

Epoch 004 | N_max=30 | LR=0.000100 | TrainLoss=0.1121
Epoch 004 completed in 257.34 s
  forward_one_play:   46.76 s
  build_xy_true:      0.03 s
  loss compute:        1.04 s
  backward+optim:      67.93 s
  batch loop total:    116.03 s

Epoch 005 | N_max=30 | LR=0.000099 | TrainLo

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipython-input-1028069646.py", line 310, in <cell line: 0>
    model, _ = train_loop_full_data(
               ^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-1028069646.py", line 263, in train_loop_full_data
    xy_pred, mask = model.forward_batch(batch, N_max_curr=N_curr)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-1028069646.py", line 190, in forward_batch
    h_nodes = self.encoder(x_hist)            # (P,D)
              ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _ca

TypeError: object of type 'NoneType' has no len()