In [1]:
# ======================================
# Step 0: Imports & Config
# ======================================
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import polars as pl
import matplotlib.pyplot as plt

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)


Device: cuda


In [2]:
# ======================================
# Step 1: Load input/output data
# ======================================

pd.set_option("display.max_columns", 50)
pd.set_option("display.width", 200)

import pandas as pd

# 先用 1~4 周，跑顺了再扩展
weeks = [f"{w:02d}" for w in range(1, 5)]   # ["01","02","03","04"]

input_list = []
output_list = []

for w in weeks:
    in_path  = f"/kaggle/input/nfl-big-data-bowl-2026-prediction/train/input_2023_w{w}.csv"
    out_path = f"/kaggle/input/nfl-big-data-bowl-2026-prediction/train/output_2023_w{w}.csv"
    
    print(f"Loading week {w} ...")
    input_w  = pd.read_csv(in_path)
    output_w = pd.read_csv(out_path)

    # 可选：加 week 标记，方便以后分析
    input_w["week"]  = int(w)
    output_w["week"] = int(w)
    
    input_list.append(input_w)
    output_list.append(output_w)

input_df  = pd.concat(input_list,  ignore_index=True)
output_df = pd.concat(output_list, ignore_index=True)

print("\n==== Step 1: 原始多周数据形状 ====")
print("input_df shape :", input_df.shape)
print("output_df shape:", output_df.shape)
print("weeks in input_df :", sorted(input_df['week'].unique().tolist()))
print("weeks in output_df:", sorted(output_df['week'].unique().tolist()))

print("\ninput_df columns:", list(input_df.columns))
print("output_df columns:", list(output_df.columns))

print("\ninput_df 示例行：")
print(input_df.head(3))

print("\noutput_df 示例行：")
print(output_df.head(3))

print("\nplay_direction 计数：")
print(input_df["play_direction"].value_counts(dropna=False))

Loading week 01 ...
Loading week 02 ...
Loading week 03 ...
Loading week 04 ...

==== Step 1: 原始多周数据形状 ====
input_df shape : (1144532, 24)
output_df shape: (130495, 7)
weeks in input_df : [1, 2, 3, 4]
weeks in output_df: [1, 2, 3, 4]

input_df columns: ['game_id', 'play_id', 'player_to_predict', 'nfl_id', 'frame_id', 'play_direction', 'absolute_yardline_number', 'player_name', 'player_height', 'player_weight', 'player_birth_date', 'player_position', 'player_side', 'player_role', 'x', 'y', 's', 'a', 'dir', 'o', 'num_frames_output', 'ball_land_x', 'ball_land_y', 'week']
output_df columns: ['game_id', 'play_id', 'nfl_id', 'frame_id', 'x', 'y', 'week']

input_df 示例行：
      game_id  play_id  player_to_predict  nfl_id  frame_id play_direction  absolute_yardline_number player_name player_height  player_weight player_birth_date player_position player_side  \
0  2023090700      101              False   54527         1          right                        42  Bryan Cook           6-1           

In [3]:
print("\n==== Step 2: 过滤 player_to_predict 后 ====")

# 只保留需要预测的球员
input_df = input_df[input_df["player_to_predict"] == True].copy()

print("input_df shape (filtered):", input_df.shape)
print("player_to_predict 统计:")
print(input_df["player_to_predict"].value_counts())

# 为了方便后面 join，确保类型一致
key_cols = ["game_id", "play_id", "nfl_id", "frame_id"]
output_df = output_df.copy()



==== Step 2: 过滤 player_to_predict 后 ====
input_df shape (filtered): (304184, 24)
player_to_predict 统计:
player_to_predict
True    304184
Name: count, dtype: int64


In [4]:
def mirror_xy(df, direction_col="play_direction"):
    df = df.copy()
    is_left = df[direction_col] == "left"

    # 1. x / y 一定要镜像（input / output 都有）
    df.loc[is_left, "x"] = 120.0 - df.loc[is_left, "x"]
    df.loc[is_left, "y"] = 53.3 - df.loc[is_left, "y"]

    # 2. 如果有 dir，再镜像 dir
    if "dir" in df.columns:
        df.loc[is_left, "dir"] = (180.0 - df.loc[is_left, "dir"]) % 360

    # 3. 如果有 o，再镜像 o
    if "o" in df.columns:
        df.loc[is_left, "o"] = (180.0 - df.loc[is_left, "o"]) % 360

    return df

In [5]:
# 3.1 为每个 play 建立 play_direction 映射
play_dir_map = (
    input_df[["game_id", "play_id", "play_direction"]]
    .drop_duplicates(subset=["game_id", "play_id"])
    .set_index(["game_id", "play_id"])["play_direction"]
)

print("\n==== Step 3: play_direction 映射检查 ====")
print("不同 play_direction 个数:", play_dir_map.nunique())
print(play_dir_map.value_counts())

# 3.2 左右归一化函数（你已经定义过的 mirror_xy，直接用）
input_df_norm = mirror_xy(input_df, "play_direction")

print("\n镜像前后对比 (input_df)：")
sample_keys = input_df_norm[["game_id", "play_id"]].drop_duplicates().head(1).values[0]
g, p = sample_keys
print(f"选取 game_id={g}, play_id={p} 的前 5 行对比：\n")

print("原始：")
print(
    input_df[(input_df.game_id==g) & (input_df.play_id==p)]
    [["frame_id","play_direction","x","y","dir","o"]].head()
)
print("\n归一化后：")
print(
    input_df_norm[(input_df_norm.game_id==g) & (input_df_norm.play_id==p)]
    [["frame_id","play_direction","x","y","dir","o"]].head()
)

# 3.3 给 output 补上 play_direction 并做同样的镜像
output_df_norm = output_df.merge(
    play_dir_map.rename("play_direction"),
    on=["game_id", "play_id"],
    how="left"
)

missing_dir = output_df_norm["play_direction"].isna().sum()
print("\noutput_df 中 play_direction 缺失个数:", missing_dir)

output_df_norm = mirror_xy(output_df_norm, "play_direction")

print("\n镜像后的 output_df_norm 示例:")
print(output_df_norm.head())



==== Step 3: play_direction 映射检查 ====
不同 play_direction 个数: 2
play_direction
right    1749
left     1603
Name: count, dtype: int64

镜像前后对比 (input_df)：
选取 game_id=2023090700, play_id=101 的前 5 行对比：

原始：
    frame_id play_direction      x      y     dir       o
26         1          right  51.32  20.69   79.43  267.68
27         2          right  51.35  20.66  118.07  268.66
28         3          right  51.39  20.63  130.89  269.78
29         4          right  51.43  20.61  134.50  269.78
30         5          right  51.48  20.58  129.79  269.06

归一化后：
    frame_id play_direction      x      y     dir       o
26         1          right  51.32  20.69   79.43  267.68
27         2          right  51.35  20.66  118.07  268.66
28         3          right  51.39  20.63  130.89  269.78
29         4          right  51.43  20.61  134.50  269.78
30         5          right  51.48  20.58  129.79  269.06

output_df 中 play_direction 缺失个数: 0

镜像后的 output_df_norm 示例:
      game_id  play_id  nfl_id  fr

In [6]:
feature_cols = ["x", "y", "s", "a", "dir", "o", "ball_land_x", "ball_land_y"]
print("\nfeature_cols:", feature_cols)

print("\ninput_df_norm 中这些列的前几行：")
print(input_df_norm[["game_id", "play_id", "nfl_id", "frame_id"] + feature_cols].head())

# 检查 NaN / 范围
print("\ninput_df_norm 是否有 NaN:")
print(input_df_norm[["x", "y"]].isna().sum())

print("\noutput_df_norm 是否有 NaN:")
print(output_df_norm[["x", "y"]].isna().sum())

print("\ninput x/y 范围:",
      input_df_norm["x"].min(), input_df_norm["x"].max(),
      input_df_norm["y"].min(), input_df_norm["y"].max())

print("output x/y 范围:",
      output_df_norm["x"].min(), output_df_norm["x"].max(),
      output_df_norm["y"].min(), output_df_norm["y"].max())



feature_cols: ['x', 'y', 's', 'a', 'dir', 'o', 'ball_land_x', 'ball_land_y']

input_df_norm 中这些列的前几行：
       game_id  play_id  nfl_id  frame_id      x      y     s     a     dir       o  ball_land_x  ball_land_y
26  2023090700      101   46137         1  51.32  20.69  0.31  0.49   79.43  267.68    63.259998        -0.22
27  2023090700      101   46137         2  51.35  20.66  0.36  0.74  118.07  268.66    63.259998        -0.22
28  2023090700      101   46137         3  51.39  20.63  0.44  0.76  130.89  269.78    63.259998        -0.22
29  2023090700      101   46137         4  51.43  20.61  0.48  0.62  134.50  269.78    63.259998        -0.22
30  2023090700      101   46137         5  51.48  20.58  0.54  0.44  129.79  269.06    63.259998        -0.22

input_df_norm 是否有 NaN:
x    0
y    0
dtype: int64

output_df_norm 是否有 NaN:
x    0
y    0
dtype: int64

input x/y 范围: 6.45 119.27 0.6899999999999977 52.58
output x/y 范围: 11.89 120.83 0.3200000000000003 53.72


In [7]:
import numpy as np

def build_sequences_with_mask(input_df, output_df, feature_cols, T_in=32, T_out=21):

    print("\n==== Step 4: build_sequences (with mask) ====")

    X_list, Y_list, M_list = [], [], []
    keys = []

    grouped_in  = input_df.groupby(["game_id", "play_id", "nfl_id"])
    grouped_out = output_df.groupby(["game_id", "play_id", "nfl_id"])

    len_out_counter = {}
    dropped_no_output = 0

#    total_keys = 0

    for key, g_in in grouped_in:
        if key not in grouped_out.groups:
            continue
        g_out = grouped_out.get_group(key)
        
#        total_keys += 1

        # 按 frame_id 排序，保证时间顺序
        g_in = g_in.sort_values("frame_id")
        g_out = g_out.sort_values("frame_id")

        # --- 构造 X 序列 ---
        x_feat = g_in[feature_cols].to_numpy(dtype="float32")  # [L_in, F]
        L_in, F = x_feat.shape

        if L_in >= T_in:
            X_seq = x_feat[-T_in:]  # 取最后 T_in 帧
        else:
            pad_len = T_in - L_in
            pad = np.zeros((pad_len, F), dtype="float32")
            X_seq = np.concatenate([pad, x_feat], axis=0)

        # --- 构造 Y 序列 & mask ---
        out_xy = g_out[["x", "y"]].to_numpy(dtype="float32")   # [L_out, 2]
        L_out = out_xy.shape[0]

        len_out_counter[L_out] = len_out_counter.get(L_out, 0) + 1

        # 没有任何输出，跳过
        if L_out == 0:
            dropped_no_output += 1
            continue

        if L_out >= T_out:
            Y_seq = out_xy[:T_out]
            mask = np.ones(T_out, dtype="float32")
        else:
            # 用最后一帧的位置重复填充
            last_xy = out_xy[-1]
            pad_len = T_out - L_out
            pad_xy = np.repeat(last_xy[None, :], pad_len, axis=0)  # [pad_len, 2]
            Y_seq = np.concatenate([out_xy, pad_xy], axis=0)

            mask = np.concatenate([
                np.ones(L_out, dtype="float32"),
                np.zeros(pad_len, dtype="float32"),
            ], axis=0)  # [T_out]

        X_list.append(X_seq)
        Y_list.append(Y_seq)
        M_list.append(mask)
        keys.append(key)

    # 堆叠成 ndarray
    X_all = np.stack(X_list, axis=0)
    Y_all = np.stack(Y_list, axis=0)
    M_all = np.stack(M_list, axis=0)

    print("共有样本 key 数:", len(keys))
    print("build_sequences 完成:")
    print("X_all shape:", X_all.shape)  # 期望 [N, T_in, F]
    print("Y_all shape:", Y_all.shape)  # 期望 [N, T_out, 2]
    print("M_all shape:", M_all.shape)  # 期望 [N, T_out]")

    print("\n被丢弃的样本（L_out == 0）数量:", dropped_no_output)

    print("\n输出长度分布 (原始 L_out):")
    for L_out in sorted(len_out_counter.keys()):
        print(f"  len_out = {L_out}: {len_out_counter[L_out]} 条")

    # 打印一个样本看一下
    if X_all.shape[0] > 0:
        print("\n示例样本 0:")
        print("X_all[0] 前 3 帧:\n", X_all[0, :3])
        print("Y_all[0] 前 3 帧:\n", Y_all[0, :3])
        print("M_all[0]:\n", M_all[0])

    return X_all, Y_all, M_all, keys


In [8]:
T_in  = 32
T_out = 21

X_all, Y_all, M_all, keys = build_sequences_with_mask(
    input_df_norm,
    output_df_norm,
    feature_cols,
    T_in=T_in,
    T_out=T_out,
)
print("\n==== Step 4: build_sequences ====")
print("共有样本 key 数:", len(X_all))
print("X_all shape:", X_all.shape)  # 期望 (N, 32, F)
print("Y_all shape:", Y_all.shape)  # 期望 (N, 21, 2)
print("M_all Shape:",  M_all.shape)


==== Step 4: build_sequences (with mask) ====
共有样本 key 数: 10913
build_sequences 完成:
X_all shape: (10913, 32, 8)
Y_all shape: (10913, 21, 2)
M_all shape: (10913, 21)

被丢弃的样本（L_out == 0）数量: 0

输出长度分布 (原始 L_out):
  len_out = 5: 61 条
  len_out = 6: 391 条
  len_out = 7: 1152 条
  len_out = 8: 1450 条
  len_out = 9: 1389 条
  len_out = 10: 1319 条
  len_out = 11: 906 条
  len_out = 12: 815 条
  len_out = 13: 586 条
  len_out = 14: 464 条
  len_out = 15: 357 条
  len_out = 16: 284 条
  len_out = 17: 249 条
  len_out = 18: 204 条
  len_out = 19: 197 条
  len_out = 20: 143 条
  len_out = 21: 102 条
  len_out = 22: 125 条
  len_out = 23: 166 条
  len_out = 24: 110 条
  len_out = 25: 68 条
  len_out = 26: 111 条
  len_out = 27: 67 条
  len_out = 28: 58 条
  len_out = 29: 39 条
  len_out = 30: 38 条
  len_out = 31: 11 条
  len_out = 32: 10 条
  len_out = 33: 3 条
  len_out = 34: 19 条
  len_out = 36: 4 条
  len_out = 40: 7 条
  len_out = 94: 8 条

示例样本 0:
X_all[0] 前 3 帧:
 [[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 

In [9]:
import numpy as np

N = len(keys)
keys = np.array(keys)

# 随机打乱索引
rng = np.random.default_rng(42)
perm = rng.permutation(N)

split = int(N * 0.8)   # 80% 训练，20% 验证
train_idx = perm[:split]
val_idx   = perm[split:]

X_train, Y_train, M_train = X_all[train_idx], Y_all[train_idx], M_all[train_idx]
X_val,   Y_val,   M_val   = X_all[val_idx],   Y_all[val_idx],   M_all[val_idx]

print("Train size:", X_train.shape[0])
print("Val   size:", X_val.shape[0])


Train size: 8730
Val   size: 2183


In [10]:
class Seq2SeqDataset(torch.utils.data.Dataset):
    def __init__(self, X, Y, M):
        self.X = X
        self.Y = Y
        self.M = M

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return (
            torch.from_numpy(self.X[idx]).float(),
            torch.from_numpy(self.Y[idx]).float(),
            torch.from_numpy(self.M[idx]).float(),
        )

def build_loader(X, Y, M, batch_size=64, shuffle=True):
    dataset = Seq2SeqDataset(X, Y, M)
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=False,
    )

    # 检查一批
    for i, (xb, yb, mb) in enumerate(loader):
        print("First batch idx:", i)
        print(" xb shape:", xb.shape)  # [B, 32, F]
        print(" yb shape:", yb.shape)  # [B, 21, 2]
        print(" mb shape:", mb.shape)  # [B, 21]
        break

    return loader



In [11]:
train_loader = build_loader(X_train, Y_train, M_train, batch_size=64, shuffle=True)
val_loader   = build_loader(X_val,   Y_val,   M_val,   batch_size=64, shuffle=False)

First batch idx: 0
 xb shape: torch.Size([64, 32, 8])
 yb shape: torch.Size([64, 21, 2])
 mb shape: torch.Size([64, 21])
First batch idx: 0
 xb shape: torch.Size([64, 32, 8])
 yb shape: torch.Size([64, 21, 2])
 mb shape: torch.Size([64, 21])


In [12]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

# =========================
# Positional Encoding
# =========================
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, max_len: int = 64):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float32)
            * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        # [1, max_len, d_model]
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: [B, T, d_model]
        T = x.size(1)
        return x + self.pe[:, :T, :]
        

# =========================
# 自回归 Transformer 模型
# 输入: [B, T_in, F]
# 输出: [B, T_out, 2]  (未来每一帧的 x,y)
# =========================
class AutoRegressiveTransformer(nn.Module):
    def __init__(
        self,
        input_dim: int,
        d_model: int = 128,
        nhead: int = 4,
        num_layers: int = 3,
        T_in: int = 32,
        T_out: int = 21,
    ):
        super().__init__()
        self.T_out = T_out

        # 编码器：把输入特征映射到 d_model
        self.input_fc = nn.Linear(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len=T_in + T_out)

        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=4 * d_model,
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=num_layers)

        # 解码器：一步步用“上一时刻位置”生成下一时刻位置
        dec_layer = nn.TransformerDecoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=4 * d_model,
            batch_first=True,
        )
        self.decoder = nn.TransformerDecoder(dec_layer, num_layers=num_layers)

        # 把上一步的 (x,y) 位置映射到 d_model 作为 decoder 输入
        self.query_fc = nn.Linear(2, d_model)

        # 输出层：d_model -> (x,y)
        self.out_fc = nn.Linear(d_model, 2)

    def _generate_square_subsequent_mask(self, size: int, device):
        # 上三角为 -inf 的注意力 mask，保证自回归
        mask = torch.triu(
            torch.full((size, size), float("-inf"), device=device), diagonal=1
        )
        return mask

    def forward(
        self,
        src: torch.Tensor,           # [B, T_in, F]
        target: torch.Tensor = None, # [B, T_out, 2]  (训练时可选)
        teacher_forcing_ratio: float = 0.0,
    ) -> torch.Tensor:
        B, T_in, _ = src.shape
        device = src.device

        # ===== 编码器 =====
        src_embed = self.input_fc(src)          # [B, T_in, d_model]
        src_embed = self.pos_encoder(src_embed)
        memory = self.encoder(src_embed)        # [B, T_in, d_model]

        # 最后一帧已观测位置 (注意：feature_cols 的前两列是 x,y)
        last_pos = src[:, -1, :2]               # [B, 2]

        outputs = []
        dec_inputs = []

        for t in range(self.T_out):
            # 决定当前步使用什么“上一位置”作为输入
            if t == 0:
                prev_pos = last_pos             # 第一步：用观测的最后一帧位置
            else:
                use_teacher = (
                    (target is not None)
                    and (teacher_forcing_ratio > 0)
                    and (torch.rand(1).item() < teacher_forcing_ratio)
                )
                if use_teacher:
                    # Teacher Forcing：用真值的上一步位置
                    prev_pos = target[:, t - 1, :]   # [B, 2]
                else:
                    # 自回归：用模型预测的上一帧
                    prev_pos = outputs[-1]          # [B, 2]

            dec_inputs.append(prev_pos.unsqueeze(1))   # [B, 1, 2]
            dec_seq = torch.cat(dec_inputs, dim=1)     # [B, t+1, 2]

            dec_embed = self.query_fc(dec_seq)         # [B, t+1, d_model]
            dec_embed = self.pos_encoder(dec_embed)

            tgt_mask = self._generate_square_subsequent_mask(
                dec_embed.size(1), device
            )

            dec_out = self.decoder(
                dec_embed, memory, tgt_mask=tgt_mask
            )                                          # [B, t+1, d_model]

            step_out = self.out_fc(dec_out[:, -1, :])  # 只取最后一帧 [B, 2]
            outputs.append(step_out)

        outputs = torch.stack(outputs, dim=1)          # [B, T_out, 2]
        return outputs


In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

input_dim = X_all.shape[-1]      # 8
d_model   = 128

model = AutoRegressiveTransformer(
    input_dim=input_dim,
    d_model=d_model,
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)


Using device: cuda


In [14]:
def train_one_epoch(model, loader, optimizer, clip_grad=1.0):
    model.train()
    total_loss = 0.0
    total_count = 0

    for Xb, Yb, Mb in loader:
        Xb = Xb.to(device)          # [B, 32, F]
        Yb = Yb.to(device)          # [B, 21, 2]
        Mb = Mb.to(device)          # [B, 21]

        optimizer.zero_grad()

        # 最后一帧观测位置
        last_pos = Xb[:, -1, :2]    # [B, 2]

        # 预测相对位移
        preds_rel = model(Xb)       # [B, 21, 2]
        preds_abs = preds_rel + last_pos.unsqueeze(1)

        # 只在有效帧上算 MSE
        diff = preds_abs - Yb                    # [B, 21, 2]
        diff = diff * Mb.unsqueeze(-1)           # mask 掉 padding
        se = (diff ** 2).sum()                   # 总平方误差
        cnt = Mb.sum() * 2.0                     # 有效坐标个数 (x,y)

        loss = se / cnt
        loss.backward()

        if clip_grad is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

        optimizer.step()

        total_loss += loss.item()
        total_count += 1

    return total_loss / max(total_count, 1)


In [15]:
import math

def compute_rmse_on_loader(model, loader):
    model.eval()
    se_sum = 0.0
    n = 0.0

    with torch.no_grad():
        for Xb, Yb, Mb in loader:
            Xb = Xb.to(device)
            Yb = Yb.to(device)
            Mb = Mb.to(device)

            last_pos = Xb[:, -1, :2]
            preds_rel = model(Xb)
            preds_abs = preds_rel + last_pos.unsqueeze(1)

            diff = preds_abs - Yb           # [B, 21, 2]
            diff = diff * Mb.unsqueeze(-1)  # 只算有效帧

            dist2 = diff[..., 0]**2 + diff[..., 1]**2   # [B, 21]
            se_sum += dist2.sum().item()
            n += Mb.sum().item()

    rmse = math.sqrt(se_sum / max(n, 1.0))
    print(f"RMSE on this loader (yards): {rmse:.4f}")
    model.train()
    return rmse


In [16]:
import copy
import os

def train_model(model, train_loader, val_loader, epochs=20, clip_grad=1.0):
    best_rmse = float("inf")
    best_state = None

    for epoch in range(1, epochs + 1):
        train_loss = train_one_epoch(model, train_loader, optimizer, clip_grad)

        print(f"Epoch {epoch}/{epochs} | Train Loss = {train_loss:.4f}")

        val_rmse = compute_rmse_on_loader(model, val_loader)
        print(f"           Val RMSE   = {val_rmse:.4f}")

        if val_rmse < best_rmse:
            best_rmse = val_rmse
            best_state = copy.deepcopy(model.state_dict())
            torch.save(best_state, "/kaggle/working/model_autoreg_best.pt")
            print(f"  ✅ New best model saved. RMSE = {best_rmse:.4f}")

    # 训练结束后，加载最佳权重
    if best_state is not None:
        model.load_state_dict(best_state)
        print(f"\nLoaded best model with RMSE = {best_rmse:.4f}")

    return model

# 真正开始训练
model = train_model(model, train_loader, val_loader, epochs=20, clip_grad=1.0)


Epoch 1/20 | Train Loss = 8.5227
RMSE on this loader (yards): 3.7777
           Val RMSE   = 3.7777
  ✅ New best model saved. RMSE = 3.7777
Epoch 2/20 | Train Loss = 6.8707
RMSE on this loader (yards): 3.4370
           Val RMSE   = 3.4370
  ✅ New best model saved. RMSE = 3.4370
Epoch 3/20 | Train Loss = 6.0780
RMSE on this loader (yards): 3.3318
           Val RMSE   = 3.3318
  ✅ New best model saved. RMSE = 3.3318
Epoch 4/20 | Train Loss = 5.7063
RMSE on this loader (yards): 3.2195
           Val RMSE   = 3.2195
  ✅ New best model saved. RMSE = 3.2195
Epoch 5/20 | Train Loss = 5.3030
RMSE on this loader (yards): 3.0686
           Val RMSE   = 3.0686
  ✅ New best model saved. RMSE = 3.0686
Epoch 6/20 | Train Loss = 4.8507
RMSE on this loader (yards): 3.1004
           Val RMSE   = 3.1004
Epoch 7/20 | Train Loss = 4.6387
RMSE on this loader (yards): 3.1115
           Val RMSE   = 3.1115
Epoch 8/20 | Train Loss = 4.4103
RMSE on this loader (yards): 2.8355
           Val RMSE   = 2.8355


In [17]:
########
## For Prediction
########

import numpy as np
import pandas as pd
import polars as pl
import torch
from torch.utils.data import Dataset, DataLoader

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

T_IN  = 32
T_OUT = 21

FEATURE_COLS = ["x", "y", "s", "a", "dir", "o", "ball_land_x", "ball_land_y"]
MODEL_PATH   = "/kaggle/working/model_autoreg_best.pt"   # 和训练时保持一致


def mirror_xy(df: pd.DataFrame, direction_col: str = "play_direction") -> pd.DataFrame:
    """把所有 left 进攻的路线镜像成 right（和训练时完全一致）"""
    df = df.copy()
    is_left = df[direction_col] == "left"

    df.loc[is_left, "x"]   = 120.0 - df.loc[is_left, "x"]
    df.loc[is_left, "y"]   = 53.3  - df.loc[is_left, "y"]
    df.loc[is_left, "dir"] = (180.0 - df.loc[is_left, "dir"]) % 360.0
    df.loc[is_left, "o"]   = (180.0 - df.loc[is_left, "o"])   % 360.0
    return df


def unmirror_xy_xy_array(xy: np.ndarray, is_left: bool) -> np.ndarray:
    """
    把预测结果从“全是向右进攻的坐标系”翻回原始坐标系。
    xy: [T, 2]
    """
    if not is_left:
        return xy

    out = xy.copy()
    out[:, 0] = 120.0 - out[:, 0]
    out[:, 1] = 53.3  - out[:, 1]
    return out


def build_test_sequences(input_df: pd.DataFrame):
    """
    根据 test_input 构造模型需要的输入序列。
    返回：
        X_all:  np.ndarray [N, T_IN, F]
        keys:  list[(game_id, play_id, nfl_id)]
        is_left: np.ndarray [N] 对应这个 play 是否原本向左
    """

    # 只保留要预测的球员
    df = input_df[input_df["player_to_predict"] == True].copy()

    # 记录每个 (game_id, play_id) 的原始进攻方向（left/right）
    play_dir_map = (
        df[["game_id", "play_id", "play_direction"]]
        .drop_duplicates(subset=["game_id", "play_id"])
        .set_index(["game_id", "play_id"])["play_direction"]
    )

    # 做镜像（训练里对 input 做过）
    df_norm = mirror_xy(df, "play_direction")

    X_list   = []
    keys     = []
    is_lefts = []

    for (g, p, n), group in df_norm.groupby(["game_id", "play_id", "nfl_id"]):
        group = group.sort_values("frame_id")
        hist = group[FEATURE_COLS].to_numpy(dtype=np.float32)
        L, F = hist.shape

        # 如果帧数不足 T_IN，前面补零；否则取最后 T_IN 帧
        if L >= T_IN:
            hist = hist[-T_IN:]
        else:
            pad = np.zeros((T_IN - L, F), dtype=np.float32)
            hist = np.concatenate([pad, hist], axis=0)

        X_list.append(hist)
        keys.append((g, p, n))
        is_lefts.append(play_dir_map.loc[(g, p)] == "left")

    X_all   = np.stack(X_list, axis=0)               # [N, T_IN, F]
    is_left = np.array(is_lefts, dtype=bool)         # [N]

    return X_all, keys, is_left


class InferenceDataset(Dataset):
    """只用 X 的简单 Dataset，用在 DataLoader 里做 batch 推理。"""
    def __init__(self, X_all: np.ndarray):
        self.X = torch.from_numpy(X_all).float()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx]



In [18]:
import numpy as np
import pandas as pd

# 和训练时保持一致
T_in = 32
T_out = 21
feature_cols = ['x', 'y', 's', 'a', 'dir', 'o', 'ball_land_x', 'ball_land_y']

def build_test_sequences(input_df_norm, feature_cols, T_in=32, T_out=21):
    """
    只根据归一化后的 input_df_norm 构造:
    - X_all: [N, T_in, F]
    - last_pos: [N, 2] 最后一个观测点（归一化后坐标）
    - lens_out: [N] 每个样本需要预测多少帧 (= num_frames_output)
    - keys: [(game_id, play_id, nfl_id), ...]
    - play_dirs: 每个样本的原始 play_direction（right/left）
    """
    X_list = []
    last_pos_list = []
    len_out_list = []
    key_list = []
    play_dirs = []

    grp = input_df_norm.groupby(["game_id", "play_id", "nfl_id"])
    for (g, p, n), df in grp:
        df = df.sort_values("frame_id")

        feats = df[feature_cols].to_numpy(dtype=np.float32)   # [L_in, F]
        L_in = feats.shape[0]
        F = feats.shape[1]

        # 左侧用 0 填充，保证长度 = T_in
        x_seq = np.zeros((T_in, F), dtype=np.float32)
        if L_in >= T_in:
            x_seq[:] = feats[-T_in:]
        else:
            x_seq[-L_in:, :] = feats

        X_list.append(x_seq)
        last_pos_list.append(feats[-1, :2])  # 最后一个观测点 (x,y)
        len_out_list.append(int(df["num_frames_output"].iloc[0]))
        key_list.append((g, p, n))
        play_dirs.append(df["play_direction"].iloc[0])

    X_all = np.stack(X_list)                   # [N, T_in, F]
    last_pos = np.stack(last_pos_list)         # [N, 2]
    lens_out = np.asarray(len_out_list, int)   # [N]

    return X_all, last_pos, lens_out, key_list, play_dirs


In [19]:
import numpy as np
import pandas as pd
import torch
import polars as pl

# 1. 方向特征（跟训练时一样）
def add_direction_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    rad_dir = np.deg2rad(df["dir"].astype(float))
    rad_o   = np.deg2rad(df["o"].astype(float))
    df["dir_sin"] = np.sin(rad_dir)
    df["dir_cos"] = np.cos(rad_dir)
    df["o_sin"]   = np.sin(rad_o)
    df["o_cos"]   = np.cos(rad_o)
    return df

# 2. 左右归一化（跟训练时的 mirror_xy / normalize_direction 一致）
def normalize_direction(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    is_left = df["play_direction"] == "left"

    df.loc[is_left, "x"] = 120.0 - df.loc[is_left, "x"]
    df.loc[is_left, "y"] =  53.3 - df.loc[is_left, "y"]

    df.loc[is_left, "dir"] = (180.0 - df.loc[is_left, "dir"]) % 360
    df.loc[is_left, "o"]   = (180.0 - df.loc[is_left, "o"]) % 360

    # 注意：这里我们不改 play_direction，让它保持原始值，
    # 方便后面还原坐标时根据 "left"/"right" 判断
    return df

# 3. 把「统一向右」的预测坐标还原回原始方向
def denormalize_xy(xy: np.ndarray, play_direction: str) -> np.ndarray:
    """
    xy: [T, 2]  在统一向右坐标系下的预测
    play_direction: 'left' 或 'right'（原始的）
    """
    if play_direction == "left":
        xy = xy.copy()
        xy[:, 0] = 120.0 - xy[:, 0]
        xy[:, 1] =  53.3 - xy[:, 1]
    return xy


In [20]:
import torch
import polars as pl
import pandas as pd
import numpy as np
import os

BEST_MODEL_PATH = "/kaggle/working/model_autoreg_best.pt"

def load_trained_model(device):
    # 和训练时完全一样的超参数
    T_in  = 32
    T_out = 21
    feature_cols = ["x", "y", "s", "a", "dir", "o", "ball_land_x", "ball_land_y"]
    input_dim = len(feature_cols)
    d_model = 128
    nhead = 8
    num_layers = 3   # ★ 必须和训练时一致！

    model = AutoRegressiveTransformer(
        input_dim=input_dim,
        d_model=d_model,
        nhead=nhead,
        num_layers=num_layers,
        T_in=T_in,
        T_out=T_out,
    ).to(device)

    state = torch.load(BEST_MODEL_PATH, map_location=device)
    # 如果你保存的是 {"model": ..., "rmse": ...}
    state_dict = state["model"] if "model" in state else state
    model.load_state_dict(state_dict)
    model.eval()
    return model

# 这些超参数要跟训练时一致
T_in  = 32
T_out = 21
feature_cols = ["x", "y", "s", "a", "dir", "o", "ball_land_x", "ball_land_y"]


def predict(test: pl.DataFrame, test_input: pl.DataFrame) -> pl.DataFrame:
    """
    Kaggle 评测时会调用这个函数。
    参数:
      test:       pl.DataFrame, 只包含 game_id, play_id, nfl_id, frame_id
      test_input: pl.DataFrame, 传球前的 tracking (结构类似 train 的 input_2023_w01~18)

    返回:
      pl.DataFrame, 必须跟 test 一样行数，包含列 ['game_id','play_id','nfl_id','frame_id','x','y']
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = load_trained_model(device)
    model.eval()

    # 转成 pandas 方便处理
    test_df = test.to_pandas()
    inp_raw = test_input.to_pandas()

    # 记录每个 play 的原始方向，用来最后还原坐标
    play_dir_map = (
        inp_raw[["game_id", "play_id", "play_direction"]]
        .drop_duplicates()
        .set_index(["game_id", "play_id"])["play_direction"]
    )

    # 只保留需要预测的那些球员的历史
    key_cols = ["game_id", "play_id", "nfl_id"]
    inp = inp_raw.merge(
        test_df[key_cols].drop_duplicates(),
        on=key_cols,
        how="inner",
    )

    # 预处理（和训练完全一样）
    inp = add_direction_features(inp)
    inp = normalize_direction(inp)

    preds_x = np.zeros(len(test_df), dtype=np.float32)
    preds_y = np.zeros(len(test_df), dtype=np.float32)

    # 按 (game_id, play_id, nfl_id) 分组做预测
    for (g, p, nid), grp in test_df.groupby(["game_id", "play_id", "nfl_id"]):
        frames = grp["frame_id"].values  # 1-based

        hist = inp[(inp.game_id == g) & (inp.play_id == p) & (inp.nfl_id == nid)].copy()
        if hist.empty:
            # 极端情况：没找到历史，就用 0 填（基本不会发生）
            px = np.zeros_like(frames, dtype=np.float32)
            py = np.zeros_like(frames, dtype=np.float32)
        else:
            hist = hist.sort_values("frame_id")
            feats = hist[feature_cols].to_numpy(dtype=np.float32)

            # 取最后 T_in 帧，不够的话前面补 0
            if len(feats) >= T_in:
                seq_in = feats[-T_in:]
            else:
                pad = np.zeros((T_in - len(feats), feats.shape[1]), dtype=np.float32)
                seq_in = np.concatenate([pad, feats], axis=0)

            last_pos = seq_in[-1, :2].copy()  # [x, y] 最后一帧观测位置

            xb = torch.from_numpy(seq_in).unsqueeze(0).to(device)  # [1, T_in, F]
            with torch.no_grad():
                # forward 默认 teacher_forcing_ratio=0，自回归预测相对位移
                rel = model(xb)[0].cpu().numpy()  # [T_out, 2]

            abs_xy = rel + last_pos[None, :]  # [T_out, 2] 统一向右坐标系

            # 还原回原始 play 方向
            play_dir = play_dir_map.get((g, p), "right")
            abs_xy = denormalize_xy(abs_xy, play_dir)

            # 把每个 frame_id 对应到预测序列上；>T_out 的用最后一帧顶上
            px = np.empty_like(frames, dtype=np.float32)
            py = np.empty_like(frames, dtype=np.float32)
            for i, fid in enumerate(frames):
                idx = int(fid) - 1
                if idx < 0:
                    idx = 0
                if idx >= T_out:
                    idx = T_out - 1
                px[i] = abs_xy[idx, 0]
                py[i] = abs_xy[idx, 1]

        preds_x[grp.index.values] = px
        preds_y[grp.index.values] = py

    out_df = test_df.copy()
    out_df["x"] = preds_x
    out_df["y"] = preds_y

    # 返回 pl.DataFrame，列顺序无所谓，但通常保持一致
    return pl.from_pandas(out_df[["game_id", "play_id", "nfl_id", "frame_id", "x", "y"]])


In [21]:
import polars as pl

test_pl = pl.read_csv("/kaggle/input/nfl-big-data-bowl-2026-prediction/test.csv")
test_input_pl = pl.read_csv("/kaggle/input/nfl-big-data-bowl-2026-prediction/test_input.csv")

pred_pl = predict(test_pl, test_input_pl)

print(pred_pl.shape)          # 应该是 (len(test_pl), 6) 或至少 (len(test_pl), 2)
print(pred_pl.head())


(5837, 6)
shape: (5, 6)
┌────────────┬─────────┬────────┬──────────┬───────────┬───────────┐
│ game_id    ┆ play_id ┆ nfl_id ┆ frame_id ┆ x         ┆ y         │
│ ---        ┆ ---     ┆ ---    ┆ ---      ┆ ---       ┆ ---       │
│ i64        ┆ i64     ┆ i64    ┆ i64      ┆ f32       ┆ f32       │
╞════════════╪═════════╪════════╪══════════╪═══════════╪═══════════╡
│ 2024120805 ┆ 74      ┆ 54586  ┆ 1        ┆ 87.801926 ┆ 34.302551 │
│ 2024120805 ┆ 74      ┆ 54586  ┆ 2        ┆ 87.701584 ┆ 34.385452 │
│ 2024120805 ┆ 74      ┆ 54586  ┆ 3        ┆ 87.564453 ┆ 34.441353 │
│ 2024120805 ┆ 74      ┆ 54586  ┆ 4        ┆ 87.42749  ┆ 34.489895 │
│ 2024120805 ┆ 74      ┆ 54586  ┆ 5        ┆ 87.313553 ┆ 34.569389 │
└────────────┴─────────┴────────┴──────────┴───────────┴───────────┘


In [22]:
# 确保列顺序跟 test.csv 一样
pred_pl = pred_pl.select(["game_id", "play_id", "nfl_id", "frame_id", "x", "y"])

# Kaggle 要求的名字和格式：submission.parquet
out_path = "/kaggle/working/submission.parquet"
pred_pl.write_parquet(out_path)
print("Saved to:", out_path)
print(pred_pl.shape)
print(pred_pl.head())


# test_pl = pl.read_csv("/kaggle/input/nfl-big-data-bowl-2026-prediction/test.csv")

# print("test rows:", test_pl.shape[0])
# print("pred rows:", pred_pl.shape[0])

# print("test columns:", test_pl.columns)
# print("pred columns:", pred_pl.columns)


Saved to: /kaggle/working/submission.parquet
(5837, 6)
shape: (5, 6)
┌────────────┬─────────┬────────┬──────────┬───────────┬───────────┐
│ game_id    ┆ play_id ┆ nfl_id ┆ frame_id ┆ x         ┆ y         │
│ ---        ┆ ---     ┆ ---    ┆ ---      ┆ ---       ┆ ---       │
│ i64        ┆ i64     ┆ i64    ┆ i64      ┆ f32       ┆ f32       │
╞════════════╪═════════╪════════╪══════════╪═══════════╪═══════════╡
│ 2024120805 ┆ 74      ┆ 54586  ┆ 1        ┆ 87.801926 ┆ 34.302551 │
│ 2024120805 ┆ 74      ┆ 54586  ┆ 2        ┆ 87.701584 ┆ 34.385452 │
│ 2024120805 ┆ 74      ┆ 54586  ┆ 3        ┆ 87.564453 ┆ 34.441353 │
│ 2024120805 ┆ 74      ┆ 54586  ┆ 4        ┆ 87.42749  ┆ 34.489895 │
│ 2024120805 ┆ 74      ┆ 54586  ┆ 5        ┆ 87.313553 ┆ 34.569389 │
└────────────┴─────────┴────────┴──────────┴───────────┴───────────┘
