In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Celda 0 — Imports y configuración inicial - Verificación de archivos existentes

In [None]:
import os

# Show what is available under the Kaggle input root and inside the competition folder.
input_root = "/kaggle/input"
competition_dir = "/kaggle/input/nfl-big-data-bowl-2026-prediction"

print("Contenido de /kaggle/input:")
print(os.listdir(input_root))

print("\nContenido de /kaggle/input/nfl-big-data-bowl-2026-prediction:")
print(os.listdir(competition_dir))

# List the contents of every sub-directory of the competition folder.
print("\nContenido de subcarpetas:")
for entry in os.listdir(competition_dir):
    entry_path = competition_dir + "/" + entry
    if not os.path.isdir(entry_path):
        continue
    print(f"\nCarpeta: {entry}")
    print(os.listdir(entry_path))

# Celda 01 - Lectura segura y emparejamiento de semanas

✔ Garantiza que emparejamos correctamente.

✔ No mezcla semanas incorrectamente.

✔ Prepara el camino para unir input+output.

In [None]:
import os
import pandas as pd
import re

# Directory that holds the weekly training CSV files.
DATA_DIR = "/kaggle/input/nfl-big-data-bowl-2026-prediction/train"

# Split the directory listing into input-side and output-side files by file-name prefix.
files = os.listdir(DATA_DIR)
input_files = sorted(filter(lambda name: name.startswith("input"), files))
output_files = sorted(filter(lambda name: name.startswith("output"), files))

print("INPUT FILES:", len(input_files))
print("OUTPUT FILES:", len(output_files))

# Extract the week number from a file name
def get_week(fname):
    """Return the week number encoded in a file name.

    The week is read from the first occurrence of the letter ``w`` followed
    by one or more digits (for example ``w07`` gives ``7``).

    Args:
        fname: file name to inspect.

    Returns:
        The week number as an ``int``.

    Raises:
        ValueError: if ``fname`` contains no ``w<digits>`` token. Without this
            check the failure would surface as an AttributeError on ``None``
            that does not name the offending file.
    """
    m = re.search(r"w(\d+)", fname)
    if m is None:
        raise ValueError(f"No week token 'w<digits>' found in file name: {fname!r}")
    return int(m.group(1))

# Index each side's files by week number (a later file with the same week replaces an earlier one).
inputs_by_week = dict(zip(map(get_week, input_files), input_files))
outputs_by_week = dict(zip(map(get_week, output_files), output_files))

# Keep only the weeks that have both an input file and an output file.
weeks = sorted(inputs_by_week.keys() & outputs_by_week.keys())
print("Semanas emparejadas:", weeks)


# CELDA 2 — Crear los archivos train_pp (input + output) por semana
Esta celda:

Lee el input y output de cada semana.

Los une por claves estándar del tracking:

game_id

play_id

nfl_id

frame_id

Guarda un archivo parquet por semana en /kaggle/working/train_pp.

Con esto creamos el dataset necesario para el modelo profundo.

In [None]:
# ===============================================
# CELDA 2: Crear train_pp por semana (input + output)
# ===============================================

import os
import pandas as pd
import gc

DATA_DIR = "/kaggle/input/nfl-big-data-bowl-2026-prediction/train"
TRAIN_PP = "/kaggle/working/train_pp"
os.makedirs(TRAIN_PP, exist_ok=True)

# Columns used to match one input row with one output row.
KEYS = ["game_id", "play_id", "nfl_id", "frame_id"]

print("Generando archivos train_pp...\n")

# For every paired week: read both CSVs, join them on KEYS and write one parquet file.
for week in weeks:
    input_file = inputs_by_week[week]
    output_file = outputs_by_week[week]

    print(f">> Semana {week:02d}")
    print("   Leyendo:", input_file)
    print("   Leyendo:", output_file)

    df_in = pd.read_csv(os.path.join(DATA_DIR, input_file))
    df_out = pd.read_csv(os.path.join(DATA_DIR, output_file))

    # Every key column must exist on both sides before merging.
    for col in KEYS:
        if col not in df_in.columns or col not in df_out.columns:
            raise ValueError(f"⚠ ERROR: La columna clave '{col}' no está en ambas tablas de la semana {week}.")

    # Inner join on KEYS. validate="one_to_one" makes pandas raise
    # pandas.errors.MergeError if KEYS are duplicated on either side, instead
    # of silently multiplying rows. Non-key columns present in both tables get
    # the "_in" / "_out" suffixes.
    df_merged = df_in.merge(
        df_out,
        on=KEYS,
        how="inner",
        suffixes=("_in", "_out"),
        validate="one_to_one",
    )

    print("   Dimensiones input :", df_in.shape)
    print("   Dimensiones output:", df_out.shape)
    print("   Dimensiones merged:", df_merged.shape)

    # An inner join drops unmatched rows without notice; flag the degenerate case.
    if df_merged.empty:
        print(f"   ⚠ Aviso: el merge de la semana {week} no produjo ninguna fila.")

    # Save this week's merged table.
    out_path = os.path.join(TRAIN_PP, f"train_w{week:02d}.parquet")
    df_merged.to_parquet(out_path, index=False)

    # Release the week's frames before loading the next one.
    del df_in, df_out, df_merged
    gc.collect()

print("\n✔ Todos los archivos train_pp creados correctamente.")


# CELDA 3 — Unir todas las semanas en un solo archivo

In [None]:
# ============================================
# CELDA 3: Unir semanas en train_full.parquet
# ============================================

import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import gc

TRAIN_PP = "/kaggle/working/train_pp"
train_full_path = "/kaggle/working/train_full.parquet"

files = sorted([f for f in os.listdir(TRAIN_PP) if f.endswith(".parquet")])
print("Archivos detectados:", len(files))

# The writer is created lazily so that it can take its schema from the first table.
writer = None

try:
    # Append the weekly files one at a time so only one week is in memory at once.
    for i, fname in enumerate(files):
        path = os.path.join(TRAIN_PP, fname)
        print(f">> Añadiendo {fname} ({i+1}/{len(files)})")

        df = pd.read_parquet(path)
        table = pa.Table.from_pandas(df, preserve_index=False)

        if writer is None:
            writer = pq.ParquetWriter(train_full_path, table.schema)

        writer.write_table(table)

        del df, table
        gc.collect()
finally:
    # Close the writer even if a read or write fails mid-loop, so the file
    # handle is not leaked.
    if writer is not None:
        writer.close()

# Only report success when something was actually written.
if writer is None:
    print("\n⚠ No se encontró ningún archivo .parquet en", TRAIN_PP, "- train_full.parquet no fue creado.")
else:
    print("\n✔ train_full.parquet creado correctamente.")


# CELDA 4 — EDA PROFUNDO (Exploratory Data Analysis)

Esta celda:

Permite describir las variables de entrada

Analiza el dataset de forma científica

Permite dar la justificación objetiva para usar modelos secuenciales (RNN/CNN/Transformers)

In [None]:
# ===============================================
# CELDA 4: ANÁLISIS EXPLORATORIO DEL DATASET
# ===============================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_parquet("/kaggle/working/train_full.parquet")

print("Shape final del dataset:", df.shape)
print("\nColumnas disponibles:")
print(list(df.columns))

# -----------------------------------------
# 1. Missing values
# -----------------------------------------
# Fraction of missing entries per column, largest first.
missing = df.isna().mean().sort_values(ascending=False)

fig_missing, ax_missing = plt.subplots(figsize=(8, 6))
missing.head(20).plot.barh(color="steelblue", ax=ax_missing)
ax_missing.set_title("Proporción de valores faltantes por columna (Top 20)")
ax_missing.set_xlabel("Proporción de NA")
ax_missing.set_ylabel("Variable")
plt.show()

print("\nTabla completa de valores faltantes:")
print(missing)

# -----------------------------------------
# 2. Descriptive statistics
# -----------------------------------------
print("\nEstadísticas descriptivas para columnas numéricas:")
print(df.describe())

# -----------------------------------------
# 3. Distribution of the movement variables
# -----------------------------------------
# (column, colour, title) for each panel, in row-major order of the 2x2 grid.
motion_panels = [
    ("s", "royalblue", "Distribución de velocidad (s)"),
    ("a", "firebrick", "Distribución de aceleración (a)"),
    ("o", "darkgreen", "Distribución de orientación (o)"),
    ("dir", "purple", "Distribución de dirección (dir)"),
]

fig, axes = plt.subplots(2, 2, figsize=(12, 8))
for panel_ax, (column, colour, panel_title) in zip(axes.ravel(), motion_panels):
    panel_ax.hist(df[column], bins=50, color=colour)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

# -----------------------------------------
# 4. Distribution of positions on the field
# -----------------------------------------
# NOTE(review): the density is estimated on every row of df; this may be slow
# on the full dataset — consider sampling if it becomes a bottleneck.
fig_field, ax_field = plt.subplots(figsize=(7, 6))
sns.kdeplot(data=df, x="x_in", y="y_in", fill=True, cmap="viridis", thresh=0.05, ax=ax_field)
ax_field.set_title("Mapa de densidad del campo (x_in vs y_in)")
ax_field.set_xlabel("x_in")
ax_field.set_ylabel("y_in")
plt.show()

# -----------------------------------------
# 5. Target variables: x_out, y_out
# -----------------------------------------
target_panels = [
    ("x_out", "orange", "Distribución de x_out (target)"),
    ("y_out", "teal", "Distribución de y_out (target)"),
]

fig, ax = plt.subplots(1, 2, figsize=(12, 5))
for target_ax, (column, colour, panel_title) in zip(ax, target_panels):
    target_ax.hist(df[column], bins=50, color=colour)
    target_ax.set_title(panel_title)

plt.show()

# -----------------------------------------
# 6. frame_id: temporal structure of the tracking data
# -----------------------------------------
fig_frames, ax_frames = plt.subplots(figsize=(7, 4))
df["frame_id"].hist(bins=60, color="gray", ax=ax_frames)
ax_frames.set_title("Distribución global de frame_id")
ax_frames.set_xlabel("frame_id (tiempo discreto)")
ax_frames.set_ylabel("Frecuencia")
plt.show()

print("\nEDA completo generado correctamente.")


# CELDA 5 — Limpieza, ingeniería de características y preparación final del dataset

### Resumen: Preprocesamiento y Feature Engineering (Celda 5)

Esta etapa garantiza la **integridad de los datos** y adapta las variables para la arquitectura del Transformer:

1.  **Seguridad (Data Safety):** Se crea una `copy()` del dataframe para preservar la fuente original intacta.
2.  **Conversión de Unidades:**
    * **Estatura:** Transformación de formato texto (pies-pulgadas) a escala numérica continua (pulgadas) para consistencia matemática.
    * **Temporalidad:** Conversión de *fecha de nacimiento* a *edad*, maximizando la carga informativa.
3.  **Tratamiento de Señales Cíclicas (Ángulos):**
    * Las variables de orientación (`dir`, `o`) se descomponen en sus componentes ortogonales: $\sin(\theta)$ y $\cos(\theta)$.
    * **Justificación:** Elimina la discontinuidad numérica entre $0^\circ$ y $360^\circ$, fundamental para que el modelo interprete correctamente la dirección espacial.
4.  **Codificación Categórica (LabelEncoder):**
    * Se transforman categorías (jugadores, posiciones) a índices enteros.
    * **Motivo:** Necesario para alimentar las capas de **Embeddings** (vectores densos) del Transformer, evitando la explosión dimensional del *One-Hot Encoding*.
5.  **Normalización (`StandardScaler`):**
    * Estandarización de variables numéricas ($z = \frac{x - \mu}{\sigma}$).
    * **Crítico:** Los mecanismos de atención (*dot products*) requieren escalas balanceadas para evitar inestabilidad en los gradientes y asegurar una convergencia eficiente.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# ============================================================
# 1. COPIA DE SEGURIDAD
# ============================================================
# Work on an independent copy so the frame loaded for the EDA stays untouched.
df_clean = df.copy(deep=True)

print("Shape inicial:", df_clean.shape)

# ============================================================
# 2. PROCESAMIENTO DE ALTURA: Convertir '6-2' → 74 pulgadas
# ============================================================
def convert_height(h):
    """Convert a 'feet-inches' height string such as '6-2' into total inches.

    Args:
        h: height text of the form '<feet>-<inches>'.

    Returns:
        The height in inches as an int, or ``np.nan`` when ``h`` cannot be
        parsed (not a string, wrong number of '-' separated parts, or
        non-numeric parts).
    """
    try:
        feet, inches = h.split("-")
        return int(feet) * 12 + int(inches)
    except (AttributeError, TypeError, ValueError):
        # AttributeError: h has no .split (e.g. NaN or None).
        # ValueError: wrong number of parts, or a part that is not an integer.
        # TypeError: a part that int() cannot accept.
        # Anything else (KeyboardInterrupt, SystemExit, ...) is allowed to
        # propagate instead of being swallowed by a bare except.
        return np.nan

# Height text -> inches (unparseable values become NaN).
df_clean["player_height_inches"] = df_clean["player_height"].apply(convert_height)

# ============================================================
# 3. BIRTH DATE -> AGE
# ============================================================
# Unparseable dates become NaT; age is measured in years at the fixed reference date.
birth_dates = pd.to_datetime(df_clean["player_birth_date"], errors="coerce")
df_clean["player_birth_date"] = birth_dates
reference_date = pd.Timestamp("2024-01-01")
df_clean["player_age"] = (reference_date - birth_dates).dt.days / 365.25

# ============================================================
# 4. ANGULAR VARIABLES (dir, o)
#    Encode each angle (degrees) as sine and cosine to avoid the 0/360 jump
# ============================================================
for angle_col in ("dir", "o"):
    angle_rad = np.deg2rad(df_clean[angle_col])
    df_clean[f"{angle_col}_sin"] = np.sin(angle_rad)
    df_clean[f"{angle_col}_cos"] = np.cos(angle_rad)

# ============================================================
# 5. DROP COLUMNS THAT ARE NO LONGER NEEDED
# ============================================================
# The raw height text has been replaced by player_height_inches.
df_clean = df_clean.drop(columns=["player_height"])

# ============================================================
# 6. CATEGORICAL ENCODING (Label Encoding)
#    Integer codes, intended to be used later as embedding indices
# ============================================================
categorical_cols = [
    "player_position",
    "player_side",
    "player_role",
    "play_direction",
    "player_name"
]

# One encoder per column, kept so the mapping can be reused later.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in label_encoders.items():
    df_clean[col] = encoder.fit_transform(df_clean[col].astype(str))

# ============================================================
# 7. STANDARDISATION OF NUMERIC VARIABLES
# ============================================================
num_cols = [
    "x_in", "y_in", "s", "a",
    "absolute_yardline_number",
    "player_height_inches", "player_weight", "player_age",
    "dir_sin", "dir_cos", "o_sin", "o_cos"
]

# NOTE(review): the scaler is fitted on every row, before any train/validation
# split is made later in the notebook.
scaler = StandardScaler().fit(df_clean[num_cols])
df_clean[num_cols] = scaler.transform(df_clean[num_cols])

# ============================================================
# 8. RESULTS
# ============================================================
print("\nShape final:", df_clean.shape)
print("\nColumnas finales:")
print(df_clean.columns.tolist())
print("\nEjemplo de 5 filas:\n", df_clean.head())


# Correcciones.

### Depuración y Selección de Variables (Feature Selection)

El siguiente análisis justifica las acciones correctivas aplicadas al conjunto de datos para garantizar la validez científica del modelo y evitar errores metodológicos comunes:

1.  **Prevención de Fuga de Información (Data Leakage):**
    * Se excluyen las variables `ball_land_x` y `ball_land_y`. Dado que estas métricas representan el estado final del evento (coordenadas de aterrizaje), su inclusión en el vector de entrada violaría el principio de causalidad, permitiendo al modelo inferir el resultado trivialmente en lugar de aprender la dinámica del sistema. Esto asegura métricas de evaluación realistas y no artificialmente optimistas.

2.  **Mitigación de Sobreajuste por Identidad:**
    * Se elimina la variable `player_name`. Al tratarse de un identificador de alta cardinalidad, su presencia induce al modelo a memorizar comportamientos de individuos específicos en lugar de generalizar patrones físicos universales. En un contexto riguroso, se priorizan las características atléticas sobre las etiquetas nominales.

3.  **Reducción de Redundancia y Ruido:**
    * Se descarta `player_birth_date` en favor de la variable derivada `player_age`. Mantener la fecha cruda introduce ruido de formato y duplicidad dimensional sin aportar valor predictivo adicional sobre la edad calculada.

4.  **Codificación y Reproducibilidad:**
    * La conversión de variables categóricas mediante `LabelEncoder` se valida como el método adecuado para alimentar capas de *embeddings*. Se enfatiza la necesidad de persistencia (guardado) de estos objetos codificadores para garantizar la correcta transformación de los datos de prueba y la reproducibilidad futura del sistema.

5.  **Estabilidad Numérica del Entrenamiento:**
    * El escalado de variables numéricas es un requisito mandatorio. Arquitecturas basadas en gradientes, como Transformers o MLPs, requieren entradas normalizadas para evitar la saturación de neuronas y asegurar una convergencia estable y eficiente.

6.  **Coherencia del Conjunto de Entrenamiento:**
    * Se aplica un filtro estricto sobre `player_to_predict == True`. Entrenar con instancias que carecen de una variable objetivo definida o válida introduce ruido y distorsiona la función de pérdida. El modelo debe aprender exclusivamente de muestras con una relación entrada-salida verificable.

In [None]:
# ============================
# CELDA CORRECCIÓN: LIMPIEZA FINAL
# ============================
import os
import pickle
from sklearn.preprocessing import StandardScaler, LabelEncoder

df2 = df_clean.copy()  # start from the frame produced by the preprocessing cell

# 1) Drop columns that could leak the outcome or that are not useful features.
cols_drop = ["ball_land_x", "ball_land_y", "player_name", "player_birth_date"]
df2 = df2.drop(columns=[c for c in cols_drop if c in df2.columns])

# 2) Keep only the rows flagged with player_to_predict == True (when the column exists).
if "player_to_predict" in df2.columns:
    df2 = df2[df2["player_to_predict"]==True].reset_index(drop=True)

# 3) Categorical columns used as embedding indices (player_name is NOT included).
categorical_cols = [
    c for c in ["player_position", "player_side", "player_role", "play_direction"]
    if c in df2.columns
]

# These columns were already label-encoded by the preprocessing cell, so df2
# holds integer codes. Fitting new encoders here would learn a mapping from the
# stringified codes ("0", "1", "10", ...) instead of from the original
# categories, and the pickled encoders could not transform raw test data.
# Keep and persist the encoders that were fitted on the raw categories.
label_encoders = {col: label_encoders[col] for col in categorical_cols}

# Save the encoders for later use.
os.makedirs("/kaggle/working/encoders", exist_ok=True)
with open("/kaggle/working/encoders/label_encoders.pkl", "wb") as f:
    pickle.dump(label_encoders, f)

# 4) Numeric feature columns (the targets x_out, y_out are excluded).
num_cols = [
    "x_in", "y_in", "s", "a",
    "absolute_yardline_number",
    "player_height_inches", "player_weight", "player_age",
    "dir_sin", "dir_cos", "o_sin", "o_cos"
]
# Keep only the ones that actually exist.
num_cols = [c for c in num_cols if c in df2.columns]

# These columns were already standardised by the preprocessing cell with
# `scaler`. Fitting a second scaler on the standardised values would save
# statistics that do not describe the raw data, so the saved object could not
# reproduce this transformation on raw inputs. Persist the scaler fitted on
# the raw values and leave df2 as it is.
if getattr(scaler, "n_features_in_", len(num_cols)) != len(num_cols):
    raise ValueError(
        "The fitted scaler does not cover the numeric columns of df2; "
        "re-run the preprocessing cell before this one."
    )

# Save the scaler.
with open("/kaggle/working/encoders/num_scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# 5) Optional: drop columns that will not be used downstream.
#    nfl_id is kept because it is needed to group rows into per-player
#    sequences and could also serve for player embeddings.

# 6) Final summary
print("Shape final después de correcciones:", df2.shape)
print("Columnas finales (preview):", df2.columns.tolist())

# Save the corrected parquet for the following cells.
out_path = "/kaggle/working/train_clean.parquet"
df2.to_parquet(out_path, index=False)
print("Guardado:", out_path)


# Celda 6 - cargar dataset limpio.
Esta celda construye el corazón del modelo completo:

✔️ 1. Positional Encoding

Como en cualquier Transformer:

Añade una señal sen/cos para indicar posición temporal.

Sin ella, el modelo no sabría cuál frame es primero o último.

✔️ 2. Linear Input Projection

La entrada tiene 13 features (`input_dim = len(INPUT_FEATURES)`).
El Transformer funciona mejor en espacios más grandes:

→ Lo ampliamos a d_model = 128.

✔️ 3. Transformer Encoder

Con:

atención multi-cabeza

feedforward interno

dropout

máscara de padding

Esto permite aprender patrones temporales:

cambios de velocidad

dirección

movimiento lateral

desplazamiento continuo en el tiempo

✔️ 4. Output Head

Toma cada frame del encoding y predice:

x_out

y_out

✔️ 5. Máscara de padding

Evita que secuencias cortas contaminen el batch.

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import math

# ============================================================
# Input features and input_dim
# ============================================================

# Input features (targets excluded)
INPUT_FEATURES = [
    "x_in","y_in","s","a",
    "dir_sin","dir_cos",
    "o_sin","o_cos",
    "play_direction",
    "absolute_yardline_number",
    "player_height_inches",
    "player_weight",
    "player_age"
]

# Load only the feature columns of the clean dataset. input_dim comes from the
# list above, so reading every column just to discard it wastes memory; reading
# the listed columns also verifies that each feature exists in the file.
df_tmp = pd.read_parquet("/kaggle/working/train_clean.parquet", columns=INPUT_FEATURES)

missing_features = [c for c in INPUT_FEATURES if c not in df_tmp.columns]
if missing_features:
    raise ValueError(f"INPUT_FEATURES not found in train_clean.parquet: {missing_features}")

input_dim = len(INPUT_FEATURES)
print("input_dim =", input_dim)


# ============================================================
# POSITIONAL ENCODING
# ============================================================
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to a sequence tensor.

    A table of shape (1, max_len, d_model) is pre-computed and stored as the
    buffer ``pe``. Even feature indices hold sines and odd indices hold
    cosines of ``position * div_term``.

    Args:
        d_model: size of the last dimension of the tensors passed to forward.
            Both even and odd values are supported.
        max_len: number of positions pre-computed; the longest sequence
            (dimension 1 of the input) that forward accepts.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).float().unsqueeze(1)

        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )

        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd column than even columns, so
        # the frequency vector is trimmed to match; for even d_model the slice
        # is the whole vector.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encoding of its first ``x.size(1)`` positions.

        Args:
            x: tensor whose dimension 1 is the sequence position and whose
                last dimension is d_model.

        Raises:
            ValueError: if the sequence is longer than the pre-computed table.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_len {self.pe.size(1)} "
                "of the positional encoding."
            )
        return x + self.pe[:, :seq_len, :]


# ============================================================
# TRANSFORMER MODEL
# ============================================================
class TransformerTrajectoryModel(nn.Module):
    """Transformer encoder that maps each frame of a sequence to ``output_dim`` values.

    Pipeline: linear projection of the input features to ``d_model``, additive
    positional encoding, a stack of ``num_layers`` batch-first Transformer
    encoder layers, and a two-layer MLP head applied to every position.

    Args:
        input_dim: number of features per frame.
        d_model: width of the encoder.
        nhead: number of attention heads.
        num_layers: number of encoder layers.
        dim_feedforward: hidden width of each encoder layer's feed-forward part.
        dropout: dropout rate of the encoder layers.
        output_dim: number of values predicted per frame.
    """

    def __init__(self, input_dim, d_model=128, nhead=4,
                 num_layers=3, dim_feedforward=256,
                 dropout=0.1, output_dim=2):

        super().__init__()
        head_hidden = d_model // 2

        self.input_proj = nn.Linear(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model)

        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True
            ),
            num_layers=num_layers
        )

        self.output_head = nn.Sequential(
            nn.Linear(d_model, head_hidden),
            nn.ReLU(),
            nn.Linear(head_hidden, output_dim)
        )

        self._init_weights()

    def _init_weights(self):
        """Xavier-uniform weights and zero biases for every nn.Linear submodule."""
        linear_layers = (m for m in self.modules() if isinstance(m, nn.Linear))
        for linear in linear_layers:
            nn.init.xavier_uniform_(linear.weight)
            if linear.bias is None:
                continue
            nn.init.zeros_(linear.bias)

    def forward(self, x, padding_mask=None):
        """Predict ``output_dim`` values for every position of ``x``.

        Args:
            x: input tensor, batch first, with ``input_dim`` features last.
            padding_mask: optional mask; positions equal to 0 are passed to
                the encoder as padding (``src_key_padding_mask``).
        """
        # The encoder expects True at padded positions, i.e. where the mask is 0.
        key_padding = None if padding_mask is None else (padding_mask == 0)

        embedded = self.pos_encoder(self.input_proj(x))
        encoded = self.encoder(embedded, src_key_padding_mask=key_padding)
        return self.output_head(encoded)


# ============================================================
# TEST DEL MODELO (sin requerir que exista batch_x)
# ============================================================
# Smoke test: push a random batch through a freshly built model and print the output shape.
print("Generando batch temporal de prueba...")
n_sequences, n_frames = 4, 20
batch_x = torch.randn(n_sequences, n_frames, input_dim)
batch_mask = torch.ones(n_sequences, n_frames)  # all ones: no position is treated as padding

model = TransformerTrajectoryModel(input_dim=input_dim)
preds = model(batch_x, padding_mask=batch_mask)

print("Pred shape =", preds.shape)


# Celda 7 - construcción de secuencias, DataLoaders y split por grupo
##  Preparación de Datos Secuenciales para Modelo Transformer

Este proceso se centra en transformar datos de seguimiento (tracking data) en secuencias estandarizadas, adecuadas para el entrenamiento de un modelo de aprendizaje profundo que predice coordenadas **frame-a-frame**.

| Sección | Concepto | Descripción Detallada |
| :---: | :---: | :--- |
| **1** | **Selección de Entradas y Objetivos** | **`INPUT_FEATURES`** corresponde exactamente a las **columnas normalizadas y pre-procesadas** (ej. `x_in`, `y_in`, `s`, `a`, `dir_sin`, `dir_cos`, etc.). **`TARGET_COLS = ["x_out", "y_out"]`** son las coordenadas de salida (posiciones) que el modelo debe predecir en cada paso de tiempo (frame). |
| **2** | **Agrupamiento para Crear Secuencias** | Se agrupa la información utilizando la tupla **`(game_id, play_id, nfl_id)`**. Esto es crucial porque cada secuencia debe encapsular el movimiento **completo de un jugador específico** dentro de una jugada única. Luego, se ordena estrictamente por **`frame_id`** para mantener la **cronología temporal** de la señal. |
| **3** | **Filtrado por Longitud Mínima** | Se impone un umbral **`MIN_SEQ_LEN = 2`**. Este filtro es una medida de calidad para garantizar que cada secuencia resultante contenga al menos un par **entrada $\rightarrow$ salida** significativo. Secuencias de longitud 1 se consideran **triviales** y se descartan por no ofrecer suficiente contexto para el entrenamiento secuencial. |
| **4** | **Split por GroupKFold (Estrategia Anti-Leakage)** | Se utiliza **`GroupKFold`** con **`play_id`** como variable de agrupación. Esta es una práctica fundamental para evitar el **data leakage** (fuga de información), asegurando que una misma jugada **entera** nunca aparezca simultáneamente en los conjuntos de entrenamiento y validación. Se seleccionan **5 folds** y se usa el primer *split* para obtener el conjunto de validación. |
| **5** | **Dataset (Padding y Máscara)** | La clase **`SequenceDataset`** recibe secuencias de **longitud variable** ($T_i, F$). En `__getitem__` realiza el **padding** (relleno con ceros) o el truncado de cada secuencia hasta una longitud fija **`MAX_LEN = 20`**, por lo que no se necesita una `collate_fn` propia, y, crucialmente, genera una **máscara binaria** (**`mask`**) donde $1.0$ indica un *token* (frame) **válido** y $0.0$ indica *padding*. Esta máscara se convierte dentro del modelo y se utiliza en el Transformer como **`src_key_padding_mask`** (donde `False` $\rightarrow$ válido y `True` $\rightarrow$ pad). |
| **6** | **DataLoader y `batch_size`** | Se establece **`batch_size = 32`**, un valor que apunta a un buen equilibrio entre velocidad de entrenamiento y **estabilidad del gradiente**. En esta versión los `DataLoader` se crean con los valores por defecto de `num_workers` y `pin_memory`; activar **`num_workers=2`** y **`pin_memory=True`** puede optimizar la transferencia de datos a la GPU (si aplica). El uso de **`shuffle=True`** en el *train set* es la manera tradicional de garantizar la mezcla de datos en cada *epoch*. |
| **7** | **Salidas Verificadas (Dimensionalidad del Batch)** | Los tensores resultantes del `DataLoader` cumplen con las siguientes formas (donde $B=32$ y $T_{\text{max}}$ es la longitud máxima del batch): |
| | **`batch_x` (Entradas)** | Forma: $(32, T_{\text{max}}, 13)$ |
| | **`batch_y` (Objetivos)** | Forma: $(32, T_{\text{max}}, 2)$ |
| | **`batch_mask` (Máscara)** | Forma: $(32, T_{\text{max}})$ (Valores: $1.0$ para válido, $0.0$ para *padding*) |
| | **`lengths` (Longitudes Reales)** | Vector de longitudes reales ($T_i$) para cada secuencia del batch. |

In [None]:
# ============================================================
# CELDA 7 — ConstruCCIÓN DE SECUENCIAS (frame-to-frame) y DATALOADERS
# ============================================================

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.model_selection import GroupKFold

# ============================================================
# 1) Cargar dataset limpio
# ============================================================
df = pd.read_parquet("/kaggle/working/train_clean.parquet")
print("df shape:", df.shape)

# ============================================================
# 2) Build one sequence per (game_id, play_id, nfl_id)
# ============================================================
group_cols = ["game_id", "play_id", "nfl_id"]

# Sort by frame so every sequence is in chronological order.
df = df.sort_values(group_cols + ["frame_id"])

# Input variables: the same 13 columns, in the same order, that the model cell
# declares as INPUT_FEATURES. The previous list fed the raw `dir` and `o`
# angles (which are not standardised and duplicate the sin/cos encodings) and
# left out `play_direction` and `player_weight`, so the sequences did not
# match the features the model was sized for.
input_features = [
    "x_in", "y_in", "s", "a",
    "dir_sin", "dir_cos",
    "o_sin", "o_cos",
    "play_direction",
    "absolute_yardline_number",
    "player_height_inches",
    "player_weight",
    "player_age"
]

# Output variables
output_features = ["x_out", "y_out"]

X_seqs, Y_seqs, group_ids = [], [], []

print("Construyendo secuencias:")
# Group once and reuse the object for both the iteration and the progress-bar
# total, instead of running a separate drop_duplicates pass over the frame.
grouped = df.groupby(group_cols)
for (g, p, n), group_df in tqdm(grouped, total=grouped.ngroups):

    x_seq = group_df[input_features].values.astype(np.float32)
    y_seq = group_df[output_features].values.astype(np.float32)

    X_seqs.append(torch.tensor(x_seq))
    Y_seqs.append(torch.tensor(y_seq))

    # The split group is the play_id, so all sequences of one play land on the
    # same side of the train/validation split.
    # NOTE(review): if play_id is only unique within a game, plays of different
    # games that share an id also end up in the same group — confirm this is
    # acceptable.
    group_ids.append(p)

print("Total secuencias creadas:", len(X_seqs))

seq_lengths = [len(x) for x in X_seqs]
print("Longitud media de secuencia:", np.mean(seq_lengths), "mediana:", np.median(seq_lengths))

# ============================================================
# 3) Padding y creación del Dataset
# ============================================================

class SequenceDataset(Dataset):
    """Dataset that returns every sequence at a fixed length ``max_len``.

    Sequences shorter than ``max_len`` are right-padded with zeros; longer
    ones are cut to their first ``max_len`` steps.

    Args:
        X_list: list of per-sequence input tensors (steps on dimension 0).
        Y_list: list of per-sequence target tensors (steps on dimension 0).
        max_len: number of steps of every returned item.
    """

    def __init__(self, X_list, Y_list, max_len=20):
        self.X_list, self.Y_list = X_list, Y_list
        self.max_len = max_len

    def __len__(self):
        return len(self.X_list)

    def __getitem__(self, idx):
        """Return ``(x, y, mask)``; ``mask`` is 1.0 on real steps and 0.0 on padding."""
        x, y = self.X_list[idx], self.Y_list[idx]
        n_steps = len(x)

        # Long enough already: truncate, every kept step is valid.
        if n_steps >= self.max_len:
            return x[:self.max_len], y[:self.max_len], torch.ones(self.max_len)

        # Too short: append rows of zeros and mark them as padding in the mask.
        n_pad = self.max_len - n_steps
        x_filler = torch.zeros(n_pad, x.shape[1])
        y_filler = torch.zeros(n_pad, y.shape[1])
        mask = torch.zeros(self.max_len)
        mask[:n_steps] = 1.0

        return torch.cat([x, x_filler], dim=0), torch.cat([y, y_filler], dim=0), mask

# Every sequence is padded or truncated to this many steps.
MAX_LEN = 20
dataset_all = SequenceDataset(X_seqs, Y_seqs, max_len=MAX_LEN)

# ============================================================
# 4) Train/validation split with GroupKFold
# ============================================================

gkf = GroupKFold(n_splits=5)
indices = np.arange(len(X_seqs))

# Only the first of the five folds is used: its held-out part is the validation set.
train_idx, val_idx = next(gkf.split(indices, groups=group_ids))

print("Train sequences:", len(train_idx), "Val sequences:", len(val_idx))

# ============================================================
# 5) Build the DataLoaders (BATCH SIZE = 32)
# ============================================================

batch_size = 32

train_subset = torch.utils.data.Subset(dataset_all, train_idx)
val_subset = torch.utils.data.Subset(dataset_all, val_idx)

# Training batches are reshuffled; validation batches keep a fixed order.
train_loader = DataLoader(dataset=train_subset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_subset, batch_size=batch_size, shuffle=False)

print("DataLoaders creados correctamente.")


# CELDA 8 — ENTRENAMIENTO (Transformer, AdamW, MAE, EarlyStopping, Checkpoint)

**1) Device**

Se detecta CUDA y se usa DEVICE. En Kaggle, si añadiste GPU, será GPU; si no, CPU (más lento).

**2) Modelo**

Reutiliza la clase TransformerTrajectoryModel que ya probaste y validaste (input_dim=13). Mantengo d_model=128, 3 capas, 4 cabezas.

**3) Optimizer AdamW**

AdamW desacopla el decaimiento de pesos (weight decay) de la actualización basada en el gradiente y lo aplica directamente a los parámetros; suele generalizar mejor que Adam con regularización L2.

**4) Scheduler ReduceLROnPlateau**

Si el val_loss no mejora, baja LR a la mitad (factor 0.5) tras patience=3. Esto estabiliza ajustes finos.

**5) Loss**

Usamos MAE (L1). Implementación: sumamos errores solo donde mask==1, dividimos por número real de elementos válidos. De este modo evitamos que el padding contamine la métrica.

**6) Masked computation**

mask es (B,T) con 1.0=valido, 0.0=pad.

Expandimos a (B,T,1) para combinar con pred y target.

Evitamos división por cero con +1e-12.

**7) Gradient clipping**

Cortamos norma gradiente a 1.0 para evitar saltos. Muy útil en Transformers.

**8) Checkpoint**

Guardamos el mejor modelo (según val_mae) en MODEL_PATH. También guardamos el optimizador en ese checkpoint.

**9) Early stopping**

PATIENCE=6 epochs sin mejora → interrumpe. Evita sobreentrenar.

**10) Historial**

Se guarda history con métricas que puedes graficar luego (train/val MAE vs epoch).

In [None]:
# ============================================================
# CELDA 8 — ENTRENAMIENTO
# ============================================================
import os
import time
import copy
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import AdamW
from tqdm.auto import tqdm
import numpy as np

# Training hyperparameters
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EPOCHS = 30
LR = 1e-4
WEIGHT_DECAY = 1e-5
PATIENCE = 6                 # early stopping patience (epochs without improvement)
GRAD_CLIP = 1.0              # max gradient norm used for clipping
MODEL_PATH = "/kaggle/working/best_transformer_model.pth"

print("DEVICE:", DEVICE)

# ---------------------------------------------------------------------
# Reuse the TransformerTrajectoryModel class defined in cell 6.
# Make sure that definition is in memory; if not, re-run cell 6 first.
# ---------------------------------------------------------------------
model = TransformerTrajectoryModel(input_dim=input_dim, d_model=128, nhead=4, num_layers=3, dim_feedforward=256, dropout=0.1, output_dim=2)
model.to(DEVICE)

# Optimizer + scheduler.
# The `verbose` keyword of ReduceLROnPlateau is deprecated (PyTorch >= 2.2) and
# rejected by newer releases, so it is not passed; the training loop already
# prints the current learning rate after every epoch.
optimizer = AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

# Summed L1 criterion. The training loop computes the masked L1 loss by hand;
# this object is kept available for callers that want sum-reduction L1.
criterion = nn.L1Loss(reduction='sum')

# Early stopping bookkeeping
best_val_mae = float("inf")
best_model_wts = copy.deepcopy(model.state_dict())
epochs_no_improve = 0
history = {"train_mae": [], "val_mae": [], "train_loss": [], "val_loss": [], "lr": []}

# Helper to compute masked MAE
def masked_mae(preds, targets, mask):
    """
    Masked mean absolute error, averaged per coordinate.

    Args:
        preds, targets: tensors of shape (B, T, C) (C = 2 for x/y).
        mask: tensor of shape (B, T) with 1.0 = valid step, 0.0 = padding.

    Returns:
        (mae, n_valid) where
          * mae is a detached 0-dim tensor on ``preds.device`` holding the mean
            of |preds - targets| over all valid coordinate elements
            (valid steps * C), i.e. the same normalisation as the training loss;
          * n_valid is the number of valid time steps (``mask.sum()``) as a
            Python number, which callers use as the weight of this batch.
        When the mask has no valid step the result is ``(0.0, 0.0)``.
    """
    # Expand mask so it broadcasts over the coordinate dimension.
    mask3 = mask.unsqueeze(-1)  # (B, T, 1)
    n_valid = mask3.sum().item()
    if n_valid == 0:
        return torch.tensor(0.0, device=preds.device), 0.0
    abs_err_sum = (torch.abs(preds - targets) * mask3).sum()
    # Each valid step contributes C coordinate errors; dividing by the step
    # count alone would report C times the per-coordinate MAE.
    n_elements = n_valid * preds.shape[-1]
    mae = (abs_err_sum / n_elements).item()
    return torch.tensor(mae, device=preds.device), n_valid

# Training loop: one pass over train_loader, one over val_loader per epoch,
# followed by LR scheduling, logging, checkpointing and early stopping.
for epoch in range(1, EPOCHS + 1):
    t0 = time.time()
    model.train()
    # Token-weighted running sums so epoch metrics are averages over valid steps,
    # not over batches.
    running_loss = 0.0
    running_mae_sum = 0.0
    running_tokens = 0.0

    pbar = tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS} — train", leave=False)
    for batch in pbar:
        # batch: x_pad, y_pad, mask (SequenceDataset returns these)
        x_batch, y_batch, mask_batch = batch  # shapes: (B, T, F), (B, T, 2), (B, T)
        x_batch = x_batch.to(DEVICE)
        y_batch = y_batch.to(DEVICE)
        mask_batch = mask_batch.to(DEVICE)

        optimizer.zero_grad()
        # NOTE(review): the mask (1.0 = valid) is passed as `padding_mask`; how the
        # model interprets it is defined in cell 6 — confirm the polarity there.
        preds = model(x_batch, padding_mask=mask_batch)  # (B, T, 2)

        # Compute the L1 loss only on valid (non-padded) steps:
        # sum the masked absolute errors, then divide by the number of valid
        # coordinate elements.
        mask3 = mask_batch.unsqueeze(-1)
        loss_sum = torch.abs(preds - y_batch) * mask3
        loss_sum = loss_sum.sum()  # scalar sum over batch and dims
        valid_tokens = mask3.sum() * preds.shape[-1]  # valid steps * number of coordinates
        # We want MAE per coordinate, so divide by number of coordinate-elements;
        # the epsilon avoids a division by zero on an all-padding batch.
        loss = loss_sum / (valid_tokens + 1e-12)

        loss.backward()
        # gradient clipping (max norm GRAD_CLIP)
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()

        # stats: masked_mae returns (mae tensor, number of valid steps)
        running_loss += loss.item() * mask_batch.sum().item()  # weighted by tokens
        mae_batch, valid_count = masked_mae(preds, y_batch, mask_batch)
        running_mae_sum += mae_batch.item() * valid_count
        running_tokens += valid_count

        pbar.set_postfix({"loss": f"{loss.item():.4f}", "mae": f"{mae_batch.item():.4f}"})

    # epoch train metrics (token-weighted averages)
    train_epoch_mae = (running_mae_sum / running_tokens) if running_tokens > 0 else 0.0
    train_epoch_loss = running_loss / (running_tokens + 1e-12)

    # Validation (no gradients, eval mode)
    model.eval()
    val_running_mae_sum = 0.0
    val_running_tokens = 0.0
    val_running_loss_sum = 0.0

    with torch.no_grad():
        pbar_val = tqdm(val_loader, desc=f"Epoch {epoch}/{EPOCHS} — val  ", leave=False)
        for batch in pbar_val:
            x_batch, y_batch, mask_batch = batch
            x_batch = x_batch.to(DEVICE)
            y_batch = y_batch.to(DEVICE)
            mask_batch = mask_batch.to(DEVICE)

            preds = model(x_batch, padding_mask=mask_batch)

            # compute validation loss with the same masking as in training
            mask3 = mask_batch.unsqueeze(-1)
            loss_sum = torch.abs(preds - y_batch) * mask3
            loss_sum = loss_sum.sum()
            valid_tokens = mask3.sum() * preds.shape[-1]
            loss = loss_sum / (valid_tokens + 1e-12)

            mae_batch, valid_count = masked_mae(preds, y_batch, mask_batch)
            val_running_mae_sum += mae_batch.item() * valid_count
            val_running_tokens += valid_count
            val_running_loss_sum += loss.item() * valid_count

    val_epoch_mae = (val_running_mae_sum / val_running_tokens) if val_running_tokens > 0 else 0.0
    val_epoch_loss = val_running_loss_sum / (val_running_tokens + 1e-12)

    # Scheduler step on validation loss (ReduceLROnPlateau)
    scheduler.step(val_epoch_loss)

    # logging: the stored lr is the value after the scheduler step
    history["train_mae"].append(train_epoch_mae)
    history["val_mae"].append(val_epoch_mae)
    history["train_loss"].append(train_epoch_loss)
    history["val_loss"].append(val_epoch_loss)
    history["lr"].append(optimizer.param_groups[0]["lr"])

    epoch_time = time.time() - t0
    print(f"Epoch {epoch:02d} | time {epoch_time:.1f}s | train_mae {train_epoch_mae:.4f} | val_mae {val_epoch_mae:.4f} | lr {optimizer.param_groups[0]['lr']:.2e}")

    # Early stopping & checkpoint: an epoch counts as an improvement only when
    # val_mae drops by more than 1e-5.
    if val_epoch_mae < best_val_mae - 1e-5:
        best_val_mae = val_epoch_mae
        best_model_wts = copy.deepcopy(model.state_dict())
        # Checkpoint holds both model and optimizer state.
        torch.save({"model_state_dict": model.state_dict(), "optimizer_state_dict": optimizer.state_dict()}, MODEL_PATH)
        print(f"  -> New best model saved (val_mae {best_val_mae:.4f})")
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        print(f"  No improvement for {epochs_no_improve} epoch(s)")

    if epochs_no_improve >= PATIENCE:
        print(f"Early stopping triggered (no improvement for {PATIENCE} epochs).")
        break

# Load best model weights back into the live model and also save them as a
# plain state_dict file.
# NOTE(review): MODEL_PATH is only written when some epoch improved val_mae;
# if none did, the message below still reports it as saved.
model.load_state_dict(best_model_wts)
torch.save(model.state_dict(), "/kaggle/working/final_best_model_weights.pth")

print("Training finished. Best val_mae:", best_val_mae)
print("Best model saved to:", MODEL_PATH)


# CELDA 9 (inferencia + submission.csv) con el modelo final

1. Carga y verifica que existan test_input.csv, test.csv, los encoders y el modelo.

2. Aplica las mismas transformaciones del entrenamiento (height→inches, birth_date→age, sin/cos angulares, label encoding seguro y scaler numérico).

3. Para valores categóricos desconocidos en test, mapea a 0 (evita errores de LabelEncoder).

4. Agrupa por (game_id, play_id, nfl_id) y crea secuencias exactamente igual que en entrenamiento, manteniendo un puntero _orig_index por cada fila para reconstruir el orden original.

5. Crea batches y hace inferencia con el Transformer guardado.

6. Asigna las predicciones por fila del test_input (por índice original).

7. Hace merge con test.csv usando las columnas comunes (usualmente game_id, play_id, nfl_id, frame_id) para obtener el orden y formato que exige la competición. Si no existen columnas en común intenta alinear por orden, con comprobación de tamaño.

8. Guarda submission.csv en /kaggle/working/submission.csv.



In [None]:
# ============================================================
# CELDA 9 — INFERENCIA (test_input.csv) y CREAR submission.csv
# Objetivo: predecir (x_out, y_out) para cada fila de test_input
# Necesita: /kaggle/working/best_transformer_model.pth
#          /kaggle/working/encoders/num_scaler.pkl
#          /kaggle/working/encoders/label_encoders.pkl
# ============================================================

import os
import pandas as pd
import numpy as np
import torch
import pickle
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

# ---------------------------
# Paths (competition inputs, saved preprocessing artifacts, model, output)
# ---------------------------
DATA_DIR = "/kaggle/input/nfl-big-data-bowl-2026-prediction"
TEST_INPUT_PATH = os.path.join(DATA_DIR, "test_input.csv")
TEST_KEYS_PATH  = os.path.join(DATA_DIR, "test.csv")
ENC_DIR = "/kaggle/working/encoders"
SCALER_PATH = os.path.join(ENC_DIR, "num_scaler.pkl")
LABELS_PATH = os.path.join(ENC_DIR, "label_encoders.pkl")
MODEL_PATH = "/kaggle/working/best_transformer_model.pth"
OUT_SUB = "/kaggle/working/submission.csv"

# ---------------------------
# File checks: fail early with a clear message if any required file is missing
# ---------------------------
assert os.path.exists(TEST_INPUT_PATH), f"No se encontró {TEST_INPUT_PATH}"
assert os.path.exists(TEST_KEYS_PATH),  f"No se encontró {TEST_KEYS_PATH}"
assert os.path.exists(SCALER_PATH),     f"No se encontró {SCALER_PATH}"
assert os.path.exists(LABELS_PATH),     f"No se encontró {LABELS_PATH}"
assert os.path.exists(MODEL_PATH),      f"No se encontró {MODEL_PATH}"

print("Todos los archivos necesarios están presentes.")

# ---------------------------
# Load test files: test_input holds the feature rows, test_keys the rows the
# submission must contain
# ---------------------------
test_input = pd.read_csv(TEST_INPUT_PATH)
test_keys  = pd.read_csv(TEST_KEYS_PATH)

print("test_input.shape:", test_input.shape)
print("test_keys.shape:", test_keys.shape)

# ---------------------------
# Helper transforms (mirror entrenamiento)
# ---------------------------
def convert_height(h):
    """Convert a "feet-inches" height string (e.g. "6-2") into total inches.

    Args:
        h: raw height value; it is stringified before parsing, so NaN/None
           and other non-string values are accepted.

    Returns:
        int number of inches, or ``np.nan`` when the value does not have the
        form "<int>-<int>".
    """
    try:
        feet, inches = str(h).split("-")
        return int(feet) * 12 + int(inches)
    except ValueError:
        # Wrong number of "-" separated parts or non-integer parts. Only
        # ValueError is caught so KeyboardInterrupt/SystemExit still propagate.
        return np.nan

# Load the scaler and label encoders saved during training.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# artifacts that this notebook itself produced.
with open(SCALER_PATH, "rb") as f:
    scaler = pickle.load(f)

with open(LABELS_PATH, "rb") as f:
    label_encoders = pickle.load(f)

# Input columns fed to the model; the list order defines the feature order of
# every sequence array, and its length is used as the model's input_dim below.
INPUT_FEATURES = [
    "x_in","y_in","s","a",
    "dir_sin","dir_cos",
    "o_sin","o_cos",
    "play_direction",
    "absolute_yardline_number",
    "player_height_inches",
    "player_weight",
    "player_age"
]

# ---------------------------
# 1) Preprocesamiento de test_input (mirror train)
# ---------------------------
df_test = test_input.copy()
print("Preprocesando test_input ...")

# Height: "feet-inches" strings -> total inches. When the raw column is absent
# and no pre-computed column exists, a NaN placeholder column is created.
if "player_height" in df_test.columns:
    df_test["player_height_inches"] = df_test["player_height"].apply(convert_height)
elif "player_height_inches" not in df_test.columns:
    df_test["player_height_inches"] = np.nan

# Birth date -> age in years measured at 2024-01-01 (unparseable dates -> NaT).
if "player_birth_date" in df_test.columns:
    birth_dates = pd.to_datetime(df_test["player_birth_date"], errors="coerce")
    df_test["player_birth_date"] = birth_dates
    df_test["player_age"] = (pd.Timestamp("2024-01-01") - birth_dates).dt.days / 365.25
elif "player_age" not in df_test.columns:
    df_test["player_age"] = np.nan

# Angles in degrees -> sin/cos pairs; missing angles are treated as 0 degrees.
# Without the raw angle column, (sin, cos) defaults to (0, 1) unless the sin
# column is already present.
if "dir" in df_test.columns:
    dir_rad = np.deg2rad(df_test["dir"].fillna(0.0).astype(float))
    df_test["dir_sin"], df_test["dir_cos"] = np.sin(dir_rad), np.cos(dir_rad)
elif "dir_sin" not in df_test.columns:
    df_test["dir_sin"], df_test["dir_cos"] = 0.0, 1.0

if "o" in df_test.columns:
    o_rad = np.deg2rad(df_test["o"].fillna(0.0).astype(float))
    df_test["o_sin"], df_test["o_cos"] = np.sin(o_rad), np.cos(o_rad)
elif "o_sin" not in df_test.columns:
    df_test["o_sin"], df_test["o_cos"] = 0.0, 1.0

# Guarantee that the remaining numeric inputs exist (zero-filled when absent).
for c in ("x_in", "y_in", "s", "a", "absolute_yardline_number", "player_weight"):
    if c in df_test.columns:
        continue
    df_test[c] = 0.0

# Label encoding: map categorical columns with the encoders saved in training.
cat_cols = list(label_encoders.keys())


def safe_transform(vals, le):
    """Encode a Series with a fitted LabelEncoder, mapping unseen values to 0.

    Args:
        vals: pandas Series of raw category values (compared as strings).
        le: fitted encoder exposing ``classes_``; the code of a class is its
            position in ``classes_``, which is what ``le.transform`` returns.

    Returns:
        1-D integer numpy array with one code per row of ``vals``.
    """
    # Build the lookup once instead of calling le.transform for every row.
    code_by_class = {str(cls): code for code, cls in enumerate(le.classes_)}
    codes = vals.astype(str).map(code_by_class).fillna(0)  # unseen -> 0
    return codes.to_numpy(dtype=int)


for c in cat_cols:
    if c in df_test.columns:
        le = label_encoders[c]
        df_test[c] = safe_transform(df_test[c], le)

# Scale the numeric columns with the scaler loaded from training artifacts.
# NOTE(review): this list and its order must match what the scaler was fitted
# on in training — that fit is not visible here; confirm.
num_cols = ["x_in","y_in","s","a","absolute_yardline_number",
            "player_height_inches","player_weight","player_age",
            "dir_sin","dir_cos","o_sin","o_cos"]

# Create any numeric column missing from the test frame as zeros so that the
# column selection below cannot fail.
for c in num_cols:
    if c not in df_test.columns:
        df_test[c] = 0.0

# Apply the scaler on an array built from num_cols in this exact order.
arr_nums = df_test[num_cols].to_numpy(dtype=float)
arr_nums[np.isnan(arr_nums)] = 0.0  # replace NaNs with 0 (in raw units) before scaling
arr_scaled = scaler.transform(arr_nums)
df_test[num_cols] = arr_scaled

print("Preprocesamiento terminado. Columnas disponibles:", df_test.shape[1])

# ---------------------------
# 2) Construcción de secuencias del test y mapeo a índices originales
# ---------------------------
# One sequence per (game, play, player).
group_cols = ["game_id", "play_id", "nfl_id"]

# Sort rows by group and, when available, by frame_id; the index is reset so
# row labels equal row positions in the sorted frame.
if "frame_id" in df_test.columns:
    df_test = df_test.sort_values(group_cols + ["frame_id"]).reset_index(drop=True)
else:
    df_test = df_test.sort_values(group_cols).reset_index(drop=True)

# Row pointer used to write predictions back to df_test rows.
# NOTE(review): this is assigned AFTER sorting, so it indexes the sorted
# df_test, not the row order of the raw test_input file.
original_index = np.arange(len(df_test))
df_test["_orig_index"] = original_index

# Build the list of per-sequence feature arrays plus, for each sequence, the
# row pointers of its frames.
seq_groups = []
seq_index_refs = []  # per sequence: array of _orig_index values (ordered by frame)
for (g,p,n), gdf in df_test.groupby(group_cols, sort=False):
    gdf_sorted = gdf.sort_values("frame_id") if "frame_id" in gdf.columns else gdf
    X = gdf_sorted[INPUT_FEATURES].to_numpy(dtype=np.float32)  # (T, F)
    idxs = gdf_sorted["_orig_index"].to_numpy(dtype=int)       # (T,)
    seq_groups.append(X)
    seq_index_refs.append(idxs)

print("Secuencias creadas en test:", len(seq_groups))

# ---------------------------
# 3) Dataset test y DataLoader
# ---------------------------
class TestSequenceDataset(Dataset):
    """Dataset of variable-length test sequences with their row pointers.

    Item ``i`` is ``(x, idxs)``: ``x`` is ``X_list[i]`` wrapped as a tensor
    (shape (T, F)) and ``idxs`` is ``idx_refs[i]`` returned unchanged.
    """

    def __init__(self, X_list, idx_refs):
        self.X_list, self.idx_refs = X_list, idx_refs

    def __len__(self):
        return len(self.X_list)

    def __getitem__(self, i):
        features = torch.from_numpy(self.X_list[i])  # shares memory with the numpy array
        return features, self.idx_refs[i]

def collate_test(batch):
    """Pad a batch of ``(x, idxs)`` items to the longest sequence in the batch.

    Returns:
        x_padded: float32 tensor (B, T_max, F), zero-padded at the end.
        mask: float32 tensor (B, T_max), 1.0 on real steps and 0.0 on padding.
        idxs_padded: list with each item's ``idxs`` object, unpadded.
        lengths: list with the true length of each sequence.
    """
    xs = [sample[0] for sample in batch]
    idxs_padded = [sample[1] for sample in batch]  # kept as-is (no padding)
    lengths = [seq.shape[0] for seq in xs]
    n_seq, t_max, n_feat = len(xs), max(lengths), xs[0].shape[1]

    x_padded = torch.zeros((n_seq, t_max, n_feat), dtype=torch.float32)
    mask = torch.zeros((n_seq, t_max), dtype=torch.float32)
    for row, (seq, seq_len) in enumerate(zip(xs, lengths)):
        x_padded[row, :seq_len, :] = seq
        mask[row, :seq_len] = 1.0
    return x_padded, mask, idxs_padded, lengths

# Batches of 32 sequences in dataset order (no shuffling), padded per batch by
# collate_test.
test_dataset = TestSequenceDataset(seq_groups, seq_index_refs)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_test)

# ---------------------------
# 4) Cargar modelo y ejecutar inferencia (batch)
# ---------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# TransformerTrajectoryModel must already be defined in memory (cell 6);
# re-run cell 6 first if it is not.
# The architecture arguments mirror the ones used when the checkpoint was
# trained (cell 8): relying on class defaults for dim_feedforward / dropout /
# output_dim could build layers whose shapes do not match the saved weights.
model = TransformerTrajectoryModel(
    input_dim=len(INPUT_FEATURES), d_model=128, nhead=4, num_layers=3,
    dim_feedforward=256, dropout=0.1, output_dim=2,
)
ckpt = torch.load(MODEL_PATH, map_location=device)
# The checkpoint may be a dict holding "model_state_dict" or a bare state_dict.
if isinstance(ckpt, dict) and "model_state_dict" in ckpt:
    model.load_state_dict(ckpt["model_state_dict"])
else:
    model.load_state_dict(ckpt)
model.to(device)
model.eval()

# Pre-allocate one prediction slot per df_test row; NaN marks rows that never
# received a prediction.
pred_x = np.full(len(df_test), np.nan, dtype=float)
pred_y = np.full(len(df_test), np.nan, dtype=float)

with torch.no_grad():
    for x_padded, mask_batch, idxs_padded, lengths in tqdm(test_loader, desc="Inferencia test"):
        x_padded = x_padded.to(device)
        mask_batch = mask_batch.to(device)
        preds = model(x_padded, padding_mask=mask_batch)  # (B, T, 2)
        preds = preds.cpu().numpy()
        # Scatter each sequence's predictions back to its rows. The true
        # length is used to drop padded steps, so the mask is not needed on
        # the host (the previous per-batch copy of it was unused).
        for i, (idxs, t) in enumerate(zip(idxs_padded, lengths)):
            seq_preds = preds[i, :t, :]    # (T_i, 2)
            pred_x[idxs] = seq_preds[:, 0]
            pred_y[idxs] = seq_preds[:, 1]

print("Inferencia completa. Predicciones asignadas a filas del test_input.")

# ---------------------------
# 5) Construir DataFrame de predicciones y unir con test_keys (orden requerido)
# ---------------------------
# Per-row predictions keyed by the identifier columns available in df_test.
id_cols = ["game_id", "play_id", "nfl_id"] + (["frame_id"] if "frame_id" in df_test.columns else [])
pred_df = df_test[id_cols + ["_orig_index"]].copy()
pred_df["x_out"] = pred_x
pred_df["y_out"] = pred_y

# Merge keys: the identifier columns present in both test_keys and pred_df.
merge_cols = [c for c in ["game_id","play_id","nfl_id","frame_id"] if c in test_keys.columns and c in pred_df.columns]
if len(merge_cols) == 0:
    # No shared key: fall back to positional alignment.
    # NOTE(review): pred_df follows df_test's current row order (df_test is
    # re-sorted when sequences are built), so this only lines up with
    # test_keys if both share that order — confirm before relying on it.
    print("Advertencia: no hay columnas comunes para merge. Se preservará el orden por fila.")
    submission = test_keys.copy()
    if len(submission) == len(pred_df):
        submission["x_out"] = pred_df["x_out"].values
        submission["y_out"] = pred_df["y_out"].values
    else:
        raise RuntimeError("Imposible alinear predicciones: tamaños distintos y sin columnas en común.")
else:
    # Left merge keeps every test_keys row in its original order.
    submission = test_keys.merge(pred_df.drop(columns=["_orig_index"]), on=merge_cols, how="left", sort=False)
    # Some keys found no prediction: retry on the coarser player key.
    if submission["x_out"].isna().any():
        print("Aviso: existen NaNs en la unión. Reintentando merge por (game_id, play_id, nfl_id) si es posible.")
        base_cols = [c for c in ["game_id","play_id","nfl_id"] if c in test_keys.columns]
        if base_cols and set(base_cols).issubset(set(pred_df.columns)):
            # Collapse to one (mean) prediction per player before merging.
            # Merging the per-frame rows directly is many-to-many: it
            # multiplies submission rows and yields frame_id_x / frame_id_y
            # columns instead of one row per test_keys entry.
            player_preds = pred_df.groupby(base_cols, as_index=False)[["x_out", "y_out"]].mean()
            submission = test_keys.merge(player_preds, on=base_cols, how="left", sort=False)
        else:
            print("No fue posible resolver NaNs automáticamente. Revisa columnas de test.csv y test_input.csv.")

# ---------------------------
# 6) Resultado y guardado
# ---------------------------
# Quick visual check of the result before writing it.
print("submission.shape:", submission.shape)
print("Ejemplo (primeras filas):")
display(submission.head())

# Save the final CSV (without the DataFrame index)
submission.to_csv(OUT_SUB, index=False)
print("Submission guardado en:", OUT_SUB)


# CELDA 10 CORRECCIÓN DEL SUBMISSION 

Esta celda:

✔ Garantiza que TODOS los jugadores del test_keys aparezcan.

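✔ Si falta una predicción real para algún jugador (no debería pasar), la fila conserva las claves de test_keys pero `x_out`/`y_out` quedan en NaN: el código actual no inserta una predicción dummy, así que conviene revisarlo antes de enviar.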

✔ Reconstruye el submission EXCLUSIVAMENTE en base al orden oficial de Kaggle.

✔ NO toca el modelo ni sus predicciones.

In [None]:
# ===============================================================
# CORRECCIÓN DEL SUBMISSION 
# ===============================================================

import pandas as pd
import numpy as np

print("Cargando archivos...")

# Paths for the 2026 competition
BASE = "/kaggle/input/nfl-big-data-bowl-2026-prediction"

test_keys = pd.read_csv(f"{BASE}/test.csv")
test_input = pd.read_csv(f"{BASE}/test_input.csv")  # loaded only to report its shape
subs_raw = pd.read_csv("/kaggle/working/submission.csv")  # file written by the previous cell

print("test_keys:", test_keys.shape)
print("test_input:", test_input.shape)
print("subs_raw:", subs_raw.shape)

print("\nCorrigiendo submission (versión final)...")

# Collapse to a single prediction per player: the mean of x_out / y_out over
# all rows sharing (game_id, play_id, nfl_id).
final_df = subs_raw.groupby(["game_id","play_id","nfl_id"], as_index=False)[["x_out","y_out"]].mean()

print("Predicciones únicas por jugador:", final_df.shape)

# Left-merge onto test_keys so every official row is kept.
# NOTE(review): players without a prediction keep NaN in x_out / y_out — no
# fallback value is filled in here.
submission = test_keys.merge(final_df, on=["game_id","play_id","nfl_id"], how="left")

print("Submission final shape:", submission.shape)
print(submission.head())

# Save the final file
output_path = "/kaggle/working/submission_final.csv"
submission.to_csv(output_path, index=False)

print(f"SUBMISSION FINAL GUARDADO EN: {output_path}")


# CELDA 11 — SEQ2SEQ: Encoder-Decoder Transformer (predicción 20 frames)

- Construye ejemplos de entrenamiento con sliding windows a partir de train_clean.parquet.

- Usa encoder con los frames pasados (hasta ENC_LEN, padded) y decoder que genera 20 frames futuros (horizonte DEC_LEN = 20).

- Emplea teacher forcing durante el entrenamiento.

- Usa máscara causal en el decoder.

- Guarda el mejor checkpoint y tiene inferencia autoregresiva preparada para producir los 20 frames en test (usaremos luego el último frame para el submission).

- Incluye comentarios y recomendaciones sobre parámetros (batch_size / epochs) para que puedas ajustar según la GPU disponible.

In [None]:
# =============================================================
# CELDA 11 — SEQ2SEQ: Encoder-Decoder Transformer (predicción 20 frames)
# =============================================================
# Requerimientos previos:
# - /kaggle/working/train_clean.parquet  (dataset preprocesado)
# - /kaggle/working/encoders/num_scaler.pkl
# - /kaggle/working/encoders/label_encoders.pkl
# - torch >= 1.10 (Transformer API)
#
# Aviso: entrenamiento pesado. Ajusta EPOCHS / BATCH_SIZE si OOM.
# =============================================================

import os
import math
import time
import copy
import pickle
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from sklearn.model_selection import GroupKFold

# ----------------------
# HYPERPARAMETERS (tune as needed)
# ----------------------
ENC_LEN = 20            # maximum encoder length (past frames)
DEC_LEN = 20            # prediction horizon in frames (always 20)
BATCH_SIZE = 32
EPOCHS = 20             # adjust to the available time / GPU
LR = 1e-4
WEIGHT_DECAY = 1e-5
D_MODEL = 192           # internal transformer width
NHEAD = 8
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
DIM_FF = 512
DROPOUT = 0.1
GRAD_CLIP = 1.0
PATIENCE = 5
MODEL_OUT_PATH = "/kaggle/working/seq2seq_transformer_best.pth"

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)

# ----------------------
# FEATURES (same order as in the earlier cells)
# ----------------------
INPUT_FEATURES = [
    "x_in","y_in","s","a",
    "dir_sin","dir_cos",
    "o_sin","o_cos",
    "play_direction",
    "absolute_yardline_number",
    "player_height_inches",
    "player_weight",
    "player_age"
]
TARGET_DIM = 2  # x,y

# ----------------------
# Load scaler/encoders if they exist (optional: missing files leave
# scaler = None and label_encoders = {}).
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this notebook.
# ----------------------
enc_dir = "/kaggle/working/encoders"
scaler = None
label_encoders = {}
if os.path.exists(os.path.join(enc_dir, "num_scaler.pkl")):
    with open(os.path.join(enc_dir, "num_scaler.pkl"), "rb") as f:
        scaler = pickle.load(f)
if os.path.exists(os.path.join(enc_dir, "label_encoders.pkl")):
    with open(os.path.join(enc_dir, "label_encoders.pkl"), "rb") as f:
        label_encoders = pickle.load(f)

# ----------------------
# Load the preprocessed training frame
# ----------------------
TRAIN_PATH = "/kaggle/working/train_clean.parquet"
df = pd.read_parquet(TRAIN_PATH)
print("train rows:", df.shape)

# ----------------------
# Build per-player sequences (group by game/play/nfl)
# We will create sliding-window samples:
# For each player sequence X_seq (T, F) and Y_seq (T,2)
# For each t where (t + DEC_LEN) <= T: 
#    encoder_input = last ENC_LEN frames before frame index t (we use up to ENC_LEN)
#    decoder_target = Y_seq[t : t+DEC_LEN]   (length DEC_LEN)
# ----------------------
# One trajectory per (game, play, player), frames in chronological order.
group_cols = ["game_id","play_id","nfl_id"]
df = df.sort_values(group_cols + ["frame_id"])
print("Grouping by player...")
groups = df.groupby(group_cols, sort=False)

X_list = []      # encoder windows, each (L_enc, F) with 1 <= L_enc <= ENC_LEN
Y_list = []      # decoder targets, each (DEC_LEN, 2)
group_ids = []   # play_id of each sample, used as the GroupKFold group

print("Building sliding-window samples...")
# groups.ngroups gives the progress-bar total without a second full pass over
# the frame (previously a drop_duplicates on the key columns).
for (g, p, n), gdf in tqdm(groups, total=groups.ngroups):
    # features and targets for this player
    X_seq = gdf[INPUT_FEATURES].to_numpy(dtype=np.float32)    # shape (T, F)
    Y_seq = gdf[["x_out","y_out"]].to_numpy(dtype=np.float32)  # shape (T, 2)
    T = X_seq.shape[0]
    # A sample needs one encoder frame plus DEC_LEN future frames, so
    # sequences with T <= DEC_LEN cannot produce any window.
    if T <= DEC_LEN:
        continue
    # `start` is the index of the last encoder frame. Because
    # start <= T - DEC_LEN - 1, the target slice always has exactly DEC_LEN
    # rows, so no length check is required.
    for start in range(T - DEC_LEN):
        # last ENC_LEN frames ending at `start`; shorter windows are padded on
        # the left later by the dataset
        enc_start = max(0, start - ENC_LEN + 1)
        X_list.append(X_seq[enc_start:start + 1])
        Y_list.append(Y_seq[start + 1:start + 1 + DEC_LEN])
        group_ids.append(p)

print("Total training samples:", len(X_list))
if len(X_list) == 0:
    raise RuntimeError("No training samples created; revise DEC_LEN/ENC_LEN and data.")

# ----------------------
# Dataset and collate (pad encoder sequences to ENC_LEN)
# Each sample:
#   enc_in: (ENC_LEN, F) padded left with zeros
#   dec_target: (DEC_LEN, 2)
#   enc_mask: (ENC_LEN) 1=valid, 0=pad
# ----------------------
class Seq2SeqDataset(Dataset):
    """Encoder-window / decoder-target pairs with fixed-length encoder input.

    Item ``idx`` is ``(enc, dec, mask)``:
      * enc: (enc_len, F) float32 tensor; windows shorter than ``enc_len`` are
        zero-padded on the left, longer ones keep their last ``enc_len`` rows.
      * dec: ``Y_list[idx]`` as a tensor.
      * mask: (enc_len,) float32 tensor, 1.0 on real frames, 0.0 on padding.
    """

    def __init__(self, X_list, Y_list, enc_len=ENC_LEN):
        self.X_list, self.Y_list = X_list, Y_list
        self.enc_len = enc_len

    def __len__(self):
        return len(self.X_list)

    def __getitem__(self, idx):
        window, target = self.X_list[idx], self.Y_list[idx]
        n_pad = self.enc_len - window.shape[0]
        if n_pad > 0:
            # short window: real frames go to the right end of a zero buffer
            enc_padded = np.zeros((self.enc_len, window.shape[1]), dtype=np.float32)
            enc_padded[n_pad:, :] = window
            mask = (np.arange(self.enc_len) >= n_pad).astype(np.float32)
        else:
            enc_padded = window[-self.enc_len:, :]
            mask = np.ones(self.enc_len, dtype=np.float32)
        return torch.from_numpy(enc_padded), torch.from_numpy(target), torch.from_numpy(mask)

def collate_seq2seq(batch):
    """Stack ``(enc, dec, mask)`` items along a new leading batch dimension.

    Returns:
        (enc_batch, dec_batch, mask_batch), each the stack of the matching
        item component, e.g. (B, ENC_LEN, F), (B, DEC_LEN, 2), (B, ENC_LEN).
    """
    encs, decs, masks = ([sample[k] for sample in batch] for k in range(3))
    return torch.stack(encs, dim=0), torch.stack(decs, dim=0), torch.stack(masks, dim=0)

# ----------------------
# Train/val split by GroupKFold on group_ids to avoid leakage between plays
# ----------------------
indices = np.arange(len(X_list))
gkf = GroupKFold(n_splits=5)
# Only the first of the five folds is used as the train/validation split.
train_idx, val_idx = next(gkf.split(indices, groups=group_ids))

# Both subsets are views over the same underlying sample lists.
full_ds = Seq2SeqDataset(X_list, Y_list)
train_ds = torch.utils.data.Subset(full_ds, train_idx)
val_ds   = torch.utils.data.Subset(full_ds, val_idx)

loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=collate_seq2seq, num_workers=2, pin_memory=True)
train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
val_loader   = DataLoader(val_ds, shuffle=False, **loader_kwargs)

print("Train batches:", len(train_loader), "Val batches:", len(val_loader))

# ----------------------
# Model: Encoder-Decoder Transformer (PyTorch)
# Encoder: project inputs -> d_model -> TransformerEncoder
# Decoder: input shift tokens (we feed previous ground-truth during training, zeros at t=0),
#          TransformerDecoder -> project to 2-d output per step
# ----------------------
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to a batch-first sequence.

    Args:
        d_model: embedding width (even or odd).
        max_len: number of positions precomputed; inputs passed to ``forward``
            must not be longer than this.
    """

    def __init__(self, d_model, max_len=1000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        # For odd d_model there is one fewer cosine column than sine columns,
        # so the frequency vector is trimmed to fit (no-op for even d_model).
        pe[:, 1::2] = torch.cos(pos * div[: d_model // 2])
        # Buffer: moves with the module across devices but is not a parameter.
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` (B, T, D) plus the encodings of its first T positions."""
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len, :]

class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder Transformer that maps an input feature sequence to a
    sequence of TARGET_DIM-dimensional outputs.

    Encoder: linear projection of the inputs to d_model, positional encoding,
    then a TransformerEncoder. Decoder: linear projection of the decoder inputs
    (shifted targets under teacher forcing), positional encoding, then a
    TransformerDecoder with a causal mask, followed by a linear projection back
    to TARGET_DIM.

    Args:
        input_dim: number of features per encoder timestep.
        d_model: model width.
        nhead: number of attention heads.
        num_encoder_layers: number of stacked encoder layers.
        num_decoder_layers: number of stacked decoder layers.
        dim_feedforward: hidden size of the feed-forward sub-layers.
        dropout: dropout probability used inside the Transformer layers.
        dec_len: decoder horizon; used to size the positional-encoding table.
    """

    def __init__(self, input_dim, d_model=192, nhead=8,
                 num_encoder_layers=3, num_decoder_layers=3,
                 dim_feedforward=512, dropout=0.1, dec_len=DEC_LEN):
        super().__init__()
        self.d_model = d_model
        self.dec_len = dec_len
        # input projection
        self.input_proj = nn.Linear(input_dim, d_model)
        # One table shared by encoder and decoder inputs. It is sized from the
        # dec_len argument (previously the argument was ignored in favour of the
        # global DEC_LEN); with the default dec_len the size is unchanged.
        self.pos_enc = PositionalEncoding(d_model, max_len=ENC_LEN + dec_len + 10)

        # encoder
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead,
                                                   dim_feedforward=dim_feedforward,
                                                   dropout=dropout, batch_first=True)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)

        # decoder: fed with the previous target positions projected to d_model
        self.dec_input_proj = nn.Linear(TARGET_DIM, d_model)

        decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead,
                                                   dim_feedforward=dim_feedforward,
                                                   dropout=dropout, batch_first=True)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)

        # final projection to the output coordinates
        self.output_proj = nn.Linear(d_model, TARGET_DIM)

        self._init_weights()

    def _init_weights(self):
        """Xavier-uniform weights and zero biases for every nn.Linear submodule.

        self.modules() recurses, so Linear layers nested inside the Transformer
        layers are re-initialised as well.
        """
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def make_tgt_mask(self, T, device=None):
        """Return a (T, T) boolean causal mask; True marks positions that must not be attended.

        Args:
            T: decoder sequence length.
            device: device for the mask; defaults to the device of the model parameters.
        """
        if device is None:
            device = next(self.parameters()).device
        # Strict upper triangle: step i may attend to steps <= i only.
        return torch.triu(torch.ones((T, T), device=device), diagonal=1).bool()

    def forward(self, enc_inputs, dec_inputs, enc_mask=None):
        """
        enc_inputs: (B, ENC_LEN, F)
        dec_inputs: (B, DEC_LEN, TARGET_DIM)  -- during training use teacher forcing (shifted targets)
        enc_mask: (B, ENC_LEN) with 1=valid,0=pad
        returns: preds (B, DEC_LEN, TARGET_DIM)
        """
        # project encoder inputs
        enc = self.input_proj(enc_inputs) * math.sqrt(self.d_model)  # (B, ENC_LEN, D)
        enc = self.pos_enc(enc)
        # src_key_padding_mask uses the opposite convention to enc_mask: True = padding (ignored)
        if enc_mask is not None:
            src_key_padding_mask = (enc_mask == 0)  # (B, ENC_LEN) boolean
        else:
            src_key_padding_mask = None

        enc_out = self.encoder(enc, src_key_padding_mask=src_key_padding_mask)  # (B, ENC_LEN, D)

        # project decoder inputs
        dec_in = self.dec_input_proj(dec_inputs) * math.sqrt(self.d_model)
        dec_in = self.pos_enc(dec_in)  # (B, DEC_LEN, D)

        # causal mask built directly on the decoder input's device
        tgt_mask = self.make_tgt_mask(dec_in.size(1), device=dec_in.device)

        # no memory mask; no tgt key padding mask (all decoder steps are treated as valid)
        dec_out = self.decoder(tgt=dec_in,
                               memory=enc_out,
                               tgt_mask=tgt_mask,
                               memory_key_padding_mask=src_key_padding_mask)  # (B, DEC_LEN, D)

        preds = self.output_proj(dec_out)  # (B, DEC_LEN, TARGET_DIM)
        return preds

# ----------------------
# Instantiate model, optimizer, LR scheduler, criterion
# ----------------------
model = Seq2SeqTransformer(input_dim=len(INPUT_FEATURES), d_model=D_MODEL, nhead=NHEAD,
                           num_encoder_layers=NUM_ENCODER_LAYERS, num_decoder_layers=NUM_DECODER_LAYERS,
                           dim_feedforward=DIM_FF, dropout=DROPOUT, dec_len=DEC_LEN)
model = model.to(DEVICE)
optimizer = AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
# Halve the LR after 3 epochs without improvement of the monitored (minimised) metric.
# `verbose=True` is intentionally not passed: the argument is deprecated in recent
# PyTorch releases, and the per-epoch log line of the training loop already prints the LR.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=3)
criterion = nn.L1Loss(reduction="none")  # element-wise L1 (MAE) with no reduction

# ----------------------
# TRAINING LOOP (teacher forcing)
# ----------------------
def _teacher_forcing_inputs(targets):
    """Build decoder inputs from targets: shift right by one step, zeros at step 0.

    targets: (B, DEC_LEN, 2) tensor. The result has the same shape, dtype and
    device; position t (t >= 1) holds targets[:, t-1] and position 0 is an
    explicit zero start token (never derived from the target values, so a
    non-finite target cannot leak into it).
    """
    dec_inputs = torch.zeros_like(targets)
    dec_inputs[:, 1:, :] = targets[:, :-1, :]
    return dec_inputs


best_val = float("inf")
best_wts = copy.deepcopy(model.state_dict())
no_imp = 0  # consecutive epochs without validation improvement
history = {"train_mae": [], "val_mae": []}

print("Starting training loop...")
for epoch in range(1, EPOCHS+1):
    t0 = time.time()
    model.train()
    train_mae_sum = 0.0   # sum of (batch MAE * batch size)
    train_samples = 0     # number of samples seen this epoch

    pbar = tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS} train", leave=False)
    for enc_batch, dec_target_batch, enc_mask in pbar:
        # enc_batch: (B, ENC_LEN, F)
        # dec_target_batch: (B, DEC_LEN, 2)
        enc_batch = enc_batch.to(DEVICE)
        dec_target_batch = dec_target_batch.to(DEVICE)
        enc_mask = enc_mask.to(DEVICE)

        dec_inputs = _teacher_forcing_inputs(dec_target_batch)

        optimizer.zero_grad()
        preds = model(enc_batch, dec_inputs, enc_mask=enc_mask)  # (B, DEC_LEN, 2)

        # MAE averaged over batch/time/coords; every decoder step is treated as a valid target
        loss = torch.abs(preds - dec_target_batch).mean()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()

        # stats: weight each batch by its size so a smaller last batch is averaged correctly
        batch_mae = loss.item()
        batch_size = enc_batch.size(0)
        train_mae_sum += batch_mae * batch_size
        train_samples += batch_size

        pbar.set_postfix({"loss": f"{batch_mae:.4f}", "mae": f"{batch_mae:.4f}"})

    train_epoch_mae = train_mae_sum / (train_samples + 1e-12)

    # Validation (teacher forced as well, for a stable metric; this is not the
    # error of an autoregressive rollout)
    model.eval()
    val_mae_sum = 0.0
    val_samples = 0
    with torch.no_grad():
        pbar_val = tqdm(val_loader, desc=f"Epoch {epoch}/{EPOCHS} val", leave=False)
        for enc_batch, dec_target_batch, enc_mask in pbar_val:
            enc_batch = enc_batch.to(DEVICE)
            dec_target_batch = dec_target_batch.to(DEVICE)
            enc_mask = enc_mask.to(DEVICE)

            dec_inputs = _teacher_forcing_inputs(dec_target_batch)

            preds = model(enc_batch, dec_inputs, enc_mask=enc_mask)
            val_mae = torch.abs(preds - dec_target_batch).mean().item()
            batch_size = enc_batch.size(0)
            val_mae_sum += val_mae * batch_size
            val_samples += batch_size
            pbar_val.set_postfix({"val_mae": f"{val_mae:.4f}"})

    val_epoch_mae = val_mae_sum / (val_samples + 1e-12)
    history["train_mae"].append(train_epoch_mae)
    history["val_mae"].append(val_epoch_mae)

    scheduler.step(val_epoch_mae)

    print(f"Epoch {epoch} | time {(time.time()-t0):.1f}s | train_mae {train_epoch_mae:.4f} | val_mae {val_epoch_mae:.4f} | lr {optimizer.param_groups[0]['lr']:.2e}")

    # checkpoint: an epoch counts as an improvement only if it beats the best by more than 1e-5
    if val_epoch_mae < best_val - 1e-5:
        best_val = val_epoch_mae
        best_wts = copy.deepcopy(model.state_dict())
        torch.save({"model_state_dict": model.state_dict(), "optimizer_state_dict": optimizer.state_dict()}, MODEL_OUT_PATH)
        print(" -> New best model saved:", MODEL_OUT_PATH)
        no_imp = 0
    else:
        no_imp += 1
        print(f" -> No improvement {no_imp}/{PATIENCE}")

    if no_imp >= PATIENCE:
        print("Early stopping triggered.")
        break

# after training: restore the best weights seen on validation and save them on their own
model.load_state_dict(best_wts)
torch.save(model.state_dict(), "/kaggle/working/seq2seq_final_weights.pth")
print("Training finished. Best val mae:", best_val)

# ----------------------
# INFERENCE FUNCTION (autoregressive)
# Given encoder input (B, ENC_LEN, F), decode DEC_LEN steps autoregressively,
# seeding first step with zeros (or could seed with last observed pos if available)
# ----------------------
def autoregressive_predict(model, enc_batch, enc_mask, device=DEVICE):
    """Decode DEC_LEN steps autoregressively and return the predictions.

    The decoder input buffer follows the teacher-forcing layout used during
    training: slot 0 is an all-zero start token and slot t (t >= 1) holds the
    prediction made for step t-1. (Previously the predictions were written
    starting at slot 0, overwriting the start token and shifting every input
    one step early relative to training.)

    The model is called once per step on the full-length buffer. Slots after t
    are still zero when step t is read, which is harmless as long as the model
    applies a causal decoder mask, as Seq2SeqTransformer.forward does.

    Args:
        model: callable as model(enc_batch, dec_inputs, enc_mask=enc_mask),
            returning (B, DEC_LEN, TARGET_DIM). It is switched to eval mode.
        enc_batch: encoder inputs, (B, ENC_LEN, F).
        enc_mask: encoder validity mask, (B, ENC_LEN).
        device: device on which decoding runs.

    Returns:
        numpy array of shape (B, DEC_LEN, TARGET_DIM).
    """
    model.eval()
    enc_batch = enc_batch.to(device)
    enc_mask = enc_mask.to(device)
    B = enc_batch.size(0)
    # Start with zeros everywhere; slot 0 stays zero (start token).
    dec_inputs = torch.zeros((B, DEC_LEN, TARGET_DIM), device=device)
    outputs = []
    with torch.no_grad():
        for t in range(DEC_LEN):
            preds = model(enc_batch, dec_inputs, enc_mask=enc_mask)  # (B, DEC_LEN, TARGET_DIM)
            step_pred = preds[:, t:t+1, :]  # (B, 1, TARGET_DIM): prediction for step t
            outputs.append(step_pred)
            # Feed the prediction back as the decoder input of the NEXT step (shift right).
            if t + 1 < DEC_LEN:
                dec_inputs[:, t+1:t+2, :] = step_pred

        out = torch.cat(outputs, dim=1)  # (B, DEC_LEN, TARGET_DIM)
    return out.cpu().numpy()

# ----------------------
# Smoke test: autoregressive inference on the first validation batch
# ----------------------
print("Running example inference on a validation batch...")
first_val_batch = next(iter(val_loader))
enc_batch, dec_target_batch, enc_mask = first_val_batch
preds_example = autoregressive_predict(model, enc_batch, enc_mask, DEVICE)
print("Example preds shape:", preds_example.shape)  # (B, DEC_LEN, 2)

# ----------------------
# Persist the per-epoch MAE history so it can be plotted later
# ----------------------
with open("/kaggle/working/seq2seq_history.pkl", "wb") as history_file:
    pickle.dump(history, history_file)

print("CELDA 12 finished. Model saved to:", MODEL_OUT_PATH)
print("Weights saved to: /kaggle/working/seq2seq_final_weights.pth")


Construimos muchos ejemplos por sliding window para que el modelo aprenda diferentes fases del pase (inicio, medio, fin).

El encoder recibe hasta ENC_LEN frames pasados (left-padded) — esto da contexto dinámico.

El decoder genera 20 frames de salida mediante un TransformerDecoder con máscara causal.

Durante entrenamiento usamos teacher forcing (alimentamos el decoder con los targets desplazados) para mayor estabilidad.

En inferencia usamos un bucle autorregresivo sobre DEC_LEN pasos y retornamos (B,20,2). Para la submission se usará el último paso (index 19) por jugador.

Loss usada: MAE (L1), media sobre batch/time/coords; puedes cambiar a MSE o SmoothL1 según prefieras.

In [None]:
# =============================================================
# CELDA 13 — Gráficas del entrenamiento y predicciones
# =============================================================
import matplotlib.pyplot as plt
import numpy as np
import pickle

# -------------------------------------------------------------
# 1) Load the training history
# -------------------------------------------------------------
# NOTE: pickle.load executes arbitrary code from the file; only load the
# history file produced by this notebook, never an untrusted pickle.
with open("/kaggle/working/seq2seq_history.pkl", "rb") as f:
    history = pickle.load(f)

train_mae = history["train_mae"]
val_mae = history["val_mae"]

# -------------------------------------------------------------
# 2) MAE curve (train vs val)
# -------------------------------------------------------------
fig, ax = plt.subplots(figsize=(8,5))
ax.plot(train_mae, label="Train MAE", linewidth=2)
ax.plot(val_mae, label="Val MAE", linewidth=2)
ax.set_xlabel("Epoch")
ax.set_ylabel("MAE")
ax.set_title("Evolución del error durante entrenamiento")
ax.grid(True)
ax.legend()
plt.show()

# -------------------------------------------------------------
# 3) Plot one example: true vs predicted trajectory
# -------------------------------------------------------------
# Uses dec_target_batch and preds_example left in the kernel by the example
# inference step, so that step must have been run first.
B = 0  # first example of the batch
true_xy = dec_target_batch[B].cpu().numpy()     # (20,2)
pred_xy = preds_example[B]                      # (20,2)

fig, ax = plt.subplots(figsize=(6,6))
ax.plot(true_xy[:,0], true_xy[:,1], 'o-', label="Real", linewidth=2)
ax.plot(pred_xy[:,0], pred_xy[:,1], 's--', label="Predicho", linewidth=2)
ax.set_xlabel("X")
ax.set_ylabel("Y")
ax.set_title("Trayectoria real vs predicha (20 frames)")
ax.grid(True)
ax.legend()
ax.axis("equal")  # same scale on both axes so the path is not distorted
plt.show()

# -------------------------------------------------------------
# 4) Error per timestep
# -------------------------------------------------------------
errors = np.linalg.norm(true_xy - pred_xy, axis=1)  # Euclidean error per frame
# 1-based x values so the ticks agree with the "(1-20)" axis label
# (plotting errors alone would number the frames 0-19).
timesteps = np.arange(1, len(errors) + 1)

fig, ax = plt.subplots(figsize=(8,4))
ax.plot(timesteps, errors, marker="o")
ax.set_xlabel("Timestep (1-20)")
ax.set_ylabel("Error")
ax.set_title("Error por frame del horizonte de predicción")
ax.grid(True)
plt.show()

print("Gráficas generadas correctamente.")
