In [None]:
import kagglehub
kagglehub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

In [None]:
nfl_big_data_bowl_2026_prediction_path = kagglehub.competition_download('nfl-big-data-bowl-2026-prediction')
print('Data source import complete.')

Data source import complete.


In [None]:
import pandas as pd
import numpy as np
import warnings
import os
import gc
import pickle
import glob
import random
from tqdm import tqdm
from scipy import stats
import json

# PyTorch imports
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# scikit-learn imports (for data processing only)
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split

# TensorFlow imports
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, optimizers, callbacks

warnings.filterwarnings('ignore')

# Seed for reproducibility
SEED = 42

def set_seed(seed=42):
    """Fix all random seeds for reproducibility"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    tf.random.set_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    print(f"Random seed set to {seed}")

set_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Random seed set to 42
Using device: cuda


In [None]:
BASE_DIR = nfl_big_data_bowl_2026_prediction_path
TRAIN_DIR = os.path.join(BASE_DIR, "train")

# Load training data
input_files = sorted(glob.glob(os.path.join(TRAIN_DIR, 'input_*.csv')))
output_files = sorted(glob.glob(os.path.join(TRAIN_DIR, 'output_*.csv')))

print(f"Found {len(input_files)} input files and {len(output_files)} output files")

train_input_list = []
train_output_list = []

for i, (inp_file, out_file) in enumerate(zip(input_files, output_files)):
    print(f"Loading week {i+1}/{len(input_files)}...")
    inp_df = pd.read_csv(inp_file)
    out_df = pd.read_csv(out_file)
    train_input_list.append(inp_df)
    train_output_list.append(out_df)

train_input = pd.concat(train_input_list, ignore_index=True)
train_output = pd.concat(train_output_list, ignore_index=True)

print(f"Train input shape: {train_input.shape}")
print(f"Train output shape: {train_output.shape}")

test_input = pd.read_csv(os.path.join(BASE_DIR, 'test_input.csv'))
test = pd.read_csv(os.path.join(BASE_DIR, 'test.csv'))

print(f"Test input shape: {test_input.shape}")
print(f"Test shape: {test.shape}")

Found 18 input files and 18 output files
Loading week 1/18...
Loading week 2/18...
Loading week 3/18...
Loading week 4/18...
Loading week 5/18...
Loading week 6/18...
Loading week 7/18...
Loading week 8/18...
Loading week 9/18...
Loading week 10/18...
Loading week 11/18...
Loading week 12/18...
Loading week 13/18...
Loading week 14/18...
Loading week 15/18...
Loading week 16/18...
Loading week 17/18...
Loading week 18/18...
Train input shape: (4880579, 23)
Train output shape: (562936, 6)
Test input shape: (49753, 23)
Test shape: (5837, 5)


In [None]:
# Feature engineering functions (same as original)
def add_enhanced_features(df):
    """Enhanced feature engineering with opponent interaction features"""
    df = df.copy()
    print("Adding enhanced features...")

    # Mirror transform - normalize to right direction
    left_mask = df["play_direction"] == "left"
    df.loc[left_mask, "x"] = 120 - df.loc[left_mask, "x"]
    df.loc[left_mask, "y"] = 53.3 - df.loc[left_mask, "y"]
    df.loc[left_mask, "ball_land_x"] = 120 - df.loc[left_mask, "ball_land_x"]
    df.loc[left_mask, "ball_land_y"] = 53.3 - df.loc[left_mask, "ball_land_y"]

    # Direction encoding (circular features)
    df["dir_rad"] = np.deg2rad(df["dir"]) % (2 * np.pi)
    df["dir_sin"] = np.sin(df["dir_rad"])
    df["dir_cos"] = np.cos(df["dir_rad"])
    df["o_rad"] = np.deg2rad(df["o"]) % (2 * np.pi)
    df["o_sin"] = np.sin(df["o_rad"])
    df["o_cos"] = np.cos(df["o_rad"])

    # Velocity components
    df["vx"] = df["s"] * np.cos(df["dir_rad"])
    df["vy"] = df["s"] * np.sin(df["dir_rad"])

    # Acceleration components
    df["ax"] = df["a"] * np.cos(df["dir_rad"])
    df["ay"] = df["a"] * np.sin(df["dir_rad"])

    # Ball landing features
    df["dx_to_land"] = df["ball_land_x"] - df["x"]
    df["dy_to_land"] = df["ball_land_y"] - df["y"]
    df["dist_to_land"] = np.sqrt(df["dx_to_land"]**2 + df["dy_to_land"]**2)

    # Position encoding
    position_map = {'QB':0, 'RB':1, 'WR':2, 'TE':3, 'FB':4, 'OL':5, 'DL':6, 'LB':7, 'DB':8, 'S':9}
    df["pos_enc"] = df["player_position"].map(position_map).fillna(10)

    # Target receiver flag
    if "player_to_predict" in df.columns:
        df["is_target"] = df["player_to_predict"]
    else:
        df["is_target"] = 0

    # Sort for temporal and spatial features
    df = df.sort_values(['game_id', 'play_id', 'nfl_id', 'frame_id'])
    group_cols = ['game_id', 'play_id', 'nfl_id']

    # Lag features (1 only)
    df['x_lag1'] = df.groupby(group_cols)['x'].shift(1)
    df['y_lag1'] = df.groupby(group_cols)['y'].shift(1)
    df['s_lag1'] = df.groupby(group_cols)['s'].shift(1)

    # Speed change
    df['speed_change'] = df.groupby(group_cols)['s'].diff()

    # Distance to sidelines
    df['dist_from_sideline'] = np.minimum(df['y'], 53.3 - df['y'])

    # Opponent/Teammate interaction features
    print("Computing player interactions...")
    df = add_opponent_features_fast(df)

    # Fill NaN values
    df = df.fillna(0)

    print("Enhanced feature engineering complete!")
    return df

def add_opponent_features_fast(df):
    """Add opponent features - optimized version (last frame only)"""
    last_frames = (df.sort_values(['game_id', 'play_id', 'nfl_id', 'frame_id'])
                     .groupby(['game_id', 'play_id', 'nfl_id'])
                     .tail(1)
                     .copy())

    opponent_data = []

    for (gid, pid), play_group in tqdm(last_frames.groupby(['game_id', 'play_id']),
                                       desc="Player interactions"):
        if len(play_group) < 2:
            for idx in play_group.index:
                opponent_data.append({
                    'game_id': gid,
                    'play_id': pid,
                    'nfl_id': play_group.loc[idx, 'nfl_id'],
                    'nearest_teammate_dist': 50.0,
                    'nearest_opponent_dist': 50.0,
                    'num_nearby_3yd': 0
                })
            continue

        positions = play_group[['x', 'y']].values
        sides = play_group['player_side'].values
        nfl_ids = play_group['nfl_id'].values

        for i in range(len(play_group)):
            other_mask = np.arange(len(positions)) != i
            other_positions = positions[other_mask]
            other_sides = sides[other_mask]

            distances = np.sqrt(((positions[i] - other_positions) ** 2).sum(axis=1))

            teammate_mask = other_sides == sides[i]
            nearest_teammate = distances[teammate_mask].min() if teammate_mask.any() else 50.0

            opponent_mask = other_sides != sides[i]
            nearest_opponent = distances[opponent_mask].min() if opponent_mask.any() else 50.0

            num_nearby = (distances < 3.0).sum()

            opponent_data.append({
                'game_id': gid,
                'play_id': pid,
                'nfl_id': nfl_ids[i],
                'nearest_teammate_dist': nearest_teammate,
                'nearest_opponent_dist': nearest_opponent,
                'num_nearby_3yd': int(num_nearby)
            })

    opponent_df = pd.DataFrame(opponent_data)
    df = df.merge(opponent_df, on=['game_id', 'play_id', 'nfl_id'], how='left')

    df['nearest_teammate_dist'] = df['nearest_teammate_dist'].fillna(50.0)
    df['nearest_opponent_dist'] = df['nearest_opponent_dist'].fillna(50.0)
    df['num_nearby_3yd'] = df['num_nearby_3yd'].fillna(0)

    return df

In [None]:
# Define features - 26 features
FEATURES = [
    'x', 'y', 'vx', 'vy', 's', 'a', 'ax', 'ay',
    'dir_sin', 'dir_cos', 'o_sin', 'o_cos',
    'dx_to_land', 'dy_to_land', 'dist_to_land',
    'pos_enc', 'is_target', 'absolute_yardline_number',
    'x_lag1', 'y_lag1', 's_lag1',
    'speed_change',
    'dist_from_sideline',
    'nearest_teammate_dist', 'nearest_opponent_dist', 'num_nearby_3yd'
]

print(f"Total features: {len(FEATURES)}")

def prepare_data_efficient(input_df, output_df, max_inp=60, max_out=40):
    """Prepare forecasting data: Input -> Output"""
    print("Preparing forecasting data...")

    input_plays = input_df.groupby(['game_id', 'play_id'])
    output_plays = output_df.groupby(['game_id', 'play_id'])

    inp_seqs, out_seqs, inp_masks, out_masks = [], [], [], []

    for (gid, pid), inp_group in tqdm(input_plays, desc="Processing"):
        try:
            out_group = output_plays.get_group((gid, pid))
        except KeyError:
            continue

        for nid in inp_group['nfl_id'].unique():
            inp_player = inp_group[inp_group['nfl_id'] == nid].sort_values('frame_id')
            out_player = out_group[out_group['nfl_id'] == nid].sort_values('frame_id')

            if len(out_player) == 0:
                continue

            inp_feat = inp_player[FEATURES].values
            out_pos = out_player[['x', 'y']].values

            # Pad/truncate input
            if len(inp_feat) > max_inp:
                inp_feat = inp_feat[-max_inp:]
                inp_mask = np.ones(max_inp)
            else:
                pad = max_inp - len(inp_feat)
                inp_feat = np.vstack([inp_feat, np.zeros((pad, len(FEATURES)))])
                inp_mask = np.concatenate([np.ones(len(inp_player)), np.zeros(pad)])

            # Pad/truncate output
            if len(out_pos) > max_out:
                out_pos = out_pos[:max_out]
                out_mask = np.ones(max_out)
            else:
                pad = max_out - len(out_pos)
                out_pos = np.vstack([out_pos, np.zeros((pad, 2))])
                out_mask = np.concatenate([np.ones(len(out_player)), np.zeros(pad)])

            inp_seqs.append(inp_feat)
            out_seqs.append(out_pos)
            inp_masks.append(inp_mask)
            out_masks.append(out_mask)

    print(f"Created {len(inp_seqs)} sequences")
    return (np.array(inp_seqs, dtype='float32'),
            np.array(out_seqs, dtype='float32'),
            np.array(inp_masks, dtype='float32'),
            np.array(out_masks, dtype='float32'))

Total features: 26


In [None]:
# =========================================================
# PYTORCH MODEL (Original)
# =========================================================
class LightweightForecaster(nn.Module):
    """Encoder-Decoder for position forecasting"""
    def __init__(self, input_dim=26, hidden_dim=256, max_out=40):
        super().__init__()
        self.max_out = max_out

        # Encoder
        self.enc_embed = nn.Linear(input_dim, hidden_dim)
        self.enc_gru = nn.GRU(hidden_dim, hidden_dim, num_layers=2, batch_first=True, dropout=0.0)

        # Decoder
        self.dec_embed = nn.Linear(2, hidden_dim)
        self.dec_gru = nn.GRU(hidden_dim, hidden_dim, num_layers=2, batch_first=True, dropout=0.0)

        # Output
        self.out_proj = nn.Linear(hidden_dim, 2)

    def forward(self, x, target=None):
        B = x.size(0)

        # Encode
        enc = F.relu(self.enc_embed(x))
        _, h = self.enc_gru(enc)

        if self.training and target is not None:
            # Teacher forcing
            dec_inp = torch.cat([torch.zeros(B, 1, 2, device=x.device), target[:, :-1]], dim=1)
            dec = F.relu(self.dec_embed(dec_inp))
            dec_out, _ = self.dec_gru(dec, h)
            return self.out_proj(dec_out)
        else:
            # Autoregressive
            outputs = []
            prev = torch.zeros(B, 1, 2, device=x.device)
            h_dec = h

            for _ in range(self.max_out):
                dec = F.relu(self.dec_embed(prev))
                dec_out, h_dec = self.dec_gru(dec, h_dec)
                pos = self.out_proj(dec_out)
                outputs.append(pos)
                prev = pos

            return torch.cat(outputs, dim=1)

class SimpleDataset(Dataset):
    def __init__(self, inp, out, inp_mask, out_mask):
        self.inp = torch.from_numpy(inp)
        self.out = torch.from_numpy(out)
        self.inp_mask = torch.from_numpy(inp_mask)
        self.out_mask = torch.from_numpy(out_mask)

    def __len__(self):
        return len(self.inp)

    def __getitem__(self, i):
        return self.inp[i], self.out[i], self.inp_mask[i], self.out_mask[i]

def train_pytorch_model(model, train_loader, val_loader, epochs=20):
    criterion = nn.MSELoss(reduction='none')
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=1e-4,
                                   betas=(0.9, 0.999), eps=1e-8)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    best_rmse = float('inf')
    patience = 8
    counter = 0

    for epoch in range(epochs):
        # Train
        model.train()
        train_loss = 0

        for inp, out, _, out_mask in tqdm(train_loader, desc=f"PyTorch Epoch {epoch+1}/{epochs}"):
            inp, out, out_mask = inp.to(device), out.to(device), out_mask.to(device)

            optimizer.zero_grad()
            pred = model(inp, target=out)

            loss = criterion(pred, out).sum(-1) * out_mask
            loss = loss.sum() / (out_mask.sum() + 1e-6)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            train_loss += loss.item() * len(inp)

        train_loss /= len(train_loader.dataset)

        # Validation
        model.eval()
        val_preds, val_tgts, val_masks = [], [], []

        with torch.no_grad():
            for inp, out, _, out_mask in val_loader:
                inp = inp.to(device)
                pred = model(inp)
                val_preds.append(pred.cpu())
                val_tgts.append(out)
                val_masks.append(out_mask)

        val_preds = torch.cat(val_preds)
        val_tgts = torch.cat(val_tgts)
        val_masks = torch.cat(val_masks)

        mask_exp = val_masks.unsqueeze(-1).expand_as(val_preds) > 0
        rmse = torch.sqrt(((val_preds[mask_exp] - val_tgts[mask_exp])**2).mean()).item()

        print(f"Epoch {epoch+1:02d} | Loss: {train_loss:.4f} | RMSE: {rmse:.4f}")

        scheduler.step()

        if rmse < best_rmse:
            best_rmse = rmse
            counter = 0
            torch.save(model.state_dict(), "best_model_pytorch.pt")
            print(f"Saved! Best: {rmse:.4f}")
        else:
            counter += 1
            if counter >= patience:
                print("Early stopping")
                break

    return best_rmse

In [None]:
# =========================================================
# TENSORFLOW MODEL (Matching PyTorch structure)
# =========================================================
class LightweightForecasterTF(keras.Model):
    """TensorFlow model matching PyTorch LightweightForecaster exactly"""
    def __init__(self, input_dim=26, hidden_dim=256, max_out=40):
        super().__init__()
        self.max_out = max_out
        self.hidden_dim = hidden_dim

        # Encoder
        self.enc_embed = layers.Dense(hidden_dim, activation='relu', name='enc_embed')
        # 2-layer stacked GRU using GRUCell (matching PyTorch num_layers=2)
        self.enc_gru_cell1 = layers.GRUCell(hidden_dim, dropout=0.0, recurrent_dropout=0.0, name='enc_gru_cell1')
        self.enc_gru_cell2 = layers.GRUCell(hidden_dim, dropout=0.0, recurrent_dropout=0.0, name='enc_gru_cell2')
        self.enc_gru = layers.RNN(layers.StackedRNNCells([self.enc_gru_cell1, self.enc_gru_cell2]),
                                   return_sequences=False, return_state=True, name='enc_gru')

        # Decoder
        self.dec_embed = layers.Dense(hidden_dim, activation='relu', name='dec_embed')
        # 2-layer stacked GRU for decoder
        self.dec_gru_cell1 = layers.GRUCell(hidden_dim, dropout=0.0, recurrent_dropout=0.0, name='dec_gru_cell1')
        self.dec_gru_cell2 = layers.GRUCell(hidden_dim, dropout=0.0, recurrent_dropout=0.0, name='dec_gru_cell2')
        self.dec_gru = layers.RNN(layers.StackedRNNCells([self.dec_gru_cell1, self.dec_gru_cell2]),
                                   return_sequences=True, return_state=True, name='dec_gru')

        # Output
        self.out_proj = layers.Dense(2, name='out_proj')

    def call(self, inputs, training=None):
        """Forward pass - supports both training (teacher forcing) and inference (autoregressive)"""
        if isinstance(inputs, (list, tuple)) and len(inputs) == 2:
            # Training mode: [encoder_input, decoder_targets]
            encoder_input, decoder_targets = inputs
            return self._forward_teacher_forcing(encoder_input, decoder_targets, training)
        else:
            # Inference mode: encoder_input only
            encoder_input = inputs
            return self._forward_autoregressive(encoder_input, training)

    def _forward_teacher_forcing(self, encoder_input, decoder_targets, training):
        """Teacher forcing for training"""
        # Encode - StackedRNNCells returns 2 states (one per layer)
        enc = self.enc_embed(encoder_input)
        _, h1_enc, h2_enc = self.enc_gru(enc, training=training)
        init_states_dec = [h1_enc, h2_enc]

        # Shift decoder targets for teacher forcing (prepend zeros)
        batch_size = tf.shape(decoder_targets)[0]
        dtype = decoder_targets.dtype
        shifted_targets = tf.concat([
            tf.zeros((batch_size, 1, 2), dtype=dtype),
            decoder_targets[:, :-1, :]
        ], axis=1)

        # Decode with teacher forcing - StackedRNNCells returns 2 states
        dec = self.dec_embed(shifted_targets)
        dec_out, _, _ = self.dec_gru(dec, initial_state=init_states_dec, training=training)
        output = self.out_proj(dec_out)

        return output

    def _forward_autoregressive(self, encoder_input, training):
        """Autoregressive inference"""
        # Encode - StackedRNNCells returns 2 states (one per layer)
        enc = self.enc_embed(encoder_input)
        _, h1_dec, h2_dec = self.enc_gru(enc, training=training)

        # Autoregressive decoding
        batch_size = tf.shape(encoder_input)[0]
        dtype = encoder_input.dtype
        prev_pos = tf.zeros((batch_size, 1, 2), dtype=dtype)
        outputs = []

        for _ in range(self.max_out):
            dec = self.dec_embed(prev_pos)
            # StackedRNNCells returns 2 states, need to pass list and unpack list
            dec_out, h1_dec, h2_dec = self.dec_gru(dec, initial_state=[h1_dec, h2_dec], training=training)
            pos = self.out_proj(dec_out)
            outputs.append(pos)
            prev_pos = pos

        return tf.concat(outputs, axis=1)

    def predict_autoregressive(self, encoder_input):
        """Explicit autoregressive prediction method"""
        result = self._forward_autoregressive(encoder_input, training=False)
        # Convert to numpy for consistency
        if isinstance(result, tf.Tensor):
            return result.numpy()
        return result

def create_tf_model_simple(input_dim=26, hidden_dim=256, max_out=40):
    """Create TensorFlow model matching PyTorch structure exactly"""
    model = LightweightForecasterTF(input_dim=input_dim, hidden_dim=hidden_dim, max_out=max_out)
    return model

def mse_per_timestep(y_true, y_pred):
    """MSE per timestep - returns (batch, seq_len) for sample_weight to apply mask"""
    # (batch, seq_len, 2) -> (batch, seq_len)
    return tf.reduce_sum(tf.square(y_true - y_pred), axis=-1)

class MaskedMSEMetric(tf.keras.metrics.Metric):
    """Masked MSE metric for validation"""
    def __init__(self, name='masked_mse', **kwargs):
        super().__init__(name=name, **kwargs)
        self.total_mse = self.add_weight(name='total_mse', initializer='zeros')
        self.total_count = self.add_weight(name='total_count', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        mse = tf.reduce_sum(tf.square(y_true - y_pred), axis=-1)
        if sample_weight is not None:
            mse = mse * sample_weight
            self.total_mse.assign_add(tf.reduce_sum(mse))
            self.total_count.assign_add(tf.reduce_sum(sample_weight))
        else:
            self.total_mse.assign_add(tf.reduce_sum(mse))
            self.total_count.assign_add(tf.cast(tf.size(mse), tf.float32))

    def result(self):
        return self.total_mse / (self.total_count + 1e-6)

    def reset_state(self):
        self.total_mse.assign(0.0)
        self.total_count.assign(0.0)

def train_tensorflow_model(model, X_train, y_train, X_val, y_val,
                          mask_train, mask_val, epochs=20):
    """Train TensorFlow model with masked loss"""
    print("Training TensorFlow model...")

    # Compile with per-timestep loss (sample_weight will apply mask)
    model.compile(
        optimizer=optimizers.AdamW(learning_rate=5e-4, weight_decay=1e-4,
                                    beta_1=0.9, beta_2=0.999, epsilon=1e-8),
        loss=mse_per_timestep,
        metrics=[MaskedMSEMetric()]
    )

    # Cosine annealing schedule matching PyTorch (T_max=epochs, alpha=0.0)
    def cosine_annealing_schedule(epoch, lr):
        """Cosine annealing matching PyTorch CosineAnnealingLR"""
        T_max = epochs
        return 5e-4 * (1 + np.cos(np.pi * epoch / T_max)) / 2

    # Callbacks
    callbacks_list = [
        callbacks.EarlyStopping(monitor='val_masked_mse', mode='min', patience=8, restore_best_weights=True),
        callbacks.LearningRateScheduler(cosine_annealing_schedule, verbose=0),
        callbacks.ModelCheckpoint('best_model_tensorflow.weights.h5',
                                 save_best_only=True,
                                 monitor='val_masked_mse',
                                 mode='min',
                                 save_weights_only=True)
    ]

    # Train with sample weights (masks)
    history = model.fit(
        [X_train, y_train], y_train,
        sample_weight=mask_train,
        validation_data=([X_val, y_val], y_val, mask_val),
        epochs=epochs,
        batch_size=64,
        callbacks=callbacks_list,
        verbose=1
    )

    # Evaluate with autoregressive inference and mask (matching PyTorch)
    val_pred = model.predict_autoregressive(X_val)

    # Apply mask for RMSE calculation (matching PyTorch exactly)
    # Expand mask from (batch, seq) to (batch, seq, 2) to match predictions
    mask_expanded = np.expand_dims(mask_val, axis=-1)  # (batch, seq, 1)
    mask_expanded = np.broadcast_to(mask_expanded, val_pred.shape)  # (batch, seq, 2)
    mask_bool = mask_expanded > 0
    val_pred_masked = val_pred[mask_bool]
    y_val_masked = y_val[mask_bool]

    rmse = np.sqrt(np.mean((val_pred_masked - y_val_masked)**2))
    print(f"TensorFlow Validation RMSE (masked): {rmse:.4f}")

    return model, rmse, history

In [None]:
# =========================================================
# MAIN TRAINING PIPELINE
# =========================================================
print("="*80)
print("TRAINING PIPELINE")
print("="*80)

# Feature engineering
print("Feature Engineering...")
train_input_enh = add_enhanced_features(train_input)
test_input_enh = add_enhanced_features(test_input)

# Train/Val split
print("Train/Val Split...")
play_keys = train_input[["game_id","play_id"]].drop_duplicates()
train_plays, val_plays = train_test_split(play_keys, test_size=0.2, random_state=SEED)

train_idx = train_input_enh.set_index(["game_id","play_id"]).index.isin(
    train_plays.set_index(["game_id","play_id"]).index
)
val_idx = train_input_enh.set_index(["game_id","play_id"]).index.isin(
    val_plays.set_index(["game_id","play_id"]).index
)

print(f"Train samples: {train_idx.sum()}, Val samples: {val_idx.sum()}")

# Prepare data
print("Preparing Data...")
inp_train, out_train, mask_inp_train, mask_out_train = prepare_data_efficient(
    train_input_enh.loc[train_idx],
    train_output[train_output.set_index(['game_id','play_id']).index.isin(
        train_input_enh.loc[train_idx].set_index(['game_id','play_id']).index
    )],
    max_inp=60, max_out=40
)
gc.collect()

inp_val, out_val, mask_inp_val, mask_out_val = prepare_data_efficient(
    train_input_enh.loc[val_idx],
    train_output[train_output.set_index(['game_id','play_id']).index.isin(
        train_input_enh.loc[val_idx].set_index(['game_id','play_id']).index
    )],
    max_inp=60, max_out=40
)
gc.collect()

print(f"Train shape: {inp_train.shape}, Val shape: {inp_val.shape}")

# Scaling
print("Scaling...")
scaler = RobustScaler()
inp_train_flat = inp_train.reshape(-1, len(FEATURES))
scaler.fit(inp_train_flat)
inp_train_scaled = scaler.transform(inp_train_flat).reshape(inp_train.shape)
inp_val_scaled = scaler.transform(inp_val.reshape(-1, len(FEATURES))).reshape(inp_val.shape)
del inp_train_flat
gc.collect()

# Save scaler
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

TRAINING PIPELINE
Feature Engineering...
Adding enhanced features...
Computing player interactions...


Player interactions: 100%|██████████| 14108/14108 [00:14<00:00, 1001.93it/s]


Enhanced feature engineering complete!
Adding enhanced features...
Computing player interactions...


Player interactions: 100%|██████████| 143/143 [00:00<00:00, 1048.95it/s]


Enhanced feature engineering complete!
Train/Val Split...
Train samples: 3904237, Val samples: 976342
Preparing Data...
Preparing forecasting data...


Processing: 100%|██████████| 11286/11286 [03:07<00:00, 60.13it/s]


Created 36775 sequences
Preparing forecasting data...


Processing: 100%|██████████| 2822/2822 [00:46<00:00, 60.31it/s]


Created 9270 sequences
Train shape: (36775, 60, 26), Val shape: (9270, 60, 26)
Scaling...


In [None]:
# =========================================================
# TRAIN MODELS
# =========================================================
results = {}

# 1. PyTorch Model
print("\n" + "="*80)
print("TRAINING PYTORCH MODEL")
print("="*80)
train_loader = DataLoader(
    SimpleDataset(inp_train_scaled, out_train, mask_inp_train, mask_out_train),
    batch_size=64, shuffle=True
)
val_loader = DataLoader(
    SimpleDataset(inp_val_scaled, out_val, mask_inp_val, mask_out_val),
    batch_size=128, shuffle=False
)

pytorch_model = LightweightForecaster(input_dim=len(FEATURES), hidden_dim=256, max_out=40).to(device)
print(f"PyTorch model parameters: {sum(p.numel() for p in pytorch_model.parameters()):,}")

pytorch_rmse = train_pytorch_model(pytorch_model, train_loader, val_loader, epochs=20)
results['PyTorch'] = float(pytorch_rmse)  # Convert to Python float for JSON serialization
print(f"PyTorch Best RMSE: {pytorch_rmse:.4f}")

# 2. TensorFlow Model
print("\n" + "="*80)
print("TRAINING TENSORFLOW MODEL")
print("="*80)
tf_model = create_tf_model_simple(input_dim=len(FEATURES), hidden_dim=256, max_out=40)
# Build model before counting parameters (subclassed model needs build)
_ = tf_model([np.zeros((1, 60, len(FEATURES)), dtype='float32'),
              np.zeros((1, 40, 2), dtype='float32')], training=True)
print(f"TensorFlow model parameters: {tf_model.count_params():,}")

tf_model, tf_rmse, tf_history = train_tensorflow_model(
    tf_model, inp_train_scaled, out_train, inp_val_scaled, out_val,
    mask_train=mask_out_train, mask_val=mask_out_val, epochs=20
)
results['TensorFlow'] = float(tf_rmse)  # Convert to Python float for JSON serialization
print(f"TensorFlow Best RMSE: {tf_rmse:.4f}")


TRAINING PYTORCH MODEL
PyTorch model parameters: 1,587,202


PyTorch Epoch 1/20: 100%|██████████| 575/575 [00:06<00:00, 84.20it/s]


Epoch 01 | Loss: 1415.5933 | RMSE: 13.6242
Saved! Best: 13.6242


PyTorch Epoch 2/20: 100%|██████████| 575/575 [00:06<00:00, 84.50it/s]


Epoch 02 | Loss: 118.3230 | RMSE: 7.0687
Saved! Best: 7.0687


PyTorch Epoch 3/20: 100%|██████████| 575/575 [00:06<00:00, 83.94it/s]


Epoch 03 | Loss: 28.6519 | RMSE: 5.4056
Saved! Best: 5.4056


PyTorch Epoch 4/20: 100%|██████████| 575/575 [00:06<00:00, 83.52it/s]


Epoch 04 | Loss: 11.5428 | RMSE: 5.0979
Saved! Best: 5.0979


PyTorch Epoch 5/20: 100%|██████████| 575/575 [00:06<00:00, 83.81it/s]


Epoch 05 | Loss: 6.3193 | RMSE: 8.3377


PyTorch Epoch 6/20: 100%|██████████| 575/575 [00:06<00:00, 84.25it/s]


Epoch 06 | Loss: 4.1785 | RMSE: 3.8502
Saved! Best: 3.8502


PyTorch Epoch 7/20: 100%|██████████| 575/575 [00:06<00:00, 83.69it/s]


Epoch 07 | Loss: 2.7939 | RMSE: 3.4562
Saved! Best: 3.4562


PyTorch Epoch 8/20: 100%|██████████| 575/575 [00:06<00:00, 84.15it/s]


Epoch 08 | Loss: 1.8743 | RMSE: 4.0200


PyTorch Epoch 9/20: 100%|██████████| 575/575 [00:06<00:00, 84.47it/s]


Epoch 09 | Loss: 1.3589 | RMSE: 3.1265
Saved! Best: 3.1265


PyTorch Epoch 10/20: 100%|██████████| 575/575 [00:06<00:00, 84.72it/s]


Epoch 10 | Loss: 1.0847 | RMSE: 3.0968
Saved! Best: 3.0968


PyTorch Epoch 11/20: 100%|██████████| 575/575 [00:06<00:00, 84.02it/s]


Epoch 11 | Loss: 0.8263 | RMSE: 3.7200


PyTorch Epoch 12/20: 100%|██████████| 575/575 [00:06<00:00, 84.34it/s]


Epoch 12 | Loss: 0.7267 | RMSE: 2.6805
Saved! Best: 2.6805


PyTorch Epoch 13/20: 100%|██████████| 575/575 [00:06<00:00, 84.02it/s]


Epoch 13 | Loss: 0.5914 | RMSE: 3.9806


PyTorch Epoch 14/20: 100%|██████████| 575/575 [00:06<00:00, 84.29it/s]


Epoch 14 | Loss: 0.4125 | RMSE: 3.1966


PyTorch Epoch 15/20: 100%|██████████| 575/575 [00:06<00:00, 83.95it/s]


Epoch 15 | Loss: 0.3350 | RMSE: 2.4392
Saved! Best: 2.4392


PyTorch Epoch 16/20: 100%|██████████| 575/575 [00:06<00:00, 84.23it/s]


Epoch 16 | Loss: 0.2541 | RMSE: 1.9678
Saved! Best: 1.9678


PyTorch Epoch 17/20: 100%|██████████| 575/575 [00:06<00:00, 83.67it/s]


Epoch 17 | Loss: 0.2037 | RMSE: 2.0218


PyTorch Epoch 18/20: 100%|██████████| 575/575 [00:06<00:00, 84.30it/s]


Epoch 18 | Loss: 0.1630 | RMSE: 1.7133
Saved! Best: 1.7133


PyTorch Epoch 19/20: 100%|██████████| 575/575 [00:06<00:00, 83.61it/s]


Epoch 19 | Loss: 0.1432 | RMSE: 1.6238
Saved! Best: 1.6238


PyTorch Epoch 20/20: 100%|██████████| 575/575 [00:06<00:00, 85.07it/s]


Epoch 20 | Loss: 0.1323 | RMSE: 1.5523
Saved! Best: 1.5523
PyTorch Best RMSE: 1.5523

TRAINING TENSORFLOW MODEL
TensorFlow model parameters: 1,587,202
Training TensorFlow model...
Epoch 1/20
[1m575/575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 33ms/step - loss: 566.5207 - masked_mse: 451.7614 - val_loss: 106.1616 - val_masked_mse: 250.9043 - learning_rate: 5.0000e-04
Epoch 2/20
[1m575/575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 23ms/step - loss: 63.6776 - masked_mse: 305.1059 - val_loss: 21.8395 - val_masked_mse: 419.9876 - learning_rate: 4.9692e-04
Epoch 3/20
[1m575/575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 23ms/step - loss: 14.7428 - masked_mse: 436.2543 - val_loss: 7.2327 - val_masked_mse: 472.7694 - learning_rate: 4.8776e-04
Epoch 4/20
[1m575/575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 23ms/step - loss: 5.2497 - masked_mse: 482.4986 - val_loss: 2.9604 - val_masked_mse: 493.5779 - learning_rate: 4.7275e-04
Epoch 5/20
[

In [None]:
# =========================================================
# STATISTICAL COMPARISON
# =========================================================
print("\n" + "="*80)
print("STATISTICAL COMPARISON")
print("="*80)

# Calculate detailed metrics for each model
comparison_results = {}

for model_name, rmse in results.items():
    comparison_results[model_name] = {
        'RMSE': float(rmse),  # Ensure float type
        'RMSE_std': 0.0,  # Would need multiple runs for std
    }

# Print comparison table
print("\n" + "="*80)
print("MODEL COMPARISON RESULTS")
print("="*80)
print(f"{'Model':<20} {'RMSE':<15}")
print("-" * 35)
for model_name, metrics in comparison_results.items():
    print(f"{model_name:<20} {metrics['RMSE']:<15.4f}")

# Find best model
best_model = min(results, key=results.get)
best_rmse = float(results[best_model])
print(f"\nBest Model: {best_model} (RMSE: {best_rmse:.4f})")

# Statistical tests (if we had multiple runs)
print("\n" + "="*80)
print("STATISTICAL ANALYSIS")
print("="*80)
print("Note: For full statistical comparison, multiple runs would be needed.")
print("Current comparison based on single validation set:\n")

# Calculate improvement percentages relative to best model
for model_name, rmse in results.items():
    rmse_float = float(rmse)
    if model_name == best_model:
        print(f"{model_name}: {rmse_float:.4f} (baseline - best model)")
    else:
        improvement = ((best_rmse - rmse_float) / best_rmse) * 100
        if improvement > 0:
            print(f"{model_name}: {rmse_float:.4f} ({improvement:+.2f}% worse than {best_model})")
        else:
            print(f"{model_name}: {rmse_float:.4f} ({abs(improvement):+.2f}% better than {best_model})")

# Save results (ensure all values are JSON serializable)
results_json = {k: float(v) for k, v in results.items()}
with open('model_comparison_results.json', 'w') as f:
    json.dump(results_json, f, indent=2)
print("\nResults saved to model_comparison_results.json")


STATISTICAL COMPARISON

MODEL COMPARISON RESULTS
Model                RMSE           
-----------------------------------
PyTorch              1.5523         
TensorFlow           19.4662        

Best Model: PyTorch (RMSE: 1.5523)

STATISTICAL ANALYSIS
Note: For full statistical comparison, multiple runs would be needed.
Current comparison based on single validation set:

PyTorch: 1.5523 (baseline - best model)
TensorFlow: 19.4662 (+1154.04% better than PyTorch)

Results saved to model_comparison_results.json


In [None]:
# =========================================================
# GENERATE SUBMISSIONS
# =========================================================
print("\n" + "="*80)
print("GENERATING SUBMISSIONS")
print("="*80)

def prepare_test_sequences(test_input_df, max_inp=60):
    """Prepare test input sequences"""
    test_with_features = add_enhanced_features(test_input_df)
    grouped = test_with_features.groupby(['game_id', 'play_id', 'nfl_id'])

    sequences = []
    metadata = []

    for (gid, pid, nid), group in grouped:
        group = group.sort_values('frame_id')
        inp_feat = group[FEATURES].values

        if len(inp_feat) > max_inp:
            inp_feat = inp_feat[-max_inp:]
        else:
            pad = max_inp - len(inp_feat)
            inp_feat = np.vstack([inp_feat, np.zeros((pad, len(FEATURES)))])

        sequences.append(inp_feat)
        metadata.append({'game_id': gid, 'play_id': pid, 'nfl_id': nid})

    sequences = np.array(sequences, dtype='float32')
    sequences = scaler.transform(sequences.reshape(-1, len(FEATURES))).reshape(sequences.shape)

    return sequences, metadata

def generate_submission_pytorch(model, test, test_input):
    """Generate submission for PyTorch model"""
    print("Generating PyTorch submission...")
    sequences, metadata = prepare_test_sequences(test_input, max_inp=60)

    model.eval()
    with torch.no_grad():
        sequences_tensor = torch.from_numpy(sequences).to(device)
        predictions_all = model(sequences_tensor).cpu().numpy()

    pred_map = {}
    for meta, pred in zip(metadata, predictions_all):
        key = (meta['game_id'], meta['play_id'], meta['nfl_id'])
        pred_map[key] = pred

    results = []
    for _, row in test.iterrows():
        key = (row['game_id'], row['play_id'], row['nfl_id'])
        frame_id = row['frame_id']

        if key in pred_map:
            frame_idx = frame_id - 1
            if 0 <= frame_idx < 40:
                x_pred, y_pred = pred_map[key][frame_idx]
            else:
                x_pred, y_pred = pred_map[key][-1]
        else:
            x_pred, y_pred = 60.0, 26.65

        x_pred = np.clip(x_pred, 0.0, 120.0)
        y_pred = np.clip(y_pred, 0.0, 53.3)
        results.append({'x': float(x_pred), 'y': float(y_pred)})

    submission = pd.DataFrame(results)
    submission.to_csv('submission_pytorch.csv', index=False)
    print(f"PyTorch submission saved! Shape: {submission.shape}")
    return submission

def generate_submission_tensorflow(model, test, test_input):
    """Generate submission for TensorFlow model using autoregressive inference"""
    print("Generating TensorFlow submission...")
    sequences, metadata = prepare_test_sequences(test_input, max_inp=60)

    # Use autoregressive inference (matching PyTorch)
    predictions_all = model.predict_autoregressive(sequences)

    pred_map = {}
    for meta, pred in zip(metadata, predictions_all):
        key = (meta['game_id'], meta['play_id'], meta['nfl_id'])
        pred_map[key] = pred

    results = []
    for _, row in test.iterrows():
        key = (row['game_id'], row['play_id'], row['nfl_id'])
        frame_id = row['frame_id']

        if key in pred_map:
            frame_idx = frame_id - 1
            if 0 <= frame_idx < 40:
                x_pred, y_pred = pred_map[key][frame_idx]
            else:
                x_pred, y_pred = pred_map[key][-1]
        else:
            x_pred, y_pred = 60.0, 26.65

        x_pred = np.clip(x_pred, 0.0, 120.0)
        y_pred = np.clip(y_pred, 0.0, 53.3)
        results.append({'x': float(x_pred), 'y': float(y_pred)})

    submission = pd.DataFrame(results)
    submission.to_csv('submission_tensorflow.csv', index=False)
    print(f"TensorFlow submission saved! Shape: {submission.shape}")
    return submission

# Generate all submissions
print("\nGenerating submissions for all models...")

# Load best models
pytorch_model.load_state_dict(torch.load("best_model_pytorch.pt", map_location=device))
pytorch_model.eval()

tf_model.load_weights('best_model_tensorflow.weights.h5')

submission_pytorch = generate_submission_pytorch(pytorch_model, test, test_input)
submission_tensorflow = generate_submission_tensorflow(tf_model, test, test_input)

print("\n" + "="*80)
print("ALL SUBMISSIONS GENERATED")
print("="*80)
print("Files created:")
print("  - submission_pytorch.csv")
print("  - submission_tensorflow.csv")
print("  - model_comparison_results.json")

print("\n" + "="*80)
print("COMPARISON SUMMARY")
print("="*80)
print(f"{'Model':<20} {'RMSE':<15} {'Submission File':<30}")
print("-" * 65)
print(f"{'PyTorch':<20} {results['PyTorch']:<15.4f} {'submission_pytorch.csv':<30}")
print(f"{'TensorFlow':<20} {results['TensorFlow']:<15.4f} {'submission_tensorflow.csv':<30}")
print("="*80)


GENERATING SUBMISSIONS

Generating submissions for all models...
Generating PyTorch submission...
Adding enhanced features...
Computing player interactions...


Player interactions: 100%|██████████| 143/143 [00:00<00:00, 1010.30it/s]


Enhanced feature engineering complete!
PyTorch submission saved! Shape: (5837, 2)
Generating TensorFlow submission...
Adding enhanced features...
Computing player interactions...


Player interactions: 100%|██████████| 143/143 [00:00<00:00, 1034.70it/s]


Enhanced feature engineering complete!
TensorFlow submission saved! Shape: (5837, 2)

ALL SUBMISSIONS GENERATED
Files created:
  - submission_pytorch.csv
  - submission_tensorflow.csv
  - model_comparison_results.json

COMPARISON SUMMARY
Model                RMSE            Submission File               
-----------------------------------------------------------------
PyTorch              1.5523          submission_pytorch.csv        
TensorFlow           19.4662         submission_tensorflow.csv     


In [None]:
# ============================
# 1) Parameter count check
# ============================

def count_torch_params(model: torch.nn.Module) -> int:
    # Count trainable parameters only
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def count_tf_params(model: tf.keras.Model) -> int:
    # Keras count_params counts trainable + non-trainable by default,
    # but in subclassed models it reflects built weights.
    return model.count_params()

print("PyTorch trainable params:", f"{count_torch_params(pytorch_model):,}")
print("TensorFlow params:", f"{count_tf_params(tf_model):,}")


PyTorch trainable params: 1,587,202
TensorFlow params: 1,587,202


In [None]:
# ============================
# 2A) PyTorch weight shapes
# ============================

def print_torch_gru_shapes(gru: torch.nn.GRU, prefix="gru"):
    # PyTorch GRU has parameters like:
    # weight_ih_l{k}, weight_hh_l{k}, bias_ih_l{k}, bias_hh_l{k}
    for name, p in gru.named_parameters():
        print(f"[Torch {prefix}] {name:20s} {tuple(p.shape)}")

print("=== PyTorch enc_embed ===")
print("weight:", tuple(pytorch_model.enc_embed.weight.shape), "bias:", tuple(pytorch_model.enc_embed.bias.shape))

print("\n=== PyTorch enc_gru ===")
print_torch_gru_shapes(pytorch_model.enc_gru, prefix="enc_gru")

print("\n=== PyTorch dec_embed ===")
print("weight:", tuple(pytorch_model.dec_embed.weight.shape), "bias:", tuple(pytorch_model.dec_embed.bias.shape))

print("\n=== PyTorch dec_gru ===")
print_torch_gru_shapes(pytorch_model.dec_gru, prefix="dec_gru")

print("\n=== PyTorch out_proj ===")
print("weight:", tuple(pytorch_model.out_proj.weight.shape), "bias:", tuple(pytorch_model.out_proj.bias.shape))


=== PyTorch enc_embed ===
weight: (256, 26) bias: (256,)

=== PyTorch enc_gru ===
[Torch enc_gru] weight_ih_l0         (768, 256)
[Torch enc_gru] weight_hh_l0         (768, 256)
[Torch enc_gru] bias_ih_l0           (768,)
[Torch enc_gru] bias_hh_l0           (768,)
[Torch enc_gru] weight_ih_l1         (768, 256)
[Torch enc_gru] weight_hh_l1         (768, 256)
[Torch enc_gru] bias_ih_l1           (768,)
[Torch enc_gru] bias_hh_l1           (768,)

=== PyTorch dec_embed ===
weight: (256, 2) bias: (256,)

=== PyTorch dec_gru ===
[Torch dec_gru] weight_ih_l0         (768, 256)
[Torch dec_gru] weight_hh_l0         (768, 256)
[Torch dec_gru] bias_ih_l0           (768,)
[Torch dec_gru] bias_hh_l0           (768,)
[Torch dec_gru] weight_ih_l1         (768, 256)
[Torch dec_gru] weight_hh_l1         (768, 256)
[Torch dec_gru] bias_ih_l1           (768,)
[Torch dec_gru] bias_hh_l1           (768,)

=== PyTorch out_proj ===
weight: (2, 256) bias: (2,)


In [None]:
# ============================
# 2B) TensorFlow weight shapes
# ============================

def print_tf_layer_weights(layer: tf.keras.layers.Layer, prefix="layer"):
    weights = layer.weights
    if not weights:
        print(f"[TF {prefix}] (no weights or not built)")
        return
    for w in weights:
        print(f"[TF {prefix}] {w.name:45s} {tuple(w.shape)}")

print("=== TensorFlow enc_embed ===")
print_tf_layer_weights(tf_model.enc_embed, prefix="enc_embed")

print("\n=== TensorFlow enc_gru (StackedRNNCells inside RNN) ===")
# Each GRUCell has kernel, recurrent_kernel, bias
print_tf_layer_weights(tf_model.enc_gru_cell1, prefix="enc_gru_cell1")
print_tf_layer_weights(tf_model.enc_gru_cell2, prefix="enc_gru_cell2")

print("\n=== TensorFlow dec_embed ===")
print_tf_layer_weights(tf_model.dec_embed, prefix="dec_embed")

print("\n=== TensorFlow dec_gru (StackedRNNCells inside RNN) ===")
print_tf_layer_weights(tf_model.dec_gru_cell1, prefix="dec_gru_cell1")
print_tf_layer_weights(tf_model.dec_gru_cell2, prefix="dec_gru_cell2")

print("\n=== TensorFlow out_proj ===")
print_tf_layer_weights(tf_model.out_proj, prefix="out_proj")

=== TensorFlow enc_embed ===
[TF enc_embed] kernel                                        (26, 256)
[TF enc_embed] bias                                          (256,)

=== TensorFlow enc_gru (StackedRNNCells inside RNN) ===
[TF enc_gru_cell1] kernel                                        (256, 768)
[TF enc_gru_cell1] recurrent_kernel                              (256, 768)
[TF enc_gru_cell1] bias                                          (2, 768)
[TF enc_gru_cell2] kernel                                        (256, 768)
[TF enc_gru_cell2] recurrent_kernel                              (256, 768)
[TF enc_gru_cell2] bias                                          (2, 768)

=== TensorFlow dec_embed ===
[TF dec_embed] kernel                                        (2, 256)
[TF dec_embed] bias                                          (256,)

=== TensorFlow dec_gru (StackedRNNCells inside RNN) ===
[TF dec_gru_cell1] kernel                                        (256, 768)
[TF dec_gru_cell1] rec

In [None]:
# ============================
# 3) Forward output shape check
# ============================

# Dummy inputs
B = 4
T_in = 60
T_out = 40
D = len(FEATURES)

x_np = np.random.randn(B, T_in, D).astype("float32")
y_np = np.random.randn(B, T_out, 2).astype("float32")

# PyTorch: teacher forcing forward
pytorch_model.train()
with torch.no_grad():
    x_t = torch.from_numpy(x_np).to(device)
    y_t = torch.from_numpy(y_np).to(device)
    out_torch_train = pytorch_model(x_t, target=y_t).cpu().numpy()

# PyTorch: autoregressive forward
pytorch_model.eval()
with torch.no_grad():
    out_torch_inf = pytorch_model(x_t).cpu().numpy()

# TF: teacher forcing forward
tf_out_train = tf_model([x_np, y_np], training=True).numpy()

# TF: autoregressive forward
tf_out_inf = tf_model.predict_autoregressive(x_np)

print("Torch train out shape:", out_torch_train.shape)
print("TF    train out shape:", tf_out_train.shape)

print("Torch infer out shape:", out_torch_inf.shape)
print("TF    infer out shape:", tf_out_inf.shape)


Torch train out shape: (4, 40, 2)
TF    train out shape: (4, 40, 2)
Torch infer out shape: (4, 40, 2)
TF    infer out shape: (4, 40, 2)


In [None]:
import numpy as np
import pandas as pd
from scipy import stats

def compare_submissions(
    path_pt="submission_pytorch.csv",
    path_tf="submission_tensorflow.csv",
    eps=1e-12
):
    # -----------------------
    # 1) Load + sanity checks
    # -----------------------
    pt = pd.read_csv(path_pt)
    tf = pd.read_csv(path_tf)

    required_cols = {"x", "y"}
    if not required_cols.issubset(pt.columns) or not required_cols.issubset(tf.columns):
        raise ValueError(f"Both files must contain columns {required_cols}")

    if len(pt) != len(tf):
        raise ValueError(f"Row count mismatch: PyTorch={len(pt)}, TensorFlow={len(tf)}")

    # ensure numeric
    pt = pt[["x", "y"]].astype(float).reset_index(drop=True)
    tf = tf[["x", "y"]].astype(float).reset_index(drop=True)

    # -----------------------
    # 2) Row-wise differences
    # -----------------------
    dx = tf["x"].values - pt["x"].values
    dy = tf["y"].values - pt["y"].values
    dist = np.sqrt(dx**2 + dy**2)  # Euclidean distance between model predictions

    # PyTorch-as-reference error metrics (NOT ground-truth)
    rmse_x = np.sqrt(np.mean(dx**2))
    rmse_y = np.sqrt(np.mean(dy**2))
    rmse_xy = np.sqrt(np.mean(dx**2 + dy**2))  # RMSE in 2D
    mae_x = np.mean(np.abs(dx))
    mae_y = np.mean(np.abs(dy))
    mae_xy = np.mean(dist)

    # Correlation + R^2 (per coordinate)
    # (R^2 here is "how well TF explains PT" linearly; still not accuracy vs GT)
    def r2(y_true, y_pred):
        ss_res = np.sum((y_true - y_pred)**2)
        ss_tot = np.sum((y_true - np.mean(y_true))**2) + eps
        return 1.0 - ss_res / ss_tot

    corr_x = np.corrcoef(pt["x"].values, tf["x"].values)[0, 1]
    corr_y = np.corrcoef(pt["y"].values, tf["y"].values)[0, 1]
    r2_x = r2(pt["x"].values, tf["x"].values)
    r2_y = r2(pt["y"].values, tf["y"].values)

    # -----------------------
    # 3) Statistical tests
    # -----------------------
    # Paired tests on differences (H0: mean/median difference = 0)
    t_x = stats.ttest_rel(tf["x"].values, pt["x"].values, nan_policy="omit")
    t_y = stats.ttest_rel(tf["y"].values, pt["y"].values, nan_policy="omit")

    # Wilcoxon signed-rank (robust alternative); requires non-all-zero diffs
    def safe_wilcoxon(d):
        d = d[np.isfinite(d)]
        if np.allclose(d, 0.0):
            return None
        return stats.wilcoxon(d)

    w_x = safe_wilcoxon(dx)
    w_y = safe_wilcoxon(dy)

    # KS test on marginal distributions (H0: same distribution)
    ks_x = stats.ks_2samp(pt["x"].values, tf["x"].values)
    ks_y = stats.ks_2samp(pt["y"].values, tf["y"].values)

    # Effect size: Cohen's d for paired differences
    def cohens_d_paired(d):
        d = d[np.isfinite(d)]
        return np.mean(d) / (np.std(d, ddof=1) + eps)

    d_x = cohens_d_paired(dx)
    d_y = cohens_d_paired(dy)

    # -----------------------
    # 4) Summaries + outliers
    # -----------------------
    summary = {
        "n_rows": len(pt),

        "diff_mean_x (TF-PT)": float(np.mean(dx)),
        "diff_std_x": float(np.std(dx, ddof=1)),
        "diff_median_x": float(np.median(dx)),

        "diff_mean_y (TF-PT)": float(np.mean(dy)),
        "diff_std_y": float(np.std(dy, ddof=1)),
        "diff_median_y": float(np.median(dy)),

        "dist_mean": float(np.mean(dist)),
        "dist_median": float(np.median(dist)),
        "dist_p95": float(np.quantile(dist, 0.95)),
        "dist_p99": float(np.quantile(dist, 0.99)),
        "dist_max": float(np.max(dist)),

        "rmse_x (TF vs PT)": float(rmse_x),
        "rmse_y (TF vs PT)": float(rmse_y),
        "rmse_xy (2D)": float(rmse_xy),
        "mae_x (TF vs PT)": float(mae_x),
        "mae_y (TF vs PT)": float(mae_y),
        "mae_xy (2D)": float(mae_xy),

        "corr_x (PT vs TF)": float(corr_x),
        "corr_y (PT vs TF)": float(corr_y),
        "r2_x (PT~TF)": float(r2_x),
        "r2_y (PT~TF)": float(r2_y),

        "paired_ttest_x_stat": float(t_x.statistic),
        "paired_ttest_x_p": float(t_x.pvalue),
        "paired_ttest_y_stat": float(t_y.statistic),
        "paired_ttest_y_p": float(t_y.pvalue),

        "wilcoxon_x_stat": None if w_x is None else float(w_x.statistic),
        "wilcoxon_x_p": None if w_x is None else float(w_x.pvalue),
        "wilcoxon_y_stat": None if w_y is None else float(w_y.statistic),
        "wilcoxon_y_p": None if w_y is None else float(w_y.pvalue),

        "ks_x_stat": float(ks_x.statistic),
        "ks_x_p": float(ks_x.pvalue),
        "ks_y_stat": float(ks_y.statistic),
        "ks_y_p": float(ks_y.pvalue),

        "cohens_d_paired_x": float(d_x),
        "cohens_d_paired_y": float(d_y),
    }

    # Outliers: top 1% distances
    k = max(1, int(0.01 * len(dist)))
    top_idx = np.argsort(dist)[-k:][::-1]
    outliers = pd.DataFrame({
        "row_index": top_idx,
        "pt_x": pt.loc[top_idx, "x"].values,
        "pt_y": pt.loc[top_idx, "y"].values,
        "tf_x": tf.loc[top_idx, "x"].values,
        "tf_y": tf.loc[top_idx, "y"].values,
        "dx": dx[top_idx],
        "dy": dy[top_idx],
        "dist": dist[top_idx],
    })

    # Print a clean report
    print("\n" + "="*80)
    print("SUBMISSION COMPARISON (TensorFlow vs PyTorch; paired by row index)")
    print("="*80)
    for k, v in summary.items():
        print(f"{k:<28}: {v}")

    print("\nTop 1% largest per-row prediction gaps (Euclidean distance):")
    print(outliers.head(20).to_string(index=False))

    return summary, outliers

# Run
summary, outliers = compare_submissions(
    "submission_pytorch.csv",
    "submission_tensorflow.csv"
)


SUBMISSION COMPARISON (TensorFlow vs PyTorch; paired by row index)
n_rows                      : 5837
diff_mean_x (TF-PT)         : -0.3924265122687304
diff_std_x                  : 23.550622916874378
diff_median_x               : -2.234142303466797
diff_mean_y (TF-PT)         : -1.1467200008967529
diff_std_y                  : 11.350704544439992
diff_median_y               : -1.7814788818359375
dist_mean                   : 22.651423285753275
dist_median                 : 20.104820931358756
dist_p95                    : 48.510847327525546
dist_p99                    : 60.93940527707063
dist_max                    : 79.32014351923331
rmse_x (TF vs PT)           : 23.55187504899019
rmse_y (TF vs PT)           : 11.407514527319222
rmse_xy (2D)                : 26.16910785667417
mae_x (TF vs PT)            : 18.780578349170487
mae_y (TF vs PT)            : 9.746711684346547
mae_xy (2D)                 : 22.651423285753275
corr_x (PT vs TF)           : 0.27247461804061796
corr_y (PT vs TF