In [None]:
# Cell 1 — Imports & global notebook configuration (no helper functions are defined in this cell)
import warnings
# Installs a blanket 'ignore' filter with no category or module restriction, so
# every warning raised after this line is hidden for the rest of the session.
# NOTE(review): consider narrowing this filter while debugging data/feature issues.
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Project-local loaders; Cell 2 calls all four of them.
from scripts.load_data import load_all_inputs, load_all_outputs, load_test, load_test_input
# Project-local feature module; Cell 6 imports its helpers by name.
import features
# Used in Cell 7 to load the saved model artifact.
import joblib
# Plot defaults applied to every figure created below: 8x5 figure size and the
# seaborn 'whitegrid' style.
plt.rcParams['figure.figsize'] = (8, 5)
sns.set_style('whitegrid')

In [None]:
# Cell 2 — Load the train inputs/outputs and the test frames for interactive EDA
# Note: change paths if your workspace structure differs
# (this cell applies no subsampling; whatever the loaders return is kept as-is)
X, y = load_all_inputs('train'), load_all_outputs('train')
test, test_input = load_test('.'), load_test_input('.')
print('loaded rows:', len(X), 'inputs; ', len(y), 'outputs; ', len(test), 'test rows; ', len(test_input), 'test_input rows')
# Preview the first rows of the train inputs, then of the train outputs
display(X.head(), y.head())

In [None]:
# Cell 3 — Quick summary statistics
# A play is identified by its (game_id, play_id) pair; count the distinct pairs.
play_keys = X[['game_id', 'play_id']].drop_duplicates()
print('Unique plays in train inputs:', len(play_keys))

# describe() summaries for the speed and acceleration columns, in that order
for heading, column in (('Speed (s) summary:', 's'), ('Acceleration (a) summary:', 'a')):
    print(heading)
    display(X[column].describe())

# Row counts per num_frames_output value (how many frames the model must predict),
# ordered by value; only the 20 smallest values are shown.
frames_out_counts = X['num_frames_output'].value_counts().sort_index()
display(frames_out_counts.head(20))

In [None]:
# Cell 4 — Plot sample trajectories for up to five players of one play
# Pick one play reproducibly. The sample is drawn from the de-duplicated
# (game_id, play_id) key rows so that `sample_play` is a row that can be indexed
# by column name. (groupby(...).ngroup() only yields integer group numbers, which
# cannot be subscripted with 'game_id' / 'play_id'.)
play_keys = X[['game_id', 'play_id']].drop_duplicates()
sample_play = play_keys.sample(n=1, random_state=1).iloc[0] if len(play_keys) > 0 else None
if sample_play is not None:
    # All input rows belonging to the chosen play
    gp = X[(X['game_id'] == sample_play['game_id']) & (X['play_id'] == sample_play['play_id'])]
    # Limit the plot to the first five distinct players that appear in the play
    player_rows = gp['nfl_id'].unique()[:5]
    fig, ax = plt.subplots()
    for pid in player_rows:
        pr = gp[gp['nfl_id'] == pid]
        # Connect the points in frame order when a frame index is available, so
        # the line follows the player's path rather than the row order.
        if 'frame_id' in pr.columns:
            pr = pr.sort_values('frame_id', kind='stable')
        ax.plot(pr['x'], pr['y'], marker='o', label=str(pid))
    # The column keys must be quoted inside the f-string; bare names would be
    # looked up as (undefined) variables.
    ax.set_title(f'Sample play trajectories (game {sample_play["game_id"]}, play {sample_play["play_id"]})')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.legend()
    # Flip the y axis on the axes object created above (no reliance on pyplot's
    # "current axes" state).
    ax.invert_yaxis()
    plt.show()
else:
    print('No data available to plot')

In [None]:
# Cell 5 — Speed distribution and per-position median speed
# Histogram (60 bins, with KDE overlay) of the non-null speed values
speed_values = X['s'].dropna()
fig, ax = plt.subplots()
sns.histplot(speed_values, bins=60, kde=True, ax=ax)
ax.set_title('Distribution of speed (s)')
plt.show()

# Median speed per player position, highest first, top 20 positions only.
# Skipped entirely when the inputs carry no 'player_position' column.
if 'player_position' in X.columns:
    med = (
        X.groupby('player_position')['s']
        .median()
        .sort_values(ascending=False)
        .head(20)
    )
    display(med.reset_index().rename(columns={'s': 'median_speed'}))

In [None]:
# Cell 6 — Compute features using `features.py` and show sample rows
from features import add_time_lag_features, prepare_features, transform_for_inference
# Work on an independent copy of the first 2000 input rows so the cell stays fast;
# the lag features are added to that copy before feature preparation.
sample_df = add_time_lag_features(X.iloc[:2000].copy())
# prepare_features returns a pair: the feature frame and the feature column names
sample_feat_df, feat_cols = prepare_features(sample_df)
print('Feature columns:', feat_cols)
display(sample_feat_df[feat_cols].head())

In [None]:
# Cell 7 — Load trained model and evaluate on a small random sample of the train data
# Relies on `add_time_lag_features`, `prepare_features` and `transform_for_inference`
# having been imported from `features` (done in Cell 6).
import os

model_path = 'models/lgbm_baseline.pkl'
if os.path.exists(model_path):
    # SECURITY: joblib.load unpickles the file and can execute arbitrary code;
    # only load model artifacts produced by a trusted training run.
    # `meta` is read below as a mapping with 'feature_columns', 'models'
    # (holding 'x' and 'y' estimators) and an optional 'player_position_values'.
    meta = joblib.load(model_path)
    X_full = X.copy()
    y_full = y.copy()
    # Inner-join inputs with outputs; overlapping non-key columns coming from the
    # outputs get a '_target' suffix (e.g. x_target / y_target).
    # NOTE(review): frame_id is part of the join key — confirm that input and
    # output frames share the same frame numbering, otherwise rows are mispaired.
    merged = pd.merge(X_full, y_full, on=['game_id','play_id','nfl_id','frame_id'], how='inner', suffixes=(None,'_target'))
    merged = add_time_lag_features(merged)
    # Bound to its own name so the `feat_cols` computed in Cell 6 is not silently
    # replaced by a column list derived from the merged (target-carrying) frame.
    # NOTE(review): `merged_feat` is not used below — the evaluation sample is
    # drawn from `merged`; confirm whether transform_for_inference expects the
    # prepared frame instead.
    merged_feat, merged_feat_cols = prepare_features(merged)
    # predict on a small random subset for speed
    sample = merged.sample(frac=0.02, random_state=42)
    if sample.empty:
        # 2% of a tiny (or empty) merge rounds down to zero rows; there is
        # nothing to predict or plot in that case.
        print('No merged rows available for evaluation (merged has', len(merged), 'rows)')
    else:
        X_eval = transform_for_inference(sample, meta['feature_columns'], meta.get('player_position_values', None))
        mx = meta['models']['x']
        my = meta['models']['y']
        px = mx.predict(X_eval)
        py = my.predict(X_eval)
        # Compare positionally as plain arrays so the result does not depend on
        # how an ndarray/Series mix handles the sample's shuffled index.
        actual_x = sample['x_target'].to_numpy()
        actual_y = sample['y_target'].to_numpy()
        # compute simple MAE vs targets
        mae_x = np.abs(px - actual_x).mean()
        mae_y = np.abs(py - actual_y).mean()
        print(f'MAE x: {mae_x:.4f}, MAE y: {mae_y:.4f} on sample of {len(sample)} rows')
        # scatter predicted vs actual, one panel per coordinate
        fig, ax = plt.subplots(1,2, figsize=(12,5))
        ax[0].scatter(actual_x, px, alpha=0.5, s=8)
        ax[0].set_title('x: actual vs predicted')
        ax[0].set_xlabel('actual x')
        ax[0].set_ylabel('predicted x')
        ax[1].scatter(actual_y, py, alpha=0.5, s=8)
        ax[1].set_title('y: actual vs predicted')
        ax[1].set_xlabel('actual y')
        ax[1].set_ylabel('predicted y')
        plt.show()
else:
    print('Model not found at', model_path, ' — run training first')