## 1. Setup & Data Loading

In [None]:
import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import joblib
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Make the project modules importable: use the Kaggle dataset mount when the
# Kaggle input directory exists, otherwise fall back to the current directory.
kaggle_input = Path('/kaggle/input')
if kaggle_input.exists():
    repo_root = Path('/kaggle/input/nfl-big-data-bowl-2026-prediction')
else:
    repo_root = Path('.')

# Prepend only once so re-running the cell does not duplicate the entry.
repo_root_str = str(repo_root)
if repo_root_str not in sys.path:
    sys.path.insert(0, repo_root_str)

print(f'Working directory: {os.getcwd()}')
print(f'Repo root: {repo_root}')

## 2. Load and Prepare Training Data

In [None]:
# Load the training inputs and targets through the project's loader helpers.
from scripts.load_data import load_all_inputs, load_all_outputs
from features import add_time_lag_features, prepare_features

print('Loading training inputs and outputs...')
X, y = load_all_inputs('train'), load_all_outputs('train')

# Report row counts first, then the column layout of each frame.
n_input_rows, n_output_rows = len(X), len(y)
print(f'Inputs: {n_input_rows:,} rows')
print(f'Outputs: {n_output_rows:,} rows')

input_columns = X.columns.tolist()
output_columns = y.columns.tolist()
print(f'Input columns: {input_columns}')
print(f'Output columns: {output_columns}')

In [None]:
# Join inputs with outputs on the game/play/player/frame keys.
# Non-key columns that exist in both frames keep their name on the input side
# and receive the '_target' suffix on the output side.
print('Merging training data...')
merge_keys = ['game_id', 'play_id', 'nfl_id', 'frame_id']
merged = pd.merge(X, y, on=merge_keys, how='inner', suffixes=(None, '_target'))

merged_row_count = len(merged)
print(f'Merged rows: {merged_row_count:,}')
print(f'Merged shape: {merged.shape}')

## 3. Feature Engineering

In [None]:
# Step 1: time-lag features, written back onto the merged frame.
print('Adding time-lag features...')
merged = add_time_lag_features(merged)

# Step 2: engineered features; the helper returns the feature frame together
# with the list of feature column names.
print('Preparing engineered features...')
prepared = prepare_features(merged)
feat_df, feat_cols = prepared

n_feature_cols = len(feat_cols)
print(f'Feature columns ({n_feature_cols}): {feat_cols}')
print(f'Feature DataFrame shape: {feat_df.shape}')

In [None]:
# Keep only rows where every feature column is populated.
# The same boolean mask is applied to both frames so that they stay row-aligned
# (assumes feat_df and merged share the same index — TODO confirm against
# prepare_features).
mask = ~feat_df[feat_cols].isnull().any(axis=1)
feat_df_clean = feat_df.loc[mask].reset_index(drop=True)
merged_clean = merged.loc[mask].reset_index(drop=True)

rows_before = len(feat_df)
rows_after = len(feat_df_clean)
print(f'Rows after removing NaNs: {rows_after:,}')
print(f'Rows removed: {rows_before - rows_after:,}')

## 4. Model Training

In [None]:
# Prepare training data (sample for speed if needed).
# Features come from feat_df_clean and targets from merged_clean; this relies on
# both frames being row-aligned, as produced by the cleaning cell.
X_all = feat_df_clean[feat_cols].copy()
y_x = merged_clean['x_target'].copy()
y_y = merged_clean['y_target'].copy()

# One label per play. Frames of the same play are strongly correlated (the
# features include time lags), so a row-wise random split would put
# near-duplicate rows in both train and validation and understate the error.
play_groups = (
    merged_clean['game_id'].astype(str) + '_' + merged_clean['play_id'].astype(str)
)

# Sample if dataset is too large
MAX_ROWS = 200_000
if len(X_all) > MAX_ROWS:
    print(f'Sampling {MAX_ROWS} rows for training (from {len(X_all):,})')
    idx = np.random.RandomState(42).choice(len(X_all), size=MAX_ROWS, replace=False)
    X_all = X_all.iloc[idx].reset_index(drop=True)
    y_x = y_x.iloc[idx].reset_index(drop=True)
    y_y = y_y.iloc[idx].reset_index(drop=True)
    play_groups = play_groups.iloc[idx].reset_index(drop=True)

# Train/val split by play: every frame of a given play lands on one side only.
# test_size is the fraction of plays held out, so the validation row share is
# approximately (not exactly) 15%.
from sklearn.model_selection import GroupShuffleSplit  # local import keeps the cell self-contained

splitter = GroupShuffleSplit(n_splits=1, test_size=0.15, random_state=42)
train_idx, val_idx = next(splitter.split(X_all, groups=play_groups))

X_train, X_val = X_all.iloc[train_idx], X_all.iloc[val_idx]
yx_train, yx_val = y_x.iloc[train_idx], y_x.iloc[val_idx]
yy_train, yy_val = y_y.iloc[train_idx], y_y.iloc[val_idx]

print(f'Training set: {len(X_train):,} rows')
print(f'Validation set: {len(X_val):,} rows')

In [None]:
# Fit one gradient-boosting regressor per target coordinate. Each coordinate
# has its own hyperparameter set; both share the same training features.

# x coordinate
print('Training x-coordinate regressor...')
best_params_x = dict(
    learning_rate=0.2,
    max_iter=400,
    max_depth=5,
    max_bins=255,
    min_samples_leaf=100,
)
mx = HistGradientBoostingRegressor(random_state=42, **best_params_x)
mx.fit(X_train, yx_train)
print('✓ x-regressor trained')

# y coordinate
print('Training y-coordinate regressor...')
best_params_y = dict(
    learning_rate=0.1,
    max_iter=400,
    max_depth=8,
    max_bins=127,
    min_samples_leaf=100,
)
my = HistGradientBoostingRegressor(random_state=42, **best_params_y)
my.fit(X_train, yy_train)
print('✓ y-regressor trained')

## 5. Validation & Evaluation

In [None]:
# Score both regressors on the held-out rows.
px, py = mx.predict(X_val), my.predict(X_val)

mse_x = mean_squared_error(yx_val, px)
mse_y = mean_squared_error(yy_val, py)
rmse_x = np.sqrt(mse_x)
rmse_y = np.sqrt(mse_y)

# Combined score: root of the mean of the two squared per-axis RMSEs.
combined_rmse = np.sqrt((rmse_x**2 + rmse_y**2) / 2)

print('Validation Results:')
print(f'  RMSE x: {rmse_x:.4f}')
print(f'  RMSE y: {rmse_y:.4f}')
print(f'  Combined RMSE: {combined_rmse:.4f}')

In [None]:
# Quick look at the validation predictions: value ranges, then NaN and
# finiteness flags for each coordinate.
x_has_nan = np.isnan(px).any()
y_has_nan = np.isnan(py).any()
x_all_finite = np.isfinite(px).all()
y_all_finite = np.isfinite(py).all()

print('Prediction Validation:')
print(f'  x predictions - min: {px.min():.2f}, max: {px.max():.2f}, mean: {px.mean():.2f}')
print(f'  y predictions - min: {py.min():.2f}, max: {py.max():.2f}, mean: {py.mean():.2f}')
print(f'  No NaNs in x: {not x_has_nan}')
print(f'  No NaNs in y: {not y_has_nan}')
print(f'  All finite x: {x_all_finite}')
print(f'  All finite y: {y_all_finite}')

## 6. Save Model

In [None]:
# Bundle both fitted models with what is needed to reuse them: the feature
# column list, the hyperparameters, and the distinct non-null player positions
# seen in the cleaned training data.
position_values = merged_clean['player_position'].dropna().unique().tolist()
meta = dict(
    feature_columns=feat_cols,
    models=dict(x=mx, y=my),
    best_params=dict(x=best_params_x, y=best_params_y),
    player_position_values=position_values,
)

# Create the target directory if needed, then serialize with joblib.
model_path = Path('models') / 'best_model.pkl'
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(meta, model_path)

model_size_mb = model_path.stat().st_size / 1024 / 1024
print(f'Model saved to {model_path}')
print(f'Model size: {model_size_mb:.2f} MB')

## 7. Generate Submission

In [None]:
# Read the test rows and their matching input rows via the project loaders.
from scripts.load_data import load_test, load_test_input
from features import transform_for_inference

print('Loading test data...')
# NOTE(review): both loaders are given '.' rather than repo_root — confirm this
# resolves to the intended data directory in every environment.
test, test_input = load_test('.'), load_test_input('.')

n_test_rows = len(test)
n_test_input_rows = len(test_input)
print(f'Test rows: {n_test_rows:,}')
print(f'Test input rows: {n_test_input_rows:,}')

In [None]:
# Attach the input rows to each test row (left join keeps every test row),
# then run the same feature steps that were used for training.
join_keys = ['game_id', 'play_id', 'nfl_id', 'frame_id']
df = pd.merge(test, test_input, on=join_keys, how='left', suffixes=(None, '_in'))
df = add_time_lag_features(df)
# NOTE(review): feat_df_test is not read again in this notebook; the call is
# kept in case prepare_features has side effects on df — confirm and drop if not.
feat_df_test, _ = prepare_features(df)

# Build the inference matrix with the training feature columns and the player
# position values stored alongside the model.
known_positions = meta.get('player_position_values')
X_pred = transform_for_inference(df, feat_cols, known_positions)

print(f'Test features prepared: {X_pred.shape}')

In [None]:
# Run both fitted regressors on the prepared test features.
print('Making predictions...')
px_test, py_test = mx.predict(X_pred), my.predict(X_pred)

n_predictions = len(px_test)
x_low, x_high = px_test.min(), px_test.max()
y_low, y_high = py_test.min(), py_test.max()

print(f'Generated {n_predictions:,} predictions')
print(f'Predictions - x range: [{x_low:.2f}, {x_high:.2f}]')
print(f'Predictions - y range: [{y_low:.2f}, {y_high:.2f}]')

In [None]:
# Assemble the submission frame with exactly two columns, x then y.
submission = pd.DataFrame(dict(x=px_test, y=py_test))

submission_columns = submission.columns.tolist()
print(f'Submission shape: {submission.shape}')
print(f'Submission columns: {submission_columns}')
print('\nFirst 5 rows:')
print(submission.head())

In [None]:
# Check the submission once, then reuse the results for both the report and
# the final verdict.
expected_rows = 5837
has_nans = submission.isna().any().any()
all_finite = np.isfinite(submission.values).all()
row_count_ok = submission.shape[0] == expected_rows

print('Submission Validation:')
print(f'  Shape: {submission.shape} (expected: (5837, 2))')
print(f'  Has NaNs: {has_nans} (expected: False)')
print(f'  All finite: {all_finite} (expected: True)')
print(f'  Column dtypes: {submission.dtypes.tolist()}')

# Valid only when the row count matches and every value is a finite number.
if row_count_ok and not has_nans and all_finite:
    print('\n✅ Submission VALID and ready for upload!')
else:
    print('\n❌ Submission has issues - review above')

In [None]:
# Write the submission as CSV without the index column, then report where it
# went and how large the file is.
submission_path = Path('submission_best_model_OFFICIAL.csv')
submission.to_csv(submission_path, index=False)

submission_size_kb = submission_path.stat().st_size / 1024
print(f'Submission saved to: {submission_path}')
print(f'File size: {submission_size_kb:.2f} KB')

## Model Summary

### Architecture
- **Model Type**: HistGradientBoostingRegressor (separate for x and y)
- **Algorithm**: Histogram-based gradient boosting (fast, efficient)
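- **Training Data**: 560k+ labeled frames from 18 weeks of the 2023 NFL season; the notebook subsamples at most 200,000 rows (`MAX_ROWS`) and holds out roughly 15% of them for validation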

### Features (12 total)
1. **Raw Features**: x, y, s (speed), a (acceleration), dir (direction), o (orientation)
2. **Normalized Features**: dir_sin, dir_cos (trigonometric transformation)
3. **Context Features**: num_frames_output, absolute_yardline_number, player_pos_code
4. **Ball-Relative**: dx_ball, dy_ball, dist_ball (distance/direction to ball landing)
5. **Time-Lag Features**: lagged dx, dy, vx, vy values (capture velocity/acceleration trends)

### Hyperparameters
**X Regressor**:
- learning_rate: 0.2
- max_iter: 400
- max_depth: 5
- max_bins: 255
- min_samples_leaf: 100

**Y Regressor**:
- learning_rate: 0.1
- max_iter: 400
- max_depth: 8
- max_bins: 127
- min_samples_leaf: 100

### Performance
- **Validation RMSE (x)**: 3.4677
- **Validation RMSE (y)**: 3.4810
- **Combined RMSE**: 3.4743

### Reproducibility
- All code is in the GitHub repository: https://github.com/ucalegon206/nfl-big-data-bowl-2026-prediction
- Complete data loading and feature engineering pipeline included
- Model trained on publicly available competition data only
- No external data or proprietary tools used