# NBA Meta-Learner Training (Kaggle - FINAL)

**Requirements:**
- GPU: T4 or P100 (Settings → Accelerator → GPU T4 x2)
- Internet: ON (Settings → Internet → ON)
- Datasets (Click 'Add Data'):
  1. **nba-window-models** (your dataset with 27 .pkl files)
  2. **nba-predictor-code** (your dataset with kaggle_code.tar.gz)
  3. **historical-nba-data-and-player-box-scores** (Eoin Moore)

**Output:**
- `meta_learner_2025_2026.pkl` - Download from Output tab
- Upload to Modal: `modal volume put nba-models meta_learner_2025_2026.pkl`

## 1. Setup - Add Code Path

In [None]:
# CRITICAL: this cell must run before any other import in the notebook.
# The attached code dataset's directory is placed at the front of the module
# search path.
import sys

code_dir = '/kaggle/input/nba-predictor-code'
sys.path.insert(0, code_dir)

print("✓ Code path added")

## 2. Install Dependencies

In [None]:
!pip install -q pytorch-tabnet lightgbm

In [None]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import pickle
import torch
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import lightgbm as lgb

# Report the installed PyTorch version and, if CUDA is usable, the name of
# device 0.
print(f"PyTorch: {torch.__version__}")
has_cuda = torch.cuda.is_available()
print(f"CUDA: {has_cuda}")
if has_cuda:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 3. Load Window Models

In [None]:
# Window models are read from the attached window-models dataset directory.
model_dir = Path("/kaggle/input/nba-window-models")
model_files = sorted(model_dir.glob("player_models_*.pkl"))

print(f"Found {len(model_files)} window models")

# Fail fast: glob() on a missing or empty directory simply yields nothing, and
# the prediction-collection step would otherwise die later with an opaque
# IndexError on window_models[0].
if not model_files:
    raise FileNotFoundError(
        f"No player_models_*.pkl files found in {model_dir}; "
        "check that the window-models dataset is attached to the notebook."
    )

# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only load .pkl files from a dataset you created yourself or fully trust.
window_models = []
for model_file in model_files:
    print(f"Loading {model_file.name}...", end=" ")
    with open(model_file, 'rb') as f:
        models = pickle.load(f)
    window_models.append(models)
    print("✓")

print(f"\n✓ Loaded {len(window_models)} windows")

## 4. Load Training Data (2024-2025)

In [None]:
csv_path = "/kaggle/input/historical-nba-data-and-player-box-scores/PlayerStatistics.csv"

print(f"Loading {csv_path}...")
df = pd.read_csv(csv_path, low_memory=False)
print(f"Total records: {len(df):,}")

# Parse dates as UTC (unparseable values become NaT), then drop the timezone
# and split out calendar year and month.
df['gameDate'] = pd.to_datetime(df['gameDate'], format='mixed', utc=True, errors='coerce')
df['gameDate'] = df['gameDate'].dt.tz_localize(None)
df['year'] = df['gameDate'].dt.year
df['month'] = df['gameDate'].dt.month

# NBA season: Oct-June. A game played in October or later is labelled with
# that calendar year; a game played earlier in the year is labelled with the
# previous year. Vectorised: a row-wise df.apply(axis=1) would invoke a Python
# callback once per row of the full file. Rows with an unparseable date keep a
# NaN season_year and are dropped by the equality filter below.
df['season_year'] = df['year'].where(df['month'] >= 10, df['year'] - 1)

# Filter to 2024-2025
df_train = df[df['season_year'] == 2024].copy()
print(f"2024-2025 season: {len(df_train):,} records")

# Sample for training (at most 2000 rows for speed). The sample is drawn over
# rows of the CSV, with a fixed seed so reruns pick the same rows.
df_train = df_train.sample(min(2000, len(df_train)), random_state=42)
print(f"Training sample: {len(df_train):,} games")

## 5. Collect Window Predictions

In [None]:
def get_prediction_from_window(window_model, game_row, prop_name):
    """Return one window model's prediction of ``prop_name`` for one game row.

    Args:
        window_model: Expected to be a dict. Either it holds a
            ``'multi_task_model'`` whose ``predict`` returns a dict keyed by
            prop name, or it holds a per-prop model under ``prop_name``.
        game_row: Mapping-like row (anything with ``.get``) supplying the
            box-score columns used as features; missing columns default to 0.
        prop_name: Name of the stat to predict (e.g. ``'points'``).

    Returns:
        The prediction as a float, or ``0.0`` when the window offers no usable
        model for this prop or anything fails while predicting. Callers treat
        ``0.0`` as "no prediction".
    """
    try:
        # NOTE(review): these values come from the very row being predicted,
        # so when the target is one of them (e.g. 'points') the model is shown
        # the answer. Confirm this matches the features the window models
        # were trained on.
        features = pd.DataFrame([{
            'points': float(game_row.get('points', 0)),
            'assists': float(game_row.get('assists', 0)),
            'reboundsTotal': float(game_row.get('reboundsTotal', 0)),
            'threePointersMade': float(game_row.get('threePointersMade', 0)),
            'numMinutes': float(game_row.get('numMinutes', 0)),
            'fieldGoalsAttempted': float(game_row.get('fieldGoalsAttempted', 0)),
            'freeThrowsAttempted': float(game_row.get('freeThrowsAttempted', 0)),
        }])

        if not isinstance(window_model, dict):
            return 0.0

        # Preferred path: a single multi-task model that predicts every prop.
        if 'multi_task_model' in window_model:
            model = window_model['multi_task_model']
            if hasattr(model, 'predict'):
                preds = model.predict(features)
                if isinstance(preds, dict) and prop_name in preds:
                    return float(preds[prop_name][0])

        # Fallback: a dedicated model stored under the prop's own key.
        if prop_name in window_model:
            model = window_model[prop_name]
            if model is not None and hasattr(model, 'predict'):
                pred = model.predict(features)
                return float(pred[0])

        return 0.0
    except Exception:
        # Best-effort by design: one broken window must not abort the whole
        # collection run. Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit still stop the notebook.
        return 0.0

def collect_predictions_simple(prop_name, actual_col_name, n_windows=27, min_valid=10):
    """Build the meta-learner training set for one prop.

    Reads the module-level ``df_train`` and ``window_models``. For every row
    of ``df_train`` it asks each window model for a prediction and keeps the
    row when enough windows answered.

    Args:
        prop_name: Prop key passed to the window models (e.g. ``'points'``).
        actual_col_name: Column of ``df_train`` holding the true value.
        n_windows: Width of each prediction vector. Shorter vectors are
            padded with the mean of the row's non-zero predictions; longer
            ones are truncated.
        min_valid: Minimum number of non-zero window predictions a row needs
            to be kept (``0.0`` is the "no prediction" value).

    Returns:
        dict with ``'window_preds'`` (array, one row of ``n_windows`` values
        per kept sample), ``'actuals'`` (array of true values) and
        ``'context'`` (DataFrame with ``minutes`` and ``is_home`` columns).
    """
    print(f"\nCOLLECTING: {prop_name.upper()}")
    print("="*70)

    total = len(df_train)

    # Smoke-test the first window on the first row; skipped when there is
    # nothing to test so an empty frame or model list does not raise here.
    if total and window_models:
        test_pred = get_prediction_from_window(window_models[0], df_train.iloc[0], prop_name)
        print(f"Test prediction from first window: {test_pred}")

    window_preds = []
    actuals = []
    contexts = []

    print(f"Processing {total} games...")

    for idx, (_, game) in enumerate(df_train.iterrows()):
        if idx % 200 == 0:
            print(f"  Progress: {idx}/{total}")

        actual = game.get(actual_col_name)
        if pd.isna(actual) or actual < 0:
            continue

        preds = [
            get_prediction_from_window(window, game, prop_name)
            for window in window_models
        ]

        valid = [p for p in preds if p != 0.0]
        if len(valid) < min_valid:
            continue

        # Pad short vectors with the mean of the usable predictions. The mean
        # is computed once because appending it does not change it.
        missing = n_windows - len(preds)
        if missing > 0:
            fill = np.mean(valid) if valid else 0.0
            preds.extend([fill] * missing)

        window_preds.append(preds[:n_windows])
        actuals.append(actual)

        # A NaN 'home' flag would make int() raise; fall back to the same
        # default used when the column is absent.
        home = game.get('home', 0)
        contexts.append({
            'minutes': game.get('numMinutes', 30),
            'is_home': int(home) if pd.notna(home) else 0,
        })

    print(f"✓ Collected {len(actuals)} samples")
    print(f"  Sample predictions: {window_preds[0][:5] if len(window_preds) > 0 else 'none'}")

    return {
        'window_preds': np.array(window_preds),
        'actuals': np.array(actuals),
        'context': pd.DataFrame(contexts)
    }

# Map each prop name to the dataset column holding its actual value, then
# collect window predictions for every prop in that order.
props = dict(
    points='points',
    rebounds='reboundsTotal',
    assists='assists',
    threes='threePointersMade',
)

prop_data = {}
for prop_name, col_name in props.items():
    collected = collect_predictions_simple(prop_name, col_name)
    prop_data[prop_name] = collected

## 6. Train Meta-Learner

In [None]:
def train_meta_model(prop_name, data):
    """Fit a LightGBM meta-learner that blends window predictions for one prop.

    Args:
        prop_name: Prop label, used only in the printed report.
        data: dict with ``'window_preds'`` (2-D array of per-window
            predictions), ``'actuals'`` (1-D array of targets) and
            ``'context'`` (DataFrame of extra features).

    Returns:
        dict with the final ``'model'`` (fitted on all samples),
        ``'baseline_rmse'`` (RMSE of the plain mean of the window
        predictions), ``'oof_rmse'`` (5-fold out-of-fold RMSE of the
        meta-learner) and ``'improvement_pct'`` (relative RMSE reduction
        versus the baseline, in percent).
    """
    print(f"\nTraining meta-learner: {prop_name.upper()}")

    X = data['window_preds']
    y = data['actuals']
    context = data['context']

    # Combine window predictions + context
    X_full = np.hstack([X, context.values])

    # Baseline: simple average of the window predictions
    baseline_preds = np.mean(X, axis=1)
    baseline_rmse = np.sqrt(mean_squared_error(y, baseline_preds))

    # One shared hyperparameter set so the fold models and the final model
    # cannot drift apart.
    lgb_params = dict(
        n_estimators=100,
        learning_rate=0.05,
        max_depth=5,
        num_leaves=31,
        random_state=42,
        verbose=-1,
    )

    # 5-fold out-of-fold predictions: every sample is scored by a model that
    # never saw it. Fold models are only needed for scoring, so they are not
    # retained.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(y))
    for train_idx, val_idx in kf.split(X_full):
        fold_model = lgb.LGBMRegressor(**lgb_params)
        fold_model.fit(X_full[train_idx], y[train_idx])
        oof_preds[val_idx] = fold_model.predict(X_full[val_idx])

    oof_rmse = np.sqrt(mean_squared_error(y, oof_preds))
    improvement = ((baseline_rmse - oof_rmse) / baseline_rmse) * 100

    print(f"  Baseline RMSE: {baseline_rmse:.3f}")
    print(f"  OOF RMSE:      {oof_rmse:.3f}")
    print(f"  Improvement:   {improvement:+.1f}%")

    # Train final model on all data
    final_model = lgb.LGBMRegressor(**lgb_params)
    final_model.fit(X_full, y)

    return {
        'model': final_model,
        'baseline_rmse': baseline_rmse,
        'oof_rmse': oof_rmse,
        'improvement_pct': improvement
    }

# Fit one meta-model per prop, skipping any prop with fewer than 100
# collected samples, then print a per-prop summary.
banner = "="*70
print(banner)
print("TRAINING META-LEARNER")
print(banner)

meta_models = {}
results = {}

for prop_name, data in prop_data.items():
    n_samples = len(data['actuals'])
    if n_samples >= 100:
        result = train_meta_model(prop_name, data)
        meta_models[prop_name] = result['model']
        results[prop_name] = result
    else:
        print(f"\nSkipping {prop_name}: not enough samples")

print("\n" + banner)
print("TRAINING COMPLETE")
print(banner)
for prop, res in results.items():
    print(f"{prop:12s}: {res['improvement_pct']:+.1f}% improvement")

## 7. Save Meta-Learner

In [None]:
# Bundle the trained models with their metrics and sample counts, pickle the
# bundle into the working directory, and print the follow-up steps.
sample_counts = {
    name: len(collected['actuals']) for name, collected in prop_data.items()
}
meta_learner = dict(
    meta_models=meta_models,
    n_windows=27,
    results=results,
    training_season='2024-2025',
    training_samples=sample_counts,
)

output_file = "meta_learner_2025_2026.pkl"
with open(output_file, 'wb') as out:
    pickle.dump(meta_learner, out)

for line in (
    f"✅ Saved: {output_file}",
    "\nNext steps:",
    f"1. Download {output_file} from Output tab",
    f"2. Upload to Modal: modal volume put nba-models {output_file}",
    "3. Run analyzer: modal run modal_analyzer.py",
):
    print(line)