# NBA Meta-Learner Training (Kaggle)

Train a context-aware meta-learner that combines the predictions of 27 window models.

**Requirements:**
- GPU: T4 or P100 (free on Kaggle)
- Dataset: Historical NBA Data and Player Box Scores (Kaggle)
- Window models: download from Modal, then upload them as a Kaggle dataset (see section 3)

**Steps:**
1. Install dependencies
2. Load 27 window models from Kaggle dataset
3. Load PlayerStatistics.csv and filter it to the 2024-2025 season
4. Collect window predictions for each prop
5. Train meta-learner with OOF cross-validation
6. Download meta_learner_2025_2026.pkl
7. Upload to Modal: `modal volume put nba-models meta_learner_2025_2026.pkl`

## 1. Setup & Dependencies

In [None]:
# Install the third-party packages for this notebook; -q keeps pip's output quiet.
# NOTE: torch is imported in the next cell but is not installed here, so it must
# already be present in the runtime image.
!pip install -q pytorch-tabnet lightgbm scikit-learn pandas numpy

In [None]:
import sys
import os
from pathlib import Path
import numpy as np
import pandas as pd
import pickle
import torch

# Report the PyTorch build, then describe CUDA device 0 when one is available.
torch_version = torch.__version__
print(f"PyTorch version: {torch_version}")

cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")

if cuda_ready:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")
    # total_memory is reported in bytes; show it in decimal gigabytes.
    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"VRAM: {vram_gb:.1f}GB")

## 2. Load Project Files

**Option A**: Upload as Kaggle dataset
- Create dataset with: ensemble_predictor.py, meta_learner_ensemble.py, hybrid_multi_task.py, etc.
- Add as dataset dependency

**Option B**: Copy-paste code into cells (simpler for first run)

In [None]:
# Make the project modules (ensemble_predictor.py, meta_learner_ensemble.py, ...)
# importable. This cell implements Option A: it expects the files to be mounted
# from a Kaggle dataset at the path below.
project_code_dir = '/kaggle/input/nba-predictor-code/'

if not os.path.isdir(project_code_dir):
    # Warn here; otherwise the first symptom is a ModuleNotFoundError in a later cell.
    print(f"[!] Project code directory not found: {project_code_dir}")

# Notebook cells get re-run; do not stack duplicate entries on sys.path.
if project_code_dir not in sys.path:
    sys.path.insert(0, project_code_dir)

## 3. Load Window Models

**Upload window models to Kaggle dataset:**
1. Download from Modal: `modal volume get nba-models model_cache/`
2. Create Kaggle dataset "nba-window-models" with all .pkl files
3. Add as dataset dependency

In [None]:
from ensemble_predictor import load_all_window_models

# Path to window models (uploaded as Kaggle dataset)
model_cache_dir = "/kaggle/input/nba-window-models/"

# The meta-learner in section 6 is constructed with n_windows=27.
expected_window_count = 27

print("Loading 27 window models...")
window_models = load_all_window_models(model_cache_dir)

# Fail fast: with no models every later step would silently produce nothing.
if not window_models:
    raise RuntimeError(f"No window models found in {model_cache_dir}")

print(f"✓ Loaded {len(window_models)} windows")
if len(window_models) != expected_window_count:
    print(f"[!] Expected {expected_window_count} windows, got {len(window_models)}")

# Show the first few window names as a sanity check.
for window_name in list(window_models)[:5]:
    print(f"  - {window_name}")

## 4. Load Training Data (2024-2025 Season)

In [None]:
# Load PlayerStatistics.csv from Kaggle dataset
csv_path = "/kaggle/input/historical-nba-data-and-player-box-scores/PlayerStatistics.csv"

print(f"Loading {csv_path}...")
df = pd.read_csv(csv_path, low_memory=False)
print(f"Total records: {len(df):,}")

# Parse gameDate (mixed formats), normalise to UTC, then drop the timezone so
# the column is plain naive datetimes.
df['gameDate'] = pd.to_datetime(df['gameDate'], format='mixed', utc=True)
df['gameDate'] = df['gameDate'].dt.tz_localize(None)
df['year'] = df['gameDate'].dt.year
df['month'] = df['gameDate'].dt.month

# NBA season: Oct-June. Games played before October belong to the season that
# started in the previous calendar year. Vectorised: a row-wise apply() over
# the full history is far slower for the same result.
season_start_month = 10
df['season_year'] = df['year'] - (df['month'] < season_start_month).astype(int)

# Filter to 2024-2025 season. copy() detaches the slice so later column
# assignments do not trigger SettingWithCopyWarning.
training_season = "2024-2025"
season_start_year = 2024
df = df[df['season_year'] == season_start_year].copy()

if df.empty:
    raise ValueError(f"No records found for the {training_season} season in {csv_path}")

print(f"Filtered to {training_season}: {len(df):,} records")
print(f"Date range: {df['gameDate'].min()} to {df['gameDate'].max()}")

## 5. Collect Window Predictions

In [None]:
from ensemble_predictor import predict_with_window

def _value_or_default(row, key, default):
    """Return ``row[key]``, or ``default`` when the key is absent or the value is NaN/None."""
    value = row.get(key, default)
    return default if pd.isna(value) else value


def collect_predictions(prop_name: str, sample_size: int = 5000,
                        n_windows: int = 27, min_valid_windows: int = 20):
    """Collect window predictions, player context and actuals for one prop.

    Samples up to ``sample_size`` rows from the module-level ``df`` (fixed
    seed 42), asks every entry of the module-level ``window_models`` for a
    prediction of ``prop_name`` and keeps the rows for which enough windows
    produced a usable value.

    Args:
        prop_name: One of 'points', 'rebounds', 'assists', 'threes'.
        sample_size: Maximum number of rows to sample from ``df``.
        n_windows: Length of each prediction vector. Shorter vectors are
            padded with their own mean, longer ones are truncated.
        min_valid_windows: Minimum number of windows that must return a
            finite prediction for a row to be kept.

    Returns:
        ``None`` when ``prop_name`` is unknown or fewer than 100 rows were
        collected. Otherwise a dict with 'window_predictions' (2-D array,
        one row per sample), 'player_context' (DataFrame) and 'actuals'
        (1-D array).
    """
    print(f"\n{'='*70}")
    print(f"COLLECTING PREDICTIONS: {prop_name.upper()}")
    print(f"{'='*70}")

    # Column mapping
    prop_col_map = {
        'points': 'points',
        'rebounds': 'reboundsTotal',
        'assists': 'assists',
        'threes': 'threePointersMade'
    }

    if prop_name not in prop_col_map:
        print(f"[!] Unknown prop: {prop_name}")
        return None

    actual_col = prop_col_map[prop_name]

    window_preds = []
    contexts = []
    actuals = []
    # window name -> number of predictions that raised; also used to print
    # only the first error per window instead of relying on the row index.
    failure_counts = {}

    # Sample games
    sample_df = df.sample(min(sample_size, len(df)), random_state=42)

    for idx, (_, game) in enumerate(sample_df.iterrows(), 1):
        actual = game.get(actual_col)
        if pd.isna(actual) or actual < 0:
            continue

        # The feature row is identical for every window, so build it once.
        # NOTE(review): these features come from the same game's box score as
        # the target -- confirm this matches what the window models expect.
        features = pd.DataFrame([{
            'fieldGoalsAttempted': game.get('fieldGoalsAttempted', 0),
            'freeThrowsAttempted': game.get('freeThrowsAttempted', 0),
            'assists': game.get('assists', 0),
            'rebounds': game.get('reboundsTotal', 0),
            'threes': game.get('threePointersMade', 0),
            'points': game.get('points', 0),
            'numMinutes': game.get('numMinutes', 0),
        }])

        # Get predictions from each window
        preds = []
        n_valid = 0
        for window_name, models in window_models.items():
            value = np.nan
            try:
                # Pass a copy so a model that mutates its input cannot affect
                # the next window.
                raw = predict_with_window(models, features.copy(), prop_name)
                if raw is not None:
                    flat = np.asarray(raw, dtype=float).ravel()
                    if flat.size:
                        value = float(flat[0])
            except Exception as e:
                # Best effort: one broken window must not abort the run.
                if window_name not in failure_counts:
                    print(f"  [!] Window {window_name} failed: {e}")
                failure_counts[window_name] = failure_counts.get(window_name, 0) + 1

            if np.isfinite(value):
                preds.append(value)
                n_valid += 1
            else:
                # Failed / missing predictions keep their slot as 0.0 so each
                # window stays at a fixed position in the vector.
                preds.append(0.0)

        # Only successful predictions count towards the threshold; the 0.0
        # placeholders above must not make a row look complete.
        if n_valid < min_valid_windows:
            continue

        # Pad to n_windows with the row mean
        if len(preds) < n_windows:
            pad_value = float(np.mean(preds))
            preds.extend([pad_value] * (n_windows - len(preds)))

        row_vector = preds[:n_windows]
        window_preds.append(row_vector)

        # Extract context (position and usage rate are fixed placeholders)
        contexts.append({
            'position_encoded': 2,
            'usage_rate': 0.20,
            'minutes_avg': _value_or_default(game, 'numMinutes', 30),
            'is_home': int(_value_or_default(game, 'home', 0)),
        })

        actuals.append(actual)

        if idx % 500 == 0:
            non_zero = sum(1 for p in row_vector if p != 0.0)
            print(f"  Processed {idx}/{len(sample_df)} games... (non-zero preds: {non_zero}/{n_windows})")

    for window_name, n_failed in failure_counts.items():
        print(f"  [!] Window {window_name}: {n_failed:,} failed predictions (stored as 0.0)")

    print(f"  ✓ Collected {len(actuals):,} samples")

    if len(actuals) < 100:
        print(f"  [!] Not enough samples")
        return None

    return {
        'window_predictions': np.array(window_preds),
        'player_context': pd.DataFrame(contexts),
        'actuals': np.array(actuals)
    }

## 6. Train Meta-Learner

In [None]:
from meta_learner_ensemble import ContextAwareMetaLearner

# Train one meta-model per prop and keep a printable summary of each run.
banner = '=' * 70

print(f"\n{banner}")
print("TRAINING META-LEARNER")
print(banner)

meta_learner = ContextAwareMetaLearner(n_windows=27, cv_folds=5)

results = {}
for prop in ('points', 'rebounds', 'assists', 'threes'):
    collected = collect_predictions(prop, sample_size=5000)

    # collect_predictions returns None when it could not build a dataset.
    if collected is None:
        results[prop] = "skipped"
        continue

    # Train with OOF
    fit_metrics = meta_learner.fit_oof(
        window_predictions=collected['window_predictions'],
        y_true=collected['actuals'],
        player_context=collected['player_context'],
        prop_name=prop,
    )

    # Pre-format the metrics so the summary loop below only prints.
    summary = {'samples': len(collected['actuals'])}
    summary['improvement_rmse'] = f"{fit_metrics['improvement_rmse_pct']:+.1f}%"
    summary['oof_rmse'] = f"{fit_metrics['oof_rmse']:.3f}"
    summary['baseline_rmse'] = f"{fit_metrics['baseline_rmse']:.3f}"
    results[prop] = summary

print(f"\n{banner}")
print("TRAINING COMPLETE")
print(banner)
print(f"Props trained: {len(meta_learner.meta_models)}")
print("\nResults:")
for prop, result in results.items():
    if not isinstance(result, dict):
        # Skipped props are stored as a plain status string.
        print(f"  {prop:12s}: {result}")
        continue
    print(f"  {prop:12s}: {result['samples']:5,} samples, {result['improvement_rmse']} improvement")

## 7. Save Meta-Learner

In [None]:
output_file = "meta_learner_2025_2026.pkl"

# Refuse to write an artifact when no prop was trained: the steps printed
# below tell the user to upload this file, and an empty one helps nobody.
if not meta_learner.meta_models:
    raise RuntimeError("No props were trained; refusing to save an empty meta-learner")

# Serialise the whole meta-learner object with pickle.
with open(output_file, 'wb') as f:
    pickle.dump(meta_learner, f)

print(f"\n✅ Saved: {output_file}")
print(f"\nNext steps:")
print(f"1. Download {output_file} from Kaggle (Output section)")
print(f"2. Upload to Modal: modal volume put nba-models {output_file}")
print(f"3. Run analyzer: modal run modal_analyzer.py")

## 8. Feature Importance Analysis (Optional)

In [None]:
# Print the ten highest-importance inputs of each trained meta-model.
top_k = 10
for prop_name, model in meta_learner.meta_models.items():
    print(f"\n{prop_name.upper()} - Top 10 Features:")

    scores = model.feature_importances_
    names = meta_learner.get_feature_names(n_windows=27)

    # Pair each name with its score, then order from most to least important.
    ranked = list(zip(names, scores))
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    for feat, imp in ranked[:top_k]:
        print(f"  {feat:30s}: {imp:7.1f}")