# 🏀 NBA Predictor - Cloud Training (FIXED)

## Steps:
1. Upload your files
2. Run training
3. Download models

**Time:** 20-30 minutes with GPU

In [None]:
# ============================================================
# STEP 1: Upload Your Data Files
# ============================================================
# Upload 2 files:
# 1. PlayerStatistics.csv.zip (39.5 MB - compressed)
# 2. priors_data.zip (Basketball Reference stats)

from google.colab import files
import os

print("üì§ Upload PlayerStatistics.csv.zip AND priors_data.zip:")
uploaded = files.upload()

# Extract both files
print("\nüì¶ Extracting files...")
if os.path.exists('PlayerStatistics.csv.zip'):
    !unzip -q PlayerStatistics.csv.zip
    !rm PlayerStatistics.csv.zip
    print("‚úÖ PlayerStatistics.csv extracted")

if os.path.exists('priors_data.zip'):
    !unzip -q priors_data.zip
    print("‚úÖ priors_data extracted")

# VERIFY FIX IMMEDIATELY (while data is fresh in memory)
print("\nüîç Verifying player data fix...")
import pandas as pd
import numpy as np

# Test the fix on a sample
ps_sample = pd.read_csv('PlayerStatistics.csv', nrows=10000, low_memory=False)
print(f"   Loaded sample: {len(ps_sample):,} rows")
print(f"   Columns: {list(ps_sample.columns)[:10]}...")

date_col = [c for c in ps_sample.columns if 'date' in c.lower()][0]
print(f"   Date column: '{date_col}'")
print(f"   Sample dates (raw): {ps_sample[date_col].head(3).tolist()}")

ps_sample[date_col] = pd.to_datetime(ps_sample[date_col], errors='coerce')
print(f"   Non-null dates: {ps_sample[date_col].notna().sum()} / {len(ps_sample)}")

# Use exact logic from train_auto.py
def _season_from_date(dt):
    if pd.api.types.is_datetime64_any_dtype(dt):
        d = dt
    else:
        d = pd.to_datetime(dt, errors="coerce", utc=False)
    y = d.dt.year
    m = d.dt.month
    return np.where(m >= 8, y + 1, y)

# Test filtering for window 2002-2006
window_seasons = [2002, 2003, 2004, 2005, 2006]
# Derive the bounds from the window itself so they cannot drift apart.
start_year = min(window_seasons)
end_year = max(window_seasons)
# The window is padded by one season on each side before filtering.
padded_seasons = set(window_seasons) | {start_year - 1, end_year + 1}

print(f"\n   Testing window: {window_seasons}")
print(f"   Padded seasons: {sorted(padded_seasons)}")

# Apply fix: compute a season per row, then force an integer dtype so that
# isin() compares ints against the int season set. Rows without a parseable
# date get the sentinel -1, which never matches a real season.
temp_seasons = pd.Series(_season_from_date(ps_sample[date_col]))
print(f"   Seasons dtype (before): {temp_seasons.dtype}")
print(f"   Sample seasons: {temp_seasons.dropna().head(10).tolist()}")

ps_sample['_temp_season'] = temp_seasons.fillna(-1).astype(int)
print(f"   Seasons dtype (after): {ps_sample['_temp_season'].dtype}")

# Filter
filtered = ps_sample[ps_sample['_temp_season'].isin(padded_seasons)]

print(f"\n   üìä RESULT: {len(ps_sample):,} rows ‚Üí {len(filtered):,} rows")

# NOTE(review): this check only sees the rows in ps_sample (a partial read
# of the CSV). If the file is ordered by date and the sampled rows fall
# outside the padded window, this reports a failure even for a valid file;
# confirm against the real file ordering.
if filtered.empty:
    print("\n   ‚ùå CRITICAL ERROR: Filtering returns 0 rows!")
    print("   Debugging info:")
    print(f"     ‚Ä¢ Available seasons in sample: {sorted(ps_sample['_temp_season'].unique())[:20]}")
    print(f"     ‚Ä¢ Looking for seasons: {sorted(padded_seasons)}")
    print(f"     ‚Ä¢ Season overlap: {set(ps_sample['_temp_season'].unique()) & padded_seasons}")
    print("\n   This PlayerStatistics.csv is INVALID or CORRUPTED!")
else:
    print(f"   ‚úÖ Player data verified! Filtering works correctly.")
    season_dist = filtered['_temp_season'].value_counts().sort_index().to_dict()
    print(f"   Season distribution: {season_dist}")

    # Only announce success when verification actually passed; printing it
    # after the error branch would contradict the message just shown.
    print("\n‚úÖ All files uploaded and ready!")

In [None]:
# ============================================================
# STEP 2: Setup & Train
# ============================================================

print("üì¶ Installing packages...")
!pip install -q nba-api kagglehub pytorch-tabnet lightgbm scikit-learn pandas numpy tqdm

print("\nüì• Downloading code...")
import os
os.chdir('/content')
!git clone https://github.com/tyriqmiles0529-pixel/meep.git
os.chdir('meep')

print("\nüìç Code version:")
!git log -1 --oneline

# Check GPU
import torch
print(f"\nüéÆ GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'Not available'}")

print("\nüöÄ Starting training (20-30 min)...\n")
!python3 train_auto.py \
    --priors-dataset /content/priors_data \
    --player-csv /content/PlayerStatistics.csv \
    --verbose \
    --fresh \
    --neural-device gpu \
    --neural-epochs 50

print("\n‚úÖ TRAINING COMPLETE!")

In [None]:
# ============================================================
# STEP 3: Download Models
# ============================================================

# Bundle the trained model directories into one archive and send it to the
# browser as a download.
from google.colab import files

# `models/` and `model_cache/` are relative paths, so this must run from the
# directory Step 2 switched into (the cloned `meep` checkout).
# -r recurses into the directories; -q suppresses per-file output.
print("üì¶ Packaging models...")
!zip -q -r nba_models_trained.zip models/ model_cache/

print("üíæ Downloading...")
files.download('nba_models_trained.zip')

print("\n‚úÖ Done! Extract nba_models_trained.zip to your local nba_predictor folder.")

---

## ❓ Troubleshooting

### "Loaded 0 player-games for window"
- Make sure you uploaded **PlayerStatistics.csv.zip** (not the uncompressed version)
- Verify file is 39.5 MB compressed

### "No GPU available"
- Runtime → Change runtime type → GPU (T4 or L4)

### "Out of memory"
- Runtime → Restart runtime
- Re-run from Step 1

---

**Version:** 2.2 (Simplified, Fixed)

**What's included:**
- Game models (moneyline, spread)
- Player models (points, rebounds, assists, 3PM, minutes)
- Ensemble (Ridge + Elo + Four Factors + LightGBM)
- Neural hybrid (TabNet + LightGBM)
- 20+ years of historical data
- Basketball Reference priors (~68 features)

**Expected output:**
```
Training window 1/5: 2002-2006
  • Loaded 245,892 player-games for window  ✅
```