# 🏀 NBA API Historical Data Fetcher

**Purpose:** Fetch thousands of historical NBA games for QEPC backtesting

**What this does:**
- Downloads games from multiple NBA seasons
- Processes data into QEPC-compatible format
- Merges with your existing data
- Gives you 5,000+ games for backtesting!

**Time:** ~10-15 minutes to run

---

## 🔧 Setup

In [None]:
# Install NBA API if needed
!pip install nba_api --quiet

print("‚úÖ NBA API installed")

In [None]:
# Setup - with fallback if notebook_context not available.
# After this cell `project_root` is always defined as a pathlib.Path and the
# libraries used by the rest of the notebook are imported.
from pathlib import Path
import sys

# Try to import notebook_context
try:
    from notebook_context import *
    context_loaded = True
    print("‚úÖ notebook_context loaded")
except ModuleNotFoundError:
    context_loaded = False
    print("‚ÑπÔ∏è  notebook_context not found, setting up manually...")

# Locate the project root manually when notebook_context is unavailable OR
# when it loaded but did not export `project_root`; the star-import gives no
# guarantee about that name, and the final print below needs it.
if not context_loaded or globals().get('project_root') is None:
    # Find project root
    current = Path.cwd()
    project_root = None

    # Search the working directory and up to three ancestors for project markers
    for candidate in [current, *list(current.parents)[:3]]:
        has_marker = (
            (candidate / "qepc").is_dir()
            or (candidate / "main.py").exists()
            or (candidate / "data").is_dir()
        )
        if has_marker:
            project_root = candidate
            print(f"   ‚úÖ Found project root: {project_root}")
            break

    if project_root is None:
        print(f"   ‚ö†Ô∏è  Using current directory: {current}")
        project_root = current

    # Add to path so project packages can be imported
    if str(project_root) not in sys.path:
        sys.path.insert(0, str(project_root))

# Later cells build paths with the `/` operator, so make sure this is a Path
# even if notebook_context exposed it as a plain string (no-op for a Path).
project_root = Path(project_root)

# Now import other libraries
from nba_api.stats.endpoints import leaguegamefinder, teamgamelogs
import pandas as pd
import time
from datetime import datetime

print(f"üìÅ Project root: {project_root}")
print("‚úÖ Imports complete")

---

## 📅 Configure Seasons to Fetch

In [None]:
# CONFIGURE: choose the seasons to download.
# Keep exactly one SEASONS_TO_FETCH assignment active; entries are season
# labels in 'YYYY-YY' form.

# Preset 1: last 3 seasons (quick - ~3,700 games)
# SEASONS_TO_FETCH = ['2021-22', '2022-23', '2023-24']

# Preset 2: last 5 seasons (more data - ~6,000 games)
# SEASONS_TO_FETCH = ['2019-20', '2020-21', '2021-22', '2022-23', '2023-24']

# Preset 3: last 10 seasons (comprehensive - ~12,000 games)
SEASONS_TO_FETCH = [
    '2014-15',
    '2015-16',
    '2016-17',
    '2017-18',
    '2018-19',
    '2019-20',
    '2020-21',
    '2021-22',
    '2022-23',
    '2023-24',
]

# Echo the selection together with rough estimates
# (2 minutes and 1230 games per season).
season_count = len(SEASONS_TO_FETCH)

print(f"üéØ Will fetch {season_count} seasons:")
for season in SEASONS_TO_FETCH:
    print(f"   ‚Ä¢ {season}")

print(f"\n‚è±Ô∏è  Estimated time: {season_count * 2} minutes")
print(f"üìä Estimated games: ~{season_count * 1230} games")

---

## 📥 Fetch Data from NBA API

In [None]:
# Download one season at a time and stack the results into `combined_games`.
# A failing season is recorded in `errors` and skipped so that the remaining
# seasons are still fetched; `combined_games` is None when nothing succeeded.
print("üì• Fetching NBA data...\n")
print("‚è±Ô∏è  This may take a few minutes - be patient!\n")

# Seconds to wait between consecutive requests (be nice to the API)
REQUEST_PAUSE_SECONDS = 2

# NOTE(review): '00' is the league id used for the NBA; passing it is meant to
# keep games from other leagues served by the same endpoint out of the result.
# Confirm against the nba_api LeagueGameFinder documentation.
NBA_LEAGUE_ID = '00'

all_games = []  # one DataFrame per successfully fetched season
errors = []     # {'season': ..., 'error': ...} for every failed season

for i, season in enumerate(SEASONS_TO_FETCH, 1):
    print(f"[{i}/{len(SEASONS_TO_FETCH)}] Fetching {season}...", end=' ')

    try:
        # Fetch regular season games
        gamefinder = leaguegamefinder.LeagueGameFinder(
            season_nullable=season,
            season_type_nullable='Regular Season',
            league_id_nullable=NBA_LEAGUE_ID,
        )
        games = gamefinder.get_data_frames()[0]

        # Add season column
        games['Season'] = season

        # Count distinct games instead of assuming exactly two rows per game
        game_count = games['GAME_ID'].nunique()

        all_games.append(games)
        print(f"‚úÖ {len(games)} rows ({game_count} games)")

    except Exception as e:
        # Deliberately broad: this is a best-effort bulk download, so any
        # failure is reported and the loop moves on to the next season.
        print(f"‚ùå Error: {e}")
        errors.append({'season': season, 'error': str(e)})

    # Pause after failures as well as successes, so a failed request is not
    # immediately followed by another one.
    if i < len(SEASONS_TO_FETCH):
        time.sleep(REQUEST_PAUSE_SECONDS)

print("\n" + "="*60)

if all_games:
    # Combine all seasons
    combined_games = pd.concat(all_games, ignore_index=True)

    print(f"‚úÖ FETCH COMPLETE!")
    print(f"   Total rows: {len(combined_games):,}")
    print(f"   Unique games: {combined_games['GAME_ID'].nunique():,}")
    print(f"   Seasons: {len(all_games)}/{len(SEASONS_TO_FETCH)}")

    if errors:
        print(f"\n‚ö†Ô∏è  Errors: {len(errors)}")
        for err in errors:
            print(f"   ‚Ä¢ {err['season']}: {err['error']}")
else:
    print("‚ùå No data fetched - check errors above")
    combined_games = None

---

## 🔍 Preview Data

In [None]:
# Quick look at the raw download: column names, first rows, dtypes, date span.
if combined_games is None:
    print("‚ùå No data to preview")
else:
    print("üìä Data Preview:\n")

    column_names = combined_games.columns.tolist()
    print(f"Columns ({len(column_names)}):")
    print(column_names)

    print(f"\nFirst few games:")
    display(combined_games.head())

    print(f"\nData Types:")
    print(combined_games.dtypes)

    print(f"\nDate Range:")
    # Side effect kept on purpose: GAME_DATE is replaced in place by its
    # datetime form (unparseable values become NaT), and later cells read
    # this column.
    parsed_dates = pd.to_datetime(combined_games['GAME_DATE'], errors='coerce')
    combined_games['GAME_DATE'] = parsed_dates
    print(f"   Earliest: {parsed_dates.min()}")
    print(f"   Latest: {parsed_dates.max()}")

---

## 🔄 Process for QEPC Format

In [None]:
# Convert the raw rows (one row per team per game) into the QEPC column layout.
# Result: `qepc_format`, a DataFrame with the same number of rows as
# `combined_games`, or None when there is nothing to process.

# Nicknames that contain a space. Splitting on the last space alone would cut
# them in half ("Portland Trail Blazers" -> "Portland Trail" / "Blazers").
_MULTI_WORD_NICKNAMES = ('Trail Blazers',)


def _split_team_name(full_name):
    """Return ``(city, nickname)`` for a full team name like 'Boston Celtics'.

    Non-string values (e.g. NaN) are passed through for both parts. A name
    without any space is returned as the city with a missing nickname.
    """
    if not isinstance(full_name, str):
        return (full_name, full_name)
    for nickname in _MULTI_WORD_NICKNAMES:
        if full_name.endswith(' ' + nickname):
            return (full_name[:-(len(nickname) + 1)], nickname)
    city, separator, nickname = full_name.rpartition(' ')
    if not separator:
        return (full_name, None)
    return (city, nickname)


def _split_team_names(names):
    """Apply `_split_team_name` to a Series; return (city, nickname) Series."""
    parts = [_split_team_name(name) for name in names]
    frame = pd.DataFrame(parts, index=names.index, columns=['city', 'nickname'])
    return frame['city'], frame['nickname']


def _opponent_lookup(games):
    """Return a frame aligned with `games` giving each row's opponent.

    Both teams of a game share one GAME_ID, so the opponent is the other row
    with the same GAME_ID. Rows without such a partner get NaN.
    Columns: 'oppTeamName', 'oppPts'.
    """
    row_ids = list(range(len(games)))
    own_side = games[['GAME_ID', 'TEAM_ID']].assign(_row=row_ids)
    other_side = games[['GAME_ID', 'TEAM_ID', 'TEAM_NAME', 'PTS']].rename(
        columns={'TEAM_ID': 'oppTeamId', 'TEAM_NAME': 'oppTeamName', 'PTS': 'oppPts'}
    )
    pairs = own_side.merge(other_side, on='GAME_ID', how='inner')
    # Drop the self-match, then keep a single opponent per original row
    pairs = pairs[pairs['TEAM_ID'] != pairs['oppTeamId']]
    pairs = pairs.drop_duplicates(subset='_row').set_index('_row')
    opponents = pairs[['oppTeamName', 'oppPts']].reindex(row_ids)
    opponents.index = games.index
    return opponents


def _column_or_zero(frame, column):
    """Return `frame[column]` when it exists, otherwise the placeholder 0."""
    return frame[column] if column in frame.columns else 0


if combined_games is not None:
    print("üîÑ Processing for QEPC format...\n")

    # Convert date
    combined_games['gameDate'] = pd.to_datetime(combined_games['GAME_DATE'], errors='coerce')

    # Separate home and away games.
    # regex=False: 'vs.' is a literal marker; as a regex the '.' would match
    # any character.
    is_home = combined_games['MATCHUP'].str.contains('vs.', na=False, regex=False)
    is_away = combined_games['MATCHUP'].str.contains('@', na=False, regex=False)
    home_games = combined_games[is_home].copy()
    away_games = combined_games[is_away].copy()

    print(f"   Home game rows: {len(home_games):,}")
    print(f"   Away game rows: {len(away_games):,}")

    team_city, team_name = _split_team_names(combined_games['TEAM_NAME'])

    # Opponent details come from the partner row of the same game
    opponents = _opponent_lookup(combined_games)
    opponent_city, opponent_name = _split_team_names(opponents['oppTeamName'])

    # Create QEPC-compatible format
    # This matches your TeamStatistics.csv structure
    qepc_format = pd.DataFrame({
        'gameDate': combined_games['gameDate'],
        'teamCity': team_city,
        'teamName': team_name,
        'opponentTeamCity': opponent_city,
        'opponentTeamName': opponent_name,
        'teamScore': combined_games['PTS'],
        'opponentScore': opponents['oppPts'],
        'reboundsTotal': combined_games['REB'],
        'assists': combined_games['AST'],
        'threePointersMade': combined_games['FG3M'],
        'threePointersAttempted': combined_games['FG3A'],
        'blocks': combined_games['BLK'],
        'steals': combined_games['STL'],
        'fieldGoalsAttempted': combined_games['FGA'],
        'fieldGoalsMade': combined_games['FGM'],
        'fieldGoalsPercentage': combined_games['FG_PCT'],
        'threePointersPercentage': combined_games['FG3_PCT'],
        'freeThrowsAttempted': combined_games['FTA'],
        'freeThrowsMade': combined_games['FTM'],
        'freeThrowsPercentage': combined_games['FT_PCT'],
        # Use the split rebound columns when the download provides them
        'reboundsDefensive': _column_or_zero(combined_games, 'DREB'),
        'reboundsOffensive': _column_or_zero(combined_games, 'OREB'),
        'foulsPersonal': combined_games['PF'],
        'turnovers': combined_games['TOV'],
        # TODO: still placeholders. Deriving a running record from WL needs a
        # decision on whether it includes the current game.
        'seasonWins': 0,
        'seasonLosses': 0,
        'win': (combined_games['WL'] == 'W').astype(int),
        'home': is_home.astype(int),
        'plusMinusPoints': combined_games['PLUS_MINUS'],
        'gameId': combined_games['GAME_ID'],
        'teamId': combined_games['TEAM_ID'],
        'Season': combined_games['Season']
    })

    print(f"\n‚úÖ Created QEPC format: {len(qepc_format):,} rows")
    print(f"   Columns: {len(qepc_format.columns)}")

    # Preview
    print(f"\nüìä QEPC Format Preview:")
    display(qepc_format.head())

else:
    print("‚ùå No data to process")
    qepc_format = None

---

## 💾 Save Data

In [None]:
# Write the raw download, the QEPC-formatted table and a short text summary
# to <project_root>/data/historical.
if combined_games is not None and qepc_format is not None:

    # Create historical data folder
    historical_dir = project_root / "data" / "historical"
    historical_dir.mkdir(parents=True, exist_ok=True)

    # Save raw NBA API data
    raw_path = historical_dir / "NBA_API_Raw_Data.csv"
    combined_games.to_csv(raw_path, index=False)
    print(f"üíæ Saved raw data: {raw_path}")
    print(f"   Size: {raw_path.stat().st_size / 1024 / 1024:.1f} MB")

    # Save QEPC-formatted data
    qepc_path = historical_dir / "NBA_API_QEPC_Format.csv"
    qepc_format.to_csv(qepc_path, index=False)
    print(f"üíæ Saved QEPC format: {qepc_path}")
    print(f"   Size: {qepc_path.stat().st_size / 1024 / 1024:.1f} MB")

    # Report the seasons that are actually present in the data; the requested
    # list may contain seasons whose download failed.
    if 'Season' in combined_games.columns:
        seasons_in_data = [str(label) for label in combined_games['Season'].unique()]
    else:
        seasons_in_data = list(SEASONS_TO_FETCH)

    # Save summary.
    # Explicit UTF-8: the text contains non-ASCII bullet characters, which the
    # platform default encoding may be unable to write.
    summary_path = historical_dir / "NBA_API_Summary.txt"
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write(f"NBA API Historical Data Fetch\n")
        f.write(f"Generated: {datetime.now()}\n")
        f.write(f"\nSeasons Fetched: {', '.join(seasons_in_data)}\n")
        f.write(f"Total Rows: {len(combined_games):,}\n")
        f.write(f"Unique Games: {combined_games['GAME_ID'].nunique():,}\n")
        f.write(f"Date Range: {combined_games['GAME_DATE'].min()} to {combined_games['GAME_DATE'].max()}\n")
        f.write(f"\nFiles Created:\n")
        f.write(f"  ‚Ä¢ NBA_API_Raw_Data.csv\n")
        f.write(f"  ‚Ä¢ NBA_API_QEPC_Format.csv\n")

    print(f"üíæ Saved summary: {summary_path}")

    print(f"\n‚úÖ All data saved to: {historical_dir}")

else:
    print("‚ùå No data to save")

---

## 🔗 Merge with Existing Data (Optional)

In [None]:
# Append the newly fetched rows to an existing TeamStatistics.csv (if any) and
# write the result to TeamStatistics_Extended.csv next to it.
existing_path = project_root / "data" / "raw" / "TeamStatistics.csv"

if existing_path.exists() and qepc_format is not None:
    print("üîó Merging with existing data...\n")

    # Load existing
    existing = pd.read_csv(existing_path)
    print(f"   Existing data: {len(existing):,} rows")

    # Combine (existing rows first, so they win when duplicates are dropped)
    merged = pd.concat([existing, qepc_format], ignore_index=True)
    print(f"   Combined: {len(merged):,} rows")

    # Remove duplicates (based on game ID and team)
    if 'gameId' in merged.columns and 'teamId' in merged.columns:
        # Compare the ids numerically rather than as raw values.
        # NOTE(review): ids read back from CSV are presumably integers while
        # the freshly fetched ones may be zero-padded strings; a raw comparison
        # would then never see the same game as a duplicate. Verify against the
        # real TeamStatistics.csv.
        dedup_keys = merged[['gameId', 'teamId']].apply(pd.to_numeric, errors='coerce')
        # Rows with a missing/unparseable id are never treated as duplicates,
        # otherwise they would all collapse into a single row.
        is_repeat = dedup_keys.notna().all(axis=1) & dedup_keys.duplicated(keep='first')
        merged = merged[~is_repeat]
        print(f"   After dedup: {len(merged):,} rows")

    # Save merged version
    merged_path = project_root / "data" / "raw" / "TeamStatistics_Extended.csv"
    merged.to_csv(merged_path, index=False)

    print(f"\n‚úÖ Merged data saved: {merged_path}")
    print(f"   Total games: {len(merged) // 2:,}")
    print(f"   Size: {merged_path.stat().st_size / 1024 / 1024:.1f} MB")

    print(f"\nüéØ You can now use TeamStatistics_Extended.csv for backtesting!")

else:
    print("‚ÑπÔ∏è  No existing data to merge, or no new data fetched")
    print("   You can use NBA_API_QEPC_Format.csv directly")

---

## 📊 Summary & Next Steps

In [None]:
# Final report: what was downloaded, which files exist, and what to do next.
print("="*60)
print("üìä FETCH SUMMARY")
print("="*60)

if combined_games is not None:
    # Parse the dates here rather than relying on an earlier cell having
    # converted GAME_DATE; calling .date() on a plain string would fail.
    game_dates = pd.to_datetime(combined_games['GAME_DATE'], errors='coerce')
    first_day = game_dates.min()
    last_day = game_dates.max()

    # Count the seasons present in the data, not merely the ones requested
    if 'Season' in combined_games.columns:
        seasons_fetched = combined_games['Season'].nunique()
    else:
        seasons_fetched = len(SEASONS_TO_FETCH)

    games_fetched = combined_games['GAME_ID'].nunique()

    print(f"\n‚úÖ Successfully fetched NBA data!")
    print(f"\nüìà What you got:")
    print(f"   ‚Ä¢ Seasons: {seasons_fetched}")
    print(f"   ‚Ä¢ Total games: {games_fetched:,}")
    print(f"   ‚Ä¢ Date range: {first_day.date()} to {last_day.date()}")

    print(f"\nüìÅ Files created:")
    print(f"   ‚Ä¢ data/historical/NBA_API_Raw_Data.csv")
    print(f"   ‚Ä¢ data/historical/NBA_API_QEPC_Format.csv")
    # Mention the merged file only if the merge step ran in this session and
    # its output exists; looked up via globals() so that skipping the merge
    # cell does not raise a NameError here.
    merged_file = globals().get('merged_path')
    if merged_file is not None and merged_file.exists():
        print(f"   ‚Ä¢ data/raw/TeamStatistics_Extended.csv (merged)")

    print(f"\nüöÄ Next Steps:")
    print(f"   1. Open: notebooks/01_core/qepc_backtest_quick_fix.ipynb")
    print(f"   2. Update file path to use TeamStatistics_Extended.csv")
    print(f"   3. Set backtest dates within your new range")
    print(f"   4. Run backtest on thousands of games!")

    print(f"\nüí° Example backtest ranges:")
    for season in SEASONS_TO_FETCH:
        # '2021-22' -> 2021; the season is shown as Oct of that year to Apr of the next
        year = int(season.split('-')[0])
        print(f"   ‚Ä¢ {season}: Oct {year} - Apr {year+1}")

    print(f"\nüéØ You now have {games_fetched:,} games for validation!")

else:
    print(f"\n‚ùå Fetch failed - check errors above")
    print(f"\nTroubleshooting:")
    print(f"   ‚Ä¢ Check internet connection")
    print(f"   ‚Ä¢ Verify nba_api is installed")
    print(f"   ‚Ä¢ Try fetching fewer seasons")
    print(f"   ‚Ä¢ Check if API is down")

---

## 🎉 Success!

You now have historical NBA data ready for QEPC backtesting!

**What changed:**
- Before: 278 games (one season)
- After: 3,000+ games with the 3-season option, and roughly 12,000 with the default 10-season list
- Result: at least 10x more data for validation!

**Next:**
1. Use TeamStatistics_Extended.csv in backtests
2. Test QEPC on different seasons
3. Compare accuracy across years
4. Calibrate based on results

---