# 🔄 Build Pregame Player Prediction Data



**What We're Building:**
- **Historical Base:** 4 complete seasons (2021-22 through 2024-25)
- **Current Season:** 2025-26 games through TODAY (Oct-Dec 2025)
- **Auto-Refresh:** Updates weekly with new games

**Result:** ~120k records, always current, super relevant!

---

---

In [None]:
import pandas as pd
from pathlib import Path
from datetime import datetime
import time

# Banner for this cell's output.
print("Pre-Game Prediction Data Preparation")
print("=" * 70)

# Project root and the data directory that the later steps read from / write to.
BASE_DIR = Path(r"C:\Users\wdors\qepc_project\experimental\CLAUDE_REWRITE")
DATA_DIR = BASE_DIR / "data"

# Step 1: Get today's games
# Pulls the live scoreboard and flattens every game into one matchup row
# (ids and team names for both sides, stamped with the local date).
# Any failure leaves df_games as None so the later steps can detect it.
print("\n1. Fetching today's games...")
try:
    from nba_api.live.nba.endpoints import scoreboard

    sb = scoreboard.ScoreBoard()
    games = sb.games.get_dict()
    print(f"   âœ“ Found {len(games)} games today")

    # One dict per game; missing keys fall back to an empty string.
    today_games = [
        {
            'GAME_ID': game.get('gameId', ''),
            'GAME_DATE': datetime.now().strftime('%Y-%m-%d'),
            'HOME_TEAM': game.get('homeTeam', {}).get('teamName', ''),
            'AWAY_TEAM': game.get('awayTeam', {}).get('teamName', ''),
            'HOME_TEAM_ID': game.get('homeTeam', {}).get('teamId', ''),
            'AWAY_TEAM_ID': game.get('awayTeam', {}).get('teamId', ''),
        }
        for game in games
    ]

    df_games = pd.DataFrame(today_games)
    print(f"   âœ“ Formatted {len(df_games)} matchups")
except Exception as e:
    # Best effort: report the problem and signal "no games" downstream.
    print(f"   âš  Error: {e}")
    df_games = None

# Step 2: Get current season stats
# Downloads the league-wide player stats table for the 2025-26 regular
# season; df_season is None when the request (or the import) fails.
print("\n2. Fetching current season player stats...")
try:
    from nba_api.stats.endpoints import leaguedashplayerstats

    season_query = {
        'season': '2025-26',
        'season_type_all_star': 'Regular Season',
    }
    stats = leaguedashplayerstats.LeagueDashPlayerStats(**season_query)

    # The first frame returned by the endpoint is the one kept here.
    season_frames = stats.get_data_frames()
    df_season = season_frames[0]

    print(f"   âœ“ Loaded stats for {len(df_season):,} players")
except Exception as e:
    print(f"   âš  Error: {e}")
    df_season = None

# Step 3: Load historical player profiles
# Reads the multi-season game log, converts the stat columns to numbers and
# averages each player's per-game stats into CAREER_AVG_* columns.
#
# Results (module-level names used by later steps):
#   df_hist            parsed game log; an empty PLAYER_ID/PLAYER_NAME table
#                      when the file could not be loaded
#   player_career_avg  per-player averages indexed by PLAYER_ID, or None
print("\n3. Loading historical player profiles...")
hist_path = DATA_DIR / "raw" / "player_logs" / "all_seasons.csv"

# Safe defaults: the name back-fill in Step 5 reads df_hist, so it must
# always exist and always carry these two columns, even after a failure.
df_hist = pd.DataFrame(columns=['PLAYER_ID', 'PLAYER_NAME'])
player_career_avg = None

try:
    raw_hist = pd.read_csv(hist_path, on_bad_lines='skip')

    first_header = str(raw_hist.columns[0])
    if raw_hist.shape[1] == 1 and ',' in first_header:
        # The file was read as ONE string column whose header and values
        # are themselves comma-separated records: split them back apart
        # and keep only the rows with the expected number of fields.
        actual_columns = first_header.split(',')
        split_rows = raw_hist.iloc[:, 0].dropna().astype(str).str.split(',')
        data = [fields for fields in split_rows if len(fields) == len(actual_columns)]
        parsed_hist = pd.DataFrame(data, columns=actual_columns)
    else:
        # Already a regular multi-column CSV: use it as parsed.
        parsed_hist = raw_hist

    # Convert numeric columns (unparseable values become NaN)
    numeric_cols = ['MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT',
                    'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST',
                    'TOV', 'STL', 'BLK', 'PTS', 'PLUS_MINUS']

    for col in numeric_cols:
        if col in parsed_hist.columns:
            parsed_hist[col] = pd.to_numeric(parsed_hist[col], errors='coerce')

    # Publish the frame only once it is fully parsed, so a failure above
    # never leaves a half-processed df_hist behind.
    df_hist = parsed_hist
    print(f"   âœ“ Loaded {len(df_hist):,} historical records")

    # Calculate career averages by player: unweighted mean of the per-game
    # values, rounded to 2 decimals, columns renamed to CAREER_AVG_<stat>.
    career_stats = ['PTS', 'REB', 'AST', 'FG_PCT', 'FG3_PCT', 'FT_PCT',
                    'MIN', 'STL', 'BLK', 'TOV']
    player_career_avg = (
        df_hist.groupby('PLAYER_ID')[career_stats]
        .mean()
        .round(2)
        .add_prefix('CAREER_AVG_')
    )

    print(f"   âœ“ Calculated career averages for {len(player_career_avg)} players")

except Exception as e:
    # Best effort: report and continue without career averages.
    print(f"   âš  Error: {e}")
    player_career_avg = None

# Step 4: Get player rosters
# Fetches the 2025-26 roster of every team that plays today and collects one
# row per player (PLAYER_ID, PLAYER_NAME, TEAM_ID) into df_rosters.
# df_rosters is None only when the whole step fails (e.g. import error).
print("\n4. Building player rosters for today's games...")
try:
    from nba_api.stats.endpoints import commonteamroster

    all_players = []

    if df_games is not None:
        unique_teams = set(df_games['HOME_TEAM_ID'].tolist() + df_games['AWAY_TEAM_ID'].tolist())
        # '' is the placeholder Step 1 stores when a game has no team id;
        # there is no roster to request for it.
        unique_teams.discard('')

        for team_id in unique_teams:
            try:
                roster = commonteamroster.CommonTeamRoster(team_id=team_id, season='2025-26')
                roster_df = roster.get_data_frames()[0]
            except Exception as e:
                # Keep going with the other teams, but say which one failed
                # instead of silently dropping its players.
                print(f"   Warning: could not load roster for team {team_id}: {e}")
            else:
                # The CommonTeamRoster frame names the player-name column
                # PLAYER; fall back to PLAYER_NAME in case another schema
                # is returned.
                name_col = 'PLAYER' if 'PLAYER' in roster_df.columns else 'PLAYER_NAME'

                for _, player in roster_df.iterrows():
                    all_players.append({
                        'PLAYER_ID': player.get('PLAYER_ID', ''),
                        'PLAYER_NAME': player.get(name_col, ''),
                        'TEAM_ID': team_id,
                    })
            finally:
                # Throttle between requests, including after a failure.
                time.sleep(0.3)

    # Explicit columns so an empty result still has the schema Step 5 merges on.
    df_rosters = pd.DataFrame(all_players, columns=['PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID'])
    print(f"   âœ“ Loaded {len(df_rosters)} players")

except Exception as e:
    print(f"   âš  Error: {e}")
    df_rosters = None

# Step 5: Combine everything
# Builds one row per (player, game) for today's slate, attaches career
# averages when available, back-fills missing names from the historical log
# and writes the result to data/live/pregame_prediction_data.csv.
print("\n5. Combining all data...")

# An empty roster frame has nothing to merge (and may lack TEAM_ID entirely),
# so it is treated the same as a missing one.
if df_rosters is not None and df_games is not None and not df_rosters.empty:
    game_keys = df_games[['GAME_ID', 'HOME_TEAM_ID', 'AWAY_TEAM_ID', 'GAME_DATE']]

    # Players whose team is the home side of a game today.
    df_home = df_rosters.merge(game_keys, left_on='TEAM_ID', right_on='HOME_TEAM_ID', how='inner')
    df_home['MATCHUP_TYPE'] = 'HOME'

    # Players whose team is the away side of a game today.
    df_away = df_rosters.merge(game_keys, left_on='TEAM_ID', right_on='AWAY_TEAM_ID', how='inner')
    df_away['MATCHUP_TYPE'] = 'AWAY'

    df_pregame = pd.concat([df_home, df_away], ignore_index=True)

    # Every lookup below is keyed by PLAYER_ID as text; normalise it once,
    # whether or not career averages are available.
    df_pregame['PLAYER_ID'] = df_pregame['PLAYER_ID'].astype(str)

    # Add career averages
    if player_career_avg is not None:
        career_avg_reset = player_career_avg.reset_index()
        career_avg_reset['PLAYER_ID'] = career_avg_reset['PLAYER_ID'].astype(str)

        df_pregame = df_pregame.merge(career_avg_reset, on='PLAYER_ID', how='left')

    print(f"   âœ“ Combined {len(df_pregame)} player matchups")

    # Blank names count as missing (the roster step stores '' when it has no
    # name); both are written to the CSV as an empty field.
    df_pregame['PLAYER_NAME'] = df_pregame['PLAYER_NAME'].mask(df_pregame['PLAYER_NAME'] == '')

    # Fill in missing PLAYER_NAME from historical data
    try:
        hist_source = df_hist
    except NameError:
        # The historical log was never loaded in this session.
        hist_source = None

    hist_usable = (
        hist_source is not None
        and {'PLAYER_ID', 'PLAYER_NAME'}.issubset(hist_source.columns)
    )

    if hist_usable and df_pregame['PLAYER_NAME'].isna().any():
        print("   Filling in missing player names...")

        # One name per player id (first spelling seen) so the lookup can
        # never multiply the pregame rows.
        player_name_lookup = (
            hist_source[['PLAYER_ID', 'PLAYER_NAME']]
            .dropna()
            .drop_duplicates(subset='PLAYER_ID')
        )
        name_by_id = dict(zip(player_name_lookup['PLAYER_ID'].astype(str),
                              player_name_lookup['PLAYER_NAME']))

        df_pregame['PLAYER_NAME'] = df_pregame['PLAYER_NAME'].fillna(
            df_pregame['PLAYER_ID'].map(name_by_id)
        )

        print(f"   âœ“ Filled {df_pregame['PLAYER_NAME'].notna().sum()} player names")

    # Save (create data/live on first run)
    output_path = DATA_DIR / "live" / "pregame_prediction_data.csv"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df_pregame.to_csv(output_path, index=False)

    print(f"\n" + "=" * 70)
    print(f"âœ“ SAVED: {output_path}")
    print(f"=" * 70)
    print(f"âœ“ Players: {len(df_pregame)}")
    print(f"âœ“ Games: {df_pregame['GAME_ID'].nunique()}")
    print(f"âœ“ Ready for quantum predictions!")

else:
    print("Could not combine data - missing rosters or games")

In [None]:
import pandas as pd
from pathlib import Path

# Project root and data directory (redefined so the cell can run on its own).
BASE_DIR = Path(r"C:\Users\wdors\qepc_project\experimental\CLAUDE_REWRITE")
DATA_DIR = BASE_DIR / "data"

print("Filling in missing player names...")
print("=" * 70)

# Load the pregame file
pregame_path = DATA_DIR / "live" / "pregame_prediction_data.csv"
df_pregame = pd.read_csv(pregame_path)

names_before = df_pregame['PLAYER_NAME'].notna().sum()
print(f"\nBefore: {names_before} names out of {len(df_pregame)}")

# Look the missing names up in the current-season league stats table,
# matching on PLAYER_ID compared as text. Names already present are kept.
try:
    from nba_api.stats.endpoints import leaguedashplayerstats

    print("Fetching current season player names from NBA API...")
    stats = leaguedashplayerstats.LeagueDashPlayerStats(
        season='2025-26',
        season_type_all_star='Regular Season',
    )
    df_season = stats.get_data_frames()[0]

    # id -> name table built from the current season
    player_names = df_season[['PLAYER_ID', 'PLAYER_NAME']].drop_duplicates()
    player_names['PLAYER_ID'] = player_names['PLAYER_ID'].astype(str)

    print(f"   âœ“ Got names for {len(player_names)} current season players")

    # When an id appears more than once the last name wins.
    name_by_id = player_names.set_index('PLAYER_ID')['PLAYER_NAME'].to_dict()

    # Only rows without a name receive the looked-up value.
    df_pregame['PLAYER_ID'] = df_pregame['PLAYER_ID'].astype(str)
    looked_up_names = df_pregame['PLAYER_ID'].map(name_by_id)
    df_pregame['PLAYER_NAME'] = df_pregame['PLAYER_NAME'].fillna(looked_up_names)

    names_after = df_pregame['PLAYER_NAME'].notna().sum()
    print(f"After: {names_after} names out of {len(df_pregame)}")

    # Write the result back over the same file.
    df_pregame.to_csv(pregame_path, index=False)

    print(f"\nâœ“ Saved with all player names filled in!")
    print(f"\nSample data:")
    sample_cols = ['PLAYER_ID', 'PLAYER_NAME', 'MATCHUP_TYPE']
    print(df_pregame[sample_cols].head(10))

except Exception as e:
    print(f"Error: {e}")

In [None]:
import pandas as pd
from pathlib import Path

# Project root and data directory (redefined so the cell can run on its own).
BASE_DIR = Path(r"C:\Users\wdors\qepc_project\experimental\CLAUDE_REWRITE")
DATA_DIR = BASE_DIR / "data"

print("Filling in player names...")
print("=" * 70)

# Load the roster file
roster_path = DATA_DIR / "raw" / "schedule" / "league_roster_2025-26.csv"
df_roster = pd.read_csv(roster_path)

# The roster may not have a PLAYER_NAME column yet; count that as zero names.
names_before = df_roster['PLAYER_NAME'].notna().sum() if 'PLAYER_NAME' in df_roster.columns else 0
print(f"\nBefore: {names_before} names out of {len(df_roster)}")

# Get player names from league stats and merge them into the roster by
# PLAYER_ID. The API name is preferred; a name the roster already had is kept
# whenever the API has no entry for that player.
try:
    from nba_api.stats.endpoints import leaguedashplayerstats

    print("Fetching current season player stats...")
    stats = leaguedashplayerstats.LeagueDashPlayerStats(
        season='2025-26',
        season_type_all_star='Regular Season'
    )
    df_stats = stats.get_data_frames()[0]

    # Create lookup
    player_names = df_stats[['PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID']].copy()
    player_names['PLAYER_ID'] = player_names['PLAYER_ID'].astype(int)

    print(f"   âœ“ Got {len(player_names)} player records")

    # One name per id, so the left merge cannot multiply roster rows.
    api_names = player_names[['PLAYER_ID', 'PLAYER_NAME']].drop_duplicates(subset='PLAYER_ID')

    # Merge names into roster
    df_roster['PLAYER_ID'] = df_roster['PLAYER_ID'].astype(int)
    df_roster = df_roster.merge(
        api_names,
        on='PLAYER_ID',
        how='left',
        suffixes=('_OLD', '')
    )

    # Coalesce before dropping the old column: players missing from the
    # league stats keep the name the roster already had.
    if 'PLAYER_NAME_OLD' in df_roster.columns:
        df_roster['PLAYER_NAME'] = df_roster['PLAYER_NAME'].fillna(df_roster['PLAYER_NAME_OLD'])
        df_roster = df_roster.drop('PLAYER_NAME_OLD', axis=1)

    print(f"After: {df_roster['PLAYER_NAME'].notna().sum()} names out of {len(df_roster)}")

    # Save
    df_roster.to_csv(roster_path, index=False)

    print(f"\nâœ“ Updated and saved!")
    print(f"\nSample:")
    print(df_roster[['PLAYER_ID', 'PLAYER_NAME', 'POSITION', 'TEAM_ID']].head(10).to_string())

except Exception as e:
    print(f"Error: {e}")

In [None]:
import pandas as pd
from pathlib import Path

# Project root and data directory (redefined so the cell can run on its own).
BASE_DIR = Path(r"C:\Users\wdors\qepc_project\experimental\CLAUDE_REWRITE")
DATA_DIR = BASE_DIR / "data"

print("Filling remaining player names from historical data...")
print("=" * 70)

# Load the roster file
roster_path = DATA_DIR / "raw" / "schedule" / "league_roster_2025-26.csv"
df_roster = pd.read_csv(roster_path)

print(f"\nMissing: {df_roster['PLAYER_NAME'].isna().sum()} players")

# Load historical data
hist_path = DATA_DIR / "raw" / "player_logs" / "all_seasons.csv"
raw_hist = pd.read_csv(hist_path, on_bad_lines='skip')

first_header = str(raw_hist.columns[0])
if raw_hist.shape[1] == 1 and ',' in first_header:
    # The file was read as ONE string column whose header and values are
    # comma-separated records: split them and keep only the rows with the
    # expected number of fields.
    actual_columns = first_header.split(',')
    split_rows = raw_hist.iloc[:, 0].dropna().astype(str).str.split(',')
    data = [fields for fields in split_rows if len(fields) == len(actual_columns)]
    df_hist = pd.DataFrame(data, columns=actual_columns)
else:
    # Already a regular multi-column CSV: use it as parsed.
    df_hist = raw_hist

# Get unique player names from historical; ids that are not numeric cannot
# match a roster id, so they are dropped instead of aborting the cell.
hist_names = df_hist[['PLAYER_ID', 'PLAYER_NAME']].drop_duplicates()
hist_names['PLAYER_ID'] = pd.to_numeric(hist_names['PLAYER_ID'], errors='coerce')
hist_names = hist_names.dropna(subset=['PLAYER_ID'])
hist_names['PLAYER_ID'] = hist_names['PLAYER_ID'].astype(int)

# First historical spelling per player id, as an id -> name lookup.
first_name_by_id = (
    hist_names.drop_duplicates(subset='PLAYER_ID', keep='first')
    .set_index('PLAYER_ID')['PLAYER_NAME']
)

# Fill missing names in one pass; rows that already have a name are untouched.
df_roster['PLAYER_ID'] = df_roster['PLAYER_ID'].astype(int)
df_roster['PLAYER_NAME'] = df_roster['PLAYER_NAME'].fillna(
    df_roster['PLAYER_ID'].map(first_name_by_id)
)

print(f"After: {df_roster['PLAYER_NAME'].notna().sum()} names out of {len(df_roster)}")
print(f"Still missing: {df_roster['PLAYER_NAME'].isna().sum()}")

# Save
df_roster.to_csv(roster_path, index=False)

print(f"\nâœ“ Saved!")
print(f"\nRemaining missing (if any):")
missing = df_roster[df_roster['PLAYER_NAME'].isna()]
if len(missing) > 0:
    print(missing[['PLAYER_ID', 'PLAYER_NAME', 'POSITION', 'TEAM_ID']].to_string())
else:
    print("None! All players have names!")

In [None]:
import pandas as pd
from pathlib import Path

# Project root and data directory (redefined so the cell can run on its own).
BASE_DIR = Path(r"C:\Users\wdors\qepc_project\experimental\CLAUDE_REWRITE")
DATA_DIR = BASE_DIR / "data"

print("Adding team names to roster...")
print("=" * 70)

# Team ID to Name mapping, keyed by the NBA stats API team id.
# Ids 1610612754-1610612764 and 1610612766 were previously attached to the
# wrong franchises; the pairs below follow the official id assignment.
TEAM_MAP = {
    1610612737: 'Atlanta Hawks',
    1610612738: 'Boston Celtics',
    1610612739: 'Cleveland Cavaliers',
    1610612740: 'New Orleans Pelicans',
    1610612741: 'Chicago Bulls',
    1610612742: 'Dallas Mavericks',
    1610612743: 'Denver Nuggets',
    1610612744: 'Golden State Warriors',
    1610612745: 'Houston Rockets',
    1610612746: 'Los Angeles Clippers',
    1610612747: 'Los Angeles Lakers',
    1610612748: 'Miami Heat',
    1610612749: 'Milwaukee Bucks',
    1610612750: 'Minnesota Timberwolves',
    1610612751: 'Brooklyn Nets',
    1610612752: 'New York Knicks',
    1610612753: 'Orlando Magic',
    1610612754: 'Indiana Pacers',
    1610612755: 'Philadelphia 76ers',
    1610612756: 'Phoenix Suns',
    1610612757: 'Portland Trail Blazers',
    1610612758: 'Sacramento Kings',
    1610612759: 'San Antonio Spurs',
    1610612760: 'Oklahoma City Thunder',
    1610612761: 'Toronto Raptors',
    1610612762: 'Utah Jazz',
    1610612763: 'Memphis Grizzlies',
    1610612764: 'Washington Wizards',
    1610612765: 'Detroit Pistons',
    1610612766: 'Charlotte Hornets',
}

# Load roster
roster_path = DATA_DIR / "raw" / "schedule" / "league_roster_2025-26.csv"
df_roster = pd.read_csv(roster_path)

# Add team name (ids missing from TEAM_MAP become NaN)
df_roster['TEAM_NAME'] = df_roster['TEAM_ID'].map(TEAM_MAP)

print(f"\nAdded TEAM_NAME column")
print(f"Teams: {df_roster['TEAM_NAME'].nunique()}")

# Surface ids the mapping does not know instead of leaving silent blanks.
unmapped_ids = sorted(df_roster.loc[df_roster['TEAM_NAME'].isna(), 'TEAM_ID'].dropna().unique())
if unmapped_ids:
    print(f"Warning: no team name for TEAM_ID values {unmapped_ids}")

# Reorder columns; only the columns listed here are kept in the saved file.
cols = ['PLAYER_ID', 'PLAYER_NAME', 'JERSEY_NUM', 'POSITION', 'HEIGHT', 'WEIGHT', 'TEAM_ID', 'TEAM_NAME']
df_roster = df_roster[[c for c in cols if c in df_roster.columns]]

# Save
df_roster.to_csv(roster_path, index=False)

print(f"\nâœ“ Saved with team names!")
print(f"\nSample:")
# Show only the sample columns that exist, so a roster without e.g. POSITION
# does not fail after the file has already been written.
sample_cols = [c for c in ['PLAYER_ID', 'PLAYER_NAME', 'POSITION', 'TEAM_NAME'] if c in df_roster.columns]
print(df_roster[sample_cols].head(10).to_string())

In [None]:
import pandas as pd
from pathlib import Path

# Project root and data directory (redefined so the cell can run on its own).
BASE_DIR = Path(r"C:\Users\wdors\qepc_project\experimental\CLAUDE_REWRITE")
DATA_DIR = BASE_DIR / "data"

print("Filling pregame prediction data with roster info...")
print("=" * 70)

# Load roster reference
roster_path = DATA_DIR / "raw" / "schedule" / "league_roster_2025-26.csv"
df_roster = pd.read_csv(roster_path)

# Load pregame predictions
pregame_path = DATA_DIR / "live" / "pregame_prediction_data.csv"
df_pregame = pd.read_csv(pregame_path)

print(f"\nBefore:")
print(f"  Player names: {df_pregame['PLAYER_NAME'].notna().sum()}")
print(f"  Missing player names: {df_pregame['PLAYER_NAME'].isna().sum()}")

# Convert IDs to int for matching
df_roster['PLAYER_ID'] = df_roster['PLAYER_ID'].astype(int)
df_pregame['PLAYER_ID'] = df_pregame['PLAYER_ID'].astype(int)

# Roster columns to bring over: one row per player (so the left merge cannot
# multiply pregame rows), limited to the columns the roster actually has, with
# the team name already under its pregame column name.
wanted_cols = ['PLAYER_ID', 'PLAYER_NAME', 'POSITION', 'TEAM_NAME']
roster_info = (
    df_roster[[c for c in wanted_cols if c in df_roster.columns]]
    .drop_duplicates(subset='PLAYER_ID')
    .rename(columns={'TEAM_NAME': 'PLAYER_TEAM_NAME'})
)

# Re-running the cell: drop the POSITION / PLAYER_TEAM_NAME written by an
# earlier run, otherwise the merge would leave POSITION_OLD behind and create
# a second PLAYER_TEAM_NAME column.
stale_cols = [c for c in ('POSITION', 'PLAYER_TEAM_NAME')
              if c in df_pregame.columns and c in roster_info.columns]
df_pregame = df_pregame.drop(columns=stale_cols)

# Merge player names from roster
df_pregame = df_pregame.merge(
    roster_info,
    on='PLAYER_ID',
    how='left',
    suffixes=('_OLD', '')
)

# Prefer the roster name, fall back to the name the pregame file already had.
if 'PLAYER_NAME_OLD' in df_pregame.columns:
    df_pregame['PLAYER_NAME'] = df_pregame['PLAYER_NAME'].fillna(df_pregame['PLAYER_NAME_OLD'])
    df_pregame = df_pregame.drop('PLAYER_NAME_OLD', axis=1)

print(f"\nAfter:")
print(f"  Player names: {df_pregame['PLAYER_NAME'].notna().sum()}")
print(f"  Missing player names: {df_pregame['PLAYER_NAME'].isna().sum()}")

# Save
df_pregame.to_csv(pregame_path, index=False)

print(f"\nâœ“ Saved!")
print(f"\nColumns: {list(df_pregame.columns)}")
print(f"\nSample:")
# Show only the sample columns that exist, so a missing optional column does
# not fail after the file has already been written.
sample_cols = [c for c in ['PLAYER_ID', 'PLAYER_NAME', 'POSITION', 'PLAYER_TEAM_NAME', 'MATCHUP_TYPE']
               if c in df_pregame.columns]
print(df_pregame[sample_cols].head(10).to_string())