In [None]:
import pandas as pd
import numpy as np
from nba_api.stats.endpoints import playbyplayv2, boxscorestarterbenchstats

def extract_sub_players(description):
    """Extract player IDs for substitution plays (who goes in, who comes out)"""
    if pd.isna(description) or 'SUB' not in description:
        return None, None
        
    # The description format is typically: "SUB: PLAYER_OUT OUT, PLAYER_IN IN"
    parts = description.split(', ')
    if len(parts) != 2:
        return None, None
        
    try:
        out_part = parts[0].split(': ')[1] if ': ' in parts[0] else parts[0]
        out_player = out_part.split(' OUT')[0].strip()
        in_player = parts[1].split(' IN')[0].strip()
        return out_player, in_player
    except Exception:
        return None, None
        
def convert_to_seconds(time_str):
    """Convert MM:SS format to seconds"""
    if pd.isna(time_str):
        return 0
    minutes, seconds = map(float, time_str.split(':'))
    return minutes * 60 + seconds

def calculate_duration(start_period, start_time, end_period, end_time):
    """Calculate duration between two game clock times, handling period changes"""
    # NBA periods are 12 minutes in regulation
    PERIOD_LENGTH = 12 * 60  # 12 minutes in seconds
    # Overtime periods are 5 minutes
    OT_LENGTH = 5 * 60  # 5 minutes in seconds
    
    if start_period == end_period:
        # Simple case: same period
        return start_time - end_time
    else:
        duration = start_time  # Remaining time in start period
        
        # Add full periods in between
        for period in range(start_period + 1, end_period):
            if period <= 4:  # Regulation
                duration += PERIOD_LENGTH
            else:  # Overtime
                duration += OT_LENGTH
                
        # Add time elapsed in the final period
        if end_period <= 4:  # Regulation
            duration += (PERIOD_LENGTH - end_time)
        else:  # Overtime
            duration += (OT_LENGTH - end_time)
            
        return duration

def get_player_ids_from_names(pbp_df, team_id):
    """Get player IDs from player names for a specific team"""
    # Create a mapping from player names to IDs
    player_mapping = {}
    
    # Look for all player entries for the given team
    team_plays = pbp_df[
        (pbp_df['PLAYER1_TEAM_ID'] == team_id) | 
        (pbp_df['PLAYER2_TEAM_ID'] == team_id) | 
        (pbp_df['PLAYER3_TEAM_ID'] == team_id)
    ]
    
    # Add Player 1 mappings
    p1_df = team_plays[team_plays['PLAYER1_TEAM_ID'] == team_id]
    for _, row in p1_df.iterrows():
        if not pd.isna(row['PLAYER1_NAME']) and not pd.isna(row['PLAYER1_ID']):
            player_mapping[row['PLAYER1_NAME']] = row['PLAYER1_ID']
    
    # Add Player 2 mappings
    p2_df = team_plays[team_plays['PLAYER2_TEAM_ID'] == team_id]
    for _, row in p2_df.iterrows():
        if not pd.isna(row['PLAYER2_NAME']) and not pd.isna(row['PLAYER2_ID']):
            player_mapping[row['PLAYER2_NAME']] = row['PLAYER2_ID']
    
    # Add Player 3 mappings
    p3_df = team_plays[team_plays['PLAYER3_TEAM_ID'] == team_id]
    for _, row in p3_df.iterrows():
        if not pd.isna(row['PLAYER3_NAME']) and not pd.isna(row['PLAYER3_ID']):
            player_mapping[row['PLAYER3_NAME']] = row['PLAYER3_ID']
    
    return player_mapping

def track_lineups_with_stats(game_id):
    """
    Track all lineups throughout a game with detailed statistics
    
    Parameters:
    game_id (str): NBA game ID
    
    Returns:
    pd.DataFrame: Timeline of lineup segments with stats
    """
    # Get play-by-play data
    pbp = playbyplayv2.PlayByPlayV2(game_id=game_id)
    pbp_df = pbp.get_data_frames()[0]
    
    # Get team IDs
    home_team_id = pbp_df['PLAYER1_TEAM_ID'].iloc[0]
    for i in range(len(pbp_df)):
        if pbp_df['PLAYER1_TEAM_ID'].iloc[i] != home_team_id and not pd.isna(pbp_df['PLAYER1_TEAM_ID'].iloc[i]):
            away_team_id = pbp_df['PLAYER1_TEAM_ID'].iloc[i]
            break
    
    # Get player name to ID mappings for substitutions
    home_player_map = get_player_ids_from_names(pbp_df, home_team_id)
    away_player_map = get_player_ids_from_names(pbp_df, away_team_id)
    
    # Get starting lineups
    try:
        # Try to get starters from box score
        starters = boxscorestarterbenchstats.BoxScoreStarterBenchStats(game_id=game_id)
        starters_df = starters.get_data_frames()[0]
        
        home_starters = starters_df[starters_df['TEAM_ID'] == home_team_id]['PLAYER_ID'].tolist()[:5]
        away_starters = starters_df[starters_df['TEAM_ID'] == away_team_id]['PLAYER_ID'].tolist()[:5]
    except:
        # Fallback: infer starters from the first few plays
        home_starters = []
        away_starters = []
        
        for _, play in pbp_df.iterrows():
            if play['EVENTMSGTYPE'] not in [8, 10]:  # Skip non-player events
                if not pd.isna(play['PLAYER1_TEAM_ID']):
                    if play['PLAYER1_TEAM_ID'] == home_team_id and play['PLAYER1_ID'] not in home_starters:
                        home_starters.append(play['PLAYER1_ID'])
                    elif play['PLAYER1_TEAM_ID'] == away_team_id and play['PLAYER1_ID'] not in away_starters:
                        away_starters.append(play['PLAYER1_ID'])
                        
                if not pd.isna(play['PLAYER2_TEAM_ID']):
                    if play['PLAYER2_TEAM_ID'] == home_team_id and play['PLAYER2_ID'] not in home_starters:
                        home_starters.append(play['PLAYER2_ID'])
                    elif play['PLAYER2_TEAM_ID'] == away_team_id and play['PLAYER2_ID'] not in away_starters:
                        away_starters.append(play['PLAYER2_ID'])
                        
                if not pd.isna(play['PLAYER3_TEAM_ID']):
                    if play['PLAYER3_TEAM_ID'] == home_team_id and play['PLAYER3_ID'] not in home_starters:
                        home_starters.append(play['PLAYER3_ID'])
                    elif play['PLAYER3_TEAM_ID'] == away_team_id and play['PLAYER3_ID'] not in away_starters:
                        away_starters.append(play['PLAYER3_ID'])
            
            # Stop once we have 5 players for each team
            if len(home_starters) >= 5 and len(away_starters) >= 5:
                break
        
        # Ensure we have exactly 5 players
        home_starters = home_starters[:5]
        away_starters = away_starters[:5]
    
    # Initialize current lineups
    current_home_lineup = set(home_starters)
    current_away_lineup = set(away_starters)
    
    # Initialize stats trackers for current lineup
    home_stats = {
        'PTS': 0, 'FGM': 0, 'FGA': 0, 'FG3M': 0, 'FG3A': 0, 'FTM': 0, 'FTA': 0,
        'OREB': 0, 'DREB': 0, 'REB': 0, 'AST': 0, 'STL': 0, 'BLK': 0, 'BLKA': 0, 
        'TOV': 0, 'PF': 0, 'PFD': 0, 'PLUS_MINUS': 0
    }
    
    away_stats = {
        'PTS': 0, 'FGM': 0, 'FGA': 0, 'FG3M': 0, 'FG3A': 0, 'FTM': 0, 'FTA': 0,
        'OREB': 0, 'DREB': 0, 'REB': 0, 'AST': 0, 'STL': 0, 'BLK': 0, 'BLKA': 0, 
        'TOV': 0, 'PF': 0, 'PFD': 0, 'PLUS_MINUS': 0
    }
    
    # Track lineups over time
    lineups_timeline = []
    current_lineup_start_event = 0
    
    # Extract starting score
    starting_score = "0 - 0"
    start_home, start_away = 0, 0
    
    for idx, play in pbp_df.iterrows():
        # Check if lineup changed due to substitution
        lineup_changed = False
        
        # Handle home team substitution
        if not pd.isna(play['HOMEDESCRIPTION']) and 'SUB' in str(play['HOMEDESCRIPTION']):
            out_name, in_name = extract_sub_players(play['HOMEDESCRIPTION'])
            
            if out_name in home_player_map and in_name in home_player_map:
                out_id = home_player_map[out_name]
                in_id = home_player_map[in_name]
                
                if out_id in current_home_lineup:
                    current_home_lineup.remove(out_id)
                    current_home_lineup.add(in_id)
                    lineup_changed = True
        
        # Handle away team substitution
        if not pd.isna(play['VISITORDESCRIPTION']) and 'SUB' in str(play['VISITORDESCRIPTION']):
            out_name, in_name = extract_sub_players(play['VISITORDESCRIPTION'])
            
            if out_name in away_player_map and in_name in away_player_map:
                out_id = away_player_map[out_name]
                in_id = away_player_map[in_name]
                
                if out_id in current_away_lineup:
                    current_away_lineup.remove(out_id)
                    current_away_lineup.add(in_id)
                    lineup_changed = True
        
        # If lineup changed or end of game, record the segment
        if (lineup_changed or idx == len(pbp_df) - 1) and idx > 0:
            # Get the previous play for timing
            prev_play = pbp_df.iloc[current_lineup_start_event]
            
            # Calculate time duration
            start_period = prev_play['PERIOD']
            start_time = convert_to_seconds(prev_play['PCTIMESTRING'])
            end_period = play['PERIOD']
            end_time = convert_to_seconds(play['PCTIMESTRING'])
            
            duration = calculate_duration(start_period, start_time, end_period, end_time)
            
            # Get score change
            if not pd.isna(prev_play['SCORE']):
                prev_score = prev_play['SCORE']
                prev_home, prev_away = map(int, prev_score.split(' - '))
            else:
                # Use the global starting score
                prev_home, prev_away = start_home, start_away
            
            if not pd.isna(play['SCORE']):
                end_score = play['SCORE']
                end_home, end_away = map(int, end_score.split(' - '))
            else:
                # Use the previous known score
                end_home, end_away = prev_home, prev_away
            
            # Update plus/minus
            home_stats['PLUS_MINUS'] = (end_home - prev_home) - (end_away - prev_away)
            away_stats['PLUS_MINUS'] = (end_away - prev_away) - (end_home - prev_home)
            
            # Create unique lineup instance IDs
            home_lineup_instance_id = f"{game_id}-HOME-{current_lineup_start_event}-{idx}"
            away_lineup_instance_id = f"{game_id}-AWAY-{current_lineup_start_event}-{idx}"
            
            # Create entries for home and away lineups
            home_lineup_entry = {
                'GAME_ID': game_id,
                'LINEUP_ID': home_lineup_instance_id,
                'TEAM_ID': home_team_id,
                'START_EVENTNUM': current_lineup_start_event,
                'END_EVENTNUM': idx,
                'START_PERIOD': start_period,
                'END_PERIOD': end_period,
                'START_TIME': prev_play['PCTIMESTRING'],
                'END_TIME': play['PCTIMESTRING'],
                'DURATION_SECONDS': duration,
                'LINEUP': tuple(sorted(current_home_lineup)),
                'SCORE_START': prev_home,
                'SCORE_END': end_home,
                'PLUS_MINUS': home_stats['PLUS_MINUS']
            }
            
            away_lineup_entry = {
                'GAME_ID': game_id,
                'LINEUP_ID': away_lineup_instance_id,
                'TEAM_ID': away_team_id,
                'START_EVENTNUM': current_lineup_start_event,
                'END_EVENTNUM': idx,
                'START_PERIOD': start_period,
                'END_PERIOD': end_period,
                'START_TIME': prev_play['PCTIMESTRING'],
                'END_TIME': play['PCTIMESTRING'],
                'DURATION_SECONDS': duration,
                'LINEUP': tuple(sorted(current_away_lineup)),
                'SCORE_START': prev_away,
                'SCORE_END': end_away,
                'PLUS_MINUS': away_stats['PLUS_MINUS']
            }
            
            # Add all the collected stats
            for stat, value in home_stats.items():
                home_lineup_entry[stat] = value
            
            for stat, value in away_stats.items():
                away_lineup_entry[stat] = value
            
            lineups_timeline.append(home_lineup_entry)
            lineups_timeline.append(away_lineup_entry)
            
            # Reset stats for the new lineup
            home_stats = {stat: 0 for stat in home_stats}
            away_stats = {stat: 0 for stat in away_stats}
            
            # Update lineup start event
            current_lineup_start_event = idx
            
            # Update global starting score
            start_home, start_away = end_home, end_away
    
    return pd.DataFrame(lineups_timeline)


ImportError: cannot import name 'boxscorestarterbenchstats' from 'nba_api.stats.endpoints' (d:\Cerberus\Assets\Code\ML\NCI\DA For AI\TABA\taba\lib\site-packages\nba_api\stats\endpoints\__init__.py)

In [None]:
update the code to create separate entries for the home and away lineups with unique lineup ids and add a team_id field to identify which team a lineup is associated with

In [4]:
from nba_api.stats.endpoints import leaguegamefinder, boxscoreplayertrackv2, playbyplayv2
import pandas as pd
import time
import os
from tqdm import tqdm

def process_game_lineups(game_id):
    """
    Process a single game and return lineup analysis
    
    Parameters:
    game_id (str): NBA game ID
    
    Returns:
    None
    """
    # Define the CSV file path
    csv_file = "lineup_analysis.csv"

    # Check if the file exists
    if os.path.exists(csv_file):
        # Read the existing file
        existing_data = pd.read_csv(csv_file)
        # Check if the game_id is already processed
        if game_id in existing_data['GAME_ID'].values:
            print(f"Game ID {game_id} is already processed. Skipping...")
            return
    else:
        # Create an empty DataFrame if the file does not exist
        existing_data = pd.DataFrame()

    # Get the lineup timeline
    lineup_timeline = track_lineups_with_stats(game_id)

    # Append the new data to the existing data
    updated_data = pd.concat([existing_data, lineup_timeline], ignore_index=True)

    # Save the updated data back to the CSV file
    updated_data.to_csv(csv_file, index=False)
    print(f"Game ID {game_id} has been processed and saved to {csv_file}.")

season = "2023-24"
gamefinder = leaguegamefinder.LeagueGameFinder(
    season_nullable=season,
    season_type_nullable="Regular Season",
    league_id_nullable="00"  # NBA league ID
)
games_df = gamefinder.get_data_frames()[0]
game_ids = games_df["GAME_ID"].unique().tolist()
print(len(game_ids), "games found for season", season)
for game_id in tqdm(game_ids):
    try:
        process_game_lineups(game_id)
    except Exception as e:
        print(f"Error processing game {game_id}: {e}")
        continue
    time.sleep(1)  # Avoid hitting the API too quickly


1230 games found for season 2023-24


  0%|          | 1/1230 [00:00<10:36,  1.93it/s]

Error processing game 0022301196: name 'get_player_ids_from_names' is not defined


  0%|          | 2/1230 [00:00<09:29,  2.16it/s]

Error processing game 0022301188: name 'get_player_ids_from_names' is not defined


  0%|          | 3/1230 [00:01<11:42,  1.75it/s]

Error processing game 0022301192: name 'get_player_ids_from_names' is not defined


  0%|          | 4/1230 [00:02<11:21,  1.80it/s]

Error processing game 0022301187: name 'get_player_ids_from_names' is not defined


  0%|          | 5/1230 [00:03<13:34,  1.50it/s]

Error processing game 0022301190: name 'get_player_ids_from_names' is not defined


  0%|          | 6/1230 [00:03<12:24,  1.64it/s]

Error processing game 0022301195: name 'get_player_ids_from_names' is not defined


  1%|          | 7/1230 [00:04<11:39,  1.75it/s]

Error processing game 0022301200: name 'get_player_ids_from_names' is not defined


  1%|          | 8/1230 [00:04<11:25,  1.78it/s]

Error processing game 0022301197: name 'get_player_ids_from_names' is not defined


  1%|          | 9/1230 [00:05<10:45,  1.89it/s]

Error processing game 0022301189: name 'get_player_ids_from_names' is not defined


  1%|          | 10/1230 [00:06<15:40,  1.30it/s]

Error processing game 0022301194: name 'get_player_ids_from_names' is not defined


  1%|          | 11/1230 [00:07<15:03,  1.35it/s]

Error processing game 0022301193: name 'get_player_ids_from_names' is not defined


  1%|          | 12/1230 [00:07<13:17,  1.53it/s]

Error processing game 0022301199: name 'get_player_ids_from_names' is not defined


  1%|          | 13/1230 [00:07<11:50,  1.71it/s]

Error processing game 0022301186: name 'get_player_ids_from_names' is not defined


  1%|          | 14/1230 [00:08<10:44,  1.89it/s]

Error processing game 0022301191: name 'get_player_ids_from_names' is not defined


  1%|          | 15/1230 [00:08<11:30,  1.76it/s]

Error processing game 0022301198: name 'get_player_ids_from_names' is not defined


  1%|▏         | 16/1230 [00:09<10:41,  1.89it/s]

Error processing game 0022301171: name 'get_player_ids_from_names' is not defined


  1%|▏         | 16/1230 [00:09<12:25,  1.63it/s]


KeyboardInterrupt: 

In [11]:
from nba_api.stats.endpoints import leaguegamefinder, playbyplayv2, boxscoretraditionalv2
import pandas as pd
import time
import os
from tqdm import tqdm

# Define season and save file
season = "2023-24"
file_name = "per_game_lineup_stats.csv"

# Get all games for the season
gamefinder = leaguegamefinder.LeagueGameFinder(
    season_nullable=season,
    season_type_nullable="Regular Season",
    league_id_nullable="00"
)
games_df = gamefinder.get_data_frames()[0]
game_ids = games_df["GAME_ID"].unique().tolist()

print(f"Total Games in the Season: {len(game_ids)}")

# Load existing progress
if os.path.exists(file_name):
    existing_data = pd.read_csv(file_name)
    completed_game_ids = set(existing_data["GAME_ID"].unique())
else:
    completed_game_ids = set()

# Open the file in append mode after headers are written (if needed)

Total Games in the Season: 1230


In [12]:
print(len(game_ids), "games found for season", season)

1230 games found for season 2023-24


In [13]:
print(len(completed_game_ids))

2


In [10]:
first_write = not os.path.exists(file_name)

with open(file_name, "a", newline="") as f:
    # Iterate through games
    for game_id in tqdm(game_ids, desc="Processing Games"):
        if game_id in completed_game_ids:
            continue  # Skip already processed games

        attempt = 0
        while attempt < 5:  # Retry up to 5 times
            try:
                # Step 1: Fetch Play-by-Play Data to Identify Lineups
                pbp_data = playbyplayv2.PlayByPlayV2(game_id=game_id)
                pbp_df = pbp_data.get_data_frames()[0]

                # Identify unique lineup changes based on substitutions
                sub_events = pbp_df[pbp_df["EVENTMSGTYPE"] == 8]  # Substitutions
                lineups = {}  # {Team_ID: Active Lineup (set of player IDs)}

                for index, row in sub_events.iterrows():
                    team_id = row["PLAYER1_TEAM_ID"]
                    in_player = row["PLAYER2_ID"]
                    out_player = row["PLAYER1_ID"]

                    if team_id not in lineups:
                        lineups[team_id] = set()

                    lineups[team_id].discard(out_player)
                    lineups[team_id].add(in_player)

                # Step 2: Fetch Box Score Data for Player Performance
                boxscore_data = boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=game_id)
                boxscore_df = boxscore_data.get_data_frames()[0]

                # Step 3: Aggregate Player Stats Based on Active Lineups
                final_data = []
                team_ids = list(lineups.keys())

                for team_id in team_ids:
                    opponent_team_id = [tid for tid in team_ids if tid != team_id][0]  # Find opponent team
                    lineup_players = lineups[team_id]
                    opponent_lineup_players = lineups[opponent_team_id]

                    lineup_stats = boxscore_df[boxscore_df["PLAYER_ID"].isin(lineup_players)].copy()
                    opponent_stats = boxscore_df[boxscore_df["PLAYER_ID"].isin(opponent_lineup_players)].copy()

                    if lineup_stats.empty or opponent_stats.empty:
                        continue

                    # Aggregate lineup performance
                    aggregated_stats = lineup_stats.groupby("TEAM_ID").sum()
                    aggregated_stats["GAME_ID"] = game_id
                    aggregated_stats["LINEUP"] = ", ".join(map(str, lineup_players))
                    aggregated_stats["OPPONENT_LINEUP"] = ", ".join(map(str, opponent_lineup_players))

                    final_data.append(aggregated_stats)

                # Convert to DataFrame and append to CSV
                if final_data:
                    df = pd.concat(final_data, ignore_index=True)
                    df.to_csv(f, index=False, header=first_write, mode="a")
                    first_write = False  # Ensure headers are written only once

                time.sleep(1.5)  # Sleep to avoid rate limits
                break  # Exit retry loop

            except Exception as e:
                attempt += 1
                wait_time = 2 ** attempt
                print(f"⚠️ Error fetching {game_id}, retrying in {wait_time}s... ({attempt}/5)")
                print(e)
                time.sleep(wait_time)

print("✅ Dynamic per-game lineup stats saved successfully!")

Processing Games:   0%|          | 1/1230 [00:01<40:49,  1.99s/it]

⚠️ Error fetching 0022301188, retrying in 2s... (1/5)
⚠️ Error fetching 0022301188, retrying in 4s... (2/5)
⚠️ Error fetching 0022301188, retrying in 8s... (3/5)
⚠️ Error fetching 0022301188, retrying in 16s... (4/5)
⚠️ Error fetching 0022301188, retrying in 32s... (5/5)


Processing Games:   0%|          | 1/1230 [01:04<22:08:58, 64.88s/it]


KeyboardInterrupt: 

In [15]:
from nba_api.stats.endpoints import playbyplayv2, boxscoreplayertrackv2
import pandas as pd
import time

game_id = "0022301196"  # Example Game ID

# Step 1: Get Play-by-Play Data
pbp_data = playbyplayv2.PlayByPlayV2(game_id=game_id)
pbp_df = pbp_data.get_data_frames()[0]

# Step 2: Identify Possessions
pbp_df["POSSESSION_CHANGE"] = pbp_df["EVENTMSGTYPE"].isin([1, 5, 6, 7, 8])  # Shot, Turnover, Foul, etc.

# Step 3: Get Player Tracking Data
tracking_data = boxscoreplayertrackv2.BoxScorePlayerTrackV2(game_id=game_id)
tracking_df = tracking_data.get_data_frames()[0]

# Merge both datasets based on timestamps (approximate)
merged_df = pbp_df.merge(tracking_df, on="GAME_ID", how="left")
merged_df.head() # View possession-level stats


Unnamed: 0,GAME_ID,EVENTNUM,EVENTMSGTYPE,EVENTMSGACTIONTYPE,PERIOD,WCTIMESTRING,PCTIMESTRING,HOMEDESCRIPTION,NEUTRALDESCRIPTION,VISITORDESCRIPTION,...,CFGM,CFGA,CFG_PCT,UFGM,UFGA,UFG_PCT,FG_PCT,DFGM,DFGA,DFG_PCT
0,22301196,2,12,0,1,3:41 PM,12:00,,Start of 1st Period (3:41 PM EST),,...,0,1,0.0,1,7,0.143,0.125,0,0,0.0
1,22301196,2,12,0,1,3:41 PM,12:00,,Start of 1st Period (3:41 PM EST),,...,2,5,0.4,3,9,0.333,0.356,7,8,0.875
2,22301196,2,12,0,1,3:41 PM,12:00,,Start of 1st Period (3:41 PM EST),,...,1,1,1.0,0,1,0.0,0.5,4,6,0.667
3,22301196,2,12,0,1,3:41 PM,12:00,,Start of 1st Period (3:41 PM EST),,...,0,4,0.0,1,5,0.2,0.111,4,4,1.0
4,22301196,2,12,0,1,3:41 PM,12:00,,Start of 1st Period (3:41 PM EST),,...,1,7,0.143,3,7,0.429,0.285,3,3,1.0


In [1]:
from nba_api.stats.endpoints import leaguegamefinder, boxscoreadvancedv2
import pandas as pd
import time
import os

# Define the season to retrieve data for
SEASON = "2023-24"
SEASON_TYPE = "Regular Season"  # Change to "Playoffs" if needed
OUTPUT_FILE = "possession_based_analysis.csv"

# Retrieve all game IDs for the season
gamefinder = leaguegamefinder.LeagueGameFinder(
    season_nullable=SEASON, season_type_nullable=SEASON_TYPE, league_id_nullable="00"
)
games = gamefinder.get_data_frames()[0]
game_ids = games["GAME_ID"].unique().tolist()
print(f"Total Games in {SEASON} ({SEASON_TYPE}): {len(game_ids)}")

# Load existing data to avoid duplicate API calls
if os.path.exists(OUTPUT_FILE):
    existing_data = pd.read_csv(OUTPUT_FILE)
    processed_game_ids = existing_data["GAME_ID"].unique().tolist()
else:
    existing_data = pd.DataFrame()
    processed_game_ids = []

# List to store new data
all_data = []

# Loop through all game IDs


Total Games in 2023-24 (Regular Season): 1230


In [2]:
print(len(processed_game_ids))

994


In [3]:
print(game_ids)

['0022301200', '0022301192', '0022301196', '0022301194', '0022301197', '0022301190', '0022301193', '0022301187', '0022301189', '0022301191', '0022301199', '0022301198', '0022301195', '0022301186', '0022301188', '0022301182', '0022301179', '0022301175', '0022301172', '0022301177', '0022301184', '0022301185', '0022301180', '0022301173', '0022301183', '0022301176', '0022301171', '0022301181', '0022301174', '0022301178', '0022301168', '0022301170', '0022301166', '0022301169', '0022301167', '0022301159', '0022301165', '0022301162', '0022301163', '0022301164', '0022301158', '0022301161', '0022301160', '0022301153', '0022301156', '0022301152', '0022301154', '0022301148', '0022301145', '0022301146', '0022301144', '0022301151', '0022301150', '0022301155', '0022301147', '0022301149', '0022301157', '0022301139', '0022301140', '0022301138', '0022301131', '0022301143', '0022301135', '0022301134', '0022301133', '0022301136', '0022301141', '0022301132', '0022301142', '0022301137', '0022301130', '0022

In [4]:
print(processed_game_ids)

[22301178, 22301194, 22301189, 22301191, 22301198, 22301195, 22301192, 22301190, 22301187, 22301197, 22301188, 22301193, 22301186, 22301200, 22301199, 22301196, 22301185, 22301183, 22301171, 22301172, 22301173, 22301176, 22301177, 22301179, 22301184, 22301175, 22301174, 22301182, 22301180, 22301181, 22301169, 22301168, 22301170, 22301166, 22301167, 22301163, 22301162, 22301158, 22301159, 22301161, 22301160, 22301164, 22301165, 22301147, 22301152, 22301154, 22301157, 22301148, 22301156, 22301150, 22301144, 22301149, 22301151, 22301146, 22301155, 22301145, 22301153, 22301134, 22301143, 22301140, 22301141, 22301138, 22301131, 22301135, 22301137, 22301136, 22301139, 22301142, 22301133, 22301132, 22301130, 22301127, 22301128, 22301129, 22301122, 22301119, 22301123, 22301120, 22301117, 22301097, 22301115, 22301125, 22301118, 22301116, 22301126, 22301121, 22301113, 22301114, 22301112, 22301111, 22301124, 22301109, 22301106, 22301107, 22301108, 22301104, 22301110, 22301105, 22301102, 22301103,

In [5]:
from tqdm import tqdm

# Wrap the game_ids list with tqdm for a progress bar
for game_id in tqdm(game_ids, desc="Processing Games"):
    if int(game_id) in processed_game_ids:
        continue  # Skip already processed games

    try:
        # Retrieve possession-based stats
        boxscore = boxscoreadvancedv2.BoxScoreAdvancedV2(game_id=game_id)
        df = boxscore.get_data_frames()[0]

        # Check if the file exists
        if os.path.exists(OUTPUT_FILE):
            # Load existing data
            existing_data = pd.read_csv(OUTPUT_FILE)
            # Check if the game_id is already processed
            if game_id in existing_data['GAME_ID'].values:
                continue
        else:
            # Create an empty DataFrame if the file does not exist
            existing_data = pd.DataFrame()

        # Append new data
        updated_data = pd.concat([existing_data, df], ignore_index=True)

        # Save the updated data back to the file
        updated_data.to_csv(OUTPUT_FILE, index=False)

        # Sleep to avoid rate limit
        time.sleep(1.5)
    except Exception as e:
        print(f"⚠️ Error retrieving data for Game ID {game_id}: {e}")

print("✅ Data retrieval complete!")


Processing Games: 100%|██████████| 1230/1230 [09:42<00:00,  2.11it/s]

✅ Data retrieval complete!



