In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import plotly.express as px

  from pandas.core import (


In [2]:
!pip install --upgrade nba_api



In [2]:
from nba_api.stats.static import teams #to get team ids
from nba_api.stats.endpoints import commonteamroster #from here we want the player ids (clusters + models)
from nba_api.stats.endpoints import commonplayerinfo #from here we want height and weight of players (clusters + models)
from nba_api.stats.endpoints import cumestatsplayer #from here we want the player avg stats (clusters + models)
from nba_api.stats.endpoints import playercareerstats #from here we want career stats of the players (clusters + models)
from nba_api.stats.endpoints import playerdashboardbyyearoveryear #from here we want year over year stats (clusters + models)
from nba_api.stats.endpoints import playerdashptreb #from here we want the rebounding stats (clusters)
from nba_api.stats.endpoints import playerdashptshotdefend #from here we want the shot defense stats (clusters)
from nba_api.stats.endpoints import playerdashptshots #from here we want the shooting stats (clusters)
from nba_api.stats.endpoints import leaguedashptdefend
from nba_api.stats.endpoints import leaguedashplayershotlocations #from here we want the shot locations (clusters)
from nba_api.stats.endpoints import leaguedashplayerptshot #from here we want the frequencies (clusters)
from nba_api.stats.endpoints import synergyplaytypes #from here we will take stats per playtype (clusters, models)
from nba_api.stats.endpoints import playerdashptpass #from here we want the passing stats for our all stars (for visualizations)
from nba_api.stats.endpoints import shotchartlineupdetail #from here we want the shot charts for our lineups (for visualizations)
from nba_api.stats.endpoints import leaguelineupviz #from here we want lineup shot frequencies (for visualizations)
from nba_api.stats.endpoints import leaguedashlineups #from here we want league average lineup stats (might not use)
from nba_api.stats.endpoints import teamdashlineups #from here we want team lineup stats (for the main analysis)
from nba_api.stats.endpoints import leaguedashptstats
from nba_api.stats.endpoints import leaguedashplayerstats

import time
from multiprocessing import Pool
import logging
from time import sleep
from concurrent.futures import ThreadPoolExecutor
from requests.exceptions import ConnectionError, ReadTimeout
import warnings
from pandas.errors import PerformanceWarning

In [3]:
teams_list = teams.get_teams()
team_ids = [team['id'] for team in teams_list]
team_id_name_map = {team['id']: team['full_name'] for team in teams_list}

seasons = [f"{year}-{str(year+1)[-2:]}" for year in range(2015, 2025)]

#### The data pulling needs to be done carefully, we have a lot of data and also we need to keep our data and analysis "True" so we will loop year by year, take the players stats from that year and we will look at every relevant player's different seasons as individual rows
#### the clusters will be unsupervised/semi supervised so the samples don't have to be independent

#### We will do 2 pulls one for the clusters and one after that for the lineups and the specific per year data for the models

In [5]:
# Here we are getting the players from the 2024-25 season info and saving it 
players_for_clusters = pd.DataFrame()
for team_id in team_ids:
    commonteamrosters = commonteamroster.CommonTeamRoster(team_id=team_id, season='2024-25')
    commonteamrosters_df = commonteamrosters.get_data_frames()[0]
    players_for_clusters = pd.concat([players_for_clusters, commonteamrosters_df], ignore_index=True)
players_for_clusters_list = players_for_clusters[['PLAYER_ID', 'PLAYER']]

players_for_clusters_list.to_csv("Data/players_for_clusters_list_base.csv")
players_for_clusters.to_csv("Data/players_for_clusters_base.csv")

players_for_clusters_list

Unnamed: 0,PLAYER_ID,PLAYER
0,1631210,Jacob Toppin
1,1630552,Jalen Johnson
2,1630811,Keaton Wallace
3,1627747,Caris LeVert
4,1641723,Kobe Bufkin
...,...,...
529,1641706,Brandon Miller
530,1642354,KJ Simpson
531,203552,Seth Curry
532,1642275,Tidjane Salaün


In [6]:
# We also need all the players that have played with our allstars in the past 10 years
# we will go through the players first to see all the exact teams they played in on every season to avoid redundancy
allstar_ids = [203999, 201939, 201935, 202695, 2544, 1629029, 203507, 1630162, 1626164, 201142, 1628983]
season_team_df = pd.DataFrame()
for pid in allstar_ids:
    playeryear = playerdashboardbyyearoveryear.PlayerDashboardByYearOverYear(player_id=pid)
    allstaryear_df = playeryear.get_data_frames()[1]
    season_team_df = pd.concat([season_team_df, allstaryear_df[["GROUP_VALUE", "TEAM_ID"]]], ignore_index=True)
season_team_df = season_team_df[season_team_df["GROUP_VALUE"].isin(seasons)]
season_team_df = season_team_df[season_team_df["TEAM_ID"] != -1]
season_team_df = season_team_df.drop_duplicates()
season_team_df.to_csv("Data/allstars_season_team.csv")
# now we will go through our season-team combos and take all the lineups needed
# we will keep only lineups that have one of our allstars and make a list of all the players from the filtered lineups

target_ids_str = [str(int(x)) for x in allstar_ids]

# deduplicate combos 
combos = list(season_team_df[['TEAM_ID', 'GROUP_VALUE']].drop_duplicates().itertuples(index=False, name=None))

results = []
count = 0
for team_id, season1 in combos:
    count += 1
    tried = 0
    success = False
    while tried < 3 and not success:
        try:
            obj = teamdashlineups.TeamDashLineups(
                team_id=int(team_id),
                season=str(season1),
                measure_type_detailed_defense="Advanced",
                per_mode_detailed="Per48",
                group_quantity=5,
            )
            lineups_df = obj.get_data_frames()[1]
            if lineups_df is not None and not lineups_df.empty:
                results.append(lineups_df)
            print(f"Fetched lineups for team {team_id} season {season1} ({count}/{len(combos)})")
            success = True
        except Exception as e:
            tried += 1
            print(f"Fetch failed for {team_id} {season1} (attempt {tried}): {e}")
            sleep(0.6 * tried)  # polite backoff

# single concat 
if results:
    full_lineups_data = pd.concat(results, ignore_index=True)
else:
    full_lineups_data = pd.DataFrame()

# downstream processing 
if not full_lineups_data.empty:
    full_lineups_data['GROUP_ID'] = full_lineups_data['GROUP_ID'].str[1:-1]
    mask = full_lineups_data['GROUP_ID'].apply(
        lambda s: any(pid in target_ids_str for pid in s.split('-'))
    )
    filtered = full_lineups_data[mask].copy()
    rows = []
    for _, row in filtered.iterrows():
        ids = row['GROUP_ID'].split('-')
        names = row['GROUP_NAME'].split(' - ')
        for pid, pname in zip(ids, names):
            rows.append({'PLAYER_ID': pid, 'PLAYER': pname})
    result_df = pd.DataFrame(rows)
    full_lineups_names = result_df.drop_duplicates()
else:
    full_lineups_names = pd.DataFrame(columns=['PLAYER_ID', 'PLAYER'])

full_lineups_names["PLAYER_ID"] = full_lineups_names["PLAYER_ID"].astype(int)
full_lineups_names["PLAYER"] = full_lineups_names["PLAYER"].astype(str)
players_for_clusters_list["PLAYER"] = players_for_clusters_list["PLAYER"].astype(str)

full_players_for_clusters = pd.concat([players_for_clusters_list, full_lineups_names], ignore_index=True)
full_players_for_clusters = full_players_for_clusters.drop_duplicates(subset=['PLAYER_ID'])
full_players_for_clusters.to_csv("Data/full_players_list.csv")

full_players_for_clusters

Fetched lineups for team 1610612743 season 2024-25 (1/90)
Fetched lineups for team 1610612743 season 2023-24 (2/90)
Fetched lineups for team 1610612743 season 2022-23 (3/90)
Fetched lineups for team 1610612743 season 2021-22 (4/90)
Fetched lineups for team 1610612743 season 2020-21 (5/90)
Fetched lineups for team 1610612743 season 2019-20 (6/90)
Fetched lineups for team 1610612743 season 2018-19 (7/90)
Fetched lineups for team 1610612743 season 2017-18 (8/90)
Fetched lineups for team 1610612743 season 2016-17 (9/90)
Fetched lineups for team 1610612743 season 2015-16 (10/90)
Fetched lineups for team 1610612744 season 2024-25 (11/90)
Fetched lineups for team 1610612744 season 2023-24 (12/90)
Fetched lineups for team 1610612744 season 2022-23 (13/90)
Fetched lineups for team 1610612744 season 2021-22 (14/90)
Fetched lineups for team 1610612744 season 2020-21 (15/90)
Fetched lineups for team 1610612744 season 2019-20 (16/90)
Fetched lineups for team 1610612744 season 2018-19 (17/90)
Fetche

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_lineups_names["PLAYER_ID"] = full_lineups_names["PLAYER_ID"].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_lineups_names["PLAYER"] = full_lineups_names["PLAYER"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players_for_clusters_list["PLAYER"] = players_for_clust

Unnamed: 0,PLAYER_ID,PLAYER
0,1631210,Jacob Toppin
1,1630552,Jalen Johnson
2,1630811,Keaton Wallace
3,1627747,Caris LeVert
4,1641723,Kobe Bufkin
...,...,...
1133,1628462,M. Teodosic
1134,1627820,T. Wallace
1135,1628414,S. Thornwell
1136,1628405,J. Motley


In [7]:
last_season_players = players_for_clusters_list[~players_for_clusters_list["PLAYER_ID"].isin(full_lineups_names["PLAYER_ID"].to_list())].copy()
last_season_players.to_csv("Data/only_2024_25_players.csv")

### Now we will start pulling stats and metrics for players

In [8]:
# first we will pull metrics before splitting for years
full_players_features = full_players_for_clusters.copy()
player_info = pd.DataFrame()
count = 1
for pid in full_players_for_clusters['PLAYER_ID']:
    count += 1
    info = commonplayerinfo.CommonPlayerInfo(player_id=pid)
    info_df = info.get_data_frames()[0]
    player_info = pd.concat([player_info, info_df], ignore_index=True)
    if count % 50 == 0:
        print(f"Processed {count} players")
player_info_indexed = player_info.drop_duplicates(subset=['PERSON_ID']).set_index('PERSON_ID')
pid_series = full_players_features['PLAYER_ID']

full_players_features['Height'] = pid_series.map(player_info_indexed['HEIGHT'])
full_players_features['Weight'] = pid_series.map(player_info_indexed['WEIGHT'])
full_players_features['Position'] = pid_series.map(player_info_indexed['POSITION'])
full_players_features['Draft Year'] = pid_series.map(player_info_indexed['DRAFT_YEAR'])

full_players_features.to_csv("Data/General_player_features.csv")
full_players_features


Processed 50 players
Processed 100 players
Processed 150 players
Processed 200 players
Processed 250 players
Processed 300 players
Processed 350 players
Processed 400 players
Processed 450 players
Processed 500 players
Processed 550 players
Processed 600 players
Processed 650 players
Processed 700 players
Processed 750 players
Processed 800 players
Processed 850 players


Unnamed: 0,PLAYER_ID,PLAYER,Height,Weight,Position,Draft Year
0,1631210,Jacob Toppin,6-9,200,Forward,Undrafted
1,1630552,Jalen Johnson,6-8,219,Forward,2021
2,1630811,Keaton Wallace,6-3,185,Guard,Undrafted
3,1627747,Caris LeVert,6-7,205,Guard,2016
4,1641723,Kobe Bufkin,6-4,195,Guard,2023
...,...,...,...,...,...,...
1133,1628462,M. Teodosic,6-5,196,Guard,Undrafted
1134,1627820,T. Wallace,6-5,198,Guard,2016
1135,1628414,S. Thornwell,6-4,215,Guard,2017
1136,1628405,J. Motley,6-8,230,Forward,Undrafted


we will make a basic full df with rows for every player-year combo

In [20]:
from concurrent.futures import ThreadPoolExecutor
# We need to import the specific exceptions to catch them
from requests.exceptions import ConnectionError, ReadTimeout

# --- Helper Function ---
# This function contains the logic that will be run in parallel for each player.
# It now includes retries and exponential backoff.
def fetch_player_seasons(pid, seasons_to_check):
    """
    Fetches the seasons played for a single player ID with retries and backoff.
    Returns a tuple: (pid, list_of_seasons_or_None)
    """
    max_retries = 3
    base_delay = 1  # start with 1 second delay on retry
    
    # Add a small, consistent 0.6s delay to every request
    # This is a "polite" way to respect the API rate limits.
    time.sleep(0.6)

    for attempt in range(max_retries):
        try:
            # Add a timeout to the API call itself (e.g., 10 seconds)
            years_obj = playerdashboardbyyearoveryear.PlayerDashboardByYearOverYear(
                player_id=pid,
                timeout=10 
            )
            
            # We get [1] which is the 'ByYearPlayerDashboard'
            years_df = years_obj.get_data_frames()[1] 
            
            if years_df is None or years_df.empty:
                # Player has no data or an empty frame was returned
                return (pid, None)
            
            # Filter out "Total" rows (where TEAM_ID == -1)
            years_df = years_df[years_df["TEAM_ID"] != -1]
            
            # Filter to only include seasons we care about
            years_df = years_df[years_df["GROUP_VALUE"].isin(seasons_to_check)]
            
            # Get the unique list of seasons the player played
            player_seasons = list(set(years_df["GROUP_VALUE"].to_list()))
            
            if not player_seasons:
                # Player played, but not in the seasons we are checking
                return (pid, None)
                
            return (pid, player_seasons) # Success!
            
        # Catch specific connection/timeout errors
        except (ConnectionError, ReadTimeout) as e:
            if attempt == max_retries - 1:
                # This was the last attempt, log and give up
                print(f"Error processing PLAYER_ID {pid} after {max_retries} attempts: {e}")
                return (pid, None) # Final attempt failed
            
            # Calculate exponential backoff
            delay = base_delay * (2 ** attempt) 
            print(f"Connection error for PLAYER_ID {pid}. Retrying in {delay}s... (Attempt {attempt+1}/{max_retries})")
            time.sleep(delay)
            
        except Exception as e:
            # Some other unexpected error (e.g., bad data from API)
            print(f"Non-retryable error processing PLAYER_ID {pid}: {e}")
            return (pid, None) # Return None on non-retryable failure

    return (pid, None) # Should only be reached if all retries fail

# --- Main Script ---
# ASSUMPTION: 'full_players_features' and 'seasons' are already defined
# in your environment, as requested.

print(f"Starting processing for {len(full_players_features)} players...")
start_time = time.time()

# This will be the list of dictionaries for our new DataFrame
per_year_player_rows = []
error_players = []

# Get a unique list of players to fetch
# Using drop_duplicates and set_index is a robust way to get base info
players_base_info = full_players_features.drop_duplicates(subset=['PLAYER_ID']).set_index('PLAYER_ID')
player_id_list = players_base_info.index.to_list()

# Use a ThreadPoolExecutor to fetch data in parallel
# Reduced max_workers from 10 to 5 to be nicer to the API
with ThreadPoolExecutor(max_workers=5) as executor:
    
    # We use executor.map to run fetch_player_seasons for each pid in player_id_list
    # We pass 'seasons' to the function (it will be the same for every call)
    # The map function returns results in the same order the calls were made
    results = executor.map(fetch_player_seasons, player_id_list, [seasons]*len(player_id_list))

    #print("API requests submitted. Processing results as they arrive...")

    # Process results as they come in
    for i, result in enumerate(results):
        pid, player_seasons = result
        
        if (i + 1) % 50 == 0 or (i + 1) == len(player_id_list):
            print(f"Processed {i+1}/{len(player_id_list)} players...")

        if player_seasons is None:
            # This player had an error or no relevant seasons
            error_players.append(pid)
            continue
            
        # Get the player's base info (the row from full_players_features)
        base_row_dict = players_base_info.loc[pid].to_dict()
        
        # For each season this player played, create a new row
        for season in player_seasons:
            # We copy the base info
            new_row = base_row_dict.copy()
            # And add the specific season
            new_row["SEASON"] = season
            # Add PLAYER_ID back in (since it was the index)
            new_row["PLAYER_ID"] = pid
            
            per_year_player_rows.append(new_row)

# Create the final DataFrame from our list of dictionaries
per_year_player_features = pd.DataFrame(per_year_player_rows)

# Re-order columns to be nice (optional)
if not per_year_player_features.empty:
    # Ensure PLAYER and PLAYER_ID are first, if they exist
    cols_to_add = []
    if 'PLAYER' in players_base_info.columns:
        cols_to_add.append('PLAYER')
    
    cols = ['PLAYER_ID'] + cols_to_add + ['SEASON'] + [c for c in per_year_player_features.columns if c not in ['PLAYER_ID', 'PLAYER', 'SEASON']]
    per_year_player_features = per_year_player_features[cols]

end_time = time.time()

print("\n--- Processing Complete ---")
print(f"Total time taken: {end_time - start_time:.2f} seconds")
print(f"Total rows generated: {len(per_year_player_features)}")
print(f"Players with errors or no data: {len(error_players)}")
if error_players:
    print(f"Error PIDs: {error_players[:20]}...") # Print first 20 error IDs

# Save the result
per_year_player_features.to_csv("Data/Updating_per_year_player_features.csv", index=False)
print("Saved to Data/Updating_per_year_player_features.csv")

print("\n--- Final DataFrame (head) ---")
print(per_year_player_features.head())

Starting processing for 885 players...
Connection error for PLAYER_ID 1641723. Retrying in 1s... (Attempt 1/3)
Connection error for PLAYER_ID 1627747. Retrying in 1s... (Attempt 1/3)
Connection error for PLAYER_ID 1630700. Retrying in 1s... (Attempt 1/3)
Connection error for PLAYER_ID 1631243. Retrying in 1s... (Attempt 1/3)
Connection error for PLAYER_ID 1629726. Retrying in 1s... (Attempt 1/3)
Processed 50/885 players...
Processed 100/885 players...
Processed 150/885 players...
Processed 200/885 players...
Processed 250/885 players...
Processed 300/885 players...
Processed 350/885 players...
Processed 400/885 players...
Processed 450/885 players...
Processed 500/885 players...
Processed 550/885 players...
Processed 600/885 players...
Processed 650/885 players...
Processed 700/885 players...
Processed 750/885 players...
Processed 800/885 players...
Processed 850/885 players...
Processed 885/885 players...

--- Processing Complete ---
Total time taken: 181.89 seconds
Total rows generat

running shot type pull

In [None]:
import warnings
from pandas.errors import PerformanceWarning
from concurrent.futures import ThreadPoolExecutor

from requests.exceptions import ConnectionError, ReadTimeout

# Suppress PerformanceWarning from pandas (often happens in complex aggregations)
warnings.simplefilter("ignore", PerformanceWarning)

# --- Helper Function (from your script) ---
def _get_frame(frames, i):
    """Safely get a DataFrame from the API response list."""
    try:
        return frames[i] if frames and len(frames) > i and frames[i] is not None else pd.DataFrame()
    except Exception:
        return pd.DataFrame()

# --- Helper Function (for parallel execution) ---
def fetch_player_season_shot_data(player_season_row):
    """
    Fetches all shot data for a single player-season combination.
    This function performs both API calls and all the processing logic.
    """
    pid = player_season_row['PLAYER_ID']
    season = player_season_row['SEASON']
    player_name = player_season_row.get('PLAYER', pid) # Get name for logging
    
    max_retries = 3
    base_delay = 1

    try:
        # --- API Call 1: Get Team(s) for the season ---
        time.sleep(0.6) # Polite delay
        yrs_frames = None
        for attempt in range(max_retries):
            try:
                yrs_obj = playerdashboardbyyearoveryear.PlayerDashboardByYearOverYear(
                    player_id=pid, 
                    timeout=20
                )
                yrs_frames = yrs_obj.get_data_frames()
                break # Success
            except (ConnectionError, ReadTimeout) as e:
                if attempt == max_retries - 1:
                    print(f"  -> API Error 1 (PID {pid}, {season}): Failed to get teams after {max_retries} attempts. {e}")
                    return None
                delay = base_delay * (2 ** attempt)
                time.sleep(delay)
        
        # Process API Call 1 result
        years_teams_df = _get_frame(yrs_frames, 1)
        if years_teams_df.empty:
            return None

        # Filter to *only* the specific season we care about
        years_teams_df = years_teams_df[years_teams_df["TEAM_ID"] != -1]
        season_teams_df = years_teams_df[years_teams_df["GROUP_VALUE"] == season]

        if season_teams_df.empty:
            # This player-season combo had no team data (e.g., sat out)
            return None

        # --- FIX: This will hold a LIST of DataFrames, one per team ---
        team_dfs_list = []
        # --- END FIX ---

        # --- Inner Loop: Process each team for that season ---
        for _, team_row in season_teams_df.iterrows():
            team_id = team_row.get("TEAM_ID")
            if team_id is None or team_id == -1:
                continue

            # --- API Call 2: Get Shot Details ---
            time.sleep(0.6) # Polite delay
            frames = None
            for attempt in range(max_retries):
                try:
                    player_shots_obj = playerdashptshots.PlayerDashPtShots(
                        player_id=pid, 
                        season=season, 
                        team_id=team_id,
                        timeout=20
                    )
                    frames = player_shots_obj.get_data_frames()
                    break # Success
                except (ConnectionError, ReadTimeout) as e:
                    if attempt == max_retries - 1:
                        print(f"  -> API Error 2 (PID {pid}, {season}, TID {team_id}): Failed to get shots. {e}")
                        continue # Skip this team, try next
                    delay = base_delay * (2 ** attempt)
                    time.sleep(delay)
            
            if frames is None:
                continue # Failed to get data for this team

            # --- Process API Call 2 results (Your logic) ---
            player_shots_overall = _get_frame(frames, 0)
            player_shots_type = _get_frame(frames, 1)
            player_shots_dribbles = _get_frame(frames, 3)
            player_shots_close_def = _get_frame(frames, 4)
            player_shots_touch_time = _get_frame(frames, 6)

            row_map = {}

            # overall
            if not player_shots_overall.empty:
                vals = player_shots_overall.iloc[0]
                row_map[("OVERALL", "FGM")] = vals.get("FGM", 0)
                row_map[("OVERALL", "FGA")] = vals.get("FGA", 0)
                row_map[("OVERALL", "FGA_FREQ")] = vals.get("FGA_FREQUENCY", 0)
                row_map[("OVERALL", "FG_PCT")] = vals.get("FG_PCT", 0)
                row_map[("OVERALL", "2 FGM")] = vals.get("FG2M", 0)
                row_map[("OVERALL", "2 FGA")] = vals.get("FG2A", 0)
                row_map[("OVERALL", "2 FGA_FREQ")] = vals.get("FG2A_FREQUENCY", 0)
                row_map[("OVERALL", "2 FG_PCT")] = vals.get("FG_PCT", 0)
                row_map[("OVERALL", "3 FGM")] = vals.get("FG3M", 0)
                row_map[("OVERALL", "3 FGA")] = vals.get("FG3A", 0)
                row_map[("OVERALL", "3 FGA_FREQ")] = vals.get("FG3A_FREQUENCY", 0)
                row_map[("OVERALL", "3 FG_PCT")] = vals.get("FG_PCT", 0)

            # shot type
            if not player_shots_type.empty:
                for _, r in player_shots_type.iterrows():
                    key_base = r.get("SHOT_TYPE")
                    if key_base:
                        row_map[(key_base, "FGM")] = r.get("FGM", 0)
                        row_map[(key_base, "FGA")] = r.get("FGA", 0)
                        row_map[(key_base, "FGA_FREQ")] = r.get("FGA_FREQUENCY", 0)
                        row_map[(key_base, "FG_PCT")] = r.get("FG_PCT", 0)
                        row_map[(key_base, "2 FGM")] = r.get("FG2M", 0)
                        row_map[(key_base, "2 FGA")] = r.get("FG2A", 0)
                        row_map[(key_base, "2 FGA_FREQ")] = r.get("FG2A_FREQUENCY", 0)
                        row_map[(key_base, "2 FG_PCT")] = r.get("FG2_PCT", 0)
                        row_map[(key_base, "3 FGM")] = r.get("FG3M", 0)
                        row_map[(key_base, "3 FGA")] = r.get("FG3A", 0)
                        row_map[(key_base, "3 FGA_FREQ")] = r.get("FG3A_FREQUENCY", 0)
                        row_map[(key_base, "3 FG_PCT")] = r.get("FG3_PCT", 0)
            
            # dribbles
            if not player_shots_dribbles.empty:
                for _, r in player_shots_dribbles.iterrows():
                    key_base = r.get("DRIBBLE_RANGE")
                    if key_base:
                        row_map[(key_base, "FGM")] = r.get("FGM", 0)
                        row_map[(key_base, "FGA")] = r.get("FGA", 0)
                        row_map[(key_base, "FGA_FREQ")] = r.get("FGA_FREQUENCY", 0)
                        row_map[(key_base, "FG_PCT")] = r.get("FG_PCT", 0)
                        row_map[(key_base, "2 FGM")] = r.get("FG2M", 0)
                        row_map[(key_base, "2 FGA")] = r.get("FG2A", 0)
                        row_map[(key_base, "2 FGA_FREQ")] = r.get("FG2A_FREQUENCY", 0)
                        row_map[(key_base, "2 FG_PCT")] = r.get("FG2_PCT", 0)
                        row_map[(key_base, "3 FGM")] = r.get("FG3M", 0)
                        row_map[(key_base, "3 FGA")] = r.get("FG3A", 0)
                        row_map[(key_base, "3 FGA_FREQ")] = r.get("FG3A_FREQUENCY", 0)
                        row_map[(key_base, "3 FG_PCT")] = r.get("FG3_PCT", 0)
            
            if not player_shots_close_def.empty:
                for _, r_cd in player_shots_close_def.iterrows():
                    try:
                        key_base = r_cd.get("CLOSE_DEF_DIST_RANGE")
                        row_map[(key_base, "FGM")] = r_cd.get("FGM", 0)
                        row_map[(key_base, "FGA")] = r_cd.get("FGA", 0)
                        row_map[(key_base, "FGA_FREQ")] = r_cd.get("FGA_FREQUENCY", 0)
                        row_map[(key_base, "FG_PCT")] = r_cd.get("FG_PCT", 0)
                        row_map[(key_base, "2 FGM")] = r_cd.get("FG2M", 0)
                        row_map[(key_base, "2 FGA")] = r_cd.get("FG2A", 0)
                        row_map[(key_base, "2 FGA_FREQ")] = r_cd.get("FG2A_FREQUENCY", 0)
                        row_map[(key_base, "2 FG_PCT")] = r_cd.get("FG2_PCT", 0)
                        row_map[(key_base, "3 FGM")] = r_cd.get("FG3M", 0)
                        row_map[(key_base, "3 FGA")] = r_cd.get("FG3A", 0)
                        row_map[(key_base, "3 FGA_FREQ")] = r_cd.get("FG3A_FREQUENCY", 0)
                        row_map[(key_base, "3 FG_PCT")] = r_cd.get("FG3_PCT", 0)
                    except Exception as e:
                        error_players.append(pid)
                        print(f"      Warning close_def row assign ({pid} {season}): {e}")

            # touch time
            if not player_shots_touch_time.empty:
                for _, r_tt in player_shots_touch_time.iterrows():
                    try:
                        key_base = r_tt.get("TOUCH_TIME_RANGE")
                        row_map[(key_base, "FGM")] = r_tt.get("FGM", 0)
                        row_map[(key_base, "FGA")] = r_tt.get("FGA", 0)
                        row_map[(key_base, "FGA_FREQ")] = r_tt.get("FGA_FREQUENCY", 0)
                        row_map[(key_base, "FG_PCT")] = r_tt.get("FG_PCT", 0)
                        row_map[(key_base, "2 FGM")] = r_tt.get("FG2M", 0)
                        row_map[(key_base, "2 FGA")] = r_tt.get("FG2A", 0)
                        row_map[(key_base, "2 FGA_FREQ")] = r_tt.get("FG2A_FREQUENCY", 0)
                        row_map[(key_base, "2 FG_PCT")] = r_tt.get("FG2_PCT", 0)
                        row_map[(key_base, "3 FGM")] = r_tt.get("FG3M", 0)
                        row_map[(key_base, "3 FGA")] = r_tt.get("FG3A", 0)
                        row_map[(key_base, "3 FGA_FREQ")] = r_tt.get("FG3A_FREQUENCY", 0)
                        row_map[(key_base, "3 FG_PCT")] = r_tt.get("FG3_PCT", 0)
                    except Exception as e:
                        error_players.append(pid)
                        print(f"      Warning touch_time row assign ({pid} {season}): {e}")


            # --- FIX: Create a single row for THIS TEAM ---
            if row_map:
                # Create a single-row DataFrame for the current team
                curr_team_df = pd.DataFrame([row_map]).fillna(0)
                
                # Add identifiers for this specific row
                curr_team_df["PLAYER_ID"] = pid
                curr_team_df["SEASON"] = season
                curr_team_df["TEAM_ID"] = team_id
                
                # Add this team's DataFrame to our list
                team_dfs_list.append(curr_team_df)
            # --- END FIX ---
            
        # --- End of team loop ---

        # --- FIX: Return the list of team DataFrames ---
        if team_dfs_list:
            return team_dfs_list
        else:
            return None
        # --- END FIX ---

    except Exception as e:
        print(f"  !! UNEXPECTED error processing {player_name} ({pid}, {season}): {e}")
        return None

# --- Main Script ---
# ASSUMPTION: 'full_players_features' and 'seasons' are defined.
print("Starting optimized shot detail fetch...")
start_time = time.time()

# 1. Create the job list: one row per player-season.
# FIX: Drop duplicates to avoid re-processing the same player-season.
player_shots_param = per_year_player_features[["PLAYER_ID", "PLAYER", "SEASON"]].copy()
player_shots_param = player_shots_param.drop_duplicates(subset=['PLAYER_ID', 'SEASON']).reset_index(drop=True)
jobs = player_shots_param.to_dict('records')

total_jobs = len(jobs)
print(f"Created {total_jobs} unique player-season jobs.")

full_shots_list = []
num_workers = 5 # Keep workers low to be polite to API

# 2. Run all jobs in parallel
with ThreadPoolExecutor(max_workers=num_workers) as executor:
    
    results = executor.map(fetch_player_season_shot_data, jobs)

    # Process results as they come in
    for i, result_df_list in enumerate(results):
        if (i + 1) % 50 == 0 or (i + 1) == total_jobs:
            print(f"Processed {i+1}/{total_jobs} jobs...")
        
        # --- FIX: Use extend to add all rows from the returned list ---
        if result_df_list is not None and result_df_list:
            full_shots_list.extend(result_df_list)
        # --- END FIX ---

# 3. Final Concat
# After concatenating all results...

if full_shots_list:
    print("Concatenating all results...")
    full_shots_df = pd.concat(full_shots_list, ignore_index=True).fillna(0)
    
    print(f"Final full_shots_df with {len(full_shots_df)} rows.")
    full_shots_df.to_csv("Data/full_players_shottype1.csv", index=False)
    print("Saved to full_players_shottype1.csv")
else:
    full_shots_df = pd.DataFrame()
    print("No shot rows collected")

end_time = time.time()
print(f"\n--- Total time taken: {end_time - start_time:.2f} seconds ---")
full_shots_df
# --- END FIX ---
full_shots_df

Starting optimized shot detail fetch...
Created 4327 unique player-season jobs.
Processed 50/4327 jobs...
Processed 100/4327 jobs...
Processed 150/4327 jobs...
Processed 200/4327 jobs...
Processed 250/4327 jobs...
Processed 300/4327 jobs...
Processed 350/4327 jobs...
Processed 400/4327 jobs...
Processed 450/4327 jobs...
Processed 500/4327 jobs...
Processed 550/4327 jobs...
Processed 600/4327 jobs...
Processed 650/4327 jobs...
  !! UNEXPECTED error processing Braxton Key (1630296, 2023-24): Expecting value: line 1 column 1 (char 0)
Processed 700/4327 jobs...
Processed 750/4327 jobs...
Processed 800/4327 jobs...
Processed 850/4327 jobs...
Processed 900/4327 jobs...
Processed 950/4327 jobs...
Processed 1000/4327 jobs...
Processed 1050/4327 jobs...
Processed 1100/4327 jobs...
Processed 1150/4327 jobs...
Processed 1200/4327 jobs...
Processed 1250/4327 jobs...
Processed 1300/4327 jobs...
  !! UNEXPECTED error processing Tosan Evbuomwan (1641787, 2023-24): Expecting value: line 1 column 1 (ch

TypeError: '<' not supported between instances of 'NoneType' and 'str'

In [58]:
full_shots_df = pd.read_csv("Data/full_players_shottype1.csv")
id_cols = ['PLAYER_ID', 'SEASON', 'TEAM_ID']
data_cols = [col for col in full_shots_df.columns if col not in id_cols]
full_shots_df = full_shots_df[id_cols + sorted(data_cols)]

# --- FIX: Handle duplicate PLAYER_ID + SEASON combinations ---
# 1. Identify duplicates (players with multiple teams in same season)
duplicates_mask = full_shots_df.duplicated(subset=['PLAYER_ID', 'SEASON'], keep=False)

# 2. Split into unique and duplicate DataFrames
unique_df = full_shots_df[~duplicates_mask].copy()
duplicates_df = full_shots_df[duplicates_mask].copy()

print(f"Unique rows: {len(unique_df)}")
print(f"Duplicate rows (to aggregate): {len(duplicates_df)}")

# 3. Process duplicates: aggregate by PLAYER_ID + SEASON
if not duplicates_df.empty:
    agg_dict = {}
    
    # Identify FGM, FGA columns (sum these)
    fgm_fga_cols = [col for col in data_cols if 'FGM' in str(col) or 'FGA' in str(col)]
    for col in fgm_fga_cols:
        agg_dict[col] = 'sum'
    
    # Identify FREQ and FG_PCT columns (weighted average by FGA)
    freq_pct_cols = [col for col in data_cols if 'FREQ' in str(col) or 'FG_PCT' in str(col)]
    
    # Group by PLAYER_ID and SEASON
    grouped = duplicates_df.groupby(['PLAYER_ID', 'SEASON'], as_index=False)
    
    # Start with summing FGM/FGA
    agg_result = grouped[fgm_fga_cols].sum()
    
    # For each FREQ/FG_PCT column, compute weighted average
    # For each FREQ/FG_PCT column, compute weighted average
# For each FREQ/FG_PCT column, compute weighted average
for pct_col in freq_pct_cols:
    # Find corresponding FGA column
    if isinstance(pct_col, tuple):
        base_key = pct_col[0]
        fga_col = (base_key, 'FGA')
    else:
        base_key = pct_col.replace('FG_PCT', '').replace('FREQ', '')
        fga_col = f"{base_key}FGA"
    
    if fga_col in duplicates_df.columns:
        # Weighted average
        weighted_sum = (duplicates_df[pct_col] * duplicates_df[fga_col]).groupby(
            [duplicates_df['PLAYER_ID'], duplicates_df['SEASON']]
        ).sum()
        total_fga = duplicates_df[fga_col].groupby(
            [duplicates_df['PLAYER_ID'], duplicates_df['SEASON']]
        ).sum()
        weighted_avg = weighted_sum / total_fga.replace(0, np.nan)
        agg_result[pct_col] = weighted_avg.values
    else:
        # Simple average - reset index to align properly
        mean_series = grouped[pct_col].mean().reset_index()
        
        # Merge the mean values back into agg_result
        for idx, row in mean_series.iterrows():
            pid = row['PLAYER_ID']
            season = row['SEASON']
            val = row[pct_col]
            mask = (agg_result['PLAYER_ID'] == pid) & (agg_result['SEASON'] == season)
            agg_result.loc[mask, pct_col] = val
    
    # Don't include TEAM_ID in the aggregated result
    agg_result = agg_result[['PLAYER_ID', 'SEASON'] + [c for c in agg_result.columns if c not in ['PLAYER_ID', 'SEASON', 'TEAM_ID']]]
    
    print(f"Aggregated {len(duplicates_df)} duplicate rows into {len(agg_result)} rows")
    
    # 4. Concatenate unique + aggregated
    full_shots_df = pd.concat([unique_df, agg_result], ignore_index=True).fillna(0)
    full_shots_df = full_shots_df.sort_values(by=['PLAYER_ID', 'SEASON']).reset_index(drop=True)


Unique rows: 3554
Duplicate rows (to aggregate): 1190
Aggregated 1190 duplicate rows into 579 rows


  agg_result = grouped[fgm_fga_cols].sum()
  agg_result = grouped[fgm_fga_cols].sum()


Aggregated 1190 duplicate rows into 579 rows
Aggregated 1190 duplicate rows into 579 rows
Aggregated 1190 duplicate rows into 579 rows
Aggregated 1190 duplicate rows into 579 rows
Aggregated 1190 duplicate rows into 579 rows
Aggregated 1190 duplicate rows into 579 rows
Aggregated 1190 duplicate rows into 579 rows
Aggregated 1190 duplicate rows into 579 rows
Aggregated 1190 duplicate rows into 579 rows
Aggregated 1190 duplicate rows into 579 rows
Aggregated 1190 duplicate rows into 579 rows
Aggregated 1190 duplicate rows into 579 rows
Aggregated 1190 duplicate rows into 579 rows
Aggregated 1190 duplicate rows into 579 rows
Aggregated 1190 duplicate rows into 579 rows
Aggregated 1190 duplicate rows into 579 rows
Aggregated 1190 duplicate rows into 579 rows
Aggregated 1190 duplicate rows into 579 rows
Aggregated 1190 duplicate rows into 579 rows
Aggregated 1190 duplicate rows into 579 rows
Aggregated 1190 duplicate rows into 579 rows
Aggregated 1190 duplicate rows into 579 rows
Aggregated

In [59]:
full_shots_df.to_csv("Data/full_players_shottype_small.csv")

## Here I stopped, so after this i will load again the data saved and use it

In [5]:
per_year_player_features = pd.read_csv("Data/Updating_per_year_player_features.csv")
players_for_clusters_list = pd.read_csv("Data/players_for_clusters_list_base.csv")
players_for_clusters = pd.read_csv("Data/players_for_clusters_base.csv")
full_players_features = pd.read_csv("Data/General_player_features.csv")
full_players_for_clusters = pd.read_csv("Data/full_players_list.csv")
full_shottype_df = pd.read_csv("Data/full_players_shottype1.csv")
allstar_season_team_df = pd.read_csv("Data/allstars_season_team.csv")



In [None]:
per_year_player_features

In [33]:
# we will start with shot locations averages
shotloc = leaguedashplayershotlocations.LeagueDashPlayerShotLocations(
    season="2024-25")
shotloc_df = shotloc.get_data_frames()[0]
shotloc_df
cols = shotloc_df.columns.tolist()
cols

[('', 'PLAYER_ID'),
 ('', 'PLAYER_NAME'),
 ('', 'TEAM_ID'),
 ('', 'TEAM_ABBREVIATION'),
 ('', 'AGE'),
 ('', 'NICKNAME'),
 ('Restricted Area', 'FGM'),
 ('Restricted Area', 'FGA'),
 ('Restricted Area', 'FG_PCT'),
 ('In The Paint (Non-RA)', 'FGM'),
 ('In The Paint (Non-RA)', 'FGA'),
 ('In The Paint (Non-RA)', 'FG_PCT'),
 ('Mid-Range', 'FGM'),
 ('Mid-Range', 'FGA'),
 ('Mid-Range', 'FG_PCT'),
 ('Left Corner 3', 'FGM'),
 ('Left Corner 3', 'FGA'),
 ('Left Corner 3', 'FG_PCT'),
 ('Right Corner 3', 'FGM'),
 ('Right Corner 3', 'FGA'),
 ('Right Corner 3', 'FG_PCT'),
 ('Above the Break 3', 'FGM'),
 ('Above the Break 3', 'FGA'),
 ('Above the Break 3', 'FG_PCT'),
 ('Backcourt', 'FGM'),
 ('Backcourt', 'FGA'),
 ('Backcourt', 'FG_PCT'),
 ('Corner 3', 'FGM'),
 ('Corner 3', 'FGA'),
 ('Corner 3', 'FG_PCT')]

In [34]:
# we are making a list of the cols needed
cols.remove(('', 'PLAYER_ID'))
cols.remove(('', 'PLAYER_NAME'))
cols.remove(('', 'TEAM_ID'))
cols.remove(('', 'TEAM_ABBREVIATION'))
cols.remove(('', 'AGE'))
cols.remove(('', 'NICKNAME'))
cols.remove(('Restricted Area', 'FG_PCT'))
cols.remove(('In The Paint (Non-RA)', 'FG_PCT'))
cols.remove(('Mid-Range', 'FG_PCT'))
cols.remove(('Left Corner 3', 'FG_PCT'))
cols.remove(('Right Corner 3', 'FG_PCT'))
cols.remove(('Above the Break 3', 'FG_PCT'))
cols.remove(('Backcourt', 'FG_PCT'))
cols.remove(('Backcourt', 'FGM'))
cols.remove(('Backcourt', 'FGA'))
cols.remove(('Corner 3', 'FG_PCT'))
print(cols)

[('Restricted Area', 'FGM'), ('Restricted Area', 'FGA'), ('In The Paint (Non-RA)', 'FGM'), ('In The Paint (Non-RA)', 'FGA'), ('Mid-Range', 'FGM'), ('Mid-Range', 'FGA'), ('Left Corner 3', 'FGM'), ('Left Corner 3', 'FGA'), ('Right Corner 3', 'FGM'), ('Right Corner 3', 'FGA'), ('Above the Break 3', 'FGM'), ('Above the Break 3', 'FGA'), ('Corner 3', 'FGM'), ('Corner 3', 'FGA')]


In [51]:
# --- 1. SETUP MASTER DATAFRAME CORRECTLY ---
# Ensure the master DF uses a MultiIndex to match the incoming data structure
full_shotloc_df = per_year_player_features[['PLAYER_ID', 'PLAYER', 'SEASON']].copy()
full_shotloc_df['PLAYER_ID'] = full_shotloc_df['PLAYER_ID'].astype(int)

# Create a clean MultiIndex for the base columns
base_columns = [('', 'PLAYER_ID'), ('', 'PLAYER'), ('', 'SEASON')]
full_shotloc_df.columns = pd.MultiIndex.from_tuples(base_columns)

# Ensure target columns are in the master df
current_cols = full_shotloc_df.columns.tolist()
for c in cols:
    if c not in current_cols:
        current_cols.append(c)

full_shotloc_df = full_shotloc_df.reindex(columns=pd.MultiIndex.from_tuples(current_cols), fill_value=0)

# --- 2. ROBUST PROCESSING FUNCTION ---
def fetch_and_process_season(year, master_df, target_cols):
    season_mask = master_df[('', 'SEASON')] == year
    current_season_df = master_df[season_mask].copy()

    if current_season_df.empty:
        print(f"No players found for {year}")
        return current_season_df

    try:
        print(f"Fetching shot location data for {year}...")
        
        max_retries = 3
        season_df = None

        for attempt in range(max_retries):
            try:
                # Jitter sleep to avoid simultaneous hits
                time.sleep(1 + (attempt * 2)) 
                season_shot_loc = leaguedashplayershotlocations.LeagueDashPlayerShotLocations(
                    season=year,
                    timeout=30
                )
                season_df = season_shot_loc.get_data_frames()[0]
                print(f"Fetched data for {year}")
                break 
            except Exception as e:
                if attempt == max_retries - 1:
                    print(f"API Error for {year}: {e}")
                    return current_season_df
                time.sleep(2 * (2 ** attempt))
        
        if season_df is None or season_df.empty:
            print(f"No data returned for {year}")
            return current_season_df

        # --- FIX: ROBUST ID SEARCH ---
        
        # 1. Reset index just in case ID is hidden there
        season_df = season_df.reset_index()

        # 2. Find the column containing 'PLAYER_ID' regardless of Level 0 or Level 1
        pid_col = None
        for c in season_df.columns:
            # Check the string representation of the WHOLE tuple/string
            # This catches ('', 'PLAYER_ID'), ('PLAYER_ID', ''), or just 'PLAYER_ID'
            if 'PLAYER_ID' in str(c): 
                pid_col = c
                break
        
        if pid_col:
            # Rename specifically to the format we need: ('', 'PLAYER_ID')
            season_df = season_df.rename(columns={pid_col: ('', 'PLAYER_ID')})
        else:
            # DEBUG: If it fails, print the first 5 columns to see what we actually got
            print(f"PLAYER_ID not found for {year}. First 5 cols: {season_df.columns[:5].tolist()}")
            return current_season_df

        # 3. Clean the DataFrame columns to ensure they are all MultiIndex
        # (Mixed Index of tuples and strings causes crashes later)
        clean_cols = []
        for c in season_df.columns:
            if isinstance(c, tuple):
                if len(c) >= 2:
                    clean_cols.append((str(c[0]), str(c[1]))) # Keep existing structure
                else:
                    clean_cols.append(('', str(c[0])))
            else:
                # Convert flat strings (like 'index' from reset_index) to tuple
                clean_cols.append((str(c), ''))
        
        season_df.columns = pd.MultiIndex.from_tuples(clean_cols)

        # 4. Standardize ID format
        season_df[('', 'PLAYER_ID')] = pd.to_numeric(season_df[('', 'PLAYER_ID')], errors='coerce').astype('Int64')
        season_df = season_df.dropna(subset=[('', 'PLAYER_ID')])
        
        # Filter to relevant players
        player_ids_for_season = current_season_df[('', 'PLAYER_ID')].unique()
        season_df = season_df[season_df[('', 'PLAYER_ID')].isin(player_ids_for_season)]

        if season_df.empty:
            return current_season_df

        # Find intersecting columns (Shot Zones)
        intersect_cols = [c for c in target_cols if c in season_df.columns]
        
        if not intersect_cols:
            print(f"No matching shot columns for {year}")
            return current_season_df

        # Update data
        season_df_indexed = season_df.set_index(('', 'PLAYER_ID'))[intersect_cols]
        current_season_df_indexed = current_season_df.set_index(('', 'PLAYER_ID'))
        
        current_season_df_indexed.update(season_df_indexed)
        
        print(f"SUCCESS {year}: Updated {len(season_df)} players")
        return current_season_df_indexed.reset_index()

    except Exception as e:
        print(f"CRITICAL ERROR processing {year}: {e}")
        season_mask_on_error = master_df[('', 'SEASON')] == year
        return master_df[season_mask_on_error].copy()

# --- 3. EXECUTION ---
all_season_data_frames = []
start_time = time.time()

num_workers = min(len(seasons), 4) 
with ThreadPoolExecutor(max_workers=num_workers) as executor:
    futures = [
        executor.submit(fetch_and_process_season, year, full_shotloc_df, cols) 
        for year in seasons
    ]
    all_season_data_frames = [f.result() for f in futures]

print(f"Finished in {time.time() - start_time:.2f} seconds")

# --- 4. CONCATENATE ---
if all_season_data_frames:
    full_shotloc_df = pd.concat(all_season_data_frames, ignore_index=True)
    full_shotloc_df = full_shotloc_df.sort_values(
        by=[('', 'PLAYER_ID'), ('', 'SEASON')]
    ).reset_index(drop=True)

# --- 5. CALCULATIONS ---
full_shotloc_df[('', 'TOTAL_FGM')] = 0.0
full_shotloc_df[('', 'TOTAL_FGA')] = 0.0

areas_for_totals = ['Restricted Area', 'In The Paint (Non-RA)', 'Mid-Range', 
                    'Left Corner 3', 'Right Corner 3', 'Above the Break 3', 'Corner 3']

for area in areas_for_totals:
    fgm_col = (area, 'FGM')
    fga_col = (area, 'FGA')
    pct_col = (area, 'FG_PCT')
    
    # Only calculate if columns exist
    if fgm_col in full_shotloc_df.columns and fga_col in full_shotloc_df.columns:
        full_shotloc_df[fgm_col] = pd.to_numeric(full_shotloc_df[fgm_col], errors='coerce').fillna(0)
        full_shotloc_df[fga_col] = pd.to_numeric(full_shotloc_df[fga_col], errors='coerce').fillna(0)
        
        full_shotloc_df.loc[:, ('', 'TOTAL_FGM')] += full_shotloc_df[fgm_col]
        full_shotloc_df.loc[:, ('', 'TOTAL_FGA')] += full_shotloc_df[fga_col]
        
        full_shotloc_df.loc[:, pct_col] = (
            full_shotloc_df[fgm_col] / full_shotloc_df[fga_col].replace(0, np.nan)
        ).round(2)

for area in areas_for_totals:
    fga_col = (area, 'FGA')
    freq_col = (area, 'FREQ')
    
    if fga_col in full_shotloc_df.columns:
        if freq_col not in full_shotloc_df.columns:
            full_shotloc_df[freq_col] = 0.0
            
        full_shotloc_df.loc[:, freq_col] = (
            full_shotloc_df[fga_col] / full_shotloc_df[('', 'TOTAL_FGA')].replace(0, np.nan)
        ).round(2)

full_shotloc_df.to_csv("Data/full_players_shotloc.csv", index=False)
full_shotloc_df

Fetching shot location data for 2015-16...
Fetching shot location data for 2016-17...
Fetching shot location data for 2017-18...
Fetching shot location data for 2018-19...
Fetched data for 2016-17
SUCCESS 2016-17: Updated 327 players
Fetching shot location data for 2019-20...
Fetched data for 2018-19
SUCCESS 2018-19: Updated 393 players
Fetching shot location data for 2020-21...
Fetched data for 2015-16
SUCCESS 2015-16: Updated 305 players
Fetched data for 2017-18
Fetching shot location data for 2021-22...
SUCCESS 2017-18: Updated 366 players
Fetching shot location data for 2022-23...
Fetched data for 2019-20
SUCCESS 2019-20: Updated 402 players
Fetching shot location data for 2023-24...
Fetched data for 2020-21
SUCCESS 2020-21: Updated 420 players
Fetching shot location data for 2024-25...
Fetched data for 2021-22
Fetched data for 2022-23
SUCCESS 2022-23: Updated 460 players
SUCCESS 2021-22: Updated 463 players
Fetched data for 2023-24
SUCCESS 2023-24: Updated 489 players
Fetched data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Restricted Area,Restricted Area,In The Paint (Non-RA),In The Paint (Non-RA),Mid-Range,Mid-Range,Left Corner 3,...,Right Corner 3,Above the Break 3,Corner 3,Restricted Area,In The Paint (Non-RA),Mid-Range,Left Corner 3,Right Corner 3,Above the Break 3,Corner 3
Unnamed: 0_level_1,PLAYER_ID,PLAYER,SEASON,FGM,FGA,FGM,FGA,FGM,FGA,FGM,...,FG_PCT,FG_PCT,FG_PCT,FREQ,FREQ,FREQ,FREQ,FREQ,FREQ,FREQ
0,1495,T. Duncan,2015-16,108,170,72,154,35,115,0,...,,0.00,,0.39,0.35,0.26,0.00,0.00,0.00,0.00
1,1717,D. Nowitzki,2015-16,64,90,46,99,262,581,10,...,0.12,0.38,0.30,0.08,0.09,0.51,0.03,0.01,0.27,0.03
2,1717,D. Nowitzki,2016-17,27,47,30,61,160,361,4,...,0.20,0.38,0.38,0.07,0.09,0.52,0.01,0.01,0.28,0.02
3,1717,D. Nowitzki,2017-18,28,37,26,53,154,331,6,...,0.60,0.40,0.53,0.05,0.07,0.43,0.02,0.01,0.41,0.02
4,1717,D. Nowitzki,2018-19,3,5,13,27,55,139,1,...,0.38,0.31,0.33,0.01,0.07,0.35,0.01,0.04,0.46,0.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4322,1642450,Daniss Jenkins,2024-25,2,3,0,0,0,0,0,...,0.00,0.20,0.00,0.25,0.00,0.00,0.08,0.08,0.42,0.17
4323,1642461,Spencer Jones,2024-25,4,7,5,8,1,2,0,...,0.00,0.17,0.00,0.16,0.18,0.04,0.16,0.09,0.13,0.24
4324,1642484,RayJ Dennis,2024-25,2,2,1,8,2,2,0,...,0.00,0.35,0.00,0.06,0.23,0.06,0.03,0.06,0.49,0.09
4325,1642505,Alex Ducas,2024-25,1,4,0,3,1,2,3,...,0.50,0.42,0.56,0.10,0.08,0.05,0.13,0.10,0.31,0.23


In [52]:
full_shotloc_df.fillna(0).to_csv("Data/full_players_shotloc.csv", index=False)

running synergy play types:

In [9]:
synergy = synergyplaytypes.SynergyPlayTypes(
    league_id="00",
    player_or_team_abbreviation="P",   # "P" players, "T" teams
    season_type_all_star="Regular Season",
    per_mode_simple="PerGame",
    season="2023-24",
    play_type_nullable="Transition",    # <-- Capitalized
    type_grouping_nullable="Offensive" # <-- Capitalized
    )
synergy_df = synergy.get_data_frames()[0]
synergy_df

Unnamed: 0,SEASON_ID,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,PLAY_TYPE,TYPE_GROUPING,PERCENTILE,GP,...,TOV_POSS_PCT,SF_POSS_PCT,PLUSONE_POSS_PCT,SCORE_POSS_PCT,EFG_PCT,POSS,PTS,FGM,FGA,FGMX
0,22023,203507,Giannis Antetokounmpo,1610612749,MIL,Milwaukee Bucks,Transition,Offensive,0.687,72,...,0.126,0.284,0.079,0.648,0.691,6.3,7.8,2.8,4.0,1.3
1,22023,1628983,Shai Gilgeous-Alexander,1610612760,OKC,Oklahoma City Thunder,Transition,Offensive,0.673,75,...,0.069,0.145,0.033,0.583,0.588,5.6,6.8,2.5,4.4,1.9
2,22023,1628368,De'Aaron Fox,1610612758,SAC,Sacramento Kings,Transition,Offensive,0.563,73,...,0.103,0.128,0.034,0.524,0.624,6.0,7.0,2.6,4.8,2.2
3,22023,1627759,Jaylen Brown,1610612738,BOS,Boston Celtics,Transition,Offensive,0.593,69,...,0.094,0.143,0.034,0.553,0.627,6.0,7.1,2.7,4.8,2.1
4,22023,2544,LeBron James,1610612747,LAL,Los Angeles Lakers,Transition,Offensive,0.783,70,...,0.168,0.205,0.069,0.603,0.724,5.4,6.8,2.5,3.7,1.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,22023,1629007,Jontay Porter,1610612761,TOR,Toronto Raptors,Transition,Offensive,0.014,25,...,0.250,0.083,0.000,0.250,0.375,0.5,0.3,0.1,0.3,0.2
361,22023,201988,Patty Mills,1610612737,ATL,Atlanta Hawks,Transition,Offensive,0.006,19,...,0.167,0.000,0.000,0.167,0.300,0.6,0.3,0.1,0.5,0.4
362,22023,203490,Otto Porter Jr.,1610612761,TOR,Toronto Raptors,Transition,Offensive,0.006,15,...,0.000,0.000,0.000,0.200,0.250,0.7,0.3,0.1,0.7,0.5
363,22023,1629669,Jaylen Nowell,1610612763,MEM,Memphis Grizzlies,Transition,Offensive,0.003,9,...,0.182,0.091,0.000,0.182,0.125,1.2,0.4,0.1,0.9,0.8


In [16]:
synergy_columns = ['PERCENTILE', 'GP', 'FREQ', 'PPP',
                    'FG_PCT', 'FT_FREQ', 'TOV_FREQ', 'SF_FREQ',
                    'AND1_FREQ', 'SCORE_FREQ', 'EFG_PCT', 'POSS',
                    'PTS', 'FGM', 'FGA']
full_synergy_transition = per_year_player_features[['PLAYER_ID', 'SEASON']]
full_synergy_transition[synergy_columns] = 0
synergy_list = []
players_list = full_players_features['PLAYER_ID'].to_list()
for year in seasons:
    synergy = synergyplaytypes.SynergyPlayTypes(
    league_id="00",
    player_or_team_abbreviation="P",   # "P" players, "T" teams
    season_type_all_star="Regular Season",
    per_mode_simple="PerGame",
    season=year,
    play_type_nullable="Transition",    # <-- Capitalized
    type_grouping_nullable="Offensive" # <-- Capitalized
    )
    year_synergy_transition = full_synergy_transition[full_synergy_transition['SEASON'] == year].copy()
    synergy_df = synergy.get_data_frames()[0]
    synergy_df = synergy_df[synergy_df['PLAYER_ID'].isin(players_list)]
    for _, row in synergy_df.iterrows():
        pid = row['PLAYER_ID']
        gp = row['GP']
        year_synergy_transition.loc[year_synergy_transition['PLAYER_ID']==pid, 'PERCENTILE'] += row['PERCENTILE']*gp
        year_synergy_transition.loc[year_synergy_transition['PLAYER_ID']==pid, 'GP'] += gp
        year_synergy_transition.loc[year_synergy_transition['PLAYER_ID']==pid, 'FREQ'] += row['POSS_PCT']*gp
        year_synergy_transition.loc[year_synergy_transition['PLAYER_ID']==pid, 'PPP'] += row['PPP']*gp
        year_synergy_transition.loc[year_synergy_transition['PLAYER_ID']==pid, 'FG_PCT'] += row['FG_PCT']*gp
        year_synergy_transition.loc[year_synergy_transition['PLAYER_ID']==pid, 'FT_FREQ'] += row['FT_POSS_PCT']*gp
        year_synergy_transition.loc[year_synergy_transition['PLAYER_ID']==pid, 'TOV_FREQ'] += row['TOV_POSS_PCT']*gp
        year_synergy_transition.loc[year_synergy_transition['PLAYER_ID']==pid, 'SF_FREQ'] += row['SF_POSS_PCT']*gp
        year_synergy_transition.loc[year_synergy_transition['PLAYER_ID']==pid, 'AND1_FREQ'] += row['PLUSONE_POSS_PCT']*gp
        year_synergy_transition.loc[year_synergy_transition['PLAYER_ID']==pid, 'SCORE_FREQ'] += row['SCORE_POSS_PCT']*gp
        year_synergy_transition.loc[year_synergy_transition['PLAYER_ID']==pid, 'EFG_PCT'] += row['EFG_PCT']*gp
        year_synergy_transition.loc[year_synergy_transition['PLAYER_ID']==pid, 'POSS'] += row['POSS']
        year_synergy_transition.loc[year_synergy_transition['PLAYER_ID']==pid, 'PTS'] += row['PTS']
        year_synergy_transition.loc[year_synergy_transition['PLAYER_ID']==pid, 'FGM'] += row['FGM']
        year_synergy_transition.loc[year_synergy_transition['PLAYER_ID']==pid, 'FGA'] += row['FGA']
    synergy_list.append(year_synergy_transition.copy())
    print("year")
full_synergy_transition_concat = pd.concat(synergy_list, ignore_index=True)
full_synergy_transition = full_synergy_transition_concat
for col in synergy_columns:
    if col != "GP":
        full_synergy_transition[col] = full_synergy_transition[col] / full_synergy_transition["GP"]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_synergy_transition[synergy_columns] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_synergy_transition[synergy_columns] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_synergy_transition[synergy_columns] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try

year
year
year
year
year
year
year
year
year
year


In [17]:
full_synergy_transition.fillna(0).to_csv("Data/synergy_transition.csv")

In [19]:
synergy_columns = ['PERCENTILE', 'GP', 'FREQ', 'PPP',
                    'FG_PCT', 'FT_FREQ', 'TOV_FREQ', 'SF_FREQ',
                    'AND1_FREQ', 'SCORE_FREQ', 'EFG_PCT', 'POSS',
                    'PTS', 'FGM', 'FGA']
full_synergy_isolation = per_year_player_features[['PLAYER_ID', 'SEASON']]
full_synergy_isolation[synergy_columns] = 0
synergy_list = []
players_list = full_players_features['PLAYER_ID'].to_list()
for year in seasons:
    synergy = synergyplaytypes.SynergyPlayTypes(
    league_id="00",
    player_or_team_abbreviation="P",   # "P" players, "T" teams
    season_type_all_star="Regular Season",
    per_mode_simple="PerGame",
    season=year,
    play_type_nullable="Isolation",    # <-- Capitalized
    type_grouping_nullable="Offensive" # <-- Capitalized
    )
    year_synergy_isolation = full_synergy_isolation[full_synergy_isolation['SEASON'] == year].copy()
    synergy_df = synergy.get_data_frames()[0]
    synergy_df = synergy_df[synergy_df['PLAYER_ID'].isin(players_list)]
    for _, row in synergy_df.iterrows():
        pid = row['PLAYER_ID']
        gp = row['GP']
        year_synergy_isolation.loc[year_synergy_isolation['PLAYER_ID']==pid, 'PERCENTILE'] += row['PERCENTILE']*gp
        year_synergy_isolation.loc[year_synergy_isolation['PLAYER_ID']==pid, 'GP'] += gp
        year_synergy_isolation.loc[year_synergy_isolation['PLAYER_ID']==pid, 'FREQ'] += row['POSS_PCT']*gp
        year_synergy_isolation.loc[year_synergy_isolation['PLAYER_ID']==pid, 'PPP'] += row['PPP']*gp
        year_synergy_isolation.loc[year_synergy_isolation['PLAYER_ID']==pid, 'FG_PCT'] += row['FG_PCT']*gp
        year_synergy_isolation.loc[year_synergy_isolation['PLAYER_ID']==pid, 'FT_FREQ'] += row['FT_POSS_PCT']*gp
        year_synergy_isolation.loc[year_synergy_isolation['PLAYER_ID']==pid, 'TOV_FREQ'] += row['TOV_POSS_PCT']*gp
        year_synergy_isolation.loc[year_synergy_isolation['PLAYER_ID']==pid, 'SF_FREQ'] += row['SF_POSS_PCT']*gp
        year_synergy_isolation.loc[year_synergy_isolation['PLAYER_ID']==pid, 'AND1_FREQ'] += row['PLUSONE_POSS_PCT']*gp
        year_synergy_isolation.loc[year_synergy_isolation['PLAYER_ID']==pid, 'SCORE_FREQ'] += row['SCORE_POSS_PCT']*gp
        year_synergy_isolation.loc[year_synergy_isolation['PLAYER_ID']==pid, 'EFG_PCT'] += row['EFG_PCT']*gp
        year_synergy_isolation.loc[year_synergy_isolation['PLAYER_ID']==pid, 'POSS'] += row['POSS']
        year_synergy_isolation.loc[year_synergy_isolation['PLAYER_ID']==pid, 'PTS'] += row['PTS']
        year_synergy_isolation.loc[year_synergy_isolation['PLAYER_ID']==pid, 'FGM'] += row['FGM']
        year_synergy_isolation.loc[year_synergy_isolation['PLAYER_ID']==pid, 'FGA'] += row['FGA']
    synergy_list.append(year_synergy_isolation.copy())
    print("year")
full_synergy_isolation_concat = pd.concat(synergy_list, ignore_index=True)
full_synergy_isolation = full_synergy_isolation_concat
for col in synergy_columns:
    if col != "GP":
        full_synergy_isolation[col] = full_synergy_isolation[col] / full_synergy_isolation["GP"]
full_synergy_isolation.fillna(0).to_csv("Data/synergy_isolation.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_synergy_isolation[synergy_columns] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_synergy_isolation[synergy_columns] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_synergy_isolation[synergy_columns] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

year
year
year
year
year
year
year
year
year
year


In [20]:
synergy_columns = ['PERCENTILE', 'GP', 'FREQ', 'PPP',
                    'FG_PCT', 'FT_FREQ', 'TOV_FREQ', 'SF_FREQ',
                    'AND1_FREQ', 'SCORE_FREQ', 'EFG_PCT', 'POSS',
                    'PTS', 'FGM', 'FGA']
full_synergy_pnr = per_year_player_features[['PLAYER_ID', 'SEASON']]
full_synergy_pnr[synergy_columns] = 0
synergy_list = []
players_list = full_players_features['PLAYER_ID'].to_list()
for year in seasons:
    synergy = synergyplaytypes.SynergyPlayTypes(
    league_id="00",
    player_or_team_abbreviation="P",   # "P" players, "T" teams
    season_type_all_star="Regular Season",
    per_mode_simple="PerGame",
    season=year,
    play_type_nullable="PRBallHandler",    # <-- Capitalized
    type_grouping_nullable="Offensive" # <-- Capitalized
    )
    year_synergy_pnr = full_synergy_pnr[full_synergy_pnr['SEASON'] == year].copy()
    synergy_df = synergy.get_data_frames()[0]
    synergy_df = synergy_df[synergy_df['PLAYER_ID'].isin(players_list)]
    for _, row in synergy_df.iterrows():
        pid = row['PLAYER_ID']
        gp = row['GP']
        year_synergy_pnr.loc[year_synergy_pnr['PLAYER_ID']==pid, 'PERCENTILE'] += row['PERCENTILE']*gp
        year_synergy_pnr.loc[year_synergy_pnr['PLAYER_ID']==pid, 'GP'] += gp
        year_synergy_pnr.loc[year_synergy_pnr['PLAYER_ID']==pid, 'FREQ'] += row['POSS_PCT']*gp
        year_synergy_pnr.loc[year_synergy_pnr['PLAYER_ID']==pid, 'PPP'] += row['PPP']*gp
        year_synergy_pnr.loc[year_synergy_pnr['PLAYER_ID']==pid, 'FG_PCT'] += row['FG_PCT']*gp
        year_synergy_pnr.loc[year_synergy_pnr['PLAYER_ID']==pid, 'FT_FREQ'] += row['FT_POSS_PCT']*gp
        year_synergy_pnr.loc[year_synergy_pnr['PLAYER_ID']==pid, 'TOV_FREQ'] += row['TOV_POSS_PCT']*gp
        year_synergy_pnr.loc[year_synergy_pnr['PLAYER_ID']==pid, 'SF_FREQ'] += row['SF_POSS_PCT']*gp
        year_synergy_pnr.loc[year_synergy_pnr['PLAYER_ID']==pid, 'AND1_FREQ'] += row['PLUSONE_POSS_PCT']*gp
        year_synergy_pnr.loc[year_synergy_pnr['PLAYER_ID']==pid, 'SCORE_FREQ'] += row['SCORE_POSS_PCT']*gp
        year_synergy_pnr.loc[year_synergy_pnr['PLAYER_ID']==pid, 'EFG_PCT'] += row['EFG_PCT']*gp
        year_synergy_pnr.loc[year_synergy_pnr['PLAYER_ID']==pid, 'POSS'] += row['POSS']
        year_synergy_pnr.loc[year_synergy_pnr['PLAYER_ID']==pid, 'PTS'] += row['PTS']
        year_synergy_pnr.loc[year_synergy_pnr['PLAYER_ID']==pid, 'FGM'] += row['FGM']
        year_synergy_pnr.loc[year_synergy_pnr['PLAYER_ID']==pid, 'FGA'] += row['FGA']
    synergy_list.append(year_synergy_pnr.copy())
full_synergy_pnr_concat = pd.concat(synergy_list, ignore_index=True)
full_synergy_pnr = full_synergy_pnr_concat
for col in synergy_columns:
    if col != "GP":
        full_synergy_pnr[col] = full_synergy_pnr[col] / full_synergy_pnr["GP"]
full_synergy_pnr.fillna(0).to_csv("Data/synergy_pnr.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_synergy_pnr[synergy_columns] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_synergy_pnr[synergy_columns] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_synergy_pnr[synergy_columns] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index

In [21]:
synergy_columns = ['PERCENTILE', 'GP', 'FREQ', 'PPP',
                    'FG_PCT', 'FT_FREQ', 'TOV_FREQ', 'SF_FREQ',
                    'AND1_FREQ', 'SCORE_FREQ', 'EFG_PCT', 'POSS',
                    'PTS', 'FGM', 'FGA']
full_synergy_pnr_roll = per_year_player_features[['PLAYER_ID', 'SEASON']]
full_synergy_pnr_roll[synergy_columns] = 0
synergy_list = []
players_list = full_players_features['PLAYER_ID'].to_list()
for year in seasons:
    synergy = synergyplaytypes.SynergyPlayTypes(
    league_id="00",
    player_or_team_abbreviation="P",   # "P" players, "T" teams
    season_type_all_star="Regular Season",
    per_mode_simple="PerGame",
    season=year,
    play_type_nullable="PRRollman",    # <-- Capitalized
    type_grouping_nullable="Offensive" # <-- Capitalized
    )
    year_synergy_pnr_roll = full_synergy_pnr_roll[full_synergy_pnr_roll['SEASON'] == year].copy()
    synergy_df = synergy.get_data_frames()[0]
    synergy_df = synergy_df[synergy_df['PLAYER_ID'].isin(players_list)]
    for _, row in synergy_df.iterrows():
        pid = row['PLAYER_ID']
        gp = row['GP']
        year_synergy_pnr_roll.loc[year_synergy_pnr_roll['PLAYER_ID']==pid, 'PERCENTILE'] += row['PERCENTILE']*gp
        year_synergy_pnr_roll.loc[year_synergy_pnr_roll['PLAYER_ID']==pid, 'GP'] += gp
        year_synergy_pnr_roll.loc[year_synergy_pnr_roll['PLAYER_ID']==pid, 'FREQ'] += row['POSS_PCT']*gp
        year_synergy_pnr_roll.loc[year_synergy_pnr_roll['PLAYER_ID']==pid, 'PPP'] += row['PPP']*gp
        year_synergy_pnr_roll.loc[year_synergy_pnr_roll['PLAYER_ID']==pid, 'FG_PCT'] += row['FG_PCT']*gp
        year_synergy_pnr_roll.loc[year_synergy_pnr_roll['PLAYER_ID']==pid, 'FT_FREQ'] += row['FT_POSS_PCT']*gp
        year_synergy_pnr_roll.loc[year_synergy_pnr_roll['PLAYER_ID']==pid, 'TOV_FREQ'] += row['TOV_POSS_PCT']*gp
        year_synergy_pnr_roll.loc[year_synergy_pnr_roll['PLAYER_ID']==pid, 'SF_FREQ'] += row['SF_POSS_PCT']*gp
        year_synergy_pnr_roll.loc[year_synergy_pnr_roll['PLAYER_ID']==pid, 'AND1_FREQ'] += row['PLUSONE_POSS_PCT']*gp
        year_synergy_pnr_roll.loc[year_synergy_pnr_roll['PLAYER_ID']==pid, 'SCORE_FREQ'] += row['SCORE_POSS_PCT']*gp
        year_synergy_pnr_roll.loc[year_synergy_pnr_roll['PLAYER_ID']==pid, 'EFG_PCT'] += row['EFG_PCT']*gp
        year_synergy_pnr_roll.loc[year_synergy_pnr_roll['PLAYER_ID']==pid, 'POSS'] += row['POSS']
        year_synergy_pnr_roll.loc[year_synergy_pnr_roll['PLAYER_ID']==pid, 'PTS'] += row['PTS']
        year_synergy_pnr_roll.loc[year_synergy_pnr_roll['PLAYER_ID']==pid, 'FGM'] += row['FGM']
        year_synergy_pnr_roll.loc[year_synergy_pnr_roll['PLAYER_ID']==pid, 'FGA'] += row['FGA']
    synergy_list.append(year_synergy_pnr_roll.copy())
full_synergy_pnr_roll_concat = pd.concat(synergy_list, ignore_index=True)
full_synergy_pnr_roll = full_synergy_pnr_roll_concat
for col in synergy_columns:
    if col != "GP":
        full_synergy_pnr_roll[col] = full_synergy_pnr_roll[col] / full_synergy_pnr_roll["GP"]
full_synergy_pnr_roll.fillna(0).to_csv("Data/synergy_pnr_roll.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_synergy_pnr_roll[synergy_columns] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_synergy_pnr_roll[synergy_columns] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_synergy_pnr_roll[synergy_columns] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [22]:
synergy_columns = ['PERCENTILE', 'GP', 'FREQ', 'PPP',
                    'FG_PCT', 'FT_FREQ', 'TOV_FREQ', 'SF_FREQ',
                    'AND1_FREQ', 'SCORE_FREQ', 'EFG_PCT', 'POSS',
                    'PTS', 'FGM', 'FGA']
full_synergy_postup = per_year_player_features[['PLAYER_ID', 'SEASON']]
full_synergy_postup[synergy_columns] = 0
synergy_list = []
players_list = full_players_features['PLAYER_ID'].to_list()
for year in seasons:
    synergy = synergyplaytypes.SynergyPlayTypes(
    league_id="00",
    player_or_team_abbreviation="P",   # "P" players, "T" teams
    season_type_all_star="Regular Season",
    per_mode_simple="PerGame",
    season=year,
    play_type_nullable="Postup",    # <-- Capitalized
    type_grouping_nullable="Offensive" # <-- Capitalized
    )
    year_synergy_postup = full_synergy_postup[full_synergy_postup['SEASON'] == year].copy()
    synergy_df = synergy.get_data_frames()[0]
    synergy_df = synergy_df[synergy_df['PLAYER_ID'].isin(players_list)]
    for _, row in synergy_df.iterrows():
        pid = row['PLAYER_ID']
        gp = row['GP']
        year_synergy_postup.loc[year_synergy_postup['PLAYER_ID']==pid, 'PERCENTILE'] += row['PERCENTILE']*gp
        year_synergy_postup.loc[year_synergy_postup['PLAYER_ID']==pid, 'GP'] += gp
        year_synergy_postup.loc[year_synergy_postup['PLAYER_ID']==pid, 'FREQ'] += row['POSS_PCT']*gp
        year_synergy_postup.loc[year_synergy_postup['PLAYER_ID']==pid, 'PPP'] += row['PPP']*gp
        year_synergy_postup.loc[year_synergy_postup['PLAYER_ID']==pid, 'FG_PCT'] += row['FG_PCT']*gp
        year_synergy_postup.loc[year_synergy_postup['PLAYER_ID']==pid, 'FT_FREQ'] += row['FT_POSS_PCT']*gp
        year_synergy_postup.loc[year_synergy_postup['PLAYER_ID']==pid, 'TOV_FREQ'] += row['TOV_POSS_PCT']*gp
        year_synergy_postup.loc[year_synergy_postup['PLAYER_ID']==pid, 'SF_FREQ'] += row['SF_POSS_PCT']*gp
        year_synergy_postup.loc[year_synergy_postup['PLAYER_ID']==pid, 'AND1_FREQ'] += row['PLUSONE_POSS_PCT']*gp
        year_synergy_postup.loc[year_synergy_postup['PLAYER_ID']==pid, 'SCORE_FREQ'] += row['SCORE_POSS_PCT']*gp
        year_synergy_postup.loc[year_synergy_postup['PLAYER_ID']==pid, 'EFG_PCT'] += row['EFG_PCT']*gp
        year_synergy_postup.loc[year_synergy_postup['PLAYER_ID']==pid, 'POSS'] += row['POSS']
        year_synergy_postup.loc[year_synergy_postup['PLAYER_ID']==pid, 'PTS'] += row['PTS']
        year_synergy_postup.loc[year_synergy_postup['PLAYER_ID']==pid, 'FGM'] += row['FGM']
        year_synergy_postup.loc[year_synergy_postup['PLAYER_ID']==pid, 'FGA'] += row['FGA']
    synergy_list.append(year_synergy_postup.copy())
full_synergy_postup_concat = pd.concat(synergy_list, ignore_index=True)
full_synergy_postup = full_synergy_postup_concat
for col in synergy_columns:
    if col != "GP":
        full_synergy_postup[col] = full_synergy_postup[col] / full_synergy_postup["GP"]
full_synergy_postup.fillna(0).to_csv("Data/synergy_postup.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_synergy_postup[synergy_columns] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_synergy_postup[synergy_columns] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_synergy_postup[synergy_columns] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

In [23]:
synergy_columns = ['PERCENTILE', 'GP', 'FREQ', 'PPP',
                    'FG_PCT', 'FT_FREQ', 'TOV_FREQ', 'SF_FREQ',
                    'AND1_FREQ', 'SCORE_FREQ', 'EFG_PCT', 'POSS',
                    'PTS', 'FGM', 'FGA']
full_synergy_spotup = per_year_player_features[['PLAYER_ID', 'SEASON']]
full_synergy_spotup[synergy_columns] = 0
synergy_list = []
players_list = full_players_features['PLAYER_ID'].to_list()
for year in seasons:
    synergy = synergyplaytypes.SynergyPlayTypes(
    league_id="00",
    player_or_team_abbreviation="P",   # "P" players, "T" teams
    season_type_all_star="Regular Season",
    per_mode_simple="PerGame",
    season=year,
    play_type_nullable="Spotup",    # <-- Capitalized
    type_grouping_nullable="Offensive" # <-- Capitalized
    )
    year_synergy_spotup = full_synergy_spotup[full_synergy_spotup['SEASON'] == year].copy()
    synergy_df = synergy.get_data_frames()[0]
    synergy_df = synergy_df[synergy_df['PLAYER_ID'].isin(players_list)]
    for _, row in synergy_df.iterrows():
        pid = row['PLAYER_ID']
        gp = row['GP']
        year_synergy_spotup.loc[year_synergy_spotup['PLAYER_ID']==pid, 'PERCENTILE'] += row['PERCENTILE']*gp
        year_synergy_spotup.loc[year_synergy_spotup['PLAYER_ID']==pid, 'GP'] += gp
        year_synergy_spotup.loc[year_synergy_spotup['PLAYER_ID']==pid, 'FREQ'] += row['POSS_PCT']*gp
        year_synergy_spotup.loc[year_synergy_spotup['PLAYER_ID']==pid, 'PPP'] += row['PPP']*gp
        year_synergy_spotup.loc[year_synergy_spotup['PLAYER_ID']==pid, 'FG_PCT'] += row['FG_PCT']*gp
        year_synergy_spotup.loc[year_synergy_spotup['PLAYER_ID']==pid, 'FT_FREQ'] += row['FT_POSS_PCT']*gp
        year_synergy_spotup.loc[year_synergy_spotup['PLAYER_ID']==pid, 'TOV_FREQ'] += row['TOV_POSS_PCT']*gp
        year_synergy_spotup.loc[year_synergy_spotup['PLAYER_ID']==pid, 'SF_FREQ'] += row['SF_POSS_PCT']*gp
        year_synergy_spotup.loc[year_synergy_spotup['PLAYER_ID']==pid, 'AND1_FREQ'] += row['PLUSONE_POSS_PCT']*gp
        year_synergy_spotup.loc[year_synergy_spotup['PLAYER_ID']==pid, 'SCORE_FREQ'] += row['SCORE_POSS_PCT']*gp
        year_synergy_spotup.loc[year_synergy_spotup['PLAYER_ID']==pid, 'EFG_PCT'] += row['EFG_PCT']*gp
        year_synergy_spotup.loc[year_synergy_spotup['PLAYER_ID']==pid, 'POSS'] += row['POSS']
        year_synergy_spotup.loc[year_synergy_spotup['PLAYER_ID']==pid, 'PTS'] += row['PTS']
        year_synergy_spotup.loc[year_synergy_spotup['PLAYER_ID']==pid, 'FGM'] += row['FGM']
        year_synergy_spotup.loc[year_synergy_spotup['PLAYER_ID']==pid, 'FGA'] += row['FGA']
    synergy_list.append(year_synergy_spotup.copy())
full_synergy_spotup_concat = pd.concat(synergy_list, ignore_index=True)
full_synergy_spotup = full_synergy_spotup_concat
for col in synergy_columns:
    if col != "GP":
        full_synergy_spotup[col] = full_synergy_spotup[col] / full_synergy_spotup["GP"]
full_synergy_spotup.fillna(0).to_csv("Data/synergy_spotup.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_synergy_spotup[synergy_columns] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_synergy_spotup[synergy_columns] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_synergy_spotup[synergy_columns] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

In [24]:
synergy_columns = ['PERCENTILE', 'GP', 'FREQ', 'PPP',
                    'FG_PCT', 'FT_FREQ', 'TOV_FREQ', 'SF_FREQ',
                    'AND1_FREQ', 'SCORE_FREQ', 'EFG_PCT', 'POSS',
                    'PTS', 'FGM', 'FGA']
full_synergy_cut = per_year_player_features[['PLAYER_ID', 'SEASON']]
full_synergy_cut[synergy_columns] = 0
synergy_list = []
players_list = full_players_features['PLAYER_ID'].to_list()
for year in seasons:
    synergy = synergyplaytypes.SynergyPlayTypes(
    league_id="00",
    player_or_team_abbreviation="P",   # "P" players, "T" teams
    season_type_all_star="Regular Season",
    per_mode_simple="PerGame",
    season=year,
    play_type_nullable="Cut",    # <-- Capitalized
    type_grouping_nullable="Offensive" # <-- Capitalized
    )
    year_synergy_cut = full_synergy_cut[full_synergy_cut['SEASON'] == year].copy()
    synergy_df = synergy.get_data_frames()[0]
    synergy_df = synergy_df[synergy_df['PLAYER_ID'].isin(players_list)]
    for _, row in synergy_df.iterrows():
        pid = row['PLAYER_ID']
        gp = row['GP']
        year_synergy_cut.loc[year_synergy_cut['PLAYER_ID']==pid, 'PERCENTILE'] += row['PERCENTILE']*gp
        year_synergy_cut.loc[year_synergy_cut['PLAYER_ID']==pid, 'GP'] += gp
        year_synergy_cut.loc[year_synergy_cut['PLAYER_ID']==pid, 'FREQ'] += row['POSS_PCT']*gp
        year_synergy_cut.loc[year_synergy_cut['PLAYER_ID']==pid, 'PPP'] += row['PPP']*gp
        year_synergy_cut.loc[year_synergy_cut['PLAYER_ID']==pid, 'FG_PCT'] += row['FG_PCT']*gp
        year_synergy_cut.loc[year_synergy_cut['PLAYER_ID']==pid, 'FT_FREQ'] += row['FT_POSS_PCT']*gp
        year_synergy_cut.loc[year_synergy_cut['PLAYER_ID']==pid, 'TOV_FREQ'] += row['TOV_POSS_PCT']*gp
        year_synergy_cut.loc[year_synergy_cut['PLAYER_ID']==pid, 'SF_FREQ'] += row['SF_POSS_PCT']*gp
        year_synergy_cut.loc[year_synergy_cut['PLAYER_ID']==pid, 'AND1_FREQ'] += row['PLUSONE_POSS_PCT']*gp
        year_synergy_cut.loc[year_synergy_cut['PLAYER_ID']==pid, 'SCORE_FREQ'] += row['SCORE_POSS_PCT']*gp
        year_synergy_cut.loc[year_synergy_cut['PLAYER_ID']==pid, 'EFG_PCT'] += row['EFG_PCT']*gp
        year_synergy_cut.loc[year_synergy_cut['PLAYER_ID']==pid, 'POSS'] += row['POSS']
        year_synergy_cut.loc[year_synergy_cut['PLAYER_ID']==pid, 'PTS'] += row['PTS']
        year_synergy_cut.loc[year_synergy_cut['PLAYER_ID']==pid, 'FGM'] += row['FGM']
        year_synergy_cut.loc[year_synergy_cut['PLAYER_ID']==pid, 'FGA'] += row['FGA']
    synergy_list.append(year_synergy_cut.copy())
full_synergy_cut_concat = pd.concat(synergy_list, ignore_index=True)
full_synergy_cut = full_synergy_cut_concat
for col in synergy_columns:
    if col != "GP":
        full_synergy_cut[col] = full_synergy_cut[col] / full_synergy_cut["GP"]
full_synergy_cut.fillna(0).to_csv("Data/synergy_cut.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_synergy_cut[synergy_columns] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_synergy_cut[synergy_columns] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_synergy_cut[synergy_columns] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index

In [25]:
synergy_columns = ['PERCENTILE', 'GP', 'FREQ', 'PPP',
                    'FG_PCT', 'FT_FREQ', 'TOV_FREQ', 'SF_FREQ',
                    'AND1_FREQ', 'SCORE_FREQ', 'EFG_PCT', 'POSS',
                    'PTS', 'FGM', 'FGA']
full_synergy_putbacks = per_year_player_features[['PLAYER_ID', 'SEASON']]
full_synergy_putbacks[synergy_columns] = 0
synergy_list = []
players_list = full_players_features['PLAYER_ID'].to_list()
for year in seasons:
    synergy = synergyplaytypes.SynergyPlayTypes(
    league_id="00",
    player_or_team_abbreviation="P",   # "P" players, "T" teams
    season_type_all_star="Regular Season",
    per_mode_simple="PerGame",
    season=year,
    play_type_nullable="OffRebound",    # <-- Capitalized
    type_grouping_nullable="Offensive" # <-- Capitalized
    )
    year_synergy_putbacks = full_synergy_putbacks[full_synergy_putbacks['SEASON'] == year].copy()
    synergy_df = synergy.get_data_frames()[0]
    synergy_df = synergy_df[synergy_df['PLAYER_ID'].isin(players_list)]
    for _, row in synergy_df.iterrows():
        pid = row['PLAYER_ID']
        gp = row['GP']
        year_synergy_putbacks.loc[year_synergy_putbacks['PLAYER_ID']==pid, 'PERCENTILE'] += row['PERCENTILE']*gp
        year_synergy_putbacks.loc[year_synergy_putbacks['PLAYER_ID']==pid, 'GP'] += gp
        year_synergy_putbacks.loc[year_synergy_putbacks['PLAYER_ID']==pid, 'FREQ'] += row['POSS_PCT']*gp
        year_synergy_putbacks.loc[year_synergy_putbacks['PLAYER_ID']==pid, 'PPP'] += row['PPP']*gp
        year_synergy_putbacks.loc[year_synergy_putbacks['PLAYER_ID']==pid, 'FG_PCT'] += row['FG_PCT']*gp
        year_synergy_putbacks.loc[year_synergy_putbacks['PLAYER_ID']==pid, 'FT_FREQ'] += row['FT_POSS_PCT']*gp
        year_synergy_putbacks.loc[year_synergy_putbacks['PLAYER_ID']==pid, 'TOV_FREQ'] += row['TOV_POSS_PCT']*gp
        year_synergy_putbacks.loc[year_synergy_putbacks['PLAYER_ID']==pid, 'SF_FREQ'] += row['SF_POSS_PCT']*gp
        year_synergy_putbacks.loc[year_synergy_putbacks['PLAYER_ID']==pid, 'AND1_FREQ'] += row['PLUSONE_POSS_PCT']*gp
        year_synergy_putbacks.loc[year_synergy_putbacks['PLAYER_ID']==pid, 'SCORE_FREQ'] += row['SCORE_POSS_PCT']*gp
        year_synergy_putbacks.loc[year_synergy_putbacks['PLAYER_ID']==pid, 'EFG_PCT'] += row['EFG_PCT']*gp
        year_synergy_putbacks.loc[year_synergy_putbacks['PLAYER_ID']==pid, 'POSS'] += row['POSS']
        year_synergy_putbacks.loc[year_synergy_putbacks['PLAYER_ID']==pid, 'PTS'] += row['PTS']
        year_synergy_putbacks.loc[year_synergy_putbacks['PLAYER_ID']==pid, 'FGM'] += row['FGM']
        year_synergy_putbacks.loc[year_synergy_putbacks['PLAYER_ID']==pid, 'FGA'] += row['FGA']
    synergy_list.append(year_synergy_putbacks.copy())
full_synergy_putbacks_concat = pd.concat(synergy_list, ignore_index=True)
full_synergy_putbacks = full_synergy_putbacks_concat
for col in synergy_columns:
    if col != "GP":
        full_synergy_putbacks[col] = full_synergy_putbacks[col] / full_synergy_putbacks["GP"]
full_synergy_putbacks.fillna(0).to_csv("Data/synergy_putbacks.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_synergy_putbacks[synergy_columns] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_synergy_putbacks[synergy_columns] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_synergy_putbacks[synergy_columns] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

creating one synergy table

In [26]:
import pandas as pd
from functools import reduce

synergy_columns = [
    'PERCENTILE', 'GP', 'FREQ', 'PPP',
    'FG_PCT', 'FT_FREQ', 'TOV_FREQ', 'SF_FREQ',
    'AND1_FREQ', 'SCORE_FREQ', 'EFG_PCT', 'POSS',
    'PTS', 'FGM', 'FGA'
]

# The columns used to match rows across all dataframes
merge_keys = ['PLAYER_ID', 'SEASON']

# 2. ORGANIZE DATAFRAMES
# ----------------------
# Map the desired Prefix (Key) to your actual DataFrame variable (Value).
# Example: If your transition df is named 'full_synergy_transition', put it as the value.
dfs_dict = {
    'Isolation':   full_synergy_isolation,    # Replace 'full_synergy_isolation' with your actual variable
    'Transition':  full_synergy_transition,   # Replace 'full_synergy_transition' with your actual variable
    'PNR_Handler': full_synergy_pnr,  # Replace with actual variable...
    'PNR_Roll':    full_synergy_pnr_roll,
    'Post_Up':     full_synergy_postup,
    'Spot_Up':     full_synergy_spotup,
    'Putbacks':     full_synergy_putbacks,
    'Cut':         full_synergy_cut
}

# 3. RENAME AND PREPARE
# ---------------------
processed_dfs = []

for prefix, df in dfs_dict.items():
    # Identify which synergy columns are actually present in this specific DF
    # (Safe-guard in case one DF is missing a specific column like 'AND1_FREQ')
    cols_present = [c for c in synergy_columns if c in df.columns]
    
    # Select only the Merge Keys + Synergy Columns
    cols_to_keep = merge_keys + cols_present
    temp_df = df[cols_to_keep].copy()
    
    # Rename columns: e.g., 'PPP' -> 'Isolation_PPP'
    rename_map = {col: f"{prefix}_{col}" for col in cols_present}
    temp_df = temp_df.rename(columns=rename_map)
    
    processed_dfs.append(temp_df)

# 4. MERGE EVERYTHING
# -------------------
# use 'outer' merge so you don't lose data if a player is missing a specific play type
final_df = reduce(
    lambda left, right: pd.merge(left, right, on=merge_keys, how='outer'), 
    processed_dfs
)

full_synergy_types = final_df

In [27]:
full_synergy_types.to_csv("Data/full_synergy_stats.csv")

missing the shotdefend, rebound, assists and thats it

In [5]:
per_year_player_features = pd.read_csv("Data/Updating_per_year_player_features.csv")
players_for_clusters_list = pd.read_csv("Data/players_for_clusters_list_base.csv")
players_for_clusters = pd.read_csv("Data/players_for_clusters_base.csv")
full_players_features = pd.read_csv("Data/General_player_features.csv")
full_players_for_clusters = pd.read_csv("Data/full_players_list.csv")
full_shottype_df = pd.read_csv("Data/full_players_shottype1.csv")
allstar_season_team_df = pd.read_csv("Data/allstars_season_team.csv")
full_synergy_types = pd.read_csv("Data/full_synergy_stats.csv")



In [6]:
reb = playerdashptreb.PlayerDashPtReb(team_id=1610612737, player_id=1630700, season='2024-25') 
reb_df = reb.get_data_frames()[4]
print("These are the columns and their types for the rebounding stats, and after that is a preview of the data")
print(reb_df.dtypes)
reb_df.head(5)

These are the columns and their types for the rebounding stats, and after that is a preview of the data
PLAYER_ID                   int64
PLAYER_NAME_LAST_FIRST     object
SORT_ORDER                  int64
G                           int64
REB_DIST_RANGE             object
REB_FREQUENCY             float64
OREB                        int64
DREB                        int64
REB                         int64
C_OREB                      int64
C_DREB                      int64
C_REB                       int64
C_REB_PCT                 float64
UC_OREB                     int64
UC_DREB                     int64
UC_REB                      int64
UC_REB_PCT                float64
dtype: object


Unnamed: 0,PLAYER_ID,PLAYER_NAME_LAST_FIRST,SORT_ORDER,G,REB_DIST_RANGE,REB_FREQUENCY,OREB,DREB,REB,C_OREB,C_DREB,C_REB,C_REB_PCT,UC_OREB,UC_DREB,UC_REB,UC_REB_PCT
0,1630700,"Daniels, Dyson",1,76,0-3 Feet,0.191,26,59,85,14,21,35,0.412,12,38,50,0.588
1,1630700,"Daniels, Dyson",2,76,3-6 Feet,0.428,53,138,191,30,29,59,0.309,23,109,132,0.691
2,1630700,"Daniels, Dyson",3,76,6-10 Feet,0.222,17,82,99,8,15,23,0.232,9,67,76,0.768
3,1630700,"Daniels, Dyson",4,76,10+ Feet,0.159,27,44,71,5,3,8,0.113,22,41,63,0.887


In [7]:
from requests.exceptions import ConnectionError, ReadTimeout

# Suppress PerformanceWarning from pandas
warnings.simplefilter("ignore", PerformanceWarning)

# --- Helper Function ---
def _get_frame(frames, i):
    """Safely get a DataFrame from the API response list."""
    try:
        return frames[i] if frames and len(frames) > i and frames[i] is not None else pd.DataFrame()
    except Exception:
        return pd.DataFrame()

# --- Helper Function (for parallel execution) ---
def fetch_player_season_rebound_data(player_season_row):
    """
    Fetches all rebound data for a single player-season combination.
    This function performs both API calls and all the processing logic.
    """
    pid = player_season_row['PLAYER_ID']
    season = player_season_row['SEASON']
    player_name = player_season_row.get('PLAYER', pid)
    
    max_retries = 3
    base_delay = 1

    try:
        # --- API Call 1: Get Team(s) for the season ---
        time.sleep(0.6)
        yrs_frames = None
        for attempt in range(max_retries):
            try:
                yrs_obj = playerdashboardbyyearoveryear.PlayerDashboardByYearOverYear(
                    player_id=pid, 
                    timeout=20
                )
                yrs_frames = yrs_obj.get_data_frames()
                break
            except (ConnectionError, ReadTimeout) as e:
                if attempt == max_retries - 1:
                    print(f"  -> API Error 1 (PID {pid}, {season}): Failed to get teams after {max_retries} attempts. {e}")
                    return None
                delay = base_delay * (2 ** attempt)
                time.sleep(delay)
        
        # Process API Call 1 result
        years_teams_df = _get_frame(yrs_frames, 1)
        if years_teams_df.empty:
            return None

        # Filter to *only* the specific season we care about
        years_teams_df = years_teams_df[years_teams_df["TEAM_ID"] != -1]
        season_teams_df = years_teams_df[years_teams_df["GROUP_VALUE"] == season]

        if season_teams_df.empty:
            return None

        # This will hold a LIST of DataFrames, one per team
        team_dfs_list = []

        # --- Inner Loop: Process each team for that season ---
        for _, team_row in season_teams_df.iterrows():
            team_id = team_row.get("TEAM_ID")
            if team_id is None or team_id == -1:
                continue

            # --- API Call 2: Get Rebound Details ---
            time.sleep(0.2)
            frames = None
            for attempt in range(max_retries):
                try:
                    player_reb_obj = playerdashptreb.PlayerDashPtReb(
                        player_id=pid, 
                        season=season, 
                        team_id=team_id,
                        timeout=10
                    )
                    frames = player_reb_obj.get_data_frames()
                    break
                except (ConnectionError, ReadTimeout) as e:
                    if attempt == max_retries - 1:
                        print(f"  -> API Error 2 (PID {pid}, {season}, TID {team_id}): Failed to get rebounds. {e}")
                        continue
                    delay = base_delay * (2 ** attempt)
                    time.sleep(delay)
            
            if frames is None:
                continue

            # --- Process API Call 2 results ---
            # Get frames 0 (Overall), 2 (Rebound Chance Type), and 4 (Shot Type)
            player_reb_overall = _get_frame(frames, 0)
            player_reb_contesting = _get_frame(frames, 2)
            player_reb_dist = _get_frame(frames, 4)

            row_map = {}

            # Overall rebounds
            if not player_reb_overall.empty:
                vals = player_reb_overall.iloc[0]
                row_map[("OVERALL", "REB")] = vals.get("REB", 0)
                row_map[("OVERALL", "OREB")] = vals.get("OREB", 0)
                row_map[("OVERALL", "DREB")] = vals.get("DREB", 0)
                row_map[("OVERALL", "REB_CONTESTED")] = vals.get("C_REB", 0)
                row_map[("OVERALL", "REB_UNCONTESTED")] = vals.get("UC_REB", 0)
                

            # Rebound chance type
            if not player_reb_contesting.empty:
                for _, r in player_reb_contesting.iterrows():
                    key_base = r.get("REB_NUM_CONTESTING_RANGE")
                    if key_base:
                        row_map[(key_base, "REB")] = r.get("REB", 0)
                        row_map[(key_base, "OREB")] = r.get("OREB", 0)
                        row_map[(key_base, "DREB")] = r.get("DREB", 0)
                        row_map[(key_base, "REB_FREQ")] = r.get("REB_FREQUENCY", 0)
                        

            # Shot type
            if not player_reb_dist.empty:
                for _, r in player_reb_dist.iterrows():
                    key_base = r.get("REB_DIST_RANGE")
                    if key_base:
                        row_map[(key_base, "REB")] = r.get("REB", 0)
                        row_map[(key_base, "OREB")] = r.get("OREB", 0)
                        row_map[(key_base, "DREB")] = r.get("DREB", 0)
                        row_map[(key_base, "REB_FREQ")] = r.get("REB_FREQUENCY", 0)

            # Create a single row for THIS TEAM
            if row_map:
                curr_team_df = pd.DataFrame([row_map]).fillna(0)
                
                # Add identifiers
                curr_team_df["PLAYER_ID"] = pid
                curr_team_df["SEASON"] = season
                curr_team_df["TEAM_ID"] = team_id
                
                team_dfs_list.append(curr_team_df)
            
        # Return the list of team DataFrames
        if team_dfs_list:
            return team_dfs_list
        else:
            return None

    except Exception as e:
        print(f"  !! UNEXPECTED error processing {player_name} ({pid}, {season}): {e}")
        return None

# --- Main Script ---
print("Starting optimized rebound detail fetch...")
start_time = time.time()

# 1. Create the job list: one row per player-season
player_reb_param = per_year_player_features[["PLAYER_ID", "PLAYER", "SEASON"]].copy()
player_reb_param = player_reb_param.drop_duplicates(subset=['PLAYER_ID', 'SEASON']).reset_index(drop=True)
jobs = player_reb_param.to_dict('records')

total_jobs = len(jobs)
print(f"Created {total_jobs} unique player-season jobs.")

full_reb_list = []
num_workers = 5

# 2. Run all jobs in parallel
with ThreadPoolExecutor(max_workers=num_workers) as executor:
    
    results = executor.map(fetch_player_season_rebound_data, jobs)

    # Process results as they come in
    for i, result_df_list in enumerate(results):
        if (i + 1) % 50 == 0 or (i + 1) == total_jobs:
            print(f"Processed {i+1}/{total_jobs} jobs...")
        
        if result_df_list is not None and result_df_list:
            full_reb_list.extend(result_df_list)

# 3. Final Concat
if full_reb_list:
    print("Concatenating all results...")
    full_reb_df = pd.concat(full_reb_list, ignore_index=True).fillna(0)
    
    print(f"Final full_reb_df with {len(full_reb_df)} rows.")
    full_reb_df.to_csv("Data/full_players_rebound.csv", index=False)
    print("Saved to Data/full_players_rebound.csv")
else:
    full_reb_df = pd.DataFrame()
    print("No rebound rows collected")

end_time = time.time()
print(f"\n--- Total time taken: {end_time - start_time:.2f} seconds ---")
full_reb_df

Starting optimized rebound detail fetch...
Created 4327 unique player-season jobs.
Processed 50/4327 jobs...
Processed 100/4327 jobs...
  !! UNEXPECTED error processing Payton Pritchard (1630202, 2020-21): Expecting value: line 1 column 1 (char 0)
Processed 150/4327 jobs...
Processed 200/4327 jobs...
Processed 250/4327 jobs...
Processed 300/4327 jobs...
Processed 350/4327 jobs...
Processed 400/4327 jobs...
Processed 450/4327 jobs...
Processed 500/4327 jobs...
Processed 550/4327 jobs...
  !! UNEXPECTED error processing Dario Šarić (203967, 2017-18): Expecting value: line 1 column 1 (char 0)
Processed 600/4327 jobs...
Processed 650/4327 jobs...
Processed 700/4327 jobs...
Processed 750/4327 jobs...
Processed 800/4327 jobs...
Processed 850/4327 jobs...
Processed 900/4327 jobs...
Processed 950/4327 jobs...
Processed 1000/4327 jobs...
Processed 1050/4327 jobs...
Processed 1100/4327 jobs...
Processed 1150/4327 jobs...
Processed 1200/4327 jobs...
  -> API Error 2 (PID 201144, 2021-22, TID 1610

Unnamed: 0,"(OVERALL, REB)","(OVERALL, OREB)","(OVERALL, DREB)","(OVERALL, REB_CONTESTED)","(OVERALL, REB_UNCONTESTED)","(0 Contesting Rebounders, REB)","(0 Contesting Rebounders, OREB)","(0 Contesting Rebounders, DREB)","(0 Contesting Rebounders, REB_FREQ)","(1 Contesting Rebounder, REB)",...,SEASON,TEAM_ID,"(6-10 Feet, REB)","(6-10 Feet, OREB)","(6-10 Feet, DREB)","(6-10 Feet, REB_FREQ)","(10+ Feet, REB)","(10+ Feet, OREB)","(10+ Feet, DREB)","(10+ Feet, REB_FREQ)"
0,4,1,3,2,2,2.0,0.0,2.0,0.500,1.0,...,2024-25,1610612737,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.000
1,11,2,9,4,7,7.0,1.0,6.0,0.636,4.0,...,2024-25,1610612752,3.0,1.0,2.0,0.273,0.0,0.0,0.0,0.000
2,7,3,4,3,4,4.0,2.0,2.0,0.571,3.0,...,2023-24,1610612752,0.0,0.0,0.0,0.000,1.0,1.0,0.0,0.143
3,354,61,293,94,260,260.0,34.0,226.0,0.734,83.0,...,2024-25,1610612737,65.0,12.0,53.0,0.184,28.0,11.0,17.0,0.079
4,475,73,402,133,342,342.0,41.0,301.0,0.720,116.0,...,2023-24,1610612737,79.0,10.0,69.0,0.166,53.0,21.0,32.0,0.112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4720,4,0,4,1,3,3.0,0.0,3.0,0.750,0.0,...,2019-20,1610612740,1.0,0.0,1.0,0.250,1.0,0.0,1.0,0.250
4721,49,14,35,16,33,33.0,4.0,29.0,0.673,14.0,...,2017-18,1610612742,8.0,1.0,7.0,0.163,7.0,4.0,3.0,0.143
4722,51,18,33,17,34,34.0,8.0,26.0,0.667,15.0,...,2018-19,1610612746,9.0,1.0,8.0,0.176,3.0,2.0,1.0,0.059
4723,9,3,6,2,7,7.0,1.0,6.0,0.778,2.0,...,2019-20,1610612746,1.0,0.0,1.0,0.111,0.0,0.0,0.0,0.000


In [6]:
full_players_rebound = pd.read_csv("Data/full_players_rebound.csv")
full_players_rebound

Unnamed: 0,"('OVERALL', 'REB')","('OVERALL', 'OREB')","('OVERALL', 'DREB')","('OVERALL', 'REB_CONTESTED')","('OVERALL', 'REB_UNCONTESTED')","('0 Contesting Rebounders', 'REB')","('0 Contesting Rebounders', 'OREB')","('0 Contesting Rebounders', 'DREB')","('0 Contesting Rebounders', 'REB_FREQ')","('1 Contesting Rebounder', 'REB')",...,SEASON,TEAM_ID,"('6-10 Feet', 'REB')","('6-10 Feet', 'OREB')","('6-10 Feet', 'DREB')","('6-10 Feet', 'REB_FREQ')","('10+ Feet', 'REB')","('10+ Feet', 'OREB')","('10+ Feet', 'DREB')","('10+ Feet', 'REB_FREQ')"
0,4,1,3,2,2,2.0,0.0,2.0,0.500,1.0,...,2024-25,1610612737,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.000
1,11,2,9,4,7,7.0,1.0,6.0,0.636,4.0,...,2024-25,1610612752,3.0,1.0,2.0,0.273,0.0,0.0,0.0,0.000
2,7,3,4,3,4,4.0,2.0,2.0,0.571,3.0,...,2023-24,1610612752,0.0,0.0,0.0,0.000,1.0,1.0,0.0,0.143
3,354,61,293,94,260,260.0,34.0,226.0,0.734,83.0,...,2024-25,1610612737,65.0,12.0,53.0,0.184,28.0,11.0,17.0,0.079
4,475,73,402,133,342,342.0,41.0,301.0,0.720,116.0,...,2023-24,1610612737,79.0,10.0,69.0,0.166,53.0,21.0,32.0,0.112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4720,4,0,4,1,3,3.0,0.0,3.0,0.750,0.0,...,2019-20,1610612740,1.0,0.0,1.0,0.250,1.0,0.0,1.0,0.250
4721,49,14,35,16,33,33.0,4.0,29.0,0.673,14.0,...,2017-18,1610612742,8.0,1.0,7.0,0.163,7.0,4.0,3.0,0.143
4722,51,18,33,17,34,34.0,8.0,26.0,0.667,15.0,...,2018-19,1610612746,9.0,1.0,8.0,0.176,3.0,2.0,1.0,0.059
4723,9,3,6,2,7,7.0,1.0,6.0,0.778,2.0,...,2019-20,1610612746,1.0,0.0,1.0,0.111,0.0,0.0,0.0,0.000


In [8]:
full_reb_df = full_players_rebound.copy()
id_cols = ['PLAYER_ID', 'SEASON', 'TEAM_ID']
data_cols = [col for col in full_reb_df.columns if col not in id_cols]
full_reb_df = full_reb_df[id_cols + sorted(data_cols)]

# --- FIX: Handle duplicate PLAYER_ID + SEASON combinations ---
# 1. Identify duplicates (players with multiple teams in same season)
duplicates_mask = full_reb_df.duplicated(subset=['PLAYER_ID', 'SEASON'], keep=False)

# 2. Split into unique and duplicate DataFrames
unique_df = full_reb_df[~duplicates_mask].copy()
duplicates_df = full_reb_df[duplicates_mask].copy()

print(f"Unique rows: {len(unique_df)}")
print(f"Duplicate rows (to aggregate): {len(duplicates_df)}")

# 3. Process duplicates: aggregate by PLAYER_ID + SEASON
if not duplicates_df.empty:
    # Identify FREQ columns (weighted average by OVERALL REB)
    freq_cols = [col for col in data_cols if isinstance(col, tuple) and 'FREQ' in str(col[1])]
    
    # All other columns are summed
    sum_cols = [col for col in data_cols if col not in freq_cols]
    
    # Group by PLAYER_ID and SEASON
    grouped = duplicates_df.groupby(['PLAYER_ID', 'SEASON'], as_index=False)
    
    # Start with summing all non-FREQ columns
    agg_result = grouped[sum_cols].sum()
    
    # --- WEIGHTED AVERAGE FOR FREQ COLUMNS ---
    # Weight by ('OVERALL', 'REB') column
    overall_reb_col = ('OVERALL', 'REB')
    
    if overall_reb_col in duplicates_df.columns:
        for freq_col in freq_cols:
            # Weighted average: sum(freq * overall_reb) / sum(overall_reb)
            weighted_sum = (duplicates_df[freq_col] * duplicates_df[overall_reb_col]).groupby(
                [duplicates_df['PLAYER_ID'], duplicates_df['SEASON']]
            ).sum()
            
            total_reb = duplicates_df[overall_reb_col].groupby(
                [duplicates_df['PLAYER_ID'], duplicates_df['SEASON']]
            ).sum()
            
            # Handle division by zero
            weighted_avg = weighted_sum / total_reb.replace(0, np.nan)
            agg_result[freq_col] = weighted_avg.values
    else:
        # Fallback: simple average if ('OVERALL', 'REB') doesn't exist
        for freq_col in freq_cols:
            mean_series = grouped[freq_col].mean().reset_index()
            for idx, row in mean_series.iterrows():
                pid = row['PLAYER_ID']
                season = row['SEASON']
                val = row[freq_col]
                mask = (agg_result['PLAYER_ID'] == pid) & (agg_result['SEASON'] == season)
                agg_result.loc[mask, freq_col] = val
    
    # Don't include TEAM_ID in the aggregated result
    agg_result = agg_result[['PLAYER_ID', 'SEASON'] + [c for c in agg_result.columns if c not in ['PLAYER_ID', 'SEASON', 'TEAM_ID']]]
    
    print(f"Aggregated {len(duplicates_df)} duplicate rows into {len(agg_result)} rows")
    
    # 4. Concatenate unique + aggregated
    full_reb_df = pd.concat([unique_df, agg_result], ignore_index=True).fillna(0)
    full_reb_df = full_reb_df.sort_values(by=['PLAYER_ID', 'SEASON']).reset_index(drop=True)
    
    # Save aggregated result
    full_reb_df.to_csv("Data/full_players_rebound_aggregated.csv", index=False)
    print("✓ Saved aggregated rebound data to Data/full_players_rebound_aggregated.csv")
else:
    print("No duplicates to aggregate")

print(f"\nFinal aggregated DataFrame shape: {full_reb_df.shape}")
full_reb_df

Unique rows: 3558
Duplicate rows (to aggregate): 1167
Aggregated 1167 duplicate rows into 568 rows
✓ Saved aggregated rebound data to Data/full_players_rebound_aggregated.csv

Final aggregated DataFrame shape: (4126, 36)


Unnamed: 0,PLAYER_ID,SEASON,TEAM_ID,"('0 Contesting Rebounders', 'DREB')","('0 Contesting Rebounders', 'OREB')","('0 Contesting Rebounders', 'REB')","('0 Contesting Rebounders', 'REB_FREQ')","('0-3 Feet', 'DREB')","('0-3 Feet', 'OREB')","('0-3 Feet', 'REB')",...,"('3-6 Feet', 'REB_FREQ')","('6-10 Feet', 'DREB')","('6-10 Feet', 'OREB')","('6-10 Feet', 'REB')","('6-10 Feet', 'REB_FREQ')","('OVERALL', 'DREB')","('OVERALL', 'OREB')","('OVERALL', 'REB')","('OVERALL', 'REB_CONTESTED')","('OVERALL', 'REB_UNCONTESTED')"
0,1495,2015-16,1.610613e+09,208.0,41.0,249.0,0.568,105.0,30.0,135.0,...,0.454,42.0,19.0,61.0,0.139,329,109,438,189,249
1,1717,2015-16,1.610613e+09,299.0,16.0,315.0,0.645,136.0,15.0,151.0,...,0.449,67.0,8.0,75.0,0.154,437,51,488,173,315
2,1717,2016-17,1.610613e+09,211.0,11.0,222.0,0.647,93.0,5.0,98.0,...,0.487,56.0,6.0,62.0,0.181,322,21,343,121,222
3,1717,2017-18,1.610613e+09,279.0,6.0,285.0,0.655,144.0,5.0,149.0,...,0.490,50.0,4.0,54.0,0.124,415,20,435,150,285
4,1717,2018-19,1.610613e+09,110.0,3.0,113.0,0.715,39.0,2.0,41.0,...,0.494,31.0,3.0,34.0,0.215,153,5,158,45,113
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4121,1642450,2024-25,1.610613e+09,1.0,0.0,1.0,0.500,0.0,0.0,0.0,...,0.500,0.0,0.0,0.0,0.000,2,0,2,1,1
4122,1642461,2024-25,1.610613e+09,4.0,3.0,7.0,0.412,1.0,0.0,1.0,...,0.235,5.0,2.0,7.0,0.412,9,8,17,10,7
4123,1642484,2024-25,1.610613e+09,8.0,2.0,10.0,0.833,0.0,0.0,0.0,...,0.167,3.0,1.0,4.0,0.333,10,2,12,2,10
4124,1642505,2024-25,1.610613e+09,12.0,4.0,16.0,0.615,3.0,0.0,3.0,...,0.308,4.0,1.0,5.0,0.192,17,9,26,10,16


In [None]:
# Load the data
per_year_df = per_year_player_features

# Create sets of (PLAYER_ID, SEASON) tuples
per_year_set = set(zip(per_year_df['PLAYER_ID'], per_year_df['SEASON']))
reb_set = set(zip(full_reb_df['PLAYER_ID'], full_reb_df['SEASON']))

# Find missing combinations (in per_year but NOT in rebound)
missing_in_reb = per_year_set - reb_set

print(f"Total rows in per_year_player_features: {len(per_year_df)}")
print(f"Total rows in full_reb_df: {len(full_reb_df)}")
print(f"Missing combinations in rebound data: {len(missing_in_reb)}")
print(f"\nMissing (PLAYER_ID, SEASON) combinations in rebound data:")

# Convert to DataFrame for easier inspection
missing_df = pd.DataFrame(list(missing_in_reb), columns=['PLAYER_ID', 'SEASON'])
missing_df = missing_df.sort_values(['PLAYER_ID', 'SEASON']).reset_index(drop=True)

# Merge with player names for better readability
missing_with_names = missing_df.merge(
    per_year_df[['PLAYER_ID', 'PLAYER', 'SEASON']].drop_duplicates(),
    on=['PLAYER_ID', 'SEASON'],
    how='left'
)

print(missing_with_names)

missing_with_names.to_csv("Data/missing_in_rebound_data.csv", index=False)
print(f"\n✓ Saved missing combinations to Data/missing_in_rebound_data.csv")

# Show summary by season
print("\nMissing by season:")
print(missing_with_names['SEASON'].value_counts().sort_index())
missing_with_names

Total rows in per_year_player_features: 4327
Total rows in full_reb_df: 4126
Missing combinations in rebound data: 201

Missing (PLAYER_ID, SEASON) combinations in rebound data:
     PLAYER_ID   SEASON           PLAYER
0         2037  2019-20      J. Crawford
1         2207  2021-22       J. Johnson
2       101123  2021-22         G. Green
3       101139  2021-22         C. Miles
4       201144  2021-22      Mike Conley
..         ...      ...              ...
196    1631245  2024-25  Quenton Jackson
197    1641745  2023-24     Adam Flagler
198    1641749  2024-25       K. Johnson
199    1642389  2024-25      Zyon Pullin
200    1642399  2024-25    Jesse Edwards

[201 rows x 3 columns]

✓ Saved missing combinations to Data/missing_in_rebound_data.csv

Missing by season:
SEASON
2015-16     6
2016-17    15
2017-18    13
2018-19    22
2019-20    14
2020-21    18
2021-22    23
2022-23    24
2023-24    38
2024-25    28
Name: count, dtype: int64


In [63]:
# --- 1. SETUP MASTER DATAFRAME ---
defend_cols = ['FREQ', 'D_FGM', 'D_FGA', 'D_FG_PCT', 'NORMAL_FG_PCT', 'PCT_PLUSMINUS']

full_defend_df = per_year_player_features[['PLAYER_ID', 'SEASON']].copy()
full_defend_df['PLAYER_ID'] = full_defend_df['PLAYER_ID'].astype(int)

# Add defense columns, initialized to 0
for col in defend_cols:
    full_defend_df[col] = 0.0

# --- 2. ROBUST PROCESSING FUNCTION ---
def fetch_and_process_defense_season(year, master_df, target_cols, type):
    """Fetch defense stats for a specific season"""
    season_mask = master_df['SEASON'] == year
    current_season_df = master_df[season_mask].copy()

    if current_season_df.empty:
        print(f"No players found for {year}")
        return current_season_df

    try:
        print(f"Fetching defense data for {year}...")
        
        max_retries = 3
        season_df = None

        for attempt in range(max_retries):
            try:
                # Polite delay
                time.sleep(1 + (attempt * 2))
                defen = leaguedashptdefend.LeagueDashPtDefend(
                    season=year,
                    per_mode_simple='PerGame',
                    defense_category=type,
                    timeout=30
                )
                season_df = defen.get_data_frames()[0]
                print(f"Fetched defense data for {year}")
                break
            except Exception as e:
                if attempt == max_retries - 1:
                    print(f"API Error for {year}: {e}")
                    return current_season_df
                time.sleep(2 * (2 ** attempt))
        
        if season_df is None or season_df.empty:
            print(f"No defense data returned for {year}")
            return current_season_df

        # --- FIND PLAYER_ID COLUMN ---
        season_df = season_df.reset_index(drop=True)
        
        pid_col = None
        for c in season_df.columns:
            if 'CLOSE_DEF_PERSON_ID' in str(c):
                pid_col = c
                break
        
        if pid_col is None:
            print(f"PLAYER_ID not found for {year}. Columns: {season_df.columns.tolist()[:5]}")
            return current_season_df
        
        # Rename to standard format
        season_df = season_df.rename(columns={pid_col: 'PLAYER_ID'})
        
        # Standardize ID format
        season_df['PLAYER_ID'] = pd.to_numeric(season_df['PLAYER_ID'], errors='coerce').astype('Int64')
        season_df = season_df.dropna(subset=['PLAYER_ID'])
        
        # Filter to relevant players
        player_ids_for_season = current_season_df['PLAYER_ID'].unique()
        season_df = season_df[season_df['PLAYER_ID'].isin(player_ids_for_season)]

        if season_df.empty:
            print(f"No matching players for {year}")
            return current_season_df

        # Find intersecting defense columns
        intersect_cols = [c for c in target_cols if c in season_df.columns]
        
        if not intersect_cols:
            print(f"No matching defense columns for {year}")
            return current_season_df

        # Update data by matching PLAYER_ID
        for col in intersect_cols:
            for idx, row in current_season_df.iterrows():
                pid = row['PLAYER_ID']
                matching = season_df[season_df['PLAYER_ID'] == pid]
                if not matching.empty:
                    current_season_df.loc[idx, col] = matching.iloc[0][col]
        
        print(f"SUCCESS {year}: Updated {len(season_df)} players with defense stats")
        return current_season_df

    except Exception as e:
        print(f"CRITICAL ERROR processing {year}: {e}")
        season_mask_on_error = master_df['SEASON'] == year
        return master_df[season_mask_on_error].copy()

# --- 3. EXECUTION WITH MULTIPROCESSING ---
all_season_defense_dfs = []
start_time = time.time()

num_workers = min(len(seasons), 4)
with ThreadPoolExecutor(max_workers=num_workers) as executor:
    futures = [
        executor.submit(fetch_and_process_defense_season, year, full_defend_df, defend_cols, 'Overall')
        for year in seasons
    ]
    all_season_defense_dfs = [f.result() for f in futures]

print(f"Finished in {time.time() - start_time:.2f} seconds")

# --- 4. CONCATENATE ---
if all_season_defense_dfs:
    full_defend_df = pd.concat(all_season_defense_dfs, ignore_index=True)
    full_defend_df = full_defend_df.sort_values(
        by=['PLAYER_ID', 'SEASON']
    ).reset_index(drop=True)

# --- 5. SAVE ---
full_defend_df.fillna(0).to_csv("Data/full_players_defense_Overall.csv", index=False)
print("Defense stats saved to Data/full_players_defense_Overall.csv")
full_defend_df

Fetching defense data for 2015-16...
Fetching defense data for 2016-17...
Fetching defense data for 2017-18...
Fetching defense data for 2018-19...
Fetched defense data for 2016-17
Fetched defense data for 2017-18
Fetched defense data for 2015-16
Fetched defense data for 2018-19
SUCCESS 2015-16: Updated 304 players with defense stats
Fetching defense data for 2019-20...
SUCCESS 2016-17: Updated 326 players with defense stats
Fetching defense data for 2020-21...
SUCCESS 2017-18: Updated 365 players with defense stats
Fetching defense data for 2021-22...
SUCCESS 2018-19: Updated 390 players with defense stats
Fetching defense data for 2022-23...
Fetched defense data for 2019-20
Fetched defense data for 2020-21
Fetched defense data for 2021-22
Fetched defense data for 2022-23
SUCCESS 2019-20: Updated 402 players with defense stats
Fetching defense data for 2023-24...
SUCCESS 2020-21: Updated 420 players with defense stats
Fetching defense data for 2024-25...
SUCCESS 2021-22: Updated 462 p

Unnamed: 0,PLAYER_ID,SEASON,FREQ,D_FGM,D_FGA,D_FG_PCT,NORMAL_FG_PCT,PCT_PLUSMINUS
0,1495,2015-16,1.0,4.87,11.26,0.432,0.474,-0.041
1,1717,2015-16,1.0,6.20,13.39,0.463,0.460,0.003
2,1717,2016-17,1.0,5.23,10.70,0.489,0.476,0.012
3,1717,2017-18,1.0,5.83,12.43,0.469,0.477,-0.007
4,1717,2018-19,1.0,4.06,8.25,0.492,0.472,0.020
...,...,...,...,...,...,...,...,...
4322,1642450,2024-25,1.0,0.80,1.60,0.500,0.448,0.052
4323,1642461,2024-25,1.0,0.77,2.54,0.303,0.438,-0.135
4324,1642484,2024-25,1.0,2.43,4.57,0.531,0.436,0.095
4325,1642505,2024-25,1.0,1.62,3.08,0.525,0.437,0.088


In [65]:
full_defend_df = per_year_player_features[['PLAYER_ID', 'SEASON']].copy()
full_defend_df['PLAYER_ID'] = full_defend_df['PLAYER_ID'].astype(int)

# Add defense columns, initialized to 0
for col in defend_cols:
    full_defend_df[col] = 0.0

all_season_defense_dfs = []
start_time = time.time()

num_workers = min(len(seasons), 4)
with ThreadPoolExecutor(max_workers=num_workers) as executor:
    futures = [
        executor.submit(fetch_and_process_defense_season, year, full_defend_df, defend_cols, '3 Pointers')
        for year in seasons
    ]
    all_season_defense_dfs = [f.result() for f in futures]

print(f"Finished in {time.time() - start_time:.2f} seconds")

# --- 4. CONCATENATE ---
if all_season_defense_dfs:
    full_defend_df = pd.concat(all_season_defense_dfs, ignore_index=True)
    full_defend_df = full_defend_df.sort_values(
        by=['PLAYER_ID', 'SEASON']
    ).reset_index(drop=True)

# --- 5. SAVE ---
full_defend_df.fillna(0).to_csv("Data/full_players_defense_3_Pointers.csv", index=False)
print("Defense stats saved to Data/full_players_defense_3_Pointers.csv")
full_defend_df

Fetching defense data for 2015-16...
Fetching defense data for 2016-17...
Fetching defense data for 2017-18...
Fetching defense data for 2018-19...
Fetched defense data for 2018-19
Fetched defense data for 2016-17
Fetched defense data for 2015-16
Fetched defense data for 2017-18
SUCCESS 2016-17: Updated 325 players with defense stats
Fetching defense data for 2019-20...
SUCCESS 2018-19: Updated 388 players with defense stats
Fetching defense data for 2020-21...
SUCCESS 2015-16: Updated 302 players with defense stats
Fetching defense data for 2021-22...
SUCCESS 2017-18: Updated 362 players with defense stats
Fetching defense data for 2022-23...
Fetched defense data for 2019-20
Fetched defense data for 2022-23
Fetched defense data for 2020-21
Fetched defense data for 2021-22
SUCCESS 2020-21: Updated 420 players with defense stats
Fetching defense data for 2023-24...
SUCCESS 2019-20: Updated 401 players with defense stats
Fetching defense data for 2024-25...
SUCCESS 2022-23: Updated 459 p

Unnamed: 0,PLAYER_ID,SEASON,FREQ,D_FGM,D_FGA,D_FG_PCT,NORMAL_FG_PCT,PCT_PLUSMINUS
0,1495,2015-16,0.067,0.0,0.0,0.0,0.0,0.0
1,1717,2015-16,0.247,0.0,0.0,0.0,0.0,0.0
2,1717,2016-17,0.229,0.0,0.0,0.0,0.0,0.0
3,1717,2017-18,0.242,0.0,0.0,0.0,0.0,0.0
4,1717,2018-19,0.283,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
4322,1642450,2024-25,0.375,0.0,0.0,0.0,0.0,0.0
4323,1642461,2024-25,0.545,0.0,0.0,0.0,0.0,0.0
4324,1642484,2024-25,0.375,0.0,0.0,0.0,0.0,0.0
4325,1642505,2024-25,0.450,0.0,0.0,0.0,0.0,0.0


In [66]:
full_defend_df = per_year_player_features[['PLAYER_ID', 'SEASON']].copy()
full_defend_df['PLAYER_ID'] = full_defend_df['PLAYER_ID'].astype(int)

# Add defense columns, initialized to 0
for col in defend_cols:
    full_defend_df[col] = 0.0

all_season_defense_dfs = []
start_time = time.time()

num_workers = min(len(seasons), 4)
with ThreadPoolExecutor(max_workers=num_workers) as executor:
    futures = [
        executor.submit(fetch_and_process_defense_season, year, full_defend_df, defend_cols, '2 Pointers')
        for year in seasons
    ]
    all_season_defense_dfs = [f.result() for f in futures]

print(f"Finished in {time.time() - start_time:.2f} seconds")

# --- 4. CONCATENATE ---
if all_season_defense_dfs:
    full_defend_df = pd.concat(all_season_defense_dfs, ignore_index=True)
    full_defend_df = full_defend_df.sort_values(
        by=['PLAYER_ID', 'SEASON']
    ).reset_index(drop=True)

# --- 5. SAVE ---
full_defend_df.fillna(0).to_csv("Data/full_players_defense_2_Pointers.csv", index=False)
print("Defense stats saved to Data/full_players_defense_2_Pointers.csv")
full_defend_df

Fetching defense data for 2015-16...
Fetching defense data for 2016-17...
Fetching defense data for 2017-18...
Fetching defense data for 2018-19...
Fetched defense data for 2018-19
SUCCESS 2018-19: Updated 390 players with defense stats
Fetching defense data for 2019-20...
Fetched defense data for 2016-17
SUCCESS 2016-17: Updated 326 players with defense stats
Fetching defense data for 2020-21...
Fetched defense data for 2015-16
SUCCESS 2015-16: Updated 304 players with defense stats
Fetching defense data for 2021-22...
Fetched defense data for 2017-18
SUCCESS 2017-18: Updated 363 players with defense stats
Fetching defense data for 2022-23...
Fetched defense data for 2020-21
SUCCESS 2020-21: Updated 420 players with defense stats
Fetching defense data for 2023-24...
Fetched defense data for 2022-23
SUCCESS 2022-23: Updated 460 players with defense stats
Fetching defense data for 2024-25...
Fetched defense data for 2019-20
SUCCESS 2019-20: Updated 401 players with defense stats
Fetched

Unnamed: 0,PLAYER_ID,SEASON,FREQ,D_FGM,D_FGA,D_FG_PCT,NORMAL_FG_PCT,PCT_PLUSMINUS
0,1495,2015-16,0.933,0.0,0.0,0.0,0.0,0.0
1,1717,2015-16,0.753,0.0,0.0,0.0,0.0,0.0
2,1717,2016-17,0.771,0.0,0.0,0.0,0.0,0.0
3,1717,2017-18,0.758,0.0,0.0,0.0,0.0,0.0
4,1717,2018-19,0.717,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
4322,1642450,2024-25,0.625,0.0,0.0,0.0,0.0,0.0
4323,1642461,2024-25,0.455,0.0,0.0,0.0,0.0,0.0
4324,1642484,2024-25,0.625,0.0,0.0,0.0,0.0,0.0
4325,1642505,2024-25,0.550,0.0,0.0,0.0,0.0,0.0


In [None]:
year_by_year = playerdashboardbyyearoveryear.PlayerDashboardByYearOverYear(player_id=203507)
year_df = year_by_year.get_data_frames()[1]
year_df
cols_yearly = ['GP', 'MIN', 'AST', 'TOV', 'STL', 'BLK', 'PF', 'PFD','PTS', 'PLUS_MINUS']

Unnamed: 0,GROUP_SET,GROUP_VALUE,TEAM_ID,TEAM_ABBREVIATION,MAX_GAME_DATE,GP,W,L,W_PCT,MIN,...,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,WNBA_FANTASY_PTS_RANK
0,By Year,2025-26,1610612749,MIL,2025-12-01T00:00:00,16,8,8,0.5,492.383333,...,13,1,1,13,13,9,13,12,11,13
1,By Year,2024-25,1610612749,MIL,2025-04-11T00:00:00,67,40,27,0.597,2288.616666,...,8,9,2,7,2,7,5,3,1,6
2,By Year,2023-24,1610612749,MIL,2024-04-09T00:00:00,73,45,28,0.616,2567.233333,...,7,12,7,1,1,6,1,1,2,1
3,By Year,2022-23,1610612749,MIL,2023-04-04T00:00:00,63,47,16,0.746,2023.683333,...,12,10,6,3,6,5,8,5,4,8
4,By Year,2021-22,1610612749,MIL,2022-04-08T00:00:00,67,45,22,0.672,2204.25,...,5,5,8,2,4,4,6,5,7,5
5,By Year,2020-21,1610612749,MIL,2021-05-15T00:00:00,61,40,21,0.656,2012.733333,...,9,3,3,9,9,3,9,8,3,9
6,By Year,2019-20,1610612749,MIL,2020-08-11T00:00:00,63,51,12,0.81,1916.9,...,10,7,5,5,7,1,7,2,7,7
7,By Year,2018-19,1610612749,MIL,2019-04-07T00:00:00,72,56,16,0.778,2358.216666,...,3,13,10,4,5,2,2,4,5,2
8,By Year,2017-18,1610612749,MIL,2018-04-11T00:00:00,75,39,36,0.52,2756.216666,...,4,11,9,6,3,8,4,7,10,3
9,By Year,2016-17,1610612749,MIL,2017-04-10T00:00:00,80,42,38,0.525,2845.05,...,1,8,11,8,8,10,3,9,9,4


In [11]:

# --- Helper Function ---
def fetch_and_process_player_yearly_stats(pid, seasons_list, target_cols):
    """
    Fetches year-by-year stats for a single player.
    Handles multi-team seasons by aggregating stats.
    Returns a list of dictionaries (one per season).
    """
    max_retries = 3
    base_delay = 0.5
    
    time.sleep(0.6)  # Polite delay
    
    try:
        # Fetch year-by-year dashboard
        for attempt in range(max_retries):
            try:
                year_obj = playerdashboardbyyearoveryear.PlayerDashboardByYearOverYear(
                    player_id=pid,
                    timeout=15
                )
                year_frames = year_obj.get_data_frames()
                break
            except (ConnectionError, ReadTimeout) as e:
                if attempt == max_retries - 1:
                    print(f"  -> Failed to fetch year-by-year for PID {pid} after {max_retries} attempts")
                    return None
                delay = base_delay * (2 ** attempt)
                time.sleep(delay)
        
        if not year_frames or len(year_frames) < 2:
            return None
        
        year_df = year_frames[1]  # [1] is the ByYearPlayerDashboard
        
        if year_df is None or year_df.empty:
            return None
        
        # --- FILTER & CLEAN ---
        # Remove rows with TEAM_ID == -1 (totals)
        year_df = year_df[year_df["TEAM_ID"] != -1].copy()
        
        # Filter to only seasons we care about
        year_df = year_df[year_df["GROUP_VALUE"].isin(seasons_list)].copy()
        
        if year_df.empty:
            return None
        
        # --- AGGREGATE MULTI-TEAM SEASONS ---
        # Group by season to handle players who played on multiple teams
        result_rows = []
        
        for season, season_group in year_df.groupby("GROUP_VALUE"):
            row_dict = {
                "PLAYER_ID": pid,
                "SEASON": season
            }
            
            # For most stats: sum them (games played, minutes, etc.)
            sum_cols = ['GP', 'MIN', 'AST', 'TOV', 'STL', 'BLK', 'PF', 'PFD', 'PTS']
            for col in sum_cols:
                if col in season_group.columns:
                    row_dict[col] = season_group[col].sum()
                else:
                    row_dict[col] = 0
            
            # For PLUS_MINUS: weighted average by GP
            # plus_minus_per_game = total_plus_minus / total_gp
            # When aggregating: weighted_sum = sum(plus_minus_per_game * gp for each team)
            # Then: final_plus_minus_per_game = weighted_sum / total_gp
            if 'PLUS_MINUS' in season_group.columns and 'GP' in season_group.columns:
                # Get total GP for the season
                total_gp_season = season_group['GP'].sum()
                
                if total_gp_season > 0:
                    # Calculate weighted sum of plus-minus
                    # Each row's plus_minus is already per-game, so multiply by GP to get total
                    weighted_pm_sum = (season_group['PLUS_MINUS'] * season_group['GP']).sum()
                    # Average it back out
                    row_dict['PLUS_MINUS'] = weighted_pm_sum / total_gp_season
                else:
                    row_dict['PLUS_MINUS'] = 0
            else:
                row_dict['PLUS_MINUS'] = 0
            
            result_rows.append(row_dict)
        
        return result_rows if result_rows else None
    
    except Exception as e:
        print(f"  !! Unexpected error for PID {pid}: {e}")
        return None

# --- MAIN EXECUTION ---
print("Starting year-by-year stats fetch for all players...")
start_time = time.time()

# Get unique player IDs from full_players_features
player_ids = full_players_features['PLAYER_ID'].unique().tolist()
print(f"Processing {len(player_ids)} unique players...")

target_cols = ['GP', 'MIN', 'AST', 'TOV', 'STL', 'BLK', 'PF', 'PFD', 'PTS', 'PLUS_MINUS']

# Collect all year-by-year stats
all_yearly_stats = []
num_workers = 5

with ThreadPoolExecutor(max_workers=num_workers) as executor:
    futures = [
        executor.submit(fetch_and_process_player_yearly_stats, pid, seasons, target_cols)
        for pid in player_ids
    ]
    
    for i, future in enumerate(futures):
        if (i + 1) % 50 == 0 or (i + 1) == len(player_ids):
            print(f"Processed {i+1}/{len(player_ids)} players...")
        
        result = future.result()
        if result is not None:
            all_yearly_stats.extend(result)

print(f"Collected {len(all_yearly_stats)} player-season records")

# Create a temporary DataFrame from collected stats
yearly_stats_df = pd.DataFrame(all_yearly_stats)

if yearly_stats_df.empty:
    print("ERROR: No yearly stats collected!")
else:
    print(f"Yearly stats DataFrame shape: {yearly_stats_df.shape}")
    print(f"Columns: {yearly_stats_df.columns.tolist()}")
    
    # --- UPDATE per_year_player_features ---
    for col in target_cols:
        if col not in per_year_player_features.columns:
            per_year_player_features[col] = 0
    # For each column in target_cols, update the corresponding row in per_year_player_features
    
    for col in target_cols:
        if col in yearly_stats_df.columns and col in per_year_player_features.columns:
            # Create a lookup: (PLAYER_ID, SEASON) -> value
            lookup = yearly_stats_df.set_index(['PLAYER_ID', 'SEASON'])[col]
            
            # Update per_year_player_features
            for idx, row in per_year_player_features.iterrows():
                pid = row['PLAYER_ID']
                season = row['SEASON']
                
                if (pid, season) in lookup.index:
                    per_year_player_features.loc[idx, col] = lookup[(pid, season)]
    
    print("✓ Updated per_year_player_features with yearly stats")
    
    # Save the updated DataFrame
    per_year_player_features.to_csv("Data/Updating_per_year_player_features_stats.csv", index=False)
    print("✓ Saved to Data/Updating_per_year_player_features_stats.csv")

end_time = time.time()
print(f"\n--- Total time taken: {end_time - start_time:.2f} seconds ---")

# Display sample
print("\nSample of updated data:")
print(per_year_player_features[['PLAYER_ID', 'SEASON', 'GP', 'MIN', 'PTS', 'PLUS_MINUS']].head(10))

Starting year-by-year stats fetch for all players...
Processing 885 unique players...
Processed 50/885 players...
Processed 100/885 players...
Processed 150/885 players...
Processed 200/885 players...
Processed 250/885 players...
Processed 300/885 players...
Processed 350/885 players...
Processed 400/885 players...
Processed 450/885 players...
Processed 500/885 players...
Processed 550/885 players...
Processed 600/885 players...
Processed 650/885 players...
Processed 700/885 players...
Processed 750/885 players...
Processed 800/885 players...
Processed 850/885 players...
Processed 885/885 players...
Collected 4327 player-season records
Yearly stats DataFrame shape: (4327, 12)
Columns: ['PLAYER_ID', 'SEASON', 'GP', 'MIN', 'AST', 'TOV', 'STL', 'BLK', 'PF', 'PFD', 'PTS', 'PLUS_MINUS']
✓ Updated per_year_player_features with yearly stats
✓ Saved to Data/Updating_per_year_player_features_stats.csv

--- Total time taken: 1038.92 seconds ---

Sample of updated data:
   PLAYER_ID   SEASON  GP 

In [None]:
per_year_player_features

## we jump back here to pull a few more features for our final df in clustering notebook

In [30]:
project_root = "../"
data_path = project_root + "data/interim/clusters/"
df_players_seasons = pd.read_csv(data_path + "players_for_clusters_filtered.csv")

In [9]:
from nba_api.stats.library.http import NBAStatsHTTP

custom_headers = {
    "Host": "stats.nba.com",
    "Connection": "keep-alive",
    "Accept": "application/json, text/plain, */*",
    "x-nba-stats-token": "true",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "x-nba-stats-origin": "stats",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-Mode": "cors",
    "Referer": "https://www.nba.com/",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
}

In [None]:
from time import sleep
tracking_stats =leaguedashptstats.LeagueDashPtStats(
    season='2022-23',
    #team_id_nullable=1610612746,
    per_mode_simple='PerGame',
    player_or_team = 'Player',
    pt_measure_type='Defense',
    #last_n_games= 10,
    season_type_all_star='Regular Season',
    #headers=custom_headers,
    timeout=30
)
tracking_stats_df = tracking_stats.get_data_frames()[0]
tracking_stats_df

Unnamed: 0,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,GP,W,L,MIN,MIN1,DIST_FEET,DIST_MILES,DIST_MILES_OFF,DIST_MILES_DEF,AVG_SPEED,AVG_SPEED_OFF,AVG_SPEED_DEF
0,1630639,A.J. Lawson,1610612742,DAL,15,5,10,7.2,7.2,3155.2,0.6,0.3,0.3,4.64,4.88,4.30
1,1631260,AJ Green,1610612749,MIL,35,27,8,9.9,9.9,4053.2,0.8,0.4,0.3,4.35,4.73,3.96
2,1631100,AJ Griffin,1610612737,ATL,72,34,38,19.5,19.5,8386.3,1.6,0.9,0.7,4.56,4.96,4.17
3,203932,Aaron Gordon,1610612743,DEN,68,45,23,30.2,30.2,11320.4,2.1,1.1,1.0,3.99,4.22,3.77
4,1628988,Aaron Holiday,1610612737,ATL,63,32,31,13.4,13.4,5413.9,1.0,0.5,0.5,4.27,4.35,4.18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
534,1628380,Zach Collins,1610612759,SAS,61,19,42,22.9,22.9,8779.4,1.7,0.9,0.8,4.05,4.39,3.71
535,203897,Zach LaVine,1610612741,CHI,76,37,39,36.0,36.0,13892.7,2.6,1.4,1.2,4.11,4.53,3.72
536,1630192,Zeke Nnaji,1610612743,DEN,52,34,18,14.0,14.0,5437.5,1.0,0.6,0.5,4.11,4.28,3.92
537,1630533,Ziaire Williams,1610612763,MEM,37,21,16,15.2,15.2,6061.3,1.1,0.6,0.6,4.22,4.37,4.08


In [23]:
print(tracking_stats_df.columns)

Index(['PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_ABBREVIATION', 'GP', 'W',
       'L', 'MIN', 'MIN1', 'DIST_FEET', 'DIST_MILES', 'DIST_MILES_OFF',
       'DIST_MILES_DEF', 'AVG_SPEED', 'AVG_SPEED_OFF', 'AVG_SPEED_DEF'],
      dtype='object')


In [10]:
import time
import random

def pull_tracking_safe(**kwargs):
    time.sleep(10 + random.uniform(5, 10))
    return leaguedashptstats.LeagueDashPtStats(
        headers=custom_headers,
        timeout=60,
        **kwargs
    ).get_data_frames()[0]


In [18]:
df = pull_tracking_safe(
    season="2022-23",
    season_type_all_star="Regular Season",
    per_mode_simple="PerGame",
    player_or_team="Player",
    pt_measure_type="PullUpShot"
)


In [19]:
print(df.columns)

Index(['PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_ABBREVIATION', 'GP', 'W',
       'L', 'MIN', 'PULL_UP_FGM', 'PULL_UP_FGA', 'PULL_UP_FG_PCT',
       'PULL_UP_PTS', 'PULL_UP_FG3M', 'PULL_UP_FG3A', 'PULL_UP_FG3_PCT',
       'PULL_UP_EFG_PCT'],
      dtype='object')


In [41]:
NBAStatsHTTP.DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Referer": "https://www.nba.com/",
    "Origin": "https://www.nba.com",
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "en-US,en;q=0.9",
    "x-nba-stats-origin": "stats",
    "x-nba-stats-token": "true",
}

import time
import random

def polite_sleep():
    time.sleep(random.uniform(3.5, 6.0))
import time
from json import JSONDecodeError

def safe_call(func, max_retries=5):
    for attempt in range(max_retries):
        try:
            return func()
        except (JSONDecodeError, Exception) as e:
            wait = 5 * (attempt + 1)
            time.sleep(wait)
    return None

In [42]:
tracking_stats =safe_call(
    lambda : leaguedashptstats.LeagueDashPtStats(
    season='2024-25',
    per_mode_simple='Totals',
    player_or_team = 'Player',
    pt_measure_type='Drives',
    last_n_games= 10,
    season_type_all_star='Regular Season',
    timeout=30
    )
)
tracking_stats_df = tracking_stats.get_data_frames()[0]
tracking_stats_df

KeyboardInterrupt: 

In [20]:
custom_headers = {
    "Host": "stats.nba.com",
    "Connection": "keep-alive",
    "Accept": "application/json, text/plain, */*",
    "x-nba-stats-token": "true",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "x-nba-stats-origin": "stats",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-Mode": "cors",
    "Referer": "https://www.nba.com/",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
}
SEASONS = [f"{y}-{str(y+1)[-2:]}" for y in range(2015, 2025)]


In [21]:
import time
import random

def safe_pull_ptstats(
    season,
    pt_measure_type,
    per_mode,
    timeout=60
):
    # behave like a human
    time.sleep(12 + random.uniform(3, 7))

    stats = leaguedashptstats.LeagueDashPtStats(
        season=season,
        season_type_all_star="Regular Season",
        per_mode_simple=per_mode,
        player_or_team="Player",
        pt_measure_type=pt_measure_type,
        headers=custom_headers,
        timeout=timeout
    )

    df = stats.get_data_frames()[0]
    df["SEASON"] = season
    return df


In [22]:
MEASURES = {
    "possessions": {
        "pt_measure_type": "Possessions",
        "per_mode": "Totals"
    },
    "drives": {
        "pt_measure_type": "Drives",
        "per_mode": "Totals"
    },
    "passing": {
        "pt_measure_type": "Passing",
        "per_mode": "Totals"
    },
    "speeddistance": {
        "pt_measure_type": "SpeedDistance",
        "per_mode": "PerGame"   # ONLY mode that works
    },
    "catchshoot": {
        "pt_measure_type": "CatchShoot",
        "per_mode": "PerGame"   # flaky, keep light
    },
    "pullupshot": {
        "pt_measure_type": "PullUpShot",
        "per_mode": "PerGame"
    }
}


In [24]:
results = {name: [] for name in MEASURES}

for season in SEASONS:
    print(f"\n=== Season {season} ===")

    for name, cfg in MEASURES.items():
        print(f"Pulling {name}...")

        try:
            df = safe_pull_ptstats(
                season=season,
                pt_measure_type=cfg["pt_measure_type"],
                per_mode=cfg["per_mode"]
            )
            results[name].append(df)
            print(f"  ✔ {name}: {df.shape}")

        except Exception as e:
            print(f"  ✖ {name} failed for {season}: {e}")
            # IMPORTANT: do not retry immediately
            continue



=== Season 2015-16 ===
Pulling possessions...
  ✔ possessions: (476, 22)
Pulling drives...
  ✔ drives: (476, 26)
Pulling passing...
  ✔ passing: (476, 19)
Pulling speeddistance...
  ✔ speeddistance: (476, 17)
Pulling catchshoot...
  ✔ catchshoot: (476, 17)
Pulling pullupshot...
  ✔ pullupshot: (476, 17)

=== Season 2016-17 ===
Pulling possessions...
  ✔ possessions: (486, 22)
Pulling drives...
  ✔ drives: (486, 26)
Pulling passing...
  ✔ passing: (486, 19)
Pulling speeddistance...
  ✔ speeddistance: (486, 17)
Pulling catchshoot...
  ✔ catchshoot: (486, 17)
Pulling pullupshot...
  ✔ pullupshot: (486, 17)

=== Season 2017-18 ===
Pulling possessions...
  ✔ possessions: (540, 22)
Pulling drives...
  ✔ drives: (540, 26)
Pulling passing...
  ✔ passing: (540, 19)
Pulling speeddistance...
  ✔ speeddistance: (540, 17)
Pulling catchshoot...
  ✔ catchshoot: (540, 17)
Pulling pullupshot...
  ✔ pullupshot: (540, 17)

=== Season 2018-19 ===
Pulling possessions...
  ✔ possessions: (530, 22)
Pulling 

In [28]:
df = safe_pull_ptstats(
                season='2018-19',
                pt_measure_type='CatchShoot',
                per_mode='PerGame'
            )
results['catchshoot'].append(df)

In [29]:
tracking_dfs = {}

for name, dfs in results.items():
    if len(dfs) == 0:
        continue

    df_all = pd.concat(dfs, ignore_index=True)

    # normalize column types (important for merges & parquet)
    df_all.columns = df_all.columns.astype(str)

    tracking_dfs[name] = df_all

    print(f"{name}: {df_all.shape}")

for name, df in tracking_dfs.items():
    df.to_parquet(data_path + f"tracking/tracking_{name}_2016_2025.parquet", index=False)



possessions: (5386, 22)
drives: (5386, 26)
passing: (5386, 19)
speeddistance: (5386, 17)
catchshoot: (5386, 17)
pullupshot: (5386, 17)


In [31]:
valid_pairs = set(
    zip(df_players_seasons["PLAYER_ID"], df_players_seasons["SEASON"])
)

In [32]:
def filter_by_player_season(df, valid_pairs):
    mask = [
        (pid, season) in valid_pairs
        for pid, season in zip(df["PLAYER_ID"], df["SEASON"])
    ]
    return df.loc[mask].copy()


In [34]:
POSSESSIONS_COLS = [
    "PLAYER_ID", "SEASON", "MIN",
    "TOUCHES", "FRONT_CT_TOUCHES", "TIME_OF_POSS",
    "AVG_SEC_PER_TOUCH", "AVG_DRIB_PER_TOUCH",
    "ELBOW_TOUCHES", "POST_TOUCHES", "PAINT_TOUCHES",
    "PTS_PER_TOUCH", "PTS_PER_PAINT_TOUCH"
]
DRIVES_COLS = [
    "PLAYER_ID", "SEASON", "MIN",
    "DRIVES", "DRIVE_FG_PCT",
    "DRIVE_PTS", "DRIVE_AST",
    "DRIVE_TOV", "DRIVE_PF"
]
SPEED_DISTANCE_COLS = [
    "PLAYER_ID", "SEASON", "MIN",
    "DIST_MILES", "DIST_MILES_OFF",
    "DIST_MILES_DEF", "AVG_SPEED",
    "AVG_SPEED_OFF", "AVG_SPEED_DEF"
]
PASSING_COLS = [
    "PLAYER_ID", "SEASON", "MIN",
    "PASSES_MADE", "AST", "SECONDARY_AST",
    "POTENTIAL_AST", "AST_TO_PASS_PCT"
]
CATCH_SHOOT_COLS = [
    "PLAYER_ID", "SEASON", "MIN",
    "CATCH_SHOOT_FG_PCT", "CATCH_SHOOT_FG3_PCT", "CATCH_SHOOT_EFG_PCT",
    "CATCH_SHOOT_FG3A", "CATCH_SHOOT_FGA"
]
PULLUP_COLS = [
    "PLAYER_ID", "SEASON", "MIN",
    "PULL_UP_FG_PCT", "PULL_UP_FG3_PCT", "PULL_UP_EFG_PCT",
    "PULL_UP_FGA"
]
data_path = project_root + "data/interim/clusters/tracking/"

In [54]:
df_poss = pd.read_parquet(data_path + "tracking_possessions_2016_2025.parquet")
df_poss = filter_by_player_season(df_poss, valid_pairs)
df_poss = df_poss[POSSESSIONS_COLS]

# engineered features
df_poss["touches_per_min"] = df_poss["TOUCHES"] / df_poss["MIN"]
df_poss["time_poss_per_min"] = df_poss["TIME_OF_POSS"] / df_poss["MIN"]
df_poss["front_ct_touches_per_min"] = df_poss["FRONT_CT_TOUCHES"] / df_poss["MIN"]

df_poss["paint_touch_share"] = df_poss["PAINT_TOUCHES"] / df_poss["TOUCHES"]
df_poss["post_touch_share"] = df_poss["POST_TOUCHES"] / df_poss["TOUCHES"]
df_poss["elbow_touch_share"] = df_poss["ELBOW_TOUCHES"] / df_poss["TOUCHES"]

df_poss = df_poss.drop(
    columns=["TOUCHES", "TIME_OF_POSS", "MIN",
            "PAINT_TOUCHES", "POST_TOUCHES", "ELBOW_TOUCHES", "FRONT_CT_TOUCHES"]
).round(2)


In [55]:
df_drives = pd.read_parquet(data_path + "tracking_drives_2016_2025.parquet")
df_drives = filter_by_player_season(df_drives, valid_pairs)
df_drives = df_drives[DRIVES_COLS]

df_drives["drives_per_min"] = df_drives["DRIVES"] / df_drives["MIN"]
df_drives["drive_pts_per_drive"] = df_drives["DRIVE_PTS"] / df_drives["DRIVES"]
df_drives["drive_ast_rate"] = df_drives["DRIVE_AST"] / df_drives["DRIVES"]
df_drives["drive_tov_rate"] = df_drives["DRIVE_TOV"] / df_drives["DRIVES"]
df_drives["drive_pf_rate"] = df_drives["DRIVE_PF"] / df_drives["DRIVES"]

df_drives = df_drives.drop(columns=["DRIVES", "DRIVE_PTS", "DRIVE_AST",
                                    "DRIVE_TOV", "DRIVE_PF", "MIN"]).round(2)


In [56]:
df_speed = pd.read_parquet(data_path + "tracking_speeddistance_2016_2025.parquet")
df_speed = filter_by_player_season(df_speed, valid_pairs)
df_speed = df_speed[SPEED_DISTANCE_COLS]

df_speed["dist_miles_per_48min"] = (df_speed["DIST_MILES"] / df_speed["MIN"])*48
df_speed["dist_miles_off_per_48min"] = (df_speed["DIST_MILES_OFF"] / df_speed["MIN"])*48
df_speed["dist_miles_def_per_48min"] = (df_speed["DIST_MILES_DEF"] / df_speed["MIN"])*48

df_speed = df_speed.drop(columns=["DIST_MILES", "DIST_MILES_OFF", "DIST_MILES_DEF", "MIN"]).round(3)


In [57]:
df_pass = pd.read_parquet(data_path + "tracking_passing_2016_2025.parquet")
df_pass = filter_by_player_season(df_pass, valid_pairs)
df_pass = df_pass[PASSING_COLS]

df_pass["passes_per_min"] = df_pass["PASSES_MADE"] / df_pass["MIN"]
df_pass["ast_per_pass"] = df_pass["AST"] / df_pass["PASSES_MADE"]
df_pass["secondary_ast_rate"] = df_pass["SECONDARY_AST"] / df_pass["PASSES_MADE"]
df_pass["potential_ast_rate"] = df_pass["POTENTIAL_AST"] / df_pass["PASSES_MADE"]

df_pass = df_pass.drop(columns=["PASSES_MADE", "AST", "SECONDARY_AST", "POTENTIAL_AST", "MIN"]).round(2)


In [59]:
df_catch = pd.read_parquet(data_path + "tracking_catchshoot_2016_2025.parquet")
df_catch = filter_by_player_season(df_catch, valid_pairs)
df_catch = df_catch[CATCH_SHOOT_COLS]

df_catch["catch_fga_per_min"] = df_catch["CATCH_SHOOT_FGA"] / df_catch["MIN"]
df_catch["catch_fg3a_per_min"] = df_catch["CATCH_SHOOT_FG3A"] / df_catch["MIN"]

df_catch = df_catch.drop(columns=["CATCH_SHOOT_FGA", "CATCH_SHOOT_FG3A", "MIN"]).round(2)


In [60]:
df_pull = pd.read_parquet(data_path + "tracking_pullupshot_2016_2025.parquet")
df_pull = filter_by_player_season(df_pull, valid_pairs)
df_pull = df_pull[PULLUP_COLS]

df_pull["pullup_fga_per_min"] = df_pull["PULL_UP_FGA"] / df_pull["MIN"]

df_pull = df_pull.drop(columns=["PULL_UP_FGA", "MIN"]).round(2)


In [61]:
dfs = [df_poss, df_drives, df_speed, df_pass, df_catch, df_pull]

df_model_tracking = dfs[0]
for df in dfs[1:]:
    df_model_tracking = df_model_tracking.merge(
        df,
        on=["PLAYER_ID", "SEASON"],
        how="left"
    )


In [62]:
df_model_tracking = df_model_tracking.fillna(0)

In [63]:
data_path = project_root + "data/interim/clusters/"
df_model_tracking.to_csv(data_path + "df_tracking.csv", index=False)

In [None]:
df_model_tracking