In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [4]:
!pip install nba_api



In [5]:
from nba_api.stats.static import teams #to get team ids
from nba_api.stats.endpoints import commonteamroster #from here we want the player ids (clusters + models)
from nba_api.stats.endpoints import commonplayerinfo
from nba_api.stats.endpoints import shotchartlineupdetail #from here we want the shot charts for our lineups (for visualizations)
from nba_api.stats.endpoints import leaguelineupviz #from here we want lineup shot frequencies (for visualizations)
from nba_api.stats.endpoints import leaguedashlineups #from here we want league average lineup stats (might not use)
from nba_api.stats.endpoints import teamdashlineups #from here we want team lineup stats (for the main analysis)

import time
from multiprocessing import Pool
import logging
from time import sleep
from concurrent.futures import ThreadPoolExecutor
from requests.exceptions import ConnectionError, ReadTimeout
import warnings
from pandas.errors import PerformanceWarning

In [6]:
teams_list = teams.get_teams()
team_ids = [team['id'] for team in teams_list]
team_id_name_map = {team['id']: team['full_name'] for team in teams_list}

seasons = [f"{year}-{str(year+1)[-2:]}" for year in range(2015, 2025)]

In [6]:
allstar_ids = [203999, 201939, 201935, 202695, 2544, 1629029, 203507, 1630162, 1626164, 201142, 1628983]
season_team_df = pd.read_csv("Data/allstars_season_team.csv")
target_ids_str = [str(int(x)) for x in allstar_ids]
combos = list(season_team_df[['TEAM_ID', 'GROUP_VALUE']].drop_duplicates().itertuples(index=False, name=None))

results = []
count = 0
for team_id, season1 in combos:
    count += 1
    tried = 0
    success = False
    while tried < 3 and not success:
        try:
            obj = teamdashlineups.TeamDashLineups(
                team_id=int(team_id),
                season=str(season1),
                measure_type_detailed_defense="Advanced",
                per_mode_detailed="Per48",
                group_quantity=5,
            )
            lineups_df = obj.get_data_frames()[1]
            if lineups_df is not None and not lineups_df.empty:
                results.append(lineups_df)
            print(f"Fetched lineups for team {team_id} season {season1} ({count}/{len(combos)})")
            success = True
        except Exception as e:
            tried += 1
            print(f"Fetch failed for {team_id} {season1} (attempt {tried}): {e}")
            sleep(0.6 * tried)  # polite backoff

# single concat 
if results:
    full_lineups_data = pd.concat(results, ignore_index=True)
else:
    full_lineups_data = pd.DataFrame()

if not full_lineups_data.empty:
    full_lineups_data['GROUP_ID'] = full_lineups_data['GROUP_ID'].str[1:-1]
    mask = full_lineups_data['GROUP_ID'].apply(
        lambda s: any(pid in target_ids_str for pid in s.split('-'))
    )
    filtered = full_lineups_data[mask].copy()
    full_lineups_data = filtered
    full_lineups_data.to_csv("Data/allstar_lineups_data.csv", index=False)
    print("✓ Saved to Data/allstar_lineups_data.csv")


Fetch failed for 1610612743 2024-25 (attempt 1): HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Fetched lineups for team 1610612743 season 2024-25 (1/90)
Fetched lineups for team 1610612743 season 2023-24 (2/90)
Fetched lineups for team 1610612743 season 2022-23 (3/90)
Fetched lineups for team 1610612743 season 2021-22 (4/90)
Fetched lineups for team 1610612743 season 2020-21 (5/90)
Fetched lineups for team 1610612743 season 2019-20 (6/90)
Fetched lineups for team 1610612743 season 2018-19 (7/90)
Fetched lineups for team 1610612743 season 2017-18 (8/90)
Fetched lineups for team 1610612743 season 2016-17 (9/90)
Fetched lineups for team 1610612743 season 2015-16 (10/90)
Fetch failed for 1610612744 2024-25 (attempt 1): HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Fetched lineups for team 1610612744 season 2024-25 (11/90)
Fetched lineups for team 1610612744 season 2023-24 (12/90)
Fetched lineups for team 1610

In [None]:
allstar_ids = [203999, 201939, 201935, 202695, 2544, 1629029, 203507, 1630162, 1626164, 201142, 1628983]
season_team_df = pd.read_csv("Data/allstars_season_team.csv")
target_ids_str = [str(int(x)) for x in allstar_ids]
combos = list(season_team_df[['TEAM_ID', 'GROUP_VALUE']].drop_duplicates().itertuples(index=False, name=None))

# --- HELPER FUNCTION ---
def fetch_lineup_data(combo_info):
    """
    Fetches lineup data for a single team-season combo.
    Returns a tuple: (team_id, season, defense_type, dataframe)
    """
    team_id, season, defense_type = combo_info
    max_retries = 3
    
    for attempt in range(max_retries):
        try:
            time.sleep(0.6)  # Polite delay
            obj = teamdashlineups.TeamDashLineups(
                team_id=int(team_id),
                season=str(season),
                measure_type_detailed_defense=defense_type,
                per_mode_detailed="Per48",
                group_quantity=5,
            )
            lineups_df = obj.get_data_frames()[1]
            
            if lineups_df is not None and not lineups_df.empty:
                return (team_id, season, defense_type, lineups_df)
            else:
                return (team_id, season, defense_type, None)
                
        except Exception as e:
            if attempt == max_retries - 1:
                print(f"  ✗ Failed {team_id} {season} {defense_type} after {max_retries} attempts: {e}")
                return (team_id, season, defense_type, None)
            sleep(0.6 * (attempt + 1))

# --- MAIN EXECUTION ---
defense_types = ["Advanced", "Base", "Scoring", "Misc", "Opponent"]

# Create job list: (team_id, season, defense_type)
jobs = [(team_id, season, defense_type) 
        for team_id, season in combos 
        for defense_type in defense_types]

print(f"Created {len(jobs)} jobs ({len(combos)} team-seasons × {len(defense_types)} defense types)")

# Store results by defense type
results_by_type = {dt: [] for dt in defense_types}
total_jobs = len(jobs)
completed = 0

# Run in parallel
num_workers = 10
with ThreadPoolExecutor(max_workers=num_workers) as executor:
    futures = [executor.submit(fetch_lineup_data, job) for job in jobs]
    
    for future in futures:
        completed += 1
        if completed % 50 == 0 or completed == total_jobs:
            print(f"Processed {completed}/{total_jobs} jobs...")
        
        team_id, season, defense_type, lineups_df = future.result()
        
        if lineups_df is not None:
            results_by_type[defense_type].append(lineups_df)
        else:
            print(f"  ⚠ No data: team {team_id}, season {season}, {defense_type}")

# --- PROCESS RESULTS FOR EACH DEFENSE TYPE ---
output_files = {
    "Advanced": "Data/allstar_lineups_data_advanced.csv",
    "Base": "Data/allstar_lineups_data_base.csv",
    "Scoring": "Data/allstar_lineups_data_scoring.csv",
    "Misc": "Data/allstar_lineups_data_misc.csv",
    "Opponent": "Data/allstar_lineups_data_opponent.csv"
}

for defense_type, output_file in output_files.items():
    print(f"\n--- Processing {defense_type} ---")
    
    if results_by_type[defense_type]:
        full_lineups = pd.concat(results_by_type[defense_type], ignore_index=True)
        
        # Filter for allstar lineups
        full_lineups['GROUP_ID'] = full_lineups['GROUP_ID'].str[1:-1]
        mask = full_lineups['GROUP_ID'].apply(
            lambda s: any(pid in target_ids_str for pid in s.split('-'))
        )
        filtered = full_lineups[mask].copy()
        
        if not filtered.empty:
            filtered.to_csv(output_file, index=False)
            print(f"✓ Saved {len(filtered)} lineups to {output_file}")
        else:
            print(f"✗ No allstar lineups found for {defense_type}")
    else:
        print(f"✗ No data collected for {defense_type}")

print("\n--- All defense types completed! ---")

In [None]:
from nba_api.stats.endpoints import teamgamelogs
team_gamelog = teamgamelogs.TeamGameLogs(team_id_nullable=1610612742, season_nullable="2023-24")
team_gamelog_df = team_gamelog.get_data_frames()[0]
team_gamelog_df

Unnamed: 0,SEASON_YEAR,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,...,AST_RANK,TOV_RANK,STL_RANK,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,AVAILABLE_FLAG
0,2023-24,1610612742,DAL,Dallas Mavericks,0042300405,2024-06-17T00:00:00,DAL @ BOS,L,48.000000,35,...,101,56,97,64,9,71,98,109,99,1.0
1,2023-24,1610612742,DAL,Dallas Mavericks,0042300404,2024-06-14T00:00:00,DAL vs. BOS,W,48.000000,46,...,76,11,41,89,70,34,68,34,2,1.0
2,2023-24,1610612742,DAL,Dallas Mavericks,0042300403,2024-06-12T00:00:00,DAL vs. BOS,L,48.000000,38,...,108,11,80,106,82,34,68,95,79,1.0
3,2023-24,1610612742,DAL,Dallas Mavericks,0042300402,2024-06-09T00:00:00,DAL @ BOS,L,48.000000,38,...,76,77,80,78,70,34,98,97,79,1.0
4,2023-24,1610612742,DAL,Dallas Mavericks,0042300401,2024-06-06T00:00:00,DAL @ BOS,L,48.000000,35,...,113,39,27,106,104,24,90,107,99,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,2023,1610612742,DAL,Dallas Mavericks,1522300074,2023-07-16T00:00:00,DAL vs. ATL,W,40.000000,37,...,105,88,12,78,82,92,80,90,12,
109,2023,1610612742,DAL,Dallas Mavericks,1522300055,2023-07-14T00:00:00,DAL @ IND,W,40.000000,42,...,86,46,27,89,28,2,42,63,12,
110,2023,1610612742,DAL,Dallas Mavericks,1522300039,2023-07-12T00:00:00,DAL vs. GSW,W,43.616667,29,...,108,11,17,10,9,111,3,97,62,
111,2023,1610612742,DAL,Dallas Mavericks,1522300027,2023-07-10T00:00:00,DAL vs. PHI,W,40.000000,37,...,94,56,41,89,104,92,10,65,44,


In [39]:
shotchartlineupdetailed = shotchartlineupdetail.ShotChartLineupDetail(
    context_measure_detailed = 'FGM',
    team_id_nullable=1610612743,
    group_id='-201145-203115-203932-203999-1628420-',
    season="2021-22")
shotchart_df = shotchartlineupdetailed.get_data_frames()[1]
shotchart_df

Unnamed: 0,GRID_TYPE,SHOT_ZONE_BASIC,SHOT_ZONE_AREA,SHOT_ZONE_RANGE,FGA,FGM,FG_PCT
0,League Averages,Above the Break 3,Back Court(BC),Back Court Shot,35,4,0.114
1,League Averages,Above the Break 3,Center(C),24+ ft.,16368,5613,0.343
2,League Averages,Above the Break 3,Left Side Center(LC),24+ ft.,24924,8511,0.341
3,League Averages,Above the Break 3,Right Side Center(RC),24+ ft.,23120,8171,0.353
4,League Averages,Backcourt,Back Court(BC),Back Court Shot,440,11,0.025
5,League Averages,In The Paint (Non-RA),Center(C),8-16 ft.,11777,5228,0.444
6,League Averages,In The Paint (Non-RA),Center(C),Less Than 8 ft.,23351,9813,0.42
7,League Averages,In The Paint (Non-RA),Left Side(L),8-16 ft.,2197,949,0.432
8,League Averages,In The Paint (Non-RA),Right Side(R),8-16 ft.,2343,1000,0.427
9,League Averages,Left Corner 3,Left Side(L),24+ ft.,11377,4371,0.384


In [33]:
from nba_api.stats.endpoints import shotchartdetail
shotchart = shotchartdetail.ShotChartDetail(
    team_id=1610612743,
    player_id=203999,
    season_type_all_star="Regular Season",
    season_nullable="2023-24",
    context_measure_simple="FGM"
)
shotchart_df = shotchart.get_data_frames()[0]
shotchart_df

Unnamed: 0,GRID_TYPE,GAME_ID,GAME_EVENT_ID,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_NAME,PERIOD,MINUTES_REMAINING,SECONDS_REMAINING,...,SHOT_ZONE_AREA,SHOT_ZONE_RANGE,SHOT_DISTANCE,LOC_X,LOC_Y,SHOT_ATTEMPTED_FLAG,SHOT_MADE_FLAG,GAME_DATE,HTM,VTM
0,Shot Chart Detail,0022300006,14,203999,Nikola Jokić,1610612743,Denver Nuggets,1,10,55,...,Center(C),Less Than 8 ft.,3,31,7,1,1,20231103,DEN,DAL
1,Shot Chart Detail,0022300006,29,203999,Nikola Jokić,1610612743,Denver Nuggets,1,9,7,...,Center(C),Less Than 8 ft.,0,0,0,1,1,20231103,DEN,DAL
2,Shot Chart Detail,0022300006,66,203999,Nikola Jokić,1610612743,Denver Nuggets,1,6,17,...,Right Side Center(RC),24+ ft.,25,104,236,1,1,20231103,DEN,DAL
3,Shot Chart Detail,0022300006,248,203999,Nikola Jokić,1610612743,Denver Nuggets,2,6,50,...,Center(C),Less Than 8 ft.,2,22,12,1,1,20231103,DEN,DAL
4,Shot Chart Detail,0022300006,270,203999,Nikola Jokić,1610612743,Denver Nuggets,2,5,15,...,Center(C),Less Than 8 ft.,5,0,54,1,0,20231103,DEN,DAL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1406,Shot Chart Detail,0022301224,551,203999,Nikola Jokić,1610612743,Denver Nuggets,4,5,33,...,Center(C),Less Than 8 ft.,0,7,7,1,1,20231208,DEN,HOU
1407,Shot Chart Detail,0022301224,557,203999,Nikola Jokić,1610612743,Denver Nuggets,4,4,57,...,Center(C),Less Than 8 ft.,0,0,0,1,0,20231208,DEN,HOU
1408,Shot Chart Detail,0022301224,591,203999,Nikola Jokić,1610612743,Denver Nuggets,4,3,0,...,Center(C),Less Than 8 ft.,7,-48,60,1,1,20231208,DEN,HOU
1409,Shot Chart Detail,0022301224,600,203999,Nikola Jokić,1610612743,Denver Nuggets,4,2,10,...,Center(C),8-16 ft.,9,-5,90,1,0,20231208,DEN,HOU


In [37]:
from nba_api.stats.endpoints import teamdashlineups

team_lineups = teamdashlineups.TeamDashLineups(
    team_id=1610612743,        # Denver Nuggets
    season="2021-22",
    season_type_all_star="Regular Season",
    group_quantity=5           # 5-man lineups
).get_data_frames()[1]

team_lineups.head()

Unnamed: 0,GROUP_SET,GROUP_ID,GROUP_NAME,GP,W,L,W_PCT,MIN,FGM,FGA,...,AST_RANK,TOV_RANK,STL_RANK,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,SUM_TIME_PLAYED
0,Lineups,-201145-203115-203932-203999-1628420-,J. Green - W. Barton - A. Gordon - N. Jokic - ...,42,27,15,0.643,761.211667,744,1398,...,1,770,1,1,770,770,1,1,1,2283635
1,Lineups,-203115-203932-203999-1628420-1629008-,W. Barton - A. Gordon - N. Jokic - M. Morris -...,9,5,4,0.556,169.916667,156,305,...,2,769,2,3,766,768,3,3,4,509750
2,Lineups,-201145-203085-203932-203999-1628420-,J. Green - A. Rivers - A. Gordon - N. Jokic - ...,17,12,5,0.706,153.395,139,261,...,3,768,3,2,763,767,2,2,2,460185
3,Lineups,-203085-203115-203932-203999-1628420-,A. Rivers - W. Barton - A. Gordon - N. Jokic -...,21,14,7,0.667,124.973333,111,225,...,4,765,4,11,767,766,4,4,3,374920
4,Lineups,-202326-203085-203210-1627854-1630538-,D. Cousins - A. Rivers - J. Green - B. Forbes ...,12,8,4,0.667,112.131667,98,204,...,5,767,5,4,768,769,5,5,6,336395


In [6]:
allstar_lineups = pd.read_csv("../data/raw/lineups/allstar_lineups_data.csv")

In [7]:
allstar_lineups_with_team = pd.read_csv("../data/raw/lineups/allstar_lineups_combined_team.csv")

In [None]:
allstar_lineups_with_team['GROUP_ID']

In [None]:
allstar_ids = [203999, 201939, 201935, 202695, 2544, 1629029, 203507, 1630162, 1626164, 201142, 1628983]
season_team_df = pd.read_csv("../data/raw/lineups/allstars_season_team.csv")
target_ids_str = [str(int(x)) for x in allstar_ids]
combos = list(season_team_df[['TEAM_ID', 'GROUP_VALUE']].drop_duplicates().itertuples(index=False, name=None))

# --- HELPER FUNCTION ---
def fetch_lineup_data(combo_info):
    """
    Fetches lineup data for a single team-season combo.
    Returns a tuple: (team_id, season, defense_type, dataframe)
    """
    team_id, season, defense_type = combo_info
    max_retries = 3
    
    for attempt in range(max_retries):
        try:
            time.sleep(0.6)  # Polite delay
            obj = teamdashlineups.TeamDashLineups(
                team_id=int(team_id),
                season=str(season),
                measure_type_detailed_defense=defense_type,
                per_mode_detailed="Per48",
                group_quantity=5,
            )
            lineups_df = obj.get_data_frames()[1]
            
            if lineups_df is not None and not lineups_df.empty:
                lineups_df['TEAM_ID'] = team_id  # Add TEAM_ID column
                lineups_df['SEASON'] = season  # Add SEASON column
                return (team_id, season, defense_type, lineups_df)
            else:
                return (team_id, season, defense_type, None)
                
        except Exception as e:
            if attempt == max_retries - 1:
                print(f"  ✗ Failed {team_id} {season} {defense_type} after {max_retries} attempts: {e}")
                return (team_id, season, defense_type, None)
            sleep(0.6 * (attempt + 1))

# --- MAIN EXECUTION ---
defense_types = ["Advanced", "Base", "Scoring", "Misc", "Opponent"]

# Create job list: (team_id, season, defense_type)
jobs = [(team_id, season, defense_type) 
        for team_id, season in combos 
        for defense_type in defense_types]

print(f"Created {len(jobs)} jobs ({len(combos)} team-seasons × {len(defense_types)} defense types)")

# Store results by defense type
results_by_type = {dt: [] for dt in defense_types}
total_jobs = len(jobs)
completed = 0

# Run in parallel
num_workers = 10
with ThreadPoolExecutor(max_workers=num_workers) as executor:
    futures = [executor.submit(fetch_lineup_data, job) for job in jobs]
    
    for future in futures:
        completed += 1
        if completed % 50 == 0 or completed == total_jobs:
            print(f"Processed {completed}/{total_jobs} jobs...")
        
        team_id, season, defense_type, lineups_df = future.result()
        
        if lineups_df is not None:
            results_by_type[defense_type].append(lineups_df)
        else:
            print(f"  ⚠ No data: team {team_id}, season {season}, {defense_type}")

# --- PROCESS RESULTS FOR EACH DEFENSE TYPE ---
output_files = {
    "Advanced": "../data/raw/lineups/retry/allstar_lineups_data_advanced.csv",
    "Base": "../data/raw/lineups/retry/allstar_lineups_data_base.csv",
    "Scoring": "../data/raw/lineups/retry/allstar_lineups_data_scoring.csv",
    "Misc": "../data/raw/lineups/retry/allstar_lineups_data_misc.csv",
    "Opponent": "../data/raw/lineups/retry/allstar_lineups_data_opponent.csv"
}

for defense_type, output_file in output_files.items():
    print(f"\n--- Processing {defense_type} ---")
    
    if results_by_type[defense_type]:
        full_lineups = pd.concat(results_by_type[defense_type], ignore_index=True)
        
        # Filter for allstar lineups
        full_lineups['GROUP_ID'] = full_lineups['GROUP_ID'].str[1:-1]
        mask = full_lineups['GROUP_ID'].apply(
            lambda s: any(pid in target_ids_str for pid in s.split('-'))
        )
        filtered = full_lineups[mask].copy()
        
        if not filtered.empty:
            filtered.to_csv(output_file, index=False)
            print(f"✓ Saved {len(filtered)} lineups to {output_file}")
        else:
            print(f"✗ No allstar lineups found for {defense_type}")
    else:
        print(f"✗ No data collected for {defense_type}")

print("\n--- All defense types completed! ---")

Created 450 jobs (90 team-seasons × 5 defense types)
  ✗ Failed 1610612743 2018-19 Scoring after 3 attempts: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
  ✗ Failed 1610612743 2018-19 Base after 3 attempts: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
  ✗ Failed 1610612743 2018-19 Misc after 3 attempts: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
  ✗ Failed 1610612743 2017-18 Base after 3 attempts: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
  ✗ Failed 1610612743 2019-20 Scoring after 3 attempts: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
  ⚠ No data: team 1610612743, season 2019-20, Scoring
  ✗ Failed 1610612743 2018-19 Advanced after 3 attempts: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
  ⚠ No data: team 1610612