In [1]:
# libraries
import os
import pickle

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
# common variables
from nba_api.stats.static import teams

# current abbreviation of every team -> team ID
teamIds = {team['abbreviation']: team['id'] for team in teams.get_teams()}

# team ID -> every abbreviation found for that team in the game logs.
# The literal below was generated once with the snippet that follows
# (kept commented out because it takes a few minutes to run):
#
# from nba_api.stats.endpoints import leaguegamefinder
# import time
#
# teamAbbrevsDict = dict()
#
# for teamAbbrev in teamIds.keys():
#     allGamesDf = leaguegamefinder.LeagueGameFinder(team_id_nullable=teamIds[teamAbbrev]) \
#                                  .get_data_frames()[0]
#
#     teamAbbrevsDict[int(allGamesDf['TEAM_ID'].unique())] = list(allGamesDf['TEAM_ABBREVIATION'].unique())
#
#     time.sleep(1)
teamAbbrevsDict = {
    1610612737: ['ATL'],
    1610612738: ['BOS'],
    1610612739: ['CLE'],
    1610612740: ['NOP', 'NOH', 'NOK'],
    1610612741: ['CHI'],
    1610612742: ['DAL'],
    1610612743: ['DEN'],
    1610612744: ['GSW', 'GOS'],
    1610612745: ['HOU'],
    1610612746: ['LAC', 'SDC'],
    1610612747: ['LAL'],
    1610612748: ['MIA'],
    1610612749: ['MIL'],
    1610612750: ['MIN'],
    1610612751: ['BKN', 'NJN'],
    1610612752: ['NYK'],
    1610612753: ['ORL'],
    1610612754: ['IND'],
    1610612755: ['PHI', 'PHL'],
    1610612756: ['PHX'],
    1610612757: ['POR'],
    1610612758: ['SAC', 'KCK'],
    1610612759: ['SAS', 'SAN'],
    1610612760: ['OKC', 'SEA'],
    1610612761: ['TOR'],
    1610612762: ['UTA', 'UTH'],
    1610612763: ['MEM', 'VAN'],
    1610612764: ['WAS'],
    1610612765: ['DET'],
    1610612766: ['CHA', 'CHH'],
}

# box score columns that get averaged per game
standardStats = [
    "FGM", "FGA", "FG_PCT",
    "FG3M", "FG3A", "FG3_PCT",
    "FTM", "FTA", "FT_PCT",
    "PTS", "OREB", "DREB", "REB", "AST", "TOV", "STL", "BLK", "PF", "PLUS_MINUS",
]

# columns requested from the 'Advanced' team stats table
advancedStats = [
    "OFF_RATING", "DEF_RATING", "NET_RATING",
    "AST_PCT", "AST_TO", "AST_RATIO", "TM_TOV_PCT",
    "OREB_PCT", "DREB_PCT", "REB_PCT",
    "EFG_PCT", "TS_PCT", "PACE", "POSS",
]

teamIds

{'ATL': 1610612737,
 'BOS': 1610612738,
 'CLE': 1610612739,
 'NOP': 1610612740,
 'CHI': 1610612741,
 'DAL': 1610612742,
 'DEN': 1610612743,
 'GSW': 1610612744,
 'HOU': 1610612745,
 'LAC': 1610612746,
 'LAL': 1610612747,
 'MIA': 1610612748,
 'MIL': 1610612749,
 'MIN': 1610612750,
 'BKN': 1610612751,
 'NYK': 1610612752,
 'ORL': 1610612753,
 'IND': 1610612754,
 'PHI': 1610612755,
 'PHX': 1610612756,
 'POR': 1610612757,
 'SAC': 1610612758,
 'SAS': 1610612759,
 'OKC': 1610612760,
 'TOR': 1610612761,
 'UTA': 1610612762,
 'MEM': 1610612763,
 'WAS': 1610612764,
 'DET': 1610612765,
 'CHA': 1610612766}

## Get Traditional and Advanced Stats (per game)

In [3]:
# functions
def convert_season_str(withoutDashStr):
    '''
    Build the dashed season label for a starting year (eg. '2021' becomes '2021-22').

    Parameters:
    withoutDashStr (string) = year the season starts in (eg. '2021')

    Returns:
    string = the starting year, a dash, and the last two digits of the following year
    '''

    followingYear = int(withoutDashStr[-2:]) + 1

    # zfill pads single digits ('2000' -> '01');
    # the slice drops the hundreds digit when the count rolls over to 100 ('1999' -> '00')
    endingDigits = str(followingYear).zfill(2)[-2:]

    return '-'.join([withoutDashStr, endingDigits])

# standings already downloaded, keyed by season string (eg. '2021-22')
_seasonStandingsCache = {}

def filter_regSeason_games(seasonStr, lowestWinPct, allSeasonGames):
    '''
    Keep only the regular season games played against opponents with a good enough record.

    Parameters:
    seasonStr (string) = season to get games from (eg. '2021-22')
    lowestWinPct (float) = matchups against teams with records lower than this win percentage are excluded (eg. 0.500)
    allSeasonGames (dataframe) = dataframe of stats for all season games; the last three characters
                                 of its 'MATCHUP' column are read as the opponent abbreviation

    Returns:
    dataframe = rows of allSeasonGames whose opponent had a win percentage of at least lowestWinPct
    '''

    # the standings of a season do not depend on which team is being processed,
    # so download them once per season instead of once per call
    if seasonStr not in _seasonStandingsCache:
        _seasonStandingsCache[seasonStr] = leaguestandings.LeagueStandings(season=seasonStr).get_data_frames()[0]
    seasonStandings = _seasonStandingsCache[seasonStr]

    # IDs of the teams whose win percentage is at least lowestWinPct
    teamsToUseIds = seasonStandings.loc[seasonStandings['WinPCT'] >= lowestWinPct, 'TeamID'].to_list()

    # every abbreviation those teams have been listed under
    teamsToUseAbbrevs = set(itertools.chain.from_iterable(teamAbbrevsDict[teamId] for teamId in teamsToUseIds))

    opponentAbbrevs = allSeasonGames['MATCHUP'].str[-3:]
    filteredGames = allSeasonGames[opponentAbbrevs.isin(teamsToUseAbbrevs)]

    return filteredGames

In [4]:
# get traditional and advanced stats (takes >10min to run)
#
# For every team, season type and year this cell averages the standard box score stats over the
# selected games, attaches the advanced stats where they are requested (1997 onwards), and
# stores one row per team in seasonStats[year] / playoffStats[year].

from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.endpoints import leaguedashteamstats
from nba_api.stats.endpoints import leaguestandings
from nba_api.stats.endpoints import leaguedashteamshotlocations
import itertools
import time

allTeamAbbrevs = list(teamIds.keys()) # abbreviations for all teams to get stats for
allYearsStrs = [str(seasonInt) for seasonInt in np.arange(1983, 2022)] # all years to get stats for
lowestWinPct = 0.500 # regular season games are only kept when the opponent's win percentage is at least this value

# code prepended to the year in SEASON_ID to tell regular season and playoff games apart
seasonIdCodes = {'Regular Season': '2', 'Playoffs': '4'}

colNames = ['TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GP'] + standardStats
# column layout of the stored tables: identifying columns and standard stats, then advanced stats alphabetically
allColNames = colNames + sorted(advancedStats)

# rows are collected per year and turned into dataframes once at the end
# (a separate list per year; DataFrame.append no longer exists in pandas >= 2.0)
seasonRows = {yearStr: [] for yearStr in allYearsStrs}
playoffRows = {yearStr: [] for yearStr in allYearsStrs}

for teamAbbrev in allTeamAbbrevs:

    for seasonType in ['Regular Season', 'Playoffs']:

        seasonIdSuffix = seasonIdCodes[seasonType]

        # get data for all games the franchise has played (since 1983)
        allGamesDf = leaguegamefinder.LeagueGameFinder(team_id_nullable=teamIds[teamAbbrev], season_type_nullable=seasonType) \
                               .get_data_frames()[0]

        for yearStr in allYearsStrs:

            seasonStr = convert_season_str(yearStr) # need to use different format for some functions

            inSeason = allGamesDf['SEASON_ID'] == seasonIdSuffix + yearStr
            if seasonType == 'Regular Season':
                # remove summer league games (not sure why they're included to begin with)
                inSeason = inSeason & (allGamesDf['GAME_ID'].str[:2] == '00')
            currSeasonGames = allGamesDf[inSeason]

            if currSeasonGames.empty: # skip teams that didn't make the playoffs or didn't exist that season
                continue

            # for regular season games:
            # get league standings for that season and filter out games (eg. played against non-playoff teams)
            if seasonType == 'Regular Season':
                filteredGames = filter_regSeason_games(seasonStr, lowestWinPct, currSeasonGames)
            else:
                filteredGames = currSeasonGames # no filter for playoff games

            time.sleep(1) # pause to avoid timeouts

            # do mean across all games played
            perGameStats = filteredGames[standardStats].apply(np.mean)
            # team identifiers are taken from the first row of the team's game log
            perGameStats = pd.concat([allGamesDf.iloc[0][['TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME']], perGameStats])
            perGameStats['GP'] = filteredGames.shape[0] # store number of games the averages are based on

            # get advanced stats if available (starting 1997 season)
            if int(yearStr) >= 1997:
                advancedResponse = leaguedashteamstats.LeagueDashTeamStats(season=seasonStr, team_id_nullable=teamIds[teamAbbrev]
                                                                           , measure_type_detailed_defense='Advanced', season_type_all_star=seasonType
                                                                          )
                advancedDf = advancedResponse.get_data_frames()[0][['TEAM_NAME', 'TEAM_ID'] + advancedStats]

                # concat standard and advanced stats into one table
                teamAdvanced = advancedDf[advancedDf['TEAM_ID'] == perGameStats['TEAM_ID']][advancedStats].squeeze()
                perGameStats = pd.concat([perGameStats, teamAdvanced])
                perGameStats['POSS'] = perGameStats['POSS']/currSeasonGames.shape[0] # get per game possessions

            else:
                # for seasons without advanced stats, concat nan placeholders
                # (the previous version also concatenated an undefined `shotDistDfNans` here, which raised a NameError)
                advancedDfNans = pd.Series(np.nan, index=advancedStats)
                perGameStats = pd.concat([perGameStats, advancedDfNans])

            # change decimal place for percent stats
            pctCols = [stat for stat in perGameStats.index if 'PCT' in stat]
            perGameStats[pctCols] = perGameStats[pctCols]*100

            # store regular season and playoffs separately
            if seasonType == 'Regular Season':
                seasonRows[yearStr].append(perGameStats)
            else:
                playoffRows[yearStr].append(perGameStats)


def _rows_to_frame(rows):
    '''Turn the per-team rows collected for one year into a dataframe with the stored column layout.'''
    if not rows:
        return pd.DataFrame(columns=colNames)
    return pd.DataFrame(rows).reindex(columns=allColNames).infer_objects()


# dicts of dataframes (one per year, one row per team), keyed by year string
seasonStats = {yearStr: _rows_to_frame(rows) for yearStr, rows in seasonRows.items()}
playoffStats = {yearStr: _rows_to_frame(rows) for yearStr, rows in playoffRows.items()}


In [5]:
# save dicts to pickle files
saveDir = '' # input path where you want data to be saved to

# os.path.join inserts the path separator when saveDir does not end with one
# (plain string concatenation would glue the folder name onto the file name)

# save regular season data
saveFName = 'allTeams_perGameStats_regSeason_filtered_above500_1983-2022'
with open(os.path.join(saveDir, saveFName), 'wb') as file:
    pickle.dump(seasonStats, file)

# save playoffs data
saveFName = 'allTeams_perGameStats_playoffs_filtered_above500_1983-2022'
with open(os.path.join(saveDir, saveFName), 'wb') as file:
    pickle.dump(playoffStats, file)

In [337]:
# load data (saved above: per game stats for all stats of all seasons)
# NOTE: pickle.load can run arbitrary code while reading a file, so only load files you created yourself
loadDir = '' # input path where data will be loaded from

# os.path.join inserts the path separator when loadDir does not end with one
# (plain string concatenation would glue the folder name onto the file name)

# regular season
loadFName = 'allTeams_perGameStats_regSeason_filtered_above500_1983-2022'
with open(os.path.join(loadDir, loadFName), 'rb') as file:
    seasonStats = pickle.load(file)

# playoffs
loadFName = 'allTeams_perGameStats_playoffs_filtered_above500_1983-2022'
with open(os.path.join(loadDir, loadFName), 'rb') as file:
    playoffStats = pickle.load(file)

In [452]:
# take a look at data: regular season table stored under the year key '2001'
# (keys are the year strings used when the tables were built; one row per team)
seasonStats['2001']

Unnamed: 0,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GP,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,...,DREB_PCT,EFG_PCT,NET_RATING,OFF_RATING,OREB_PCT,PACE,POSS,REB_PCT,TM_TOV_PCT,TS_PCT
0,1610612737,ATL,Atlanta Hawks,51,35.196078,81.078431,0.434941,5.039216,14.862745,0.341588,...,0.689,0.471,-4.4,100.4,0.297,93.21,150.54902,0.491,0.166,0.517
1,1610612738,BOS,Boston Celtics,46,35.0,81.913043,0.428217,8.586957,24.0,0.356326,...,0.695,0.476,2.2,102.1,0.272,93.6,168.217391,0.481,0.144,0.52
2,1610612739,CLE,Cleveland Cavaliers,51,35.137255,79.764706,0.440667,4.72549,12.764706,0.364569,...,0.69,0.477,-3.6,103.4,0.319,91.31,148.098039,0.505,0.158,0.524
3,1610612741,CHI,Chicago Bulls,50,33.78,77.98,0.43384,3.54,10.32,0.32828,...,0.685,0.456,-9.1,97.2,0.298,91.56,150.98,0.487,0.166,0.499
4,1610612742,DAL,Dallas Mavericks,46,38.434783,83.869565,0.459304,7.826087,21.304348,0.371239,...,0.683,0.507,4.3,110.3,0.285,94.36,170.065217,0.487,0.127,0.553
5,1610612743,DEN,Denver Nuggets,50,35.04,84.0,0.41904,5.44,15.98,0.34256,...,0.668,0.455,-6.3,100.3,0.341,91.35,150.74,0.497,0.16,0.495
6,1610612744,GSW,Golden State Warriors,50,36.98,85.66,0.43312,4.16,12.44,0.33312,...,0.669,0.452,-5.6,101.5,0.373,95.75,157.8,0.516,0.175,0.499
7,1610612745,HOU,Houston Rockets,50,33.86,79.72,0.42618,6.66,18.96,0.34598,...,0.675,0.465,-5.4,101.6,0.327,89.56,149.12,0.497,0.155,0.507
8,1610612746,LAC,LA Clippers,50,35.48,81.28,0.43874,5.14,14.32,0.3494,...,0.673,0.476,-0.4,104.3,0.349,91.21,150.48,0.512,0.161,0.519
9,1610612747,LAL,Los Angeles Lakers,46,37.478261,83.23913,0.451717,5.891304,17.521739,0.333674,...,0.686,0.498,7.6,108.0,0.311,93.32,167.195652,0.501,0.135,0.534
