In [1]:
import datetime
import numpy as np
import pandas as pd
import random
import time
import json
import asyncio
import aiohttp
import winsound
import gc

In [2]:
# Notebook-wide pandas display settings:
#   * floats render with thousands separators and two decimals
#   * no cap on the number of columns shown when a frame is displayed
pd.set_option('display.float_format', '{:,.2f}'.format)
pd.options.display.max_columns = None

In [3]:
"""Used to make a beef noise to indicate the script is complete. This will only work in Windows"""
def beep_sound():
    """Play an audible beep to signal that the script has finished.

    Uses ``winsound.Beep``, so this only works on Windows.
    """
    duration = 2000  # milliseconds
    freq = 1500  # Hz
    winsound.Beep(freq, duration)

In [4]:
async def get_json(client,url,headers,params):
    """Send a GET request and return the decoded JSON body.

    Parameters
    ----------
    client : session-like object whose ``get(url, params=..., headers=...)``
        returns an async context manager yielding the response.
    url : address to request.
    headers : HTTP headers passed through to ``client.get``.
    params : query parameters passed through to ``client.get``.

    Returns
    -------
    The value of ``await response.json()`` when the status is 200,
    otherwise ``None`` (callers treat ``None`` as a failed request).
    """
    async with client.get(url, params=params, headers=headers) as response:
        # Explicit status check rather than `assert`: assert statements are
        # removed when Python runs with -O, which would silently disable it.
        if response.status != 200:
            return None
        return await response.json()

In [5]:
 async def response_basic_summ(wait_base,client,headers,game,params):
    """Fetch the box score summary JSON for one game after a random delay.

    Parameters
    ----------
    wait_base : upper bound, in seconds, of the random delay slept before
        the request is sent (spreads the requests out over time).
    client : HTTP session passed through to ``get_json``.
    headers : HTTP headers passed through to ``get_json``.
    game : GameID to request.
    params : base query parameters; not modified by this function.

    Returns
    -------
    Whatever ``get_json`` returns for the boxscoresummaryv2 endpoint.
    """
    wait_t = random.uniform(0, wait_base)
    await asyncio.sleep(wait_t)
    # Build a per-request copy instead of writing into `params`: the caller
    # hands the same dict to every concurrent task, so mutating it in place
    # would make each request depend on no other task touching it first.
    request_params = {**params, 'GameID': game}
    url_summ = 'https://stats.nba.com/stats/boxscoresummaryv2'
    return await get_json(client, url_summ, headers, request_params)

async def main(game_list_key,year_param,season_param,wait_base):
    """Main coroutine: request the box score summary for every game.

    Parameters
    ----------
    game_list_key : iterable of GameIDs to request.
    year_param : currently unused; kept so existing callers keep working.
    season_param : currently unused; kept so existing callers keep working.
    wait_base : upper bound, in seconds, of the random delay each request
        sleeps before being sent.

    Returns
    -------
    list with one entry per GameID, in the order of ``game_list_key``.
    Because ``gather`` is called with ``return_exceptions=True``, an entry
    may be an exception object instead of a result.
    """
    start_time = time.time()

    # Headers sent with every box score summary request.
    headers = {
        'Host': 'stats.nba.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': 'https://stats.nba.com/',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
    }

    conn = aiohttp.TCPConnector(limit=50)

    # One client session that persists across all requests.
    async with aiohttp.ClientSession(connector=conn) as client:
        # Start one task per game. Each task gets its own parameter dict so
        # concurrent requests never share mutable state.
        summ_box_data = [
            asyncio.create_task(
                response_basic_summ(wait_base, client, headers, game, {'GameID': game})
            )
            for game in game_list_key
        ]

        # gather waits for every task to finish before the results are returned.
        results = await asyncio.gather(*summ_box_data, return_exceptions=True)
        print("it took --- %s seconds ---" % (time.time() - start_time),'to go through:'+str(len(game_list_key))+' games')
        return results
   

In [6]:
def game_list_gen(year_to_pull, num_games=1230):
    """Return the list of GameIDs for the selected season.

    Each ID is the prefix ``'002'``, followed by characters 2-3 of
    ``str(year_to_pull)`` (the last two digits of a four-digit year),
    followed by the game number zero-padded to five digits.

    Parameters
    ----------
    year_to_pull : year the season starts in (e.g. ``2014``).
    num_games : how many game numbers to generate, starting at 1.
        Defaults to 1230, the count that was previously hard-coded; pass a
        different value for a season with a different number of games.

    Returns
    -------
    list of str, e.g. ``['0021400001', ..., '0021401230']``.
    """
    season_code = str(year_to_pull)[2:4]
    return ['002' + season_code + str(i).zfill(5) for i in range(1, num_games + 1)]

In [7]:
def flat_list(list_to_flatten):
    """Flatten one level of nesting.

    The JSON results hold nested lists; the dataframes need a single flat
    list of rows, so every item of every sub-list is collected in order.
    """
    flattened = []
    for sublist in list_to_flatten:
        flattened.extend(sublist)
    return flattened

In [8]:
# Tables returned by the summary query, listed in the order in which they are
# read from each result's 'resultSets' list.
datatables=['GameSummary','OtherStats', 'Officials', 'InactivePlayers', 'GameInfo', 'LineScore', 'LastMeeting', 'SeasonSeries'] #a list of the tables returned from the summary query

def build_datatable(results):
    """Turn the per-game JSON results into one DataFrame per table.

    Every usable result contributes its rows to each table, with the GameID
    prepended as the first column (``'GID'``). Results that cannot be read
    (``None`` from a failed request, an exception object returned by
    ``gather``, or a payload missing the expected keys/tables) are skipped
    and counted. If the same GameID occurs more than once, the last
    occurrence wins.

    Parameters
    ----------
    results : sequence of decoded JSON responses. A usable entry has
        ``['parameters']['GameID']`` and a ``['resultSets']`` list whose
        entries carry ``'headers'`` and ``'rowSet'``.

    Returns
    -------
    tuple of ``len(datatables) + 2`` items: one DataFrame per table in
    ``datatables`` order, then a tuple holding those same DataFrames, then
    the number of results that were skipped.

    Raises
    ------
    ValueError
        If no result is usable, since the column headers are then unknown.
    """
    table_count = len(datatables)

    columns = None   # per-table column labels, taken from the first usable result
    games = {}       # GameID -> list of per-table row lists
    error_cnt = 0    # number of results that could not be read

    # Failed requests show up as None or as exception objects, so every
    # result is probed individually instead of assuming results[0] is good.
    for result in results:
        try:
            game_id = result['parameters']['GameID']
            result_sets = result['resultSets']
            rows_per_table = [
                [[game_id] + row for row in result_sets[idx]['rowSet'] if row is not None]
                for idx in range(table_count)
            ]
            if columns is None:
                candidate_columns = [
                    ['GID'] + result_sets[idx]['headers'] for idx in range(table_count)
                ]
            else:
                candidate_columns = columns
        except (RuntimeError, TypeError, NameError, KeyError, IndexError):
            error_cnt += 1
            continue
        columns = candidate_columns
        games[game_id] = rows_per_table

    if columns is None:
        raise ValueError(
            f'no usable results: all {error_cnt} result(s) were missing or malformed'
        )

    # Flatten the per-game rows into a single row list per table and label the columns.
    frames = tuple(
        pd.DataFrame(
            [row for rows_per_table in games.values() for row in rows_per_table[idx]],
            columns=columns[idx],
        )
        for idx in range(table_count)
    )

    # Individual frames first, then the same frames as a tuple, then the error count.
    return (*frames, frames, error_cnt)

    

In [11]:
'''This code runs the main loop
First it takes user input to determine how many seasons of data to scrape.
Second it converts those into a list used for the loop.
Finally it invokes the main function to run the aynchio loop'''

#first_year=int(input('What is the first year of the season to pull:'))
#last_year=int(input('What is the last year of the season to pull:'))
first_year=int(input('Pick the first season to scrape:'))
last_year=int(input('Pick the last season to scrape:'))
wait_base=int(input('~how many seconds should the requests take in total?'))
years_to_pull=[x for x in range(first_year,last_year+1)] #create a list with the years to loop through

#print the seasons that will be pulled
print(f'The following years were selected: {years_to_pull}')

#loop through each of the seasons
for year_to_pull in years_to_pull:
    year_param=((str(year_to_pull)+'-'+str((int(year_to_pull)+1)))) 
    season_param=((str(year_to_pull)+'-'+str((int(year_to_pull)+1))[2:4])) #season parameter for the query
    
    game_list_key=game_list_gen(year_to_pull) #call a function to generate the GameIDs for selected season
    start_time = time.time()
    results= await main(game_list_key[:],year_to_pull,season_param,wait_base) #this would be asyncio.run(main(client)) in plain Python(https://bit.ly/36MtBDI)
    
    #call the function to build and return each of the datatables
    df_GameSummary, df_OtherStats, df_Officials, df_InactivePlayers, df_GameInfo, df_LineScore, df_LastMeeting, df_SeasonSeries,df_list,error_cnt=build_datatable(results)
    
    #add the season parameter to the 8 datatables before they are saved for sorting later
    df_GameSummary['SEASON']=year_param
    df_GameSummary.to_csv(f'df_GameSummary{year_param}.csv')
    
    df_OtherStats['SEASON']=year_param
    df_OtherStats.to_csv(f'df_OtherStats{year_param}.csv')
    
    df_Officials['SEASON']=year_param
    df_Officials.to_csv(f'df_Officials{year_param}.csv')
        
    df_InactivePlayers['SEASON']=year_param
    df_InactivePlayers.to_csv(f'df_InactivePlayers{year_param}.csv')
        
    df_GameInfo['SEASON']=year_param
    df_GameInfo.to_csv(f'df_GameInfo{year_param}.csv')
    
    df_LineScore['SEASON']=year_param
    df_LineScore.to_csv(f'df_LineScore{year_param}.csv')
    
    df_LastMeeting['SEASON']=year_param
    df_LastMeeting.to_csv(f'df_LastMeeting{year_param}.csv')
    
    df_SeasonSeries['SEASON']=year_param
    df_SeasonSeries.to_csv(f'df_SeasonSeries{year_param}.csv')
    
    beep_sound() #make the computer beep when the script is finished running
    print("it took --- %s seconds ---" % (time.time() - start_time)+'to go through everything for the ' + year_param + ' season' + ' and there were ' + 'error_cnt' + ': rows with errors')
    time.sleep(random.uniform(0,13)) #pause season pulls to reduce the chances of being throttled

Pick the first season to scrape: 1980
Pick the last season to scrape: 2019
~how many seconds should the requests take in total? 1200


The following years were selected: [1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]
it took --- 1201.255652666092 seconds --- to go through:1230 games


PermissionError: [Errno 13] Permission denied: 'df_LineScore1980-1981.csv'