# Setting Up the Environment for Web Scraping

In [1]:
!pip install bs4 # BeautifulSoup allows us to parse the page
!pip install requests # requests allows us to make http request and extract data from the page

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [1]:
import pandas as pd
import requests
pd.set_option('display.max_columns', None) # so we can see all columns in a wide DataFrame
import time
import numpy as np

# Defining the URL for Web Scraping

In [2]:
test_url = "https://stats.nba.com/stats/leagueLeaders?LeagueID=00&PerMode=PerGame&Scope=S&Season=2023-24&SeasonType=Regular%20Season&StatCategory=PTS"

# Fetching and Parsing JSON Data from the NBA Stats Website

In [4]:
r = requests.get(url=test_url).json()

In [9]:
table_headers = r['resultSet']['headers']

In [17]:
pd.DataFrame(r['resultSet']['rowSet'], columns=table_headers)

Unnamed: 0,PLAYER_ID,RANK,PLAYER,TEAM_ID,TEAM,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PTS,EFF
0,1629029,1,Luka Doncic,1610612742,DAL,70,37.5,11.5,23.6,0.487,4.1,10.6,0.382,6.8,8.7,0.786,0.8,8.4,9.2,9.8,1.4,0.5,4.0,33.9,36.9
1,203507,2,Giannis Antetokounmpo,1610612749,MIL,73,35.2,11.5,18.8,0.611,0.5,1.7,0.274,7.0,10.7,0.657,2.7,8.8,11.5,6.5,1.2,1.1,3.4,30.4,36.4
2,1628983,3,Shai Gilgeous-Alexander,1610612760,OKC,75,34.0,10.6,19.8,0.535,1.3,3.6,0.353,7.6,8.7,0.874,0.9,4.7,5.5,6.2,2.0,0.9,2.2,30.1,32.2
3,1628973,4,Jalen Brunson,1610612752,NYK,77,35.4,10.3,21.4,0.479,2.7,6.8,0.401,5.5,6.5,0.847,0.6,3.1,3.6,6.7,0.9,0.2,2.4,28.7,25.6
4,201142,5,Kevin Durant,1610612756,PHX,75,37.2,10.0,19.1,0.523,2.2,5.4,0.413,4.8,5.6,0.856,0.5,6.1,6.6,5.0,0.9,1.2,3.3,27.1,27.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,1630231,236,KJ Martin,1610612755,PHI,60,12.5,1.6,3.0,0.536,0.1,0.5,0.286,0.4,0.7,0.537,0.7,1.5,2.2,0.9,0.4,0.2,0.5,3.7,5.1
236,203939,237,Dwight Powell,1610612742,DAL,63,13.3,1.1,1.7,0.679,0.0,0.0,0.333,1.0,1.4,0.708,1.5,1.9,3.4,1.3,0.4,0.3,0.5,3.3,7.3
237,1630192,238,Zeke Nnaji,1610612743,DEN,58,9.9,1.2,2.6,0.463,0.1,0.4,0.261,0.7,1.1,0.677,1.1,1.1,2.2,0.6,0.3,0.7,0.5,3.2,4.7
238,1630550,239,JT Thor,1610612766,CHA,63,12.4,1.3,2.9,0.437,0.4,1.3,0.346,0.2,0.3,0.550,0.7,1.6,2.3,0.5,0.2,0.4,0.2,3.2,4.5


# Creating and Structuring the Initial DataFrame

In [18]:
temp_df1 = pd.DataFrame(r['resultSet']['rowSet'], columns=table_headers)

In [20]:
temp_df2 = pd.DataFrame({'Year':['2000-01' for i in range(len(temp_df1))],
                        'Season_type':['Regular%20Season' for i in range(len(temp_df1))]})
temp_df3 = pd.concat([temp_df2,temp_df1], axis=1)
temp_df3

Unnamed: 0,Year,Season_type,PLAYER_ID,RANK,PLAYER,TEAM_ID,TEAM,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PTS,EFF
0,2000-01,Regular%20Season,1629029,1,Luka Doncic,1610612742,DAL,70,37.5,11.5,23.6,0.487,4.1,10.6,0.382,6.8,8.7,0.786,0.8,8.4,9.2,9.8,1.4,0.5,4.0,33.9,36.9
1,2000-01,Regular%20Season,203507,2,Giannis Antetokounmpo,1610612749,MIL,73,35.2,11.5,18.8,0.611,0.5,1.7,0.274,7.0,10.7,0.657,2.7,8.8,11.5,6.5,1.2,1.1,3.4,30.4,36.4
2,2000-01,Regular%20Season,1628983,3,Shai Gilgeous-Alexander,1610612760,OKC,75,34.0,10.6,19.8,0.535,1.3,3.6,0.353,7.6,8.7,0.874,0.9,4.7,5.5,6.2,2.0,0.9,2.2,30.1,32.2
3,2000-01,Regular%20Season,1628973,4,Jalen Brunson,1610612752,NYK,77,35.4,10.3,21.4,0.479,2.7,6.8,0.401,5.5,6.5,0.847,0.6,3.1,3.6,6.7,0.9,0.2,2.4,28.7,25.6
4,2000-01,Regular%20Season,201142,5,Kevin Durant,1610612756,PHX,75,37.2,10.0,19.1,0.523,2.2,5.4,0.413,4.8,5.6,0.856,0.5,6.1,6.6,5.0,0.9,1.2,3.3,27.1,27.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,2000-01,Regular%20Season,1630231,236,KJ Martin,1610612755,PHI,60,12.5,1.6,3.0,0.536,0.1,0.5,0.286,0.4,0.7,0.537,0.7,1.5,2.2,0.9,0.4,0.2,0.5,3.7,5.1
236,2000-01,Regular%20Season,203939,237,Dwight Powell,1610612742,DAL,63,13.3,1.1,1.7,0.679,0.0,0.0,0.333,1.0,1.4,0.708,1.5,1.9,3.4,1.3,0.4,0.3,0.5,3.3,7.3
237,2000-01,Regular%20Season,1630192,238,Zeke Nnaji,1610612743,DEN,58,9.9,1.2,2.6,0.463,0.1,0.4,0.261,0.7,1.1,0.677,1.1,1.1,2.2,0.6,0.3,0.7,0.5,3.2,4.7
238,2000-01,Regular%20Season,1630550,239,JT Thor,1610612766,CHA,63,12.4,1.3,2.9,0.437,0.4,1.3,0.346,0.2,0.3,0.550,0.7,1.6,2.3,0.5,0.2,0.4,0.2,3.2,4.5


In [21]:
del temp_df1, temp_df2, temp_df3

In [22]:
df_cols = ['Year', 'Season_type'] + table_headers

In [23]:
pd.DataFrame(columns=df_cols)

Unnamed: 0,Year,Season_type,PLAYER_ID,RANK,PLAYER,TEAM_ID,TEAM,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PTS,EFF


# Setting Up HTTP Request Headers

In [28]:
headers = {
    'accept:*/*',
    'accept-encoding': 'gzip, deflate, br, zstd',
    'accept-language': 'en-US,en;q=0.9',
    'connection': 'keep-alive',
    'host': 'stats.nba.com',
    'origin: https': '//www.nba.com',
    'referer: https': '//www.nba.com/',
    'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
}

SyntaxError: invalid syntax (1989589813.py, line 3)

# Looping Through Seasons and Playoff Data to Scrape NBA Statistics

In [30]:
df = pd.DataFrame(columns=df_cols)
season_types = ['Regular%20Season', 'Playoffs']
years = ['2000-01','2001-02','2002-03','2003-04','2004-05','2005-06','2006-07','2007-08','2008-09','2009-10',
         '2010-11','2011-12','2012-13','2013-14','2014-15','2015-16','2016-17','2017-18','2018-19','2020-21',
         '2021-22','2022-23','2023-24']

begin_loop = time.time()

for y in years:
    for s in season_types:
        api_url = 'https://stats.nba.com/stats/leagueLeaders?LeagueID=00&PerMode=PerGame&Scope=S&Season='+y+'&SeasonType='+s+'&StatCategory=PTS'
        r = requests.get(url=api_url).json()
        temp_df1 = pd.DataFrame(r['resultSet']['rowSet'], columns=table_headers)
        temp_df2 = pd.DataFrame({'Year':[y for i in range(len(temp_df1))],
                                'Season_type':[s for i in range(len(temp_df1))]})
        temp_df3 = pd.concat([temp_df2,temp_df1],axis=1)
        df = pd.concat([df,temp_df3], axis=0)
        print(f'Finished Scraping data for the {y} {s}.')
        #delay loop, so nba doesnt thing we are a bot
        lag = np.random.uniform(low=5,high=40)
        print(f'...waiting {round(lag,1)} seconds')
        time.sleep(lag) 
print(f'Process Completed! Total run time: {round((time.time()-begin_loop)/60,2)}')
df.to_excel('nba_player_data.xlsx',index=False)

Finished Scraping data for the 2000-01 Regular%20Season.
...waiting 21.0 seconds
Finished Scraping data for the 2000-01 Playoffs.
...waiting 28.9 seconds
Finished Scraping data for the 2001-02 Regular%20Season.
...waiting 22.8 seconds
Finished Scraping data for the 2001-02 Playoffs.
...waiting 19.0 seconds
Finished Scraping data for the 2002-03 Regular%20Season.
...waiting 11.1 seconds
Finished Scraping data for the 2002-03 Playoffs.
...waiting 6.3 seconds
Finished Scraping data for the 2003-04 Regular%20Season.
...waiting 29.7 seconds
Finished Scraping data for the 2003-04 Playoffs.
...waiting 8.4 seconds
Finished Scraping data for the 2004-05 Regular%20Season.
...waiting 32.6 seconds
Finished Scraping data for the 2004-05 Playoffs.
...waiting 23.4 seconds
Finished Scraping data for the 2005-06 Regular%20Season.
...waiting 34.9 seconds
Finished Scraping data for the 2005-06 Playoffs.
...waiting 20.3 seconds
Finished Scraping data for the 2006-07 Regular%20Season.
...waiting 8.2 second