In [1]:
#If necessary, install the required packages (uncomment the lines below):
#!pip install BeautifulSoup4
#!pip install pandas
#!pip install tqdm
#!pip install numpy

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import unicodedata
import numpy

from tqdm import tqdm

In [12]:
#Season to scrape: the year that appears in the NBA_<year> schedule URLs below.
year = 2021
#Monthly schedule pages to visit, in the order they are requested.
months = ['october', 
          'november',
          'december',
          'january',
          'february',
          'march',
          'april',
          'may',
          'june',
          'july',
          'august',
          'september'
         ]

base_url = "https://www.basketball-reference.com"

#Collect the absolute URL of every "Box Score" link found on each monthly schedule page.
links = []
for month in months:
    url = base_url + '/leagues/NBA_' + str(year) + '_games-' + month + '.html'
    page = requests.get(url)

    #A 404 is treated as "no schedule page for this month" and skipped, which
    #yields no links for that month (same outcome as parsing the error page).
    if page.status_code == 404:
        continue
    #Any other HTTP error (e.g. rate limiting) stops the run instead of
    #silently producing an incomplete list of games.
    page.raise_for_status()

    soup = BeautifulSoup(page.text, 'html.parser')
    #`string=` is the current name of the filter formerly spelled `text=`.
    links.extend(base_url + game['href'] for game in soup.find_all('a', string='Box Score'))

In [13]:
def _player_rows(table, cell_tag):
    """Return the cell texts of the per-player rows of one box-score table.

    table    -- BeautifulSoup <table> element (basic or advanced box score).
    cell_tag -- 'th' to read the row-header cells (player names),
                'td' to read the data cells (stats).

    The rows are parsed once. After dropping the very first <tr>, rows 1-5 and
    rows 7 up to (but excluding) the last one are kept; the skipped rows are
    presumably the column header, the divider before the bench players and the
    team totals -- TODO confirm against the current page layout.
    """
    rows = table.find_all('tr')[1:]
    cells = [[cell.getText() for cell in row.find_all(cell_tag)] for row in rows]
    return cells[1:6] + cells[7:-1]


#One DataFrame per (game, team); all of them are concatenated into output_raw at the end.
stats = []
for link in tqdm(links):
    page = requests.get(link)
    #Fail with a clear HTTP error rather than an IndexError/AttributeError
    #further down when an error page is returned instead of a box score.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    #Characters 7-9 of the first two team hrefs are used as the team codes
    #(presumably hrefs look like '/teams/XXX/...').
    team_hrefs = [anchor['href'] for anchor in soup.find_all('a', {'itemprop': 'name'})]
    teams = [team_hrefs[0][7:10], team_hrefs[1][7:10]]

    #Game ID: characters 47-58 of the box-score URL.
    game_id = link[47:59]

    for team in teams:
        basic_stats = soup.find('table', {'id': 'box-' + team + '-game-basic'})
        adv_stats = soup.find('table', {'id': 'box-' + team + '-game-advanced'})

        #Player Names
        player_names = _player_rows(basic_stats, 'th')
        n_players = len(player_names)

        #Starters: the first five kept rows are flagged 'Y', the rest 'N'
        starters = ['Y'] * 5 + ['N'] * (n_players - 5)

        #Player Basic / Advanced Stats
        team_stats_basic = _player_rows(basic_stats, 'td')
        team_stats_adv = _player_rows(adv_stats, 'td')

        #Consolidating the Data: game id, team, player, starter flag, basic stats, advanced stats
        stats.append(pd.concat([pd.DataFrame([game_id] * n_players),
                                pd.DataFrame([team] * n_players),
                                pd.DataFrame(player_names),
                                pd.DataFrame(starters),
                                pd.DataFrame(team_stats_basic),
                                pd.DataFrame(team_stats_adv)],
                               axis=1))

output_raw = pd.concat(stats)

100%|██████████████████████████████████████████████████████████████████████████████| 1264/1264 [14:32<00:00,  1.45it/s]


In [14]:
#Work on a copy so that the clean-up below (e.g. assigning column names)
#does not modify the scraped output_raw frame.
output = output_raw.copy()

In [15]:
#Add a header to the Data
header = ['Game_ID','Team','Player','Starter','MP','FG','FGA','FG%','3P','3PA','3P%','FT','FTA','FT%','ORB','DRB','TRB','AST','STL','BLK','TOV','PF','PTS','P/M',
          'del','TS%','eFG%','3PAr', 'FTr','ORB%','DRB%','TRB%','AST%','STL%','BLK%','TOV%','USG%','ORtg','DRtg','BPM']


def _minutes_to_decimal(mp):
    """Convert a 'minutes:seconds' string to decimal minutes, rounded to 2 places.

    Any value that is not a string containing ':' (e.g. a text note in place
    of a time, or a missing value) is returned as 0.0.
    """
    if isinstance(mp, str) and ':' in mp:
        minutes, seconds = mp.split(':')[:2]
        return round(int(minutes) + int(seconds) / 60, 2)
    return 0.0


#Index Column - Reset IDs
#reset_index returns a new frame, so naming the columns afterwards does not
#touch the frame that `output` was taken from.
output = output.reset_index(drop=True)
output.columns = header

#Game_ID - OK 
#Team - OK
#Player - Needs Special Characters altered
output['Player'] = output['Player'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')

#MP - Convert to decimal
#Whole-column assignment: element-wise output['MP'][row] = ... is chained
#indexing and is not guaranteed to write back into the DataFrame.
output['MP'] = output['MP'].map(_minutes_to_decimal)

#Delete redundant Minutes Played (del) column:
output = output.drop(columns=['del'])

#Missing values and empty strings both become 0
output = output.fillna(0)
output = output.replace('',0)
print(output)

            Game_ID Team                Player Starter     MP  FG FGA   FG%  \
0      201611010CLE  HOU          James Harden       Y  38.17  13  20  .650   
1      201611010CLE  HOU           Eric Gordon       Y  32.98   6  13  .462   
2      201611010CLE  HOU          Trevor Ariza       Y  31.48   4  11  .364   
3      201611010CLE  HOU         Ryan Anderson       Y  30.52   4  12  .333   
4      201611010CLE  HOU          Clint Capela       Y  18.52   3   5  .600   
...             ...  ...                   ...     ...    ...  ..  ..   ...   
32425  201706120GSW  GSW      Shaun Livingston       N  10.03   2   3  .667   
32426  201706120GSW  GSW           Matt Barnes       N   1.07   0   0     0   
32427  201706120GSW  GSW             Ian Clark       N   0.00   0   0     0   
32428  201706120GSW  GSW  James Michael McAdoo       N   0.00   0   0     0   
32429  201706120GSW  GSW          JaVale McGee       N   0.00   0   0     0   

      3P 3PA  ...  DRB%  TRB%  AST% STL% BLK%  TOV%

In [16]:
#Name the file after the scraped season (`year`) so a different season is not
#written to a file labelled with the wrong year.
output.to_csv(str(year) + 'stats.csv')

In [103]:
#!pip install gspread
#!pip install gspread_dataframe
import gspread
from gspread_dataframe import set_with_dataframe

#Open the first worksheet of the target spreadsheet
gc = gspread.oauth()
sh = gc.open_by_key('1ZmvBC3wiXHROO-I2aovyW2zF9t_z-gan4oDqRP0wSHE')
worksheet = sh.get_worksheet(0)

#Clear the old values before writing. Blanking the Cell objects returned by
#worksheet.range() only changes local copies unless they are sent back, so the
#range is cleared on the sheet itself with a single request instead.
worksheet.batch_clear(['A1:AN50000']) #-> Select the range you want to clear
set_with_dataframe(worksheet, output)