In [1]:
# import libraries
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import numpy as np
import time
import random

# References
- one season page:  https://www.basketball-reference.com/leagues/NBA_2020_games.html
- one game page: https://www.basketball-reference.com/boxscores/201910220TOR.html
- Fastest way to append rows to df: https://stackoverflow.com/questions/57000903/what-is-the-fastest-and-most-efficient-way-to-append-rows-to-a-dataframe

In [2]:
list_html = 'https://www.basketball-reference.com/boxscores/201910220TOR.html'

In [3]:
# Getting all references from the homepage
# for link in soup.find_all("a"):
#     print("Inner Text: {}".format(link.text))
#     print("Title: {}".format(link.get("title")))
#     print("href: {}".format(link.get("href")))

# Functions to scrape one game's box scores

In [7]:
def table_scraper(box_table):
    '''
    Function scprapes one table on game page
    '''
    rows = []
    for idx, child in enumerate(box_table.find_all("tr")):
        # Dict for one row
        row = {}
        # Counter for dictionaries
        counter = 0
        # Save whether the player is starter
        if idx < 7:
            # value is 1 if player is starter
            row[counter] = 1
            counter = counter + 1
        else:
            row[counter] = 0 # otherwise zero
            counter = counter + 1
        
        for td in child: # iterate through all elements of row
            try:
                row[counter] = td.text
                
                if counter == 1: # Getting player references as player IDs
                    counter = counter + 1
                    row[counter] = td.find('a').get('href')

            except:
                continue
            
            counter = counter + 1

        if len(row) > 0:
            rows.append(row)
    # Drop useless elements of table        
    rows.pop(7) # additional header for Reserves
    rows.pop(0) # header
    rows.pop(-1) # total row
    return(rows[1:], rows[0]) # giving back the table as a dict and header names

In [8]:
def game_information_collector(soup):
    '''
    Function collects additional information about games like team ID, score, date or location
    Team ID is equal to the end of html
    '''
    global_inf = []
    # Team IDs and name:
    for i in soup.find_all('a', attrs = {'itemprop' : 'name'}):
        global_inf.append(i.text) #Team name
        global_inf.append(i.get('href')) #ID
    # Score
    gam_res = soup.find_all("div", attrs = {'class' : 'scorebox'})
    for i in gam_res[0].find_all('div', attrs = {'class' : "score"}):
        global_inf.append(i.text)
    # date and location
    for i in soup.find('div', attrs = {'class' : 'scorebox_meta'}).find_all('div'):
            global_inf.append(i.text)
    return(global_inf)

In [9]:
def single_page_handler(list_html):
    '''
    Function scrapes basic statistic of the home and away team as a dictionary
    '''
    result = requests.get(list_html)
    soup = BeautifulSoup(result.content)
    
    # Global information (team IDs, date, location...)
    global_inf = game_information_collector(soup)

    counter = 0
    # It stores the number of wanted tables
    table_needed = []
    # Getting the location of necessary tables in the all table list of the game page
    for table_id in soup.find_all('table'):
        # I need only the basic statistics
        if 'game-basic' in table_id.get('id'):
            table_needed.append(counter)
        counter = counter + 1

    # Extract home team's tables
    dict_home, header = table_scraper(soup.find_all('table')[table_needed[1]])
    for item in dict_home:
        item.update( {"team":global_inf[2]})
        item.update( {"team_id":global_inf[3]})
        item.update( {"game_id":list_html.rsplit('/', 1)[-1]})

    # Extract away team's table
    dict_away, header = table_scraper(soup.find_all('table')[table_needed[0]])
    for item in dict_away:
        item.update( {"team":global_inf[0]})
        item.update( {"team_id":global_inf[1]})
        item.update( {"game_id":list_html.rsplit('/', 1)[-1]})
        
    # Concatenate tables
    return dict_home + dict_away, header

Let's check how it works on a given match url:

In [10]:
list_html = 'https://www.basketball-reference.com/boxscores/202012220BRK.html'
df = single_page_handler(list_html)

In [12]:
df

([{0: 1,
   1: 'Kyrie Irving',
   2: '/players/i/irvinky01.html',
   3: '25:18',
   4: '10',
   5: '16',
   6: '.625',
   7: '4',
   8: '7',
   9: '.571',
   10: '2',
   11: '2',
   12: '1.000',
   13: '1',
   14: '3',
   15: '4',
   16: '4',
   17: '0',
   18: '0',
   19: '1',
   20: '3',
   21: '26',
   22: '+32',
   'team': 'Brooklyn Nets',
   'team_id': '/teams/BRK/2021.html',
   'game_id': '202012220BRK.html'},
  {0: 1,
   1: 'Kevin Durant',
   2: '/players/d/duranke01.html',
   3: '24:56',
   4: '7',
   5: '16',
   6: '.438',
   7: '1',
   8: '2',
   9: '.500',
   10: '7',
   11: '7',
   12: '1.000',
   13: '1',
   14: '4',
   15: '5',
   16: '3',
   17: '3',
   18: '1',
   19: '1',
   20: '3',
   21: '22',
   22: '+26',
   'team': 'Brooklyn Nets',
   'team_id': '/teams/BRK/2021.html',
   'game_id': '202012220BRK.html'},
  {0: 1,
   1: 'Joe Harris',
   2: '/players/h/harrijo01.html',
   3: '20:46',
   4: '4',
   5: '8',
   6: '.500',
   7: '2',
   8: '5',
   9: '.400',
   10: '0'

# Collecting links for the games

In [16]:
def box_score_table_scraper(soup):
    '''
    I need many box score links from tables like: https://www.basketball-reference.com/leagues/NBA_2020_games.html
    Additionally, information can be collected about games. Every row is one game having information about team names and IDs,
    game scores, game ID. 
    '''
    rows = []
    for child in soup.find_all('table')[0].find_all("tr"):
        row = []
        counter = -1
        for td in child:
            counter = counter + 1

            try:
                if counter in [2, 6, 4]: # references are needed for teams and the box score
                    row.append(td.text)
                    row.append(td.find('a').get('href'))
                else:
                    row.append(td.text) # I need only text information in case of 
            except:
                continue
                
        if len(row) > 0:
            rows.append(row)
            
    return rows[1:] # it gives back the full list of lists 

In [17]:
def get_one_season_links(url, main_url):
    '''
    It collects all the game level information and box score links for one season
    All references per month must be collected. Then references are given to
    box_score_table_scraper to create a data frame about game level information for the whole season
    '''
    result = requests.get(main_url + url)
    soup = BeautifulSoup(result.content)
    
    # Games are grouped by months. The list collects the url of different months
    href_season_months = []
    for link in soup.find('div', attrs={'class' : 'filter'}).find_all("a"):
         href_season_months.append(link.get("href"))
    
    # It stores game level information as a lists of list
    array = []
    for href_months in href_season_months:
        #print(main_url + href_months)
        result = requests.get(main_url + href_months)
        soup = BeautifulSoup(result.content)
        
        array = array + box_score_table_scraper(soup)
    
    # Transform the result to a dataframe
    df = pd.DataFrame(array[1:], columns = ['date', 'hour', 'visitor', 'visitor_id','visitor_score'
                                  , 'home', 'home_id', 'home_score', 'box_score', 'box_score_link',
                                  'OT', 'att', 'note']).drop(columns = ['box_score', 'note'])
    
    df = df[(df.home_score != '')]
    return(df)

In [18]:
def get_one_season_data(url, main_url, saving_path):
    '''
    It collects all game level information and the linked bo score links using get_one_season_links()
    Then it iterates through box score links to get statistics per game
    Every game statistic is appended to each other to form a df containing player level information
    Finally, it saves player level information for one season
    '''
    # Creating game level df with box score references
    box_score_df = get_one_season_links(url, main_url)
    box_score_df = box_score_df[box_score_df.home_score.notna()]
    
    stats_all_game = []
    for box_link in box_score_df.box_score_link:
        if isinstance(box_link, str):
            stats_per_game, headers = single_page_handler(main_url + box_link)
            stats_all_game = stats_all_game + stats_per_game
    
    stats_all_game = pd.DataFrame.from_dict(stats_all_game)
    stats_all_game.columns = ['starter'] + [headers[1]] + ['player_id'] + [headers[x] for x in range(22)][2:] + ['team', 'team_id', 'game_id']
    
    stats_all_game.to_csv(saving_path)
    
    return(box_score_df)

In [19]:
saving_path = 'C:/Users/Hp/Desktop/year2021/nba game prediction/data/box scores'
url = "/leagues/NBA_2021_games.html"
main_url = 'https://www.basketball-reference.com'
proba = get_one_season_data(url, main_url, saving_path)

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/Hp/Desktop/year2021/nba game prediction/data/box scores'

Finally get the stats of many seasons

In [24]:
def scrape_the_seasons(url, numb_seasons, saving_path, main_url):
    '''
    
    '''
    result = requests.get(main_url + url)
    soup = BeautifulSoup(result.content)
    
    season_links = [url]
    
    for idx, i in enumerate(season_links):
        season_links.append(soup.find('div', attrs = {'class' : 'prevnext'}).find('a').get('href'))
        result = requests.get(main_url + season_links[-1])
        soup = BeautifulSoup(result.content)
        if idx > numb_seasons:
            break
    
    #pd.concat([s1, s2], ignore_index=True)
    #box_df_list = []
    for idx, link in enumerate(season_links):
        print(idx)
        saving_path_new = saving_path + "/" + link.rsplit('/', 1)[-1].rsplit('.')[0] + '.csv'
        print(main_url + link)
        box = get_one_season_data(link, main_url, saving_path_new)
        # box_df_list.append(get_one_season_data(link, main_url, saving_path_new))
        box.to_csv(saving_path + "/" + link.rsplit('/', 1)[-1].rsplit('.')[0] + 'box_score' +'.csv')

    return(box)

In [25]:
saving_path = 'C:/Users/Hp/Desktop/year2021/nba game prediction/data/box scores'
url = "/leagues/NBA_2021_games.html"
numb_seasons = 5
main_url = 'https://www.basketball-reference.com'

start_time = time.time()

scrape_the_seasons(url, numb_seasons, saving_path, main_url)

end_time = time.time()

print('Execution time = %.6f seconds' % (end_time-start_time))

0
https://www.basketball-reference.com/leagues/NBA_2021_games.html
1
https://www.basketball-reference.com/leagues/NBA_2020_games.html
2
https://www.basketball-reference.com/leagues/NBA_2019_games.html
3
https://www.basketball-reference.com/leagues/NBA_2018_games.html
4
https://www.basketball-reference.com/leagues/NBA_2017_games.html
5
https://www.basketball-reference.com/leagues/NBA_2016_games.html
6
https://www.basketball-reference.com/leagues/NBA_2015_games.html
7
https://www.basketball-reference.com/leagues/NBA_2014_games.html
Execution time = 8186.268025 seconds


In [None]:
saving_path = 'C:/Users/Hp/Desktop/year2021/nba game prediction/data/box scores/proba.csv'
url = '/leagues/NBA_2021_games.html'
main_url =  'https://www.basketball-reference.com'
df = get_one_season_data(url, main_url, saving_path)

# Improvement Ideas
- There are not enough games in box score table per season. It shoud be 1230 - Actually I have almost enough for full years
- in scrape_the_seasons funtion box score tables shoud be binded properly - error occures
- box score tables cauld be handled as lists/dicts for simplicity 
- What are play off games and regular games?

In [26]:
8186/60


136.43333333333334