In [None]:
# %pip install bs4
# %pip install html5lib

Collecting html5lib
  Downloading html5lib-1.1-py2.py3-none-any.whl (112 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.2/112.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting webencodings (from html5lib)
  Using cached webencodings-0.5.1-py2.py3-none-any.whl (11 kB)
Installing collected packages: webencodings, html5lib
Successfully installed html5lib-1.1 webencodings-0.5.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
from bs4 import *
from bs4.element import PageElement
import time
import requests
import re
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
import json
import pickle
from pathlib import Path
# Base URLs on basketball-reference.com used to build request URLs.
# HOME_PAGE is the site root; site-relative hrefs are appended to it.
HOME_PAGE = 'https://www.basketball-reference.com'
# Seasons index page (the page that lists every league season).
SEASONS_PAGE = 'https://www.basketball-reference.com/leagues'
# NOTE(review): TEAMS_PAGE and BOXSCORES_PAGE are not referenced by the functions shown in this notebook.
TEAMS_PAGE = 'https://www.basketball-reference.com/teams'
BOXSCORES_PAGE = 'https://www.basketball-reference.com/boxscores'

In [4]:
def fetch_html(url,from_local=False):
    """Return the HTML text for ``url`` with HTML comment delimiters removed.

    Every ``<!--`` and ``-->`` is replaced by a newline so that markup the
    site wraps in HTML comments is exposed to the parser.

    Args:
        url: Page address. When ``from_local`` is true it is first used
            verbatim as a file path; for the web request a missing scheme
            is filled in with ``https://``.
        from_local: Try to read the page from disk before going online.

    Returns:
        The processed HTML string, or ``None`` when the web request fails
        (connection error, timeout, or a non-2xx HTTP status).
    """
    if from_local:
        try:
            data = load(url)
            return re.sub("<!--|-->","\n",data)
        except Exception:
            # Best-effort cache lookup: any read problem falls through to the
            # web request. ``Exception`` (not a bare ``except``) keeps
            # Ctrl-C / SystemExit working.
            print(f'Failed to fetch {url} from local. Try to fetch online')
    try:
        if not url.startswith(("http://", "https://")):
            url = "https://"+url
        # The context manager closes the session's connection pool.
        with requests.Session() as session:
            # Without a timeout a stalled server would hang the notebook forever.
            response = session.get(url, timeout=30)
            # Treat HTTP errors (404, 429 rate limiting, ...) as failures so an
            # error page is never returned -- or cached by callers -- as real data.
            response.raise_for_status()
            return re.sub("<!--|-->","\n",response.text)
    except Exception:
        print(f'Failed to fetch {url} from web. Please double check url.')
        return None

def make_soup(text):
    """Parse ``text`` into a BeautifulSoup tree with the ``html.parser`` backend."""
    parser_name = 'html.parser'
    soup = BeautifulSoup(text, features=parser_name)
    return soup

def save(a,filepath,mode='w',file_type=None):
    """Write ``a`` to ``filepath``, creating missing parent directories.

    Args:
        a: Object to persist. Must be a string (or bytes for a binary
            ``mode``) when ``file_type`` is ``None``.
        filepath: Destination file path.
        mode: ``open`` mode. For pickle output a ``'b'`` flag is appended
            when missing, because ``pickle.dump`` requires a binary file.
        file_type: ``None`` for a raw write, a string ending in ``'json'``
            for ``json.dump``, or a string ending in ``'pkl'`` for
            ``pickle.dump``.

    Raises:
        ValueError: If ``file_type`` is given but is neither JSON nor pickle
            (previously such a call silently wrote nothing).
    """
    if file_type is not None and not file_type.endswith(('json', 'pkl')):
        raise ValueError(f'Unsupported file_type: {file_type!r}')

    Path(filepath).parent.mkdir(parents=True,exist_ok=True)

    if file_type is None:
        with open(filepath,mode) as f:
            f.write(a)
    elif file_type.endswith('json'):
        with open(filepath,mode) as f:
            json.dump(a,f)
    else:
        # pickle writes bytes; the default text mode 'w' would raise TypeError.
        binary_mode = mode if 'b' in mode else mode + 'b'
        with open(filepath,binary_mode) as f:
            pickle.dump(a,f)

def load(filepath,mode='r',file_type=None):
    """Read ``filepath`` and return its contents.

    Args:
        filepath: File to read.
        mode: ``open`` mode. For pickle input a ``'b'`` flag is appended
            when missing, because ``pickle.load`` requires a binary file.
        file_type: ``None`` to return the raw file contents, a string ending
            in ``'json'`` to decode JSON, or a string ending in ``'pkl'`` to
            unpickle.

    Returns:
        The file contents (raw, JSON-decoded, or unpickled).

    Raises:
        ValueError: If ``file_type`` is given but is neither JSON nor pickle.
    """
    if file_type is None:
        with open(filepath,mode) as f:
            return f.read()
    if file_type.endswith('json'):
        with open(filepath,mode) as f:
            return json.load(f)
    if file_type.endswith('pkl'):
        # SECURITY: unpickling can execute arbitrary code; only load pickle
        # files this project wrote itself.
        binary_mode = mode if 'b' in mode else mode + 'b'
        with open(filepath,binary_mode) as f:
            return pickle.load(f)
    raise ValueError(f'Unsupported file_type: {file_type!r}')

# Fetch and parse the seasons index page when this cell is executed.
# NOTE(review): HOST and URL hold the same values as HOME_PAGE and SEASONS_PAGE.
HOST = 'https://www.basketball-reference.com'
URL = f'{HOST}/leagues'
# fetch_html performs a web request here and returns None if it fails;
# that None is then handed to make_soup unchecked.
html_text = fetch_html(URL)
html_soup = make_soup(html_text)
# html_soup

In [92]:
# NOTE(review): these four constants repeat, with identical values, the ones
# assigned in the imports cell at the top of the notebook.
HOME_PAGE = 'https://www.basketball-reference.com'
SEASONS_PAGE = 'https://www.basketball-reference.com/leagues'
TEAMS_PAGE = 'https://www.basketball-reference.com/teams'
BOXSCORES_PAGE = 'https://www.basketball-reference.com/boxscores'

def fetch_seasons_hrefs(save_to=None,from_local=False):
    """Collect the season links listed on the seasons index page.

    Args:
        save_to: Optional *file path*. When given, the list of hrefs is
            written there as JSON (a list cannot be written with a plain
            ``f.write``, which is what the untyped ``save`` call did).
        from_local: Forwarded to ``fetch_html``.

    Returns:
        A list with the ``href`` of every link found inside
        ``<th data-stat="season">`` cells; an empty list when the page could
        not be fetched.
    """
    html_text = fetch_html(SEASONS_PAGE,from_local)
    if html_text is None:
        # fetch_html already reported the failure; there is nothing to parse.
        return []

    html_soup = make_soup(html_text)
    seasons_list = [
        a['href']
        for th in html_soup.find_all('th', {'data-stat': 'season'})
        for a in th.find_all('a', href=True)  # skip anchors without an href
    ]
    if save_to:
        save(seasons_list,save_to,file_type='json')
    return seasons_list

def _schedule_boxscore_hrefs(html_soup):
    """Return the box-score link hrefs found in the page's ``#schedule`` table."""
    schedule_table = html_soup.find('table', {'id': 'schedule'})
    if schedule_table is None:
        # Page without a schedule table: nothing to collect.
        return []
    return [
        a['href']
        for td in schedule_table.find_all('td',{'data-stat':'box_score_text'})
        # find_all restricts iteration to <a> tags that carry an href, so
        # text nodes inside the cell cannot break the lookup.
        for a in td.find_all('a', href=True)
    ]


def fetch_season_boxscores_hrefs(season_href,save_to=None,from_local=False,sleep=0):
    """Collect the box-score links of every game in one season.

    Loads the season's ``*_games.html`` schedule page. If that page has a
    ``div.filter`` block, every link in it is fetched and the schedule of each
    linked page is harvested; otherwise the schedule on the page itself is used.

    Args:
        season_href: Site-relative season link; a trailing ``.html`` is removed
            before ``_games.html`` is appended.
        save_to: Optional path prefix. Each fetched page is saved to
            ``f'{save_to}{url}'``.
        from_local: Forwarded to ``fetch_html``.
        sleep: Seconds to pause after each filter page is processed.

    Returns:
        A list of box-score hrefs; an empty list when the schedule page could
        not be fetched.
    """
    # Remove the '.html' *suffix* only. str.strip('.html') strips any of the
    # characters '.', 'h', 't', 'm', 'l' from both ends of the string.
    suffix = '.html'
    season_path = season_href[:-len(suffix)] if season_href.endswith(suffix) else season_href
    url = f"{HOME_PAGE}{season_path}_games.html"

    html_text = fetch_html(url,from_local)
    if html_text is None:
        # Keep the return type uniform so callers can always iterate the result.
        return []
    if save_to:
        save(html_text,f'{save_to}{url}')

    html_soup = make_soup(html_text)
    filter_div = html_soup.find('div',{'class':'filter'})
    if filter_div is None:
        return _schedule_boxscore_hrefs(html_soup)

    season_boxscores_hrefs = []
    for filter_link in filter_div.select('a[href]'):
        page_url = f"{HOME_PAGE}{filter_link['href']}"
        page_html = fetch_html(page_url,from_local)
        if page_html is None:
            continue
        if save_to:
            save(page_html,f'{save_to}{page_url}')

        season_boxscores_hrefs += _schedule_boxscore_hrefs(make_soup(page_html))
        if sleep:
            time.sleep(sleep)
    return season_boxscores_hrefs

def fetch_match_boxscores(boxscore_href,save_to=None,from_local=None, sleep=0):
    """Fetch (and optionally save) a game's box-score page and its sub-pages.

    The box-score page is fetched and parsed. If it has a ``div.filter``
    block, every link in that block is fetched as well; otherwise only the
    box-score page itself is handled.

    Args:
        boxscore_href: Site-relative link of the box-score page.
        save_to: Optional path prefix. Each page is saved to
            ``f'{save_to}{url}'``.
        from_local: Forwarded to ``fetch_html``.
        sleep: Seconds to pause after each filter page is processed.

    Returns:
        The hrefs of the pages that were obtained successfully, in page order;
        an empty list when the box-score page itself could not be fetched.
    """
    url = f"{HOME_PAGE}{boxscore_href}"
    main_html = fetch_html(url,from_local)
    if main_html is None:
        return []

    # Parse the page that was just fetched. Reading a notebook-level
    # ``html_soup`` here would inspect whatever page an earlier cell parsed.
    page_soup = make_soup(main_html)
    filter_div = page_soup.find('div',{'class':'filter'})

    if filter_div is None:
        # No sub-pages: the page already in hand is the only result.
        if save_to:
            save(main_html,f'{save_to}{url}')
        return [boxscore_href]

    box_scores_hrefs = []
    for filter_link in filter_div.select('a[href]'):
        filter_href = filter_link['href']
        page_url = f'{HOME_PAGE}{filter_href}'
        if filter_href == boxscore_href:
            # The filter links back to the page we already have; reuse it
            # rather than issuing a second identical request.
            page_html = main_html
        else:
            page_html = fetch_html(page_url,from_local)
        if page_html is None:
            continue
        if save_to:
            save(page_html,f'{save_to}{page_url}')
        if sleep:
            time.sleep(sleep)
        box_scores_hrefs.append(filter_href)
    return box_scores_hrefs


# def fetch_player(player):
#     # pass
#     pass


def main(save_to='./', max_seasons=2, sleep=3):
    """Crawl season schedules and the box-score pages of their games.

    Args:
        save_to: Path prefix under which fetched pages are stored.
        max_seasons: Number of seasons (from the start of the seasons list)
            to process; ``None`` processes all of them.
        sleep: Seconds to pause between page requests.
    """
    # ``save_to`` is a directory prefix here, whereas fetch_seasons_hrefs hands
    # its ``save_to`` straight to save() as a file path, so it is not forwarded.
    seasons_hrefs = fetch_seasons_hrefs() or []
    if max_seasons is not None:
        # Slice the list itself: a tqdm object is not subscriptable.
        seasons_hrefs = seasons_hrefs[:max_seasons]

    for seasons_href in tqdm(seasons_hrefs,position=0):
        # fetch season boxscores list; tolerate a None result on fetch failure
        season_boxscores_hrefs = fetch_season_boxscores_hrefs(
            seasons_href,save_to=save_to,from_local=True, sleep=sleep
        ) or []
        for season_boxscores_href in tqdm(season_boxscores_hrefs,position=1,leave=True):
            # fetch match box scores (pages are saved as a side effect)
            fetch_match_boxscores(season_boxscores_href,save_to=save_to,from_local=False, sleep=sleep)

    




In [93]:
# NOTE(review): in the cells shown, `season_boxscores_hrefs` is only ever assigned
# inside functions, so this call presumably relies on a notebook-scope variable
# from an earlier kernel session -- confirm it is defined before running.
fetch_match_boxscores(season_boxscores_hrefs[1])

100%|██████████| 4/4 [00:01<00:00,  3.32it/s]


['/boxscores/202210180BOS.html',
 '/boxscores/pbp/202210180BOS.html',
 '/boxscores/shot-chart/202210180BOS.html',
 '/boxscores/plus-minus/202210180BOS.html']