In [2]:
from file_tools import *
from request_tools import *
import time
from tqdm import tqdm
import argparse
from parse_tools import *

In [3]:
def req_all_seasons_hrefs() -> list:
    """Collect the href of every season link on the seasons index page.

    The page at ``SEASONS_PAGE`` is fetched with ``request_html_soup``; every
    ``<th data-stat="season">`` cell is then visited in document order and the
    ``href`` of each ``<a>`` inside it is gathered.

    Returns:
        list: The hrefs gathered. If anything raises while walking the parsed
        page, the error is printed and whatever was collected up to that
        point is returned (possibly an empty list). The page request itself
        runs outside the ``try``, so its exceptions propagate to the caller.
    """
    hrefs = []
    soup = request_html_soup(SEASONS_PAGE)
    season_cell = {'data-stat': 'season'}
    try:
        # Lazily flatten header cells -> anchors so that hrefs are appended one
        # at a time and survive a failure on a later anchor.
        anchors = (
            anchor
            for header in soup.find_all('th', season_cell)
            for anchor in header.find_all('a')
        )
        for anchor in anchors:
            hrefs.append(anchor['href'])
    except Exception as e:
        print(f'Error getting seasons: {e}')
    return hrefs

def req_season_games_hrefs(season_href,sleep=3) -> list:
    """Collect the boxscore hrefs listed on a season's games page(s).

    The games page URL is built as ``HOME_PAGE + <season href without the
    '.html' suffix> + '_games.html'``. If that page has no
    ``<div class="filter">``, the boxscore links are read directly from it.
    Otherwise every link inside the filter div is fetched in turn and the
    links are read from each page's ``<table id="schedule">``.

    Args:
        season_href: Season href such as ``'/leagues/NBA_2023.html'``.
        sleep: Seconds to sleep after each page fetched through the filter
            links.

    Returns:
        list: Hrefs found in ``<td data-stat="box_score_text">`` cells, in
        page order. If parsing raises, the error is printed and the hrefs
        collected so far are returned. The initial page request and filter
        lookup run outside the ``try``, so their exceptions propagate.
    """
    games = []
    # str.strip('.html') removes any of the characters '.', 'h', 't', 'm', 'l'
    # from BOTH ends, which mangles stems such as '/leagues/nbl.html'.
    # Only the literal suffix must be dropped.
    suffix = '.html'
    season_stem = season_href[:-len(suffix)] if season_href.endswith(suffix) else season_href
    html_soup = request_html_soup(HOME_PAGE + season_stem + '_games.html')
    filter_div = html_soup.find('div',{'class':'filter'})
    box_score_cell = {'data-stat': 'box_score_text'}
    try:
        if filter_div is None:
            # Single-page season: every game is listed on the page itself.
            for td in html_soup.find_all('td', box_score_cell):
                for a in td.find_all('a'):
                    games.append(a['href'])
        else:
            # The season is split across the pages linked from the filter div.
            month_hrefs = [a['href'] for a in filter_div.select('a')]
            for month_href in month_hrefs:
                month_soup = request_html_soup(HOME_PAGE + month_href)
                schedule_table = month_soup.find('table', {'id': 'schedule'})
                for td in schedule_table.find_all('td', box_score_cell):
                    for a in td.find_all('a'):
                        games.append(a['href'])
                time.sleep(sleep)
    except Exception as e:
        print(f'Error getting boxscores for season {season_href}: {e}')
    return games

def req_game_boxscores_hrefs(game_href) -> list:
    """List the boxscore page hrefs that belong to one game.

    Fetches ``HOME_PAGE + game_href``. If the page contains a
    ``<div class="filter">``, the hrefs of all links inside it are returned;
    otherwise the game href itself is the only entry.

    Args:
        game_href: Href of the game page, appended to ``HOME_PAGE``.

    Returns:
        list: The boxscore hrefs (the previous ``-> dict`` annotation was
        wrong; a list has always been returned). On any exception the error
        is printed and an empty list is returned.
    """
    boxscores = []
    try:
        html_soup = request_html_soup(HOME_PAGE + game_href)
        filter_div = html_soup.find('div',{'class':'filter'})
        if filter_div is None:
            boxscores.append(game_href)
        else:
            # Build the full list first so a malformed link adds nothing,
            # rather than leaving a partially filled result.
            boxscores.extend([a['href'] for a in filter_div.select('a')])
    except Exception as e:
        print(f'Error getting boxscores hrefs for game {game_href}: {e}')
    return boxscores


def scrape_boxscores_hrefs(start_season=0,end_season=None,sleep=3,outdir=''):
    """Collect game hrefs season by season and write them to a text file.

    The list from ``req_all_seasons_hrefs()`` is sliced with
    ``[start_season:end_season]``. For each remaining season its game hrefs
    are fetched, sorted in descending order and appended to the running list,
    and ``outdir + 'boxscores.txt'`` is rewritten with the whole list (one
    href per line) before sleeping.

    Args:
        start_season: Start index of the slice taken from the season list.
        end_season: End index (exclusive) of the slice; ``None`` means "to
            the end".
        sleep: Seconds to sleep after each season; also forwarded to
            ``req_season_games_hrefs``. Previously read from an undefined
            name, which raised ``NameError``.
        outdir: Prefix concatenated directly with ``'boxscores.txt'``, so it
            must include a trailing separator if it names a directory.
            Previously read from an undefined name.
    """
    # A list, not a dict: the accumulator is extended with ``+=`` and joined
    # with '\n', both of which raise TypeError on a dict.
    game_hrefs = []
    seasons_hrefs = req_all_seasons_hrefs()[start_season:end_season]
    for season_href in tqdm(seasons_hrefs,position=0, leave=True):
        season_games_hrefs = req_season_games_hrefs(season_href,sleep)
        game_hrefs += sorted(season_games_hrefs,reverse=True)
        # The file is rewritten with the full list after every season.
        save_file(outdir + 'boxscores.txt', '\n'.join(game_hrefs))
        time.sleep(sleep)

In [6]:
# Load the saved boxscore hrefs and show the top-level keys
# (the output below lists season hrefs such as '/leagues/NBA_2024.html').
SS = load_json('./00-data-facts/boxscores_hrefs.json')
SS.keys()

dict_keys(['/leagues/NBA_2024.html', '/leagues/NBA_2023.html', '/leagues/NBA_2022.html', '/leagues/NBA_2021.html', '/leagues/NBA_2020.html', '/leagues/NBA_2019.html', '/leagues/NBA_2018.html', '/leagues/NBA_2017.html', '/leagues/NBA_2016.html', '/leagues/NBA_2015.html', '/leagues/NBA_2014.html', '/leagues/NBA_2013.html', '/leagues/NBA_2012.html', '/leagues/NBA_2011.html', '/leagues/NBA_2010.html', '/leagues/NBA_2009.html', '/leagues/NBA_2008.html', '/leagues/NBA_2007.html', '/leagues/NBA_2006.html', '/leagues/NBA_2005.html', '/leagues/NBA_2004.html', '/leagues/NBA_2003.html', '/leagues/NBA_2002.html', '/leagues/NBA_2001.html', '/leagues/NBA_2000.html', '/leagues/NBA_1999.html', '/leagues/NBA_1998.html', '/leagues/NBA_1997.html', '/leagues/NBA_1996.html', '/leagues/NBA_1995.html', '/leagues/NBA_1994.html', '/leagues/NBA_1993.html', '/leagues/NBA_1992.html', '/leagues/NBA_1991.html', '/leagues/NBA_1990.html', '/leagues/NBA_1989.html', '/leagues/NBA_1988.html', '/leagues/NBA_1987.html', '

In [7]:
# Inspect the boxscore hrefs stored under the '/leagues/NBA_2023.html' key.
SS['/leagues/NBA_2023.html']

['/boxscores/202210180BOS.html',
 '/boxscores/202210180GSW.html',
 '/boxscores/202210190ATL.html',
 '/boxscores/202210190BRK.html',
 '/boxscores/202210190DET.html',
 '/boxscores/202210190IND.html',
 '/boxscores/202210190MEM.html',
 '/boxscores/202210190MIA.html',
 '/boxscores/202210190MIN.html',
 '/boxscores/202210190PHO.html',
 '/boxscores/202210190SAC.html',
 '/boxscores/202210190SAS.html',
 '/boxscores/202210190TOR.html',
 '/boxscores/202210190UTA.html',
 '/boxscores/202210200LAL.html',
 '/boxscores/202210200PHI.html',
 '/boxscores/202210210ATL.html',
 '/boxscores/202210210BRK.html',
 '/boxscores/202210210CHO.html',
 '/boxscores/202210210GSW.html',
 '/boxscores/202210210HOU.html',
 '/boxscores/202210210IND.html',
 '/boxscores/202210210MIA.html',
 '/boxscores/202210210MIN.html',
 '/boxscores/202210210NYK.html',
 '/boxscores/202210210POR.html',
 '/boxscores/202210210WAS.html',
 '/boxscores/202210220CHI.html',
 '/boxscores/202210220DAL.html',
 '/boxscores/202210220DEN.html',
 '/boxscor

In [4]:
def scrape_boxscores_html(target_dir, sleep=3):
    """Download every boxscore page listed in the saved hrefs file.

    Reads ``./00-data-facts/boxscores_hrefs.json`` and iterates its keys, each
    of which maps to an iterable of boxscore hrefs. For every href the
    destination is ``target_dir + '/' + href``; hrefs whose destination
    already exists are skipped. Otherwise the page is fetched from
    ``HOME_PAGE + href``, reduced with ``content_div_only``, prettified and
    saved, followed by a pause.

    Args:
        target_dir: Directory prefix (a string) for the saved pages.
        sleep: Seconds to sleep after each successfully saved page.

    Returns:
        None. Prints 'No boxscores found' and returns early when the hrefs
        file loads as ``None``; otherwise prints a summary line at the end.
        Per-page failures are printed and counted, and do not stop the run.
    """
    failed = []
    hrefs_by_season = load_json('./00-data-facts/boxscores_hrefs.json')
    if hrefs_by_season is None:
        print('No boxscores found')
        return

    season_bar = tqdm(hrefs_by_season.keys(),position=0, leave=True, ncols=150)
    for season in season_bar:
        season_bar.set_description(f'{season}')
        # Nested bar for the pages of the current season.
        for href in tqdm(hrefs_by_season[season],position=1, leave=False,ncols=150):
            destination = '/'.join([target_dir,href])
            if file_exists(destination):
                continue
            try:
                soup = request_html_soup(HOME_PAGE + href)
                page_text = content_div_only(soup).prettify()
                save_file(destination, page_text)
                time.sleep(sleep)
            except Exception as e:
                print(f'Error getting boxscores for game {href}: {e}')
                failed.append(href)

    if failed:
        print(f'Failed to fetch {len(failed)} boxscores')
    else:
        print('All boxscores fetched')


        # SEASON_DIR = parse_league_id(SEASON_HTML)['body']
        # if folder(''outdir + SEASON_DIR):
        #     continue

    # game_hrefs_tqdm = tqdm(game_hrefs,position=0, leave=True,ncols=150)
    # for game_href in game_hrefs_tqdm:
    #     game_hrefs_tqdm.set_description(f'{game_href}')
    #     # Check if we already have the boxscores
    #     if file_exists(outdir + game_href):
    #         continue
    #     # Else fetch the game html
    #     try:
    #         html_soup = request_html_soup(HOME_PAGE + game_href)
    #         html_text = content_div_only(html_soup).prettify()
    #         save_file(outdir + game_href, html_text)
    #         time.sleep(sleep)
    #     except Exception as e:
    #         print(f'Error getting boxscores for game {game_href}: {e}')

['/leagues/NBA_2024.html',
 '/leagues/NBA_2023.html',
 '/leagues/NBA_2022.html',
 '/leagues/NBA_2021.html',
 '/leagues/NBA_2020.html',
 '/leagues/NBA_2019.html',
 '/leagues/NBA_2018.html',
 '/leagues/NBA_2017.html',
 '/leagues/NBA_2016.html',
 '/leagues/NBA_2015.html',
 '/leagues/NBA_2014.html',
 '/leagues/NBA_2013.html',
 '/leagues/NBA_2012.html',
 '/leagues/NBA_2011.html',
 '/leagues/NBA_2010.html',
 '/leagues/NBA_2009.html',
 '/leagues/NBA_2008.html',
 '/leagues/NBA_2007.html',
 '/leagues/NBA_2006.html',
 '/leagues/NBA_2005.html',
 '/leagues/NBA_2004.html',
 '/leagues/NBA_2003.html',
 '/leagues/NBA_2002.html',
 '/leagues/NBA_2001.html',
 '/leagues/NBA_2000.html',
 '/leagues/NBA_1999.html',
 '/leagues/NBA_1998.html',
 '/leagues/NBA_1997.html',
 '/leagues/NBA_1996.html',
 '/leagues/NBA_1995.html',
 '/leagues/NBA_1994.html',
 '/leagues/NBA_1993.html',
 '/leagues/NBA_1992.html',
 '/leagues/NBA_1991.html',
 '/leagues/NBA_1990.html',
 '/leagues/NBA_1989.html',
 '/leagues/NBA_1988.html',
 