In [15]:
import os
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout # type: ignore
import time
import pandas as pd

In [16]:
month = 4
day = 1
year = 2022

In [3]:
DATA_DIR = 'data'
STANDINGS_DIR = os.path.join(DATA_DIR, 'standings')
SCORES_DIR = os.path.join(DATA_DIR, 'scores')

In [4]:
async def get_html(url, selector, sleep=3, retries=3):
    html = None
    for i in range(1, retries+1):
        time.sleep(sleep*i)

        try:
            async with async_playwright() as p:
                browser = await p.firefox.launch()
                page = await browser.new_page()
                await page.goto(url)
                print(await page.title())
                html = await page.inner_html(selector)
        except PlaywrightTimeout:
            print(f'Timeout error on {url}')
            continue
        else:
            break

    return html

In [20]:
def read_line_score(soup):
    line_score = pd.read_html(str(soup), attrs={'id':'line_score'})[0]

    cols = list(line_score.columns)
    cols[0] = 'team'
    cols[-1] = 'total'
    line_score.columns = cols

    line_score = line_score[['team', 'total']]

    return line_score

In [18]:
def read_stats(soup, team, stat):
    df = pd.read_html(str(soup), attrs={'id': f'box-{team}-game-{stat}'}, index_col=0)[0]
    df = df.apply(pd.to_numeric, errors='coerce')
    df.drop('Reserves', inplace=True)

    return df

In [19]:
def read_season_info(soup):
    nav = soup.select('#bottom_nav_container')[0]
    hrefs = [a['href'] for a in nav.find_all('a')]
    season = os.path.basename(hrefs[1]).split('_')[0]

    return season

In [12]:
async def scrape_game(month, day, year):
    htmls = []
    url = f'https://www.basketball-reference.com/boxscores/?month={month}&day={day}&year={year}'
    html = await get_html(url, '#content .game_summaries')

    soup = BeautifulSoup(html)
    links = soup.find_all('a')
    hrefs = [l.get('href') for l in links]
    boxscores = [l for l in hrefs if l and 'boxscore' in l and '.html' in l and 'pbp' not in l]
    boxscores = [f'https://www.basketball-reference.com{l}' for l in boxscores]

    for url in boxscores:
        save_path = os.path.join(SCORES_DIR, url.split('/')[-1])
        if os.path.exists(save_path):
            continue

        html = await get_html(url, '#content')
        htmls.append(html)

    return htmls

In [13]:
htmls = await scrape_game(month,day,year)

NBA Games Played on April 16, 2024 | Basketball-Reference.com
Play-In Game: Lakers vs Pelicans, April 16, 2024 | Basketball-Reference.com
Play-In Game: Lakers vs Pelicans, April 16, 2024 | Basketball-Reference.com
Play-In Game: Warriors vs Kings, April 16, 2024 | Basketball-Reference.com
Play-In Game: Warriors vs Kings, April 16, 2024 | Basketball-Reference.com


In [21]:
base_cols = None
games = []

for html in htmls:
    soup = BeautifulSoup(html)

    [s.decompose for s in soup.select('tr.over_header')]
    [s.decompose for s in soup.select('tr.thead')]

    line_score = read_line_score(soup)
    teams = list(line_score['team'])

    summaries = []
    for team in teams:
        basic = read_stats(soup, team, 'basic')
        advanced = read_stats(soup, team, 'advanced')

        totals = pd.concat([basic.iloc[-1,:], advanced.iloc[-1,:]])
        totals = totals.reset_index().drop(['level_0'], axis=1).set_index('Starters')
        totals.index = totals.index.str.lower()

        maxes = pd.concat([basic.iloc[:-1,:].max(), advanced.iloc[:-1,:].max()])
        maxes = maxes.reset_index().drop(['level_0'], axis=1).set_index('Starters')
        maxes.index = maxes.index.str.lower() + '_max'

        summary = pd.concat([totals, maxes])
        summary['Team Totals'][37:] = summary[0][37:]
        summary.drop(0, axis=1, inplace=True)

        if base_cols is None:
            base_cols = list(summary.index.drop_duplicates(keep='first'))
            base_cols = [b for b in base_cols if 'bpm' not in b]

        summary.columns = [teams.index(team)]
        summary = summary.loc[base_cols]

        summaries.append(summary)

    summary = pd.concat(summaries, axis=1).T

    game = pd.concat([summary, line_score], axis=1)

    game['home'] = [0,1]
    game_opp = game.iloc[::-1].reset_index()

    game_opp.columns += '_opp'

    full_game = pd.concat([game, game_opp], axis=1)
    full_game['season'] = read_season_info(soup)

    full_game['date'] = f'{year}{month}{day}'
    full_game['date'] = pd.to_datetime(full_game['date'], format='%Y%m%d')

    full_game['won'] = full_game['total'] > full_game['total_opp']

    games.append(full_game)


  line_score = pd.read_html(str(soup), attrs={'id':'line_score'})[0]
  df = pd.read_html(str(soup), attrs={'id': f'box-{team}-game-{stat}'}, index_col=0)[0]
  df = pd.read_html(str(soup), attrs={'id': f'box-{team}-game-{stat}'}, index_col=0)[0]
  df = pd.read_html(str(soup), attrs={'id': f'box-{team}-game-{stat}'}, index_col=0)[0]
  df = pd.read_html(str(soup), attrs={'id': f'box-{team}-game-{stat}'}, index_col=0)[0]
  line_score = pd.read_html(str(soup), attrs={'id':'line_score'})[0]
  df = pd.read_html(str(soup), attrs={'id': f'box-{team}-game-{stat}'}, index_col=0)[0]
  df = pd.read_html(str(soup), attrs={'id': f'box-{team}-game-{stat}'}, index_col=0)[0]
  df = pd.read_html(str(soup), attrs={'id': f'box-{team}-game-{stat}'}, index_col=0)[0]
  df = pd.read_html(str(soup), attrs={'id': f'box-{team}-game-{stat}'}, index_col=0)[0]
  line_score = pd.read_html(str(soup), attrs={'id':'line_score'})[0]
  df = pd.read_html(str(soup), attrs={'id': f'box-{team}-game-{stat}'}, index_col=0)[0]
 

In [38]:
games_df = pd.concat(games, ignore_index=True)

In [39]:
df = pd.read_csv('nba_games.csv')

df.drop(['Unnamed: 0'], axis=1, inplace=True)
games_df.drop(['unnamed: 16_level_1', 'unnamed: 16_level_1_max', 'unnamed: 16_level_1_opp', 'unnamed: 16_level_1_max_opp'], axis=1, inplace=True)

df.columns = games_df.columns
df = pd.concat([df, games_df], ignore_index=True)

In [63]:
df.to_csv('nba_games.csv')

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,drb,trb,ast,stl,blk,tov,pf,pts,gmsc,+/-,ts%,efg%,3par,ftr,orb%,drb%,trb%,ast%,stl%,blk%,tov%,usg%,ortg,drtg,mp_max,mp_max.1,fg_max,fga_max,fg%_max,3p_max,3pa_max,3p%_max,ft_max,fta_max,ft%_max,orb_max,drb_max,trb_max,ast_max,stl_max,blk_max,tov_max,pf_max,pts_max,gmsc_max,+/-_max,ts%_max,efg%_max,3par_max,ftr_max,orb%_max,drb%_max,trb%_max,ast%_max,stl%_max,blk%_max,tov%_max,usg%_max,ortg_max,drtg_max,team,total,home,index_opp,mp_opp,mp_opp.1,fg_opp,fga_opp,fg%_opp,3p_opp,3pa_opp,3p%_opp,ft_opp,fta_opp,ft%_opp,orb_opp,drb_opp,trb_opp,ast_opp,stl_opp,blk_opp,tov_opp,pf_opp,pts_opp,gmsc_opp,+/-_opp,ts%_opp,efg%_opp,3par_opp,ftr_opp,orb%_opp,drb%_opp,trb%_opp,ast%_opp,stl%_opp,blk%_opp,tov%_opp,usg%_opp,ortg_opp,drtg_opp,mp_max_opp,mp_max_opp.1,fg_max_opp,fga_max_opp,fg%_max_opp,3p_max_opp,3pa_max_opp,3p%_max_opp,ft_max_opp,fta_max_opp,ft%_max_opp,orb_max_opp,drb_max_opp,trb_max_opp,ast_max_opp,stl_max_opp,blk_max_opp,tov_max_opp,pf_max_opp,pts_max_opp,gmsc_max_opp,+/-_max_opp,ts%_max_opp,efg%_max_opp,3par_max_opp,ftr_max_opp,orb%_max_opp,drb%_max_opp,trb%_max_opp,ast%_max_opp,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,240.0,38.0,72.0,0.528,16.0,37.0,0.432,18.0,21.0,0.857,4.0,33.0,37.0,27.0,7.0,2.0,19.0,16.0,110.0,,,0.677,0.639,0.514,0.292,12.9,86.8,53.6,71.1,7.3,4.4,19.0,100.0,115.5,112.3,,,9.0,20.0,1.000,5.0,12.0,1.000,8.0,9.0,1.0,1.0,10.0,11.0,8.0,2.0,1.0,5.0,3.0,25.0,16.2,15.0,1.250,1.250,1.000,1.400,12.9,44.1,26.7,30.6,3.9,8.1,41.0,37.3,160.0,121.0,POR,110,0,1,240.0,240.0,40.0,84.0,0.476,14.0,39.0,0.359,13.0,15.0,0.867,5.0,27.0,32.0,28.0,11.0,2.0,12.0,18.0,107.0,,,0.591,0.560,0.464,0.179,13.2,87.1,46.4,70.0,11.5,5.7,11.7,100.0,112.3,115.5,,,8.0,14.0,0.625,4.0,9.0,0.667,6.0,6.0,1.0,3.0,7.0,7.0,8.0,6.0,1.0,3.0,5.0,16.0,17.1,2.0,0.889,0.875,1.000,0.800,10.9,63.6,28.6,33.4,9.2,4.1,26.2,26.8,155.0,123.0,MIA,107,1,2023,2022-11-07,True
1,240.0,240.0,40.0,84.0,0.476,14.0,39.0,0.359,13.0,15.0,0.867,5.0,27.0,32.0,28.0,11.0,2.0,12.0,18.0,107.0,,,0.591,0.560,0.464,0.179,13.2,87.1,46.4,70.0,11.5,5.7,11.7,100.0,112.3,115.5,,,8.0,14.0,0.625,4.0,9.0,0.667,6.0,6.0,1.0,3.0,7.0,7.0,8.0,6.0,1.0,3.0,5.0,16.0,17.1,2.0,0.889,0.875,1.000,0.800,10.9,63.6,28.6,33.4,9.2,4.1,26.2,26.8,155.0,123.0,MIA,107,1,0,240.0,240.0,38.0,72.0,0.528,16.0,37.0,0.432,18.0,21.0,0.857,4.0,33.0,37.0,27.0,7.0,2.0,19.0,16.0,110.0,,,0.677,0.639,0.514,0.292,12.9,86.8,53.6,71.1,7.3,4.4,19.0,100.0,115.5,112.3,,,9.0,20.0,1.000,5.0,12.0,1.000,8.0,9.0,1.0,1.0,10.0,11.0,8.0,2.0,1.0,5.0,3.0,25.0,16.2,15.0,1.250,1.250,1.000,1.400,12.9,44.1,26.7,30.6,3.9,8.1,41.0,37.3,160.0,121.0,POR,110,0,2023,2022-11-07,False
2,240.0,240.0,41.0,78.0,0.526,8.0,24.0,0.333,15.0,19.0,0.789,5.0,38.0,43.0,26.0,5.0,2.0,9.0,20.0,105.0,,,0.608,0.577,0.308,0.244,15.6,86.4,56.6,63.4,5.6,5.6,9.4,100.0,117.2,100.4,,,13.0,20.0,0.857,6.0,9.0,0.667,4.0,6.0,1.0,2.0,11.0,11.0,6.0,2.0,1.0,3.0,5.0,34.0,25.3,22.0,0.888,0.857,0.500,0.600,14.7,54.4,35.5,24.1,3.2,100.0,12.6,33.0,183.0,110.0,CLE,105,0,1,240.0,240.0,29.0,74.0,0.392,13.0,38.0,0.342,19.0,26.0,0.731,6.0,27.0,33.0,17.0,2.0,3.0,13.0,15.0,90.0,,,0.527,0.480,0.514,0.351,13.6,84.4,43.4,58.6,2.2,5.6,13.2,100.0,100.4,117.2,,,9.0,23.0,0.667,4.0,8.0,0.600,10.0,11.0,1.0,2.0,6.0,7.0,7.0,1.0,1.0,5.0,4.0,30.0,18.7,10.0,1.136,0.917,1.000,1.500,42.5,58.4,24.6,64.5,1.8,3.5,28.6,41.1,250.0,125.0,DAL,90,1,2023,2022-12-14,True
3,240.0,240.0,29.0,74.0,0.392,13.0,38.0,0.342,19.0,26.0,0.731,6.0,27.0,33.0,17.0,2.0,3.0,13.0,15.0,90.0,,,0.527,0.480,0.514,0.351,13.6,84.4,43.4,58.6,2.2,5.6,13.2,100.0,100.4,117.2,,,9.0,23.0,0.667,4.0,8.0,0.600,10.0,11.0,1.0,2.0,6.0,7.0,7.0,1.0,1.0,5.0,4.0,30.0,18.7,10.0,1.136,0.917,1.000,1.500,42.5,58.4,24.6,64.5,1.8,3.5,28.6,41.1,250.0,125.0,DAL,90,1,0,240.0,240.0,41.0,78.0,0.526,8.0,24.0,0.333,15.0,19.0,0.789,5.0,38.0,43.0,26.0,5.0,2.0,9.0,20.0,105.0,,,0.608,0.577,0.308,0.244,15.6,86.4,56.6,63.4,5.6,5.6,9.4,100.0,117.2,100.4,,,13.0,20.0,0.857,6.0,9.0,0.667,4.0,6.0,1.0,2.0,11.0,11.0,6.0,2.0,1.0,3.0,5.0,34.0,25.3,22.0,0.888,0.857,0.500,0.600,14.7,54.4,35.5,24.1,3.2,100.0,12.6,33.0,183.0,110.0,CLE,105,0,2023,2022-12-14,False
4,240.0,240.0,39.0,81.0,0.481,6.0,20.0,0.300,14.0,18.0,0.778,6.0,41.0,47.0,26.0,7.0,7.0,15.0,19.0,98.0,,,0.551,0.519,0.247,0.222,16.2,67.2,48.0,66.7,7.4,10.1,14.4,100.0,103.1,100.0,,,7.0,13.0,0.667,2.0,6.0,1.000,6.0,9.0,1.0,2.0,9.0,11.0,6.0,3.0,2.0,4.0,4.0,20.0,21.6,9.0,0.667,0.667,1.000,0.692,23.0,27.6,21.7,26.5,4.3,6.2,50.0,32.6,152.0,111.0,ATL,98,0,1,240.0,240.0,36.0,100.0,0.360,7.0,31.0,0.226,16.0,19.0,0.842,20.0,31.0,51.0,18.0,7.0,3.0,9.0,15.0,95.0,,,0.438,0.395,0.310,0.190,32.8,83.8,52.0,50.0,7.4,4.9,7.7,100.0,100.0,103.1,,,8.0,20.0,1.000,2.0,8.0,0.500,6.0,7.0,1.0,7.0,10.0,17.0,6.0,3.0,2.0,2.0,4.0,18.0,15.9,11.0,0.862,1.000,0.500,3.000,14.6,34.3,22.0,31.0,4.9,4.4,22.8,29.0,178.0,111.0,DAL,95,1,2016,2015-12-09,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23043,240.0,240.0,43.0,93.0,0.462,9.0,30.0,0.300,11.0,15.0,0.733,10.0,40.0,50.0,30.0,4.0,2.0,12.0,23.0,106.0,,,0.532,0.511,0.323,0.161,22.2,87.0,54.9,69.8,4.1,4.1,10.8,100.0,108.2,112.3,,,17.0,27.0,0.630,2.0,9.0,1.000,6.0,9.0,1.0,3.0,10.0,12.0,7.0,2.0,1.0,3.0,5.0,40.0,32.2,10.0,0.714,0.714,1.000,0.333,15.8,41.8,26.1,37.6,2.8,3.2,20.2,40.1,143.0,118.0,NOP,106,1,0,240.0,240.0,35.0,84.0,0.417,14.0,35.0,0.400,26.0,29.0,0.897,6.0,35.0,41.0,24.0,8.0,7.0,8.0,12.0,110.0,,,0.568,0.500,0.417,0.345,13.0,77.8,45.1,68.6,8.2,11.1,7.6,100.0,112.3,108.2,,,7.0,20.0,1.000,5.0,11.0,1.000,10.0,10.0,1.0,6.0,9.0,15.0,9.0,3.0,3.0,3.0,4.0,23.0,22.8,16.0,1.000,1.000,0.857,0.625,15.6,41.3,20.4,37.8,3.6,5.7,13.6,29.6,200.0,114.0,LAL,110,0,2024,2022-04-01 00:00:00,False
23044,240.0,240.0,33.0,80.0,0.413,10.0,32.0,0.313,18.0,23.0,0.783,8.0,34.0,42.0,19.0,5.0,3.0,16.0,17.0,94.0,,,0.522,0.475,0.400,0.288,19.0,69.4,46.2,57.6,5.3,5.1,15.1,100.0,99.2,124.5,,,8.0,16.0,1.000,3.0,7.0,0.500,4.0,6.0,1.0,2.0,6.0,8.0,6.0,2.0,1.0,6.0,4.0,22.0,13.7,3.0,1.000,1.000,0.667,1.000,51.6,45.9,30.9,65.6,2.9,3.2,50.0,58.8,156.0,135.0,GSW,94,0,1,240.0,240.0,43.0,98.0,0.439,18.0,39.0,0.462,14.0,15.0,0.933,15.0,34.0,49.0,28.0,10.0,5.0,8.0,17.0,118.0,,,0.564,0.531,0.398,0.153,30.6,81.0,53.8,65.1,10.5,10.4,7.1,100.0,124.5,99.2,,,11.0,25.0,1.000,8.0,13.0,0.750,4.0,4.0,1.0,3.0,11.0,12.0,7.0,3.0,3.0,3.0,5.0,32.0,24.9,29.0,1.000,1.000,1.000,0.364,22.0,51.6,23.8,50.4,3.9,45.1,25.3,29.2,200.0,110.0,SAC,118,1,2024,2022-04-01 00:00:00,False
23045,240.0,240.0,43.0,98.0,0.439,18.0,39.0,0.462,14.0,15.0,0.933,15.0,34.0,49.0,28.0,10.0,5.0,8.0,17.0,118.0,,,0.564,0.531,0.398,0.153,30.6,81.0,53.8,65.1,10.5,10.4,7.1,100.0,124.5,99.2,,,11.0,25.0,1.000,8.0,13.0,0.750,4.0,4.0,1.0,3.0,11.0,12.0,7.0,3.0,3.0,3.0,5.0,32.0,24.9,29.0,1.000,1.000,1.000,0.364,22.0,51.6,23.8,50.4,3.9,45.1,25.3,29.2,200.0,110.0,SAC,118,1,0,240.0,240.0,33.0,80.0,0.413,10.0,32.0,0.313,18.0,23.0,0.783,8.0,34.0,42.0,19.0,5.0,3.0,16.0,17.0,94.0,,,0.522,0.475,0.400,0.288,19.0,69.4,46.2,57.6,5.3,5.1,15.1,100.0,99.2,124.5,,,8.0,16.0,1.000,3.0,7.0,0.500,4.0,6.0,1.0,2.0,6.0,8.0,6.0,2.0,1.0,6.0,4.0,22.0,13.7,3.0,1.000,1.000,0.667,1.000,51.6,45.9,30.9,65.6,2.9,3.2,50.0,58.8,156.0,135.0,GSW,94,0,2024,2022-04-01 00:00:00,True
23046,240.0,240.0,33.0,80.0,0.413,10.0,32.0,0.313,18.0,23.0,0.783,8.0,34.0,42.0,19.0,5.0,3.0,16.0,17.0,94.0,,,0.522,0.475,0.400,0.288,19.0,69.4,46.2,57.6,5.3,5.1,15.1,100.0,99.2,124.5,,,8.0,16.0,1.000,3.0,7.0,0.500,4.0,6.0,1.0,2.0,6.0,8.0,6.0,2.0,1.0,6.0,4.0,22.0,13.7,3.0,1.000,1.000,0.667,1.000,51.6,45.9,30.9,65.6,2.9,3.2,50.0,58.8,156.0,135.0,GSW,94,0,1,240.0,240.0,43.0,98.0,0.439,18.0,39.0,0.462,14.0,15.0,0.933,15.0,34.0,49.0,28.0,10.0,5.0,8.0,17.0,118.0,,,0.564,0.531,0.398,0.153,30.6,81.0,53.8,65.1,10.5,10.4,7.1,100.0,124.5,99.2,,,11.0,25.0,1.000,8.0,13.0,0.750,4.0,4.0,1.0,3.0,11.0,12.0,7.0,3.0,3.0,3.0,5.0,32.0,24.9,29.0,1.000,1.000,1.000,0.364,22.0,51.6,23.8,50.4,3.9,45.1,25.3,29.2,200.0,110.0,SAC,118,1,2024,2022-04-01 00:00:00,False
