In [1]:
import os
import pandas as pd
from io import StringIO

# Directory that holds the saved box-score HTML pages read by this notebook.
SCORE_DIR = "data/scores"

In [2]:
# Paths of every saved box-score page in SCORE_DIR.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the processing order (and the row order of the final table) the same on
# every run and every platform.
box_scores = [
    os.path.join(SCORE_DIR, file_name)
    for file_name in sorted(os.listdir(SCORE_DIR))
    if file_name.endswith(".html")
]

In [3]:
from bs4 import BeautifulSoup

def parse_html(box_score):
    """Load one saved box-score page and return it as a BeautifulSoup tree.

    Parameters
    ----------
    box_score : path of the HTML file; it is read as UTF-8 text.

    Returns
    -------
    The parsed document with every ``tr.over_header`` and ``tr.thead`` row
    removed.
    """
    with open(box_score, encoding="utf-8") as page:
        markup = page.read()

    # No parser is named here, so BeautifulSoup picks one itself.
    soup = BeautifulSoup(markup)

    # Strip the extra header rows in place
    # (presumably so they do not pollute the tables read later - confirm).
    for selector in ("tr.over_header", "tr.thead"):
        for row in soup.select(selector):
            row.decompose()
    return soup

In [4]:
def read_season_info(soup):
    """Extract the season label from the page's bottom navigation links.

    Collects the ``href`` of every ``<a>`` inside the first element matching
    ``#bottom_nav_container``, takes the second link, and returns the part of
    its file name that precedes the first underscore.

    Raises IndexError if the container is missing or holds fewer than two
    links, and KeyError if a link has no ``href``.
    """
    bottom_nav = soup.select("#bottom_nav_container")[0]

    links = []
    for anchor in bottom_nav.find_all("a"):
        links.append(anchor["href"])

    # Only the second link is used; its file name starts with the season label.
    page_name = os.path.basename(links[1])
    season, _, _ = page_name.partition("_")
    return season

In [5]:
def read_line_score(soup):
    """Return each team's name and final score from the ``line_score`` table.

    The document is serialised with ``str(soup)`` and the table whose ``id`` is
    ``line_score`` is read with pandas. Its first column is renamed ``team``
    and its last column ``total``; only those two columns are returned, one
    row per table row.
    """
    markup = StringIO(str(soup))
    table = pd.read_html(markup, attrs={"id": "line_score"})[0]

    # Rename by position: first column is the team, last column is the total.
    names = list(table.columns)
    names[0], names[-1] = "team", "total"
    table.columns = names

    return table[["team", "total"]]

In [6]:
def read_stats(soup, team, stat):
    """Read one team's stats table from the page as a numeric DataFrame.

    Parameters
    ----------
    soup : parsed page; it is serialised with ``str(soup)``.
    team : team label used in the table id.
    stat : table kind used in the table id (the notebook passes ``"basic"``
        and ``"advanced"``).

    Returns
    -------
    The table whose ``id`` is ``box-{team}-game-{stat}``, indexed by its first
    column, with every value converted to a number (unconvertible values become
    NaN). Returns ``None`` after printing a diagnostic when reading the table
    raises ``ValueError``.
    """
    page = str(soup)
    try:
        table_id = f'box-{team}-game-{stat}'
        tables = pd.read_html(StringIO(page), attrs={'id': table_id}, index_col=0)
        stats = tables[0].apply(pd.to_numeric, errors="coerce")
    except ValueError as e:
        print(f"Error reading stats for team {team}, stat {stat}: {e}")
        # Show the start of the document to help diagnose the failure.
        print(f"HTML content: {page[:500]}")
        return None
    return stats

In [7]:
# Build one two-row frame per box-score file (one row per team, each row also
# carrying the other team's figures in "*_opp" columns) and collect them in
# `games`. A file that cannot be processed is reported and left out.
games = []
# Stat labels kept for every team. They are fixed by the first team that parses
# successfully so that all later rows share the same layout.
base_cols = None
for i, box_score in enumerate(box_scores):
    try:
        print(f"Processing file {i + 1}/{len(box_scores)}: {box_score}")
        soup = parse_html(box_score)
        line_score = read_line_score(soup)
        teams = list(line_score["team"])

        summaries = []
        for team in teams:
            basic = read_stats(soup, team, "basic")
            advanced = read_stats(soup, team, "advanced")

            if basic is None or advanced is None:
                print(f"Skipping file {box_score} due to missing data for team {team}")
                # Abandon the whole game. Carrying on with only the other team
                # would leave a single summary that is re-indexed to row 0 and
                # could be paired with the wrong team's line score.
                summaries = None
                break

            # The last row of each table is taken as the team's totals.
            totals = pd.concat([basic.iloc[-1, :], advanced.iloc[-1, :]])
            totals.index = totals.index.str.lower()

            # Per-column maximum over all rows except the last one.
            maxes = pd.concat([basic.iloc[:-1].max(), advanced.iloc[:-1].max()])
            maxes.index = maxes.index.str.lower() + "_max"

            summary = pd.concat([totals, maxes])

            if base_cols is None:
                # Labels that occur in both tables are listed once; any label
                # containing "bpm" is left out.
                base_cols = list(summary.index.drop_duplicates(keep="first"))
                base_cols = [b for b in base_cols if "bpm" not in b]

            summary = summary[base_cols]

            summaries.append(summary)

        if summaries is None:
            continue

        # One row per team, in the same order as the line score.
        team_stats = pd.concat(summaries, axis=1).T

        game = pd.concat([team_stats, line_score], axis=1)

        # Assumes the line score lists the visiting team first and the home
        # team second - TODO confirm.
        game["home"] = [0, 1]

        # Reverse the two rows so each team lines up with its opponent.
        # reset_index() keeps the old index as an "index" column, which ends up
        # as "index_opp"; this is left as is so the output columns do not change.
        game_opp = game.iloc[::-1].reset_index()
        game_opp.columns += "_opp"

        full_game = pd.concat([game, game_opp], axis=1)
        full_game["season"] = read_season_info(soup)

        # The first eight characters of the file name are parsed as YYYYMMDD.
        full_game["date"] = os.path.basename(box_score)[:8]
        full_game["date"] = pd.to_datetime(full_game["date"], format="%Y%m%d")

        full_game["won"] = full_game["total"] > full_game["total_opp"]
        games.append(full_game)

        if len(games) % 100 == 0:
            print(f"{len(games)} / {len(box_scores)}")

    except Exception as e:
        # Best effort: report the failing file and carry on with the rest.
        print(f"Error processing {box_score}: {e}")

Processing file 1/11519: data/scores\201510270ATL.html
Processing file 2/11519: data/scores\201510270CHI.html
Processing file 3/11519: data/scores\201510270GSW.html
Processing file 4/11519: data/scores\201510280BOS.html
Processing file 5/11519: data/scores\201510280BRK.html
Processing file 6/11519: data/scores\201510280DET.html
Processing file 7/11519: data/scores\201510280HOU.html
Processing file 8/11519: data/scores\201510280LAL.html
Processing file 9/11519: data/scores\201510280MEM.html
Processing file 10/11519: data/scores\201510280MIA.html
Processing file 11/11519: data/scores\201510280MIL.html
Processing file 12/11519: data/scores\201510280OKC.html
Processing file 13/11519: data/scores\201510280ORL.html
Processing file 14/11519: data/scores\201510280PHO.html
Processing file 15/11519: data/scores\201510280POR.html
Processing file 16/11519: data/scores\201510280SAC.html
Processing file 17/11519: data/scores\201510280TOR.html
Processing file 18/11519: data/scores\201510290IND.html
P

In [8]:
# Stack the per-game frames into a single table with a fresh 0..n-1 row index.
games_df = pd.concat(games, ignore_index=True)

In [9]:
# Show the combined table; each processed game contributes two rows, one per team.
games_df

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,240.0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,...,57.1,33.8,258.0,121.0,ATL,94,1,2016,2015-10-27,True
1,240.0,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,...,33.3,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False
2,240.0,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,...,53.2,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False
3,240.0,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,...,30.4,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True
4,240.0,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,...,69.4,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23033,240.0,240.0,38.0,86.0,0.442,9.0,25.0,0.360,14.0,16.0,...,50.0,35.7,205.0,115.0,BOS,106,0,2024,2024-06-12,False
23034,240.0,240.0,29.0,80.0,0.363,14.0,41.0,0.341,12.0,13.0,...,100.0,40.8,212.0,99.0,DAL,122,1,2024,2024-06-14,False
23035,240.0,240.0,46.0,91.0,0.505,15.0,37.0,0.405,15.0,22.0,...,50.0,35.9,200.0,138.0,BOS,84,0,2024,2024-06-14,True
23036,240.0,240.0,35.0,78.0,0.449,11.0,37.0,0.297,7.0,13.0,...,15.5,32.3,300.0,110.0,BOS,106,1,2024,2024-06-17,False


In [10]:
# Save the combined table; by default to_csv also writes the row index as the
# first, unnamed column.
games_df.to_csv("nba_games.csv")