In [92]:
# import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [93]:
# Read the saved schedule page from disk and parse it into a BeautifulSoup
# object using Python's built-in "html.parser" backend.
html_path = "../data/raw/basketball_reference_202503.html"

with open(html_path, mode="r", encoding="utf-8") as file:
    html_content = file.read()

soup = BeautifulSoup(html_content, features="html.parser")

In [94]:
# Locate the schedule table. Fail early with a clear message if the page
# layout is not what we expect, instead of an opaque AttributeError on None.
table_container = soup.find("div", class_="table_container", id="div_schedule")
table = table_container.find("table") if table_container else None
if table is None:
    raise ValueError(
        'Schedule table not found: expected <div class="table_container" '
        'id="div_schedule"> containing a <table>.'
    )

thead = table.find("thead")
header_row = thead.find("tr") if thead else None
tbody = table.find("tbody")
if header_row is None or tbody is None:
    raise ValueError("Schedule table is missing its <thead> row or its <tbody>.")

# Extract column headers (from the first row of <thead>)
headers = [th.text.strip() for th in header_row.find_all("th")]

BOX_SCORE_BASE_URL = "https://www.basketball-reference.com"


def extract_cell_value(cell):
    """Return the value to store for a single table cell (<th> or <td>).

    - For the box-score cell (``data-stat="box_score_text"``) that contains a
      link with an ``href``, return the absolute box-score URL.
    - Otherwise return the stripped text of the first ``<a>`` inside the cell,
      or the stripped text of the cell itself when it contains no link.
    """
    a_tag = cell.find("a")
    # .get() instead of [] so a cell without a data-stat attribute does not
    # raise KeyError.
    if cell.get("data-stat") == "box_score_text" and a_tag and a_tag.get("href"):
        return BOX_SCORE_BASE_URL + a_tag["href"]
    return a_tag.text.strip() if a_tag else cell.text.strip()


# Extract all rows from <tbody>
rows = []
for tr in tbody.find_all("tr"):
    # Skip repeated header / separator rows embedded in the body; they carry
    # class "thead" and hold no game data.
    if "thead" in (tr.get("class") or []):
        continue

    th = tr.find("th")  # first column (Date) lives in a <th>
    tds = tr.find_all("td")  # remaining columns
    if th is None and not tds:
        continue  # nothing to extract from this row

    # Keep column alignment even if the date cell is missing.
    row_data = [extract_cell_value(th) if th is not None else ""]
    row_data.extend(extract_cell_value(td) for td in tds)

    # Ensure row has at least as many columns as headers (pad with "").
    row_data.extend([""] * (len(headers) - len(row_data)))

    rows.append(row_data)

# Create DataFrame
df = pd.DataFrame(rows, columns=headers)

In [95]:
# NOTE: this cell modifies `df` in place. After it has run once, "StartTime"
# no longer holds strings, so re-run the extraction cell above before
# re-running this one.

# Give the scraped columns explicit, unambiguous names (positional rename).
df.columns = [
    'Date', 'StartTime', 'Away', 'AwayPoints', 'Home', 'HomePoints',
    'BoxScoreLink', 'Overtime', 'Attendance', 'LengthOfGame', 'Arena', 'Notes',
]

# Date column to datetime, e.g. "Sat, Mar 1, 2025"
df["Date"] = pd.to_datetime(df["Date"], format="%a, %b %d, %Y")

# "StartTime" is scraped with a one-letter meridiem suffix ("6:00p").
# Expand it to "PM"/"AM" so it matches %p, then keep only the time of day.
# regex=False makes the literal replacement explicit; the default value of
# `regex` differs between pandas versions.
start_time_text = (
    df["StartTime"]
    .str.replace("p", "PM", regex=False)
    .str.replace("a", "AM", regex=False)
)
df["StartTime"] = pd.to_datetime(start_time_text, format="%I:%M%p").dt.time

# look at it!
df.head(10)

Unnamed: 0,Date,StartTime,Away,AwayPoints,Home,HomePoints,BoxScoreLink,Overtime,Attendance,LengthOfGame,Arena,Notes
0,2025-03-01,18:00:00,Washington Wizards,113,Charlotte Hornets,100,https://www.basketball-reference.com/boxscores...,,17904,1:59,Spectrum Center,
1,2025-03-01,19:00:00,Brooklyn Nets,94,Detroit Pistons,115,https://www.basketball-reference.com/boxscores...,,20062,2:23,Little Caesars Arena,
2,2025-03-01,20:00:00,Sacramento Kings,113,Houston Rockets,103,https://www.basketball-reference.com/boxscores...,,18055,2:10,Toyota Center,
3,2025-03-01,20:00:00,San Antonio Spurs,130,Memphis Grizzlies,128,https://www.basketball-reference.com/boxscores...,,16822,2:18,FedEx Forum,
4,2025-03-01,20:30:00,Milwaukee Bucks,132,Dallas Mavericks,117,https://www.basketball-reference.com/boxscores...,,20272,2:16,American Airlines Center,
5,2025-03-01,20:30:00,Golden State Warriors,119,Philadelphia 76ers,126,https://www.basketball-reference.com/boxscores...,,20159,2:30,Wells Fargo Center,
6,2025-03-02,13:00:00,Denver Nuggets,103,Boston Celtics,110,https://www.basketball-reference.com/boxscores...,,19156,2:18,TD Garden,
7,2025-03-02,15:30:00,Portland Trail Blazers,129,Cleveland Cavaliers,133,https://www.basketball-reference.com/boxscores...,OT,19432,2:51,Rocket Arena,
8,2025-03-02,17:00:00,Chicago Bulls,112,Indiana Pacers,127,https://www.basketball-reference.com/boxscores...,,17028,2:05,Gainbridge Fieldhouse,
9,2025-03-02,18:00:00,New York Knicks,116,Miami Heat,112,https://www.basketball-reference.com/boxscores...,OT,19725,2:32,Kaseya Center,
