In [39]:
# import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [40]:
# Read the saved HTML page from disk and build a BeautifulSoup tree from it

# Relative path of the saved HTML file
html_path = "../data/raw/202503_bbref.html"

# Read the whole page as UTF-8 text; the handle is closed when the block exits
with open(html_path, mode="r", encoding="utf-8") as html_file:
    html_content = html_file.read()

# Parse the markup with the "html.parser" backend
soup = BeautifulSoup(markup=html_content, features="html.parser")

In [41]:
# Locate the schedule table in the soup and read its column headers.
# Each lookup is checked explicitly so that a missing element fails with a
# message naming what was not found, instead of an AttributeError on None.

# Find the table container div
table_container = soup.find("div", class_="table_container", id="div_schedule")
if table_container is None:
    raise ValueError(
        'No <div class="table_container" id="div_schedule"> found in the parsed HTML'
    )

# Find the actual table inside this div
table = table_container.find("table")
if table is None:
    raise ValueError('No <table> found inside the "div_schedule" container')

# Extract column headers from the first row of the table head
thead = table.find("thead")
header_row = thead.find("tr") if thead is not None else None
if header_row is None:
    raise ValueError("The schedule table has no <thead> header row")
headers = [th.text.strip() for th in header_row.find_all("th")]

# print the headers
print("Headers:", headers)

Headers: ['Date', 'Start (ET)', 'Visitor/Neutral', 'PTS', 'Home/Neutral', 'PTS', '', '', 'Attend.', 'LOG', 'Arena', 'Notes']


In [42]:
# Extract table rows

def _cell_text(cell):
    """Return a cell's stripped text, using its first <a> tag's text when it has one."""
    link = cell.find("a")
    source = link if link is not None else cell
    return source.text.strip()


rows = []

tbody = table.find("tbody")
for tr in tbody.find_all("tr"):
    data_cells = tr.find_all("td")
    # A row without any <td> carries no data cells (e.g. a header/separator
    # row inside the body). Keeping it would add a ragged 0- or 1-element row
    # that breaks the DataFrame built from `rows`, so skip it.
    if not data_cells:
        continue

    # The first column lives in a <th>; the remaining columns are <td> cells
    row_header = tr.find("th")
    cells = [_cell_text(row_header)] if row_header is not None else []
    cells.extend(_cell_text(td) for td in data_cells)

    rows.append(cells)

# Print extracted table data
for row in rows[0:10]:
    print(row)

['Sat, Mar 1, 2025', '6:00p', 'Washington Wizards', '113', 'Charlotte Hornets', '100', 'Box Score', '', '17,904', '1:59', 'Spectrum Center', '']
['Sat, Mar 1, 2025', '7:00p', 'Brooklyn Nets', '94', 'Detroit Pistons', '115', 'Box Score', '', '20,062', '2:23', 'Little Caesars Arena', '']
['Sat, Mar 1, 2025', '8:00p', 'Sacramento Kings', '113', 'Houston Rockets', '103', 'Box Score', '', '18,055', '2:10', 'Toyota Center', '']
['Sat, Mar 1, 2025', '8:00p', 'San Antonio Spurs', '130', 'Memphis Grizzlies', '128', 'Box Score', '', '16,822', '2:18', 'FedEx Forum', '']
['Sat, Mar 1, 2025', '8:30p', 'Milwaukee Bucks', '132', 'Dallas Mavericks', '117', 'Box Score', '', '20,272', '2:16', 'American Airlines Center', '']
['Sat, Mar 1, 2025', '8:30p', 'Golden State Warriors', '119', 'Philadelphia 76ers', '126', 'Box Score', '', '20,159', '2:30', 'Wells Fargo Center', '']
['Sun, Mar 2, 2025', '1:00p', 'Denver Nuggets', '103', 'Boston Celtics', '110', 'Box Score', '', '19,156', '2:18', 'TD Garden', '']


In [43]:
# Create a DataFrame from the headers and rows from above
schedule_raw = pd.DataFrame(rows, columns=headers)

# drop empty / nameless columns
schedule_raw = schedule_raw.drop(columns=[col for col in schedule_raw.columns if col == ''])
# rename columns (positionally, because the scraped headers repeat 'PTS')
schedule_raw.columns = ['Date', 'Start Time', 'Away', 'Away Points', 'Home', 'Home Points', 'Attendance', 'Length', 'Arena', 'Notes']

# Expand the trailing "p"/"a" on the start time to "PM"/"AM" so it can be
# parsed with %p (e.g. "6:00p" -> "6:00PM"); both patterns are plain literals.
start_time_text = (
    schedule_raw["Start Time"]
    .str.replace("p", "PM", regex=False)
    .str.replace("a", "AM", regex=False)
)
# Parse date and start time together in one vectorised call instead of
# combining them row by row; the result is a full datetime per game.
start_datetime = pd.to_datetime(
    schedule_raw["Date"] + " " + start_time_text,
    format="%a, %b %d, %Y %I:%M%p",
)

# Replace the separate 'Date' / 'Start Time' columns with the combined
# 'Start Datetime', placed as the first column.
df = schedule_raw.drop(columns=['Date', 'Start Time'])
df.insert(0, 'Start Datetime', start_datetime)

# look at it!
df.head()


Unnamed: 0,Start Datetime,Away,Away Points,Home,Home Points,Attendance,Length,Arena,Notes
0,2025-03-01 18:00:00,Washington Wizards,113,Charlotte Hornets,100,17904,1:59,Spectrum Center,
1,2025-03-01 19:00:00,Brooklyn Nets,94,Detroit Pistons,115,20062,2:23,Little Caesars Arena,
2,2025-03-01 20:00:00,Sacramento Kings,113,Houston Rockets,103,18055,2:10,Toyota Center,
3,2025-03-01 20:00:00,San Antonio Spurs,130,Memphis Grizzlies,128,16822,2:18,FedEx Forum,
4,2025-03-01 20:30:00,Milwaukee Bucks,132,Dallas Mavericks,117,20272,2:16,American Airlines Center,
