In [None]:
import os

# Season identifiers 2016 through 2022 inclusive; each one is substituted into
# the NBA_{year}_games.html schedule URL.
YEARS = [*range(2016, 2023)]

# Root directory under which everything scraped by this notebook is stored.
BASKETBALL_DATA_FOLDER = "basketball_data"
# Box score pages (one file per game) are written here.
GAME_RESULTS_FOLDER = os.path.join(BASKETBALL_DATA_FOLDER, "game_results")
# Despite the "rankings" name, this folder receives the schedule pages.
LEAGUE_RANKINGS_FOLDER = os.path.join(BASKETBALL_DATA_FOLDER, "league_rankings")

In [None]:
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
import time
# Playwright needs its browser binaries installed first: run `playwright install` from a shell, or `!playwright install` from a Jupyter cell.

In [None]:
async def fetch_page_content(web_address, element_selector, delay=5, attempts=3):
    """Load a page in Chromium and return the inner HTML of one element.

    Args:
        web_address: URL to navigate to.
        element_selector: Selector of the element whose inner HTML is wanted.
        delay: Base pause in seconds; attempt ``k`` (1-based) waits
            ``delay * k`` seconds before it starts, including the first one.
        attempts: Maximum number of tries.

    Returns:
        The element's inner HTML as a string, or ``None`` if every attempt
        ended in a Playwright timeout (or ``attempts`` is less than 1).
    """
    # Imported locally so this cell works without touching the imports cell.
    import asyncio

    content = None
    for attempt in range(1, attempts + 1):
        # Awaited sleep instead of time.sleep: a blocking sleep inside a
        # coroutine would stall the event loop for the whole back-off.
        await asyncio.sleep(delay * attempt)
        try:
            async with async_playwright() as playwright_instance:
                browser = await playwright_instance.chromium.launch()
                try:
                    page = await browser.new_page()
                    await page.goto(web_address)
                    print(await page.title())
                    content = await page.inner_html(element_selector)
                finally:
                    # Close the browser explicitly, whether or not the attempt succeeded.
                    await browser.close()
        except PlaywrightTimeout:
            print(f"Timeout occurred while accessing {web_address}")
            continue
        else:
            break
    return content

In [None]:
async def extract_season_data(year):
    """Download the schedule pages linked from one season's games page.

    Fetches ``NBA_{year}_games.html``, collects every link inside its
    ``#content .filter`` element, and saves the ``#all_schedule`` HTML of each
    linked page into ``LEAGUE_RANKINGS_FOLDER`` under the link's final path
    segment. Pages whose file already exists are not fetched again.

    Args:
        year: Season identifier interpolated into the games-page URL.

    Returns:
        None. Results are written to disk.
    """
    index_url = f"https://www.basketball-reference.com/leagues/NBA_{year}_games.html"
    page_content = await fetch_page_content(index_url, "#content .filter")
    if not page_content:
        # fetch_page_content returns None when all attempts time out; there is
        # nothing to parse, so skip this season instead of raising.
        print(f"No schedule index retrieved for {year}; skipping")
        return

    parser = BeautifulSoup(page_content, 'html.parser')
    # Anchors without an href carry no page to download, so they are dropped.
    hrefs = [tag.get('href') for tag in parser.find_all("a")]
    schedule_pages = [f"https://www.basketball-reference.com{href}" for href in hrefs if href]

    # Create the output folder on first use so open() below cannot fail on it.
    os.makedirs(LEAGUE_RANKINGS_FOLDER, exist_ok=True)

    for schedule_url in schedule_pages:
        file_path = os.path.join(LEAGUE_RANKINGS_FOLDER, schedule_url.split("/")[-1])
        if os.path.exists(file_path):
            continue

        schedule_content = await fetch_page_content(schedule_url, "#all_schedule")
        if not schedule_content:
            # Leave no file behind so a later run retries this page.
            continue
        with open(file_path, "w+") as file:
            file.write(schedule_content)

In [None]:
# Fetch the schedule pages for each entry of YEARS, one season at a time.
# The top-level `await` relies on the notebook kernel running cells inside an
# event loop; each season finishes before the next one starts.
for year in YEARS:
    await extract_season_data(year)

2015-16 NBA Schedule | Basketball-Reference.com
2016-17 NBA Schedule | Basketball-Reference.com
2017-18 NBA Schedule | Basketball-Reference.com
2018-19 NBA Schedule | Basketball-Reference.com
2019-20 NBA Schedule | Basketball-Reference.com
2019-20 NBA Schedule | Basketball-Reference.com
2020-21 NBA Schedule | Basketball-Reference.com
2020-21 NBA Schedule | Basketball-Reference.com
2020-21 NBA Schedule | Basketball-Reference.com
2020-21 NBA Schedule | Basketball-Reference.com
2020-21 NBA Schedule | Basketball-Reference.com
2020-21 NBA Schedule | Basketball-Reference.com
2020-21 NBA Schedule | Basketball-Reference.com
2020-21 NBA Schedule | Basketball-Reference.com
2020-21 NBA Schedule | Basketball-Reference.com
2021-22 NBA Schedule | Basketball-Reference.com
2021-22 NBA Schedule | Basketball-Reference.com
2021-22 NBA Schedule | Basketball-Reference.com
2021-22 NBA Schedule | Basketball-Reference.com
2021-22 NBA Schedule | Basketball-Reference.com
2021-22 NBA Schedule | Basketball-Refere

In [None]:
# Names (not full paths) of the entries currently in the schedule folder.
# This is a one-time snapshot: re-run the cell to pick up files added later.
ranking_files = os.listdir(LEAGUE_RANKINGS_FOLDER)

In [None]:
async def extract_game_data(ranking_file):
    """Download the box score page for every game linked from a saved schedule.

    Reads the HTML stored at ``ranking_file``, keeps the links whose href
    contains both ``"boxscore"`` and ``".html"``, and saves the ``#content``
    HTML of each into ``GAME_RESULTS_FOLDER`` under the link's final path
    segment. Games whose file already exists are not fetched again, and games
    whose fetch returns nothing are skipped.

    Args:
        ranking_file: Path to a previously saved schedule HTML file.

    Returns:
        None. Results are written to disk.
    """
    with open(ranking_file, 'r') as file:
        page_content = file.read()

    parser = BeautifulSoup(page_content, 'html.parser')
    hrefs = [tag.get('href') for tag in parser.find_all("a")]
    game_links = [
        f"https://www.basketball-reference.com{href}"
        for href in hrefs
        if href and "boxscore" in href and '.html' in href
    ]

    # Create the output folder on first use; without it the first open() in
    # write mode raises FileNotFoundError.
    os.makedirs(GAME_RESULTS_FOLDER, exist_ok=True)

    for game_url in game_links:
        file_path = os.path.join(GAME_RESULTS_FOLDER, game_url.split("/")[-1])
        if os.path.exists(file_path):
            continue

        game_content = await fetch_page_content(game_url, "#content")
        if not game_content:
            # Leave no file behind so a later run retries this game.
            continue
        with open(file_path, "w+") as file:
            file.write(game_content)

In [None]:
import pandas as pd

# For each season, pick the saved schedule files whose name contains the year
# and download the box scores they link to, one file at a time.
# NOTE(review): the match is a plain substring test on the file name, so any
# entry in the folder whose name happens to contain the year digits is included.
for year in YEARS:
    files = [file for file in ranking_files if str(year) in file]
    
    for file_path in files:
        # ranking_files holds bare names, so rebuild the path before opening.
        full_path = os.path.join(LEAGUE_RANKINGS_FOLDER, file_path)
        
        await extract_game_data(full_path)