In [1]:
import os
import asyncio
import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
import time

In [2]:
os.chdir("../")
os.getcwd()

'/Users/tunahankilic/Desktop/NBAGamePrediction'

In [4]:
SEASONS = list(range(2019, 2024))

In [3]:
DATA_DIR = 'data'
SCHEDULE_DIR = os.path.join(DATA_DIR, 'gameschedules')
BOXSCORES_DIR = os.path.join(DATA_DIR, 'boxscores')

In [10]:
!playwright install

Downloading Chromium 115.0.5790.75 (playwright build v1071)[2m from https://playwright.azureedge.net/builds/chromium/1071/chromium-mac-arm64.zip[22m
Chromium 115.0.5790.75 (playwright build v1071) downloaded to /Users/tunahankilic/Library/Caches/ms-playwright/chromium-1071
Downloading FFMPEG playwright build v1009[2m from https://playwright.azureedge.net/builds/ffmpeg/1009/ffmpeg-mac-arm64.zip[22m
FFMPEG playwright build v1009 downloaded to /Users/tunahankilic/Library/Caches/ms-playwright/ffmpeg-1009
Downloading Firefox 115.0 (playwright build v1419)[2m from https://playwright.azureedge.net/builds/firefox/1419/firefox-mac-13-arm64.zip[22m
Firefox 115.0 (playwright build v1419) downloaded to /Users/tunahankilic/Library/Caches/ms-playwright/firefox-1419
Downloading Webkit 17.0 (playwright build v1869)[2m from https://playwright.azureedge.net/builds/webkit/1869/webkit-mac-13-arm64.zip[22m
Webkit 17.0 (playwright build v1869) downloaded to /Users/tunahankilic/Library/Caches/ms-play

In [4]:
async def get_html(url, selector, sleep=5, retries=3):
    html = None
    for i in range(1, retries+1):
        time.sleep(sleep * i)

        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch()
                page = await browser.new_page()
                await page.goto(url)
                print(await page.title())
                html = await page.inner_html(selector)
        except PlaywrightTimeout:
            print(f"Timeout Error on {url}")
            continue
        else:
            break
    return html

In [5]:
async def scrape_season(season):
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games.html"
    html = await get_html(url, "#content .filter")

    soup = BeautifulSoup(html)
    links = soup.find_all("a")
    href = [link["href"] for link in links]
    standings_pages = [f"https://basketball-reference.com{link}" for link in href]

    for url in standings_pages:
        path = os.path.join(SCHEDULE_DIR, url.split("/")[-1])
        if os.path.exists(path):
            continue
        
        html = await get_html(url, "#all_schedule")
        with open(path, "w+") as f:
            f.write(html)

In [27]:
for season in SEASONS:
    await scrape_season(season)

2015-16 NBA Schedule | Basketball-Reference.com
2015-16 NBA Schedule | Basketball-Reference.com
2015-16 NBA Schedule | Basketball-Reference.com
2015-16 NBA Schedule | Basketball-Reference.com
2015-16 NBA Schedule | Basketball-Reference.com
2015-16 NBA Schedule | Basketball-Reference.com
2015-16 NBA Schedule | Basketball-Reference.com
2015-16 NBA Schedule | Basketball-Reference.com
2015-16 NBA Schedule | Basketball-Reference.com
2015-16 NBA Schedule | Basketball-Reference.com
2016-17 NBA Schedule | Basketball-Reference.com
2016-17 NBA Schedule | Basketball-Reference.com
2016-17 NBA Schedule | Basketball-Reference.com
2016-17 NBA Schedule | Basketball-Reference.com
2016-17 NBA Schedule | Basketball-Reference.com
2016-17 NBA Schedule | Basketball-Reference.com
2016-17 NBA Schedule | Basketball-Reference.com
2016-17 NBA Schedule | Basketball-Reference.com
2016-17 NBA Schedule | Basketball-Reference.com
2016-17 NBA Schedule | Basketball-Reference.com
2017-18 NBA Schedule | Basketball-Refere

In [6]:
async def scrape_box_scores(standings_file):
    with open(standings_file, 'r', errors='ignore') as f:
        html = f.read()

    soup = BeautifulSoup(html)
    links = soup.find_all("a")
    hrefs = [link.get("href") for link in links]
    years_to_check = ['2019', '2020', '2021', '2022', '2023']
    box_score_extensions = [link for link in hrefs if link and "boxscore" in link and ".html" in link and any(year in link for year in years_to_check)]
    box_scores = [f"https://www.basketball-reference.com{link}" for link in box_score_extensions]

    for url in box_scores:
        path = os.path.join(BOXSCORES_DIR, url.split("/")[-1])
        if os.path.exists(path):
            continue

        html = await get_html(url, "#content")
        if not html:
            continue
        with open(path, "w+") as f:
            f.write(html)

In [16]:
standings_files = os.listdir(SCHEDULE_DIR)
for file in standings_files:
    filepath = os.path.join(SCHEDULE_DIR, file)

    await scrape_box_scores(filepath)

In [8]:
BOXSCORES_DIR

'data/boxscores'

In [28]:
pd.DataFrame([file[:8] for file in os.listdir(BOXSCORES_DIR) if any(year in file[:8] for year in ['2018', '2019', '2020', '2021', '2022', '2023'])], columns=['date']).to_csv('dates.csv')

In [27]:
[file[:8] for file in os.listdir(BOXSCORES_DIR) if any(year in file[:8] for year in ['2018', '2019', '2020', '2021', '2022', '2023'])]

['20221107',
 '20221214',
 '20221207',
 '20210430',
 '20190326',
 '20221209',
 '20220123',
 '20200116',
 '20220318',
 '20210219',
 '20191118',
 '20221029',
 '20211124',
 '20210303',
 '20221107',
 '20210310',
 '20210328',
 '20180120',
 '20200822',
 '20190407',
 '20210212',
 '20220228',
 '20230506',
 '20220307',
 '20221206',
 '20221119',
 '20210209',
 '20190326',
 '20221207',
 '20210508',
 '20221114',
 '20210213',
 '20180411',
 '20200117',
 '20210405',
 '20210422',
 '20220321',
 '20191201',
 '20220413',
 '20210113',
 '20220401',
 '20190409',
 '20230226',
 '20180119',
 '20220401',
 '20190407',
 '20200205',
 '20221029',
 '20210311',
 '20230426',
 '20221214',
 '20220125',
 '20220113',
 '20180409',
 '20220111',
 '20210610',
 '20180214',
 '20210510',
 '20220129',
 '20210302',
 '20211110',
 '20220401',
 '20190305',
 '20230409',
 '20190404',
 '20220210',
 '20211020',
 '20230104',
 '20210513',
 '20200222',
 '20200106',
 '20221120',
 '20180409',
 '20210220',
 '20230301',
 '20191202',
 '20200223',