In [24]:
import os
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
import asyncio
import pandas as pd

In [25]:
# Master switch for the bulk scraping cells guarded by `if SCRAPE_DATA:`.
# Leave it False to skip them on a normal run; set it to True to scrape again.
SCRAPE_DATA = False # set to True when you need to scrape new data (avoids repeating the scrape)

In [26]:
# Seasons covered by the bulk scrape: 2019 through 2025 (the range end is exclusive).
SEASONS = list(range(2019, 2026))

In [27]:
SEASONS

[2019, 2020, 2021, 2022, 2023, 2024, 2025]

In [28]:
# Root folder for everything the notebook writes (scraped HTML and CSV files).
DATA_DIR = "data"
# Folder holding the saved schedule HTML fragments, one file per scraped schedule page.
STANDINGS_DIR = os.path.join(DATA_DIR, "standings")
# Folder the box-score pages are written to. With a single argument os.path.join
# returns DATA_DIR unchanged, so box scores are saved directly in "data".
# NOTE(review): a dedicated sub-folder (e.g. "scores") may have been intended - confirm
# before changing, since already-scraped files would have to be moved.
SCORES_DIR = os.path.join(DATA_DIR)

In [29]:
async def get_html(url, selector, sleep=5, retries=3):
    """Load ``url`` in a browser and return the inner HTML of ``selector``.

    A fresh, visible (non-headless) Firefox instance is launched for every
    call and closed again before returning.

    Args:
        url: Page to open.
        selector: CSS selector to wait for; its inner HTML is returned.
        sleep: Base delay in seconds. Before attempt ``i`` (for ``i > 1``)
            the coroutine sleeps ``sleep * i`` seconds.
        retries: Maximum number of attempts.

    Returns:
        The inner HTML string of the first element matching ``selector``,
        or ``None`` when every attempt failed.
    """
    result = None

    async with async_playwright() as pw:
        browser = await pw.firefox.launch(headless=False)
        context = await browser.new_context(
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        )
        page = await context.new_page()

        try:
            for attempt in range(1, retries + 1):
                # Back off a little longer before each retry.
                if attempt > 1:
                    await asyncio.sleep(sleep * attempt)

                try:
                    await page.goto(url, timeout=60000)
                    await page.wait_for_selector(selector, timeout=30000)
                    print(await page.title())
                    result = await page.inner_html(selector)
                except Exception as exc:
                    # Timeouts get their own message; anything else is reported verbatim.
                    if isinstance(exc, PlaywrightTimeout):
                        print(f"Timeout error on {url} (attempt {attempt}/{retries})")
                    else:
                        print(f"Error on {url}: {exc} (attempt {attempt}/{retries})")
                    if attempt == retries:
                        print(f"Failed after {retries} attempts")
                else:
                    # Success: no further attempts needed.
                    break
        finally:
            # Always release the browser, even when an attempt raised.
            await browser.close()

    return result

In [30]:
async def scrape_season(season):
    """Download the schedule pages of one season into ``STANDINGS_DIR``.

    The season's games index page is fetched first. Every link found inside
    its ``#content .filter`` element is then fetched, and the inner HTML of
    ``#all_schedule`` is written to ``STANDINGS_DIR`` under the last path
    segment of the link. Pages whose file already exists are skipped.

    Args:
        season: Season identifier used in the URL (``NBA_{season}_games.html``).

    Returns:
        None. A page that could not be fetched is skipped, so no empty file
        is left behind and a later run will try it again.
    """
    index_url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games.html"
    index_html = await get_html(index_url, "#content .filter")
    if not index_html:
        # get_html returns None once all of its retries have failed.
        print(f"Could not load the schedule index for season {season}; nothing scraped")
        return

    soup = BeautifulSoup(index_html, 'html.parser')
    # .get() tolerates anchors without an href attribute.
    hrefs = [a.get("href") for a in soup.find_all('a')]
    # Use the same host as the index page and the box-score URLs.
    standings_pages = [f"https://www.basketball-reference.com{h}" for h in hrefs if h]

    os.makedirs(STANDINGS_DIR, exist_ok=True)

    for url in standings_pages:
        save_path = os.path.join(STANDINGS_DIR, url.split("/")[-1])
        if os.path.exists(save_path):
            continue

        html = await get_html(url, "#all_schedule")
        if not html:
            # Do not open the file: an empty file would make later runs skip this page.
            continue
        with open(save_path, "w+") as f:
            f.write(html)

In [31]:
# Bulk scrape: download the schedule pages of every season in SEASONS.
# Runs only when SCRAPE_DATA is True.
if (SCRAPE_DATA):
  for season in SEASONS:
    await scrape_season(season)

In [32]:
async def scrape_game(standings_file):
    """Download every box-score page linked from one saved schedule file.

    Args:
        standings_file: Path to a schedule HTML file previously written to
            ``STANDINGS_DIR``.

    Returns:
        None. Each box score's ``#content`` inner HTML is written to
        ``SCORES_DIR`` under the last path segment of its URL. Pages that are
        already on disk, or that could not be fetched, are skipped.
    """
    with open(standings_file, 'r') as f:
        html = f.read()

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed (and warns), unlike the other calls in this notebook.
    soup = BeautifulSoup(html, 'html.parser')
    hrefs = [a.get("href") for a in soup.find_all("a")]
    box_scores = [
        f"https://www.basketball-reference.com{h}"
        for h in hrefs
        if h and "boxscore" in h and ".html" in h
    ]

    os.makedirs(SCORES_DIR, exist_ok=True)

    for url in box_scores:
        save_path = os.path.join(SCORES_DIR, url.split("/")[-1])
        if os.path.exists(save_path):
            continue

        html = await get_html(url, "#content")
        if not html:
            # Fetch failed after all retries; leave no file so it is retried later.
            continue
        with open(save_path, "w+") as f:
            f.write(html)

In [33]:
# Bulk scrape: for every saved schedule file, download the box-score pages it links to.
# Runs only when SCRAPE_DATA is True.
if SCRAPE_DATA:
  standings_files = os.listdir(STANDINGS_DIR)
  for f in standings_files:
    filepath = os.path.join(STANDINGS_DIR, f)

    await scrape_game(filepath) 

In [34]:
if SCRAPE_DATA:
  await scrape_season(range(list(2019, 2026)))

In [35]:
# Season identifier as used in the scraped URLs and file names (NBA_2026...);
# the page titles in the output below show this is the 2025-26 season.
current_season = 2026

In [36]:
def process_season_schedule(current_season):
    """Split a season's saved schedule pages into played and upcoming games.

    Reads every file in ``STANDINGS_DIR`` whose name starts with
    ``NBA_{current_season}`` and walks the rows of its ``schedule`` table. A
    row counts as played when its box-score cell contains a link. Each
    non-empty list is also written, sorted by date, to
    ``played_games_{current_season}.csv`` or
    ``upcoming_games_{current_season}.csv`` in ``DATA_DIR``.

    Args:
        current_season: Season identifier used as the file-name prefix/suffix.

    Returns:
        ``(played_games, upcoming_games)``: two lists of dicts in the order
        the rows were read (not date-sorted).
    """
    def stat_text(row, tag, stat):
        # Text of the row's <tag data-stat=stat> cell, or '' when the cell is missing.
        cell = row.find(tag, {'data-stat': stat})
        return cell.text if cell else ''

    def stat_points(row, stat):
        # Integer value of a score cell; 0 when the cell is missing or empty.
        cell = row.find('td', {'data-stat': stat})
        return int(cell.text) if cell and cell.text else 0

    played_games, upcoming_games = [], []

    for schedule_file in os.listdir(STANDINGS_DIR):
        # only the schedule files of the requested season
        if not schedule_file.startswith(f"NBA_{current_season}"):
            continue

        with open(os.path.join(STANDINGS_DIR, schedule_file), 'r') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')

        schedule_table = soup.find('table', {'id': 'schedule'})
        if not schedule_table:
            continue

        for row in schedule_table.find('tbody').find_all('tr'):
            # repeated header rows inside the body carry the 'thead' class
            row_classes = row.get('class')
            if row_classes and 'thead' in row_classes:
                continue

            # a link in the box-score cell means the game has been played
            box_cell = row.find('td', {'data-stat': 'box_score_text'})
            box_anchor = box_cell.find('a') if box_cell else None

            if not box_anchor:
                upcoming_games.append({
                    'date': stat_text(row, 'th', 'date_game'),
                    'visitor': stat_text(row, 'td', 'visitor_team_name'),
                    'home': stat_text(row, 'td', 'home_team_name')
                })
                continue

            visitor = stat_text(row, 'td', 'visitor_team_name')
            home = stat_text(row, 'td', 'home_team_name')
            visitor_pts = stat_points(row, 'visitor_pts')
            home_pts = stat_points(row, 'home_pts')

            # the home team wins only with strictly more points
            if home_pts > visitor_pts:
                winner = home
            else:
                winner = visitor

            played_games.append({
                'date': stat_text(row, 'th', 'date_game'),
                'visitor': visitor,
                'home': home,
                'visitor_pts': visitor_pts,
                'home_pts': home_pts,
                'winner': winner,
                'box_score': box_anchor['href']
            })

    # write each non-empty list to its CSV, sorted by date
    outputs = (
        (played_games, 'played_games', 'played'),
        (upcoming_games, 'upcoming_games', 'upcoming'),
    )
    for games, prefix, label in outputs:
        if not games:
            continue
        frame = pd.DataFrame(games)
        frame['date'] = pd.to_datetime(frame['date'], format='mixed')
        frame = frame.sort_values('date')
        frame.to_csv(os.path.join(DATA_DIR, f'{prefix}_{current_season}.csv'), index=False)
        print(f"Saved {len(games)} {label} games (sorted by date)")

    return played_games, upcoming_games

In [37]:
def update_completed_games(predictions_file, played_games_file, season=2026):
    """Mark each prediction as correct / incorrect once its game has been played.

    A prediction matches a played game when date, home team and visitor team
    are all equal. The updated predictions are sorted by date, written back
    to ``predictions_file``, and a summary is printed.

    Args:
        predictions_file: CSV with at least ``date``, ``home``, ``visitor``
            and ``predicted_winner`` columns. ``result`` and
            ``actual_winner`` are created when absent.
        played_games_file: CSV with ``date``, ``home``, ``visitor`` and,
            normally, ``winner`` columns.
        season: Currently unused; kept so existing callers keep working.

    Returns:
        The updated predictions DataFrame, or ``None`` when either input
        file does not exist.
    """
    # load predictions
    if not os.path.exists(predictions_file):
        print(f"No predictions file found at {predictions_file}")
        return None
    if not os.path.exists(played_games_file):
        # Same graceful handling as for the predictions file.
        print(f"No played games file found at {played_games_file}")
        return None

    predictions = pd.read_csv(predictions_file)
    played = pd.read_csv(played_games_file)

    # normalize dates for matching (both sides parsed the same way)
    predictions['date'] = pd.to_datetime(predictions['date'], format='mixed').dt.strftime('%Y-%m-%d')
    played['date'] = pd.to_datetime(played['date'], format='mixed').dt.strftime('%Y-%m-%d')

    # initialize columns if not exists
    if 'result' not in predictions.columns:
        predictions['result'] = 'not_played'
    if 'actual_winner' not in predictions.columns:
        predictions['actual_winner'] = None

    # A column that was entirely empty in the CSV is read back as float64 (NaN).
    # Cast to object so team names / status strings can be stored without
    # triggering pandas' incompatible-dtype error.
    predictions['result'] = predictions['result'].astype(object)
    predictions['actual_winner'] = predictions['actual_winner'].astype(object)

    # Does not change per row, so check once.
    has_winner = 'winner' in played.columns

    # check each prediction against played games
    for idx, pred in predictions.iterrows():
        match = played[
            (played['date'] == pred['date']) &
            (played['home'] == pred['home']) &
            (played['visitor'] == pred['visitor'])
        ]

        if len(match) == 0:
            # game not played yet, keep the current result
            continue

        if not has_winner:
            # game played but no winner column - needs a box-score lookup
            predictions.loc[idx, 'result'] = 'played_needs_winner'
            continue

        # use the first matching game
        actual_winner = match.iloc[0]['winner']
        predictions.loc[idx, 'actual_winner'] = actual_winner
        if pred['predicted_winner'] == actual_winner:
            predictions.loc[idx, 'result'] = 'correct'
        else:
            predictions.loc[idx, 'result'] = 'incorrect'

    # sort by date before saving
    predictions['date'] = pd.to_datetime(predictions['date'])
    predictions = predictions.sort_values('date')
    predictions['date'] = predictions['date'].dt.strftime('%Y-%m-%d')

    # save updated predictions
    predictions.to_csv(predictions_file, index=False)

    # print summary
    print("=" * 50)
    print("PREDICTION RESULTS SUMMARY")
    print("=" * 50)
    total = len(predictions)
    correct = len(predictions[predictions['result'] == 'correct'])
    incorrect = len(predictions[predictions['result'] == 'incorrect'])
    not_played = len(predictions[predictions['result'] == 'not_played'])
    needs_winner = len(predictions[predictions['result'] == 'played_needs_winner'])

    print(f"Total predictions: {total}")
    print(f"Correct: {correct}")
    print(f"Incorrect: {incorrect}")
    print(f"Not played yet: {not_played}")
    if needs_winner > 0:
        print(f"Played (needs winner lookup): {needs_winner}")

    if correct + incorrect > 0:
        accuracy = correct / (correct + incorrect) * 100
        print(f"\nAccuracy: {accuracy:.2f}%")
    print("=" * 50)

    return predictions

In [38]:
def _sort_csv_by_date(path, date_only=False):
    """Sort one CSV by its ``date`` column in place; return False if the file is absent."""
    if not os.path.exists(path):
        return False

    df = pd.read_csv(path)
    df['date'] = pd.to_datetime(df['date'], format='mixed')
    df = df.sort_values('date')
    if date_only:
        # write the dates back as plain YYYY-MM-DD strings
        df['date'] = df['date'].dt.strftime('%Y-%m-%d')
    df.to_csv(path, index=False)
    print(f"Sorted {path}")
    return True


def sort_all_csv_files(data_dir='data', season=2026):
    """Sort the notebook's CSV files by date, rewriting each one in place.

    Handles the played-games and upcoming-games files of ``season`` plus
    ``predictions.csv`` and ``prediction_history.csv``. Files that do not
    exist are skipped silently.

    Args:
        data_dir: Folder containing the CSV files. Defaults to ``'data'``.
        season: Season used in the played/upcoming file names. Defaults to 2026.

    Returns:
        None.
    """
    # (file name, whether the date column is re-formatted as YYYY-MM-DD text)
    targets = [
        (f'played_games_{season}.csv', False),
        (f'upcoming_games_{season}.csv', False),
        ('predictions.csv', True),
        ('prediction_history.csv', True),
    ]

    for file_name, date_only in targets:
        # Forward slash keeps the printed paths identical to the earlier output.
        _sort_csv_by_date(f'{data_dir}/{file_name}', date_only=date_only)

    print("All CSV files sorted by date!")

In [39]:
def get_prediction_record(predictions_file='data/predictions.csv', history_file='data/prediction_history.csv'):
    """Print and return the overall win/loss record of the predictions.

    Correct and incorrect results are counted across both files; pending
    (``not_played``) games are counted from ``predictions_file`` only. A file
    that does not exist contributes nothing.

    Args:
        predictions_file: CSV of current predictions with a ``result`` column.
        history_file: CSV of archived predictions with a ``result`` column.

    Returns:
        ``{'wins': ..., 'losses': ..., 'pending': ...}``.
    """
    def tally(frame, label):
        # number of rows whose result equals the given label
        return len(frame[frame['result'] == label])

    wins = losses = pending = 0

    if os.path.exists(history_file):
        archived = pd.read_csv(history_file)
        wins += tally(archived, 'correct')
        losses += tally(archived, 'incorrect')

    if os.path.exists(predictions_file):
        current = pd.read_csv(predictions_file)
        wins += tally(current, 'correct')
        losses += tally(current, 'incorrect')
        pending += tally(current, 'not_played')

    decided = wins + losses
    rule = '=' * 40

    print(f"\n{rule}")
    if decided > 0:
        print(f" PREDICTION RECORD: {wins}-{losses}")
        print(f" Accuracy: {wins / decided * 100:.1f}%")
    else:
        # nothing decided yet, so no accuracy line
        print(f" PREDICTION RECORD: 0-0")
    print(f" Pending: {pending} games")
    print(f"{rule}\n")

    return {'wins': wins, 'losses': losses, 'pending': pending}

In [42]:
# Daily Update : RE-RUN TO UPDATE PREDICTIONS

#delete old schedule files and re-scrape everything
print("Scraping fresh data...")
for f in os.listdir(STANDINGS_DIR):
  if f.startswith("NBA_2026"):
    os.remove(os.path.join(STANDINGS_DIR, f))

await scrape_season(2026)

#update played/upcoming games
print("\nProcessing games...")
played, upcoming = process_season_schedule(2026)

#compare predictions to actual results
print("\nUpdating prediction results...")
update_completed_games(
    predictions_file='data/predictions.csv',
    played_games_file='data/played_games_2026.csv'
)

#show record
get_prediction_record()

Scraping fresh data...
Timeout error on https://www.basketball-reference.com/leagues/NBA_2026_games.html (attempt 1/3)
2025-26 NBA Schedule | Basketball-Reference.com
2025-26 NBA Schedule | Basketball-Reference.com
2025-26 NBA Schedule | Basketball-Reference.com
2025-26 NBA Schedule | Basketball-Reference.com
2025-26 NBA Schedule | Basketball-Reference.com
2025-26 NBA Schedule | Basketball-Reference.com
2025-26 NBA Schedule | Basketball-Reference.com
2025-26 NBA Schedule | Basketball-Reference.com

Processing games...
Saved 684 played games (sorted by date)
Saved 545 upcoming games (sorted by date)

Updating prediction results...
PREDICTION RESULTS SUMMARY
Total predictions: 53
Correct: 0
Incorrect: 0
Not played yet: 53

 PREDICTION RECORD: 14-13
 Accuracy: 51.9%
 Pending: 53 games



{'wins': 14, 'losses': 13, 'pending': 53}

In [41]:
# Daily Update : RE-RUN TO UPDATE PREDICTIONS

# 1. Delete old schedule files and re-scrape everything
print("Scraping fresh data...")
for f in os.listdir(STANDINGS_DIR):
  if f.startswith("NBA_2026"):
    os.remove(os.path.join(STANDINGS_DIR, f))

await scrape_season(2026)

# 2. Update played/upcoming games
print("\nProcessing games...")
played, upcoming = process_season_schedule(2026)

# 3. Compare predictions to actual results
print("\nUpdating prediction results...")
update_completed_games(
    predictions_file='data/predictions.csv',
    played_games_file='data/played_games_2026.csv'
)

# 4. Sort all CSV files by date
print("\nSorting all files by date...")
sort_all_csv_files()

# 5. Show your record
get_prediction_record()

Scraping fresh data...
2025-26 NBA Schedule | Basketball-Reference.com
2025-26 NBA Schedule | Basketball-Reference.com
2025-26 NBA Schedule | Basketball-Reference.com
2025-26 NBA Schedule | Basketball-Reference.com
2025-26 NBA Schedule | Basketball-Reference.com
2025-26 NBA Schedule | Basketball-Reference.com
2025-26 NBA Schedule | Basketball-Reference.com
2025-26 NBA Schedule | Basketball-Reference.com

Processing games...
Saved 684 played games (sorted by date)
Saved 545 upcoming games (sorted by date)

Updating prediction results...
PREDICTION RESULTS SUMMARY
Total predictions: 53
Correct: 0
Incorrect: 0
Not played yet: 53

Sorting all files by date...
Sorted data/played_games_2026.csv
Sorted data/upcoming_games_2026.csv
Sorted data/predictions.csv
Sorted data/prediction_history.csv
All CSV files sorted by date!

 PREDICTION RECORD: 14-13
 Accuracy: 51.9%
 Pending: 53 games



{'wins': 14, 'losses': 13, 'pending': 53}