In [19]:
import os

CACHE_DIR = "html_cache"
os.makedirs(CACHE_DIR, exist_ok=False)

FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'html_cache'

In [20]:
import requests

seasons = [str(s) for s in range(2022,2026)]


In [25]:
base_url = "https://www.hockey-reference.com"
schedule_links = [f"{base_url}/leagues/NHL_{season}_games.html" for season in seasons]

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.google.com/"
}


# TODO: implement caching function for safe web-scraping

# Takes URL --> file name for caching
def url_to_file(url):
    file = url.split('/')[-1]
    return file

# TODO: TEST FUNC
def get_page(url, headers=None):

    file = url_to_file(url)
    filename = os.path.join(CACHE_DIR, file)

    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf-8') as f:
            return f.read()

    print(f"File not cached --> Fetching {url}...")
    page = requests.get(url, headers)
    if page.status_code == '200':
        print(f"Succesfully fetched page, now caching...")
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(page.text)
        return page.text
    else:
        print(f"Couldn't download page. Status code: {page.status_code}")
        return None







In [24]:
test_url = 'https://www.hockey-reference.com/leagues/NHL_2022_games.html'



In [7]:
import pandas as pd
from pandas.errors import SettingWithCopyWarning
import warnings
warnings.filterwarnings("ignore", category=SettingWithCopyWarning)
from io import StringIO
from bs4 import BeautifulSoup
import time

all_games = []
all_box_scores_links = []
for season in schedule_links:
    data = requests.get(season, headers=headers)   # gets html page for a season
    soup = BeautifulSoup(data.text)
    data = StringIO(data.text)    # Turn raw HTML into file-like object
    games = pd.read_html(data, match='NHL Regular Season Schedule')
    games = games[0]              # Dataframe
    all_games.append(games)
    box_scores_table = soup.find(id='games')  # Get table for box scores links
    box_scores_links = [l.get("href") for l in box_scores_table.find_all("a")]    # Get all links
    box_scores_links = [f"{base_url}{l}" for l in box_scores_links if l and "boxscores/" in l]   # Get absolute URLs
    all_box_scores_links.extend(box_scores_links)
    time.sleep(3)

games = pd.concat(all_games)

In [8]:
games = games.drop(['LOG', 'Unnamed: 6', 'Notes'], axis=1)    # dropping non-relevant columns
games['Home Team Win'] = games['Home'] > games['Visitor']     # creating win column
games['Home Team Win'] = games['Home Team Win'].astype(int)   # Converting all True/False to 1/0

In [9]:
games

Unnamed: 0,Date,Time,Visitor,G,Home,G.1,Att.,Home Team Win
0,2021-10-12,7:30 PM,Pittsburgh Penguins,6,Tampa Bay Lightning,2,19092.0,1
1,2021-10-12,10:00 PM,Seattle Kraken,3,Vegas Golden Knights,4,18431.0,1
2,2021-10-13,10:00 PM,Winnipeg Jets,1,Anaheim Ducks,4,16260.0,0
3,2021-10-13,10:00 PM,Chicago Blackhawks,2,Colorado Avalanche,4,18037.0,1
4,2021-10-13,10:00 PM,Vancouver Canucks,2,Edmonton Oilers,3,16034.0,0
...,...,...,...,...,...,...,...,...
1307,2025-04-17,9:30 PM,Calgary Flames,5,Los Angeles Kings,1,18145.0,1
1308,2025-04-17,7:00 PM,Tampa Bay Lightning,0,New York Rangers,4,18006.0,0
1309,2025-04-17,7:00 PM,Carolina Hurricanes,5,Ottawa Senators,7,16193.0,1
1310,2025-04-17,7:00 PM,Washington Capitals,2,Pittsburgh Penguins,5,18348.0,0


In [10]:
#all_box_scores_links
print(f"Length of box_scores_links: {len(all_box_scores_links)}")
print(f"Length of games: {len(games)}")

Length of box_scores_links: 5248
Length of games: 5248


In [12]:
games.head()


Unnamed: 0,Date,Time,Visitor,G,Home,G.1,Att.,Home Team Win
0,2021-10-12,7:30 PM,Pittsburgh Penguins,6,Tampa Bay Lightning,2,19092.0,1
1,2021-10-12,10:00 PM,Seattle Kraken,3,Vegas Golden Knights,4,18431.0,1
2,2021-10-13,10:00 PM,Winnipeg Jets,1,Anaheim Ducks,4,16260.0,0
3,2021-10-13,10:00 PM,Chicago Blackhawks,2,Colorado Avalanche,4,18037.0,1
4,2021-10-13,10:00 PM,Vancouver Canucks,2,Edmonton Oilers,3,16034.0,0


In [16]:
box_scores_stats = []

for links in all_box_scores_links:
    data = requests.get(links, headers=headers)
    data = StringIO(data.text)
    v_stats_table = pd.read_html(data, header=1)[2].iloc[[-1]]   # Visitor team stats
    h_stats_table = pd.read_html(data, header=1)[4].iloc[[-1]]   # Home team stats

    final_v_stats_table = v_stats_table[['PIM', 'S', 'S%']]
    final_h_stats_table = h_stats_table[['PIM', 'S', 'S%']]

    final_v_stats_table.rename(columns={'PIM' : 'Visitors PIM', 'S' : 'Visitors S', 'S%' : 'Visitors S%'}, inplace=True)
    final_h_stats_table.rename(columns={'PIM' : 'Home PIM', 'S' : 'Home S', 'S%' : 'Home S%'}, inplace=True)

    final_game_stats = pd.concat([final_v_stats_table, final_h_stats_table], axis=1)

    box_scores_stats.append(final_game_stats)

print(f"Length of box_scores_stats: {len(box_scores_stats)}")






ValueError: No tables found

In [None]:
#for link in all_box_scores_links:
#    data = requests.get(link)
#    goalie_stats = pd.read_html(data.text, match='Goalies Table')


#link = all_box_scores_links[0]
#data = requests.get(link)
#data = StringIO(data.text)
#v_goalie_stats = pd.read_html(data, match='Goalies Table')[0]   # visitor team's goalie stats
#h_goalie_stats = pd.read_html(data, match='Goalies Table')[1]   # home team's goalie stats