In [29]:
import os

CACHE_DIR = r"C:\Users\vikto\OneDrive\Bureau\Cache"


In [30]:
import requests

seasons = [str(s) for s in range(2022,2026)]


In [35]:
base_url = "https://www.hockey-reference.com"
schedule_links = [f"{base_url}/leagues/NHL_{season}_games.html" for season in seasons]

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.google.com/"
}


# Takes URL --> file name for caching
def url_to_file(url):
    file = url.split('/')[-1]
    return file

def get_page(url, headers=None):

    file = url_to_file(url)
    filename = os.path.join(CACHE_DIR, file)

    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf-8') as f:
            return f.read()

    print(f"File not cached --> Fetching {url}...")
    page = requests.get(url, headers)
    time.sleep(3.1)
    if page.status_code == 200:
        print(f"Successfully fetched page, now caching...")
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(page.text)
            time.sleep(0.4)
            print(f"File Cached. Excellent. Moving right along...")
        return page.text
    else:
        print(f"Couldn't download page. Status code: {page.status_code}")
        print(f"Retry-after: {page.headers.get('Retry-After')}")
        return None







In [32]:
import pandas as pd
from pandas.errors import SettingWithCopyWarning
import warnings
warnings.filterwarnings("ignore", category=SettingWithCopyWarning)
from io import StringIO
from bs4 import BeautifulSoup
import time

all_games = []
all_box_scores_links = []
for season in schedule_links:
    data = get_page(season, headers)
    soup = BeautifulSoup(data)
    data = StringIO(data)
    games = pd.read_html(data, match='NHL Regular Season Schedule')
    games = games[0]              # Dataframe
    all_games.append(games)
    box_scores_table = soup.find(id='games')  # Get table for box scores links
    box_scores_links = [l.get("href") for l in box_scores_table.find_all("a")]    # Get all links
    box_scores_links = [f"{base_url}{l}" for l in box_scores_links if l and "boxscores/" in l]   # Get absolute URLs
    all_box_scores_links.extend(box_scores_links)
    time.sleep(3)

games = pd.concat(all_games)

File not cached --> Fetching https://www.hockey-reference.com/leagues/NHL_2022_games.html...
Successfully fetched page, now caching...
File Cached. Excellent. Moving right along.
File not cached --> Fetching https://www.hockey-reference.com/leagues/NHL_2023_games.html...
Successfully fetched page, now caching...
File Cached. Excellent. Moving right along.
File not cached --> Fetching https://www.hockey-reference.com/leagues/NHL_2024_games.html...
Successfully fetched page, now caching...
File Cached. Excellent. Moving right along.
File not cached --> Fetching https://www.hockey-reference.com/leagues/NHL_2025_games.html...
Successfully fetched page, now caching...
File Cached. Excellent. Moving right along.


In [5]:
games = games.drop(['LOG', 'Unnamed: 6', 'Notes'], axis=1)    # dropping non-relevant columns
games['Home Team Win'] = games['Home'] > games['Visitor']     # creating win column
games['Home Team Win'] = games['Home Team Win'].astype(int)   # Converting all True/False to 1/0

from team_abbreviations import team_map

# The following would be to map each name to their abbreviation
# TODO: Decide if you want to use it or not
#games['Visitor'] = games['Visitor'].map(team_map)
#games['Home'] = games['Home'].map(team_map)

In [6]:
games

Unnamed: 0,Date,Time,Visitor,G,Home,G.1,Att.,Home Team Win
0,2021-10-12,7:30 PM,Pittsburgh Penguins,6,Tampa Bay Lightning,2,19092.0,1
1,2021-10-12,10:00 PM,Seattle Kraken,3,Vegas Golden Knights,4,18431.0,1
2,2021-10-13,10:00 PM,Winnipeg Jets,1,Anaheim Ducks,4,16260.0,0
3,2021-10-13,10:00 PM,Chicago Blackhawks,2,Colorado Avalanche,4,18037.0,1
4,2021-10-13,10:00 PM,Vancouver Canucks,2,Edmonton Oilers,3,16034.0,0
...,...,...,...,...,...,...,...,...
1307,2025-04-17,9:30 PM,Calgary Flames,5,Los Angeles Kings,1,18145.0,1
1308,2025-04-17,7:00 PM,Tampa Bay Lightning,0,New York Rangers,4,18006.0,0
1309,2025-04-17,7:00 PM,Carolina Hurricanes,5,Ottawa Senators,7,16193.0,1
1310,2025-04-17,7:00 PM,Washington Capitals,2,Pittsburgh Penguins,5,18348.0,0


In [7]:
#all_box_scores_links
print(f"Length of box_scores_links: {len(all_box_scores_links)}")
print(f"Length of games: {len(games)}")

Length of box_scores_links: 5248
Length of games: 5248


In [8]:
games.head()


Unnamed: 0,Date,Time,Visitor,G,Home,G.1,Att.,Home Team Win
0,2021-10-12,7:30 PM,Pittsburgh Penguins,6,Tampa Bay Lightning,2,19092.0,1
1,2021-10-12,10:00 PM,Seattle Kraken,3,Vegas Golden Knights,4,18431.0,1
2,2021-10-13,10:00 PM,Winnipeg Jets,1,Anaheim Ducks,4,16260.0,0
3,2021-10-13,10:00 PM,Chicago Blackhawks,2,Colorado Avalanche,4,18037.0,1
4,2021-10-13,10:00 PM,Vancouver Canucks,2,Edmonton Oilers,3,16034.0,0


In [24]:
# Cell for testing purposes
from team_abbreviations import team_map

all_scores = []

for link in all_box_scores_links:
    data = get_page(link)
    soup = BeautifulSoup(data)

    box = soup.find('div', class_='scorebox')    # Finding scorebox to get names
    teams = box.find_all('strong')
    visitors = teams[0].text.strip()             # extracting visitor team's name
    home = teams[1].text.strip()                 # extracting home team's name

    visitors = team_map.get(visitors)            # Mapping to abbreviation
    home = team_map.get(home)

    data = StringIO(data)

    # Getting scorebox tables
    v_stats_table = pd.read_html(data, attrs={'id' : f'{visitors}_skaters'}, header=1)[0].iloc[[-1]]
    h_stats_table = pd.read_html(data, attrs={'id' : f'{home}_skaters'}, header=1)[0].iloc[[-1]]

    # Only keeping relevant columns
    v_stats_table = v_stats_table[['PIM', 'S', 'S%']]
    h_stats_table = h_stats_table[['PIM', 'S', 'S%']]

    # Renaming for processing later
    v_stats_table.rename(columns={'PIM' : 'Visitors PIM', 'S' : 'Visitors S', 'S%' : 'Visitors S%'}, inplace=True)
    h_stats_table.rename(columns={'PIM' : 'Home PIM', 'S' : 'Home S', 'S%' : 'Home S%'}, inplace=True)

    # Concatenating both tables
    final_table = pd.concat([v_stats_table, h_stats_table], axis=1)

    all_scores.append(final_table)




File not cached --> Fetching https://www.hockey-reference.com/boxscores/202210120CAR.html...
Successfully fetched page, now caching...
File not cached --> Fetching https://www.hockey-reference.com/boxscores/202210290PHI.html...
Successfully fetched page, now caching...
    Visitors PIM  Visitors S  Visitors S%  Home PIM  Home S  Home S%
19             0          36          8.3         0      28      3.6
    Visitors PIM  Visitors S  Visitors S%  Home PIM  Home S  Home S%
19             4          32          3.1         4      43      9.3
    Visitors PIM  Visitors S  Visitors S%  Home PIM  Home S  Home S%
19            14          38         10.5        10      29     10.3


In [42]:
link = 'https://www.hockey-reference.com/boxscores/202210200CGY.html'


data = get_page(link, headers=headers)
data = StringIO(data)
goalie_stats = pd.read_html(data, match='Goalies Table', header=1)
v_goalie_stats = goalie_stats[0]
h_goalie_stats = goalie_stats[1]


h_goalie_stats = h_goalie_stats[h_goalie_stats['Player'] != 'Empty Net']

