In [166]:
import requests

seasons = [str(s) for s in range(2022,2026)]


In [167]:
base_url = "https://www.hockey-reference.com"
schedule_links = [f"{base_url}/leagues/NHL_{season}_games.html" for season in seasons]


In [168]:
import pandas as pd
from pandas.errors import SettingWithCopyWarning
import warnings
warnings.filterwarnings("ignore", category=SettingWithCopyWarning)
from io import StringIO
from bs4 import BeautifulSoup

all_games = []
all_box_scores_links = []
for season in schedule_links:
    data = requests.get(season)   # gets html page for a season
    soup = BeautifulSoup(data.text)
    data = StringIO(data.text)    # Turn raw HTML into file-like object
    games = pd.read_html(data, match='NHL Regular Season Schedule')
    games = games[0]              # Dataframe
    all_games.append(games)
    box_scores_links = [l.get("href") for l in soup.find_all("a")]  # getting all href links
    box_scores_links = [f"{base_url}{l}" for l in box_scores_links if l and "boxscores/" in l]   # getting links to all box scores
    del box_scores_links[0]   # deleting link to overall box scores
    all_box_scores_links.extend(box_scores_links)

games = pd.concat(all_games)

In [169]:
games = games.drop(['LOG', 'Unnamed: 6', 'Notes'], axis=1)    # dropping non-relevant columns
games['Home Team Win'] = games['Home'] > games['Visitor']     # creating win column
games['Home Team Win'] = games['Home Team Win'].astype(int)   # Converting all True/False to 1/0

In [170]:
games

Unnamed: 0,Date,Time,Visitor,G,Home,G.1,Att.,Home Team Win
0,2021-10-12,7:30 PM,Pittsburgh Penguins,6,Tampa Bay Lightning,2,19092.0,1
1,2021-10-12,10:00 PM,Seattle Kraken,3,Vegas Golden Knights,4,18431.0,1
2,2021-10-13,10:00 PM,Winnipeg Jets,1,Anaheim Ducks,4,16260.0,0
3,2021-10-13,10:00 PM,Chicago Blackhawks,2,Colorado Avalanche,4,18037.0,1
4,2021-10-13,10:00 PM,Vancouver Canucks,2,Edmonton Oilers,3,16034.0,0
...,...,...,...,...,...,...,...,...
1307,2025-04-17,9:30 PM,Calgary Flames,5,Los Angeles Kings,1,18145.0,1
1308,2025-04-17,7:00 PM,Tampa Bay Lightning,0,New York Rangers,4,18006.0,0
1309,2025-04-17,7:00 PM,Carolina Hurricanes,5,Ottawa Senators,7,16193.0,1
1310,2025-04-17,7:00 PM,Washington Capitals,2,Pittsburgh Penguins,5,18348.0,0


In [171]:
#all_box_scores_links
len(all_box_scores_links)

5619

In [172]:
#for link in all_box_scores_links:
#    data = requests.get(link)
#    goalie_stats = pd.read_html(data.text, match='Goalies Table')


#link = all_box_scores_links[0]
#data = requests.get(link)
#data = StringIO(data.text)
#v_goalie_stats = pd.read_html(data, match='Goalies Table')[0]   # visitor team's goalie stats
#h_goalie_stats = pd.read_html(data, match='Goalies Table')[1]   # home team's goalie stats




In [179]:
link = all_box_scores_links[0]
data = requests.get(link)
data = StringIO(data.text)
v_stats_table = pd.read_html(data, header=1)[2].iloc[[-1]]   # Visitor team stats
h_stats_table = pd.read_html(data, header=1)[4].iloc[[-1]]   # Home team stats


final_v_stats_table = v_stats_table[['PIM', 'S', 'S%']]
final_h_stats_table = h_stats_table[['PIM', 'S', 'S%']]

final_v_stats_table.rename(columns={'PIM' : 'Visitors PIM', 'S' : 'Visitors S', 'S%' : 'Visitors S%'}, inplace=True)
final_h_stats_table.rename(columns={'PIM' : 'Home PIM', 'S' : 'Home S', 'S%' : 'Home S%'}, inplace=True)

final_game_stats = pd.concat([final_v_stats_table, final_h_stats_table], axis=1)


#final_h_stats_table.rename(columns={'Unnamed: 6_level_0': 'Penalty Minutes', 'Unnamed: 14_level_0': 'SG', 'Unnamed: 15_level_0' : 'S%'}, inplace=True)
#final_h_stats_table





Unnamed: 0,Visitors PIM,Visitors S,Visitors S%,Home PIM,Home S,Home S%
19,2,35,17.1,2,28,7.1
