In [5]:
import os

CACHE_DIR = r"C:\Users\vikto\OneDrive\Bureau\Cache"


In [6]:
import requests

seasons = [str(s) for s in range(2022,2026)]


In [7]:
base_url = "https://www.hockey-reference.com"
schedule_links = [f"{base_url}/leagues/NHL_{season}_games.html" for season in seasons]

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.google.com/"
}


# Takes URL --> file name for caching
def url_to_file(url):
    file = url.split('/')[-1]
    return file

# Function to get html from a page. Checks cache first, else fetches page then caches
def get_page(url, headers=None):

    file = url_to_file(url)
    filename = os.path.join(CACHE_DIR, file)

    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf-8') as f:
            print('File already cached!')
            return f.read()

    print(f"File not cached --> Fetching {url}...")
    page = requests.get(url, headers)
    time.sleep(3.1)
    if page.status_code == 200:
        print(f"Successfully fetched page, now caching...")
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(page.text)
            time.sleep(0.4)
            print(f"File Cached. Excellent. Moving right along...")
        return page.text
    else:
        print(f"Couldn't download page. Status code: {page.status_code}")
        print(f"Retry-after: {page.headers.get('Retry-After')}")
        return None







In [8]:
import pandas as pd
from pandas.errors import SettingWithCopyWarning
import warnings
warnings.filterwarnings("ignore", category=SettingWithCopyWarning)
from io import StringIO
from bs4 import BeautifulSoup
import time

all_games = []
all_box_scores_links = []

# Grabs data from each season's page (including box scores links for each game)
for season in schedule_links:
    data = get_page(season, headers)
    soup = BeautifulSoup(data)
    data = StringIO(data)
    games = pd.read_html(data, match='NHL Regular Season Schedule')
    games = games[0]              # Dataframe
    all_games.append(games)
    box_scores_table = soup.find(id='games')  # Get table for box scores links
    box_scores_links = [l.get("href") for l in box_scores_table.find_all("a")]    # Get all links
    box_scores_links = [f"{base_url}{l}" for l in box_scores_links if l and "boxscores/" in l]   # Get absolute URLs
    all_box_scores_links.extend(box_scores_links)
    time.sleep(3)

games = pd.concat(all_games)

File already cached!
File already cached!
File already cached!
File already cached!


In [5]:
games = games.drop(['LOG', 'Unnamed: 6', 'Notes'], axis=1)    # dropping non-relevant columns
games['Home Team Win'] = games['G.1'] > games['G']     # creating win column
games['Home Team Win'] = games['Home Team Win'].astype(int)   # Converting all True/False to 1/0
games.rename(columns={'G' : 'Visitor G', 'G.1' : 'Home G'}, inplace=True)


In [6]:
games

Unnamed: 0,Date,Time,Visitor,Visitor G,Home,Home G,Att.,Home Team Win
0,2021-10-12,7:30 PM,Pittsburgh Penguins,6,Tampa Bay Lightning,2,19092.0,0
1,2021-10-12,10:00 PM,Seattle Kraken,3,Vegas Golden Knights,4,18431.0,1
2,2021-10-13,10:00 PM,Winnipeg Jets,1,Anaheim Ducks,4,16260.0,1
3,2021-10-13,10:00 PM,Chicago Blackhawks,2,Colorado Avalanche,4,18037.0,1
4,2021-10-13,10:00 PM,Vancouver Canucks,2,Edmonton Oilers,3,16034.0,1
...,...,...,...,...,...,...,...,...
1307,2025-04-17,9:30 PM,Calgary Flames,5,Los Angeles Kings,1,18145.0,0
1308,2025-04-17,7:00 PM,Tampa Bay Lightning,0,New York Rangers,4,18006.0,1
1309,2025-04-17,7:00 PM,Carolina Hurricanes,5,Ottawa Senators,7,16193.0,1
1310,2025-04-17,7:00 PM,Washington Capitals,2,Pittsburgh Penguins,5,18348.0,1


In [7]:
#all_box_scores_links
print(f"Length of box_scores_links: {len(all_box_scores_links)}")
print(f"Length of games: {len(games)}")

Length of box_scores_links: 5248
Length of games: 5248


In [8]:
games.head()


Unnamed: 0,Date,Time,Visitor,Visitor G,Home,Home G,Att.,Home Team Win
0,2021-10-12,7:30 PM,Pittsburgh Penguins,6,Tampa Bay Lightning,2,19092.0,0
1,2021-10-12,10:00 PM,Seattle Kraken,3,Vegas Golden Knights,4,18431.0,1
2,2021-10-13,10:00 PM,Winnipeg Jets,1,Anaheim Ducks,4,16260.0,1
3,2021-10-13,10:00 PM,Chicago Blackhawks,2,Colorado Avalanche,4,18037.0,1
4,2021-10-13,10:00 PM,Vancouver Canucks,2,Edmonton Oilers,3,16034.0,1


In [9]:
# Cell for testing purposes
from Data.team_abbreviations import team_map

all_scores = []
count = 0

# Get box score data for each game
for link in all_box_scores_links:
    data = get_page(link)
    soup = BeautifulSoup(data, 'html.parser')

    box = soup.find('div', class_='scorebox')    # Finding scorebox to get names
    teams = box.find_all('strong')
    visitors = teams[0].text.strip()             # extracting visitor team's name
    home = teams[1].text.strip()                 # extracting home team's name

    visitors = team_map.get(visitors)            # Mapping to abbreviation
    home = team_map.get(home)

    data = StringIO(data)

    # Getting scorebox tables
    v_table = pd.read_html(data, attrs={'id' : f'{visitors}_skaters'}, header=1)[0]
    h_table = pd.read_html(data, attrs={'id' : f'{home}_skaters'}, header=1)[0]

    # Find row where the player name/index contains "TOTAL" and get the last one
    v_stats_table = v_table[v_table.iloc[:, 1].str.contains('TOTAL', na=False)].iloc[[-1]]
    h_stats_table = h_table[h_table.iloc[:, 1].str.contains('TOTAL', na=False)].iloc[[-1]]

    # Only keeping relevant columns
    v_stats_table = v_stats_table[['PIM', 'S', 'S%']]
    h_stats_table = h_stats_table[['PIM', 'S', 'S%']]

    # Renaming for processing later
    v_stats_table.rename(columns={'PIM' : 'Visitors PIM', 'S' : 'Visitors S', 'S%' : 'Visitors S%'}, inplace=True)
    h_stats_table.rename(columns={'PIM' : 'Home PIM', 'S' : 'Home S', 'S%' : 'Home S%'}, inplace=True)

    # Resetting indices for concatenating
    v_stats_table.reset_index(drop=True, inplace=True)
    h_stats_table.reset_index(drop=True, inplace=True)

    # Concatenating both tables
    final_table = pd.concat([v_stats_table, h_stats_table], axis=1)

    all_scores.append(final_table)
    count = count + 1
    print(count)




File already cached!
1
File already cached!
2
File already cached!
3
File already cached!
4
File already cached!
5
File already cached!
6
File already cached!
7
File already cached!
8
File already cached!
9
File already cached!
10
File already cached!
11
File already cached!
12
File already cached!
13
File already cached!
14
File already cached!
15
File already cached!
16
File already cached!
17
File already cached!
18
File already cached!
19
File already cached!
20
File already cached!
21
File already cached!
22
File already cached!
23
File already cached!
24
File already cached!
25
File already cached!
26
File already cached!
27
File already cached!
28
File already cached!
29
File already cached!
30
File already cached!
31
File already cached!
32
File already cached!
33
File already cached!
34
File already cached!
35
File already cached!
36
File already cached!
37
File already cached!
38
File already cached!
39
File already cached!
40
File already cached!
41
File already cached!
42
F

In [10]:
print(len(all_scores))
scores = pd.concat(all_scores)
scores

5248


Unnamed: 0,Visitors PIM,Visitors S,Visitors S%,Home PIM,Home S,Home S%
0,2,35,17.1,2,28,7.1
0,8,31,9.7,8,30,13.3
0,14,34,2.9,24,22,18.2
0,12,34,5.9,12,36,11.1
0,11,38,5.3,25,34,5.9
...,...,...,...,...,...,...
0,6,29,17.2,4,31,3.2
0,2,27,0.0,6,22,18.2
0,28,33,15.2,6,32,21.9
0,4,20,10.0,4,38,13.2


In [11]:
# Combine & Renaming columns
combined_games = pd.concat([games.reset_index(drop=True), scores.reset_index(drop=True)], axis=1)
combined_games.rename(columns={'Visitor' : 'Opponent', 'Visitor G' : 'GA', 'Visitors S' : 'SA', 'Visitors S%' : 'SA%'}, inplace=True)
combined_games.rename(columns={'Visitors PIM' : 'Opponent PIM', 'Home G' : 'G', 'Home Team Win' : 'Win/Loss', 'Home PIM' : 'PIM', 'Home S' : 'S', 'Home S%': 'S%'}, inplace=True)

combined_games

Unnamed: 0,Date,Time,Opponent,GA,Home,G,Att.,Win/Loss,Opponent PIM,SA,SA%,PIM,S,S%
0,2021-10-12,7:30 PM,Pittsburgh Penguins,6,Tampa Bay Lightning,2,19092.0,0,2,35,17.1,2,28,7.1
1,2021-10-12,10:00 PM,Seattle Kraken,3,Vegas Golden Knights,4,18431.0,1,8,31,9.7,8,30,13.3
2,2021-10-13,10:00 PM,Winnipeg Jets,1,Anaheim Ducks,4,16260.0,1,14,34,2.9,24,22,18.2
3,2021-10-13,10:00 PM,Chicago Blackhawks,2,Colorado Avalanche,4,18037.0,1,12,34,5.9,12,36,11.1
4,2021-10-13,10:00 PM,Vancouver Canucks,2,Edmonton Oilers,3,16034.0,1,11,38,5.3,25,34,5.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5243,2025-04-17,9:30 PM,Calgary Flames,5,Los Angeles Kings,1,18145.0,0,6,29,17.2,4,31,3.2
5244,2025-04-17,7:00 PM,Tampa Bay Lightning,0,New York Rangers,4,18006.0,1,2,27,0.0,6,22,18.2
5245,2025-04-17,7:00 PM,Carolina Hurricanes,5,Ottawa Senators,7,16193.0,1,28,33,15.2,6,32,21.9
5246,2025-04-17,7:00 PM,Washington Capitals,2,Pittsburgh Penguins,5,18348.0,1,4,20,10.0,4,38,13.2


In [54]:
all_data = []

# Get goalie data for each game
for link in all_box_scores_links:
    data = get_page(link, headers=headers)
    data = StringIO(data)
    goalie_stats = pd.read_html(data, match='Goalies Table', header=1)   # List of tables (df)
    v_goalie_stats = goalie_stats[0]   # First table is visitors
    h_goalie_stats = goalie_stats[1]   # Second table is home


    # Getting team's primary goalie
    v_goalie_stats = v_goalie_stats[v_goalie_stats['Rk'] == 1]
    h_goalie_stats = h_goalie_stats[h_goalie_stats['Rk'] == 1]

    # Only keeping relevant columns
    v_goalie_stats = v_goalie_stats[['SV%']]
    h_goalie_stats = h_goalie_stats[['SV%']]

    v_goalie_stats.rename(columns={'SV%' : 'Opponent SV%'}, inplace=True)

    goalie_stats = pd.concat([h_goalie_stats, v_goalie_stats], axis=1)

    all_data.append(goalie_stats)




File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already cached!
File already 

In [55]:
# Adding goalie stats to main dataframe
goalie_s = pd.concat(all_data)
all_matches = pd.concat([combined_games.reset_index(drop=True), goalie_s.reset_index(drop=True)], axis=1)
all_matches

Unnamed: 0,Date,Time,Opponent,GA,Home,G,Att.,Win/Loss,Opponent PIM,SA,SA%,PIM,S,S%,SV%,Opponent SV%
0,2021-10-12,7:30 PM,Pittsburgh Penguins,6,Tampa Bay Lightning,2,19092.0,0,2,35,17.1,2,28,7.1,0.906,0.929
1,2021-10-12,10:00 PM,Seattle Kraken,3,Vegas Golden Knights,4,18431.0,1,8,31,9.7,8,30,13.3,0.903,0.867
2,2021-10-13,10:00 PM,Winnipeg Jets,1,Anaheim Ducks,4,16260.0,1,14,34,2.9,24,22,18.2,0.971,0.818
3,2021-10-13,10:00 PM,Chicago Blackhawks,2,Colorado Avalanche,4,18037.0,1,12,34,5.9,12,36,11.1,0.941,0.889
4,2021-10-13,10:00 PM,Vancouver Canucks,2,Edmonton Oilers,3,16034.0,1,11,38,5.3,25,34,5.9,0.947,0.941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5243,2025-04-17,9:30 PM,Calgary Flames,5,Los Angeles Kings,1,18145.0,0,6,29,17.2,4,31,3.2,0.828,0.968
5244,2025-04-17,7:00 PM,Tampa Bay Lightning,0,New York Rangers,4,18006.0,1,2,27,0.0,6,22,18.2,1.000,0.818
5245,2025-04-17,7:00 PM,Carolina Hurricanes,5,Ottawa Senators,7,16193.0,1,28,33,15.2,6,32,21.9,0.848,0.781
5246,2025-04-17,7:00 PM,Washington Capitals,2,Pittsburgh Penguins,5,18348.0,1,4,20,10.0,4,38,13.2,0.900,0.868


In [56]:
# Changing columns order for clarity
wanted_order = ['Date', 'Time', 'Home', 'Opponent', 'Att.', 'G', 'GA', 'S', 'S%', 'SV%', 'PIM', 'SA', 'SA%', 'Opponent SV%', 'Opponent PIM', 'Win/Loss']

all_matches = all_matches[wanted_order]
all_matches

Unnamed: 0,Date,Time,Home,Opponent,Att.,G,GA,S,S%,SV%,PIM,SA,SA%,Opponent SV%,Opponent PIM,Win/Loss
0,2021-10-12,7:30 PM,Tampa Bay Lightning,Pittsburgh Penguins,19092.0,2,6,28,7.1,0.906,2,35,17.1,0.929,2,0
1,2021-10-12,10:00 PM,Vegas Golden Knights,Seattle Kraken,18431.0,4,3,30,13.3,0.903,8,31,9.7,0.867,8,1
2,2021-10-13,10:00 PM,Anaheim Ducks,Winnipeg Jets,16260.0,4,1,22,18.2,0.971,24,34,2.9,0.818,14,1
3,2021-10-13,10:00 PM,Colorado Avalanche,Chicago Blackhawks,18037.0,4,2,36,11.1,0.941,12,34,5.9,0.889,12,1
4,2021-10-13,10:00 PM,Edmonton Oilers,Vancouver Canucks,16034.0,3,2,34,5.9,0.947,25,38,5.3,0.941,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5243,2025-04-17,9:30 PM,Los Angeles Kings,Calgary Flames,18145.0,1,5,31,3.2,0.828,4,29,17.2,0.968,6,0
5244,2025-04-17,7:00 PM,New York Rangers,Tampa Bay Lightning,18006.0,4,0,22,18.2,1.000,6,27,0.0,0.818,2,1
5245,2025-04-17,7:00 PM,Ottawa Senators,Carolina Hurricanes,16193.0,7,5,32,21.9,0.848,6,33,15.2,0.781,28,1
5246,2025-04-17,7:00 PM,Pittsburgh Penguins,Washington Capitals,18348.0,5,2,38,13.2,0.900,4,20,10.0,0.868,4,1


In [57]:
# Re-organizing the Dataframe to be from a team's perspective rather than 1 row/game

home = all_matches.copy()
home.rename(columns={'Home' : 'Team', 'Win/Loss' : 'Result'}, inplace=True)
home['venue'] = 'Home'

away = all_matches.copy()
away['Team'] = away['Opponent']
away['Opponent'] = away['Home']
away['venue'] = 'Away'

# Flip home team/away stats to be from the away team's perspective
away['G'], away['GA'] = away['GA'], away['G']
away['S'], away['SA'] = away['SA'], away['S']
away['S%'], away['SA%'] = away['SA%'], away['S%']
away['SV%'], away['Opponent SV%'] = away['Opponent SV%'], away['SV%']
away['PIM'], away['Opponent PIM'] = away['Opponent PIM'], away['PIM']
away['Win/Loss'] = away['Win/Loss'].apply(lambda x: 1 - x)
away.rename(columns={'Win/Loss' : 'Result'}, inplace=True)
home



Unnamed: 0,Date,Time,Team,Opponent,Att.,G,GA,S,S%,SV%,PIM,SA,SA%,Opponent SV%,Opponent PIM,Result,venue
0,2021-10-12,7:30 PM,Tampa Bay Lightning,Pittsburgh Penguins,19092.0,2,6,28,7.1,0.906,2,35,17.1,0.929,2,0,Home
1,2021-10-12,10:00 PM,Vegas Golden Knights,Seattle Kraken,18431.0,4,3,30,13.3,0.903,8,31,9.7,0.867,8,1,Home
2,2021-10-13,10:00 PM,Anaheim Ducks,Winnipeg Jets,16260.0,4,1,22,18.2,0.971,24,34,2.9,0.818,14,1,Home
3,2021-10-13,10:00 PM,Colorado Avalanche,Chicago Blackhawks,18037.0,4,2,36,11.1,0.941,12,34,5.9,0.889,12,1,Home
4,2021-10-13,10:00 PM,Edmonton Oilers,Vancouver Canucks,16034.0,3,2,34,5.9,0.947,25,38,5.3,0.941,11,1,Home
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5243,2025-04-17,9:30 PM,Los Angeles Kings,Calgary Flames,18145.0,1,5,31,3.2,0.828,4,29,17.2,0.968,6,0,Home
5244,2025-04-17,7:00 PM,New York Rangers,Tampa Bay Lightning,18006.0,4,0,22,18.2,1.000,6,27,0.0,0.818,2,1,Home
5245,2025-04-17,7:00 PM,Ottawa Senators,Carolina Hurricanes,16193.0,7,5,32,21.9,0.848,6,33,15.2,0.781,28,1,Home
5246,2025-04-17,7:00 PM,Pittsburgh Penguins,Washington Capitals,18348.0,5,2,38,13.2,0.900,4,20,10.0,0.868,4,1,Home


In [58]:
# Cleaning / Rearranging data for ML
combined = pd.concat([home, away], ignore_index=True)
combined = combined.drop(columns=['Home'])
combined = combined.sort_values(by=['Team']).reset_index(drop=True)
combined

Unnamed: 0,Date,Time,Team,Opponent,Att.,G,GA,S,S%,SV%,PIM,SA,SA%,Opponent SV%,Opponent PIM,Result,venue
0,2021-12-12,7:00 PM,Anaheim Ducks,St. Louis Blues,17010.0,3,2,39,7.7,0.920,2,25,8.0,0.923,2,1,Away
1,2023-01-28,10:30 PM,Anaheim Ducks,Arizona Coyotes,16126.0,2,1,45,4.4,0.971,20,34,2.9,0.956,10,1,Home
2,2022-01-14,8:00 PM,Anaheim Ducks,Minnesota Wild,18300.0,3,7,42,7.1,0.793,5,42,16.7,0.929,7,0,Away
3,2024-11-05,10:00 PM,Anaheim Ducks,Vancouver Canucks,13538.0,1,5,22,4.5,0.865,10,37,13.5,0.955,8,0,Home
4,2022-10-12,10:00 PM,Anaheim Ducks,Seattle Kraken,17530.0,5,4,27,18.5,0.917,15,48,8.3,0.815,11,1,Home
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10491,2024-01-02,8:00 PM,Winnipeg Jets,Tampa Bay Lightning,14157.0,4,2,28,14.3,0.941,6,34,5.9,0.889,6,1,Home
10492,2025-03-07,7:00 PM,Winnipeg Jets,New Jersey Devils,16088.0,6,1,35,17.1,0.957,4,23,4.3,0.829,4,1,Away
10493,2023-02-20,7:00 PM,Winnipeg Jets,New York Rangers,18006.0,4,1,21,19.0,0.980,13,51,2.0,0.810,7,1,Away
10494,2025-03-09,5:00 PM,Winnipeg Jets,Carolina Hurricanes,18700.0,2,4,22,9.1,0.885,24,27,14.8,0.909,8,0,Away


In [59]:
# Moving venue column (for readability)

columns = combined.columns.tolist()
columns.remove('venue')
columns.insert(columns.index('Opponent') + 1, 'venue')

combined = combined[columns]
combined


Unnamed: 0,Date,Time,Team,Opponent,venue,Att.,G,GA,S,S%,SV%,PIM,SA,SA%,Opponent SV%,Opponent PIM,Result
0,2021-12-12,7:00 PM,Anaheim Ducks,St. Louis Blues,Away,17010.0,3,2,39,7.7,0.920,2,25,8.0,0.923,2,1
1,2023-01-28,10:30 PM,Anaheim Ducks,Arizona Coyotes,Home,16126.0,2,1,45,4.4,0.971,20,34,2.9,0.956,10,1
2,2022-01-14,8:00 PM,Anaheim Ducks,Minnesota Wild,Away,18300.0,3,7,42,7.1,0.793,5,42,16.7,0.929,7,0
3,2024-11-05,10:00 PM,Anaheim Ducks,Vancouver Canucks,Home,13538.0,1,5,22,4.5,0.865,10,37,13.5,0.955,8,0
4,2022-10-12,10:00 PM,Anaheim Ducks,Seattle Kraken,Home,17530.0,5,4,27,18.5,0.917,15,48,8.3,0.815,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10491,2024-01-02,8:00 PM,Winnipeg Jets,Tampa Bay Lightning,Home,14157.0,4,2,28,14.3,0.941,6,34,5.9,0.889,6,1
10492,2025-03-07,7:00 PM,Winnipeg Jets,New Jersey Devils,Away,16088.0,6,1,35,17.1,0.957,4,23,4.3,0.829,4,1
10493,2023-02-20,7:00 PM,Winnipeg Jets,New York Rangers,Away,18006.0,4,1,21,19.0,0.980,13,51,2.0,0.810,7,1
10494,2025-03-09,5:00 PM,Winnipeg Jets,Carolina Hurricanes,Away,18700.0,2,4,22,9.1,0.885,24,27,14.8,0.909,8,0


In [60]:
games['Date'] = pd.to_datetime(games['Date'])

In [61]:
combined_sorted = combined.sort_values(by=['Team', 'Date']).reset_index(drop=True)
combined_sorted

Unnamed: 0,Date,Time,Team,Opponent,venue,Att.,G,GA,S,S%,SV%,PIM,SA,SA%,Opponent SV%,Opponent PIM,Result
0,2021-10-13,10:00 PM,Anaheim Ducks,Winnipeg Jets,Home,16260.0,4,1,22,18.2,0.971,24,34,2.9,0.818,14,1
1,2021-10-15,10:00 PM,Anaheim Ducks,Minnesota Wild,Home,11938.0,1,2,29,3.4,0.953,25,43,4.7,0.966,23,0
2,2021-10-18,9:30 PM,Anaheim Ducks,Calgary Flames,Away,15174.0,3,2,27,11.1,0.953,11,43,4.7,0.889,9,1
3,2021-10-19,9:00 PM,Anaheim Ducks,Edmonton Oilers,Away,14082.0,5,6,36,13.9,0.861,12,37,16.2,0.733,4,0
4,2021-10-21,8:00 PM,Anaheim Ducks,Winnipeg Jets,Away,13886.0,1,5,39,2.6,0.846,4,27,18.5,0.974,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10491,2025-04-07,7:30 PM,Winnipeg Jets,St. Louis Blues,Home,15225.0,3,1,26,11.5,0.933,4,15,6.7,0.920,2,1
10492,2025-04-10,8:00 PM,Winnipeg Jets,Dallas Stars,Away,18532.0,4,0,35,11.4,1.000,7,25,0.0,0.886,7,1
10493,2025-04-12,7:00 PM,Winnipeg Jets,Chicago Blackhawks,Away,20634.0,5,4,42,9.5,0.875,4,32,12.5,0.905,4,1
10494,2025-04-13,7:00 PM,Winnipeg Jets,Edmonton Oilers,Home,15225.0,1,4,18,5.6,0.921,2,39,10.3,0.944,4,0


In [None]:
# Saving as CSV
combined.to_csv('data.csv')

In [3]:
import pandas as pd
data = pd.read_csv('games.csv')
data

Unnamed: 0.1,Unnamed: 0,Date,Time,Team,Opponent,venue,Att.,G,GA,S,S%,SV%,PIM,SA,SA%,Opponent SV%,Opponent PIM,Result
0,0,2021-12-12,7:00 PM,Anaheim Ducks,St. Louis Blues,Away,17010.0,3,2,39,7.7,0.920,2,25,8.0,0.923,2,1
1,1,2023-01-28,10:30 PM,Anaheim Ducks,Arizona Coyotes,Home,16126.0,2,1,45,4.4,0.971,20,34,2.9,0.956,10,1
2,2,2022-01-14,8:00 PM,Anaheim Ducks,Minnesota Wild,Away,18300.0,3,7,42,7.1,0.793,5,42,16.7,0.929,7,0
3,3,2024-11-05,10:00 PM,Anaheim Ducks,Vancouver Canucks,Home,13538.0,1,5,22,4.5,0.865,10,37,13.5,0.955,8,0
4,4,2022-10-12,10:00 PM,Anaheim Ducks,Seattle Kraken,Home,17530.0,5,4,27,18.5,0.917,15,48,8.3,0.815,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10491,10491,2024-01-02,8:00 PM,Winnipeg Jets,Tampa Bay Lightning,Home,14157.0,4,2,28,14.3,0.941,6,34,5.9,0.889,6,1
10492,10492,2025-03-07,7:00 PM,Winnipeg Jets,New Jersey Devils,Away,16088.0,6,1,35,17.1,0.957,4,23,4.3,0.829,4,1
10493,10493,2023-02-20,7:00 PM,Winnipeg Jets,New York Rangers,Away,18006.0,4,1,21,19.0,0.980,13,51,2.0,0.810,7,1
10494,10494,2025-03-09,5:00 PM,Winnipeg Jets,Carolina Hurricanes,Away,18700.0,2,4,22,9.1,0.885,24,27,14.8,0.909,8,0


In [29]:

from bs4 import Comment

from selenium import webdriver
from selenium.webdriver.edge.service import Service as EdgeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from Data.team_abbreviations import team_map
from io import StringIO

edge_driver_path = r"C:\Users\vikto\Downloads\edgedriver_win64\msedgedriver.exe"
service = EdgeService(executable_path=edge_driver_path)
driver = webdriver.Edge(service=service)


all_adv = []
counter = 0

for link in all_box_scores_links:

    edge_driver_path = r"C:\Users\vikto\Downloads\edgedriver_win64\msedgedriver.exe"
    service = EdgeService(executable_path=edge_driver_path)
    driver = webdriver.Edge(service=service)

    data = get_page(link)
    soup = BeautifulSoup(data, 'html.parser')

    box = soup.find('div', class_='scorebox')    # Finding scorebox to get names
    teams = box.find_all('strong')
    visitors = teams[0].text.strip()             # extracting visitor team's name
    home = teams[1].text.strip()                 # extracting home team's name

    visitors = team_map.get(visitors)            # Mapping to abbreviation
    home = team_map.get(home)

    url = link
    driver.get(url)
    wait = WebDriverWait(driver, 2)

    vis = driver.find_element(By.ID, f'{visitors}_adv_ALLAll')
    home = driver.find_element(By.ID, f'{home}_adv_ALLAll')

    vis_table = vis.get_attribute('outerHTML')
    home_table = home.get_attribute('outerHTML')

    vis_table = StringIO(vis_table)
    home_table = StringIO(home_table)

    vis_df = pd.read_html(vis_table)[0]
    home_df = pd.read_html(home_table)[0]

    vis_df = vis_df[['SAT‑F', 'SAT‑A', 'CF%', 'oZS%']]
    home_df = home_df[['SAT‑F', 'SAT‑A', 'CF%', 'oZS%']]

    vis_df.rename(columns={'SAT‑F' : 'Opponent SAT‑F', 'SAT‑A' : 'Opponent SAT‑A', 'CF%' : 'Opponent CF%', 'oZS%' : 'Opponent oZS%'}, inplace=True)

    final = pd.concat([vis_df, home_df], axis=1)

    all_adv.append(final)
    counter = counter + 1
    print(counter)
    driver.quit()





File already cached!
1
File already cached!
2
File already cached!
3
File already cached!
4
File already cached!
5
File already cached!
6
File already cached!
7
File already cached!
8
File already cached!
9
File already cached!
10
File already cached!


NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: MicrosoftEdge=138.0.3351.95)
Stacktrace:
	GetHandleVerifier [0x0x7ff7e7128c85+23461]
	(No symbol) [0x0x7ff7e707cd70]
	GetHandleVerifier [0x0x7ff7e73a0bb8+2611928]
	(No symbol) [0x0x7ff7e6e2d2ef]
	(No symbol) [0x0x7ff7e6ebfad4]
	(No symbol) [0x0x7ff7e6ed6e5a]
	(No symbol) [0x0x7ff7e6eba423]
	(No symbol) [0x0x7ff7e6e8ea86]
	(No symbol) [0x0x7ff7e6e8dd11]
	(No symbol) [0x0x7ff7e6e8e8b3]
	(No symbol) [0x0x7ff7e6f8e6fd]
	(No symbol) [0x0x7ff7e6f9ba88]
	GetHandleVerifier [0x0x7ff7e7208acb+940523]
	GetHandleVerifier [0x0x7ff7e7211821+976705]
	(No symbol) [0x0x7ff7e708a961]
	(No symbol) [0x0x7ff7e7083344]
	(No symbol) [0x0x7ff7e7083493]
	(No symbol) [0x0x7ff7e7074f36]
	BaseThreadInitThunk [0x0x7ffb53d1e8d7+23]
	RtlUserThreadStart [0x0x7ffb5505c34c+44]
