# Update NHL Data

This Jupyter Notebook is scheduled to run every day, scraping win predictions, betting odds, and historical game results from MoneyPuck, ESPN, and Hockey Reference, respectively. The results are stored in CSV files to be manually committed to GitHub when convenient.

In [2]:
# Import required libraries
import numpy as np
import pandas as pd
import requests 
from bs4 import BeautifulSoup
import warnings 
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import datetime as dt

#!pip install selenium
#!pip install webdriver_manager
#!pip install jupyter_scheduler
from selenium import webdriver 
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService 
from webdriver_manager.chrome import ChromeDriverManager 

## Scrape MoneyPuck Data

In [8]:
def get_todays_matchups():
    """Build the MoneyPuck index-page URL for today's local date.

    Returns:
        str: The MoneyPuck index URL whose ``date`` query parameter is
        today's date in ISO ``YYYY-MM-DD`` form.
    """
    base_url = "https://moneypuck.com/index.html?date="
    return base_url + dt.date.today().isoformat()

def update_matchups(matchup_df, url=None):
    """Append scraped MoneyPuck win probabilities to ``matchup_df``.

    Loads ``url`` in headless Chrome, walks every ``<tr>`` inside the first
    element with id ``includedContent`` and records one row per game.

    Args:
        matchup_df: DataFrame of previously scraped predictions.
        url: Page to scrape. When omitted, today's MoneyPuck URL is built at
            call time (not at definition time, so a long-lived kernel never
            reuses a stale date).

    Returns:
        A DataFrame holding the existing rows followed by one row per scraped
        game with the keys ``date``, ``away_team``, ``away_win_prob``,
        ``home_team`` and ``home_win_prob``; the index is renumbered. If no
        game rows were found, ``matchup_df`` is returned unchanged.

    Raises:
        IndexError: If the page has no ``includedContent`` element.
    """
    if url is None:
        url = get_todays_matchups()

    options = webdriver.ChromeOptions()
    # ``options.headless`` is deprecated/removed in Selenium 4; the
    # command-line switch works across versions.
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)

    # NOTE(review): rows are stamped with today's date even when a caller
    # passes a ``url`` for another day - confirm that is intended.
    date = str(dt.date.today())
    rows = []
    try:
        driver.get(url)

        # Select the win probability table.
        win_table = driver.find_elements(By.ID, 'includedContent')[0]
        for game in win_table.find_elements(By.TAG_NAME, "tr"):
            win_probs = game.find_elements(By.TAG_NAME, "h2")
            teams = game.find_elements(By.TAG_NAME, "img")
            if len(win_probs) < 2 or len(teams) < 2:
                # Not a matchup row (e.g. a header/spacer row); skip it
                # instead of failing the whole scrape.
                continue

            # Away team is listed first, home team second.
            rows.append({
                "date": date,
                "away_team": teams[0].get_attribute("alt"),
                # Strip the % symbol and convert to a 0-1 probability.
                "away_win_prob": float(win_probs[0].text.replace("%", "")) / 100,
                "home_team": teams[1].get_attribute("alt"),
                "home_win_prob": float(win_probs[1].text.replace("%", "")) / 100,
            })
    finally:
        # quit() ends the browser and the chromedriver process, even when
        # scraping raised part-way through.
        driver.quit()

    if not rows:
        return matchup_df
    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    # of growing the frame row by row.
    return pd.concat([matchup_df, pd.DataFrame(rows)], ignore_index=True)

In [164]:
# Load the stored predictions, using the first CSV column as the index.
moneypuck_df = pd.read_csv("Historical_Moneypuck_Predictions.csv", index_col=0)
# Append the rows scraped by update_matchups (default URL).
moneypuck_df = update_matchups(moneypuck_df)
# Notebook display: show the last 10 rows as a sanity check.
moneypuck_df.tail(10)

Unnamed: 0,date,away_team,away_win_prob,home_team,home_win_prob
19,2023-11-02,CAROLINA HURRICANES,0.517,NEW YORK RANGERS,0.483
20,2023-11-02,LOS ANGELES KINGS,0.506,OTTAWA SENATORS,0.494
21,2023-11-02,NEW YORK ISLANDERS,0.521,WASHINGTON CAPITALS,0.479
22,2023-11-02,TORONTO MAPLE LEAFS,0.458,BOSTON BRUINS,0.542
23,2023-11-02,NEW JERSEY DEVILS,0.559,MINNESOTA WILD,0.441
24,2023-11-02,DALLAS STARS,0.416,EDMONTON OILERS,0.584
25,2023-11-02,MONTREAL CANADIENS,0.468,ARIZONA COYOTES,0.532
26,2023-11-02,NASHVILLE PREDATORS,0.465,SEATTLE KRAKEN,0.535
27,2023-11-02,WINNIPEG JETS,0.447,VEGAS GOLDEN KNIGHTS,0.553
28,2023-11-02,VANCOUVER CANUCKS,0.687,SAN JOSE SHARKS,0.313


In [6]:
# Write moneypuck_df (index included) to Historical_Moneypuck_Predictions.csv.
moneypuck_df.to_csv("Historical_Moneypuck_Predictions.csv")

### Scrape Daily Lines

In [11]:
def convert_odds(odds):
    """Convert American odds to decimal odds.

    Args:
        odds: A single number, or an array-like / pandas Series of numbers.
            Negative values are favorites, non-negative values underdogs.

    Returns:
        For a scalar, the decimal odds as a float. For a pandas Series, a
        Series with the same index and name. For any other array-like, a
        NumPy float array of the same shape.
    """
    if np.ndim(odds) == 0:
        if odds < 0:  # Favorite (negative American odds)
            return -1 / (odds / 100) + 1
        return (odds / 100) + 1

    values = np.asarray(odds, dtype=float)
    is_favorite = values < 0
    # Both branches of np.where are evaluated, so substitute a harmless
    # divisor for non-favorites (including 0) to avoid dividing by zero.
    divisor = np.where(is_favorite, values, -100.0)
    decimal_odds = np.where(is_favorite, -1 / (divisor / 100) + 1, values / 100 + 1)
    if isinstance(odds, pd.Series):
        return pd.Series(decimal_odds, index=odds.index, name=odds.name)
    return decimal_odds

def update_odds(odds_df):
    """Append today's ESPN NHL betting lines to ``odds_df``.

    Loads https://www.espn.com/nhl/lines in headless Chrome and walks every
    ``<tr>`` on the page in groups of three: a header row followed by two
    team rows. The team row whose last text line splits into six tokens is
    treated as the away team (it also carries the over/under); the other is
    treated as the home team.

    Args:
        odds_df: DataFrame of previously scraped odds.

    Returns:
        A DataFrame holding the existing rows followed by one row per game
        with the keys ``date``, ``away_team``, ``away_odds``, ``home_team``,
        ``home_odds``, ``over_under``, ``away_puck_line``, ``away_puck_odds``,
        ``home_puck_line`` and ``home_puck_odds``; the index is renumbered.
        All odds are decimal odds. If no complete game was parsed,
        ``odds_df`` is returned unchanged.

    Raises:
        ValueError: If an odds or line token is not numeric.
    """
    columns = ["date", "away_team", "away_odds", "home_team", "home_odds",
               "over_under", "away_puck_line", "away_puck_odds",
               "home_puck_line", "home_puck_odds"]

    options = webdriver.ChromeOptions()
    # ``options.headless`` is deprecated/removed in Selenium 4; the
    # command-line switch works across versions.
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)

    date = str(dt.date.today())
    rows = []
    game = {"date": date}
    try:
        driver.get('https://www.espn.com/nhl/lines')
        for i, table_row in enumerate(driver.find_elements(By.TAG_NAME, 'tr')):
            if i % 3 == 0:
                # Header of a game table: start a fresh record so values
                # from the previous game can never leak into this one.
                game = {"date": date}
                continue

            team_data = table_row.text.splitlines()
            if not team_data:
                continue
            # The line containing the team's odds is the last of the row.
            odds_line = team_data[-1].split(" ")
            team = team_data[0].upper()

            # Numeric fields start at index 2; the original author notes the
            # leading tokens come from the goalie name being split.
            if len(odds_line) == 6:  # The over is always listed on the away team's row
                game["away_team"] = team
                game["away_odds"] = convert_odds(int(odds_line[2]))
                game["over_under"] = float(odds_line[3])
                game["away_puck_line"] = float(odds_line[4])
                game["away_puck_odds"] = convert_odds(int(odds_line[5]))
            else:
                game["home_team"] = team
                game["home_odds"] = convert_odds(int(odds_line[2]))
                game["home_puck_line"] = float(odds_line[3])
                game["home_puck_odds"] = convert_odds(int(odds_line[4]))

            # Emit the game after its second team row rather than waiting for
            # the next header, so the last game on the page is not dropped.
            if i % 3 == 2 and len(game) == len(columns):
                rows.append({column: game[column] for column in columns})
    finally:
        # quit() ends the browser and the chromedriver process, even when
        # scraping raised part-way through.
        driver.quit()

    if not rows:
        return odds_df
    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    # of growing the frame row by row.
    return pd.concat([odds_df, pd.DataFrame(rows)], ignore_index=True)

In [157]:
# Load the stored odds, using the first CSV column as the index.
odds_df = pd.read_csv("Historical_Odds.csv", index_col=0)
# Append the rows scraped by update_odds.
odds_df = update_odds(odds_df)
# Notebook display: show the last 10 rows as a sanity check.
odds_df.tail(10)

PermissionError: [WinError 5] Access is denied: 'C:\\Users\\geniu\\.wdm\\drivers\\chromedriver\\win64\\118.0.5993.70\\chromedriver-win32\\chromedriver.exe' -> 'C:\\Users\\geniu\\.wdm\\drivers\\chromedriver\\win64\\118.0.5993.70\\chromedriver.exe'

In [13]:
# Write odds_df (index included) to Historical_Odds.csv.
odds_df.to_csv("Historical_Odds.csv")

### Scrape Past Games

In [3]:
def update_past_games(historical_df):
    """Append the final scores listed on Hockey Reference to ``historical_df``.

    Fetches https://www.hockey-reference.com/boxscores/ and reads every
    element with class ``teams``. Each game is stamped with yesterday's date.

    Args:
        historical_df: DataFrame of previously recorded game outcomes.

    Returns:
        A DataFrame holding the existing rows followed by one row per game
        with the keys ``date``, ``away_team``, ``away_score``, ``home_team``
        and ``home_score`` (scores are the scraped text); the index is
        renumbered. If no games were found, ``historical_df`` is returned
        unchanged.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    # A timeout keeps a scheduled run from hanging forever, and an explicit
    # status check surfaces blocked/failed requests instead of silently
    # recording nothing.
    hockey_scores = requests.get(
        "https://www.hockey-reference.com/boxscores/", timeout=30
    )
    hockey_scores.raise_for_status()
    # Name the parser so results do not depend on which optional parsers
    # happen to be installed.
    soup = BeautifulSoup(hockey_scores.text, "html.parser")

    date = str(dt.date.today() - dt.timedelta(days=1))
    rows = []
    for game in soup.find_all(attrs={"class": "teams"}):
        teams = game.find_all("a")[::2]  # Ignore the "Final", since it's not a team
        scores = game.find_all(attrs={"class": "right"})[:3:2]
        if len(teams) < 2 or len(scores) < 2:
            # Incomplete entry: no (away, home) pair to record.
            continue
        # Away team is listed first, home team second.
        rows.append({
            "date": date,
            "away_team": teams[0].get_text().upper(),
            "away_score": scores[0].get_text(),
            "home_team": teams[1].get_text().upper(),
            "home_score": scores[1].get_text(),
        })

    if not rows:
        return historical_df
    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    # of growing the frame row by row.
    return pd.concat([historical_df, pd.DataFrame(rows)], ignore_index=True)

Unnamed: 0,date,away_team,away_score,home_team,home_score
0,2023-11-01,ARIZONA COYOTES,3,ANAHEIM DUCKS,4
1,2023-11-01,DALLAS STARS,4,CALGARY FLAMES,3
2,2023-11-01,ST. LOUIS BLUES,1,COLORADO AVALANCHE,4
3,2023-11-01,BUFFALO SABRES,5,PHILADELPHIA FLYERS,2


In [4]:
# Load the stored game outcomes, using the first CSV column as the index.
# read_csv is a pandas module-level function and its result must be bound;
# the previous `historical_df.read_csv(...)` call could never work.
historical_df = pd.read_csv("Game_Outcomes.csv", index_col=0)
# Append the rows scraped by update_past_games.
historical_df = update_past_games(historical_df)
# Notebook display: show the last rows as a sanity check.
historical_df.tail()

In [None]:
# Write historical_df (index included) to Game_Outcomes.csv.
historical_df.to_csv("Game_Outcomes.csv")