In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

def scrape_icydata_player_stats(id, year):
    # Construct the URL based on the provided id
    url = f"https://www.icydata.hockey/player_stats/{id}"

    # Send an HTTP GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the table with the id "player-stats"
        table = soup.find('table', {'id': 'player-stats'})

        if table:
            # Extract data from the table
            rows = table.find_all('tr')
            data = []

            for row in rows:
                data_cells = row.find_all('td')
                if data_cells:
                    data.append([cell.get_text(strip=True) for cell in data_cells])

            # Assuming the first row is the header
            header_cells = rows[0].find_all('th')
            headers = [cell.get_text(strip=True) for cell in header_cells]

            # Create a pandas DataFrame
            df = pd.DataFrame(data, columns=headers)

            # Create a Series with 2023
            year_series = pd.Series([year] * len(df))

            # Assign the Series to the 'Year' column in the DataFrame
            df['Year'] = year_series

            # Return the DataFrame
            return df

        else:
            print("Table with id 'player-stats' not found on the page.")
            return None

    else:
        print("Failed to retrieve the page. Status code:", response.status_code)
        return None

In [4]:
# 2023 Player States:
id_to_scrape = 39
target_year = 2023
df_2023 = scrape_icydata_player_stats(id_to_scrape, target_year)
df_2023.to_csv(f'IceHockey{target_year}.csv', encoding='utf-8')
df_2023

Unnamed: 0,Player,Team,GP,G,A,Pts,+/-,PN,PIM,S,SB,MS,H,GV,TK,BS,FW,FL,F%,Year
0,Jake Livingstone,NSH,5,0,1,1,-2,1,2,3,4,2,6,3,0,12,0,0,0,2023
1,Philip Tomasino,NSH,31,5,13,18,5,3,6,53,21,26,28,15,10,16,9,8,52.94,2023
2,Luke Evangelista,NSH,24,7,8,15,7,3,6,54,17,7,9,9,15,6,0,0,0,2023
3,Spencer Stastney,NSH,8,0,2,2,4,1,2,9,3,2,2,3,2,9,0,0,0,2023
4,Tommy Novak,NSH,42,13,24,37,4,3,6,71,32,27,6,13,24,21,161,181,47.08,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
954,Mikael Granlund,NSH,79,10,32,42,-15,10,20,121,51,51,78,33,38,61,171,213,44.53,2023
955,Filip Forsberg,NSH,50,19,23,42,-3,10,20,157,81,84,86,29,34,20,0,1,0,2023
956,Colton Sissons,NSH,82,12,18,30,-3,10,20,88,43,48,164,27,32,72,591,516,53.39,2023
957,Yakov Trenin,NSH,77,12,12,24,-7,18,47,140,52,64,167,43,44,30,14,22,38.89,2023


In [5]:
# 2023 Player States:
id_to_scrape = 36
target_year = 2022
df_2022 = scrape_icydata_player_stats(id_to_scrape, target_year)
df_2022.to_csv(f'IceHockey{target_year}.csv', encoding='utf-8')
df_2022

Unnamed: 0,Player,Team,GP,G,A,Pts,+/-,PN,PIM,S,SB,MS,H,GV,TK,BS,FW,FL,F%,Year
0,Viktor Lodin,OTT,1,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,2022
1,Chris Wagner,BOS,1,0,0,0,-1,0,0,2,0,1,11,0,0,0,0,1,0,2022
2,Kent Johnson,CBJ,9,0,3,3,2,1,2,5,7,8,4,2,2,1,0,0,0,2022
3,Zac Jones,NYR,12,0,2,2,-7,0,0,8,3,6,4,12,15,13,0,0,0,2022
4,Hunter Drew,ANA,2,0,0,0,0,1,5,0,1,0,5,0,0,0,0,0,0,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1007,John Marino,PIT,81,1,23,24,4,10,23,90,62,39,73,35,43,88,0,0,0,2022
1008,Drew O'Connor,PIT,22,3,2,5,-2,2,4,40,8,14,12,5,9,7,18,34,34.62,2022
1009,Jan Rutta,TBL,76,3,15,18,24,22,49,86,68,44,93,18,6,71,0,0,0,2022
1010,Teddy Blueger,PIT,65,9,21,30,12,5,10,93,20,21,87,22,37,32,356,318,52.82,2022


In [24]:
def scrape_nhl_salaries(year):
    # Construct the URL based on the provided year
    url = f"https://www.hockey-reference.com/friv/current_nhl_salaries.cgi"

    # Send an HTTP GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the table with the id "salaries"
        table = soup.find('table', {'id': 'salaries'})

        if table:
            # Extract data from the table
            rows = table.find_all('tr')
            data = []

            for row in rows:
                # Check if it's a header row (th elements) or data row (td elements)
                cells = row.find_all(['th', 'td'])
                row_data = [cell.get_text(strip=True) for cell in cells]
                data.append(row_data)

            # Assuming the first row is the header
            header_cells = rows[0].find_all('th')
            headers = [cell.get_text(strip=True) for cell in header_cells]

            # Create a pandas DataFrame
            df = pd.DataFrame(data, columns=headers)

            # Create a Series with the provided year
            year_series = pd.Series([year] * len(df))

            # Assign the Series to the 'Year' column in the DataFrame
            df['Year'] = year_series

            # Return the DataFrame
            return df

        else:
            print("Table with id 'salaries' not found on the page.")
            return None

    else:
        print("Failed to retrieve the page. Status code:", response.status_code)
        return None

In [26]:
# Example usage:
year_to_scrape = 2024

data_frame_salaries = scrape_nhl_salaries(year_to_scrape)

data_frame_salaries.to_csv('Player2024Salary.csv', encoding='utf-8')
data_frame_salaries

Unnamed: 0,Player,Tm,Salary,Cap Hit,Year
0,Player,Tm,Salary,Cap Hit,2024
1,Kirill Kaprizov,MIN,12500000,9000000,2024
2,Adam Fox,NYR,12000000,9500000,2024
3,Drew Doughty,LAK,11000000,11000000,2024
4,Cale Makar,COL,11000000,9000000,2024
...,...,...,...,...,...
711,Justin Sourdif,FLA,775000,847500,2024
712,Oliver Ekman-Larsson,FLA,775000,2250000,2024
713,Uvis Balinskis,FLA,775000,870000,2024
714,"Cuylle, Will",NYR,775000,828333,2024
