In [None]:
import pandas as pd
from lxml import html
import requests
import time

In [1]:
def scrape_nhl_games(year, timeout=30):
    """
    Scrape NHL game results for a given year from Hockey Reference.

    Args:
        year (int): The year substituted into the ``NHL_{year}_games.html`` URL.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        pd.DataFrame: One row per game with the columns ``date``,
        ``visitor_team``, ``visitor_goals``, ``home_team``, ``home_goals``,
        ``attendance``, ``game_length`` and ``arena``. Values are the raw
        strings from the page (``None`` where a cell is missing or empty).
        An empty DataFrame is returned when the page cannot be fetched or
        parsed.
    """
    # Format the URL for the given year
    url = f"https://www.hockey-reference.com/leagues/NHL_{year}_games.html"

    # "Accept-Encoding" is deliberately not set here: requests advertises
    # only the encodings it is able to decode. Hard-coding "br" would allow
    # the server to answer with Brotli even when no Brotli decoder is
    # installed, leaving response.content as undecoded bytes.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
    }

    # Output column -> XPath (relative to a table row) of the text to extract.
    field_xpaths = {
        'date': './th[@data-stat="date_game"]/a/text()',
        'visitor_team': './td[@data-stat="visitor_team_name"]/a/text()',
        'visitor_goals': './td[@data-stat="visitor_goals"]/text()',
        'home_team': './td[@data-stat="home_team_name"]/a/text()',
        'home_goals': './td[@data-stat="home_goals"]/text()',
        'attendance': './td[@data-stat="attendance"]/text()',
        'game_length': './td[@data-stat="game_duration"]/text()',
        'arena': './td[@data-stat="arena_name"]/a/text()',
    }

    try:
        # Fetch the webpage content
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions

        # Parse the HTML content and select the rows of the games table body
        tree = html.fromstring(response.content)
        rows = tree.xpath('//table[@id="games"]/tbody/tr')

        games = []
        for row in rows:
            game = {}
            for column, xpath in field_xpaths.items():
                matches = row.xpath(xpath)
                game[column] = matches[0] if matches else None

            # Skip rows that contain none of the expected cells (for example
            # a repeated header row); they would only add all-None records.
            if all(value is None for value in game.values()):
                continue

            # Attendance is formatted with thousands separators ("18,532").
            if game['attendance'] is not None:
                game['attendance'] = game['attendance'].replace(",", "")

            games.append(game)

        # Convert the games list to a pandas DataFrame
        return pd.DataFrame(games)

    except requests.RequestException as e:
        # Network problems, timeouts and HTTP error statuses end up here.
        print(f"Error fetching the webpage: {e}")
        return pd.DataFrame()
    except Exception as e:
        # Best-effort scraper: any parsing failure yields an empty result
        # rather than aborting a multi-year run.
        print(f"An error occurred: {e}")
        return pd.DataFrame()


In [None]:
def scrape_nhl_games_all_years(start_year=2000, end_year=None, delay=3):
    """
    Scrape NHL game results for every year in a range and combine them.

    Args:
        start_year (int): First year to scrape (inclusive). Defaults to 2000.
        end_year (int | None): Last year to scrape (inclusive). Defaults to
            the current calendar year.
        delay (float): Seconds to pause between consecutive requests.

    Returns:
        pd.DataFrame: The per-year results concatenated with a fresh index.
        An empty DataFrame is returned when no year produced any rows.
    """
    if end_year is None:
        end_year = pd.Timestamp.now().year

    all_games = []

    # Iterate over years
    for year in range(start_year, end_year + 1):
        print(f"Scraping {year}...")
        df = scrape_nhl_games(year)
        if not df.empty:
            all_games.append(df)
        # Pause between requests; no need to wait after the last one.
        if year < end_year:
            time.sleep(delay)

    # pd.concat raises ValueError on an empty list, which would happen if
    # every request failed or the range was empty.
    if not all_games:
        return pd.DataFrame()

    # Combine all dataframes into one
    return pd.concat(all_games, ignore_index=True)

# Example usage: run the full scrape when this cell executes. This calls
# scrape_nhl_games once per year and sleeps between requests, so it takes
# a while to finish.
df_all_nhl = scrape_nhl_games_all_years()


In [11]:
# Save the combined results to NHL.csv (path relative to the current working
# directory); index=False leaves the DataFrame index out of the file.
df_all_nhl.to_csv('NHL.csv', index=False)