In [66]:
import requests
import pandas as pd
import time
from lxml import html

In [68]:
def _first_text(tree, xpath, default=None):
    """Return the first result of ``xpath`` evaluated on ``tree``, or ``default`` if none."""
    matches = tree.xpath(xpath)
    return matches[0] if matches else default


def get_details(url):
    """Fetch one boxscore page and extract teams, scores, stadium and attendance.

    Args:
        url: Site-relative boxscore path; it is appended to
            ``https://www.pro-football-reference.com``.

    Returns:
        dict with keys ``home_team``, ``visitor_team``, ``home_score``,
        ``visitor_score``, ``stadium`` and ``attendance``. Values are the text
        found on the page; commas are stripped from ``attendance``.
        ``stadium`` and ``attendance`` are ``None`` when the page has no
        matching link. Returns ``{}`` when the request fails or a team/score
        cannot be found; the error is printed, never raised.
    """
    full_url = f"https://www.pro-football-reference.com{url}"
    # Send a browser-like User-Agent, as scrape_nfl_games does for the season page.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive"
    }
    try:
        # Without a timeout a stalled connection would block the scrape forever.
        response = requests.get(full_url, headers=headers, timeout=30)
        response.raise_for_status()
        tree = html.fromstring(response.content)

        # Extract details; the visitor is listed first in the scorebox, the home team second.
        details = {
            "home_team": _first_text(tree, '(//div[@class="scorebox"]//strong/a)[2]/text()'),
            "visitor_team": _first_text(tree, '(//div[@class="scorebox"]//strong/a)[1]/text()'),
            "home_score": _first_text(tree, '(//div[@class="scorebox"]//div[@class="score"])[2]/text()'),
            "visitor_score": _first_text(tree, '(//div[@class="scorebox"]//div[@class="score"])[1]/text()'),
            "stadium": _first_text(tree, '(//div[@class="scorebox_meta"]//div/a)[1]/text()'),
            "attendance": _first_text(tree, '(//div[@class="scorebox_meta"]//div/a)[2]/text()'),
        }

        # Teams and scores are essential; without them the page is not a usable boxscore.
        required = ("home_team", "visitor_team", "home_score", "visitor_score")
        missing = [key for key in required if details[key] is None]
        if missing:
            raise ValueError(f"missing {missing} on {full_url}")

        # Stadium/attendance are optional: keep the game and leave the gap as None
        # instead of discarding the whole record.
        if details["attendance"] is not None:
            details["attendance"] = details["attendance"].replace(",", "")

        return details

    except Exception as e:
        # Best-effort: report the problem and let the caller skip this game.
        print(f"Error fetching boxscore details: {e}")
        return {}

def scrape_nfl_games(year):
    """Scrape all games linked from the season games page for ``year``.

    The season page is fetched once; for every table row that has a boxscore
    link, ``get_details`` is called on that link, followed by a 3-second pause.

    Args:
        year: Season year, interpolated into ``/years/{year}/games.htm``.

    Returns:
        pandas.DataFrame with columns ``date``, ``visitor_team``,
        ``visitor_score``, ``home_team``, ``home_score``, ``stadium`` and
        ``attendance``, one row per game whose details could be fetched.
        Rows without a boxscore link, and games for which ``get_details``
        returns an empty dict, are skipped. An empty DataFrame is returned
        (and the error printed) if the season page cannot be fetched or parsed.
    """
    url = f"https://www.pro-football-reference.com/years/{year}/games.htm"
    # Accept-Encoding is deliberately not set: requests advertises only the
    # encodings it can decode, whereas hard-coding "br" risks an undecodable
    # brotli body when no brotli package is installed.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive"
    }

    try:
        # Without a timeout a stalled connection would block the scrape forever.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        tree = html.fromstring(response.content)

        # Skip the repeated header rows inside the table body.
        rows = tree.xpath('//table[@id="games"]/tbody/tr[not(contains(@class, "thead"))]')
        games = []

        for row in rows:
            boxscore_link = row.xpath('./td[@data-stat="boxscore_word"]/a/@href')
            if not boxscore_link:
                continue

            date = row.xpath('./td[@data-stat="game_date"]/text()')
            details = get_details(boxscore_link[0])
            time.sleep(3)  # Politeness delay after every boxscore request
            if not details:
                # get_details already printed the reason; skip this game.
                continue

            games.append({
                "date": date[0] if date else None,
                "visitor_team": details["visitor_team"],
                "visitor_score": details["visitor_score"],
                "home_team": details["home_team"],
                "home_score": details["home_score"],
                "stadium": details["stadium"],
                "attendance": details["attendance"]
            })

        return pd.DataFrame(games)
    except Exception as e:
        # Best-effort: report the failure and hand back an empty frame.
        print(f"Error scraping games for {year}: {e}")
        return pd.DataFrame()


In [None]:
def scrape_nfl_games_all_years(start_year=2000, end_year=None):
    """Scrape every season from ``start_year`` through ``end_year`` inclusive.

    Args:
        start_year: First season to scrape.
        end_year: Last season to scrape; defaults to the current calendar year.

    Returns:
        pandas.DataFrame concatenating the per-season frames returned by
        ``scrape_nfl_games`` with a fresh 0..n-1 index. Seasons that yield an
        empty frame are left out. An empty DataFrame is returned when no
        season produced any rows (including when the year range is empty).
    """
    if end_year is None:
        end_year = pd.Timestamp.now().year
    all_games = []

    for year in range(start_year, end_year + 1):
        print(f"Scraping {year}...")
        df = scrape_nfl_games(year)
        if not df.empty:
            all_games.append(df)
        if year < end_year:
            time.sleep(3)  # Politeness delay between season pages

    if not all_games:
        # pd.concat raises ValueError on an empty list; return an empty frame instead.
        return pd.DataFrame()

    return pd.concat(all_games, ignore_index=True)

# Run the full scrape, beginning with the 2000 season, and keep the combined table.
START_YEAR = 2000
df_all_nfl = scrape_nfl_games_all_years(start_year=START_YEAR)


In [70]:
# Write the combined games table to disk without the DataFrame index column.
output_path = 'NFL.csv'
df_all_nfl.to_csv(output_path, index=False)