In [36]:
import requests
import pandas as pd
import time
from lxml import html

In [44]:
def get_details(url, timeout=30):
    """Fetch one Baseball-Reference box score page and extract its summary.

    Args:
        url: Site-relative path of the box score page; it is appended to
            ``https://www.baseball-reference.com``.
        timeout: Seconds to wait on the HTTP request before giving up, so a
            stalled connection cannot hang the whole scrape.

    Returns:
        A dict with the keys ``date``, ``home_team``, ``visitor_team``,
        ``home_score``, ``visitor_score``, ``venue`` and ``attendance``.
        Any field whose XPath query matches nothing is set to ``"N/A"``.
        An empty dict is returned when the request or the parsing fails, so
        callers can skip the game with a simple truthiness check.
    """
    full_url = f"https://www.baseball-reference.com{url}"
    # Browser-like User-Agent, matching the one the schedule request in this
    # file sends, so both kinds of request identify themselves the same way.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
    }
    try:
        response = requests.get(full_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        tree = html.fromstring(response.content)

        def first(query):
            """Return the first XPath match for *query*, or None if no match."""
            matches = tree.xpath(query)
            return matches[0] if matches else None

        scorebox = '//div[@class="scorebox"]'

        date = first(f'({scorebox}/div[@class="scorebox_meta"]/div)[1]/text()')
        # Index 1 is read as the visitor and index 2 as the home side, for
        # both the team links and the score boxes.
        visitor_team = first(f'({scorebox}//strong/a)[1]/text()')
        home_team = first(f'({scorebox}//strong/a)[2]/text()')
        visitor_score = first(f'({scorebox}//div[@class="score"])[1]/text()')
        home_score = first(f'({scorebox}//div[@class="score"])[2]/text()')

        # The text node following the <strong> label carries a two-character
        # prefix that is cut off with [2:] (presumably ": " -- confirm against
        # a live page).
        venue = first(f'{scorebox}//div/strong[text()="Venue"]/following-sibling::text()')
        attendance = first(f'{scorebox}//div/strong[text()="Attendance"]/following-sibling::text()')

        return {
            "date": date if date is not None else "N/A",
            "home_team": home_team if home_team is not None else "N/A",
            "visitor_team": visitor_team if visitor_team is not None else "N/A",
            "home_score": home_score if home_score is not None else "N/A",
            "visitor_score": visitor_score if visitor_score is not None else "N/A",
            "venue": venue.strip()[2:] if venue is not None else "N/A",
            # Thousands separators are removed so the value is digits only.
            "attendance": (
                attendance.strip()[2:].replace(",", "")
                if attendance is not None
                else "N/A"
            ),
        }
    except Exception as e:
        # Deliberately broad: a single bad page (network error, HTTP error,
        # unparsable HTML) must not abort the surrounding scrape loop.
        print(f"Error fetching boxscore details: {e}")
        return {}
    

def scrape_mlb_games(year):
    """Scrape every game listed on Baseball-Reference's schedule for one year.

    The schedule page for *year* is downloaded, each box score link on it is
    followed via ``get_details``, and the results are gathered into a table.

    Args:
        year: Season year used to build the schedule URL.

    Returns:
        A ``pandas.DataFrame`` with the columns ``date``, ``visitor_team``,
        ``visitor_score``, ``home_team``, ``home_score``, ``stadium`` and
        ``attendance`` (one row per game whose details could be fetched).
        An empty DataFrame is returned if the schedule request or its
        processing fails.
    """
    url = f"https://www.baseball-reference.com/leagues/majors/{year}-schedule.shtml"
    # No explicit Accept-Encoding: advertising "br" by hand invites a Brotli
    # response that requests can only decode when an optional Brotli package
    # is installed. Leaving it out lets requests advertise what it supports.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
    }

    try:
        # A timeout keeps a stalled connection from blocking the run forever.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        tree = html.fromstring(response.content)

        links = tree.xpath('//div[@class="section_content"]/div/p[@class="game"]/em/a/@href')
        games = []

        for link in links:
            if not link:
                continue

            # Politeness delay before every box score request.
            time.sleep(3)
            details = get_details(link)
            if not details:
                # get_details returns an empty dict on failure; skip the game.
                continue

            games.append({
                "date": details["date"],
                "visitor_team": details["visitor_team"],
                "visitor_score": details["visitor_score"],
                "home_team": details["home_team"],
                "home_score": details["home_score"],
                "stadium": details["venue"],
                "attendance": details["attendance"],
            })

        return pd.DataFrame(games)
    except Exception as e:
        # Best-effort boundary: report the failure and hand back an empty
        # frame so a multi-year run can continue with the next season.
        print(f"Error scraping games for {year}: {e}")
        return pd.DataFrame()

In [None]:
def scrape_mlb_games_all_years(start_year=2000):
    """Scrape MLB games for every year from *start_year* through this year.

    Args:
        start_year: First season to scrape (inclusive). The last season is
            the current calendar year.

    Returns:
        A single ``pandas.DataFrame`` with all non-empty per-year results
        concatenated and re-indexed from zero. If no year produced any rows
        (or *start_year* lies in the future), an empty DataFrame is returned.
    """
    end_year = pd.Timestamp.now().year
    season_frames = []

    for year in range(start_year, end_year + 1):
        print(f"Scraping {year}...")
        df = scrape_mlb_games(year)
        if not df.empty:
            season_frames.append(df)
        time.sleep(3)  # Politeness delay

    # pd.concat raises ValueError on an empty list, so handle "nothing
    # scraped" explicitly instead of crashing at the very end of a long run.
    if not season_frames:
        return pd.DataFrame()

    return pd.concat(season_frames, ignore_index=True)

# Example usage: scrape every season from 2000 onward and preview the result.
# NOTE(review): the variable name says "nfl" although the data is MLB; the
# name is kept as-is in case other cells refer to it.
df_all_nfl = scrape_mlb_games_all_years(2000)
print(df_all_nfl.head(n=5))
