In [None]:
import random
import time
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Command-line switches handed to Chrome; the last entry overrides the
# browser's user-agent string.
chrome_options = Options()
for argument in (
    "--headless",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
):
    chrome_options.add_argument(argument)

# webdriver_manager resolves the chromedriver binary path for the Service.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(options=chrome_options, service=service)

# Shared explicit wait used for every page load (10 second timeout).
wait = WebDriverWait(driver, 10)

# Season start years, newest first; each is paired with `year + 1` to form
# the season label used in the URLs and the "Season" column.
years = [2025, 2024, 2023]

# One DataFrame per successfully scraped team-season; concatenated at the end.
all_matches = []

# Short club names mapped to the full names used in the output columns
# (applied to the team name derived from the URL and to "Opponent").
TEAM_NAME_MAP = {
    "Manchester Utd": "Manchester United",
    "Newcastle Utd": "Newcastle United",
    "Nott'ham Forest": "Nottingham Forest",
    "Tottenham": "Tottenham Hotspur",
    "West Ham": "West Ham United",
    "Brighton": "Brighton and Hove Albion",
    "Wolves": "Wolverhampton Wanderers",
}

def check_rate_limit(page_source: str, sleep_seconds: float = 300) -> bool:
    """Detect an FBref rate-limit page and back off if one is found.

    Args:
        page_source: HTML of the page that was just loaded.
        sleep_seconds: How long to pause when the page is a rate-limit
            response. Defaults to 300 (five minutes), the previously
            hard-coded value.

    Returns:
        True if a rate-limit marker was found (after sleeping), so the
        caller should reload the page; False otherwise.
    """
    markers = ("Rate Limited Request", "429 error")
    if not any(marker in page_source for marker in markers):
        return False

    # `:g` drops the trailing ".0", so the default still prints "5 minutes".
    print(f"    üö´ Rate limited! Sleeping for {sleep_seconds / 60:g} minutes...")
    time.sleep(sleep_seconds)
    return True

# Shooting-table columns merged into each team's fixtures, keyed on "Date".
_SHOOTING_COLUMNS = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]


def _fetch_page_source(url):
    """Load ``url`` in the shared driver and return its HTML once a table exists.

    The rate-limit check runs before the wait for a ``<table>`` element, so a
    rate-limit response is retried rather than surfacing as a timeout (in case
    that page carries no table), and the reloaded page gets the same wait as a
    first load.

    Raises:
        TimeoutException: no ``<table>`` appeared within the wait timeout.
    """
    driver.get(url)
    if check_rate_limit(driver.page_source):
        # check_rate_limit() has already slept; reload the page once.
        driver.get(url)
    wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))
    return driver.page_source


def _season_team_urls(season):
    """Return the absolute squad-page URLs linked from a season's standings table."""
    standings_url = f"https://fbref.com/en/comps/9/{season}/{season}-Premier-League-Stats"
    soup = BeautifulSoup(_fetch_page_source(standings_url), "html.parser")
    standings_table = soup.select('table.stats_table')[0]
    hrefs = (anchor.get("href") for anchor in standings_table.find_all("a"))
    # Anchors without an href yield None; skip them instead of crashing on `in`.
    return [f"https://fbref.com{href}" for href in hrefs if href and '/squads' in href]


def _team_name_from_url(team_url):
    """Derive the display name of a team from the last segment of its squad URL."""
    raw_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
    return TEAM_NAME_MAP.get(raw_name, raw_name)


def _scrape_team(team_url, team_name, season):
    """Build the Premier League match rows for one team in one season.

    Args:
        team_url: Absolute URL of the team's squad page.
        team_name: Display name written to the "Team"/"HomeTeam"/"AwayTeam" columns.
        season: Season label such as "2024-2025", written to the "Season" column.

    Returns:
        A DataFrame of fixtures merged with shooting stats, or None when the
        team has to be skipped (a message is printed in that case).

    Raises:
        TimeoutException: a page never showed a table.
        ValueError: ``pd.read_html`` found no matching table.
    """
    team_page_source = _fetch_page_source(team_url)
    soup = BeautifulSoup(team_page_source, "html.parser")
    hrefs = (anchor.get("href") for anchor in soup.find_all("a"))
    shooting_links = [href for href in hrefs if href and "all_comps/shooting/" in href]
    if not shooting_links:
        print(f"    ‚ö†Ô∏è No shooting data found for {team_name}")
        return None

    shooting_page_source = _fetch_page_source(f"https://fbref.com{shooting_links[0]}")
    shooting = pd.read_html(StringIO(shooting_page_source), match="Shooting")[0]
    shooting.columns = shooting.columns.droplevel()

    # The fixtures table is parsed from the team page HTML fetched above;
    # loading the same URL a second time only adds to the request count.
    matches = pd.read_html(StringIO(team_page_source), match="Scores & Fixtures")[0]

    try:
        team_data = matches.merge(shooting[_SHOOTING_COLUMNS], on="Date")
    except (KeyError, ValueError) as e:
        # KeyError: an expected column is missing; ValueError: merge failure.
        print(f"    ‚ö†Ô∏è Merge failed for {team_name} (date mismatch?): {e}")
        return None

    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice of the merged one.
    team_data = team_data[team_data["Comp"] == "Premier League"].copy()
    if team_data.empty:
        print(f"    ‚ö†Ô∏è No Premier League data for {team_name}")
        return None

    team_data["Opponent"] = team_data["Opponent"].replace(TEAM_NAME_MAP)

    # The scraped team is the home side for "Home" rows and the away side for
    # "Away" rows; for any other venue value both columns keep the opponent.
    team_data["HomeTeam"] = team_data["Opponent"].mask(team_data["Venue"] == "Home", team_name)
    team_data["AwayTeam"] = team_data["Opponent"].mask(team_data["Venue"] == "Away", team_name)

    team_data["Season"] = season
    team_data["Team"] = team_name
    team_data["MatchID"] = (
        team_data["Season"] + "_" +
        team_data["Date"].astype(str) + "_" +
        team_data["HomeTeam"] + "_vs_" + team_data["AwayTeam"]
    )
    return team_data


# Set to True only when every season was walked without an unhandled error.
crawl_completed = False

try:
    for year in years:
        next_year = year + 1
        season = f"{year}-{next_year}"
        print(f"\nüîπ Scraping season {year}-{next_year}")

        for team_url in _season_team_urls(season):
            team_name = _team_name_from_url(team_url)
            print(f"  üìä Processing {team_name}...")

            try:
                team_data = _scrape_team(team_url, team_name, season)
            except (TimeoutException, ValueError) as exc:
                # One unreadable team page should not abort the whole crawl.
                print(f"    Skipping {team_name}: {type(exc).__name__}: {exc}")
                team_data = None

            if team_data is not None:
                all_matches.append(team_data)
                print(f"    ‚úÖ Added {len(team_data)} matches for {team_name}")

            # Pause after every team, including skipped ones: requests were
            # made either way.
            time.sleep(random.uniform(30, 60))

        time.sleep(random.uniform(60, 120))

    crawl_completed = True

finally:
    try:
        driver.quit()
    finally:
        # Runs even after an error or Ctrl-C so collected data is not lost.
        if all_matches:
            final_df = pd.concat(all_matches, ignore_index=True)

            for col in ["HomeTeam", "AwayTeam", "Team", "Opponent"]:
                final_df[col] = final_df[col].replace(TEAM_NAME_MAP)

            # NOTE(review): each fixture is scraped once per participating
            # team; this keeps only the first-scraped row, so team-specific
            # columns reflect that team's side of the match. Confirm this is
            # intended.
            final_df = final_df.drop_duplicates(subset=["Season", "Date", "HomeTeam", "AwayTeam"])

            if crawl_completed:
                output_path = "premier_league_matches2.csv"
                print(f"\n‚úÖ Scraping complete! Total matches: {len(final_df)}")
            else:
                # Keep an interrupted run from overwriting a complete dataset.
                output_path = "premier_league_matches2.partial.csv"
                print(f"\nScraping did not finish; saving {len(final_df)} matches to {output_path}")

            final_df.to_csv(output_path, index=False)
            print("\nDataset columns:", final_df.columns.tolist())
            print(final_df[["Season", "Date", "HomeTeam", "AwayTeam", "GF", "GA", "Result"]].head(10))
        else:
            print("‚ùå No data collected.")


üîπ Scraping season 2025-2026
  üìä Processing Arsenal...
    ‚úÖ Added 7 matches for Arsenal
  üìä Processing Liverpool...
    ‚úÖ Added 7 matches for Liverpool
  üìä Processing Tottenham Hotspur...
    ‚úÖ Added 7 matches for Tottenham Hotspur
  üìä Processing Bournemouth...
    ‚úÖ Added 7 matches for Bournemouth
  üìä Processing Manchester City...
    ‚úÖ Added 7 matches for Manchester City
  üìä Processing Crystal Palace...
    ‚úÖ Added 7 matches for Crystal Palace
  üìä Processing Chelsea...
    ‚úÖ Added 7 matches for Chelsea
  üìä Processing Everton...
    ‚úÖ Added 7 matches for Everton
  üìä Processing Sunderland...
    ‚úÖ Added 7 matches for Sunderland
  üìä Processing Manchester United...
    ‚úÖ Added 7 matches for Manchester United
  üìä Processing Newcastle United...
    ‚úÖ Added 7 matches for Newcastle United
  üìä Processing Brighton and Hove Albion...
    ‚úÖ Added 7 matches for Brighton and Hove Albion
  üìä Processing Aston Villa...
    ‚úÖ Added 7