In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from io import StringIO
import random
from fake_useragent import UserAgent

# Initialize a UserAgent instance for random User-Agent headers
ua = UserAgent()

# Function to introduce randomized delays
def randomized_delay(min_delay=1, max_delay=5):
    delay = random.uniform(min_delay, max_delay)
    print(f"Delaying for {delay:.2f} seconds...")
    time.sleep(delay)

# Enhanced fetch function
def fetch_with_backoff(url, retries=5, min_delay=1, max_delay=5, long_wait=3600, use_proxy=False):
    session = requests.Session()
    headers = {
        "User-Agent": ua.random,
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Referer": "https://www.google.com/"
    }
    
    proxies = {
        "http": "http://your_proxy:port",
        "https": "https://your_proxy:port"
    } if use_proxy else None

    for attempt in range(retries):
        try:
            response = session.get(url, headers=headers, proxies=proxies, timeout=10)
            if response.status_code == 200:
                return response
            elif response.status_code in [429, 500]:
                print(f"Rate limit or server error. Retrying in {min_delay} to {max_delay} seconds (Attempt {attempt + 1}/{retries})...")
                randomized_delay(min_delay, max_delay)
            else:
                print(f"Unexpected status code {response.status_code} for URL: {url}")
                break
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}. Retrying...")
            randomized_delay(min_delay, max_delay)

    print(f"Max retries reached. Waiting for {long_wait} seconds before one final retry...")
    time.sleep(long_wait)
    try:
        response = session.get(url, headers=headers, proxies=proxies, timeout=10)
        if response.status_code == 200:
            return response
        else:
            print(f"Final attempt failed with status code {response.status_code} for URL: {url}")
    except requests.exceptions.RequestException as e:
        print(f"Final request failed: {e}")

    return None

# Function to scrape Premier League data
def scrape_premier_league(seasons):
    print("Starting scrape_premier_league function...")
    all_matches = []

    for season in seasons:
        print(f"Scraping data for season: {season}")
        standings_url = f"https://fbref.com/en/comps/9/{season}/Premier-League-Stats"
        response = fetch_with_backoff(standings_url)
        if not response:
            print(f"Failed to fetch standings page for season {season}.")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        standings_table = soup.select_one('table.stats_table')
        if not standings_table:
            print(f"Standings table not found on page for season {season}.")
            continue

        team_links = [a['href'] for a in standings_table.find_all('a', href=True) if '/squads/' in a['href']]
        team_urls = [f"https://fbref.com{link}" for link in team_links]

        for team_url in team_urls:
            print(f"Fetching data from: {team_url}")
            team_response = fetch_with_backoff(team_url)
            if not team_response:
                print(f"Skipping team due to repeated failures: {team_url}")
                continue

            team_soup = BeautifulSoup(team_response.text, 'html.parser')
            team_name = team_soup.select_one("h1").text.split(" Stats")[0]  # Extract team name
            print(f"Processing team: {team_name}")

            try:
                matches = pd.read_html(StringIO(team_response.text), match="Scores & Fixtures")[0]
            except ValueError:
                print(f"No 'Scores & Fixtures' table found for {team_url}. Skipping...")
                continue

            matches = matches[matches["Comp"] == "Premier League"]

            shooting_links = [a['href'] for a in team_soup.find_all('a', href=True) if 'all_comps/shooting/' in a['href']]
            if not shooting_links:
                print(f"No shooting stats link found for {team_url}. Skipping shooting data...")
                continue

            shooting_url = f"https://fbref.com{shooting_links[0]}"
            shooting_response = fetch_with_backoff(shooting_url)
            if not shooting_response:
                print(f"Skipping shooting data due to repeated failures: {shooting_url}")
                continue

            try:
                shooting = pd.read_html(StringIO(shooting_response.text), match="Shooting")[0]
            except ValueError:
                print(f"No 'Shooting' table found at {shooting_url}. Skipping...")
                continue

            shooting.columns = shooting.columns.droplevel()

            try:
                team_data = matches.merge(
                    shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]],
                    on="Date"
                )
            except ValueError:
                print(f"Error merging match and shooting data for {team_url}. Skipping...")
                continue

            team_data["Season"] = season
            team_data["Team"] = team_name
            all_matches.append(team_data)
            print(f"Data collected for {len(team_data)} matches for {team_name}.")

            randomized_delay()

    if all_matches:
        match_df = pd.concat(all_matches, ignore_index=True)
        match_df.columns = [c.lower() for c in match_df.columns]  # Standardize column names
        print(f"Data collected for {len(match_df)} matches across all teams.")
        match_df.to_csv("matches.csv", index=False)
    else:
        print("No match data collected.")

    print("Scraping complete!")

# Specify the seasons you want to scrape
seasons_to_scrape = ["2021-2022", "2022-2023", "2023-2024", "2024-2025"]

# Run the scraping function
scrape_premier_league(seasons_to_scrape)


Starting scrape_premier_league function...
Scraping data for season: 2021-2022
Fetching data from: https://fbref.com/en/squads/b8fd03ef/2021-2022/Manchester-City-Stats
Processing team: 
2021-2022 Manchester City
Data collected for 38 matches for 
2021-2022 Manchester City.
Delaying for 1.73 seconds...
Fetching data from: https://fbref.com/en/squads/822bd0ba/2021-2022/Liverpool-Stats
Processing team: 
2021-2022 Liverpool
Data collected for 38 matches for 
2021-2022 Liverpool.
Delaying for 2.59 seconds...
Fetching data from: https://fbref.com/en/squads/cff3d9bb/2021-2022/Chelsea-Stats
Processing team: 
2021-2022 Chelsea
Data collected for 38 matches for 
2021-2022 Chelsea.
Delaying for 2.86 seconds...
Fetching data from: https://fbref.com/en/squads/361ca564/2021-2022/Tottenham-Hotspur-Stats
Processing team: 
2021-2022 Tottenham Hotspur
Data collected for 38 matches for 
2021-2022 Tottenham Hotspur.
Delaying for 1.63 seconds...
Fetching data from: https://fbref.com/en/squads/18bb7c10/2021

In [3]:
from IPython.display import FileLink

# Display a download link for the CSV file
FileLink('matches.csv')


In [5]:
matches

NameError: name 'matches' is not defined