In [6]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
import random
from IPython.display import display
import re

# 1) Create a Session with a random or fixed User-Agent
s = requests.Session()
s.headers.update({
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/109.0.0.0 Safari/537.36"
    )
})

def get_soup(session, url):
    """Fetch a URL, return BeautifulSoup or None on failure."""
    try:
        resp = session.get(url, timeout=10)
        # Check status code
        if resp.status_code == 200:
            return BeautifulSoup(resp.text, 'html.parser')
        elif resp.status_code == 429:
            print("Received 429. Too many requests. Backing off.")
            # Wait longer or do an exponential backoff
            time.sleep(60)
            return None
        else:
            print(f"Error: got status {resp.status_code} for {url}")
            return None
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None

def get_team_links(soup):
    """Extract team links from the league table soup."""
    tables = soup.select('table.stats_table')
    if not tables:
        print("No stats_table found!")
        return []
    league_table = tables[0]
    links = league_table.find_all('a')
    hrefs = [l.get('href') for l in links if l.get('href')]
    hrefs = [h for h in hrefs if 'squads' in h]
    team_urls = [f"https://fbref.com{h}" for h in hrefs]
    return team_urls

def get_player_links(session, team_url):
    soup = get_soup(session, team_url)
    if not soup:
        return []
    tables = soup.select('table')
    if not tables:
        return []
    player_table = tables[0]
    links = player_table.find_all('a')
    hrefs = [l.get('href') for l in links if l.get('href')]
    hrefs = [h for h in hrefs if 'players' in h and 'matchlogs' not in h]
    return [f"https://fbref.com{h}" for h in hrefs]

def get_player_data(session, player_url):
    # Fetch and parse the page
    soup = get_soup(session, player_url)
    if not soup:
        print(f"Failed to retrieve page for {player_url}")
        return None

    # Extract player name
    player_name = None
    h1_tag = soup.find("h1", {"itemprop": "name"}) or soup.find("h1")
    player_name = h1_tag.get_text(strip=True) if h1_tag else "Unknown"

    try:
        # Extract player stats table
        tables = pd.read_html(str(soup), match=player_name)
        if not tables:
            print(f"No matching table found for {player_name}.")
            return None

        df = tables[0]  # Assume first table is relevant

        # Extract minutes played
        minutes = None
        footer_div = soup.find("div", id=re.compile("^tfooter_scout_summary_"))
        if footer_div:
            strong_tag = footer_div.find("strong")
            if strong_tag:
                minutes_text = strong_tag.get_text(strip=True)
                minutes = int(minutes_text.split()[0])  # Extract numeric value safely

        # Add "Minutes Played" as a new row if minutes were found
        if minutes is not None:
            print(f"Found {minutes} minutes for {player_name}")
            new_row = pd.DataFrame({"Stat": ["Minutes Played"], "Value": [minutes]})
            df = pd.concat([df, new_row], ignore_index=True)

        return df

    except ValueError as e:
        print(f"Error processing {player_name}: {e}")
        return None
    
def convert_csv(df):
    return df.to_csv(index=False)

def main():
    team_data = {}
    league_table_url = "https://fbref.com/en/comps/9/Premier-League-Stats"
    
    soup_league = get_soup(s, league_table_url)
    if not soup_league:
        return
    
    team_urls = get_team_links(soup_league)
    
    # Example: just scrape first few teams to limit requests
    # Create one list per team to store player data
    
    for team_url in team_urls[:1]:
        team_name = team_url.split("/")[-1].replace("-", " ")
        print(f"Scraping {team_name}")
        team_data[team_name] = {'players': [], 'keepers': []}   
        player_urls = get_player_links(s, team_url)
        for purl in player_urls:
            print(f"Scraping {purl}")
            df = get_player_data(s, purl)
            if df is not None:
                if df.iloc[0, 0] == "PSxG-GA":
                    team_data[team_name]['keepers'].append(df)
                else:
                    team_data[team_name]['players'].append(df)
            else:
                print(f"No data for {purl}")
            # random sleep
            time.sleep(random.uniform(2, 5))
    
    print(f"Scraped data for {len(team_data)} teams.")
    print("Data keys:", team_data.keys())
    print("Data for first team:", team_data['Liverpool Stats'])



    # Convert each team's data to a single CSV for players and another for keepers
    for team, data in team_data.items():
        for key, dfs in data.items():
            if dfs:  # Ensure there is data to concatenate
                combined_df = pd.concat(dfs, ignore_index=True)  # Combine all DataFrames
                print(f"\n===== {team} - {key.upper()} =====")
                display(combined_df)
                csv = convert_csv(combined_df)  # Convert to CSV format
                filename = f"{team}_{key}.csv"
                
                with open(filename, "w") as f:
                    f.write(csv)
                    print(f"Saved {filename}")


if __name__ == "__main__":
    main()


Scraping Liverpool Stats
Scraping https://fbref.com/en/players/e06683ca/Virgil-van-Dijk


  tables = pd.read_html(str(soup), match=player_name)


Found 4386 minutes for Virgil van Dijk
Scraping https://fbref.com/en/players/e342ad68/Mohamed-Salah


  tables = pd.read_html(str(soup), match=player_name)


Found 3642 minutes for Mohamed Salah
Scraping https://fbref.com/en/players/b8e740fb/Ryan-Gravenberch


  tables = pd.read_html(str(soup), match=player_name)


Found 3037 minutes for Ryan Gravenberch
Scraping https://fbref.com/en/players/cd1acf9d/Trent-Alexander-Arnold


  tables = pd.read_html(str(soup), match=player_name)


Found 2851 minutes for Trent Alexander-Arnold
Scraping https://fbref.com/en/players/2e4f5f03/Andrew-Robertson


  tables = pd.read_html(str(soup), match=player_name)


Found 3244 minutes for Andrew Robertson
Scraping https://fbref.com/en/players/83d074ff/Alexis-Mac-Allister


  tables = pd.read_html(str(soup), match=player_name)


Found 3687 minutes for Alexis Mac Allister
Scraping https://fbref.com/en/players/4a1a9578/Luis-Diaz


  tables = pd.read_html(str(soup), match=player_name)


Found 3384 minutes for Luis Díaz
Scraping https://fbref.com/en/players/934e1968/Dominik-Szoboszlai


  tables = pd.read_html(str(soup), match=player_name)


Found 2587 minutes for Dominik Szoboszlai
Scraping https://fbref.com/en/players/5ed9b537/Ibrahima-Konate


  tables = pd.read_html(str(soup), match=player_name)


Found 2584 minutes for Ibrahima Konaté
Scraping https://fbref.com/en/players/7a2e46a8/Alisson


  tables = pd.read_html(str(soup), match=player_name)


Found 2508 minutes for Alisson
Scraping https://fbref.com/en/players/1971591f/Cody-Gakpo


  tables = pd.read_html(str(soup), match=player_name)


Found 2797 minutes for Cody Gakpo
Scraping https://fbref.com/en/players/4fb9c88f/Curtis-Jones


  tables = pd.read_html(str(soup), match=player_name)


Found 1782 minutes for Curtis Jones
Scraping https://fbref.com/en/players/62d7ef38/Caoimhin-Kelleher


  tables = pd.read_html(str(soup), match=player_name)


Found 2070 minutes for Caoimhín Kelleher
Scraping https://fbref.com/en/players/4d77b365/Darwin-Nunez


  tables = pd.read_html(str(soup), match=player_name)


Found 2148 minutes for Darwin Núñez
Scraping https://fbref.com/en/players/178ae8f8/Diogo-Jota


  tables = pd.read_html(str(soup), match=player_name)


Found 1087 minutes for Diogo Jota
Scraping https://fbref.com/en/players/7a11550b/Joe-Gomez


  tables = pd.read_html(str(soup), match=player_name)


Found 1536 minutes for Joe Gomez
Scraping https://fbref.com/en/players/f315ca93/Kostas-Tsimikas


  tables = pd.read_html(str(soup), match=player_name)


Found 891 minutes for Kostas Tsimikas
Scraping https://fbref.com/en/players/4125cb98/Jarell-Quansah


  tables = pd.read_html(str(soup), match=player_name)


Found 1493 minutes for Jarell Quansah
Scraping https://fbref.com/en/players/bbd67769/Conor-Bradley


  tables = pd.read_html(str(soup), match=player_name)


Found 1278 minutes for Conor Bradley
Scraping https://fbref.com/en/players/c149016b/Wataru-Endo


  tables = pd.read_html(str(soup), match=player_name)


Found 1435 minutes for Wataru Endo
Scraping https://fbref.com/en/players/b9e1436c/Harvey-Elliott


  tables = pd.read_html(str(soup), match=player_name)


Found 1291 minutes for Harvey Elliott
Scraping https://fbref.com/en/players/b0f7e36c/Federico-Chiesa


  tables = pd.read_html(str(soup), match=player_name)


Found 1176 minutes for Federico Chiesa
Scraping https://fbref.com/en/players/12bb4d6a/Vitezslav-Jaros


  tables = pd.read_html(str(soup), match=player_name)


Error processing Vitezslav Jaros: No tables found matching pattern 'Vitezslav Jaros'
No data for https://fbref.com/en/players/12bb4d6a/Vitezslav-Jaros
Scraping https://fbref.com/en/players/003cf4d1/Jayden-Danns
Error processing Jayden Danns: No tables found matching pattern 'Jayden Danns'
No data for https://fbref.com/en/players/003cf4d1/Jayden-Danns
Scraping https://fbref.com/en/players/fd08a24b/Harvey-Davies
Error processing Harvey Davies: No tables found matching pattern 'Harvey Davies'
No data for https://fbref.com/en/players/fd08a24b/Harvey-Davies
Scraping https://fbref.com/en/players/bf973eeb/James-McConnell
Error processing James McConnell: No tables found matching pattern 'James McConnell'
No data for https://fbref.com/en/players/bf973eeb/James-McConnell
Scraping https://fbref.com/en/players/2bc28bb9/Tyler-Morton
Found 1141 minutes for Tyler Morton
Scraping https://fbref.com/en/players/398a24f6/Amara-Nallo


  tables = pd.read_html(str(soup), match=player_name)


Error processing Amara Nallo: No tables found matching pattern 'Amara Nallo'
No data for https://fbref.com/en/players/398a24f6/Amara-Nallo
Scraping https://fbref.com/en/players/1d3b3d77/Trey-Nyoni
Error processing Trey Nyoni: No tables found matching pattern 'Trey Nyoni'
No data for https://fbref.com/en/players/1d3b3d77/Trey-Nyoni
Scraped data for 1 teams.
Data keys: dict_keys(['Liverpool Stats'])
Data for first team: {'players': [                   Statistic Per 90  Percentile            Stat   Value
0          Non-Penalty Goals   0.06        72.0             NaN     NaN
1       npxG: Non-Penalty xG   0.07        85.0             NaN     NaN
2                Shots Total   0.96        96.0             NaN     NaN
3                    Assists   0.02        53.0             NaN     NaN
4   xAG: Exp. Assisted Goals   0.02        52.0             NaN     NaN
5                 npxG + xAG   0.09        74.0             NaN     NaN
6      Shot-Creating Actions   1.31        89.0             N

Unnamed: 0,Statistic,Per 90,Percentile,Stat,Value
0,Non-Penalty Goals,0.06,72.0,,
1,npxG: Non-Penalty xG,0.07,85.0,,
2,Shots Total,0.96,96.0,,
3,Assists,0.02,53.0,,
4,xAG: Exp. Assisted Goals,0.02,52.0,,
...,...,...,...,...,...
457,Interceptions,0.63,13.0,,
458,Blocks,1.58,84.0,,
459,Clearances,1.89,85.0,,
460,Aerials Won,0.63,36.0,,


Saved Liverpool Stats_players.csv

===== Liverpool Stats - KEEPERS =====


Unnamed: 0,Statistic,Per 90,Percentile,Stat,Value
0,PSxG-GA,-0.10,26.0,,
1,Goals Against,1.00,93.0,,
2,Save Percentage,70.8%,51.0,,
3,PSxG/SoT,0.27,26.0,,
4,Clean Sheet Percentage,37.0%,85.0,,
5,,,,,
6,Touches,38.50,61.0,,
7,Launch %,18.1%,11.0,,
8,Goal Kicks,2.58,1.0,,
9,Avg. Length of Goal Kicks,28.8,3.0,,


Saved Liverpool Stats_keepers.csv
