The goal is to develop custom web scraping function(s) for fbref.com for future use cases.

### Testing with the Premier League

In [3]:
# Import modules

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import time

In [4]:
# Player_season_stats tables

def fbref_player_season_stats_scrape(url):
    # Set up headless browser
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(options=options)

    # Load the page
    url = url
    driver.get(url)

    # Wait for the page to load
    time.sleep(5)

    # Scroll to the Player Standard Stats table
    try:
        table_element = driver.find_element(By.ID, "stats_standard")
        actions = ActionChains(driver)
        actions.move_to_element(table_element).perform()
        time.sleep(3)  # let JS populate the table
    except Exception as e:
        print("Could not scroll to table:", e)

    # Get page source
    html = driver.page_source
    driver.quit()

    # Parse the correct table
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", {"id": "stats_standard"})
    if table is None:
        raise Exception("Player Standard Stats table not found in page source!")

    # Convert to DataFrame
    df = pd.read_html(str(table))[0]

    return df

In [5]:
# Testing the function

df = fbref_player_season_stats_scrape("https://fbref.com/en/comps/9/2024-2025/stats/2024-2025-Premier-League-Stats")
df.head()

  df = pd.read_html(str(table))[0]


In [26]:
# Cleaning the table

def fbref_player_season_stats_cleaning(df):
    # Make a copy of the original dataframe
    dfa = df.copy()
    
    # Modify the column names from a tuple
    dfa.columns = [col[1] if isinstance(col, tuple) else col for col in dfa.columns]

    # Drop unwanted columns
    dfa.drop(columns = ['Rk', 'Matches'], inplace = True)
    dfa = dfa.iloc[:, :-10]

    # Drop unwanted rows
    dfa.drop(dfa.loc[dfa.Player == 'Player'].index, inplace=True)

    # Modify the 'Nation' entries
    dfa["Nation"] = dfa["Nation"].str[-3:]

    # Modify the 'Position' entries to only be the players' main position
    dfa['Pos'] = dfa['Pos'].replace({'FW,MF':'FW', 'MF,FW':'MF', 'DF,MF':'DF', 'MF,DF':'MF', 'FW,DF':'FW', 'DF,FW':'DF'})

    # Modify the 'Age' entries to be only the first 2 digits
    dfa['Age'] = dfa['Age'].str[0:2]

    return dfa

In [11]:
# Testing the function

df_clean = fbref_player_season_stats_cleaning(df)
df_clean.head()

Unnamed: 0,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,90s,...,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR
0,Max Aarons,ENG,DF,Bournemouth,24,2000,3,1,86,1.0,...,0,0,0,0.0,0.0,0.0,0.0,1,8,3
1,Joshua Acheampong,ENG,DF,Chelsea,18,2006,4,2,170,1.9,...,0,1,0,0.2,0.2,0.0,0.2,0,8,0
2,Tyler Adams,USA,MF,Bournemouth,25,1999,28,21,1965,21.8,...,0,7,0,1.6,1.6,1.0,2.6,14,76,10
3,Tosin Adarabioyo,ENG,DF,Chelsea,26,1997,22,15,1409,15.7,...,0,4,0,0.9,0.9,0.2,1.2,5,42,1
4,Simon Adingra,CIV,FW,Brighton,22,2002,29,12,1097,12.2,...,0,0,0,2.5,2.5,2.5,4.9,50,18,136


In [13]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 574 entries, 0 to 595
Data columns (total 25 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Player    574 non-null    object
 1   Nation    570 non-null    object
 2   Pos       574 non-null    object
 3   Squad     574 non-null    object
 4   Age       570 non-null    object
 5   Born      570 non-null    object
 6   MP        574 non-null    object
 7   Starts    574 non-null    object
 8   Min       574 non-null    object
 9   90s       574 non-null    object
 10  Gls       574 non-null    object
 11  Ast       574 non-null    object
 12  G+A       574 non-null    object
 13  G-PK      574 non-null    object
 14  PK        574 non-null    object
 15  PKatt     574 non-null    object
 16  CrdY      574 non-null    object
 17  CrdR      574 non-null    object
 18  xG        574 non-null    object
 19  npxG      574 non-null    object
 20  xAG       574 non-null    object
 21  npxG+xAG  574 non-nul

The two functions look like they are working for the Premier League tables, let's try to use them on a different league to see if any adjustments need to be made.

### Testing with MLS

In [22]:
df_mls = fbref_player_season_stats_scrape("https://fbref.com/en/comps/22/stats/Major-League-Soccer-Stats")
df_mls_clean = fbref_player_season_stats_cleaning(df_mls)
df_mls_clean.head()

  df = pd.read_html(str(table))[0]


Unnamed: 0,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,90s,...,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR
0,Paxten Aaronson,USA,MF,Colorado Rapids,22-031,2003,4,3,301,3.3,...,0,1,0,1.9,1.9,0.3,2.2,14,29,14
1,Liel Abada,ISR,FW,Charlotte,23-358,2001,27,17,1417,15.7,...,0,1,0,6.9,6.9,2.7,9.6,62,47,119
2,Wessam Abou Ali,PLE,FW,Columbus Crew,26-265,1999,5,4,305,3.4,...,0,0,0,0.8,0.8,0.6,1.4,6,8,30
3,Luis Abram,PER,DF,Atlanta Utd,29-211,1996,21,16,1491,16.6,...,0,1,0,0.1,0.1,0.0,0.1,15,75,4
4,Lalas Abubakar,GHA,DF,FC Dallas,30-275,1994,22,15,1398,15.5,...,0,3,1,0.5,0.5,0.1,0.6,6,25,7


They work! Although, `Age` from MLS looks like they have it in a different format, so I'll go back into the cleaning function to take care of that and rerun the code below.

In [28]:
df_mls = fbref_player_season_stats_scrape("https://fbref.com/en/comps/22/stats/Major-League-Soccer-Stats")
df_mls_clean = fbref_player_season_stats_cleaning(df_mls)
df_mls_clean.head()

  df = pd.read_html(str(table))[0]


Unnamed: 0,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,90s,...,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR
0,Paxten Aaronson,USA,MF,Colorado Rapids,22,2003,4,3,301,3.3,...,0,1,0,1.9,1.9,0.3,2.2,14,29,14
1,Liel Abada,ISR,FW,Charlotte,23,2001,27,17,1417,15.7,...,0,1,0,6.9,6.9,2.7,9.6,62,47,119
2,Wessam Abou Ali,PLE,FW,Columbus Crew,26,1999,5,4,305,3.4,...,0,0,0,0.8,0.8,0.6,1.4,6,8,30
3,Luis Abram,PER,DF,Atlanta Utd,29,1996,21,16,1491,16.6,...,0,1,0,0.1,0.1,0.0,0.1,15,75,4
4,Lalas Abubakar,GHA,DF,FC Dallas,30,1994,22,15,1398,15.5,...,0,3,1,0.5,0.5,0.1,0.6,6,25,7


Looks like now everything is working great.

The caveat here is that the url that gets plugged into the scraping function needs to be from the `Squads & Player Stats` page on the league's (competition's) main page.

Next steps would be to:
* Handle missings
* Handle duplicates
* Modify the data types for columns from objects to ones that better suit each attribute