# Capology Team Salaries Web Scraping
## Premier League and Championship (2013‚Äì2026)

### Notebook to scrape raw data from [Capology](https://www.capology.com/) using [Selenium](https://www.selenium.de)

## By [Victoria Friss de Kereki](https://www.linkedin.com/in/victoria-friss-de-kereki/)

##### Notebook first written: 11/01/2026
##### Notebook last updated: 11/01/2026

### 1) Import packages

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

### Define _i_fetch_payrolls_for_season_i_. This is the function that will perform the scraping.

In [45]:
def fetch_payrolls_for_season(url):
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")

    service = Service(ChromeDriverManager().install())

    driver = webdriver.Chrome(
        service=service,
        options=options
    )

    driver.get(url)

    # Wait until data rows exist (JS-rendered)
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#table tbody tr"))
    )

    table = driver.find_element(By.ID, "table")

    # ---- HEADERS (2nd header row) ----
    header_rows = table.find_elements(By.CSS_SELECTOR, "thead tr")
    columns = [
        th.text.strip()
        for th in header_rows[1].find_elements(By.TAG_NAME, "th")
    ]

    # ---- ROWS ----
    rows = []
    for tr in table.find_elements(By.CSS_SELECTOR, "tbody tr"):
        rows.append([
            td.text.strip()
            for td in tr.find_elements(By.TAG_NAME, "td")
        ])

    driver.quit()

    return pd.DataFrame(rows, columns=columns)


### Download Premier League Salaries

In [None]:
league = "premier-league"   # premier-league or championship

start_year = 2013 # 2013 is original one
end_year = 2025  # adjust if needed

In [None]:
def season_url(season):
    if season == "current":
        return f"https://www.capology.com/uk/{league}/payrolls/"
    return f"https://www.capology.com/uk/{league}/payrolls/{season}/"

seasons = [
    f"{y}-{y+1}" for y in range(start_year, end_year)
]
seasons.append("current")

In [75]:
all_dfs = {}

for season in seasons:
    url = season_url(season)
    print(f"Fetching {season} ‚Üí {url}")

    df = fetch_payrolls_for_season(url)

    if df.empty:
        print(f"‚ö†Ô∏è No data for {season}, skipping")
        continue

    # Determine actual season
    actual_season = "2025-2026" if season == "current" else season
    df["Season"] = actual_season

    # Store in dictionary (canonical storage)
    all_dfs[actual_season] = df

    # üîπ Create a valid Python variable name
    start_year = actual_season.split("-")[0]
    globals()[f"salaries_{start_year}"] = df

    print(f"‚úÖ salaries_{start_year} created for season {actual_season}")

    time.sleep(240)

Fetching 2022-2023 ‚Üí https://www.capology.com/uk/premier-league/payrolls/2022-2023/
Fetching 2023-2024 ‚Üí https://www.capology.com/uk/premier-league/payrolls/2023-2024/
Fetching 2024-2025 ‚Üí https://www.capology.com/uk/premier-league/payrolls/2024-2025/
Fetching current ‚Üí https://www.capology.com/uk/premier-league/payrolls/


In [76]:
payrolls_all_premierleague = pd.concat(
    [globals()[f"salaries_{y}"] for y in range(2013, 2025)],
    ignore_index=True
)

(80, 10)


Unnamed: 0,CLUB,Unnamed: 2,GROSS P/W\n(GBP),GROSS P/Y\n(GBP),ADJ. GROSS\n(GBP),KEEPER\n(GBP),DEFENSE\n(GBP),MIDFIELD\n(GBP),FORWARD\n(GBP),Season
0,Manchester United,MUN,"¬£ 4,657,500","¬£ 242,190,000","¬£ 242,190,000","¬£ 25,740,000","¬£ 72,930,000","¬£ 52,520,000","¬£ 91,000,000",2022-2023
1,Chelsea,CHE,"¬£ 4,357,000","¬£ 226,564,000","¬£ 226,564,000","¬£ 14,560,000","¬£ 80,964,000","¬£ 54,080,000","¬£ 76,960,000",2022-2023
2,Manchester City,MCI,"¬£ 3,830,000","¬£ 199,160,000","¬£ 199,160,000","¬£ 9,620,000","¬£ 71,500,000","¬£ 39,780,000","¬£ 78,260,000",2022-2023
3,Liverpool,LIV,"¬£ 3,285,000","¬£ 170,820,000","¬£ 170,820,000","¬£ 11,440,000","¬£ 49,660,000","¬£ 54,340,000","¬£ 55,380,000",2022-2023
4,Arsenal,ARS,"¬£ 2,558,000","¬£ 133,016,000","¬£ 133,016,000","¬£ 8,060,000","¬£ 38,896,000","¬£ 37,960,000","¬£ 48,100,000",2022-2023


In [None]:
payrolls_all_premierleague.shape

### Download Championship Salaries

In [125]:
league = "championship"   # premier-league or championship

start_year = 2018 # 2013 is original one
end_year = 2025  # adjust if needed

In [126]:
def season_url(season):
    if season == "current":
        return f"https://www.capology.com/uk/{league}/payrolls/"
    return f"https://www.capology.com/uk/{league}/payrolls/{season}/"

seasons = [
    f"{y}-{y+1}" for y in range(start_year, end_year)
]
seasons.append("current")

In [None]:
all_dfs = {}

for season in seasons:
    url = season_url(season)
    print(f"Fetching {season} ‚Üí {url}")

    df = fetch_payrolls_for_season(url)

    if df.empty:
        print(f"‚ö†Ô∏è No data for {season}, skipping")
        continue

    # Determine actual season
    actual_season = "2025-2026" if season == "current" else season
    df["Season"] = actual_season

    # Store in dictionary (canonical storage)
    all_dfs[actual_season] = df

    # üîπ Create a valid Python variable name
    start_year = actual_season.split("-")[0]
    globals()[f"salaries_{start_year}"] = df

    print(f"‚úÖ salaries_{start_year} created for season {actual_season}")

    time.sleep(60)

Fetching 2018-2019 ‚Üí https://www.capology.com/uk/championship/payrolls/2018-2019/
‚úÖ salaries_2018 created for season 2018-2019


In [121]:
payrolls_all_championship = pd.concat(
    [globals()[f"salaries_{y}"] for y in range(2013, 2017)],
    ignore_index=True
)

payrolls_all_championship.shape

(96, 10)