In [2]:
import re
import warnings
from time import sleep
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup



In [3]:
#Grab urls for each team in a league for a certain year
def grab_urls(league, year):
    #format url using league and year variables
    url = 'https://www.capology.com{}/salaries/{}-{}/'.format(league, year, year + 1)
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')
    scripts = soup.find('div', {'class':"col s12 team-row"})
    salary_pages = []
    #Find team salary pages within the HTML, append to salary_pages list
    for a in scripts.find_all('a', href=True):
        salary_pages.append(['https://www.capology.com/' + a['href'],re.findall(r'/club/(.*)/salaries',a['href'])[0], year])
    return salary_pages

#Grab salary data for a certain team in a certain year
def grab_salary_data(url, team, year, script_index=12, timeout=30):
    """Scrape one team's salary table for one season into a dataframe.

    Parameters
    ----------
    url : str
        Address of the team's salary page.
    team : str
        Team label stored in the 'Team' column.
    year : int
        Value stored in the 'Year' column.
    script_index : int, optional
        Position of the <script> tag that holds the salary data among all
        script tags on the page.
    timeout : float, optional
        Seconds to wait for the server before the request is abandoned.

    Returns
    -------
    pandas.DataFrame
        Columns 'Player', 'Salary (€/year)' (int64), 'Position', 'Year', 'Team'.

    Raises
    ------
    requests.HTTPError
        If the page answers with an error status.
    ValueError
        If there is no script tag at script_index or it has no text content.
    """
    #Load URL
    res = requests.get(url, timeout=timeout)
    #Fail here on 4xx/5xx instead of parsing an error page
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'lxml')
    #Find script containing salary information
    script_tags = soup.find_all('script')
    try:
        strings = script_tags[script_index].string
    except IndexError:
        strings = None
    if strings is None:
        raise ValueError('No salary script at index {} on {}'.format(script_index, url))
    #Use Regex to find all players
    players = re.findall(r"""loading='lazy'>(.*)</a>""",strings)
    #Use Regex to find positions of players
    positions = re.findall(r'position.: "(.*)"',strings)
    #Use Regex to find salaries
    salaries = re.findall(r'annual_gross_eur.: accounting.formatMoney."(.*)", "',strings)
    #zip stops at the shortest list, so unequal counts would otherwise pass unnoticed
    if not (len(players) == len(salaries) == len(positions)):
        warnings.warn(
            'Field counts differ for {} {} (players={}, salaries={}, positions={}); '
            'rows are cut to the shortest list and may be misaligned'.format(
                team, year, len(players), len(salaries), len(positions)),
            RuntimeWarning,
            stacklevel=2,
        )
    #Create dataframe using these values
    df = pd.DataFrame(list(zip(players, salaries, positions)),
               columns =['Player', 'Salary (€/year)', 'Position'])
    #Add Descriptive columns
    df['Year'] = year
    df['Team'] = team
    df['Salary (€/year)'] = df['Salary (€/year)'].astype('int64')
    return df

In [7]:
#Example - Premier League salary data, seasons 2013-14 through 2021-22
#Collect one dataframe per team and season, then combine them with a single concat
team_frames = []
#Iterate through each season start year (the range end is exclusive, so 2021 is the last)
for year in range(2013, 2022):
    #Grab premier league urls for that year
    urls = grab_urls('/uk/premier-league', year)
    #Iterate through urls for each team
    for team_url, team, season in urls:
        #Grab salary data for that team and keep it for the final concat
        team_frames.append(grab_salary_data(team_url, team, season))
if team_frames:
    #Most recently scraped team first; each team keeps its own 0..n-1 row labels
    premier_league = pd.concat(team_frames[::-1], ignore_index=False)
else:
    #Nothing was scraped: fall back to an empty frame with the expected columns
    premier_league = pd.DataFrame(columns=['Player', 'Salary (€/year)', 'Position', 'Year', 'Team'])
premier_league['League'] = 'Premier League'

In [8]:
#Show the combined dataframe as this cell's output
premier_league

Unnamed: 0,Player,Salary (€/year),Position,Year,Team,League
0,Trincão,7944137,F,2021,wolverhampton,Premier League
1,João Moutinho,6074929,M,2021,wolverhampton,Premier League
2,Leander Dendoncker,5467436,M,2021,wolverhampton,Premier League
3,Fábio Silva,4859943,F,2021,wolverhampton,Premier League
4,Nélson Semedo,4673022,D,2021,wolverhampton,Premier League
...,...,...,...,...,...,...
36,Ju-yeong Park,0,F,2013,arsenal,Premier League
37,Wellington Silva,0,F,2013,arsenal,Premier League
38,Emmanuel Frimpong,0,M,2013,arsenal,Premier League
39,Héctor Bellerín,0,D,2013,arsenal,Premier League
