In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from tqdm import tqdm

from datetime import datetime

import pandas as pd

In [7]:
def convert_values(value):
    """Convert a Transfermarkt-style money string into an integer amount.

    The euro sign and surrounding whitespace are removed first. A trailing
    ``k``, ``m`` or ``bn`` is then interpreted as a multiplier of one
    thousand, one million or one billion respectively.

    Args:
        value: The raw cell content. Anything that is not a ``str`` is
            returned untouched.

    Returns:
        An ``int`` when the text is a number followed by a known suffix
        (e.g. ``"€12.85m"`` -> ``12850000``). Otherwise the cleaned string
        (euro sign stripped) is returned as-is, so placeholders such as
        ``"-"`` pass through instead of raising.
    """
    if not isinstance(value, str):
        return value

    cleaned = value.replace("€", "").strip()

    # Match on the *suffix* only: a plain substring test would treat any text
    # containing the letters "k" or "m" as a number and crash in float().
    for suffix, multiplier in (("bn", 1_000_000_000), ("m", 1_000_000), ("k", 1_000)):
        if cleaned.endswith(suffix):
            try:
                # round() rather than int(): binary floats can land a hair
                # below the exact product (1.001 * 1000 == 1000.9999999999999)
                # and truncation would then lose a whole unit.
                return round(float(cleaned[: -len(suffix)]) * multiplier)
            except (ValueError, OverflowError):
                # Not a number in front of the suffix (or inf/nan): hand the
                # text back like any other unrecognised value.
                return cleaned
    return cleaned

In [5]:
# Start a Chrome session through Selenium, asking the browser to open maximized.
c_options = webdriver.ChromeOptions()
c_options.add_argument("--start-maximized")
c_driver = webdriver.Chrome(options=c_options)

# Open the Transfermarkt home page in that session.
# NOTE(review): presumably this is a chance to dismiss the cookie-consent
# dialog by hand before scraping starts — confirm.
c_driver.get('https://www.transfermarkt.com/')


# Seasons to cover: every starting year from 2010 up to, but not including,
# the calendar year in which the notebook is run.
current_year = datetime.now().year
years = [*range(2010, current_year)]

# League URL slug -> competition code ('name') and division level ('tier').
# Built from compact (slug, code, tier) rows; insertion order is preserved.
top_leagues = {
    slug: {'name': code, 'tier': tier}
    for slug, code, tier in (
        ('Premier-League', 'GB1', 1),
        ('Championship', 'GB2', 2),
        ('LaLiga', 'ES1', 1),
        ('LaLiga2', 'ES2', 2),
        ('Bundesliga', 'L1', 1),
        ('2-Bundesliga', 'L2', 2),
        ('Serie-A', 'IT1', 1),
        ('Serie-B', 'IT2', 2),
        ('Ligue-1', 'FR1', 1),
        ('Ligue-2', 'FR2', 2),
    )
}

In [9]:
# Walk every (league, season) overview page and collect one record per club row.
teams_list = []
for k, v in tqdm(top_leagues.items()):
    league_id = v['name']
    league_tier = v['tier']
    for year in years:
        url = f'https://www.transfermarkt.com/{k.lower()}/startseite/wettbewerb/{league_id}/plus/?saison_id={year}'
        c_driver.get(url=url)
        # Fixed pause after each page load.
        # NOTE(review): presumably both a render wait and a politeness delay — confirm.
        time.sleep(15)
        teams_element = c_driver.find_elements(By.XPATH, '/html/body/div/main/div[1]/div[1]/div[2]/div[2]/div/table/tbody/tr')
        for team in teams_element:

            # Images inside any link after the first one in the club cell are
            # treated as title badges.
            titles = team.find_elements(By.XPATH, './td[2]/a[position()>1]/img')
            has_won_titles = len(titles) > 0

            # find_elements yields an empty list when the status <span> is
            # absent, so a missing icon needs no exception handling and real
            # failures (stale element, Ctrl-C, ...) are no longer swallowed.
            status_spans = team.find_elements(By.XPATH, './td[2]/span')
            if status_spans:
                # get_attribute may return None when the attribute is missing.
                classes = (status_spans[0].get_attribute('class') or '').split()
            else:
                classes = []
            has_icon_absteiger = 'icon_absteiger' in classes  # Relegated
            has_icon_aufsteiger = 'icon_aufsteiger' in classes  # Promoted

            team_info = {
                'year': year,
                'league': k,
                'tier': league_tier,
                'team': team.find_element(By.XPATH, './td[2]/a[1]').text,
                'squad_depth': team.find_element(By.XPATH, './td[3]/a').text,
                'avg_age': team.find_element(By.XPATH, './td[4]').text,
                'foreigners': team.find_element(By.XPATH, './td[5]').text,
                'avg_market_value': convert_values(team.find_element(By.XPATH, './td[6]').text),
                'market_value': convert_values(team.find_element(By.XPATH, './td[7]/a').text),
                'has_relegated': has_icon_absteiger,
                'has_promoted': has_icon_aufsteiger,
                'has_won_titles': has_won_titles
            }
            teams_list.append(team_info)

100%|██████████| 10/10 [1:00:33<00:00, 363.35s/it]


In [10]:
# Build one table from the scraped records: one row per dict in teams_list,
# one column per dict key. The bare `df` on the last line displays it.
df = pd.DataFrame(teams_list)
df

Unnamed: 0,year,league,tier,team,squad_depth,avg_age,foreigners,avg_market_value,market_value,has_relegated,has_promoted,has_won_titles
0,2010,Premier-League,1,Chelsea FC,33,25.9,23,12850000,424100000,False,False,True
1,2010,Premier-League,1,Manchester City,45,24.9,28,8980000,404180000,False,False,False
2,2010,Premier-League,1,Manchester United,43,25.7,29,9020000,388000000,False,False,False
3,2010,Premier-League,1,Liverpool FC,42,24.9,25,8080000,339200000,False,False,False
4,2010,Premier-League,1,Arsenal FC,34,24.8,27,9530000,324000000,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
2836,2023,Ligue-2,2,FC Annecy,32,27.0,10,430000,13750000,False,False,False
2837,2023,Ligue-2,2,Grenoble Foot 38,30,26.3,16,445000,13350000,False,False,False
2838,2023,Ligue-2,2,Quevilly - Rouen Métropole,33,25.9,13,400000,13200000,False,False,False
2839,2023,Ligue-2,2,Stade Lavallois,31,28.4,11,397000,12300000,False,False,False


In [11]:
# Write the table to a CSV file at a path relative to the working directory.
# to_csv writes the row index by default, so the file gets an unnamed first column.
df.to_csv('../Leagues/1st_2nd_tiers_top_5_leagues.csv')