In [2]:
import time
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from tqdm import tqdm

In [3]:
def convert_values(value):
    """Convert an abbreviated euro amount such as ``"€10.69m"`` to an integer.

    Parameters
    ----------
    value : Any
        Raw cell content. Only strings are processed; any other object
        (``None``, numbers, NaN, ...) is returned unchanged.

    Returns
    -------
    int
        When the string (after removing ``€`` and surrounding whitespace)
        ends in ``k``, ``m`` or ``bn`` and the part before the suffix parses
        as a number: that number scaled by 1e3, 1e6 or 1e9 respectively.
    str
        Otherwise, the string with ``€`` removed and whitespace stripped
        (for example ``"-"`` stays ``"-"``).
    Any
        Non-string input, returned as-is.
    """
    if not isinstance(value, str):
        return value

    cleaned = value.replace("€", "").strip()

    # Match the unit as a *suffix* rather than a substring, so that ordinary
    # text that merely contains "k" or "m" is not mistaken for an amount.
    for suffix, factor in (("bn", 1_000_000_000), ("m", 1_000_000), ("k", 1_000)):
        if cleaned.endswith(suffix):
            try:
                # round() instead of int(): int() truncates, so a product that
                # lands a hair below the intended integer because of binary
                # floating point would lose one unit.
                return round(float(cleaned[: -len(suffix)]) * factor)
            except (ValueError, OverflowError):
                # Not a number in front of the suffix: hand back the cleaned
                # text instead of aborting the whole scrape.
                return cleaned
    return cleaned

In [4]:
# --- Browser session ---------------------------------------------------------
# Start Chrome maximised and open the Transfermarkt landing page.
c_options = webdriver.ChromeOptions()
c_options.add_argument("--start-maximized")
c_driver = webdriver.Chrome(options=c_options)
c_driver.get('https://www.transfermarkt.com/')

# --- Scrape scope ------------------------------------------------------------
# Season ids from 2004 up to and including the current calendar year.
current_year = datetime.now().year
years = [*range(2004, current_year + 1)]

# League URL slug -> {'name': code prefix, 'tier': division level}.
# The scraping cell joins 'name' and 'tier' to form the competition id it puts
# in the URL (e.g. 'GB' + '1').
top_leagues = {
    slug: {'name': code, 'tier': tier}
    for slug, code, tier in (
        ('Premier-League', 'GB', 1),
        ('Championship', 'GB', 2),
        ('LaLiga', 'ES', 1),
        ('LaLiga2', 'ES', 2),
        ('Bundesliga', 'L', 1),
        ('2-Bundesliga', 'L', 2),
        ('Serie-A', 'IT', 1),
        ('Serie-B', 'IT', 2),
        ('Ligue-1', 'FR', 1),
        ('Ligue-2', 'FR', 2),
    )
}

In [5]:
def _first_match(row, xpath):
    """Return the first element under ``row`` matching ``xpath``, or None.

    Uses ``find_elements`` (plural), which yields an empty list instead of
    raising when nothing matches, so one malformed row cannot abort the run.
    """
    found = row.find_elements(By.XPATH, xpath)
    return found[0] if found else None


def _text_or_none(row, xpath):
    """Return the text of the first element matching ``xpath``, or None."""
    element = _first_match(row, xpath)
    return None if element is None else element.text


# One record (dict) per team, per season, per league.
teams_list = []
for k, v in tqdm(top_leagues.items()):
    league_tier = v['tier']
    # Competition id used in the URL: code prefix followed by the tier.
    league_id = v['name'] + str(league_tier)
    for year in years:
        url = f'https://www.transfermarkt.com/{k.lower()}/startseite/wettbewerb/{league_id}/plus/?saison_id={year}'
        c_driver.get(url=url)
        # Fixed pause after each page load before querying the DOM.
        time.sleep(15)
        teams_element = c_driver.find_elements(By.XPATH, '/html/body/div/main/div[1]/div[1]/div[2]/div[2]/div/table/tbody/tr')
        teams_results = c_driver.find_elements(By.XPATH, '/html/body/div/main/div[1]/div[2]/div[2]/div[2]/table/tbody/tr')

        # Read the standings table once per page into {club title: position}
        # instead of re-walking it through the driver for every team.
        # setdefault keeps the first occurrence, matching a "first hit wins"
        # linear search. Rows without a club link are skipped individually.
        positions = {}
        for result in teams_results:
            club_link = _first_match(result, './td[3]/a[1]')
            if club_link is None:
                continue
            positions.setdefault(
                club_link.get_attribute('title'),
                _text_or_none(result, './td[1]'),
            )

        for team in teams_element:
            team_link = _first_match(team, './td[2]/a[1]')
            if team_link is None:
                # Row carries no club link, so there is nothing to record.
                continue
            team_name = team_link.get_attribute('title')

            # Any image inside a link after the club link is treated as a
            # title badge.
            titles = team.find_elements(By.XPATH, './td[2]/a[position()>1]/img')
            has_won_titles = len(titles) > 0

            team_info = {
                'year': year,
                'country': v['name'],
                'league': k,
                'tier': league_tier,
                'team': team_name,
                # None when the club is absent from the standings table.
                'position': positions.get(team_name),
                'squad_depth': _text_or_none(team, './td[3]/a'),
                'avg_age': _text_or_none(team, './td[4]'),
                'foreigners': _text_or_none(team, './td[5]'),
                'avg_market_value': convert_values(_text_or_none(team, './td[6]')),
                'market_value': convert_values(_text_or_none(team, './td[7]/a')),
                # Placeholders; not populated by this cell.
                'has_relegated': np.nan,
                'has_promoted': np.nan,
                'has_won_titles': has_won_titles
            }
            teams_list.append(team_info)

100%|██████████| 10/10 [1:35:31<00:00, 573.18s/it]


In [6]:
# Turn the list of scraped record dicts into a table (one row per record).
df = pd.DataFrame(data=teams_list)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,year,country,league,tier,team,position,squad_depth,avg_age,foreigners,avg_market_value,market_value,has_relegated,has_promoted,has_won_titles
0,2004,GB,Premier-League,1,Chelsea FC,1,31,24.9,24,10690000,331480000,,,False
1,2004,GB,Premier-League,1,Manchester United,3,37,24.7,25,7930000,293230000,,,True
2,2004,GB,Premier-League,1,Arsenal FC,2,37,23.9,29,6680000,247000000,,,True
3,2004,GB,Premier-League,1,Liverpool FC,5,38,25.3,26,5850000,222130000,,,False
4,2004,GB,Premier-League,1,Tottenham Hotspur,9,36,25.2,21,3530000,127200000,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4258,2024,FR,Ligue-2,2,Red Star FC,,28,26.7,10,371000,10400000,,,False
4259,2024,FR,Ligue-2,2,Stade Lavallois,,24,28.3,11,427000,10250000,,,False
4260,2024,FR,Ligue-2,2,AC Ajaccio,,26,27.0,11,390000,10150000,,,False
4261,2024,FR,Ligue-2,2,FC Annecy,,22,25.9,6,357000,7850000,,,False


In [7]:
# Persist the scraped table. The row index is still written as the first
# column, so the file layout is unchanged for existing readers.
output_path = Path('../../Leagues/1st_2nd_tiers_top_5_leagues.csv')
# Create the target folder if needed: to_csv refuses to write into a
# non-existent directory, which would otherwise lose the scraped data.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path)