# Installation
1. Install the request library
2. Install Beautiful Soup library

In [2]:
!pip install requests
!pip install bs4
!pip install tqdm

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


# Extracting Individual Player Data
In this section, we are interested in extracting data specific to each player. Data will include player attributes like: **name, birthday, height, club, league, position, appearances, injuries, etc**. Our source mainly comes from TransferMrkt.com.
## Defining constants
1. We need to define positions of players for parsing
    - This is based on the positions used in TransferMrkt.com
2. If data does not exist, we replace it with an `UNK` tag

In [3]:
# Define some constants
POSITIONS = ['Goalkeeper', 'Sweeper', 
             'Defender', 'Centre-Back', 'Left-Back', 'Right-Back',
             'Midfielder', 'Defensive Midfield', 'Central Midfield', 'Right Midfield', 'Left Midfield', 'Attacking Midfield',
             'Striker', 'Left Winger', 'Right Winger', 'Second Striker', 'Centre-Forward']
UNK = 'UNK'

# Get links of top 5 European Leagues from [transfermrket](https://www.transfermarkt.com/)
Here, we are only focusing on player data from the top 5 leagues in Europe. This could be open to further extension in the future.

In [4]:
import requests

# Specify the URL to web scrap from and include necessary headers
premier_league = "https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1"
laliga = "https://www.transfermarkt.com/primera-division/startseite/wettbewerb/ES1"
bundesliga = "https://www.transfermarkt.com/bundesliga/startseite/wettbewerb/L1"
serie_a = "https://www.transfermarkt.com/serie-a/startseite/wettbewerb/IT1"
ligue_1 = "https://www.transfermarkt.com/ligue-1/startseite/wettbewerb/FR1"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}
premier_league_data = requests.get(premier_league, headers = headers)
laliga_data = requests.get(laliga, headers = headers)
bundesliga_data = requests.get(bundesliga, headers = headers)
serie_a_data = requests.get(serie_a, headers = headers)
ligue_1_data = requests.get(ligue_1, headers = headers)

In [5]:
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO
from tqdm import tqdm

# Function to get team URLs from soup
def get_team_urls(soup):
    # Get team URLs for league
    table = soup = soup.select("table.items")[0]
    stats = table.find_all("a")
    links = [l.get("href") for l in stats]
    links = set([l for l in links if '/kader/' in l])
    team_urls = [f"https://www.transfermarkt.com{l}" for l in links]
    return team_urls

# Function to get a generic profile of players
def get_generic_player_urls(team_urls):
    team_player_urls = []
    with tqdm(total=len(team_urls)) as progress_bar:
        for team_url in team_urls:
            team_data = requests.get(team_url, headers = headers)
            team_soup = BeautifulSoup(team_data.text)
            player_stats = team_soup.select("table.items")[0]
            links = player_stats.find_all("a")
            links = [l.get("href") for l in links]
            links = [l for l in links if '/profil/' in l]
            player_urls = [f"https://www.transfermarkt.com{l}" for l in links]
            team_player_urls.extend(player_urls)
            progress_bar.update(1)
    return team_player_urls
        
# Function to get detailed profile of player
def get_detailed_player_urls(team_player_urls):
    team_player_detailed_urls = []
    with tqdm(total=len(team_player_urls)) as progress_bar:
        for player_url in team_player_urls:
            player_data = requests.get(player_url, headers = headers)
            # Read the html from generic player profile page
            player_data_soup = BeautifulSoup(player_data.text)
            player_detailed_links = player_data_soup.find_all("a")
            # Find the page that contains a more detailed profile
            player_detailed_links = [l.get("href") for l in player_detailed_links]
            player_detailed_links = [l for l in player_detailed_links if '/leistungsdaten/' in l]
            # This is only for season 23/24 (change suffix to get other seasons)
            player_detailed_urls = [f"https://www.transfermarkt.com{l}/plus/0?saison=2023" for l in player_detailed_links]
            team_player_detailed_urls.extend(player_detailed_urls)
            progress_bar.update(1)
    return team_player_detailed_urls

# Function to get injury history url of player
def get_injury_history_url(team_player_urls):
    injury_history_urls = []
    with tqdm(total=len(team_player_urls)) as progress_bar:
        for player_url in team_player_urls:
            player_injury_link = player_url.replace("profil", "verletzungen")
            injury_history_urls.append(player_injury_link)
            progress_bar.update(1)
    return injury_history_urls

# Function to get injury stats
def get_injury_stats(player_injury_history):
    player_injury_stats = requests.get(player_injury_history, headers = headers)
    player_injury_stats_soup = BeautifulSoup(player_injury_stats.text)
    try:
        read = pd.read_html(StringIO(player_injury_stats.text), attrs = {'class': 'items'})
    except:
        # Means that player has no injury record
        pass

    injury = {}
    injury_stats = read[0]

    # We are only interested in this particular season
    injury_stats = (injury_stats.loc[injury_stats['Season'] == '23/24'])
    for index, row in injury_stats.iterrows():
        injury_details = {}
        key = row['Injury']
        injury_details['Days'] = row['Days']
        injury_details['Games missed'] = row['Games missed']
        injury[key] = injury_details

    general_injury_stats = read[1]
    general_injury_stats = (general_injury_stats.loc[general_injury_stats['Season'] == '23/24'])
    for index, row in general_injury_stats.iterrows():
        total_days = row['Days']
        total_games_missed = row['Games missed']
        
    return injury, total_days, total_games_missed

# Function to get injury records of player
def get_injury_record(injury_history_urls):
    injury_record_dict = {'Name': [], 'Injury': [], 'Total Days': [], 'Total Games Missed': []}
    with tqdm(total=len(injury_history_urls)) as progress_bar:
        for player_injury_history in injury_history_urls:
            player_injury_stats = requests.get(player_injury_history, headers = headers)
            player_injury_stats_soup = BeautifulSoup(player_injury_stats.text)
            try:
                read = pd.read_html(StringIO(player_injury_stats.text), attrs = {'class': 'items'})
            except:
                # Means that player has no injury record
                pass
              
            injury = {}
            name = player_injury_stats_soup.find_all("img", {"class": "data-header__profile-image"})[0].get("title")
            if not name:
                injury_record_dict['Name'].append(UNK)
            else:
                injury_record_dict['Name'].append(name)
            injury_stats = read[0]
            
            # We are only interested in this particular season
            injury_stats = (injury_stats.loc[injury_stats['Season'] == '23/24'])
            for index, row in injury_stats.iterrows():
                injury_details = {}
                key = row['Injury']
                injury_details['Days'] = row['Days']
                injury_details['Games missed'] = row['Games missed']
                injury[key] = injury_details
            injury_record_dict['Injury'].append(injury)
            
            general_injury_stats = read[1]
            general_injury_stats = (general_injury_stats.loc[general_injury_stats['Season'] == '23/24'])
            for index, row in general_injury_stats.iterrows():
                injury_record_dict['Total Days'] = row['Days']
                injury_record_dict['Total Games Missed'] = row['Games missed']
            progress_bar.update(1)
    players_injury_df = pd.DataFrame(injury_record_dict)
    return players_injury_df

def get_players_info(player_detailed_url):
    player_stats = requests.get(player_detailed_url, headers = headers)
    player_stats_soup = BeautifulSoup(player_stats.text)
    try:
        read = pd.read_html(StringIO(player_stats.text), attrs = {'class': 'items'})
    except:
        # Means that player made no appearances in the season
        print("Player made no appearances in the season")
    
    name = player_stats_soup.find_all("img", {"class": "data-header__profile-image"})[0].get("title")
    if not name:
        name = UNK

    birthday = player_stats_soup.select_one("span[itemprop*=birthDate]")
    if not birthday:
        print(name, "No Birthday") # For debugging
        birthday = UNK
    else:
        birthday = birthday.text.strip()
        
    height = player_stats_soup.select_one("span[itemprop*=height]")
    if not height:
        print(name, "No Height") # For debugging
        height = UNK
    else:
        height = height.text.strip()
    
    club = player_stats_soup.find_all("span", attrs= {"class": "data-header__club"})
    if len(club) == 0:
        print(name, "No Club") # For debugging
        club = UNK
    else:
        club = club[0].select("a")[0].get_text().strip()
        
    league = player_stats_soup.find_all("span", attrs = {"class": "data-header__league"})
    if len(league) == 0:
        print(name, "No League") # For debugging
        league = UNK
    else:
        league = league[0].get_text().strip()
        
    labels = player_stats_soup.find_all("span", attrs= {"class": "data-header__content"})
    if len([l.get_text().strip() for l in labels if any(pos in l.get_text().strip() for pos in POSITIONS)]) == 0:
        print(labels)
    position = [l.get_text().strip() for l in labels if any(pos in l.get_text().strip() for pos in POSITIONS)][0]
    
    stats = read[0]
    rows, cols = [-1], [1, 3, -1]
    season = stats.iloc[-1, 1]
    appearances = stats.iloc[-1, 3]
    
    minutes_played_label = "Minutes Played " + season
    minutes_played = stats.iloc[-1, -1]
    
    return name, birthday, height, club, league, position, appearances, minutes_played_label, minutes_played    

# Function to get players' info as 
def get_players_df(mapping):
    # Initialise a default dictionary to store player attributes
    player_dict = {'Name': [], 'Birthday': [], 'Height': [], 'League': [], 'Club': [], 'Position': [], 'Appearances': [],
                  'Injuries': [], 'Total Days': [], 'Total Games Missed': []}

    # Extract attributes from player urls
    with tqdm(total=len(mapping)) as progress_bar:
        for player_detailed_url, player_injury_history in mapping.items():
            try:
                name, birthday, height, club, league, position, appearances, minutes_played_label, minutes_played = get_players_info(player_detailed_url)
            except:
                continue 
                
            player_dict['Name'].append(name)
            player_dict['Birthday'].append(birthday)
            player_dict['Height'].append(height)
            player_dict['Club'].append(club)
            player_dict['League'].append(league)
            player_dict['Position'].append(position)
            player_dict['Appearances'].append(appearances)
            
            if minutes_played_label not in player_dict.keys():
                player_dict[minutes_played_label] = [minutes_played]
            else:
                player_dict[minutes_played_label].append(minutes_played)            
            
            try:
                injuries, total_days, total_games_missed = get_injury_stats(player_injury_history)
            except:
                # No injury record
                player_dict['Injuries'].append({})
                player_dict['Total Days'].append('0 days')
                player_dict['Total Games Missed'].append(0)
                assert(same_length(player_dict))
                progress_bar.update(1)
                continue
            player_dict['Injuries'].append(injuries)
            player_dict['Total Days'].append(total_days)
            player_dict['Total Games Missed'].append(total_games_missed)
            assert(same_length(player_dict))
            progress_bar.update(1)
            
    # Convert dictionary into data frame
    players_df = pd.DataFrame(player_dict)
    return players_df

# Helper function to check consistency of dictionary
def same_length(dict):
    keys = list(dict.keys())
    for i in range(len(keys) - 1):
        curr = keys[i]
        next = keys[i + 1]
        if not len(dict[curr]) == len(dict[next]):
            print(dict[keys[0]], curr, next)
            return False
    return True

In [7]:
# Premier League Dataframe
premier_league_soup = BeautifulSoup(premier_league_data.text)
team_urls = get_team_urls(premier_league_soup)
generic_player_urls = get_generic_player_urls(team_urls)
detailed_player_urls = get_detailed_player_urls(generic_player_urls)
player_injury_history_urls = get_injury_history_url(generic_player_urls)
assert(len(detailed_player_urls) == len(player_injury_history_urls))

# Map each player profile to the injury history
mapping = {}
for i in range(len(player_injury_history_urls)):
    mapping[detailed_player_urls[i]] = player_injury_history_urls[i]

prem_players_df = get_players_df(mapping)
prem_players_df.to_csv('prem_player_data.csv', index=False)
prem_players_df.head()

100%|██████████| 20/20 [00:19<00:00,  1.03it/s]
100%|██████████| 601/601 [06:50<00:00,  1.46it/s]
100%|██████████| 601/601 [00:00<00:00, 412093.62it/s]
  7%|▋         | 43/601 [01:46<38:45,  4.17s/it]

Player made no appearances in the season


 19%|█▉        | 116/601 [06:29<28:11,  3.49s/it]

Player made no appearances in the season


 21%|██        | 124/601 [07:00<28:03,  3.53s/it]

Daniel Gore No Height


 48%|████▊     | 290/601 [16:00<14:36,  2.82s/it]

Player made no appearances in the season


 51%|█████     | 305/601 [16:46<12:23,  2.51s/it]

Player made no appearances in the season


 52%|█████▏    | 313/601 [17:19<19:05,  3.98s/it]

Player made no appearances in the season


 52%|█████▏    | 315/601 [17:30<23:37,  4.95s/it]

Player made no appearances in the season


 66%|██████▌   | 398/601 [21:40<12:55,  3.82s/it]

Joe Taylor No Height


 76%|███████▋  | 459/601 [24:47<06:34,  2.78s/it]

Ryan Trevitt No Height


 87%|████████▋ | 524/601 [28:09<03:16,  2.55s/it]

Player made no appearances in the season


 95%|█████████▌| 573/601 [30:32<01:24,  3.04s/it]

Player made no appearances in the season


 99%|█████████▊| 593/601 [31:32<00:25,  3.19s/it]


Unnamed: 0,Name,Birthday,Height,League,Club,Position,Appearances,Injuries,Total Days,Total Games Missed,Minutes Played Total 23/24:
0,Bernd Leno,"Mar 4, 1992 (32)","1,90 m",Premier League,Fulham,Goalkeeper,41,"{'Nose surgery': {'Days': '16 days', 'Games mi...",16 days,3,3.690'
1,Marek Rodák,"Dec 13, 1996 (27)","1,94 m",Premier League,Fulham,Goalkeeper,5,{},0 days,0,450'
2,Steven Benda,"Oct 1, 1998 (25)","1,92 m",Premier League,Fulham,Goalkeeper,3,{},0 days,0,270'
3,Tosin Adarabioyo,"Sep 24, 1997 (26)","1,96 m",Premier League,Fulham,Centre-Back,25,"{'Knock': {'Days': '12 days', 'Games missed': ...",99 days,12,1.995'
4,Issa Diop,"Jan 9, 1997 (27)","1,94 m",Premier League,Fulham,Centre-Back,25,"{'Calf problems': {'Days': '35 days', 'Games m...",112 days,16,1.949'


In [8]:
# LaLiga Dataframe
laliga_soup = BeautifulSoup(laliga_data.text)
team_urls = get_team_urls(laliga_soup)
generic_player_urls = get_generic_player_urls(team_urls)
detailed_player_urls = get_detailed_player_urls(generic_player_urls)
player_injury_history_urls = get_injury_history_url(generic_player_urls)
assert(len(detailed_player_urls) == len(player_injury_history_urls))

# Map each player profile to the injury history
mapping = {}
for i in range(len(player_injury_history_urls)):
    mapping[detailed_player_urls[i]] = player_injury_history_urls[i]

laliga_players_df = get_players_df(mapping)
laliga_players_df.to_csv('laliga_player_data.csv', index=False)
laliga_players_df.head()

100%|██████████| 20/20 [00:17<00:00,  1.12it/s]
100%|██████████| 511/511 [06:04<00:00,  1.40it/s]
100%|██████████| 511/511 [00:00<00:00, 561511.49it/s]
  4%|▍         | 21/511 [00:58<20:19,  2.49s/it]

Player made no appearances in the season


 12%|█▏        | 59/511 [02:58<39:45,  5.28s/it]

Carlos Romero No Height


 19%|█▉        | 96/511 [04:41<20:22,  2.95s/it]

Damián Rodríguez No Height


 34%|███▍      | 176/511 [08:38<11:44,  2.10s/it]

Marcos Peña No Height
Marcos Peña No League


 62%|██████▏   | 315/511 [15:15<07:23,  2.26s/it]

Juanma Herzog No League


 79%|███████▉  | 406/511 [20:47<05:15,  3.00s/it]

Player made no appearances in the season
Player made no appearances in the season


 88%|████████▊ | 448/511 [24:05<03:02,  2.90s/it]

Pablo Gozálbez No Height


 89%|████████▊ | 453/511 [24:20<02:46,  2.87s/it]

Hugo González No Height


 99%|█████████▉| 508/511 [27:32<00:09,  3.25s/it]


Unnamed: 0,Name,Birthday,Height,League,Club,Position,Appearances,Injuries,Total Days,Total Games Missed,Minutes Played Total 23/24:
0,Álex Remiro,"Mar 24, 1995 (29)","1,91 m",LaLiga,Real Sociedad,Goalkeeper,49,{},0 days,0,4.386'
1,Unai Marrero,"Oct 9, 2001 (22)","1,89 m",LaLiga,Real Sociedad,Goalkeeper,5,{},0 days,0,411'
2,Robin Le Normand,"Nov 11, 1996 (27)","1,87 m",LaLiga,Real Sociedad,Centre-Back,43,"{'muscular problems': {'Days': '6 days', 'Game...",6 days,1,3.623'
3,Igor Zubeldia,"Mar 30, 1997 (27)","1,81 m",LaLiga,Real Sociedad,Centre-Back,43,"{'Hamstring injury': {'Days': '28 days', 'Game...",28 days,5,3.698'
4,Aritz Elustondo,"Mar 28, 1994 (30)","1,80 m",LaLiga,Real Sociedad,Centre-Back,25,"{'Toe injury': {'Days': '50 days', 'Games miss...",111 days,18,1.234'


In [9]:
# Bundesliga Dataframe
bundesliga_soup = BeautifulSoup(bundesliga_data.text)
team_urls = get_team_urls(bundesliga_soup)
generic_player_urls = get_generic_player_urls(team_urls)
detailed_player_urls = get_detailed_player_urls(generic_player_urls)
player_injury_history_urls = get_injury_history_url(generic_player_urls)
assert(len(detailed_player_urls) == len(player_injury_history_urls))

# Map each player profile to the injury history
mapping = {}
for i in range(len(player_injury_history_urls)):
    mapping[detailed_player_urls[i]] = player_injury_history_urls[i]

bundesliga_players_df = get_players_df(mapping)
bundesliga_players_df.to_csv('bundesliga_player_data.csv', index=False)
bundesliga_players_df.head()

100%|██████████| 18/18 [00:15<00:00,  1.14it/s]
100%|██████████| 504/504 [06:35<00:00,  1.27it/s]
100%|██████████| 504/504 [00:00<00:00, 541200.52it/s]
  4%|▍         | 19/504 [01:06<30:59,  3.83s/it]

Player made no appearances in the season


 31%|███       | 154/504 [07:58<13:35,  2.33s/it]

Player made no appearances in the season


 34%|███▎      | 170/504 [08:57<11:46,  2.12s/it]

Player made no appearances in the season


 35%|███▍      | 174/504 [09:09<14:00,  2.55s/it]

Player made no appearances in the season


 44%|████▎     | 220/504 [11:33<13:47,  2.91s/it]

Player made no appearances in the season


 48%|████▊     | 243/504 [12:42<10:17,  2.37s/it]

Player made no appearances in the season


 50%|█████     | 253/504 [13:08<10:06,  2.42s/it]

Player made no appearances in the season


 57%|█████▋    | 286/504 [14:48<07:02,  1.94s/it]

Player made no appearances in the season


 74%|███████▎  | 371/504 [18:59<12:30,  5.64s/it]

Player made no appearances in the season


 87%|████████▋ | 436/504 [22:15<02:42,  2.39s/it]

Player made no appearances in the season
Max Wendt No League


 98%|█████████▊| 494/504 [25:02<00:30,  3.04s/it]


Unnamed: 0,Name,Birthday,Height,League,Club,Position,Appearances,Injuries,Total Days,Total Games Missed,Minutes Played Total 23/24:
0,Koen Casteels,"Jun 25, 1992 (31)","1,97 m",Bundesliga,VfL Wolfsburg,Goalkeeper,27,"{'Shoulder injury': {'Days': '40 days', 'Games...",40 days,6,2.460'
1,Pavao Pervan,"Nov 13, 1987 (36)","1,94 m",Bundesliga,VfL Wolfsburg,Goalkeeper,10,"{'Finger injury': {'Days': '10 days', 'Games m...",10 days,1,900'
2,Niklas Klinger,"Oct 13, 1995 (28)","1,87 m",Bundesliga,VfL Wolfsburg,Goalkeeper,-,{},0 days,0,-
3,Maxence Lacroix,"Apr 6, 2000 (24)","1,90 m",Bundesliga,VfL Wolfsburg,Centre-Back,31,{},0 days,0,2.665'
4,Moritz Jenz,"Apr 30, 1999 (25)","1,90 m",Bundesliga,VfL Wolfsburg,Centre-Back,23,"{'Ill': {'Days': '9 days', 'Games missed': 2},...",23 days,4,1.864'


In [6]:
# Serie A Dataframe
serie_a_soup = BeautifulSoup(serie_a_data.text)
team_urls = get_team_urls(serie_a_soup)
generic_player_urls = get_generic_player_urls(team_urls)
detailed_player_urls = get_detailed_player_urls(generic_player_urls)
player_injury_history_urls = get_injury_history_url(generic_player_urls)
assert(len(detailed_player_urls) == len(player_injury_history_urls))

# Map each player profile to the injury history
mapping = {}
for i in range(len(player_injury_history_urls)):
    mapping[detailed_player_urls[i]] = player_injury_history_urls[i]

serie_a_players_df = get_players_df(mapping)
serie_a_players_df.to_csv('serie_a_player_data.csv', index=False)
serie_a_players_df.head()

100%|██████████| 20/20 [00:18<00:00,  1.08it/s]
100%|██████████| 551/551 [05:39<00:00,  1.62it/s]
100%|██████████| 551/551 [00:00<00:00, 437038.86it/s]
 12%|█▏        | 64/551 [02:12<15:16,  1.88s/it]

Diego Fornari No Height


 19%|█▊        | 103/551 [03:31<15:54,  2.13s/it]

David Pejičić No Height


 20%|█▉        | 109/551 [03:48<17:16,  2.34s/it]

Player made no appearances in the season


 29%|██▉       | 159/551 [06:01<14:41,  2.25s/it]

Player made no appearances in the season


 49%|████▉     | 271/551 [10:09<10:19,  2.21s/it]

Pasquale Allocca No Height


 53%|█████▎    | 291/551 [10:48<07:41,  1.78s/it]

Andres Sfait No Height


 54%|█████▍    | 298/551 [11:01<07:34,  1.80s/it]

Player made no appearances in the season


 55%|█████▍    | 301/551 [11:07<07:30,  1.80s/it]

Lovro Stubljar No Height


 56%|█████▌    | 306/551 [11:16<07:22,  1.81s/it]

Player made no appearances in the season


 57%|█████▋    | 315/551 [11:34<06:50,  1.74s/it]

Player made no appearances in the season


 90%|████████▉ | 495/551 [17:36<02:05,  2.23s/it]

Player made no appearances in the season


 99%|█████████▉| 545/551 [19:12<00:12,  2.11s/it]


Unnamed: 0,Name,Birthday,Height,League,Club,Position,Appearances,Injuries,Total Days,Total Games Missed,Minutes Played Total 23/24:
0,Mike Maignan,"Jul 3, 1995 (28)","1,91 m",Serie A,AC Milan,Goalkeeper,42,"{'Finger injury': {'Days': '13 days', 'Games m...",47 days,9,3.702'
1,Marco Sportiello,"May 10, 1992 (32)","1,92 m",Serie A,AC Milan,Goalkeeper,9,"{'muscular problems': {'Days': '88 days', 'Gam...",88 days,18,708'
2,Lapo Nava,"Jan 22, 2004 (20)","1,97 m",Serie A,AC Milan,Goalkeeper,4,{},0 days,0,272'
3,Antonio Mirante,"Jul 8, 1983 (40)","1,93 m",Serie A,AC Milan,Goalkeeper,3,"{'unknown injury': {'Days': '55 days', 'Games ...",63 days,10,268'
4,Fikayo Tomori,"Dec 19, 1997 (26)","1,85 m",Serie A,AC Milan,Centre-Back,35,{'Hamstring muscle injury': {'Days': '65 days'...,65 days,13,2.843'


In [8]:
# Ligue 1 Dataframe
ligue_1_soup = BeautifulSoup(ligue_1_data.text)
team_urls = get_team_urls(ligue_1_soup)
generic_player_urls = get_generic_player_urls(team_urls)
detailed_player_urls = get_detailed_player_urls(generic_player_urls)
player_injury_history_urls = get_injury_history_url(generic_player_urls)
assert(len(detailed_player_urls) == len(player_injury_history_urls))

# Map each player profile to the injury history
mapping = {}
for i in range(len(player_injury_history_urls)):
    mapping[detailed_player_urls[i]] = player_injury_history_urls[i]

ligue_1_players_df = get_players_df(mapping)
ligue_1_players_df.to_csv('ligue_1_player_data.csv', index=False)
ligue_1_players_df.head()

100%|██████████| 18/18 [00:18<00:00,  1.00s/it]
100%|██████████| 467/467 [04:41<00:00,  1.66it/s]
100%|██████████| 467/467 [00:00<00:00, 393637.45it/s]
 10%|█         | 47/467 [01:26<11:13,  1.60s/it]

Player made no appearances in the season


 17%|█▋        | 81/467 [02:31<13:02,  2.03s/it]

Player made no appearances in the season


 27%|██▋       | 124/467 [03:56<10:27,  1.83s/it]

Daouda Traoré No League


 39%|███▉      | 184/467 [05:56<09:13,  1.95s/it]

Player made no appearances in the season


 40%|████      | 188/467 [06:05<09:12,  1.98s/it]

Player made no appearances in the season


 51%|█████     | 236/467 [07:52<08:53,  2.31s/it]

Player made no appearances in the season


 71%|███████   | 330/467 [11:06<04:15,  1.86s/it]

Maïdine Douane No Height


 89%|████████▊ | 414/467 [13:38<01:48,  2.04s/it]

Player made no appearances in the season


 92%|█████████▏| 428/467 [14:05<01:16,  1.96s/it]

Amadou Koné No Height


 99%|█████████▊| 461/467 [15:12<00:11,  1.98s/it]


Unnamed: 0,Name,Birthday,Height,League,Club,Position,Appearances,Injuries,Total Days,Total Games Missed,Minutes Played Total 23/24:
0,Alaa Bellaarouch,"Feb 1, 2002 (22)","1,88 m",Ligue 1,R. Strasbourg,Goalkeeper,19,{},0 days,0,1.710'
1,Matthieu Dreyer,"Mar 20, 1989 (35)","1,88 m",Ligue 1,R. Strasbourg,Goalkeeper,2,{},0 days,0,119'
2,Alexandre Pierre,"Feb 25, 2001 (23)","1,91 m",Ligue 1,R. Strasbourg,Goalkeeper,2,{},0 days,0,180'
3,Abakar Sylla,"Dec 25, 2002 (21)","1,88 m",Ligue 1,R. Strasbourg,Centre-Back,25,{},0 days,0,2.062'
4,Ismaël Doukouré,"Jul 24, 2003 (20)","1,83 m",Ligue 1,R. Strasbourg,Centre-Back,24,"{'Ligament stretching': {'Days': '0 days', 'Ga...",0 days,1,1.776'


# Extracting Team Data
In this section, we are interested in extracting data specific to each team. Data will include team statistics like: **name, birthday, height, club, league, position, appearances, injuries, etc**. Our source mainly comes from [whoscored.com](https://www.whoscored.com/).

In [6]:
# Specify the URL to web scrap from and include necessary headers
premier_league = "https://fbref.com/en/comps/9/Premier-League-Stats"
laliga = "https://fbref.com/en/comps/12/La-Liga-Stats"
bundesliga = "https://fbref.com/en/comps/20/Bundesliga-Stats"
serie_a = "https://fbref.com/en/comps/11/Serie-A-Stats"
ligue_1 = "https://fbref.com/en/comps/13/Ligue-1-Stats"

premier_league_data = requests.get(premier_league, headers = headers)
laliga_data = requests.get(laliga, headers = headers)
bundesliga_data = requests.get(bundesliga, headers = headers)
serie_a_data = requests.get(serie_a, headers = headers)
ligue_1_data = requests.get(ligue_1, headers = headers)

In [7]:
# Function to get team URLs from soup
def get_team_stats_urls(soup):
    # Get team URLs for league
    table = soup.select("table.stats_table")[0]
    stats = table.find_all("a")
    # Extract team data (text and href)
    team_data = []
    for link in stats:
        if '/squads/' in link.get("href"):
            team_name = link.text.strip()  # Extract and clean team name text
            team_url = f"https://fbref.com{link.get('href')}"
            team_data.append((team_name, team_url))
    return team_data

def get_goalkeeping_stats(team_stats_page):
    try:
        read = pd.read_html(StringIO(team_stats_page.text), match = "Goalkeeping")
        basic_goalkeeping_stats = read[0]
    
        # We are only interested in overall team stats
        row = -2
        # Extract some basic statistics
        goals_against_per_90 = basic_goalkeeping_stats.iloc[row, 9]
        shots_on_target_against = basic_goalkeeping_stats.iloc[row, 10]
        saves_percentage = basic_goalkeeping_stats.iloc[row, 12]

        advanced_goalkeeping_stats = read[1]
        # Extract some advanced statistics
        passes_attempted = advanced_goalkeeping_stats.iloc[row, 17]
        throws_attempted = advanced_goalkeeping_stats.iloc[row, 18]
        crosses_faced = advanced_goalkeeping_stats.iloc[row, 24]
        sweeper_per_90 = advanced_goalkeeping_stats.iloc[row, 28]
        return goals_against_per_90, shots_on_target_against, saves_percentage, passes_attempted, throws_attempted, crosses_faced, sweeper_per_90
    except:
        print(team_stats_page)
        pass

def get_defending_stats(team_stats_page):
    read = pd.read_html(StringIO(team_stats_page.text), match = "Defensive")
    defensive_actions_stats = read[0]
    # We are only interested in overall team stats
    row = -2
    players_tackled = defensive_actions_stats.iloc[row, 5]
    dribblers_tackled = defensive_actions_stats.iloc[row, 10]
    blocks = defensive_actions_stats.iloc[row, 14]
    interceptions = defensive_actions_stats.iloc[row, 17]
    clearances = defensive_actions_stats.iloc[row, 19]
    return players_tackled, dribblers_tackled, blocks, interceptions, clearances
    
def get_shooting_stats(team_stats_page):
    read = pd.read_html(StringIO(team_stats_page.text), match = "Shooting")
    shooting_stats = read[0]
    # We are only interested in overall team stats
    row = -2
    goals = shooting_stats.iloc[row, 5]
    shots = shooting_stats.iloc[row, 6]
    shots_on_target = shooting_stats.iloc[row, 7]
    shots_per_90 = shooting_stats.iloc[row, 9]
    shots_on_target_per_90 = shooting_stats.iloc[row, 10]
    expected_goals = shooting_stats.iloc[row, 17]
    return goals, shots, shots_on_target, shots_per_90, shots_on_target_per_90, expected_goals

def get_passing_stats(team_stats_page):
    read = pd.read_html(StringIO(team_stats_page.text), match = "Passing")
    passing_stats = read[0]
    # We are only interested in overall team stats
    row = -2
    passes_completed = passing_stats.iloc[row, 5]
    passes_attempted = passing_stats.iloc[row, 6]
    crosses = passing_stats.iloc[row, 26]
    progressive_passes = passing_stats.iloc[row, 27]
    return passes_completed, passes_attempted, crosses, progressive_passes

def get_shot_creation_stats(team_stats_page):
    read = pd.read_html(StringIO(team_stats_page.text), match = "Goal and Shot Creation")
    shot_creation_stats = read[0]
    # We are only interested in overall team stats
    row = -2
    shot_creating_actions = shot_creation_stats.iloc[row, 5]
    shot_creating_actions_per_90 = shot_creation_stats.iloc[row, 6]
    goal_creating_actions = shot_creation_stats.iloc[row, 13]
    goal_creating_actions_per_90 = shot_creation_stats.iloc[row, 14]
    return shot_creating_actions, shot_creating_actions_per_90, goal_creating_actions, goal_creating_actions_per_90

def get_possession_stats(team_stats_page):
    read = pd.read_html(StringIO(team_stats_page.text), match = "Possession")
    possession_stats = read[0]
    # We are only interested in overall team stats
    row = -2
    defensive_touches = possession_stats.iloc[row, 6] + possession_stats.iloc[row, 7]
    attacking_touches = possession_stats.iloc[row, 8] + possession_stats.iloc[row, 9] + possession_stats.iloc[row, 10]
    take_ons_attempted = possession_stats.iloc[row, 12]
    take_ons_tackled = possession_stats.iloc[row, 15]
    return defensive_touches, attacking_touches, take_ons_attempted, take_ons_tackled

def get_misc_stats(team_stats_page):
    read = pd.read_html(StringIO(team_stats_page.text), match = "Miscellaneous")
    misc_stats = read[0]
    # We are only interested in overall team stats
    row = -2
    yellow_cards = misc_stats.iloc[row, 5]
    red_cards = misc_stats.iloc[row, 6]
    fouls_committed = misc_stats.iloc[row, 8]
    fouls_drawn = misc_stats.iloc[row, 9]
    aerial_won = misc_stats.iloc[row, 18]
    aerial_lost = misc_stats.iloc[row, 19]
    return yellow_cards, red_cards, fouls_committed, fouls_drawn, aerial_won, aerial_lost

def trim_team_name(header):
    to_remove = ['2023-2024', 'Stats']
    header = header.split()
    resultwords  = [word for word in header if word not in to_remove]
    result = ' '.join(resultwords)
    return result

In [8]:
def get_team_df(team_stats_urls):
    team_dict = {'Team': [], 'Goals Against Per 90': [], 'Shots On Target Against': [], 'Saves Percentage': [], 'GK Passes attemped': [], 
                  'GK Throws Attempted': [], 'GK Crosses Faced': [], 'Sweeper per 90': [], 'Players Tackled': [], 'Dribblers Tackled': [],
                  'Blocks': [], 'Interceptions': [], 'Clearances': [], 'Goals': [], 'Shots': [], 'Shots on Target': [], 'Shots per 90': [],
                  'Shots on Target per 90': [], 'Expected Goals': [], 'Passes Completed': [], 'Passes Attempted': [], 'Crosses': [],
                  'Progressive Passes': [], 'Shot Creating Actions': [], 'Shot Creating Actions per 90': [], 'Goal Creating Actions': [],
                  'Goal Creating Actions per 90': [], 'Defensive Touches': [], 'Attacking Touches': [], 'Take Ons Attempted': [],
                  'Take Ons Tackled': [], 'Yellow Cards': [], 'Red Cards': [], 'Fouls Committed': [], 'Fouls Drawn': [], 'Aerial Won': [],
                  'Aerial Lost': []}
    with tqdm(total=len(team_stats_urls)) as progress_bar:
        for team_name, team_stats_url in team_stats_urls:
            team_dict['Team'].append(team_name)
            team_stats_page = requests.get(team_stats_url, headers = headers)
            team_stats_soup = BeautifulSoup(team_stats_page.text)

            # Get the statistics
            goals_against_per_90, shots_on_target_against, saves_percentage, gk_passes_attempted, gk_throws_attempted, gk_crosses_faced, sweeper_per_90 = get_goalkeeping_stats(team_stats_page)
            team_dict['Goals Against Per 90'].append(goals_against_per_90)
            team_dict['Shots On Target Against'].append(shots_on_target_against)
            team_dict['Saves Percentage'].append(saves_percentage)
            team_dict['GK Passes attemped'].append(gk_passes_attempted)
            team_dict['GK Throws Attempted'].append(gk_throws_attempted)
            team_dict['GK Crosses Faced'].append(gk_crosses_faced)
            team_dict['Sweeper per 90'].append(sweeper_per_90)

            players_tackled, dribblers_tackled, blocks, interceptions, clearances = get_defending_stats(team_stats_page)
            team_dict['Players Tackled'].append(players_tackled)
            team_dict['Dribblers Tackled'].append(dribblers_tackled)
            team_dict['Blocks'].append(blocks)
            team_dict['Interceptions'].append(interceptions)
            team_dict['Clearances'].append(clearances)

            goals, shots, shots_on_target, shots_per_90, shots_on_target_per_90, expected_goals = get_shooting_stats(team_stats_page)
            team_dict['Goals'].append(goals)
            team_dict['Shots'].append(shots)
            team_dict['Shots on Target'].append(shots_on_target)
            team_dict['Shots per 90'].append(shots_per_90)
            team_dict['Shots on Target per 90'].append(shots_on_target_per_90)
            team_dict['Expected Goals'].append(expected_goals)

            passes_completed, passes_attempted, crosses, progressive_passes = get_passing_stats(team_stats_page)
            team_dict['Passes Completed'].append(passes_completed)
            team_dict['Passes Attempted'].append(passes_attempted)
            team_dict['Crosses'].append(crosses)
            team_dict['Progressive Passes'].append(progressive_passes)

            shot_creating_actions, shot_creating_actions_per_90, goal_creating_actions, goal_creating_actions_per_90 = get_shot_creation_stats(team_stats_page)
            team_dict['Shot Creating Actions'].append(shot_creating_actions)
            team_dict['Shot Creating Actions per 90'].append(shot_creating_actions_per_90)
            team_dict['Goal Creating Actions'].append(goal_creating_actions)
            team_dict['Goal Creating Actions per 90'].append(goal_creating_actions_per_90)

            defensive_touches, attacking_touches, take_ons_attempted, take_ons_tackled = get_possession_stats(team_stats_page)
            team_dict['Defensive Touches'].append(defensive_touches)
            team_dict['Attacking Touches'].append(attacking_touches)
            team_dict['Take Ons Attempted'].append(take_ons_attempted)
            team_dict['Take Ons Tackled'].append(take_ons_tackled)

            yellow_cards, red_cards, fouls_committed, fouls_drawn, aerial_won, aerial_lost = get_misc_stats(team_stats_page)
            team_dict['Yellow Cards'].append(yellow_cards)
            team_dict['Red Cards'].append(red_cards)
            team_dict['Fouls Committed'].append(fouls_committed)
            team_dict['Fouls Drawn'].append(fouls_drawn)
            team_dict['Aerial Won'].append(aerial_won)
            team_dict['Aerial Lost'].append(aerial_lost)

            assert(same_length(team_dict))
            progress_bar.update(1)
    
    # Convert dictionary into data frame
    team_df = pd.DataFrame(team_dict)
    return team_df

In [11]:
premier_league_team_soup = BeautifulSoup(premier_league_data.text)
team_stats_urls = get_team_stats_urls(premier_league_team_soup)
prem_team_df = get_team_df(team_stats_urls)
prem_team_df.to_csv('prem_team_data.csv', index=False)
prem_team_df.head()

100%|██████████| 20/20 [00:35<00:00,  1.77s/it]


Unnamed: 0,Team,Goals Against Per 90,Shots On Target Against,Saves Percentage,GK Passes attemped,GK Throws Attempted,GK Crosses Faced,Sweeper per 90,Players Tackled,Dribblers Tackled,...,Defensive Touches,Attacking Touches,Take Ons Attempted,Take Ons Tackled,Yellow Cards,Red Cards,Fouls Committed,Fouls Drawn,Aerial Won,Aerial Lost
0,Manchester City,0.89,109,71.6,1342,161,312,1.66,517,222,...,9174,26074,731,282,53,2,289,410,325,288
1,Arsenal,0.76,83,67.5,1058,189,375,1.55,610,271,...,8688,21447,671,304,64,2,391,394,503,499
2,Liverpool,1.08,147,72.8,1348,192,376,1.21,677,341,...,9583,23047,737,331,69,5,463,376,583,441
3,Aston Villa,1.61,173,65.3,1386,176,419,1.84,547,260,...,10977,16105,757,315,94,2,412,473,334,363
4,Tottenham,1.61,167,67.7,1308,193,511,2.13,724,360,...,11310,21587,812,363,92,4,425,510,359,388


In [None]:
laliga_team_soup = BeautifulSoup(laliga_data.text)
team_stats_urls = get_team_stats_urls(laliga_team_soup)
laliga_team_df = get_team_df(team_stats_urls)
laliga_team_df.to_csv('laliga_team_data.csv', index=False)
laliga_team_df.head()

In [12]:
bundesliga_team_soup = BeautifulSoup(bundesliga_data.text)
team_stats_urls = get_team_stats_urls(bundesliga_team_soup)
bundesliga_team_df = get_team_df(team_stats_urls)
bundesliga_team_df.to_csv('bundesliga_team_data.csv', index=False)
bundesliga_team_df.head()

100%|██████████| 18/18 [00:43<00:00,  2.41s/it]


Unnamed: 0,Team,Goals Against Per 90,Shots On Target Against,Saves Percentage,GK Passes attemped,GK Throws Attempted,GK Crosses Faced,Sweeper per 90,Players Tackled,Dribblers Tackled,...,Defensive Touches,Attacking Touches,Take Ons Attempted,Take Ons Tackled,Yellow Cards,Red Cards,Fouls Committed,Fouls Drawn,Aerial Won,Aerial Lost
0,Leverkusen,0.71,105,77.1,715,171,316,1.18,458,197,...,7550,23370,727,328,59,0,305,342,363,316
1,Stuttgart,1.15,129,72.1,1180,147,396,1.82,537,253,...,9620,20007,673,279,56,1,329,373,430,429
2,Bayern Munich,1.32,124,66.1,1072,170,341,1.41,520,245,...,9238,20993,907,374,47,2,304,306,410,319
3,RB Leipzig,1.15,117,70.1,1189,147,426,1.5,538,244,...,8885,17559,633,238,61,1,354,388,426,394
4,Dortmund,1.26,145,73.8,1267,135,466,0.71,548,292,...,10640,17350,712,312,59,4,315,391,390,426


In [13]:
serie_a_team_soup = BeautifulSoup(serie_a_data.text)
team_stats_urls = get_team_stats_urls(serie_a_team_soup)
serie_a_team_df = get_team_df(team_stats_urls)
serie_a_team_df.to_csv('serie_a_team_data.csv', index=False)
serie_a_team_df.head()

100%|██████████| 20/20 [00:40<00:00,  2.02s/it]


Unnamed: 0,Team,Goals Against Per 90,Shots On Target Against,Saves Percentage,GK Passes attemped,GK Throws Attempted,GK Crosses Faced,Sweeper per 90,Players Tackled,Dribblers Tackled,...,Defensive Touches,Attacking Touches,Take Ons Attempted,Take Ons Tackled,Yellow Cards,Red Cards,Fouls Committed,Fouls Drawn,Aerial Won,Aerial Lost
0,Inter,0.58,113,82.3,1196.0,151.0,453.0,0.42,540.0,234.0,...,10817.0,19019.0,435.0,190.0,46,1,405,408,535.0,370.0
1,Milan,1.29,163,74.8,1248.0,233.0,440.0,1.21,576.0,273.0,...,9907.0,18103.0,765.0,324.0,82,8,429,410,432.0,410.0
2,Juventus,0.82,116,75.0,768.0,144.0,601.0,0.61,577.0,264.0,...,8748.0,16601.0,596.0,269.0,87,2,478,376,486.0,434.0
3,Atalanta,1.11,142,74.6,974.0,213.0,444.0,1.45,613.0,291.0,...,9716.0,17756.0,618.0,270.0,76,1,486,367,627.0,491.0
4,Bologna,0.84,125,76.8,941.0,159.0,505.0,0.5,633.0,303.0,...,11262.0,18781.0,673.0,322.0,85,2,458,456,416.0,413.0


In [14]:
ligue_1_team_soup = BeautifulSoup(ligue_1_data.text)
team_stats_urls = get_team_stats_urls(ligue_1_team_soup)
ligue_1_team_df = get_team_df(team_stats_urls)
ligue_1_team_df.to_csv('ligue_1_team_data.csv', index=False)
ligue_1_team_df.head()

100%|██████████| 18/18 [00:33<00:00,  1.86s/it]


Unnamed: 0,Team,Goals Against Per 90,Shots On Target Against,Saves Percentage,GK Passes attemped,GK Throws Attempted,GK Crosses Faced,Sweeper per 90,Players Tackled,Dribblers Tackled,...,Defensive Touches,Attacking Touches,Take Ons Attempted,Take Ons Tackled,Yellow Cards,Red Cards,Fouls Committed,Fouls Drawn,Aerial Won,Aerial Lost
0,Paris S-G,0.97,147,80.3,879,144,402,0.97,585,283,...,9455,22092,815,364,51,3,353,362,275,267
1,Monaco,1.24,151,75.5,813,159,442,0.76,664,322,...,7820,17558,593,255,79,8,481,400,402,394
2,Brest,1.0,133,75.9,978,144,406,1.27,679,306,...,8874,16028,690,320,84,5,414,422,598,509
3,Lille,1.0,128,75.8,1030,122,399,0.94,594,278,...,8663,17184,680,293,67,3,376,489,355,346
4,Nice,0.85,105,77.1,1025,158,376,0.62,538,285,...,9586,15678,752,342,62,4,402,373,374,360
