In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

In [2]:
# Column names given to the nine scraped cells of every transfer row.
_TRANSFER_COLUMNS = ['Name', 'Age', 'Nationality', 'Position', 'Position_Shortcut',
                     'Market_Value', 'Former_Team', 'Former_Team_Country', 'Fee']


def _first_attr(tag, attr):
    """Return ``attr`` from ``tag`` itself or else from its first descendant carrying it.

    The value is read through BeautifulSoup's attribute API instead of being cut
    out of ``str(tag)``, so HTML entities arrive decoded (``&amp;`` -> ``&``) and
    the result does not depend on attribute order in the markup.
    Returns ``None`` when neither the tag nor any descendant has the attribute.
    """
    if tag.has_attr(attr):
        return tag[attr]
    holder = tag.find(attrs={attr: True})
    return holder[attr] if holder is not None else None


def _cell_value(cell):
    """Extract the display value of one table cell.

    The lookups are tried in the same order as the original nested if/else chain:
    player span, nationality flag, class-less image (used for the former team in
    some seasons' first entry), team crest. The first element found decides the
    value; if nothing matches, or the matched element lacks the attribute, the
    cell's stripped text is used.
    """
    text = cell.get_text(strip=True)
    lookups = (
        ('span', 'hide-for-small', 'title'),   # player info
        ('img', 'flaggenrahmen', 'title'),     # nationality
        ('img', '', 'alt'),                    # former team, image without a class
        ('img', 'tiny_wappen', 'title'),       # former team crest
    )
    for tag_name, css_class, attr in lookups:
        tag = cell.find(tag_name, class_=css_class)
        if tag is not None:
            value = _first_attr(tag, attr)
            return text if value is None else value
    return text


def _clean_market_value(value):
    """Strip the euro sign from a market value; a lone '-' becomes NaN."""
    if value:
        value = value.replace('€', "")
    if value == "-":
        return np.nan
    return value


def _clean_fee(fee):
    """Normalise one raw 'Fee' cell.

    * missing values, '?' and '-'          -> NaN
    * 'free transfer' / 'loan transfer'    -> unchanged
    * anything containing 'End of loan'    -> 'End of loan'
    * everything else                      -> same text without the euro sign
    """
    if pd.isna(fee):
        # Short rows are padded with None by pandas; treat them as unknown fees.
        return np.nan
    if fee in ('free transfer', 'loan transfer'):
        return fee
    if 'End of loan' in fee:
        return 'End of loan'
    if fee in ('?', '-'):
        return np.nan
    return fee.replace('€', '')


def _table_to_frame(table, season):
    """Turn one transfer ``<table>`` into a cleaned DataFrame.

    Raises:
        ValueError: if the table does not have exactly nine cells per row, which
            means it is not a transfer table in the expected layout.
    """
    rows = [
        [_cell_value(cell) for cell in row.find_all(['th', 'td'])]
        for row in table.find_all('tr')
    ]
    # Drop the first row positionally (the original "fix headers" step); the
    # remaining rows keep their 1..n labels exactly as before.
    frame = pd.DataFrame(rows).iloc[1:].copy()
    if frame.shape[1] != len(_TRANSFER_COLUMNS):
        raise ValueError(
            f"Expected {len(_TRANSFER_COLUMNS)} cells per transfer row, "
            f"found {frame.shape[1]}; the table layout is not the expected one."
        )
    frame.columns = _TRANSFER_COLUMNS
    frame['Season'] = f"{season}/{season+1}"  # Adding season column

    # The loan end date is embedded in the raw fee text; pull it out before the
    # fee column is normalised below.
    date_pattern = r'End of loan(\w+ \d{1,2}, \d{4})'
    loan_end = frame['Fee'].str.extract(date_pattern, expand=False)
    frame['Loan_End_Date'] = pd.to_datetime(loan_end, format='%b %d, %Y')

    frame['Market_Value'] = frame['Market_Value'].apply(_clean_market_value)
    frame['Fee'] = frame['Fee'].apply(_clean_fee)
    return frame


def transfer_webscrap(season, timeout=30):
    """Scrape one Premier League season's transfer tables from transfermarkt.co.uk.

    Args:
        season: Start year of the season as an int (e.g. 2020 for "2020/2021").
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        DataFrame with the columns Name, Age, Nationality, Position,
        Position_Shortcut, Market_Value, Former_Team, Former_Team_Country, Fee,
        Loan_End_Date, New_Team and Season. Row labels are the per-table row
        numbers and therefore repeat across clubs.

    Raises:
        requests.HTTPError: if the page request returns an error status.
        ValueError: if no club headline is found or a table has an unexpected layout.
        IndexError: if the page has fewer tables than the club list requires.
    """
    request_headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"}
    page = f"https://www.transfermarkt.co.uk/premier-league/transfers/wettbewerb/GB1?saison_id={season}&s_w=s5"
    # Without a timeout a stalled connection would block the notebook forever;
    # without the status check an error page would be parsed as if it were data.
    response = requests.get(page, headers=request_headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    club_headlines = soup.find_all("h2", class_="content-box-headline content-box-headline--inverted content-box-headline--logo")
    # Decoded titles make the former "Brighton &amp; Hove Albion" patch unnecessary.
    team_list = [
        _first_attr(headline, 'title') or headline.get_text(strip=True)
        for headline in club_headlines
    ]
    if not team_list:
        raise ValueError(
            f"No club headlines found for season {season}; the page layout may have changed."
        )
    tables = soup.find_all("table")

    # The i-th club is paired with table 2*i + 1 (tables 1, 3, 5, ...), exactly as
    # the original lookup did. NOTE(review): presumably these are the arrivals
    # tables, given the "Former_Team" columns; verify against the live page.
    dataframes = []
    for position, team_name in enumerate(team_list):
        table_index = 2 * position + 1
        if table_index >= len(tables):
            raise IndexError(
                f"No table at position {table_index} for club {team_name!r}: "
                f"the page only has {len(tables)} tables."
            )
        team_frame = _table_to_frame(tables[table_index], season)
        team_frame["New_Team"] = team_name
        dataframes.append(team_frame)

    # Combine all clubs and put the columns in their final order.
    combined_df = pd.concat(dataframes)
    return combined_df[['Name', 'Age', 'Nationality', 'Position', 'Position_Shortcut', 'Market_Value', 'Former_Team', 'Former_Team_Country', 'Fee', 'Loan_End_Date', 'New_Team', 'Season']]


In [3]:
# Start years of the seasons to scrape, 2020 through 2024 inclusive.
seasons = [*range(2020, 2025)]

In [4]:
# Scrape each season in `seasons`, then stack the per-season frames into a single
# table with a fresh 0..n-1 index.
season_frames = list(map(transfer_webscrap, seasons))
transfers_20_to_24 = pd.concat(season_frames, ignore_index=True)

In [10]:
# Persist the combined table next to the notebook. With to_csv's defaults the
# frame's index is written as an unnamed first column.
output_csv = "transfer_EPL_20-24.csv"
transfers_20_to_24.to_csv(output_csv)