In [1]:
import requests # to make the request to the web address
from bs4 import BeautifulSoup as bs # to pull data from HTML
import csv # to put the data in the csv file
import re # to handle regex
import pandas as pd # to show result
from tqdm.auto import tqdm

from pathlib import Path

KeyboardInterrupt: 

In [None]:
'''
1. format_text function: takes a string, removes escape sequences (newlines, carriage returns, tabs) and collapses repeated white space
'''
def format_text(text):
    """Return ``text`` with control characters removed and white space collapsed.

    Newline, carriage-return and tab characters are deleted outright (they are
    not replaced by a space, so words separated only by one of them end up
    joined). The remaining runs of white space are then reduced to single
    spaces, and leading/trailing white space is dropped.
    """
    # Code points of "\t", "\n" and "\r" mapped to None so translate() deletes them.
    without_controls = text.translate({9: None, 10: None, 13: None})
    words = without_controls.split()
    return " ".join(words)

'''
2. format_currency function: converts a scraped currency string (for example €1.5m or €500k) into a float expressed in millions
'''
def format_currency(value):
    """Convert a scraped currency string into a float expressed in millions.

    The euro sign and the ``Loan fee:`` prefix are removed, and the
    placeholders ``-``, ``?``, ``loan transfer``, ``free transfer`` and
    ``draft`` are treated as zero. A trailing ``m`` leaves the number as is;
    a trailing ``Th.`` or ``k`` divides it by 1000.

    Args:
        value: Text of a fee or market-value cell, e.g. ``'€222.00m'``,
            ``'€500Th.'``, ``'Loan fee:€1.50m'`` or ``'-'``.

    Returns:
        The amount as a float. Empty or white-space-only input yields ``0.0``.

    Raises:
        ValueError: If what is left after cleaning is not a number.
    """
    # Same substitutions as before, in the same order, minus the duplicated
    # '-' replacement.
    for token, replacement in (
        ('€', ''),
        ('-', '0'),
        ('Loan fee:', ''),
        ('?', '0'),
        ('loan transfer', '0'),
        ('free transfer', '0'),
        ('draft', '0'),
    ):
        value = value.replace(token, replacement)

    # Cell text may carry surrounding white space, which would otherwise hide
    # the unit suffix from the checks below.
    value = value.strip()
    if not value:
        # Nothing left to parse: treat like the other "no amount" placeholders
        # instead of failing with IndexError on value[-1].
        return 0.0

    # Drop only the abbreviation dot of "Th." so a decimal point inside the
    # number survives.
    if value.endswith('.'):
        value = value[:-1]

    # (suffix, divisor that brings the amount to millions)
    for suffix, divisor in (('m', 1), ('Th', 1000), ('k', 1000)):
        if value.endswith(suffix):
            # Slice the suffix off the end rather than replacing every
            # occurrence of its characters.
            return float(value[:-len(suffix)]) / divisor
    return float(value)

'''
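3. loan_transform function: tells whether a fee text starts with the word loan (case-insensitive); used to create the new loan column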
'''
def loan_transform(value):
    """Return True when ``value`` begins with "loan", ignoring letter case.

    Only the start of the string is examined, so text that merely contains
    the word later on (or has leading white space) yields False.
    """
    starts_with_loan = re.match('loan', value, re.I)
    return starts_with_loan is not None

'''
4. get_data function: requests each results page, parses the HTML into a soup object, selects the table body inside the element
with the responsive-table class, iterates over the rows carrying the even and odd classes to read their ‘td’ cells, builds
a dictionary with the information we need for each row, appends that player to players_list and finally returns the list
''' 
def _img_alt(cell, default):
    """Return the ``alt`` text of the first image in ``cell``, or ``default`` if it has none."""
    return cell.img['alt'] if cell.img is not None else default


def _link_text(cell, default):
    """Return the text of the first link in ``cell``, or ``default`` if it has none."""
    return cell.a.text if cell.a is not None else default


def _parse_transfer_row(row, season, window):
    """Build one transfer record from a result-table row.

    Raises IndexError when the row holds fewer cells than the fixed cell
    positions used below require.
    """
    # Collect the cells once; every field below is read by position.
    tds = row.find_all('td')
    fee_text = tds[16].text  # the same cell feeds both the fee and the loan flag
    origin_cell, destination_cell = tds[11], tds[15]

    try:
        country_from = origin_cell.img['title']
    except (TypeError, KeyError):
        # No image in the cell (img is None) or the image carries no title.
        country_from = None

    return {
        'name': tds[1].select('td > img')[0]['title'],
        'position': tds[4].text,
        'age': tds[5].text,
        'season':  season,
        'market_value': format_currency(tds[6].text),
        'country_from': country_from,
        'league_from': format_text(_link_text(origin_cell, 'Without League')),
        'club_from': _img_alt(tds[9], 'Without Club'),
        # The fallback used to be ' Without Country' with a stray leading space.
        'country_to': _img_alt(destination_cell, 'Without Country'),
        'league_to': format_text(_link_text(destination_cell, 'Without League')),
        'club_to': _img_alt(tds[13], 'Without Club'),
        'window': 'summer' if window == 'sommertransfers' else 'winter',
        'fee': format_currency(fee_text),
        'loan': loan_transform(fee_text),
    }


def get_data(pages, seasons, timeout=30):
    """Scrape the transfermarkt.com season transfer statistics.

    For every season, for both transfer windows ('sommertransfers' and
    'wintertransfers') and for pages ``1..pages``, the statistics page is
    downloaded and each table row whose class is ``even`` or ``odd`` becomes
    one dictionary.

    Args:
        pages: Number of result pages to request per season and window.
        seasons: Iterable of values used as the ``saison_id`` query parameter.
        timeout: Seconds handed to ``requests.get`` so a stalled connection
            cannot block the scrape indefinitely.

    Returns:
        A list of dicts with the keys name, position, age, season,
        market_value, country_from, league_from, club_from, country_to,
        league_to, club_to, window, fee and loan.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
        ValueError: If a fee or market value cannot be parsed as a number.

    Pages without the expected table are skipped. A row with fewer cells than
    expected is skipped on its own; previously the first such row discarded
    every remaining row of its page.
    """
    headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
    row_class = re.compile("^(even|odd)$")
    players_list = []

    for season in tqdm(seasons, desc='seasons'):
        for window in tqdm(['sommertransfers', 'wintertransfers'], desc='windows', leave=False):
            for page in tqdm(range(1, pages+1), desc='pages', leave=False):
                url = f'https://www.transfermarkt.com/transfers/saisontransfers/statistik/top/plus/1/galerie/0?saison_id={season}&transferfenster={window}&land_id=&ausrichtung=&spielerposition_id=&altersklasse=&leihe=&page={page}'

                html = requests.get(url, headers=headers, timeout=timeout)
                # NOTE(review): no parser is named, so BeautifulSoup picks whichever
                # one is installed; pin one once it is confirmed to parse this page
                # the same way.
                soup = bs(html.content)
                tables = soup.select('.responsive-table > .grid-view > .items > tbody')
                if not tables:
                    # No result table on this page (for example past the last
                    # page); indexing [0] here used to raise IndexError.
                    continue

                for row in tables[0].find_all(True, {"class": row_class}):
                    try:
                        players_list.append(_parse_transfer_row(row, season, window))
                    except IndexError:
                        # Row does not follow the expected cell layout.
                        continue

    return players_list

'''
5. data_to_csv function: receives a list of dictionaries and saves it as a CSV output file
'''
def data_to_csv(data, path='data.csv'):
    """Write a list of dictionaries to a CSV file with a header row.

    Args:
        data: Non-empty list of dicts. The keys of the first dict define the
            columns and their order.
        path: Destination file; defaults to ``data.csv``, resolved against the
            current working directory.

    Raises:
        IndexError: If ``data`` is empty, because there is no row to take the
            column names from. The file is not touched in that case.
        ValueError: Raised by ``csv.DictWriter`` if a later row has a key that
            is missing from the first row.
    """
    if not data:
        raise IndexError('data_to_csv() needs at least one row to derive the CSV header')

    fieldnames = list(data[0])
    # Explicit UTF-8: names contain non-ASCII characters and the platform
    # default encoding is not guaranteed to handle them.
    with open(path, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(data)

In [None]:
# Season ids handed to get_data: 2017 through 2023 inclusive.
seasons = list(range(2017, 2024))
# Request 80 result pages for every season and transfer window.
data = get_data(pages=80, seasons=seasons)

seasons:   0%|          | 0/7 [00:00<?, ?it/s]

windows:   0%|          | 0/2 [00:00<?, ?it/s]

pages:   0%|          | 0/80 [00:00<?, ?it/s]

pages:   0%|          | 0/80 [00:00<?, ?it/s]

windows:   0%|          | 0/2 [00:00<?, ?it/s]

pages:   0%|          | 0/80 [00:00<?, ?it/s]

pages:   0%|          | 0/80 [00:00<?, ?it/s]

windows:   0%|          | 0/2 [00:00<?, ?it/s]

pages:   0%|          | 0/80 [00:00<?, ?it/s]

pages:   0%|          | 0/80 [00:00<?, ?it/s]

windows:   0%|          | 0/2 [00:00<?, ?it/s]

pages:   0%|          | 0/80 [00:00<?, ?it/s]

pages:   0%|          | 0/80 [00:00<?, ?it/s]

windows:   0%|          | 0/2 [00:00<?, ?it/s]

pages:   0%|          | 0/80 [00:00<?, ?it/s]

pages:   0%|          | 0/80 [00:00<?, ?it/s]

windows:   0%|          | 0/2 [00:00<?, ?it/s]

pages:   0%|          | 0/80 [00:00<?, ?it/s]

pages:   0%|          | 0/80 [00:00<?, ?it/s]

windows:   0%|          | 0/2 [00:00<?, ?it/s]

pages:   0%|          | 0/80 [00:00<?, ?it/s]

pages:   0%|          | 0/80 [00:00<?, ?it/s]

In [None]:
# Persist the scraped records to data.csv in the current working directory.
data_to_csv(data)

In [None]:
# Load the scraped rows back from data.csv as a DataFrame and display it.
# Note: from here on `data` is a DataFrame, no longer the list returned by get_data.
data = pd.read_csv(Path.cwd().joinpath('data.csv'))
data

Unnamed: 0,name,position,age,season,market_value,country_from,league_from,club_from,country_to,league_to,club_to,window,fee,loan
0,Neymar,Left Winger,25,2017,100.00,Spain,LaLiga,FC Barcelona,France,Ligue 1,Paris Saint-Germain,summer,222.0,False
1,Ousmane Dembélé,Right Winger,20,2017,33.00,Germany,Bundesliga,Borussia Dortmund,Spain,LaLiga,FC Barcelona,summer,135.0,False
2,Romelu Lukaku,Centre-Forward,24,2017,50.00,England,Premier League,Everton FC,England,Premier League,Manchester United,summer,84.7,False
3,Álvaro Morata,Centre-Forward,24,2017,40.00,Spain,LaLiga,Real Madrid,England,Premier League,Chelsea FC,summer,66.0,False
4,Benjamin Mendy,Left-Back,23,2017,13.00,Monaco,Ligue 1,AS Monaco,England,Premier League,Manchester City,summer,57.5,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27982,Henry Addo,Left Winger,20,2023,0.25,Slovakia,Nike Liga,MSK Zilina,Israel,Ligat ha'Al,Maccabi Tel Aviv,winter,0.5,False
27983,Reda Belahyane,Defensive Midfield,19,2023,0.20,France,Ligue 1,OGC Nice,Italy,Serie A,Hellas Verona,winter,0.5,False
27984,Albert Posiadała,Goalkeeper,20,2023,0.50,Poland,Ekstraklasa,Radomiak Radom,Norway,Eliteserien,Molde FK,winter,0.5,False
27985,Lequincio Zeefuik,Centre-Forward,19,2023,0.50,Netherlands,Eredivisie,FC Volendam,Netherlands,Eredivisie,AZ Alkmaar,winter,0.5,False


In [None]:
def parse_names(x, full=False):
    """Return one part of the string ``x``, chosen by the dots it contains.

    * No dot: the first half of ``x`` (``full`` is ignored).
    * Exactly one dot: ``x`` is cut one character before the dot; the text
      before the cut is returned, or the text from the cut onwards when
      ``full`` is true.
    * Several dots: ``x`` is split on the dots, the first half of the pieces
      (or the second half when ``full`` is true) is re-joined with dots and
      the last character of the result is dropped.

    NOTE(review): this looks meant for strings that hold a name followed by
    its abbreviated form (e.g. 'Lionel MessiL. Messi'), in which case
    ``full=True`` returns the abbreviated form - confirm the flag's meaning.
    """
    dot_count = x.count('.')

    if dot_count == 0:
        return x[:len(x) // 2]

    if dot_count > 1:
        pieces = x.split('.')
        middle = len(pieces) // 2
        selected = pieces[middle:] if full else pieces[:middle]
        return '.'.join(selected)[:-1]

    # Exactly one dot: cut one character before it.
    cut = x.find('.') - 1
    return x[cut:] if full else x[:cut]
            


In [None]:
# Show the transfers whose destination country is Italy.
data.loc[data['country_to'] == 'Italy']

Unnamed: 0,name,position,age,season,market_value,country_from,league_from,club_from,country_to,league_to,club_to,window,fee,loan
10,Leonardo Bonucci,Centre-Back,30,2017,45.0,Italy,Serie A,Juventus FC,Italy,Serie A,AC Milan,summer,42.0,False
17,Federico Bernardeschi,Right Winger,23,2017,30.0,Italy,Serie A,ACF Fiorentina,Italy,Serie A,Juventus FC,summer,40.0,False
19,André Silva,Centre-Forward,21,2017,22.0,Portugal,Liga NOS,FC Porto,Italy,Serie A,AC Milan,summer,38.0,False
27,Milan Skriniar,Centre-Back,22,2017,7.0,Italy,Serie A,UC Sampdoria,Italy,Serie A,Inter Milan,summer,34.0,False
28,Alessandro Bastoni,Centre-Back,18,2017,1.5,Italy,Serie A,Atalanta BC,Italy,Serie A,Inter Milan,summer,31.1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27717,Karlo Lulic,Central Midfield,27,2023,0.6,Italy,Serie A,Frosinone Calcio,Italy,Serie B,SSC Bari,winter,0.0,False
27721,Valerio Mantovani,Centre-Back,27,2023,0.6,Italy,Serie B,Ternana Calcio,Italy,Serie B,Ascoli Calcio,winter,0.0,False
27749,Peter Kovacik,Right-Back,22,2023,0.6,Slovakia,Nike Liga,FK Zeleziarne Podbrezova,Italy,Serie B,Como 1907,winter,0.0,False
27791,Jacopo Petriccione,Defensive Midfield,28,2023,0.6,Italy,Serie C - C,FC Crotone,Italy,Serie B,US Catanzaro,winter,0.0,False


In [None]:
# Keep only the transfers whose destination league is one of these five.
destination_is_listed = data['league_to'].isin(['Ligue 1', 'LaLiga', 'Bundesliga', 'Premier League', 'Serie A'])
data = data.loc[destination_is_listed]

In [None]:
# Save the filtered transfers to transfers.csv in the current working directory.
# NOTE(review): the DataFrame index is written as the first, unnamed column -
# confirm that consumers of this file expect it.
data.to_csv(Path.cwd().joinpath('transfers.csv'))

In [None]:
# The 20 most frequent origin leagues among the remaining transfers.
data['league_from'].value_counts().head(20)

Serie A               1059
Premier League         749
Bundesliga             638
LaLiga                 636
Ligue 1                628
Championship           277
Serie B                190
Jupiler Pro League     170
Ligue 2                156
Eredivisie             151
2. Bundesliga          146
Liga NOS               120
LaLiga SmartBank       102
Super League            98
Süper Lig               97
Premier Liga            95
Série A                 94
Superliga               83
Liga Portugal           58
LaLiga 1|2|3            58
Name: league_from, dtype: int64

In [None]:
# Inspect the rows whose origin league is spelled 'Série A' (with the accent),
# which value_counts lists separately from 'Serie A'.
data.loc[data['league_from'] == 'Série A']

Unnamed: 0,name,position,age,season,market_value,country_from,league_from,club_from,country_to,league_to,club_to,window,fee,loan
130,Thiago Maia,Defensive Midfield,20,2017,4.5,Brazil,Série A,Santos FC,France,Ligue 1,LOSC Lille,summer,14.00,False
151,Richarlison,Centre-Forward,20,2017,2.0,Brazil,Série A,Fluminense Football Club,England,Premier League,Watford FC,summer,12.40,False
155,Douglas Luiz,Central Midfield,19,2017,0.0,Brazil,Série A,Clube de Regatas Vasco da Gama,England,Premier League,Manchester City,summer,12.00,False
185,Luiz Araújo,Right Winger,21,2017,0.5,Brazil,Série A,São Paulo Futebol Clube,France,Ligue 1,LOSC Lille,summer,10.50,False
224,Thiago Mendes,Defensive Midfield,25,2017,4.0,Brazil,Série A,São Paulo Futebol Clube,France,Ligue 1,LOSC Lille,summer,9.00,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26092,Lucas Perri,Goalkeeper,26,2023,8.0,Brazil,Série A,Botafogo de Futebol e Regatas,France,Ligue 1,Olympique Lyon,winter,3.25,False
26132,Johnny Cardoso,Central Midfield,22,2023,6.0,Brazil,Série A,Sport Club Internacional,Spain,LaLiga,Real Betis Balompié,winter,6.00,False
26154,Bruno Méndez,Centre-Back,24,2023,6.0,Brazil,Série A,Sport Club Corinthians Paulista,Spain,LaLiga,Granada CF,winter,0.00,False
26624,Jhoanner Chávez,Left-Back,21,2023,1.8,Brazil,Série A,Esporte Clube Bahia,France,Ligue 1,RC Lens,winter,0.00,True
