Scraping

In [1]:
import requests
from bs4 import BeautifulSoup as bs
import csv
import re
import pandas as pd

Take data from requests and write it in csv format

In [None]:
def data_to_csv(data):
    keys = data[0].keys()
    with open('data.csv', 'w', newline='')  as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(data)

Function that format text elimninating \n \r \t character

In [2]:
def format_text(text):
    regex = re.compile(r'[\n\r\t]')
    text = regex.sub('', text)

    return " ".join(text.split())

Function that format currency, eliminating euro symbol, milion and dot

In [3]:
def format_currency(value):
    value = value.replace('€', '')
    value = value.replace('Loan fee:', '')
    
    if value[-1] == 'm':
        value = value.replace('m', '')
        return int(float(value)) * 1000000

    if value[-1] == '.':
        value = value.replace('.', '')
        if value[-2:] == 'Th':
            value = value.replace('Th', '')
            return int(value) * 1000
    
    return value

Function that given header and url access the webpage. It makes the request and then use beautiful soup to parse it. Then acess the right element (from responsive table to tbody), and store the information into a dataframe. The input is the namber of pages to be parsed

In [4]:
def get_data(pages):
    players_list = []
    for page in range(1, pages+1):
        headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}

        url  = f'https://www.transfermarkt.com/transfers/saisontransfers/statistik?ajax=yw0&altersklasse=&ausrichtung=&land_id=0&leihe=&page={page}&plus=1&saison-id=2020&spielerposition_id=&transferfenster='

        print(url)

        html = requests.get(url, headers=headers)
        soup = bs(html.content)

        soup = soup.select('.responsive-table > .grid-view > .items > tbody')[0]

        try:
            for cells in soup.find_all(True, {"class": re.compile("^(even|odd)$")}):
                fee = cells.find_all('td')[16].text
                position = cells.find_all('td')[4].text
                age = cells.find_all('td')[5].text
                market_value = cells.find_all('td')[6].text
                country_from = cells.find_all('td')[11].img['title']
                league_from = cells.find_all('td')[11].a.text if cells.find_all('td')[11].a != None else 'Without League'
                club_from = cells.find_all('td')[9].img['alt']
                country_to = cells.find_all('td')[15].img['alt']
                league_to = cells.find_all('td')[15].a.text if cells.find_all('td')[15].a != None else 'Without League'
                club_to = cells.find_all('td')[13].img['alt']

                player = {
                    'name': cells.find_all('td')[1].select('td > img')[0]['title'],
                    'position': position,
                    'age': age,
                    'market_value': format_currency(market_value),
                    'country_from': country_from,
                    'league_from': format_text(league_from),
                    'club_from': club_from,
                    'country_to': country_to,
                    'league_to': format_text(league_to),
                    'club_to': club_to,
                    'fee': format_currency(fee),
                }

                players_list.append(player)
        except IndexError:
            pass

    return players_list

In [5]:
data = get_data(100)
df = pd.DataFrame(data)

df.to_csv('transfer_window.csv')

https://www.transfermarkt.com/transfers/saisontransfers/statistik?ajax=yw0&altersklasse=&ausrichtung=&land_id=0&leihe=&page=1&plus=1&saison-id=2020&spielerposition_id=&transferfenster=
https://www.transfermarkt.com/transfers/saisontransfers/statistik?ajax=yw0&altersklasse=&ausrichtung=&land_id=0&leihe=&page=2&plus=1&saison-id=2020&spielerposition_id=&transferfenster=
https://www.transfermarkt.com/transfers/saisontransfers/statistik?ajax=yw0&altersklasse=&ausrichtung=&land_id=0&leihe=&page=3&plus=1&saison-id=2020&spielerposition_id=&transferfenster=
https://www.transfermarkt.com/transfers/saisontransfers/statistik?ajax=yw0&altersklasse=&ausrichtung=&land_id=0&leihe=&page=4&plus=1&saison-id=2020&spielerposition_id=&transferfenster=
https://www.transfermarkt.com/transfers/saisontransfers/statistik?ajax=yw0&altersklasse=&ausrichtung=&land_id=0&leihe=&page=5&plus=1&saison-id=2020&spielerposition_id=&transferfenster=
https://www.transfermarkt.com/transfers/saisontransfers/statistik?ajax=yw0&

https://www.transfermarkt.com/transfers/saisontransfers/statistik?ajax=yw0&altersklasse=&ausrichtung=&land_id=0&leihe=&page=46&plus=1&saison-id=2020&spielerposition_id=&transferfenster=
https://www.transfermarkt.com/transfers/saisontransfers/statistik?ajax=yw0&altersklasse=&ausrichtung=&land_id=0&leihe=&page=47&plus=1&saison-id=2020&spielerposition_id=&transferfenster=
https://www.transfermarkt.com/transfers/saisontransfers/statistik?ajax=yw0&altersklasse=&ausrichtung=&land_id=0&leihe=&page=48&plus=1&saison-id=2020&spielerposition_id=&transferfenster=
https://www.transfermarkt.com/transfers/saisontransfers/statistik?ajax=yw0&altersklasse=&ausrichtung=&land_id=0&leihe=&page=49&plus=1&saison-id=2020&spielerposition_id=&transferfenster=
https://www.transfermarkt.com/transfers/saisontransfers/statistik?ajax=yw0&altersklasse=&ausrichtung=&land_id=0&leihe=&page=50&plus=1&saison-id=2020&spielerposition_id=&transferfenster=
https://www.transfermarkt.com/transfers/saisontransfers/statistik?ajax

https://www.transfermarkt.com/transfers/saisontransfers/statistik?ajax=yw0&altersklasse=&ausrichtung=&land_id=0&leihe=&page=91&plus=1&saison-id=2020&spielerposition_id=&transferfenster=
https://www.transfermarkt.com/transfers/saisontransfers/statistik?ajax=yw0&altersklasse=&ausrichtung=&land_id=0&leihe=&page=92&plus=1&saison-id=2020&spielerposition_id=&transferfenster=
https://www.transfermarkt.com/transfers/saisontransfers/statistik?ajax=yw0&altersklasse=&ausrichtung=&land_id=0&leihe=&page=93&plus=1&saison-id=2020&spielerposition_id=&transferfenster=
https://www.transfermarkt.com/transfers/saisontransfers/statistik?ajax=yw0&altersklasse=&ausrichtung=&land_id=0&leihe=&page=94&plus=1&saison-id=2020&spielerposition_id=&transferfenster=
https://www.transfermarkt.com/transfers/saisontransfers/statistik?ajax=yw0&altersklasse=&ausrichtung=&land_id=0&leihe=&page=95&plus=1&saison-id=2020&spielerposition_id=&transferfenster=
https://www.transfermarkt.com/transfers/saisontransfers/statistik?ajax

In [6]:
df

Unnamed: 0,name,position,age,market_value,country_from,league_from,club_from,country_to,league_to,club_to,fee
0,Erling Haaland,Centre-Forward,21,150000000,Germany,Bundesliga,Borussia Dortmund,England,Premier League,Manchester City,60000000
1,Enzo Fernández,Central Midfield,22,55000000,Portugal,Liga Portugal,SL Benfica,England,Premier League,Chelsea FC,121000000
2,Antony,Right Winger,22,35000000,Netherlands,Eredivisie,Ajax Amsterdam,England,Premier League,Manchester United,95000000
3,Wesley Fofana,Centre-Back,21,40000000,England,Premier League,Leicester City,England,Premier League,Chelsea FC,80000000
4,Aurélien Tchouaméni,Defensive Midfield,22,60000000,Monaco,Ligue 1,AS Monaco,Spain,LaLiga,Real Madrid,80000000
...,...,...,...,...,...,...,...,...,...,...,...
2495,Pep Chavarría,Left-Back,24,800k,Spain,LaLiga2,Real Zaragoza,Spain,LaLiga,Rayo Vallecano,1000000
2496,Nonato,Attacking Midfield,24,1000000,Brazil,Série A,Sport Club Internacional,Bulgaria,efbet Liga,Ludogorets Razgrad,1000000
2497,Frantzdy Pierrot,Centre-Forward,27,1000000,France,Ligue 2,EA Guingamp,Israel,Ligat ha'Al,Maccabi Haifa,1000000
2498,Caio Vidal,Right Winger,22,1000000,Brazil,Campeonato Gaúcho,Sport Club Internacional,Bulgaria,efbet Liga,Ludogorets Razgrad,1000000
