This code scrapes the SoFIFA website for the ratings of players from the Premier League clubs of 2020/21.

In [38]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from unidecode import unidecode

Unfortunately, the team names and team ids (which are used to build the URL of each team's page) have to be hard-coded.

In [39]:
# Each entry pairs a club's URL slug with its SoFIFA team id. Keeping the two
# side by side makes it harder for them to drift out of alignment; the two
# parallel lists used by the rest of the notebook are derived from the pairs.
_TEAMS = [
    ('arsenal', 1),
    ('aston-villa', 2),
    ('brighton-hove-albion', 1808),
    ('burnley', 1796),
    ('chelsea', 5),
    ('crystal-palace', 1799),
    ('everton', 7),
    ('fulham', 144),
    ('leeds-united', 8),
    ('liverpool', 9),
    ('manchester-city', 10),
    ('manchester-united', 11),
    ('leicester-city', 95),
    ('newcastle-united', 13),
    ('west-ham-united', 19),
    ('tottenham-hotspur', 18),
    ('wolverhampton-wanderers', 110),
    ('southampton', 17),
    ('west-bromwich-albion', 109),
    ('sheffield-united', 1794),
]
team_list = [slug for slug, _ in _TEAMS]
team_ids = [team_id for _, team_id in _TEAMS]

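The function below extracts each player's name, position, club and rating from two parsed snapshots of a team page (September and January); players who appear only in the January snapshot are appended to those found in September.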

In [42]:
# The HTML marks starting players, subs and reserves with different row
# classes; they are scraped in this order.
_SQUAD_ROW_CLASSES = ('starting', 'sub', 'res')


def _append_players(soup, club, rows, seen_names, only_new):
    """Append one (name, club, position, rating) tuple per squad row of ``soup``.

    ``rows`` and ``seen_names`` are updated in place. When ``only_new`` is
    true, a row whose name is already in ``seen_names`` is skipped before its
    position and rating are looked up.
    """
    for row_class in _SQUAD_ROW_CLASSES:
        for tr in soup.find_all('tr', class_=row_class):
            # Note we convert the name to English characters since the Premier
            # League names are mostly in English characters.
            name = unidecode(tr.find('div', class_='ellipsis').text)
            if only_new and name in seen_names:
                continue
            seen_names.add(name)
            position = tr.find('span', class_='pos').text
            rating = tr.find('span', class_='bp3-tag').text
            rows.append((name, club, position, rating))


def get_df(soup, soup2):
    """Build a table of players for one club from two parsed team pages.

    Parameters
    ----------
    soup : parsed team page
        Every squad row on this page is kept. The club name is also read from
        this page's <title>: the words preceding the first standalone 'FIFA'.
    soup2 : parsed team page
        A later snapshot of the same team (the January page in this notebook).
        Only rows whose name has not been seen yet are added.

    Returns
    -------
    pandas.DataFrame
        Columns 'Name', 'Club', 'Position', 'Rating', one row per player.
        All values are the text found in the page, so 'Rating' holds strings.

    Raises
    ------
    AttributeError
        If ``soup`` has no <title>, or a squad row lacks one of the expected
        name / position / rating elements.

    Notes
    -----
    Players are matched across the two pages by transliterated name only, so
    two different players who share a name would be treated as one when
    reading ``soup2``.
    """
    title_words = soup.find('title').text.split(' ')
    if 'FIFA' in title_words:
        title_words = title_words[:title_words.index('FIFA')]
    club = ' '.join(title_words)

    rows = []
    seen_names = set()
    _append_players(soup, club, rows, seen_names, only_new=False)
    # Now look at new players from January.
    _append_players(soup2, club, rows, seen_names, only_new=True)

    return pd.DataFrame(rows, columns=['Name', 'Club', 'Position', 'Rating'])

In [43]:
# URL of a team page for a given roster snapshot ("r" query parameter).
TEAM_URL = "https://sofifa.com/team/{team_id}/{slug}/?r={roster}&set=true"
SEPTEMBER_ROSTER = "210002"  # first snapshot (September)
JANUARY_ROSTER = "210026"    # second snapshot (January)
REQUEST_TIMEOUT = 30         # seconds; without a timeout requests can wait forever


def _fetch_team_soup(team_id, slug, roster):
    """Download one team page and return it parsed with html.parser.

    Raises requests.HTTPError on a 4xx/5xx response so that an error page is
    never parsed as if it were a squad page.
    """
    url = TEAM_URL.format(team_id=team_id, slug=slug, roster=roster)
    page = requests.get(url, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    return BeautifulSoup(page.content, 'html.parser')


# team_list and team_ids are parallel lists; fail loudly if they have drifted apart.
if len(team_list) != len(team_ids):
    raise ValueError("team_list and team_ids must have the same length")

dfs_arr = []
for team_id, slug in zip(team_ids, team_list):
    soup1 = _fetch_team_soup(team_id, slug, SEPTEMBER_ROSTER)
    soup2 = _fetch_team_soup(team_id, slug, JANUARY_ROSTER)
    dfs_arr.append(get_df(soup1, soup2))

In [44]:
# ignore_index=True gives the combined table a single continuous index.
# Without it each club's rows keep their own labels starting at 0, so the
# index contains duplicates and label-based lookups match several rows.
all_players_df = pd.concat(dfs_arr, ignore_index=True)

We save the dataframe in a csv file so that we don't have to do the scraping again next time.

In [46]:
# Write the combined table to disk; the row index is not written.
ratings_csv = 'ratings.csv'
all_players_df.to_csv(ratings_csv, index=False)

In [45]:
# Sanity check: report the (rows, columns) of the combined table.
row_count, column_count = all_players_df.shape
print((row_count, column_count))

(924, 4)
