This code scrapes the Premier League website to get the lineups, formations and scores of each Premier League match in the 2020/21 season.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
# Inclusive range of match ids to scrape; each id is appended to the match URL.
start, end = 58896, 59275

Since we scrape the results one match at a time, I thought the best way to store them was to keep each attribute in a separate list, then combine the lists into a dataframe at the end.

In [3]:
# One accumulator list per attribute; entry k of every list describes the
# k-th scraped match. Each name gets its own list object (no aliasing).
match_id, home_team, away_team = [], [], []
home_score, away_score = [], []
home_formation, away_formation = [], []
home_starters, home_subs = [], []
away_starters, away_subs = [], []

Since it takes more work to get the starting and substitute lineups, we write a separate function to do that. In particular, we pad the substitute lists so that they have the same length for every match: some matches list 7 substitutes and others 9, so shorter lists are filled up to 9 entries with the placeholder "Buffer".

In [4]:
def get_lineup(soup, home, start):
    """Return the cleaned player names from one squad list on a match page.

    Parameters
    ----------
    soup : object
        Parsed match page; only ``find_all('ul', class_=...)`` is called on it.
    home : bool
        Truthy selects the lists whose class string ends in ``home``; falsy
        selects the lists with the shorter class string.
    start : bool
        Truthy takes the first matching ``<ul>`` and expects 11 names; falsy
        takes the second one and pads it to 9 names.

    Returns
    -------
    list of str
        One entry per player, keeping only space-separated tokens that begin
        with a letter, padded at the end with ``"Buffer"`` up to the expected
        length.

    Raises
    ------
    AssertionError
        If the page lists more names than the expected length.
    """
    class_name = ('startingLineUpContainer squadList home' if home
                  else 'startingLineUpContainer squadList')
    position, slots = (0, 11) if start else (1, 9)

    squad_list = soup.find_all('ul', class_=class_name)[position]

    roster = []
    for entry in squad_list.find_all('div', class_='name'):
        tokens = entry.text.replace('\n', '').split(' ')
        # Drop empty tokens and anything that does not start with a letter.
        roster.append(' '.join(tok for tok in tokens
                               if tok != '' and tok[0].isalpha()))

    # Fill short lists with placeholders so every match has equal-length rows.
    roster.extend(["Buffer"] * (slots - len(roster)))
    assert len(roster) == slots
    return roster

In [5]:
# Scrape every match page in the inclusive id range [start, end].
# Each iteration first parses everything into locals and only then appends to
# the accumulator lists, so a failure part-way through a page cannot leave the
# parallel lists with different lengths.
for i in range(start, end + 1):
    # Lightweight progress indicator: one line every ten match ids.
    if i % 10 == 0: print(i)

    url = 'https://www.premierleague.com/match/' + str(i)
    # requests has no default timeout; without one a single stalled connection
    # would hang the whole scrape. raise_for_status() stops on an HTTP error
    # response instead of trying to parse an error page as a match page.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    # Team names: first 'teamName' link is the home side, second the away side.
    names = soup.find_all('a', class_='teamName')
    home_name = names[0].find('span', class_='long').text
    away_name = names[1].find('span', class_='long').text

    # Full-time score is rendered as "<home>-<away>".
    score = soup.find_all('div', class_='scoreboxContainer')
    full_time_score = score[0].find_all('div', class_='score fullTime')
    scores = full_time_score[0].text.split('-')
    home_goals = int(scores[0])
    away_goals = int(scores[1])

    formations = soup.find_all('strong', class_='matchTeamFormation')
    home_shape = formations[0].text
    away_shape = formations[1].text

    home_xi = get_lineup(soup, True, True)
    home_bench = get_lineup(soup, True, False)
    away_xi = get_lineup(soup, False, True)
    away_bench = get_lineup(soup, False, False)

    # Everything parsed successfully: record the match in all lists together.
    match_id.append(i)
    home_team.append(home_name)
    away_team.append(away_name)
    home_score.append(home_goals)
    away_score.append(away_goals)
    home_formation.append(home_shape)
    away_formation.append(away_shape)
    home_starters.append(home_xi)
    home_subs.append(home_bench)
    away_starters.append(away_xi)
    away_subs.append(away_bench)

58900
58910
58920
58930
58940
58950
58960
58970
58980
58990
59000
59010
59020
59030
59040
59050
59060
59070
59080
59090
59100
59110
59120
59130
59140
59150
59160
59170
59180
59190
59200
59210
59220
59230
59240
59250
59260
59270


In [12]:
# Turn each list of per-match name lists into a 2-D array (one row per match),
# then place the four groups side by side so each row holds a full match lineup.
home_starters, home_subs, away_starters, away_subs = (
    np.array(group)
    for group in (home_starters, home_subs, away_starters, away_subs)
)
lineup = np.concatenate(
    (home_starters, home_subs, away_starters, away_subs),
    axis=1,
)

Finally we create a dataframe which stores all the columns together.

In [13]:
# Player columns keep their positional integer labels; the per-match metadata
# columns are added after them, in the order given here.
matches_df = pd.DataFrame(lineup).assign(
    match_id=match_id,
    home_team=home_team,
    away_team=away_team,
    home_score=home_score,
    away_score=away_score,
    home_formation=home_formation,
    away_formation=away_formation,
)
print(matches_df.head())

                0             1                       2                3  \
0       Nick Pope       Ben Mee          Matthew Lowton  James Tarkowski   
1  Vicente Guaita    Scott Dann        Cheikhou Kouyaté        Joel Ward   
2     Marek Rodák     Joe Bryan              Denis Odoi   Michael Hector   
3         Alisson  Joseph Gomez  Trent Alexander-Arnold  Virgil van Dijk   
4         Ederson  João Cancelo             John Stones       Rúben Dias   

                  4                 5                   6  \
0      Erik Pieters      Robbie Brady  Jóhann Gudmundsson   
1   Tyrick Mitchell   Andros Townsend      James McCarthy   
2          Tim Ream       Josh Onomah      Ivan Cavaleiro   
3  Andrew Robertson  Jordan Henderson          Naby Keïta   
4       Kyle Walker   Kevin De Bruyne               Rodri   

                     7                8              9  ...  \
0       Josh Brownhill  Ashley Westwood  Ashley Barnes  ...   
1       James McArthur  Jeffrey Schlupp  Wilfried 

Again, we save the dataframe to a csv file so that we don't have to do the scraping again next time.

In [14]:
# Write the combined table to matches.csv in the working directory;
# index = False leaves the dataframe's row index out of the file.
matches_df.to_csv('matches.csv', index = False)