all data was scraped from:
https://www.worldfootball.net

## Importing libraries 

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import itertools
import lxml
import requests
import matplotlib.pyplot as plt
import time
import sys
%matplotlib inline  

### Setting the number of players to embed in each match

In [2]:
# Number of starting players recorded per team for every scraped match.
# Used as the per-team cap in starting_lineup and to size the home1..homeN /
# away1..awayN columns of the fixtures table built in scrape_season.
number_players = 11

## scraping a single match

In the scrape_match method we request the data of a single match from worldfootball.net, which includes the match score and, for both the home team and the away team, the number of starting players we asked for (collected with the starting_lineup method).

Since some source pages are laid out differently, we check which table holds the lineup to make sure we are indeed scraping the relevant data.

In [3]:
def starting_lineup(table_players, players_list, max_players=None):
    """Append player names from one team's lineup table to ``players_list``.

    The children of ``table_players`` are visited in order. Every child that
    has more than four children of its own is treated as a player row, and the
    ``title`` attribute found at ``row.contents[3].contents[1]`` is appended.

    Args:
        table_players: Parsed lineup table for a single team; only its
            ``contents`` attribute is used.
        players_list: List that is extended in place with the player names.
        max_players: Maximum number of names to take from this table. When
            omitted, the module-level ``number_players`` is used.

    Returns:
        None. ``players_list`` is modified in place.
    """
    if max_players is None:
        max_players = number_players
    taken = 0
    for row in table_players.contents:
        if taken >= max_players:
            # Enough names collected; no need to inspect the remaining rows.
            break
        # Children with four or fewer entries are skipped
        # (presumably whitespace or header nodes -- not player rows).
        if len(row) > 4:
            players_list.append(row.contents[3].contents[1]['title'])
            taken += 1
    
def scrape_match(season,team_home,team_away, debug=False):
    """Scrape the starting lineups and the score of a single match report.

    Args:
        season: Season part of the report URL, e.g. ``"2017-2018"``.
        team_home: Home team slug as used in the report URL.
        team_away: Away team slug as used in the report URL.
        debug: When true, print the report URL before requesting it.

    Returns:
        A ``(players_list, score_arr)`` tuple. ``players_list`` holds the
        names collected by ``starting_lineup`` for the home table followed by
        the away table. ``score_arr`` holds every integer found in the score
        cell, in order of appearance.

    Raises:
        IndexError: If the page does not contain the expected
            ``standard_tabelle`` tables or cells.
        requests.RequestException: If the HTTP request fails or times out.
    """
    url = ('https://www.worldfootball.net/report/premier-league-'
           + season + '-' + team_home + '-' + team_away)
    if debug:
        print(url)
    # This one fixture needs a "_2" suffix on its report URL
    # (presumably the plain URL points at a different report -- TODO confirm).
    if url == "https://www.worldfootball.net/report/premier-league-2017-2018-southampton-fc-west-ham-united":
        url = url + "_2"
    # A timeout keeps a stalled connection from hanging a multi-hour scrape.
    source = requests.get(url, timeout=30).text
    soup = BeautifulSoup(source, 'lxml')

    # Look the tables up once instead of re-scanning the document per access.
    tables = soup.find_all(class_='standard_tabelle')
    score = tables[0]
    # Some pages carry an extra short table before the lineups; when table 2
    # is short, the lineups are taken from tables 3 and 4 instead.
    if len(tables[2].contents) > 10:
        table_players_home, table_players_away = tables[2], tables[3]
    else:
        table_players_home, table_players_away = tables[3], tables[4]

    players_list = []
    starting_lineup(table_players_home, players_list)
    starting_lineup(table_players_away, players_list)

    text = score.contents[3].contents[3].contents[1].text
    # Group consecutive digits so a multi-digit score such as "10:0" becomes
    # [10, 0] rather than one entry per digit character.
    score_arr = [
        int("".join(digits))
        for is_digit, digits in itertools.groupby(text, str.isdigit)
        if is_digit
    ]
    return players_list, score_arr

## Scraping a season

In the scrape_season method we scrape an entire season.
After requesting a page listing all the league teams in the relevant season, we create an array of all the home/away combinations of teams in that season.

Then we create a fixtures DataFrame, which holds the starting lineup and score of every match, and a players_df DataFrame, which holds every player's performance (one row per starting appearance).

We call every possible match with our scrape_match method

In [4]:
def scrape_season(season, debug=False):
    """Scrape every home/away pairing of one Premier League season.

    The team slugs are read from the season's players page, every ordered
    pair of distinct teams is scraped with ``scrape_match``, and the results
    are collected into two DataFrames.

    Args:
        season: Year in which the season ends (``2018`` -> ``"2017-2018"``).
        debug: Forwarded to ``scrape_match``.

    Returns:
        A ``(fixtures, players_df)`` tuple.
        ``fixtures`` has one row per match with the columns ``home_team``,
        ``away_team``, ``home1..homeN``, ``away1..awayN``, ``home_score``,
        ``away_score`` and ``season`` (N = ``number_players``).
        ``players_df`` has one row per starting appearance with the columns
        ``name``, ``team``, ``season`` and ``count`` (always 1).

    Raises:
        ValueError: If a match does not yield exactly ``number_players``
            starters per team and two score values.
    """
    season_source = str(season - 1) + '-' + str(season)
    source_team = requests.get(
        'https://www.worldfootball.net/players/eng-premier-league-' + season_source + '/',
        timeout=30,
    ).text
    soup_team = BeautifulSoup(source_team, 'lxml')
    table_teams = soup_team.find_all(class_='standard_tabelle')[0]
    # The team slug is the path segment that follows "teams/" in each row's link.
    teams_list = [
        row.contents[1].contents[1]['href'].split('teams/')[1].split('/')[0]
        for row in table_teams.contents
        if len(row) > 1
    ]

    fixtures_columns = ['home_team', 'away_team']
    for which_team in ['home', 'away']:
        for i in range(1, number_players + 1):
            fixtures_columns.append(which_team + str(i))
    fixtures_columns += ['home_score', 'away_score']

    # Every ordered (home, away) pair of distinct teams.
    every_fixture = [
        [home, away]
        for home in teams_list
        for away in teams_list
        if home != away
    ]

    # Rows are accumulated in plain lists and turned into DataFrames once at
    # the end: DataFrame.append was removed in pandas 2.0, and growing a
    # frame inside the loop copies all previous rows on every iteration.
    fixture_rows = []
    player_names = []
    player_teams = []
    for home_team, away_team in every_fixture:
        players_list, score_arr = scrape_match(season_source, home_team, away_team, debug)
        row = [home_team, away_team] + players_list + score_arr
        if len(players_list) != 2 * number_players or len(row) != len(fixtures_columns):
            raise ValueError(
                'unexpected match data for ' + home_team + ' vs ' + away_team
                + ' (' + season_source + '): got ' + str(len(players_list))
                + ' players and ' + str(len(score_arr)) + ' score values'
            )
        fixture_rows.append(row)
        player_names.extend(players_list)
        player_teams.extend([home_team] * number_players + [away_team] * number_players)

    fixtures = pd.DataFrame(fixture_rows, columns=fixtures_columns)
    players_df = pd.DataFrame(
        {'name': player_names, 'team': player_teams},
        columns=['name', 'team'],
    )

    fixtures['season'] = str(season)
    players_df['season'] = str(season)
    players_df['count'] = 1

    return fixtures, players_df

## Getting our data

Finally we call each season and collect data using the scrape_season method.

Each season's fixtures and players_df are saved as CSV files.

We then concatenate the data from all seasons.

In [9]:
# Scrape each season in turn, write its two tables to CSV right away, and
# report how long the season took.
seasons = [2015,2016,2017,2018,2019]
seasons_dict = {}
for season in seasons:
    season_key = str(season)
    start = time.time()
    season_data = {}
    seasons_dict[season_key] = season_data
    season_data['fixtures'],season_data['players_df'] = scrape_season(season)
    season_data['fixtures'].to_csv(
        'fixtures_' + season_key + '.csv', index=False, encoding='utf-8')
    season_data['players_df'].to_csv(
        'players_df_' + season_key + '.csv', index=False, encoding='utf-8')
    end = time.time()
    elapsed = end - start
    print("the time for " + season_key + " season is " + str(elapsed))

# Stack the per-season tables, keeping the order of `seasons`.
all_fixtures = pd.concat([seasons_dict[str(year)]['fixtures'] for year in seasons])
all_players = pd.concat([seasons_dict[str(year)]['players_df'] for year in seasons])

the time for 2015 season is 493.22929191589355
the time for 2016 season is 517.2812554836273
the time for 2017 season is 506.21851801872253
the time for 2018 season is 494.5997040271759
the time for 2019 season is 503.86867809295654


In order to create an embedding of all the players, we give each player an ID.

We merge by name in order to give each player a unique ID, and merge it with the season and team criteria so that the fixtures can later be joined with the player IDs.

In [10]:
# Number of recorded appearances per (player, team, season), followed by the
# mean, median and population standard deviation of those counts.
appearance_keys = ['name','team','season']
all_players_groupby = all_players.groupby(appearance_keys).count().reset_index()
appearance_counts = all_players_groupby['count']
np.mean(appearance_counts),np.median(appearance_counts),np.std(appearance_counts)

(17.294166321886635, 17.0, 11.404835471538657)

In [11]:
# One row per distinct player name (sorted by name) with the player's total
# number of appearances. Only the numeric `count` column is summed: summing
# the whole frame would also "sum" the string columns `team` and `season`,
# which concatenates them under pandas >= 2.0.
all_season_names = all_players.groupby('name')['count'].sum().reset_index()
# IDs are 1-based and follow the sorted name order; players who share a name
# receive the same ID because the merge below is keyed on the name alone.
all_season_names['id'] = range(1, len(all_season_names) + 1)
all_players_groupby = all_players_groupby.merge(
    all_season_names[['name', 'id']], on='name', how='left')

In [12]:
# Write the combined player table and the combined fixtures table to disk.
for frame, csv_path in (
    (all_players_groupby, 'all_players_groupby_2015_2019.csv'),
    (all_fixtures, 'all_fixtures_2015_2019.csv'),
):
    frame.to_csv(csv_path, index=False, encoding='utf-8')