In [1]:
import csv
import re
import sys, getopt
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup


#Functions to get the data in a dataframe using BeautifulSoup

def get_tables(url, text):
    """Download an fbref stats page and return its player table plus one team table.

    Parameters
    ----------
    url : str
        Full URL of the stats page to fetch.
    text : str
        ``'for'`` to get the squad table, ``'vs'`` to get the opponent
        ("versus") squad table.

    Returns
    -------
    tuple
        ``(player_table, team_table)`` as BeautifulSoup ``<tbody>`` tags.

    Raises
    ------
    ValueError
        If ``text`` is neither ``'for'`` nor ``'vs'``.
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the page contains fewer than three ``<tbody>`` elements.
    """
    # Validate before hitting the network: previously an unknown selector
    # silently returned None and callers failed later on tuple-unpacking.
    if text not in ('for', 'vs'):
        raise ValueError(f"text must be 'for' or 'vs', got {text!r}")

    # A timeout keeps a stalled connection from hanging the notebook forever.
    res = requests.get(url, timeout=30)
    # Surface HTTP errors (e.g. rate limiting) instead of parsing an error page.
    res.raise_for_status()

    ## The next two lines get around the issue with comments breaking the parsing.
    comm = re.compile("<!--|-->")
    soup = BeautifulSoup(comm.sub("", res.text), 'lxml')
    all_tables = soup.find_all("tbody")

    if len(all_tables) < 3:
        raise IndexError(
            f"expected at least 3 <tbody> elements at {url}, found {len(all_tables)}"
        )

    # The first three bodies are taken, in page order, as:
    # squad table, opponent squad table, player table.
    team_table, team_vs_table, player_table = all_tables[:3]

    if text == 'for':
        return player_table, team_table
    return player_table, team_vs_table

def get_frame(features, player_table):
    """Build a player DataFrame from the rows of a parsed table body.

    Parameters
    ----------
    features : iterable of str
        ``data-stat`` attribute values of the ``<td>`` cells to extract;
        each one becomes a column.
    player_table : bs4 Tag (or any object exposing ``find_all('tr')``)
        Table body whose rows are scanned.

    Returns
    -------
    pandas.DataFrame
        One row per data row (rows that have a ``<th scope="row">``), one
        column per feature. Descriptive features stay as strings, every
        other feature is converted to ``float``.

    Raises
    ------
    ValueError
        If a cell of a numeric feature holds text ``float`` cannot parse.
    """
    # Features kept as text; everything else is parsed as a number.
    text_features = {'player', 'nationality', 'position', 'team',
                     'age', 'birth_year', 'matches'}
    pre_df_player = dict()
    for row in player_table.find_all('tr'):
        # Only rows with a row-header cell are data rows.
        if row.find('th', {"scope": "row"}) is None:
            continue
        for f in features:
            cell = row.find("td", {"data-stat": f})
            # A row may lack a cell for this feature; treat it like an empty
            # cell so every column keeps the same length.
            text = cell.text.strip() if cell is not None else ''
            if text == '':
                text = '0'
            if f not in text_features:
                # Drop thousands separators such as "1,234" before parsing.
                text = float(text.replace(',', ''))
            pre_df_player.setdefault(f, []).append(text)
    return pd.DataFrame.from_dict(pre_df_player)

def get_frame_team(features, team_table):
    """Build a squad DataFrame from the rows of a parsed table body.

    Parameters
    ----------
    features : iterable of str
        ``data-stat`` attribute values of the ``<td>`` cells to extract.
        Note: features does not contain the squad name; it lives in a
        ``<th>`` cell and is handled separately.
    team_table : bs4 Tag (or any object exposing ``find_all('tr')``)
        Table body whose rows are scanned.

    Returns
    -------
    pandas.DataFrame
        A ``squad`` column followed by one column per feature, one row per
        data row (rows that have a ``<th scope="row">``).

    Raises
    ------
    ValueError
        If a cell of a numeric feature holds text ``float`` cannot parse.
    """
    # Features kept as text; everything else is parsed as a number.
    text_features = {'player', 'nationality', 'position', 'squad',
                     'age', 'birth_year', 'matches'}
    pre_df_squad = dict()
    for row in team_table.find_all('tr'):
        header = row.find('th', {"scope": "row"})
        # Only rows with a row-header cell are data rows.
        if header is None:
            continue
        squad_cell = row.find('th', {"data-stat": "squad"})
        if squad_cell is None:
            # NOTE(review): falls back to the row-header cell, assuming it
            # carries the squad name when no data-stat="squad" header exists.
            squad_cell = header
        pre_df_squad.setdefault('squad', []).append(squad_cell.text.strip())
        for f in features:
            cell = row.find("td", {"data-stat": f})
            # A row may lack a cell for this feature; treat it like an empty
            # cell so every column keeps the same length.
            text = cell.text.strip() if cell is not None else ''
            if text == '':
                text = '0'
            if f not in text_features:
                # Drop thousands separators such as "1,234" before parsing.
                text = float(text.replace(',', ''))
            pre_df_squad.setdefault(f, []).append(text)
    return pd.DataFrame.from_dict(pre_df_squad)

def get_features_per_frame(table):
    """Return the distinct ``data-stat`` values of the table's ``<td>`` cells.

    Values are listed in order of first appearance; a cell without the
    attribute contributes ``None``.
    """
    # dict keys keep insertion order, which gives an order-preserving dedup.
    stats_seen = dict.fromkeys(
        cell.get('data-stat') for cell in table.find_all('td')
    )
    return list(stats_seen)

def frame_for_category(category,top,end):
    """Scrape the player table of one stats category into a DataFrame.

    The page URL is ``top + category + end``; every ``data-stat`` found in
    the player table becomes a column.
    """
    # The squad table returned alongside is not needed here.
    players, _ = get_tables(top + category + end, 'for')
    return get_frame(get_features_per_frame(players), players)

def frame_for_category_team(category,top,end,text):
    """Scrape the squad table of one stats category into a DataFrame.

    The page URL is ``top + category + end``; ``text`` is forwarded to
    ``get_tables`` to choose between the 'for' and 'vs' squad table.
    """
    # The player table returned alongside is not needed here.
    _, squads = get_tables(top + category + end, text)
    return get_frame_team(get_features_per_frame(squads), squads)
    

def get_outfield_data(top, end):
    """Scrape all outfield-player categories and join them into one DataFrame.

    One page is fetched per category, in the order listed below. The
    per-category frames are concatenated column-wise (aligned on their row
    index) and repeated column names are dropped, keeping the first
    occurrence.
    """
    categories = ('stats', 'shooting', 'passing', 'passing_types',
                  'gca', 'defense', 'possession', 'misc')
    per_category = [frame_for_category(name, top, end) for name in categories]
    # NOTE(review): axis=1 aligns rows by index, so this presumes every
    # category page lists the same players in the same order - confirm.
    combined = pd.concat(per_category, axis=1)
    return combined.loc[:, ~combined.columns.duplicated()]


#Function to get goalkeeping and advanced goalkeeping data
def get_keeper_data(top,end):
    """Scrape the 'keepers' and 'keepersadv' player tables into one DataFrame.

    The two frames are concatenated column-wise (aligned on their row index)
    and repeated column names are dropped, keeping the first occurrence.
    """
    per_category = [
        frame_for_category(name, top, end)
        for name in ('keepers', 'keepersadv')
    ]
    combined = pd.concat(per_category, axis=1)
    return combined.loc[:, ~combined.columns.duplicated()]

#Function to get team-wise data across all the categories scraped above
def get_team_data(top,end,text):
    """Scrape every squad-level category and join them into one DataFrame.

    ``text`` ('for' or 'vs') is forwarded to pick the squad or the opponent
    table on each page. One page is fetched per category, in the order
    listed below; the frames are concatenated column-wise (aligned on their
    row index) and repeated column names are dropped, keeping the first
    occurrence.
    """
    categories = ('stats', 'keepers', 'keepersadv', 'shooting', 'passing',
                  'passing_types', 'gca', 'defense', 'possession', 'misc')
    per_category = [
        frame_for_category_team(name, top, end, text) for name in categories
    ]
    combined = pd.concat(per_category, axis=1)
    return combined.loc[:, ~combined.columns.duplicated()]

In [6]:
# Scrape every outfield-player category for Serie A (the `comps/11` pages)
# and combine them into a single frame.
df_outfield = get_outfield_data('https://fbref.com/en/comps/11/','/Serie-A-Stats')
# Bare last expression: the notebook renders the frame as the cell output.
df_outfield

Unnamed: 0,player,nationality,position,team,age,birth_year,games,games_starts,minutes,minutes_90s,...,fouls,fouled,offsides,pens_won,pens_conceded,own_goals,ball_recoveries,aerials_won,aerials_lost,aerials_won_pct
0,Tammy Abraham,eng ENG,FW,Roma,25,1997,8.0,2.0,242.0,2.7,...,6.0,5.0,2.0,1.0,0.0,0.0,1.0,6.0,5.0,54.5
1,Francesco Acerbi,it ITA,DF,Inter,35,1988,29.0,26.0,2388.0,26.5,...,19.0,24.0,3.0,0.0,0.0,0.0,102.0,70.0,37.0,65.4
2,Yacine Adli,fr FRA,MF,Milan,23,2000,24.0,17.0,1407.0,15.6,...,26.0,9.0,1.0,0.0,1.0,0.0,94.0,15.0,10.0,60.0
3,Michel Aebischer,ch SUI,MF,Bologna,26,1997,36.0,26.0,2230.0,24.8,...,37.0,30.0,0.0,0.0,0.0,0.0,135.0,11.0,16.0,40.7
4,Lucien Agoume,fr FRA,MF,Inter,21,2002,1.0,0.0,5.0,0.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,Nadir Zortea,it ITA,DF,Atalanta,24,1999,5.0,0.0,149.0,1.7,...,4.0,2.0,0.0,0.0,0.0,0.0,4.0,0.0,2.0,0.0
612,Szymon Żurkowski,pl POL,MF,Empoli,25,1997,13.0,10.0,868.0,9.6,...,19.0,17.0,2.0,0.0,0.0,0.0,37.0,7.0,17.0,29.2
613,Milan Đurić,ba BIH,FW,Hellas Verona,33,1990,20.0,13.0,1204.0,13.4,...,26.0,19.0,6.0,0.0,0.0,0.0,25.0,135.0,43.0,75.8
614,Milan Đurić,ba BIH,FW,Monza,33,1990,17.0,13.0,1257.0,14.0,...,23.0,26.0,1.0,0.0,0.0,0.0,29.0,86.0,40.0,68.3


In [7]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
df_outfield.describe()

Unnamed: 0,games,games_starts,minutes,minutes_90s,goals,assists,goals_assists,goals_pens,pens_made,pens_att,...,fouls,fouled,offsides,pens_won,pens_conceded,own_goals,ball_recoveries,aerials_won,aerials_lost,aerials_won_pct
count,616.0,616.0,616.0,616.0,616.0,616.0,616.0,616.0,616.0,616.0,...,616.0,616.0,616.0,616.0,616.0,616.0,616.0,616.0,616.0,616.0
mean,19.334416,13.571429,1219.387987,13.551623,1.571429,1.11039,2.681818,1.405844,0.165584,0.212662,...,14.86526,13.967532,1.939935,0.149351,0.212662,0.038961,55.24513,17.107143,17.107143,43.485877
std,11.893368,11.455311,973.48,10.815326,2.837856,1.662838,4.014939,2.48067,0.75127,0.865923,...,13.861558,15.03683,3.814559,0.460222,0.499009,0.217394,51.578655,21.556616,18.51165,25.857383
min,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8.75,3.0,318.25,3.5,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,0.0,0.0,0.0,0.0,11.0,2.0,2.0,30.0
50%,21.0,11.0,1060.0,11.8,0.0,0.0,1.0,0.0,0.0,0.0,...,12.0,9.0,1.0,0.0,0.0,0.0,41.0,9.0,13.0,46.2
75%,30.0,23.0,2046.0,22.725,2.0,2.0,4.0,2.0,0.0,0.0,...,25.0,21.0,2.0,0.0,0.0,0.0,85.0,24.0,25.0,57.175
max,38.0,38.0,3420.0,38.0,24.0,9.0,27.0,22.0,10.0,10.0,...,77.0,80.0,35.0,5.0,3.0,2.0,276.0,135.0,159.0,100.0


In [8]:
# Persist the Serie A outfield frame. A forward slash is used as separator:
# it works on Windows as well as POSIX, whereas a backslash would become part
# of the file name on POSIX. The scraped_data directory must already exist.
df_outfield.to_csv('scraped_data/seriea.csv', index = False)

In [18]:
# Scrape the goalkeeping and advanced goalkeeping tables for Serie A
# (the `comps/11` pages) and combine them into a single frame.
df_keeper = get_keeper_data('https://fbref.com/en/comps/11/','/Serie-A-Stats')
# Bare last expression: the notebook renders the frame as the cell output.
df_keeper

Unnamed: 0,player,nationality,position,team,age,birth_year,gk_games,gk_games_starts,gk_minutes,minutes_90s,...,gk_passes_length_avg,gk_goal_kicks,gk_pct_goal_kicks_launched,gk_goal_kick_length_avg,gk_crosses,gk_crosses_stopped,gk_crosses_stopped_pct,gk_def_actions_outside_pen_area,gk_def_actions_outside_pen_area_per90,gk_avg_distance_def_actions
0,Simone Aresti,it ITA,GK,Cagliari,37,1986,1.0,0.0,1.0,0.0,...,64.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
1,Emil Audero,it ITA,GK,Inter,26,1997,4.0,4.0,337.0,3.7,...,24.6,18.0,16.7,27.2,54.0,6.0,11.1,2.0,0.53,10.2
2,Nicola Bagnolini,it ITA,GK,Bologna,19,2004,1.0,0.0,5.0,0.1,...,47.0,2.0,50.0,53.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Etrit Berisha,al ALB,GK,Empoli,34,1989,14.0,14.0,1260.0,14.0,...,33.0,90.0,50.0,40.8,236.0,11.0,4.7,10.0,0.71,12.4
4,Elia Caprile,it ITA,GK,Empoli,21,2001,23.0,23.0,2070.0,23.0,...,36.8,159.0,53.5,45.1,391.0,31.0,7.9,20.0,0.87,13.3
5,Marco Carnesecchi,it ITA,GK,Atalanta,23,2000,27.0,27.0,2425.0,26.9,...,33.8,193.0,67.4,51.7,332.0,20.0,6.0,39.0,1.45,14.7
6,Michele Cerofolini,it ITA,GK,Frosinone,24,1999,8.0,7.0,675.0,7.5,...,37.2,38.0,50.0,42.8,126.0,4.0,3.2,7.0,0.93,14.3
7,Oliver Christensen,dk DEN,GK,Fiorentina,24,1999,4.0,4.0,360.0,4.0,...,31.0,20.0,35.0,33.7,32.0,1.0,3.1,9.0,2.25,19.5
8,Andrea Consigli,it ITA,GK,Sassuolo,36,1987,35.0,35.0,3150.0,35.0,...,36.9,252.0,49.6,41.2,545.0,23.0,4.2,22.0,0.63,11.8
9,Nikita Contini Baranovsky,it ITA,GK,Napoli,27,1996,1.0,0.0,17.0,0.2,...,17.0,1.0,0.0,15.0,1.0,0.0,0.0,3.0,15.88,37.0


In [20]:
# Persist the Serie A goalkeeper frame. A forward slash is used as separator:
# it works on Windows as well as POSIX, whereas a backslash would become part
# of the file name on POSIX. The scraped_data directory must already exist.
df_keeper.to_csv('scraped_data/keepers_seriea.csv', index = False)

In [13]:
def _stats_url(comp_id, slug):
    """Return the fbref 'stats' page URL for a competition id and league slug."""
    return f'https://fbref.com/en/comps/{comp_id}/stats/{slug}-Stats'


# 'stats' landing page of each league; the scraping loops split these URLs on
# 'stats' to swap in the other category names.
seriea = _stats_url(11, 'Serie-A')
epl = _stats_url(9, 'Premier-League')
liga = _stats_url(12, 'La-Liga')
bundes = _stats_url(20, 'Bundesliga')
ligue1 = _stats_url(13, 'Ligue-1')
eredivisie = _stats_url(23, 'Eredivisie')
primeira = _stats_url(32, 'Primeira-Liga')
jupiler = _stats_url(37, 'Belgian-Pro-League')
championship = _stats_url(10, 'Championship')
serieb = _stats_url(18, 'Serie-B')

In [14]:
# League groups iterated by the scraping loops. The top-5 loops slice off
# the first entry, so the order of this list matters.
top5 = [
    seriea,
    epl,
    liga,
    bundes,
    ligue1,
]
other = [
    eredivisie,
    primeira,
    jupiler,
    championship,
    serieb,
]

In [24]:
# Scrape outfield data for the remaining top-5 leagues and save one CSV each.
# top5[0] (Serie A) is skipped here: it is scraped and saved by earlier cells.
# `time` comes from the notebook's import cell.
start_time = time.time()
for league in top5[1:]:
    start_it = time.time()
    # Each URL looks like '<top>stats<end>'; the halves are recombined with
    # every category name by get_outfield_data.
    start, end = league.split('stats')
    print(f'getting data from {league}...')
    df_outfield = get_outfield_data(start, end)
    end_scrape = time.time()
    print(f'scraping finished in {end_scrape-start_it}s')
    print(len(df_outfield))
    print(df_outfield.head())
    # Remove the literal '-Stats' suffix. str.strip('-Stats') would instead
    # strip any of the characters '-', 'S', 't', 'a', 's' from both ends and
    # mangle names such as 'La-Liga' -> 'La-Lig'.
    league_name = end.strip('/')
    if league_name.endswith('-Stats'):
        league_name = league_name[:-len('-Stats')]
    nome = 'scraped_data/' + league_name + '.csv'
    df_outfield.to_csv(nome, index = False)
    print(f'{league} finished in {time.time()-start_it}s')
    print('-'*40)

getting data from https://fbref.com/en/comps/9/stats/Premier-League-Stats...
scraping finished in 48.75112175941467s
580
               player nationality position           team age birth_year  \
0          Max Aarons     eng ENG       DF    Bournemouth  23       2000   
1   Joshua Acheampong     eng ENG       DF        Chelsea  17       2006   
2  Bénie Adama Traore      ci CIV    FW,MF  Sheffield Utd  20       2002   
3         Tyler Adams      us USA       MF    Bournemouth  24       1999   
4    Tosin Adarabioyo     eng ENG       DF         Fulham  25       1997   

   games  games_starts  minutes  minutes_90s  ...  fouls  fouled  offsides  \
0   20.0          13.0   1237.0         13.7  ...   12.0    26.0       2.0   
1    1.0           0.0      6.0          0.1  ...    0.0     0.0       0.0   
2    8.0           3.0    387.0          4.3  ...   10.0     4.0       1.0   
3    3.0           1.0    121.0          1.3  ...    0.0     3.0       0.0   
4   20.0          18.0   1617.0 

In [25]:
# Scrape goalkeeper data for the remaining top-5 leagues and save one CSV each.
# top5[0] (Serie A) is skipped here: it is scraped and saved by earlier cells.
start_time = time.time()
for league in top5[1:]:
    start_it = time.time()
    # Each URL looks like '<top>stats<end>'; the halves are recombined with
    # every category name by get_keeper_data.
    start, end = league.split('stats')
    print(f'getting data from {league}...')
    df_keepers = get_keeper_data(start, end)
    end_scrape = time.time()
    print(f'scraping finished in {end_scrape-start_it}s')
    print(len(df_keepers))
    print(df_keepers.head())
    # Remove the literal '-Stats' suffix. str.strip('-Stats') would instead
    # strip any of the characters '-', 'S', 't', 'a', 's' from both ends and
    # mangle names such as 'La-Liga' -> 'La-Lig'.
    league_name = end.strip('/')
    if league_name.endswith('-Stats'):
        league_name = league_name[:-len('-Stats')]
    nome = 'scraped_data/keepers_' + league_name + '.csv'
    df_keepers.to_csv(nome, index = False)
    print(f'{league} finished in {time.time()-start_it}s')
    print('-'*40)

getting data from https://fbref.com/en/comps/9/stats/Premier-League-Stats...
scraping finished in 2.0206351280212402s
40
            player nationality position             team age birth_year  \
0          Alisson      br BRA       GK        Liverpool  30       1992   
1  Alphonse Areola      fr FRA       GK         West Ham  30       1993   
2   Daniel Bentley     eng ENG       GK           Wolves  30       1993   
3  Martin Dúbravka      sk SVK       GK    Newcastle Utd  34       1989   
4          Ederson      br BRA       GK  Manchester City  29       1993   

   gk_games  gk_games_starts  gk_minutes  minutes_90s  ...  \
0      28.0             28.0      2520.0         28.0  ...   
1      31.0             31.0      2699.0         30.0  ...   
2       5.0              3.0       383.0          4.3  ...   
3      23.0             22.0      1985.0         22.1  ...   
4      33.0             33.0      2785.0         30.9  ...   

   gk_passes_length_avg  gk_goal_kicks  gk_pct_goal_kic

In [26]:
# Scrape outfield data for the leagues in `other` and save one CSV each.
start_time = time.time()
for league in other:
    start_it = time.time()
    # Each URL looks like '<top>stats<end>'; the halves are recombined with
    # every category name by get_outfield_data.
    start, end = league.split('stats')
    print(f'getting data from {league}...')
    df_outfield = get_outfield_data(start, end)
    end_scrape = time.time()
    print(f'scraping finished in {end_scrape-start_it}s')
    print(len(df_outfield))
    print(df_outfield.head())
    # Remove the literal '-Stats' suffix. str.strip('-Stats') would instead
    # strip any of the characters '-', 'S', 't', 'a', 's' from both ends and
    # mangle names such as 'Serie-B' -> 'erie-B'.
    league_name = end.strip('/')
    if league_name.endswith('-Stats'):
        league_name = league_name[:-len('-Stats')]
    nome = 'scraped_data/' + league_name + '.csv'
    df_outfield.to_csv(nome, index = False)
    print(f'{league} finished in {time.time()-start_it}s')
    print('-'*40)

getting data from https://fbref.com/en/comps/23/stats/Eredivisie-Stats...
scraping finished in 29.26629400253296s
525
                player nationality position           team age birth_year  \
0  Patrick van Aanholt      nl NED       DF  PSV Eindhoven  32       1990   
1      Paxten Aaronson      us USA       MF        Vitesse  19       2003   
2         Jayden Addai      nl NED       FW     AZ Alkmaar  17       2005   
3       Bobby Adekanye      nl NED       FW   Go Ahead Eag  24       1999   
4        Shawn Adewoye      be BEL       DF   RKC Waalwijk  23       2000   

   games  games_starts  minutes  minutes_90s  ...  fouls  fouled  offsides  \
0   24.0           9.0   1114.0         12.4  ...    6.0     5.0       2.0   
1   14.0          14.0   1253.0         13.9  ...   16.0    14.0       0.0   
2    8.0           2.0    297.0          3.3  ...    5.0     8.0       0.0   
3   28.0          23.0   1804.0         20.0  ...   21.0    40.0       4.0   
4   29.0          26.0   2338

In [27]:
# Scrape goalkeeper data for the leagues in `other` and save one CSV each.
start_time = time.time()
for league in other:
    start_it = time.time()
    # Each URL looks like '<top>stats<end>'; the halves are recombined with
    # every category name by get_keeper_data.
    start, end = league.split('stats')
    print(f'getting data from {league}...')
    df_keepers = get_keeper_data(start, end)
    end_scrape = time.time()
    print(f'scraping finished in {end_scrape-start_it}s')
    print(len(df_keepers))
    print(df_keepers.head())
    # Remove the literal '-Stats' suffix. str.strip('-Stats') would instead
    # strip any of the characters '-', 'S', 't', 'a', 's' from both ends and
    # mangle names such as 'Serie-B' -> 'erie-B'.
    league_name = end.strip('/')
    if league_name.endswith('-Stats'):
        league_name = league_name[:-len('-Stats')]
    nome = 'scraped_data/keepers_' + league_name + '.csv'
    df_keepers.to_csv(nome, index = False)
    print(f'{league} finished in {time.time()-start_it}s')
    print('-'*40)

getting data from https://fbref.com/en/comps/23/stats/Eredivisie-Stats...
scraping finished in 3.3655619621276855s
38
            player nationality position           team age birth_year  \
0     Mio Backhaus      de GER       GK       Volendam  19       2004   
1    Nordin Bakker      nl NED       GK    Almere City  25       1997   
2  Vasilios Barkas      gr GRE       GK        Utrecht  29       1994   
3   Walter Benítez      ar ARG       GK  PSV Eindhoven  30       1993   
4    Justin Bijlow      nl NED       GK      Feyenoord  25       1998   

   gk_games  gk_games_starts  gk_minutes  minutes_90s  ...  \
0      33.0             32.0      2881.0         32.0  ...   
1      29.0             29.0      2596.0         28.8  ...   
2      29.0             29.0      2610.0         29.0  ...   
3      33.0             33.0      2964.0         32.9  ...   
4      17.0             17.0      1462.0         16.2  ...   

   gk_passes_length_avg  gk_goal_kicks  gk_pct_goal_kicks_launched  \
