In [44]:
import requests

In [45]:
standings_url = 'https://fbref.com/en/comps/9/Premier-League-Stats'

In [None]:
data = requests.get(standings_url)  

In [None]:
data.text

In [48]:
from bs4 import BeautifulSoup

In [49]:
soup = BeautifulSoup(data.text)

In [None]:
standings_table = soup.select('table.stats_table')[0]  

In [None]:
standings_table

In [None]:
links = standings_table.find_all('a') 

In [10]:
links = [l.get('href') for l in links]  # get all  links

In [11]:
links = [l for l in links if '/squads/' in l]  # filter link to only have squad links

In [12]:
team_urls = [f'https://fbref.com{l}' for l in links]  # take each link and add the website string to the beginning of that link

In [None]:
team_urls

In [14]:
team_url = team_urls[0]

In [15]:
data = requests.get(team_url)

In [None]:
import pandas as pd
from io import StringIO  # used to read literal strings in the new version

matches = pd.read_html(StringIO(data.text), match='Scores & Fixtures')[0]  

In [None]:
matches.head()  

In [18]:
soup = BeautifulSoup(data.text)

In [19]:
links = soup.find_all('a')

In [20]:
links = [l.get('href') for l in links]

In [21]:
shooting_links = [l for l in links if l and 'all_comps/shooting/' in l]  # filter links 

In [None]:
shooting_links

In [23]:
shooting_data = requests.get(f'https://fbref.com{shooting_links[0]}')

In [24]:
shooting = pd.read_html(StringIO(shooting_data.text), match='Shooting')[0]

In [None]:
shooting.head()

In [26]:
shooting.columns = shooting.columns.droplevel() # drop one index level to remove the first header row

In [None]:
shooting.head()

In [28]:
team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")   # merge the matches and shooting dataframes together

In [None]:
team_data

In [None]:
shooting.shape

In [31]:
possession_links = [l for l in links if l and 'all_comps/possession/' in l]

In [None]:
possession_links

In [33]:
possession_data = requests.get(f'https://fbref.com{possession_links[0]}')

In [34]:
possession = pd.read_html(StringIO(possession_data.text), match='Possession')[0]

In [35]:
possession.columns = possession.columns.droplevel()

In [None]:
possession

In [37]:
team_data = matches.merge(possession[['Date', 'Poss', 'Succ%']], on='Date')

In [None]:
team_data

In [1]:
import requests
import pandas as pd
from io import StringIO
from bs4 import BeautifulSoup

In [2]:
years = list(range(2025, 2017, -1))

In [3]:
years

[2025, 2024, 2023, 2022, 2021, 2020, 2019, 2018]

In [4]:
all_matches = []

In [5]:
standings_url = 'https://fbref.com/en/comps/9/Premier-League-Stats'

In [6]:
import time

for year in years:
    data = requests.get(standings_url)  # get the standings url html
    soup = BeautifulSoup(data.text)  # parse html through BeautifulSoup
    standings_table = soup.select('table.stats_table')[0]  # select the stats table that contains the individual team links
    
    links = [l.get('href') for l in standings_table.find_all('a')]  # find all the team links and grab the href property
    links = [l for l in links if '/squads/' in l]  # filter the links so that we only have the links for the squads
    team_urls = [f'https://fbref.com{l}' for l in links] # turn the links from relative links into absolute links

    # everytime the loop continues it will get the standings url for the previous season adn scrape its data
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"

    # individually scrape the match logs for each team
    for team_url in team_urls:
        team_name = team_url.split('/')[-1].replace('-Stats', '').replace('-', ' ')

        data = requests.get(team_url)
        matches = pd.read_html(StringIO(data.text), match='Scores & Fixtures')[0]

        soup = BeautifulSoup(data.text)
        links = [l.get("href") for l in soup.find_all('a')]

        shooting_links = [l for l in links if l and 'all_comps/shooting/' in l]
        shooting_data = requests.get(f"https://fbref.com{shooting_links[0]}")
        shooting = pd.read_html(StringIO(shooting_data.text), match="Shooting")[0]
        shooting.columns = shooting.columns.droplevel()
        
        # for some teams, the shooting stats are not available, so if we merge we get a value error because the shooting stats is empty
        try:
            team_data = matches.merge(shooting[["Date", "Gls", "Sh", "SoT%", "G/Sh", "G/SoT", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
        except ValueError:
            continue  # ignore the team with the empty shooting dataframe
        
        team_data = team_data[team_data["Comp"] == "Premier League"]  # filter out competitions to only include the premier league
        team_data["Season"] = year  # add a new column to show the season
        team_data["Team"] = team_name  # add a new column to show the team
        all_matches.append(team_data)
        time.sleep(5)  # in order not to scrape too quickly in order to avoid being blocked from scraping



In [7]:
match_df = pd.concat(all_matches)  # takes a list of dataframes and returns a single dataframe

In [8]:
match_df

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,SoT%,G/Sh,G/SoT,SoT,Dist,FK,PK,PKatt,Season,Team
1,2024-08-18,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,Chelsea,...,45.5,0.18,0.40,5.0,19.1,0.0,0,0,2025,Manchester City
2,2024-08-24,15:00,Premier League,Matchweek 2,Sat,Home,W,4,1,Ipswich Town,...,30.8,0.23,0.75,4.0,17.8,1.0,1,1,2025,Manchester City
3,2024-08-31,17:30,Premier League,Matchweek 3,Sat,Away,W,3,1,West Ham,...,34.8,0.13,0.38,8.0,15.0,1.0,0,0,2025,Manchester City
0,2024-08-17,12:30,Premier League,Matchweek 1,Sat,Away,W,2.0,0.0,Ipswich Town,...,27.8,0.11,0.40,5.0,14.8,0.0,0,0,2025,Liverpool
1,2024-08-25,16:30,Premier League,Matchweek 2,Sun,Home,W,2.0,0.0,Brentford,...,42.1,0.11,0.25,8.0,13.6,1.0,0,0,2025,Liverpool
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2018-04-15,16:00,Premier League,Matchweek 34,Sun,Away,W,1,0,Manchester Utd,...,40.0,0.10,0.25,4.0,18.1,0.0,0,0,2018,West Bromwich Albion
39,2018-04-21,12:30,Premier League,Matchweek 35,Sat,Home,D,2,2,Liverpool,...,46.2,0.15,0.33,6.0,17.7,0.0,0,0,2018,West Bromwich Albion
40,2018-04-28,15:00,Premier League,Matchweek 36,Sat,Away,W,1,0,Newcastle Utd,...,22.2,0.11,0.50,2.0,20.1,0.0,0,0,2018,West Bromwich Albion
41,2018-05-05,15:00,Premier League,Matchweek 37,Sat,Home,W,1,0,Tottenham,...,11.1,0.11,1.00,1.0,10.2,0.0,0,0,2018,West Bromwich Albion


In [9]:
match_df.columns = [c.lower() for c in match_df.columns]  # go through all the columns and make them lowercase

In [10]:
match_df.to_csv("matches.csv")