In [1]:
# Use the requests library to scrape the page.
import requests
import pandas as pd
import time

# Parse HTML links using BeautifulSoup --> This will basically allow us to parse through the HTML of a webpage and select tables, tags etc.
from bs4 import BeautifulSoup

In [2]:
website_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [3]:
# We will work on getting the match data for each team over the last 5 years. --> 2023 means 2022-23 season.
all_years = list(range(2023, 2018, -1))

In [4]:
# This list will contain several dataframes, each dataframe will consist of the match logs for one team for one season.
all_matches = []

In [6]:
for year in all_years:

    # Makes a Request to the Server and download the HTML of the above URL.
    data = requests.get(website_url)

    # Initialize the Soup object.
    soup = BeautifulSoup(data.text)

    # Selecting the Season Standings Table from fbref --> Index is 0, cuz we want the first stats_table on the page  --> select() uses CSS selectors. This gives us the flexibility to select different classes, ID's, tables etc.
    standings = soup.select('table.stats_table')[0]

    # Selecting all the <a> tags from the table
    links_to_teams = standings.find_all('a')

    # Now we have the <a> tags, but we still need the actual links themselves --> So we'll get whatever is stored in the href in the <a> tags
    links_to_teams = [l.get('href') for l in links_to_teams]

    # This filters out the squad links (which give information on all matches of each squads) from all the other links in the standings table.
    links_to_teams = [l for l in links_to_teams if '/squads/' in l]

    # Now we can convert each of our links into full URLs
    team_urls = [f"https://fbref.com/{l}" for l in links_to_teams]

    # We get the link to the previous season from the page of the current season --> Stored in some <a> tag that has class = "prev" --> This is how you can retrieve the anchor tag pertaining to a class directly instead of getting all the <a> tags and THEN finding the <a> tag with the prev class
    previous_season = soup.select("a.prev")[0].get("href")
    website_url = f"https://fbref.com{previous_season}"

    for team_url in team_urls:

        team_info = requests.get(team_url)

        # Just gets the team name
        team_name = team_url.split('/')[-1].replace('-Stats', '').replace('-', ' ')

        # All matches for a team for a season are in this matches variable
        matches_df = pd.read_html(team_url, match = "Scores & Fixtures")[0]

        # Now we extract the shooting stats for each team for each season for each match
        soup = BeautifulSoup(team_info.text)

        # This has all the different stat types for each team for each match for each season like shooting, passing, possession etc. --> We are interested in shooting only however --> Experiment Later
        match_log_types_links = soup.find_all('a')
        match_log_types_links = [l.get("href") for l in match_log_types_links]

        # All the shooting stats for each team for each match for each season are in the shooting_stats variable
        shooting_stats = [l for l in match_log_types_links if l and "all_comps/shooting/" in l]
        
        # Now we make a URL for the shooting_stats link and use pd.read_html() to extract all the shooting data into a dataframe
        shooting_stats_URL = f"https://fbref.com{shooting_stats[0]}"
        shooting_df = pd.read_html(shooting_stats_URL, match = "Shooting")[0]
        shooting_df.columns = shooting_df.columns.droplevel()

        # Now we merge the matches dataframe and the shooting dataframe using a try except block
        try:
            team_data = matches_df.merge(shooting_df[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on = "Date")
        except ValueError:
            continue

        # Filtering out the matches that are not the PL from the data
        team_data = team_data[team_data["Comp"] == "Premier League"]

        # Adding Season and Team Name to the data. --> This is done cuz the team name and data isn't mentioned anywhere in the table data --> So we explicitly mention this by adding that col to the table
        # This is really important since we will be merging the data of all the matches of all teams of all seasons together in the end

        team_data["Season"] = year
        team_data["Team"] = team_name
        all_matches.append(team_data)

        #This is done to ensure that you aren't scraping the data too quickly --> If you scrape too quickly, it slows down the site and you may be blocked from scraping further data as well.
        time.sleep(10)

HTTPError: HTTP Error 404: Not Found

In [6]:
# This will ensure that all data frames of all teams of all years are put into a single big dataframe
all_matches_dataframe = pd.concat(all_matches)

In [7]:
# This just converts the headings of each column to lower_case --> Done just to make things uniform
all_matches_dataframe.columns = [c.lower() for c in all_matches_dataframe.columns]

In [9]:
# Convert the Pandas Dataframe to a CSV File
all_matches_dataframe.to_csv("all_matches_2.csv")