In [None]:
# Environment setup: refresh the apt package index, install the Chromium
# driver package that Selenium drives, then install Selenium itself via pip.
!apt-get update
!apt-get install -y chromium-chromedriver
!pip install selenium

**Web Scraping for Goals For and Goals Against:**

In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
from IPython.display import display

def scrape_GF_GA(start_season_year, end_season_year):
    """Scrape goals for / goals against per squad for a range of seasons.

    For every season from ``start_season_year`` through ``end_season_year``
    (both inclusive; each value is the year in which the season starts), the
    season's fbref.com Premier League stats page is opened in headless Chrome
    and the table with id ``results{season}91_overall`` is read.

    Args:
        start_season_year: Starting year of the first season to scrape
            (e.g. 2013 for the 2013-2014 season).
        end_season_year: Starting year of the last season to scrape.

    Returns:
        A DataFrame with columns "Season", "Squad", "Goals For" and
        "Goals Against", holding the cell text exactly as scraped (strings).
        An empty DataFrame is returned when the range contains no seasons.

    Raises:
        Whatever Selenium raises when the page or table cannot be loaded
        (e.g. ``NoSuchElementException`` from ``find_element``); the browser
        is always closed before the exception propagates.
    """
    # One frame per season. They are concatenated once after the loop so the
    # accumulated result is not re-copied (and no empty frame is concatenated)
    # on every iteration.
    season_frames = []

    # Set up Selenium WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)

    try:
        # Loop through each season in the range
        for year in range(start_season_year, end_season_year + 1):

            # Generate the season string
            season = f"{year}-{year + 1}"

            # Construct the URL and table ID
            url = f"https://fbref.com/en/comps/9/{season}/{season}-Premier-League-Stats#all_rank_key"
            table_id = f"results{season}91_overall"

            print(f"Scraping data for season: {season}...")  # Progress indicator

            # Open the webpage
            driver.get(url)

            # Locate the table
            table = driver.find_element(By.ID, table_id)
            rows = table.find_elements(By.TAG_NAME, "tr")

            # Only <td> cells are read, so a row made purely of <th> cells
            # (presumably the header row) contributes an empty list.
            data = [
                [col.text for col in row.find_elements(By.TAG_NAME, "td")]
                for row in rows
            ]

            # Drop the first row and keep the three <td> positions that are
            # renamed to Squad / Goals For / Goals Against below.
            df = pd.DataFrame(data).iloc[1:, [0, 5, 6]].reset_index(drop=True)
            df.columns = ["Squad", "Goals For", "Goals Against"]

            # Add a new column for the season
            df.insert(0, "Season", season)

            season_frames.append(df)

    finally:
        # Close the WebDriver
        driver.quit()

    # Combine all seasons; pd.concat rejects an empty list, so fall back to an
    # empty frame when no season was requested.
    if season_frames:
        all_seasons_df = pd.concat(season_frames, ignore_index=True)
    else:
        all_seasons_df = pd.DataFrame()

    # Display the combined DataFrame
    print("Completed!")
    display(all_seasons_df)

    # Return the combined DataFrame
    return all_seasons_df

# Season window: first calendar year of the opening and closing seasons (inclusive)
start_season_year = 2013
end_season_year = 2023
all_data = scrape_GF_GA(start_season_year, end_season_year)

# Build the output file name once, write the CSV, then trigger the Colab download
csv_name = f"Premier_League_Data_GF_GA_{start_season_year}-{start_season_year+1}_to_{end_season_year}-{end_season_year+1}.csv"
all_data.to_csv(csv_name, index=False)
from google.colab import files
files.download(csv_name)

Scraping data for season: 2013-2014...
Scraping data for season: 2014-2015...
Scraping data for season: 2015-2016...
Scraping data for season: 2016-2017...
Scraping data for season: 2017-2018...
Scraping data for season: 2018-2019...
Scraping data for season: 2019-2020...
Scraping data for season: 2020-2021...
Scraping data for season: 2021-2022...
Scraping data for season: 2022-2023...
Scraping data for season: 2023-2024...
Completed!


Unnamed: 0,Season,Squad,Goals For,Goals Against
0,2013-2014,Manchester City,102,37
1,2013-2014,Liverpool,101,50
2,2013-2014,Chelsea,71,27
3,2013-2014,Arsenal,68,41
4,2013-2014,Everton,61,39
...,...,...,...,...
215,2023-2024,Brentford,56,65
216,2023-2024,Nott'ham Forest,49,67
217,2023-2024,Luton Town,52,85
218,2023-2024,Burnley,41,78


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Web Scraping for Possession, Yellow Cards, and Penalty Kicks:**

In [None]:
def scrape_POSS_YC_PEN(start_season_year, end_season_year):
    """Scrape possession, penalty kicks made and yellow cards per squad.

    For every season from ``start_season_year`` through ``end_season_year``
    (both inclusive; each value is the year in which the season starts), the
    season's fbref.com Premier League stats page is opened in headless Chrome
    and the table with id ``stats_squads_standard_for`` is read.

    Args:
        start_season_year: Starting year of the first season to scrape.
        end_season_year: Starting year of the last season to scrape.

    Returns:
        A DataFrame with columns "Season", "Squad", "Possession",
        "Penalty Kicks (Made)" and "Yellow Cards", holding the cell text as
        scraped (strings; a cell may be empty). An empty DataFrame is returned
        when the range contains no seasons.

    Raises:
        Whatever Selenium raises when the page or table cannot be loaded
        (e.g. ``NoSuchElementException`` from ``find_element``); the browser
        is always closed before the exception propagates.
    """
    # One frame per season, concatenated once after the loop instead of
    # re-copying the accumulated result on every iteration.
    season_frames = []

    # Set up Selenium WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)

    try:
        # Loop through each season in the range
        for year in range(start_season_year, end_season_year + 1):

            # Generate the season string
            season = f"{year}-{year + 1}"

            # Construct the URL and table ID
            url = f"https://fbref.com/en/comps/9/{season}/{season}-Premier-League-Stats#all_rank_key"
            table_id = "stats_squads_standard_for"

            print(f"Scraping data for season: {season}...")  # Progress indicator

            # Open the webpage
            driver.get(url)

            # Locate the table
            table = driver.find_element(By.ID, table_id)
            rows = table.find_elements(By.TAG_NAME, "tr")

            # Each row is read as its <th> cells followed by its <td> cells,
            # so the column positions used below count the <th> cells first.
            data = []
            for row in rows:
                cells = row.find_elements(By.TAG_NAME, "th") + row.find_elements(By.TAG_NAME, "td")
                data.append([cell.text for cell in cells])

            # Skip the first two rows (presumably the two header rows) and keep
            # the positions renamed to Squad / Possession / Penalty Kicks /
            # Yellow Cards below.
            df = pd.DataFrame(data).iloc[2:, [0, 3, 12, 14]].reset_index(drop=True)
            df.columns = ["Squad", "Possession", "Penalty Kicks (Made)", "Yellow Cards"]

            # Add a new column for the season
            df.insert(0, "Season", season)

            season_frames.append(df)

    finally:
        # Close the WebDriver
        driver.quit()

    # Combine all seasons; pd.concat rejects an empty list, so fall back to an
    # empty frame when no season was requested.
    if season_frames:
        all_seasons_df = pd.concat(season_frames, ignore_index=True)
    else:
        all_seasons_df = pd.DataFrame()

    # Display the combined DataFrame
    print("Completed!")
    display(all_seasons_df)

    # Return the combined DataFrame
    return all_seasons_df

# Season window: first calendar year of the opening and closing seasons (inclusive)
start_season_year = 2013
end_season_year = 2023
all_data = scrape_POSS_YC_PEN(start_season_year, end_season_year)

# Build the output file name once, write the CSV, then trigger the Colab download
csv_name = f"Premier_League_Data_POSS_YC_PEN_{start_season_year}-{start_season_year+1}_to_{end_season_year}-{end_season_year+1}.csv"
all_data.to_csv(csv_name, index=False)
from google.colab import files
files.download(csv_name)

Scraping data for season: 2013-2014...
Scraping data for season: 2014-2015...
Scraping data for season: 2015-2016...
Scraping data for season: 2016-2017...
Scraping data for season: 2017-2018...
Scraping data for season: 2018-2019...
Scraping data for season: 2019-2020...
Scraping data for season: 2020-2021...
Scraping data for season: 2021-2022...
Scraping data for season: 2022-2023...
Scraping data for season: 2023-2024...
Completed!


Unnamed: 0,Season,Squad,Possession,Penalty Kicks (Made),Yellow Cards
0,2013-2014,Arsenal,,3,54
1,2013-2014,Aston Villa,,2,78
2,2013-2014,Cardiff City,,1,49
3,2013-2014,Chelsea,,5,62
4,2013-2014,Crystal Palace,,5,55
...,...,...,...,...,...
215,2023-2024,Nott'ham Forest,40.6,1,82
216,2023-2024,Sheffield Utd,35.8,5,101
217,2023-2024,Tottenham,61.6,2,92
218,2023-2024,West Ham,41.1,4,82


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Web Scraping for Match Records:**

In [None]:
def scrape_matches(start_season_year, end_season_year):
    """Scrape played Premier League matches for a range of seasons.

    For every season from ``start_season_year`` through ``end_season_year``
    (both inclusive; each value is the year in which the season starts), the
    season's fbref.com "Scores and Fixtures" page is opened in headless Chrome
    and the rows under the element with id ``all_sched`` are read. Within a
    row's <td> cells, position 1 is taken as the date, 3 as the home team,
    5 as the score and 7 as the away team.

    Args:
        start_season_year: Starting year of the first season to scrape.
        end_season_year: Starting year of the last season to scrape.

    Returns:
        A DataFrame with columns "Season", "Date", "Home Team", "Score" and
        "Away Team" (all strings). Rows in which any of date, home team, score
        or away team is blank are omitted. An empty DataFrame is returned when
        the range contains no seasons.

    Raises:
        Whatever Selenium raises when the page or table cannot be loaded
        (e.g. ``NoSuchElementException`` from ``find_element``); the browser
        is always closed before the exception propagates.
    """
    # One frame per season, concatenated once after the loop instead of
    # re-copying the accumulated result on every iteration.
    season_frames = []

    # Set up Selenium WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)

    try:
        # Loop through each season in the range
        for year in range(start_season_year, end_season_year + 1):

            # Generate the season string
            season = f"{year}-{year + 1}"

            # Construct the URL
            url = f"https://fbref.com/en/comps/9/{season}/schedule/{season}-Premier-League-Scores-and-Fixtures"
            table_id = "all_sched"  # Table ID for the fixtures

            print(f"Scraping matches for season: {season}...")  # Progress indicator

            # Open the webpage
            driver.get(url)

            # Locate the table
            table = driver.find_element(By.ID, table_id)
            rows = table.find_elements(By.TAG_NAME, "tr")

            # Extract table data
            data = []
            for row in rows:
                cols = row.find_elements(By.TAG_NAME, "td")

                # The highest position read below is 7, so a row needs at least
                # 8 cells; shorter rows (e.g. header rows without <td>) are skipped.
                if len(cols) < 8:
                    continue

                date = cols[1].text
                home_team = cols[3].text
                score = cols[5].text
                away_team = cols[7].text

                # Append only rows with non-empty values for all relevant fields
                if date.strip() and home_team.strip() and score.strip() and away_team.strip():
                    data.append([season, date, home_team, score, away_team])

            # Convert to DataFrame
            df = pd.DataFrame(data, columns=["Season", "Date", "Home Team", "Score", "Away Team"])

            # Ensure "Score" is stored as a string
            df["Score"] = df["Score"].astype(str)

            season_frames.append(df)

    finally:
        # Close the WebDriver
        driver.quit()

    # Combine all seasons; pd.concat rejects an empty list, so fall back to an
    # empty frame when no season was requested.
    if season_frames:
        all_matches_df = pd.concat(season_frames, ignore_index=True)
    else:
        all_matches_df = pd.DataFrame()

    # Display the combined DataFrame
    print("Completed!")
    display(all_matches_df)

    # Return the combined DataFrame
    return all_matches_df

# Season window: first calendar year of the opening and closing seasons (inclusive)
start_season_year = 2017
end_season_year = 2023
matches_data = scrape_matches(start_season_year, end_season_year)

# Build the output file name once, write the CSV, then trigger the Colab download.
# The "Mathes" spelling is part of the existing file name and is left unchanged.
csv_name = f"Premier_League_Data_Mathes_{start_season_year}-{start_season_year+1}_to_{end_season_year}-{end_season_year+1}.csv"
matches_data.to_csv(csv_name, index=False)

from google.colab import files
files.download(csv_name)

Scraping matches for season: 2017-2018...
Scraping matches for season: 2018-2019...
Scraping matches for season: 2019-2020...
Scraping matches for season: 2020-2021...
Scraping matches for season: 2021-2022...
Scraping matches for season: 2022-2023...
Scraping matches for season: 2023-2024...
Completed!


Unnamed: 0,Season,Date,Home Team,Score,Away Team
0,2017-2018,2017-08-11,Arsenal,4–3,Leicester City
1,2017-2018,2017-08-12,Watford,3–3,Liverpool
2,2017-2018,2017-08-12,Crystal Palace,0–3,Huddersfield
3,2017-2018,2017-08-12,West Brom,1–0,Bournemouth
4,2017-2018,2017-08-12,Chelsea,2–3,Burnley
...,...,...,...,...,...
2655,2023-2024,2024-05-19,Brentford,2–4,Newcastle Utd
2656,2023-2024,2024-05-19,Chelsea,2–1,Bournemouth
2657,2023-2024,2024-05-19,Crystal Palace,5–0,Aston Villa
2658,2023-2024,2024-05-19,Liverpool,2–0,Wolves


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [8]:
def scrape_matches(start_season_year, end_season_year):
    """Scrape played Premier League matches for a range of seasons.

    This definition replaces any earlier ``scrape_matches`` in the kernel. It
    reads the score from <td> position 4 and the away team from position 5
    (date is position 1, home team position 3).
    NOTE(review): presumably this layout applies to season pages without the
    two xG columns -- confirm against the pages being scraped.

    For every season from ``start_season_year`` through ``end_season_year``
    (both inclusive; each value is the year in which the season starts), the
    season's fbref.com "Scores and Fixtures" page is opened in headless Chrome
    and the rows under the element with id ``all_sched`` are read.

    Args:
        start_season_year: Starting year of the first season to scrape.
        end_season_year: Starting year of the last season to scrape.

    Returns:
        A DataFrame with columns "Season", "Date", "Home Team", "Score" and
        "Away Team" (all strings). Rows with 6 or fewer <td> cells, or in which
        any of date, home team, score or away team is blank, are omitted. An
        empty DataFrame is returned when the range contains no seasons.

    Raises:
        Whatever Selenium raises when the page or table cannot be loaded
        (e.g. ``NoSuchElementException`` from ``find_element``); the browser
        is always closed before the exception propagates.
    """
    # One frame per season, concatenated once after the loop instead of
    # re-copying the accumulated result on every iteration.
    season_frames = []

    # Set up Selenium WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)

    try:
        # Loop through each season in the range
        for year in range(start_season_year, end_season_year + 1):

            # Generate the season string
            season = f"{year}-{year + 1}"

            # Construct the URL
            url = f"https://fbref.com/en/comps/9/{season}/schedule/{season}-Premier-League-Scores-and-Fixtures"
            table_id = "all_sched"  # Table ID for the fixtures

            print(f"Scraping matches for season: {season}...")  # Progress indicator

            # Open the webpage
            driver.get(url)

            # Locate the table
            table = driver.find_element(By.ID, table_id)
            rows = table.find_elements(By.TAG_NAME, "tr")

            # Extract table data
            data = []
            for row in rows:
                cols = row.find_elements(By.TAG_NAME, "td")

                # Skip rows that are too short to be a match row
                # (e.g. header rows, which have no <td> cells).
                if len(cols) <= 6:
                    continue

                date = cols[1].text
                home_team = cols[3].text
                score = cols[4].text
                away_team = cols[5].text

                # Append only rows with non-empty values for all relevant fields
                if date.strip() and home_team.strip() and score.strip() and away_team.strip():
                    data.append([season, date, home_team, score, away_team])

            # Convert to DataFrame
            df = pd.DataFrame(data, columns=["Season", "Date", "Home Team", "Score", "Away Team"])

            # Ensure "Score" is stored as a string
            df["Score"] = df["Score"].astype(str)

            season_frames.append(df)

    finally:
        # Close the WebDriver
        driver.quit()

    # Combine all seasons; pd.concat rejects an empty list, so fall back to an
    # empty frame when no season was requested.
    if season_frames:
        all_matches_df = pd.concat(season_frames, ignore_index=True)
    else:
        all_matches_df = pd.DataFrame()

    # Display the combined DataFrame
    print("Completed!")
    display(all_matches_df)

    # Return the combined DataFrame
    return all_matches_df

# Season window: first calendar year of the opening and closing seasons (inclusive)
start_season_year = 2013
end_season_year = 2016
matches_data = scrape_matches(start_season_year, end_season_year)

# Build the output file name once, write the CSV, then trigger the Colab download.
# The "Mathes" spelling is part of the existing file name and is left unchanged.
csv_name = f"Premier_League_Data_Mathes_{start_season_year}-{start_season_year+1}_to_{end_season_year}-{end_season_year+1}.csv"
matches_data.to_csv(csv_name, index=False)

from google.colab import files
files.download(csv_name)

Scraping matches for season: 2013-2014...
Scraping matches for season: 2014-2015...
Scraping matches for season: 2015-2016...
Scraping matches for season: 2016-2017...
Completed!


Unnamed: 0,Season,Date,Home Team,Score,Away Team
0,2013-2014,2013-08-17,Liverpool,1–0,Stoke City
1,2013-2014,2013-08-17,Norwich City,2–2,Everton
2,2013-2014,2013-08-17,Swansea City,1–4,Manchester Utd
3,2013-2014,2013-08-17,West Ham,2–0,Cardiff City
4,2013-2014,2013-08-17,West Brom,0–1,Southampton
...,...,...,...,...,...
1515,2016-2017,2017-05-21,Manchester Utd,2–0,Crystal Palace
1516,2016-2017,2017-05-21,Liverpool,3–0,Middlesbrough
1517,2016-2017,2017-05-21,Watford,0–5,Manchester City
1518,2016-2017,2017-05-21,Burnley,1–2,West Ham


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Web Scraping for Save Percentage and Clean Sheet Percentage:**

In [9]:
def scrape_SP_CSP(start_season_year, end_season_year):
    """Scrape save percentage and clean sheet percentage per squad.

    For every season from ``start_season_year`` through ``end_season_year``
    (both inclusive; each value is the year in which the season starts), the
    season's fbref.com Premier League stats page is opened in headless Chrome
    and the table with id ``stats_squads_keeper_for`` is read.

    Args:
        start_season_year: Starting year of the first season to scrape.
        end_season_year: Starting year of the last season to scrape.

    Returns:
        A DataFrame with columns "Season", "Squad", "Save Percentage" and
        "Clean Sheet Percentage", holding the cell text as scraped (strings).
        An empty DataFrame is returned when the range contains no seasons.

    Raises:
        Whatever Selenium raises when the page or table cannot be loaded
        (e.g. ``NoSuchElementException`` from ``find_element``); the browser
        is always closed before the exception propagates.
    """
    # One frame per season, concatenated once after the loop instead of
    # re-copying the accumulated result on every iteration.
    season_frames = []

    # Set up Selenium WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)

    try:
        # Loop through each season in the range
        for year in range(start_season_year, end_season_year + 1):

            # Generate the season string
            season = f"{year}-{year + 1}"

            # Construct the URL and table ID
            url = f"https://fbref.com/en/comps/9/{season}/{season}-Premier-League-Stats#all_rank_key"
            table_id = "stats_squads_keeper_for"

            print(f"Scraping data for season: {season}...")  # Progress indicator

            # Open the webpage
            driver.get(url)

            # Locate the table
            table = driver.find_element(By.ID, table_id)
            rows = table.find_elements(By.TAG_NAME, "tr")

            # Each row is read as its <th> cells followed by its <td> cells,
            # so the column positions used below count the <th> cells first.
            data = []
            for row in rows:
                cells = row.find_elements(By.TAG_NAME, "th") + row.find_elements(By.TAG_NAME, "td")
                data.append([cell.text for cell in cells])

            # Skip the first two rows (presumably the two header rows) and keep
            # the positions renamed to Squad / Save Percentage / Clean Sheet
            # Percentage below.
            df = pd.DataFrame(data).iloc[2:, [0, 10, 15]].reset_index(drop=True)
            df.columns = ["Squad", "Save Percentage", "Clean Sheet Percentage"]

            # Add a new column for the season
            df.insert(0, "Season", season)

            season_frames.append(df)

    finally:
        # Close the WebDriver
        driver.quit()

    # Combine all seasons; pd.concat rejects an empty list, so fall back to an
    # empty frame when no season was requested.
    if season_frames:
        all_seasons_df = pd.concat(season_frames, ignore_index=True)
    else:
        all_seasons_df = pd.DataFrame()

    # Display the combined DataFrame
    print("Completed!")
    display(all_seasons_df)

    # Return the combined DataFrame
    return all_seasons_df

# Season window: first calendar year of the opening and closing seasons (inclusive)
start_season_year = 2013
end_season_year = 2023
all_data = scrape_SP_CSP(start_season_year, end_season_year)

# Build the output file name once, write the CSV, then trigger the Colab download
csv_name = f"Premier_League_Data_SP_CSP_{start_season_year}-{start_season_year+1}_to_{end_season_year}-{end_season_year+1}.csv"
all_data.to_csv(csv_name, index=False)
from google.colab import files
files.download(csv_name)

Scraping data for season: 2013-2014...
Scraping data for season: 2014-2015...
Scraping data for season: 2015-2016...
Scraping data for season: 2016-2017...
Scraping data for season: 2017-2018...
Scraping data for season: 2018-2019...
Scraping data for season: 2019-2020...
Scraping data for season: 2020-2021...
Scraping data for season: 2021-2022...
Scraping data for season: 2022-2023...
Scraping data for season: 2023-2024...
Completed!


Unnamed: 0,Season,Squad,Save Percentage,Clean Sheet Percentage
0,2013-2014,Arsenal,72.1,44.7
1,2013-2014,Aston Villa,64.9,23.7
2,2013-2014,Cardiff City,66.4,18.4
3,2013-2014,Chelsea,77.7,47.4
4,2013-2014,Crystal Palace,71.6,31.6
...,...,...,...,...
215,2023-2024,Nott'ham Forest,61.3,10.5
216,2023-2024,Sheffield Utd,62.4,2.6
217,2023-2024,Tottenham,67.7,18.4
218,2023-2024,West Ham,72.6,13.2


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>