In [25]:
import pandas as pd
import numpy as np


from scrapping_utils import get_tables

## Scraping football standings and preparing them directly

Leagues we want to scrape, each with an example link:

- English Premier League : https://www.espn.com/soccer/standings/_/league/ENG.1/season/2021
- Spanish La Liga : https://www.espn.com/soccer/standings/_/league/esp.1/season/2020
- French Ligue 1 : https://www.espn.com/soccer/standings/_/league/fra.1/season/2020
- Italian Serie A : https://www.espn.com/soccer/standings/_/league/ita.1/season/2020
- German Bundesliga : https://www.espn.com/soccer/standings/_/league/ger.1/season/2020



In [26]:
def get_football_standings_df(url, year):
    """Scrape one standings page and return it as a DataFrame.

    Args:
        url (str): Address of the standings page, passed unchanged to
            ``get_tables``.
        year: Value written into the "Year" column of every returned row.

    Returns:
        pd.DataFrame: The second table returned by ``get_tables`` with its
            first row promoted to column headers (and removed from the data),
            plus a "Team" column taken from the first table and a constant
            "Year" column.
    """
    tables = get_tables(url)

    # tables[1] is used as the statistics table; its first row carries the
    # column names.
    standings = pd.DataFrame(tables[1])
    standings.columns = standings.iloc[0]

    # tables[0] is flattened to 1-D and used as the team names. It is assigned
    # while the header row is still present, so it needs one entry per row of
    # the statistics table, header row included.
    standings["Team"] = np.array(tables[0]).reshape(-1)
    standings["Year"] = year

    # The header row now lives in the column labels; remove it from the data.
    return standings.drop(standings.index[0])


def prepare_football_standings(df):
    """Collapse per-team standings into one row per season.

    Args:
        df (pd.DataFrame): Standings with one row per team per season. Must
            contain the columns "Year", "Team", "W", "L" and "D".

    Returns:
        pd.DataFrame: One row per distinct "Year" (in sorted order, as
            produced by ``groupby``) with the columns "Year", "Teams",
            "Wins", "Losses", "Ties" and "#Games". "Teams", "Wins", "Losses"
            and "Ties" each hold a list with one entry per team, in the row
            order of ``df``. "#Games" is W + L + D of the first team listed
            for that season. The result has a fresh 0..n-1 index; an input
            without rows yields an empty frame with the same columns.

    Raises:
        KeyError: If one of the required columns is missing.
        ValueError: If the first team's W, L or D value cannot be converted
            to an integer.
    """
    columns = ["Year", "Teams", "Wins", "Losses", "Ties", "#Games"]

    # Collect plain records and build the frame once at the end; concatenating
    # onto a growing (initially empty) DataFrame inside the loop is quadratic
    # and leaves every row with the same index label.
    records = []
    for year, group in df.groupby("Year"):
        wins = group["W"].tolist()
        losses = group["L"].tolist()
        ties = group["D"].tolist()

        # Scraped cells may arrive as strings, in which case a bare `+` would
        # concatenate ("10" + "2" + "1" -> "1021") instead of adding, so cast
        # before summing.
        # NOTE(review): only the first team is inspected; this assumes every
        # team in a season has played the same number of games - confirm.
        ngames = int(wins[0]) + int(losses[0]) + int(ties[0])

        records.append(
            {
                "Year": year,
                "Teams": group["Team"].tolist(),
                "Wins": wins,
                "Losses": losses,
                "Ties": ties,
                "#Games": ngames,
            }
        )

    return pd.DataFrame(records, columns=columns)

## Scrape available data from espn.com

In [27]:
# Seasons to scrape (the upper bound is exclusive, so 2003 through 2021).
years = range(2003, 2022)

# Standings page address; only the league code and the season vary.
ESPN_STANDINGS_URL = "https://www.espn.com/soccer/standings/_/league/{league}/season/{year}"

# Short league name (used for the output file names) -> league code used in
# the page address.
LEAGUE_CODES = {
    "PML": "ENG.1",
    "LaLiga": "esp.1",
    "Ligue1": "fra.1",
    "SerieA": "ita.1",
    "Bundesliga": "ger.1",
}

# Pages to scrape, as {league name: {season: url}}.
urls_dict = {
    league_name: {
        year: ESPN_STANDINGS_URL.format(league=league_code, year=year)
        for year in years
    }
    for league_name, league_code in LEAGUE_CODES.items()
}

In [28]:
# For every league: scrape each season, aggregate the standings per season,
# and write the result to one parquet file per league.
for league_name, league_urls in urls_dict.items():
    df_list = []
    for year, url in league_urls.items():
        print(url)  # progress trace: one line per page being scraped
        season_df = get_football_standings_df(url, year)
        df_list.append(season_df)

    df_complete = pd.concat(df_list)
    df_prepared = prepare_football_standings(df_complete)

    output_path = f"../../prepared_data/{league_name}_data.parquet"
    df_prepared.to_parquet(output_path, index=False)

https://www.espn.com/soccer/standings/_/league/ENG.1/season/2003
https://www.espn.com/soccer/standings/_/league/ENG.1/season/2004
https://www.espn.com/soccer/standings/_/league/ENG.1/season/2005
https://www.espn.com/soccer/standings/_/league/ENG.1/season/2006
https://www.espn.com/soccer/standings/_/league/ENG.1/season/2007
https://www.espn.com/soccer/standings/_/league/ENG.1/season/2008
https://www.espn.com/soccer/standings/_/league/ENG.1/season/2009
https://www.espn.com/soccer/standings/_/league/ENG.1/season/2010
https://www.espn.com/soccer/standings/_/league/ENG.1/season/2011
https://www.espn.com/soccer/standings/_/league/ENG.1/season/2012
https://www.espn.com/soccer/standings/_/league/ENG.1/season/2013
https://www.espn.com/soccer/standings/_/league/ENG.1/season/2014
https://www.espn.com/soccer/standings/_/league/ENG.1/season/2015
https://www.espn.com/soccer/standings/_/league/ENG.1/season/2016
https://www.espn.com/soccer/standings/_/league/ENG.1/season/2017
https://www.espn.com/socc