In [1]:
import pandas as pd
import numpy as np
from scrapping_utils import get_tables

## Functions for scraping ESPN MLS standings

- example url: https://www.espn.com/soccer/standings/_/league/USA.1/season/2016

In [2]:
def get_mls_standings_df(url, year):
    """Turn one scraped standings page into a DataFrame tagged with its season.

    Parameters
    ----------
    url
        Passed unchanged to ``get_tables``.
    year
        Value written to the ``Year`` column of every row.

    Returns
    -------
    pandas.DataFrame
        The rows of the second scraped table, labelled by that table's first
        row, plus a ``Team`` column (taken from the first scraped table) and a
        ``Year`` column. The label row itself and any row whose ``Team`` is
        ``"Western Conference"`` are removed. The original row index is kept.
    """
    scraped = get_tables(url)

    standings = pd.DataFrame(scraped[1])
    # The first row of the second table supplies the column labels.
    standings.columns = standings.iloc[0]

    # Flatten the first table to 1-D so it can be attached as a column.
    # Assumes it holds exactly one entry per row of the second table -- TODO confirm.
    team_names = np.array(scraped[0]).reshape(-1)
    standings = standings.assign(Team=team_names, Year=year)

    # The label row has served its purpose; remove it from the data.
    standings = standings.drop(index=standings.index[0])

    # Rows labelled "Western Conference" are excluded from the result
    # (presumably a section heading rather than a club -- verify against the page).
    keep = standings["Team"] != "Western Conference"
    return standings[keep]

## Scrape available data from espn.com

In [3]:
# Scrape one standings table per MLS season, 2003 through 2021 inclusive
# (range() excludes its stop value), and stack the results into one frame.
# NOTE(review): the reason for starting at 2003 is not recorded here -- confirm.
years = range(2003, 2022)
urls = []
df_list = []
for year in years:
    url = f"https://www.espn.com/soccer/standings/_/league/USA.1/season/{year}"
    urls.append(url)
    df_list.append(get_mls_standings_df(url, year))

df_complete = pd.concat(df_list)

In [4]:
# Write the combined standings to disk; the DataFrame index is not written.
df_complete.to_csv(
    path_or_buf="../../raw_data/MLS/MLS_scrapped.csv",
    index=False,
)