In [11]:
import urllib.request
from pathlib import Path
from pprint import pprint

import numpy as np
import pandas as pd
from html_table_parser.parser import HTMLTableParser

# Some code taken from https://www.geeksforgeeks.org/scrape-tables-from-any-website-using-python/


## Functions for scraping ESPN MLS standings

- Example URL: https://www.espn.com/soccer/standings/_/league/USA.1/season/2016

In [12]:
# Opens a website and read its
# binary contents (HTTP Response Body)
def url_get_contents(url):
    """Open ``url`` and return the raw bytes of the response body.

    Parameters
    ----------
    url : str
        Address to request.

    Returns
    -------
    bytes
        The undecoded response body; callers decode it themselves.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urllib.request.urlopen`` when the request fails.
    """
    req = urllib.request.Request(url=url)

    # The context manager closes the response (and its underlying socket)
    # even when read() raises, so repeated calls in a scraping loop do not
    # accumulate open connections.
    with urllib.request.urlopen(req) as response:
        return response.read()

def get_mls_standings(url):
    """Download the page at ``url`` and return the HTML tables found in it.

    The response body is decoded as UTF-8, fed to an ``HTMLTableParser``,
    and the parser's ``tables`` attribute is returned.
    """
    page_source = url_get_contents(url).decode('utf-8')

    table_parser = HTMLTableParser()
    table_parser.feed(page_source)

    return table_parser.tables

def get_mls_standings_df(url, year):
    """Return the standings scraped from ``url`` as a pandas DataFrame.

    The second parsed table supplies the data and its first row becomes the
    column names. The first parsed table is flattened to one dimension and
    stored in a ``Team`` column, and ``year`` is stored in a ``Year`` column.
    The header row and any row whose ``Team`` is "Western Conference" are
    removed before returning.
    """
    tables = get_mls_standings(url)

    standings = pd.DataFrame(tables[1])
    # Promote the first row to column names; the row itself is dropped below.
    standings.columns = standings.iloc[0]

    team_names = np.array(tables[0]).reshape(-1)
    standings["Team"] = team_names
    standings["Year"] = year

    header_label = standings.index[0]
    standings = standings.drop(header_label)

    # NOTE(review): presumably the "Western Conference" label lines up with a
    # repeated header row in the stats table -- verify against the page.
    keep_row = standings["Team"] != "Western Conference"
    return standings[keep_row]


## Scrape available data from espn.com

In [13]:
# Seasons 2003 through 2021 inclusive (range() excludes its end value).
# NOTE(review): the earlier comment here mentioned NHL and 2006-2021, which
# matches neither the league nor the range used below -- confirm the intended
# seasons.
years = range(2003, 2022)
urls = [
    f"https://www.espn.com/soccer/standings/_/league/USA.1/season/{year}"
    for year in years
]

# One DataFrame per season, stacked into a single frame afterwards.
df_list = []
for url, year in zip(urls, years):
    season_df = get_mls_standings_df(url, year)
    df_list.append(season_df)

df_complete = pd.concat(df_list)

In [15]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing (otherwise a fresh checkout fails on this cell).
# NOTE(review): the file name says "2022" although the scraped seasons end in
# 2021; the name is left unchanged in case other files read this path.
output_path = Path("raw_data/MLS_scrapped_data_2003_to_2022.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df_complete.to_csv(output_path, index=False)