In [2]:
import urllib.request
from pathlib import Path
from pprint import pprint

import numpy as np
import pandas as pd
from html_table_parser.parser import HTMLTableParser

from scrapping_utils import url_get_contents, get_tables

# Scraping the schedule of the NFL season

Source: the ESPN schedule grid, e.g. http://www.espn.com/nfl/schedulegrid/_/year/2018 for the 2018 season.

In [3]:
def get_schedule_from_espn(url):
    """Turn a scraped schedule grid into a two-column Home/Away table.

    The first table returned by ``get_tables(url)`` is loaded into a
    DataFrame. Its second row supplies the column labels (a ``TEAM`` column
    plus one column per remaining header cell) and the first two rows are
    discarded. Missing cells, and cells containing ``@`` (away fixtures),
    are marked ``BYE`` and dropped, so every game is listed once, from the
    home team's row.

    Parameters
    ----------
    url
        Forwarded unchanged to ``get_tables``; presumably the address of a
        schedule-grid page (the helper's contract is not visible here).

    Returns
    -------
    pandas.DataFrame
        Columns ``Home`` and ``Away``, one row per distinct pairing. The
        index is not reset, so it keeps the labels of the melted frame.
    """
    # Only the first table handed back by the scraper is used.
    grid = pd.DataFrame(get_tables(url)[0])

    # Row 1 holds the header; rows 0 and 1 are not schedule data.
    grid.columns = grid.iloc[1]
    grid = grid.iloc[2:].replace(np.nan, 'BYE')

    # Long format: one row per (team, week) cell of the grid.
    fixtures = grid.melt('TEAM', var_name='Week', value_name='Opponent')

    # Away fixtures are flagged as BYE so that the filter below removes them
    # together with the real bye weeks; otherwise each game would show up
    # twice (once for each of the two teams).
    plays_away = fixtures['Opponent'].str.contains("@")
    fixtures.loc[plays_away, 'Opponent'] = 'BYE'

    home_games = (
        fixtures[fixtures["Opponent"] != "BYE"]
        .drop(columns="Week")
        .rename(columns={"TEAM": "Home", "Opponent": "Away"})
        .drop_duplicates(subset=["Home", "Away"])
    )
    return home_games[["Home", "Away"]]


In [118]:
# Scrape every season from 2002 through 2021 and write one CSV per season.
SCHEDULE_DIR = Path('../prepared_data/schedules')
# Create the output folder up front: to_csv does not create missing parent
# directories, so on a fresh checkout the first write would otherwise fail.
SCHEDULE_DIR.mkdir(parents=True, exist_ok=True)

years = range(2002, 2022)
urls = [f'http://www.espn.com/nfl/schedulegrid/_/year/{year}' for year in years]

for year, url in zip(years, urls):
    df_tmp = get_schedule_from_espn(url)
    # index=False: the leftover melt index carries no meaning in the CSV.
    df_tmp.to_csv(SCHEDULE_DIR / f'NFL_{year}.csv', index=False)

In [4]:
# Scrape a single season (2018) so the resulting frame can be inspected by hand.
df_tmp = get_schedule_from_espn("http://www.espn.com/nfl/schedulegrid/_/year/2018")

In [5]:
# Display the frame currently bound to df_tmp (last expression of the cell).
df_tmp

Unnamed: 0,Home,Away
0,ARI,WSH
2,BAL,BUF
4,CAR,DAL
7,CLE,PIT
9,DEN,SEA
...,...,...
538,PIT,CIN
540,SEA,ARI
541,TB,ATL
542,TEN,IND
