In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json

In [10]:
def get_standings(soup, year):
    '''
    Parse the first <table> of a standings page into a DataFrame.

    The table text is split on newlines into a flat list of tokens. Any
    token containing '--' is treated as a marker: it is discarded and the
    token right after it is recorded as a playoff team. Of the remaining
    tokens, the first 64 alternate between team abbreviation and team
    name; everything after that is the stats grid (33 rows x 12 columns,
    the first row being the header).

    soup: parsed page exposing find_all('table')
    year: value stored unchanged in the 'Year' column of every row

    Returns a DataFrame with the columns Team, Year, W, L, T, PF, PA, DIFF
    and Playoffs (1 if the team was flagged as a playoff team, else 0).
    No numeric conversion is done here; the stat columns keep the text
    values taken from the page.
    '''
    # flatten the first table on the page into newline-separated tokens
    first_table = soup.find_all('table')[0]
    tokens = first_table.getText(separator='\n').split('\n')

    playoff_teams = []
    cells = []
    for pos, token in enumerate(tokens):
        if '--' not in token:
            cells.append(token)
            continue
        # marker token: skip it, but remember the team that follows it
        playoff_teams.append(tokens[pos + 1])

    # abbreviation / full-name pairs come first - keep every other entry
    abbreviations = cells[:64:2]

    # the rest is the stats grid, header row included
    grid = np.reshape(cells[64:], (33, 12))
    frame = pd.DataFrame(grid)
    # promote the first row to column labels, then discard it
    frame.columns = frame.iloc[0]
    frame = frame.drop(0)

    # identifying columns go in front of the stats
    frame.insert(0, 'Team', abbreviations)
    frame.insert(1, 'Year', year)

    # playoff flag: default 0, switched to 1 for teams that were marked
    frame['Playoffs'] = 0
    made_playoffs = frame['Team'].isin(playoff_teams)
    frame.loc[made_playoffs, 'Playoffs'] = 1

    # only a subset of the columns is kept
    return frame[['Team', 'Year', 'W', 'L', 'T', 'PF', 'PA', 'DIFF', 'Playoffs']]

In [11]:
def fix_team_names(standings_df):
    '''
    Normalize team abbreviations so they are consistent throughout the
    project.

    The following codes are rewritten:
        STL, LAR -> LA
        SD       -> LAC
        JAX      -> JAC
        WSH      -> WAS

    Matching is done on the whole cell value. The previous chained
    .str.replace calls did substring replacement (and relied on the
    pandas-version-dependent regex default), so any value merely containing
    one of the codes would have been rewritten as well.

    standings_df: DataFrame with a 'Team' column; the column is
                  overwritten on the frame that is passed in.

    Returns the same DataFrame object with 'Team' normalized.
    '''
    # no replacement value is itself a key, so a single pass is unambiguous
    abbreviation_fixes = {
        'STL': 'LA',
        'LAR': 'LA',
        'SD': 'LAC',
        'JAX': 'JAC',
        'WSH': 'WAS',
    }
    standings_df['Team'] = standings_df['Team'].replace(abbreviation_fixes)

    return standings_df

In [12]:
# ESPN NFL standings URL template; the `{}` placeholder is filled in with the
# season year via str.format when each page is requested
url = 'http://www.espn.com/nfl/standings/_/season/{}/group/league'

In [13]:
# Seasons to scrape. Kept as strings: each one is substituted into the URL
# and stored as-is in the 'Year' column.
years = ['2013', '2014', '2015', '2016', '2017']

# one standings table per season; concatenated once after the loop instead
# of growing a DataFrame on every iteration
season_tables = []

for year in years:
    # a timeout keeps a stalled connection from hanging the notebook
    page = requests.get(url.format(year), timeout=30)
    # fail loudly on an HTTP error instead of trying to parse an error page
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')
    # tmp holds the current year table
    tmp = get_standings(soup, year)
    season_tables.append(tmp)

# standings df holds all the tables (concatenated)
standings_df = pd.concat(season_tables)

# change object columns to floats (everything after 'Team' and 'Year')
standings_df = standings_df.astype(
    {col: float for col in standings_df.columns[2:]}
)
# fix team abbreviations for consistency
standings_df = fix_team_names(standings_df)

## Add Super Bowl Winners

In [14]:
# SB_win flags the Super Bowl winner of each season (for logit);
# every row starts at 0
standings_df['SB_win'] = 0

# hard-coded champions, keyed by the season string stored in 'Year'
super_bowl_champions = {
    '2013': 'SEA',
    '2014': 'NE',
    '2015': 'DEN',
    '2016': 'NE',
    '2017': 'PHI',
}

for season, champion in super_bowl_champions.items():
    # row(s) for this team in this season
    is_champion = (
        (standings_df['Team'] == champion) &
        (standings_df['Year'] == season)
    )
    standings_df.loc[is_champion, ['SB_win']] = 1

In [15]:
# replace the per-season row labels with one continuous 0..n-1 index,
# discarding the old labels rather than keeping them as a column
standings_df = standings_df.reset_index(drop=True)
# preview the first rows
standings_df.head(5)

Unnamed: 0,Team,Year,W,L,T,PF,PA,DIFF,Playoffs,SB_win
0,SEA,2013,13.0,3.0,0.0,417.0,231.0,186.0,1.0,1
1,DEN,2013,13.0,3.0,0.0,606.0,399.0,207.0,1.0,0
2,SF,2013,12.0,4.0,0.0,406.0,272.0,134.0,1.0,0
3,NE,2013,12.0,4.0,0.0,444.0,338.0,106.0,1.0,0
4,CAR,2013,12.0,4.0,0.0,366.0,241.0,125.0,1.0,0


In [16]:
# write the combined standings to csv; the row index is not written out
standings_csv_path = 'data/standings.csv'
standings_df.to_csv(standings_csv_path, index=False)