In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json

In [2]:
# Load the team lookup used by get_team_key: each key is a team
# abbreviation and each value is the matching team name.
with open('data/team_names.json', 'r') as names_file:
    team_names = json.loads(names_file.read())

In [3]:
def get_team_key(value):
    '''
    Return the dictionary key (team abbreviation) associated with a given
    value (team name) in the module-level ``team_names`` mapping.

    The first key whose value equals ``value`` is returned. When the
    mapping has no match, two names are translated explicitly:
    'st.-louis-rams' -> 'LA' and 'san-diego-chargers' -> 'LAC'.

    Parameters
    ----------
    value : str
        Team name to look up, e.g. 'san-diego-chargers'.

    Returns
    -------
    The matching key from ``team_names``, or 'LA' / 'LAC' for the two
    fallback names.

    Raises
    ------
    ValueError
        If ``value`` is neither in ``team_names`` nor one of the fallback
        names (this used to surface as an UnboundLocalError).
    '''
    # names that are only consulted when team_names has no match
    fallback_keys = {
        'st.-louis-rams': 'LA',
        'san-diego-chargers': 'LAC',
    }
    # first matching entry wins, without building throwaway lists
    for key, name in team_names.items():
        if name == value:
            return key
    if value in fallback_keys:
        return fallback_keys[value]
    raise ValueError('unknown team name: {!r}'.format(value))

In [4]:
def get_table(soup, cat, year, n_rows=33):
    '''
    Extract the first table in the soup as a cleaned-up DataFrame.

    Parameters
    ----------
    soup : parsed page exposing ``find_all('table')``; the table's text
        content is read from its ``.text`` attribute
    cat : statistic category the page was requested for; only used to
        make error messages more helpful
    year : value stored in the inserted 'Year' column
    n_rows : number of rows in the table, header row included
        (defaults to 33, the value that used to be hard-coded)

    Returns
    -------
    DataFrame whose header comes from the first table row, with the 'Rk'
    column removed, 'Team' replaced by the key from ``get_team_key`` and
    'Year' inserted as the second column. No numeric conversion is done
    here.

    Raises
    ------
    ValueError
        If the page has no table, or the cells cannot be arranged into
        ``n_rows`` equally long rows.
    '''
    tables = soup.find_all('table')
    if not tables:
        raise ValueError(
            'no table found for category {!r}, season {!r}'.format(cat, year))
    table = tables[0]
    # cells are separated by a mix of newlines and tabs (there's a lot of
    # them), so treat both as the same delimiter
    raw_cells = table.text.replace('\n', '\t').split('\t')
    # drop blank entries and entries that are just a single space
    cells = [cell for cell in raw_cells if cell and cell != ' ']
    # make sure the cells fit the expected grid before reshaping
    n_cols, leftover = divmod(len(cells), n_rows)
    if leftover or not n_cols:
        raise ValueError(
            'cannot arrange {} cells into {} rows for category {!r}, '
            'season {!r}'.format(len(cells), n_rows, cat, year))
    table_df = pd.DataFrame(np.reshape(cells, (n_rows, n_cols)))
    # header is in the first row
    table_df.columns = table_df.iloc[0]
    # drop the header row and reset the index
    table_df = table_df.drop(0).reset_index(drop=True)
    # drop the rank column
    table_df = table_df.drop('Rk', axis=1)
    # convert team name to lowercase and join with a dash (the format that
    # came from the salary scraping), then look up the team abbreviation
    # (key) from the team name (value)
    table_df['Team'] = table_df['Team'].apply(
        lambda name: get_team_key(name.lower().replace(' ', '-')))
    # insert the year
    table_df.insert(1, 'Year', year)

    return table_df

In [5]:
# URL template for a team statistics page; the two '{}' placeholders are
# filled, in order, with the offensive category and the season.
url = 'http://www.nfl.com/stats/categorystats?' + '&'.join([
    'archive=false',
    'conference=null',
    'role=TM',
    'offensiveStatisticCategory={}',
    'defensiveStatisticCategory=null',
    'season={}',
    'seasonType=REG',
    'tabSeq=2',
    'qualified=false',
    'Submit=Go',
])

In [6]:
# seasons to scrape, as strings because they are substituted into the URL
years = ['2012', '2013', '2014', '2015', '2016', '2017']

# statistic categories requested for every season
categories = ['GAME_STATS', 'TEAM_PASSING', 'RUSHING']

# columns kept from each category's table
use_cols = {
    'GAME_STATS': ['Team', 'Year', 'Pts/G', 'Pen Yds', 'TO'],
    'RUSHING': ['Team', 'Year', 'Yds/G', 'TD'],
    'TEAM_PASSING': ['Team', 'Year', 'Yds/G', 'Pct', 'TD', 'Sck', 'Rate']
}

# seconds to wait for the server; without a timeout requests can block
# forever on an unresponsive connection
request_timeout = 30

# one merged frame per season, concatenated once after the loop
yearly_tables = []

for year in years:
    # create dictionary to hold three different dataframes
    tables = {}
    for cat in categories:
        page = requests.get(url.format(cat, year), timeout=request_timeout)
        # fail loudly on an HTTP error instead of parsing an error page
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')
        # get the table and clip the columns
        tables[cat] = get_table(soup, cat, year)[use_cols[cat]]
    # merge rushing and passing tables; 'Yds/G' and 'TD' exist in both,
    # so they are disambiguated with suffixes
    merged_table = pd.merge(tables['RUSHING'], tables['TEAM_PASSING'],
                            on=['Team', 'Year'], suffixes=('_rush', '_pass'))
    # merge the game stats table on the same keys
    merged_table = merged_table.merge(tables['GAME_STATS'], on=['Team', 'Year'])
    yearly_tables.append(merged_table)

# concatenate merged tables for each year in a single pass; ignore_index
# gives every row a unique label instead of repeating each season's labels
offense_df = pd.concat(yearly_tables, ignore_index=True)

# convert object columns to floats (everything after 'Team' and 'Year'),
# stripping thousands separators first
for col in offense_df.columns[2:]:
    offense_df[col] = offense_df[col].astype(str).str.replace(',', '').astype(float)

In [7]:
# persist the combined offense table; the row index is not written out
offense_df.to_csv(path_or_buf='data/offense.csv', index=False)