In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import json

In [3]:
# Load the team lookup table: keys are team abbreviations and values are the
# team names as lowercase words joined with dashes (the format the reverse
# lookup in this notebook searches for).
# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with open('data/team_names.json', 'r', encoding='utf-8') as names_file:
    team_names = json.load(names_file)

In [4]:
# Fetch a single sample page (2017 regular season, GAME_STATS category).
# A timeout keeps the cell from hanging forever on an unresponsive server.
page = requests.get('http://www.nfl.com/stats/categorystats?archive=false'
                    '&conference=null&role=TM&offensiveStatisticCategory=GAME_STATS'
                    '&defensiveStatisticCategory=null&season=2017&seasonType=REG'
                    '&tabSeq=2&qualified=false&Submit=Go',
                    timeout=30)
# fail loudly on an HTTP error instead of silently parsing an error page
page.raise_for_status()

soup = BeautifulSoup(page.text, 'html.parser')

In [83]:
# Fallback abbreviations for team names that are looked up only when the
# name is not found among the values of the lookup table.
_FALLBACK_TEAM_KEYS = {
    'st.-louis-rams': 'LA',
    'san-diego-chargers': 'LAC',
}


def get_team_key(value, names=None):
    '''
    This will retrieve the dictionary key associated with a given value

    Parameters
    ----------
    value : str
        Team name to look up among the dictionary's values.
    names : dict, optional
        Mapping to search. Defaults to the notebook-level ``team_names``
        dictionary when omitted.

    Returns
    -------
    The first key whose value equals ``value``. If no value matches, the
    fallback abbreviation for 'st.-louis-rams' ('LA') or
    'san-diego-chargers' ('LAC') is returned.

    Raises
    ------
    ValueError
        If ``value`` is neither a value of the mapping nor one of the
        fallback names.
    '''
    if names is None:
        names = team_names
    # single pass over the mapping; the first match wins
    for key, name in names.items():
        if name == value:
            return key
    key = _FALLBACK_TEAM_KEYS.get(value)
    if key is None:
        # report the offending name instead of failing on an unbound local
        raise ValueError('unknown team name: {!r}'.format(value))
    return key

def get_table(soup, cat, year):
    '''
    This function will get the specified table from the soup

    Parameters
    ----------
    soup : parsed page
        Object exposing ``find_all('table')``; the first table found is used.
    cat : str
        Statistic category. 'GAME_STATS' and 'TEAM_PASSING' tables are read
        as 21 columns wide, every other category as 17 columns wide.
    year : object
        Value written into the inserted 'Year' column.

    Returns
    -------
    pandas.DataFrame
        The table with its first row promoted to column names, the 'Rk'
        column dropped, 'Team' replaced by the key returned from
        ``get_team_key`` and a 'Year' column inserted at position 1.

    Raises
    ------
    IndexError
        If the page contains no table.
    ValueError
        If the number of non-blank cells does not match the expected shape.
    '''
    # expected layout: 33 rows with the header in the first row
    # (presumably header + one row per team -- TODO confirm)
    n_rows = 33
    n_cols = 21 if cat in ('GAME_STATS', 'TEAM_PASSING') else 17

    # find the table
    tables = soup.find_all('table')
    if not tables:
        raise IndexError('no table found in page for category {!r}, '
                         'year {!r}'.format(cat, year))
    # split on newlines and tabs (there's a lot of them) and filter out
    # blank entries and entries that are just a space, in a single pass
    cells = [cell for cell in tables[0].text.replace('\n', '\t').split('\t')
             if cell not in ('', ' ')]
    # validate before reshaping so a layout change gives a readable error
    if len(cells) != n_rows * n_cols:
        raise ValueError('expected {} cells ({} x {}) for category {!r}, '
                         'year {!r}, got {}'.format(n_rows * n_cols, n_rows,
                                                    n_cols, cat, year,
                                                    len(cells)))
    # reshape the table and convert to dataframe
    table_df = pd.DataFrame(np.reshape(cells, (n_rows, n_cols)))
    # header is in the first row
    table_df.columns = table_df.iloc[0]
    # drop the header row, reset the index and drop the rank column
    table_df = table_df.drop(0).reset_index(drop=True).drop('Rk', axis=1)
    # convert team name to lowercase and join with a dash (this is the
    # format that came from the salary scraping), then look up the team
    # abbreviation (key) from the team name (value)
    table_df['Team'] = table_df['Team'].apply(
        lambda name: get_team_key(name.lower().replace(' ', '-')))
    # insert the year
    table_df.insert(1, 'Year', year)

    return table_df

In [84]:
# URL template for a stats page; the two '{}' placeholders are filled, in
# order, with the offensive statistic category and the season.
url = 'http://www.nfl.com/stats/categorystats?' + '&'.join([
    'archive=false',
    'conference=null',
    'role=TM',
    'offensiveStatisticCategory={}',
    'defensiveStatisticCategory=null',
    'season={}',
    'seasonType=REG',
    'tabSeq=2',
    'qualified=false',
    'Submit=Go',
])

In [104]:
years = ['2012', '2013', '2014', '2015', '2016', '2017']

categories = ['GAME_STATS', 'TEAM_PASSING', 'RUSHING']

# columns kept from each category's table
use_cols = {
    'GAME_STATS': ['Team', 'Year', 'Pts/G', 'Pen Yds', 'TO'],
    'RUSHING': ['Team', 'Year', 'Pts/G', 'Yds/G', 'TD'],
    'TEAM_PASSING': ['Team', 'Year', 'Pts/G', 'Yds/G', 'TD', 'Sck', 'Rate']
}

# one merged frame per year; they are concatenated once after the loop
# instead of growing a DataFrame inside the loop
yearly_tables = []

for year in years:
    # create dictionary to hold three different dataframes
    tables = {}
    for cat in categories:
        # timeout so a stalled request cannot hang the whole scrape
        page = requests.get(url.format(cat, year), timeout=30)
        # stop on an HTTP error rather than parsing an error page
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')
        # get the table and clip the columns
        tables[cat] = get_table(soup, cat, year)[use_cols[cat]]
    # merge rushing and passing tables; shared stat columns get suffixes
    merged_table = pd.merge(tables['RUSHING'], tables['TEAM_PASSING'],
                            on=['Team', 'Year'], suffixes=('_rush', '_pass'))
    # merge the game stats table (Team and Year are the only shared columns
    # left after suffixing, so name them explicitly)
    merged_table = merged_table.merge(tables['GAME_STATS'], on=['Team', 'Year'])
    yearly_tables.append(merged_table)

# concatenate merged tables for each year (each year's row index is kept)
offense_df = pd.concat(yearly_tables)

In [105]:
# preview the first rows rather than dumping the whole frame
offense_df.head(10)

Unnamed: 0,Team,Year,Pts/G_rush,Yds/G_rush,TD_rush,Pts/G_pass,Yds/G_pass,TD_pass,Sck,Rate,Pts/G,Pen Yds,TO
0,WAS,2012,27.2,169.3,22,27.2,213.9,24,33,102.1,27.2,985,+17
1,MIN,2012,23.7,164.6,16,23.7,171.9,18,32,81.2,23.7,830,-1
2,SEA,2012,25.8,161.2,16,25.8,189.4,27,33,100.6,25.8,890,+13
3,SF,2012,24.8,155.7,17,24.8,206.1,23,41,101.2,24.8,960,+9
4,KC,2012,13.2,149.7,9,13.2,169.6,8,40,63.8,13.2,733,-24
5,BUF,2012,21.5,138.6,12,21.5,204.3,24,30,82.2,21.5,871,-13
6,NE,2012,34.8,136.5,25,34.8,291.4,34,27,97.7,34.8,840,+25
7,HOU,2012,26,132.7,19,26,239.4,22,28,89.2,26,873,+12
8,CAR,2012,22.3,130.5,21,22.3,230.2,19,36,86.5,22.3,835,+1
9,CHI,2012,23.4,123.1,11,23.4,187.4,21,44,80.4,23.4,811,+20
