In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import json

## Get Team List

In [9]:
# get a page to pull the team names from first
page = requests.get(
    'http://www.spotrac.com/nfl/dallas-cowboys/positional/2018/full-cap/',
    timeout=30,  # do not hang the notebook forever on a stalled connection
)
# fail loudly on an HTTP error instead of parsing an error page
page.raise_for_status()
# Create soup of page
soup = BeautifulSoup(page.text, 'html.parser')
# the team list pulldown is the second <select> on the page (index 1)
team_select = soup.find_all('select')[1]
# one <option> per team: the option text is the team abbreviation and the
# option value is the team name as used in the site's URLs
teams = team_select.find_all('option')
# skip first option ("Team") and assign team abbreviation: team name to dict
team_names = {option.text: option['value'] for option in teams[1:]}
# save the team names dictionary for later
with open('data/team_names.json', 'w') as f:
    json.dump(team_names, f)

## Functions for Parsing Positional Salary Data

In [6]:
def get_positional(soup, n_cols=5):
    '''
    Retrieve the positional salary table from the html soup

    Parameters
    ----------
    soup : object exposing ``find_all('table')`` (e.g. a BeautifulSoup page)
        The second table found (index 1) is used as the positional
        spending table.
    n_cols : int, default 5
        Number of columns in the table. Every non-empty line of the table
        text is treated as one cell, so the cells are folded into rows of
        this width.

    Returns
    -------
    pandas.DataFrame
        One row per table row below the header, labelled with the header
        cells. Values are the cell strings exactly as they appear in the
        page text.

    Raises
    ------
    IndexError
        If the soup contains fewer than two tables.
    ValueError
        If the table has no cells or the cell count is not a multiple of
        ``n_cols``.
    '''
    # find the positional spending table
    table = soup.find_all('table')[1]
    # every non-empty line of the table text is one cell, in row-major order
    cells = [line for line in table.text.split('\n') if line]
    if not cells or len(cells) % n_cols != 0:
        raise ValueError(
            'cannot arrange {} table cells into rows of {} columns'.format(
                len(cells), n_cols))
    # reshape and convert to data frame; -1 lets the row count follow the
    # page instead of assuming a fixed number of rows
    table_df = pd.DataFrame(np.reshape(cells, (-1, n_cols)))
    # header is in the first row
    table_df.columns = table_df.iloc[0]
    # drop the first row and reset index
    table_df = table_df.drop(0).reset_index(drop=True)
    # return the final result
    return table_df


def parse_positional(df):
    '''
    Parse the salary and number of players for each position

    Parameters
    ----------
    df : pandas.DataFrame
        Positional table with exactly nine rows and, in this order, a
        'Position' column, a player-count column, a salary column, plus
        '% of  Cap' and 'NFL Rank' columns. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with one salary column per position
        abbreviation followed by one ``n<abbreviation>`` column holding
        the number of players at that position. All values are floats.

    Raises
    ------
    ValueError
        If ``df`` does not have nine rows, or a salary has no digits.
    KeyError
        If the '% of  Cap' or 'NFL Rank' columns are missing.
    '''
    # work on a copy so the caller's frame is left untouched and the
    # function can safely be called again on the same input
    parsed = df.copy()

    # abbreviate positions (relies on the rows arriving in this fixed order)
    parsed['Position'] = ['QB', 'RB', 'WR', 'TE', 'OL',
                          'DL', 'LB', 'S', 'ST']

    # drop unnecessary columns and rename kept columns
    parsed = parsed.drop(['% of  Cap', 'NFL Rank'], axis=1)
    parsed.columns = ['position', 'players', 'salary']

    # convert columns to float; every non-digit character is stripped
    # from the salary (currency sign, thousands separators, ...)
    parsed['players'] = parsed['players'].astype('float')
    parsed['salary'] = parsed['salary'].apply(
        lambda x: float(re.sub(r'[^\d]', '', x)))

    # get the salary as a single row data frame
    salary = parsed.pivot_table(columns='position', values='salary',
                                aggfunc='sum')
    salary = salary.reset_index(drop=True)

    # get the number of positional players as a single row data frame
    players = parsed.pivot_table(columns='position', values='players',
                                 aggfunc='sum')
    # add n to front of columns (to denote number)
    players.columns = ['n{}'.format(pos) for pos in players.columns]
    players = players.reset_index(drop=True)

    # concatenate the salary and number of players
    row = pd.concat([salary, players], axis=1)

    return row

## Scrape the Data

In [82]:
# create list for years
years = ['2013', '2014', '2015', '2016', '2017', '2018']

# create blank format string for website crawl
url = 'http://www.spotrac.com/nfl/{}/positional/{}/full-cap/'

# one single-row frame per (team, year); they are concatenated once after
# the crawl instead of re-copying a growing frame on every iteration
rows = []

# loop through years and teams
for year in years:
    print('Reading {} Salary Data: '.format(year))
    for count, team in enumerate(team_names, start=1):
        # progress display: eight team abbreviations per line
        print('{:3s}'.format(team), end='\n' if count % 8 == 0 else ' ')
        # pick the team name used in the URL for this season
        # Chargers went from San Diego to LA in 2017
        if year in ['2013', '2014', '2015', '2016'] and team == 'LAC':
            slug = 'san-diego-chargers'
        # Rams went from St. Louis to LA in 2016
        elif year in ['2013', '2014', '2015'] and team == 'LA':
            slug = 'st.-louis-rams'
        else:
            slug = team_names[team]
        # get the page html; stop on an HTTP error rather than trying to
        # parse an error page, and never wait indefinitely for a response
        page = requests.get(url.format(slug, year), timeout=30)
        page.raise_for_status()
        # create the soup
        soup = BeautifulSoup(page.text, 'html.parser')
        # get the positional salary table
        table = get_positional(soup)
        # parse the table (salary and number of players)
        row = parse_positional(table)
        # add the team and year column to the front of the row
        row.insert(0, 'year', year)
        row.insert(0, 'team', team)
        # keep the row for the final concatenation
        rows.append(row)
    print()

# build the salary dataframe in one pass with a fresh 0..n-1 index
# (pd.concat rejects an empty list, so fall back to an empty frame)
salary_df = pd.concat(rows, ignore_index=True) if rows else pd.DataFrame()

Reading 2013 Salary Data: 
ARI ATL BAL BUF CAR CHI CIN CLE
DAL DEN DET GB  HOU IND JAC KC 
LAC LA  MIA MIN NE  NO  NYG NYJ
OAK PHI PIT SF  SEA TB  TEN WAS

Reading 2014 Salary Data: 
ARI ATL BAL BUF CAR CHI CIN CLE
DAL DEN DET GB  HOU IND JAC KC 
LAC LA  MIA MIN NE  NO  NYG NYJ
OAK PHI PIT SF  SEA TB  TEN WAS

Reading 2015 Salary Data: 
ARI ATL BAL BUF CAR CHI CIN CLE
DAL DEN DET GB  HOU IND JAC KC 
LAC LA  MIA MIN NE  NO  NYG NYJ
OAK PHI PIT SF  SEA TB  TEN WAS

Reading 2016 Salary Data: 
ARI ATL BAL BUF CAR CHI CIN CLE
DAL DEN DET GB  HOU IND JAC KC 
LAC LA  MIA MIN NE  NO  NYG NYJ
OAK PHI PIT SF  SEA TB  TEN WAS

Reading 2017 Salary Data: 
ARI ATL BAL BUF CAR CHI CIN CLE
DAL DEN DET GB  HOU IND JAC KC 
LAC LA  MIA MIN NE  NO  NYG NYJ
OAK PHI PIT SF  SEA TB  TEN WAS

Reading 2018 Salary Data: 
ARI ATL BAL BUF CAR CHI CIN CLE
DAL DEN DET GB  HOU IND JAC KC 
LAC LA  MIA MIN NE  NO  NYG NYJ
OAK PHI PIT SF  SEA TB  TEN WAS



## Save the Data

In [85]:
# persist the scraped salary table as CSV (row index included) so later
# notebooks can load it without repeating the crawl
salary_df.to_csv(path_or_buf='data/salary.csv')