## Pro Football Reference Dataset

In [3]:
import requests
import pandas as pd
import numpy as np
import random
from bs4 import BeautifulSoup

In [4]:
# Site root; the paths below are concatenated onto it to build request URLs.
url = 'https://www.pro-football-reference.com'
# Per-season fantasy ranking page; the placeholder is filled with the year.
fantasy_url = '/years/{}/fantasy.htm'
# Suffix appended after a player's href when requesting that player's game log.
game_url = '/gamelog/'

In [24]:
# Two ways of deciding which players are gathered for each season.
#
# Option 1: cap the number of players per position. This is more sensitive to
# the chosen limits, but avoids under-representing any one position.
# Values are the maximum number of players to gather per position per year.
position_limits = {'QB': 32, 'RB': 60, 'WR': 80, 'TE': 25}
# The leading '{}' is passed through as-is so the result can later be
# formatted again with the table kind.
pos_file_key = '{}_pro_ftb_ref_per_position_{}_{}_{}_{}.csv'.format(
    '{}', *(position_limits[pos_code] for pos_code in ('QB', 'RB', 'WR', 'TE'))
)

# Option 2: simply take the first n players listed for the season.
n = 220
n_file_key = '{}_pro_ftb_ref_top_{}.csv'.format('{}', n)

# True selects option 2 (top n); False selects option 1 (per-position caps).
top_n = True

In [21]:
# Accumulators filled by the scraping loop:
#   data          - one dict of stats per scraped game-log row
#   encountered   - player hrefs that have already been processed
#   update_top_n  - (stub, year) pairs for later seasons of an already-seen player
data, encountered, update_top_n = [], [], []

In [22]:
# For every season, read the fantasy ranking page, select players according to
# the configured mode (top n or per-position caps), and download the game log
# of each player the first time that player is seen. Results are appended to
# the notebook-level lists `data`, `encountered` and `update_top_n`.
for year in range(2000, 2021):
    # Per-season counters for the two selection modes.
    position_counts = {'QB': 0, 'RB': 0, 'WR': 0, 'TE': 0, '': 0}
    tot_players = 0
    print(year)
    r = requests.get(url + fantasy_url.format(year))
    soup = BeautifulSoup(r.content, 'html.parser')
    fantasy_table = soup.find_all('table')[0]
    # NOTE(review): the first two <tr> elements are skipped, presumably
    # because they are header rows - confirm against the page markup.
    for row in fantasy_table.find_all('tr')[2:]:
        player_html = row.find('td', attrs={'data-stat': 'player'})
        pos_html = row.find('td', attrs={'data-stat': 'fantasy_pos'})
        if player_html is None or pos_html is None:
            continue
        player_link = player_html.a
        if player_link is None or not player_link.get('href'):
            # Without an href there is no game-log URL to build.
            continue

        name = player_link.get_text()
        pos = pos_html.get_text()
        stub = player_link.get('href')

        # Check if exit condition is met
        if top_n:
            if tot_players % 150 == 0:
                print(tot_players)
            if tot_players >= n:
                break
            tot_players += 1
        else:
            # Positions without a configured limit (e.g. a blank position)
            # get a limit of 0, so such players are skipped instead of
            # raising KeyError.
            if position_counts.get(pos, 0) >= position_limits.get(pos, 0):
                print(position_counts)
                # Stop the season once every configured position is filled;
                # only positions that have a limit take part in this check.
                all_filled = all(
                    position_counts.get(key, 0) >= limit
                    for key, limit in position_limits.items()
                )
                if all_filled:
                    break
                continue
            position_counts[pos] = position_counts.get(pos, 0) + 1

        # If player has been seen before, only record that this season also
        # belongs in the selection; the full game log was already scraped.
        if stub in encountered:
            update_top_n.append((stub, year))
            continue
        encountered.append(stub)

        player_url = url + stub + game_url
        r_player = requests.get(player_url)
        player_soup = BeautifulSoup(r_player.content, 'html.parser')
        try:
            player_table = player_soup.find_all('table')[0]
        except IndexError:
            # Page contained no table at all; report and move on.
            print('Error: {}, {}, {}'.format(name, pos, year))
            continue

        # get_text() yields strings, so the season must be compared as text;
        # comparing against the int `year` would never match.
        year_text = str(year)
        for game_row in player_table.find_all('tr')[2:]:
            player_stat = {'name': name, 'pos': pos, 'stub': stub}
            for cell in game_row.find_all('td'):
                player_stat[cell.get('data-stat')] = cell.get_text()

            # Remove garbage rows
            if 'year_id' not in player_stat:
                continue
            # Mark rows which belong to the season the player was selected in
            player_stat['top_n'] = player_stat['year_id'] == year_text

            data.append(player_stat)

2000
0
150
2001
0
150
2002
0
150
2003
0
150
2004
0
150
2005
0
150
2006
0
150
2007
0
150
2008
0
150
2009
0
150
2010
0
150
2011
0
150
2012
0
150
2013
0
150
2014
0
Error: Corey Knox , , 2014
150
2015
0
150
2016
0
150
2017
0
150
2018
0
150
2019
0
150
2020
0
150


In [29]:
# NOTE(review): this rebinds update_top_n to an empty list. When the cells are
# run top to bottom, the (stub, year) pairs collected by the scraping loop are
# discarded before the clean-up cell below reads them - confirm this reset is
# intentional.
update_top_n = []

In [25]:
# Turn the scraped rows into a DataFrame, clean it, split it into game-level
# and annual tables, and write both to CSV.
players_df = pd.DataFrame(data)


def _pct_to_number(value):
    """Convert a percentage string such as '87%' to the int 87.

    Blank strings and anything that is not a string (e.g. NaN for rows that
    lacked the column) become NaN.
    """
    if isinstance(value, str) and value != '':
        return int(value.rstrip('%'))
    return np.nan


# Clean up data
for pct_col in ('off_pct', 'def_pct', 'st_pct'):
    players_df[pct_col] = players_df[pct_col].apply(_pct_to_number)
# Rename so as to not interfere with the pandas.Series name attribute, and
# use 'year' for backwards compatibility
players_df.rename(columns={'name': 'full_name', 'year_id': 'year'}, inplace=True)

# Update the top_n flag for the years that need it. The year column holds
# scraped text while update_top_n carries ints, so compare both as strings.
year_as_text = players_df['year'].astype(str)
for stub, year in update_top_n:
    players_df.loc[(players_df['stub'] == stub) & (year_as_text == str(year)), 'top_n'] = True

# Rows with an empty week_num go to the annual table, all others to the game
# table. Copies are taken so adding a column does not write into a slice of
# players_df.
is_annual = players_df['week_num'] == ''
game_df = players_df[~is_annual].copy()
annual_df = players_df[is_annual].copy()

# Assign each row what is most likely a unique id, although this is not 100%
# guaranteed if players share a name, team, position, and year
for frame in (game_df, annual_df):
    frame['unique_id'] = (
        frame['full_name'] + ',' + frame['team'] + ',' + frame['pos'] + ',' + frame['year'].astype(str)
    )

# NOTE(review): files are written to the working directory, while a later cell
# reads from 'data/' - confirm the files are moved there between steps.
file_key = n_file_key if top_n else pos_file_key
game_df.to_csv(file_key.format('game'), index=False)
annual_df.to_csv(file_key.format('annual'), index=False)

In [9]:
# Reload the game-level and annual tables from previously saved CSV files,
# replacing any in-memory game_df / annual_df.
game_df = pd.read_csv('data/game_pro_ftb_ref_top_220.csv')
annual_df = pd.read_csv('data/annual_pro_ftb_ref_top_220.csv')

In [34]:
# Re-walk every season's fantasy ranking page with the same selection rules
# and write each selected player's href into the 'stub' column of the matching
# rows in game_df and annual_df. No player pages are requested here.
for year in range(2000, 2021):
    # Per-season counters for the two selection modes.
    position_counts = {'QB': 0, 'RB': 0, 'WR': 0, 'TE': 0, '': 0}
    tot_players = 0
    print(year)
    r = requests.get(url + fantasy_url.format(year))
    soup = BeautifulSoup(r.content, 'html.parser')
    fantasy_table = soup.find_all('table')[0]
    # NOTE(review): the first two <tr> elements are skipped, presumably
    # because they are header rows - confirm against the page markup.
    for row in fantasy_table.find_all('tr')[2:]:
        player_html = row.find('td', attrs={'data-stat': 'player'})
        pos_html = row.find('td', attrs={'data-stat': 'fantasy_pos'})
        if player_html is None or pos_html is None:
            continue
        player_link = player_html.a
        if player_link is None:
            # No link, hence no href to record.
            continue

        name = player_link.get_text()
        pos = pos_html.get_text()
        stub = player_link.get('href')

        # Check if exit condition is met
        if top_n:
            if tot_players % 150 == 0:
                print(tot_players)
            if tot_players >= n:
                break
            tot_players += 1
        else:
            # Positions without a configured limit (e.g. a blank position)
            # get a limit of 0, so such players are skipped instead of
            # raising KeyError.
            if position_counts.get(pos, 0) >= position_limits.get(pos, 0):
                print(position_counts)
                # Stop the season once every configured position is filled;
                # only positions that have a limit take part in this check.
                all_filled = all(
                    position_counts.get(key, 0) >= limit
                    for key, limit in position_limits.items()
                )
                if all_filled:
                    break
                continue
            position_counts[pos] = position_counts.get(pos, 0) + 1

        # Check to make sure player hasn't been seen yet for this year
        season_key = stub + str(year)
        if season_key in encountered:
            continue
        encountered.append(season_key)

        for df in [game_df, annual_df]:
            is_player_season = (
                (df['full_name'] == name) & (df['pos'] == pos) & (df['year'] == year)
            )
            # Kept under this name because later cells inspect it.
            curr_player_df = df[is_player_season]
            # .at only addresses a single cell; use .loc with the row mask to
            # set the value on every matching row.
            df.loc[is_player_season, 'stub'] = stub

2000
0
150
2001
0
150
2002
0
150
2003
0
150
2004
0
150
2005
0
150
2006
0
150
2007
0
150
2008
0
150
2009
0
150
2010
0
150
2011
0
150
2012
0
150
2013
0
150
2014
0
150
2015
0
150
2016
0
150
2017
0
150
2018
0
150
2019
0
150
2020
0
150


In [25]:
# Debug check: count the entries of curr_player_df.index by building a
# throwaway list with one element per index entry.
len(['a' for a in curr_player_df.index])

16

In [26]:
# Debug check: number of rows in curr_player_df, taken directly from its index.
len(curr_player_df.index)

16

In [39]:
# Write game_df back to the same path it was loaded from, without the index.
game_df.to_csv('data/game_pro_ftb_ref_top_220.csv', index=False)

In [40]:
# Write annual_df back to the same path it was loaded from, without the index.
annual_df.to_csv('data/annual_pro_ftb_ref_top_220.csv', index=False)