# NFL Data Scraper 1

**<font color='purple'>This scraper accesses the gamelogs of every NFL team to determine the urls from which the game data will be obtained. This scraper, to be used for years 2010-2019, works one year at a time. The gamelogs and dataframes to be used in Scraper 2 are stored in binary files.</font>** 

**<font color='teal'> First, the necessary packages are loaded, and the structure of the dataframe that will be populated by the acquired data is set up.</font>** 

In [101]:
import requests
import re
import pandas as pd
import datetime as dt
import pickle
from dateutil.parser import parse

# Maps each full team name (the form looked up from the gamelog "Opp" column) to the
# abbreviation used in the team urls. Franchises that were renamed or relocated appear
# under every name they used, all pointing at the same abbreviation.
teams = {
    'Atlanta Falcons': 'atl', 'Buffalo Bills': 'buf', 'Carolina Panthers': 'car',
    'Chicago Bears': 'chi', 'Cincinnati Bengals': 'cin', 'Cleveland Browns': 'cle',
    'Indianapolis Colts': 'clt', 'Arizona Cardinals': 'crd', 'Dallas Cowboys': 'dal',
    'Denver Broncos': 'den', 'Detroit Lions': 'det', 'Green Bay Packers': 'gnb',
    'Houston Texans': 'htx', 'Jacksonville Jaguars': 'jax', 'Kansas City Chiefs': 'kan',
    'Miami Dolphins': 'mia', 'Minnesota Vikings': 'min', 'New Orleans Saints': 'nor',
    'New England Patriots': 'nwe', 'New York Giants': 'nyg', 'New York Jets': 'nyj',
    'Tennessee Titans': 'oti', 'Philadelphia Eagles': 'phi', 'Pittsburgh Steelers': 'pit',
    'Oakland Raiders': 'rai', 'Las Vegas Raiders': 'rai', 'St. Louis Rams': 'ram',
    'Los Angeles Rams': 'ram', 'Baltimore Ravens': 'rav', 'San Diego Chargers': 'sdg',
    'Los Angeles Chargers': 'sdg', 'Seattle Seahawks': 'sea', 'San Francisco 49ers': 'sfo',
    'Tampa Bay Buccaneers': 'tam', 'Washington Redskins': 'was',
}

# "org" is the collection of unique team abbreviations. Because several team names can
# share one abbreviation, a set is used to collapse the duplicates.
org = set(teams.values())

# Columns of the "dfyear" dataframe. 'Code', 'Home/Away' and 'Opponent' are filled by this
# scraper; the remaining columns are left empty here.
yearcols = ['Code', 'Home/Away', 'Opponent', 'Points', 'Points_Opp', 'Yds_Off_Pass',
            'Yds_Off_Rush', 'Yds_Def_Pass', 'Yds_Def_Rush', 'TD', 'TD_on_Def', 'FG_Made',
            'FG_Att', 'RZ_Conv', 'RZ_Att', 'RZ_Def_Conv', 'RZ_Def_Att', 'Possession', 'Plays',
            'TO_Gained', 'TO_Lost', 'Yds_Pen', 'Sacks_Def', 'Tackles_Loss', 'Yds_per_Kickret',
            'Yds_per_Puntret']

# Multiindex for "dfyear": team abbreviation as the primary level and the game's row
# number (0-15) as a temporary secondary level.
# The abbreviations are sorted because set iteration order for strings is not stable
# between interpreter sessions; sorting keeps the row order reproducible.
ind = pd.MultiIndex.from_product([sorted(org), list(range(16))], names=['Team', 'row'])

**<font color='teal'>*table* is the raw dataframe obtained from the website. *dfteam* is a preliminary dataframe for a given team and given year used to populate the larger *dfyear* dataframe. It stores the parsed game dates as well as the home/away and opponent columns. These columns are used to formulate the gamecodes, stored in a list, which will be used to access the urls necessary to obtain the bulk of the data.</font>**

In [100]:
# Imported in this cell so it is self-contained: read_html is handed a file-like object
# because passing a literal html string is deprecated in recent pandas versions.
from io import StringIO

years = [str(i) for i in range(2010,2020)]
gcodes = []


def _gamelog_date(text, season):
    """Return the year-less gamelog date string *text* formatted as 'YYYYMMDD'.

    The year is taken from *season* (a string such as '2015'). Dates containing
    'January' are assigned to the following calendar year, since those games are
    played after the new year.
    """
    calendar_year = int(season) + 1 if 'January' in text else int(season)
    return parse(text + ' ' + str(calendar_year)).strftime('%Y%m%d')


for year in years:
    # Fresh, empty frame per season; one block of 16 rows per team.
    dfyear = pd.DataFrame(index=ind, columns=yearcols)
    for team in org:
        url = 'https://www.pro-football-reference.com/teams/'+team+'/'+year+'/gamelog/'
        res = requests.get(url)
        # Stop on an HTTP error here instead of letting read_html fail later with a
        # less helpful message about a missing table.
        res.raise_for_status()

        tableID = 'gamelog'+year
        table = pd.read_html(StringIO(res.text), attrs={'id': tableID}, flavor='bs4')[0]

        # Top-level column labels that pandas auto-named 'Unnamed...' are renamed to
        # 'Event' for ease in populating our dataframe.
        L0 = list(table.columns.get_level_values(0))
        L0_renmd = {name: ('Event' if 'Unnamed' in name else name) for name in L0}
        table.rename(columns=L0_renmd, level=0, inplace=True)

        # Home and Away designation determines how the gamecode is labeled.
        # The column at position 6 holds '@' for away games; anything else is home.
        away = table.iloc[:, 6] == '@'

        # The columns are built in one step rather than filled through chained indexing
        # (df[col][mask] = value), which may write to a temporary copy and leave the
        # frame unchanged.
        dfteam = pd.DataFrame({
            # String dates from "table" are converted to numerical form for use in the url.
            'Date': table['Event']['Date'].apply(lambda d: _gamelog_date(d, year)),
            'Home/Away': away.map({True: 'A', False: 'H'}),
            'Opponent': table['Event']['Opp'],
        })

        # Gamecodes are calculated to put in a list for later url access
        # and to put in the dfyear dataframe for game identification.
        for row in range(16):
            if dfteam.at[row, 'Home/Away'] == 'H':
                hometeam = team
            else:
                # Away game: the opponent is the home team. An opponent name missing
                # from "teams" raises KeyError rather than producing a wrong code.
                hometeam = teams[dfteam.at[row, 'Opponent']]
            gamecode = dfteam.at[row, 'Date'] + '0' + hometeam
            dfyear.loc[(team, row), 'Code'] = gamecode
            # Each game shows up in both teams' gamelogs; keep its code only once.
            if gamecode not in gcodes:
                gcodes.append(gamecode)

        # Because of the "dfyear" multiindex, "dfteam" cannot simply be copied into
        # "dfyear". Must use .values
        dfyear.loc[team, 'Home/Away':'Opponent'] = dfteam[['Home/Away', 'Opponent']].values

    # Replace the temporary row level with the gamecode in the index; set_index moves the
    # 'Code' column into the index, so it no longer remains as a column.
    dfyear = dfyear.reset_index(level=1, drop=True)
    dfyear = dfyear.set_index('Code', append=True)

    # Optional sanity checks: no null values, and since "gcodes" accumulates across
    # seasons its length should grow by the number of games in a season
    # (32*16/2=256) each year.
    #    print(dfyear.loc[:,'Home/Away':'Opponent'].isnull().sum())
    #    print(len(gcodes))

    # Store the working dataframe for this season in a binary file.
    path = 'data/df_step1_'+year+'.data'
    with open(path, 'wb') as f:
        pickle.dump(dfyear, f)

Home/Away    0
Opponent     0
dtype: int64
256
Home/Away    0
Opponent     0
dtype: int64
512
Home/Away    0
Opponent     0
dtype: int64
768
Home/Away    0
Opponent     0
dtype: int64
1024
Home/Away    0
Opponent     0
dtype: int64
1280
Home/Away    0
Opponent     0
dtype: int64
1536
Home/Away    0
Opponent     0
dtype: int64
1792
Home/Away    0
Opponent     0
dtype: int64
2048
Home/Away    0
Opponent     0
dtype: int64
2304
Home/Away    0
Opponent     0
dtype: int64
2560


In [None]:
# Persist the accumulated list of gamecodes as a pickle in a binary file.
path = 'data/gamecodes.data'
serialized = pickle.dumps(gcodes)
with open(path, mode='wb') as out:
    out.write(serialized)