# NFL Data Scraper 2

In [16]:
# Map each franchise's full name to the three-letter code used in the game codes and
# box-score URLs (see the scrape loop, which looks names up here).  Relocated or renamed
# franchises keep one code, so several names can map to the same value.
teams = {'Atlanta Falcons':'atl','Buffalo Bills':'buf','Carolina Panthers':'car','Chicago Bears':'chi',
         'Cincinnati Bengals':'cin','Cleveland Browns':'cle','Indianapolis Colts':'clt',
         'Arizona Cardinals':'crd','Dallas Cowboys':'dal','Denver Broncos':'den','Detroit Lions':'det',
         'Green Bay Packers':'gnb','Houston Texans':'htx','Jacksonville Jaguars':'jax',
         'Kansas City Chiefs':'kan','Miami Dolphins':'mia','Minnesota Vikings':'min','New Orleans Saints':'nor',
         'New England Patriots':'nwe','New York Giants':'nyg','New York Jets':'nyj','Tennessee Titans':'oti',
         'Philadelphia Eagles':'phi','Pittsburgh Steelers':'pit','Oakland Raiders':'rai',
         'Las Vegas Raiders':'rai','St. Louis Rams':'ram','Los Angeles Rams':'ram','Baltimore Ravens':'rav',
         'San Diego Chargers':'sdg','Los Angeles Chargers':'sdg','Seattle Seahawks':'sea',
         'San Francisco 49ers':'sfo','Tampa Bay Buccaneers':'tam',
         # Washington has played under three names; all share the same code.  Without the
         # later names, seasons from 2020 on (already covered by 'Las Vegas Raiders')
         # would fail with a KeyError on the team lookup.
         'Washington Redskins':'was','Washington Football Team':'was','Washington Commanders':'was'}
# Distinct franchise codes.
org = set(teams.values())

**<font color='teal'>Load in gamecodes and preliminary dataframe.</font>**

In [5]:
import requests
import re
import pandas as pd
import pickle
from datetime import time, timedelta

# Season to process.
year = '2011'

# Read the two pickled inputs in turn: the list of game codes, then this season's
# preliminary dataframe.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
loaded = []
for path in ('data/gamecodes.data', 'data/df_step1_'+year+'.data'):
    with open(path, 'rb') as f:
        loaded.append(pickle.load(f))
gcodes, dfyear = loaded

In [3]:
def pbp_to_tdelt(x):
    """Convert a play-by-play clock string to a pandas Timedelta.

    Parameters
    ----------
    x : str
        Clock text in ``minutes:seconds`` form (e.g. ``'14:56'``), or any other string.

    Returns
    -------
    pandas.Timedelta or str
        The clock value as a Timedelta when ``x`` contains a colon; otherwise ``x``
        is returned unchanged.
    """
    if ':' not in x:
        return x
    # Prefix an hours field so the text is parsed as HH:MM:SS.  No ``unit`` is passed:
    # the string already carries its own units, and recent pandas versions raise
    # ValueError when ``unit`` is combined with string input.
    return pd.to_timedelta('00:' + x)

In [4]:
# Sanity check: number of game codes loaded from the pickle.
len(gcodes)

2560

**<font color='teal'>Set up gameinfo dataframe.</font>**

In [None]:
# Empty per-game table indexed by game code; the scrape loop fills in the surface,
# temperature and over/under for each game it processes.
ginfo = (
    pd.DataFrame(columns=['Code','Surface','Temperature','Over/Under'])
    .assign(Code=gcodes)
    .set_index(['Code'])
)

**<font color='teal'>Many of the tables on Pro Football Reference are wrapped in HTML comments. The comment delimiters are therefore stripped from the page source before parsing, so that those tables can be read as well.</font>**

In [None]:
# Matches either delimiter of an HTML comment (opening or closing) so both can be
# removed from the page source, leaving the enclosed markup in place.
HTML_COMMENT_DELIMITERS = '<!--|-->'
comm = re.compile(HTML_COMMENT_DELIMITERS)

**<font color='teal'>Extract gamecodes for the given year.</font>**

In [20]:
# Select the game codes belonging to the chosen season.  Judging by the slices used
# and the sample code below, a code starts with the date as YYYYMMDD: keep this
# year's games outside January, plus games in January of the following year.
next_january = str(int(year)+1)+'01'
gc = []
for i in range(len(gcodes)):
    code = gcodes[i]
    this_year_outside_january = (code[0:4] == year) and (code[4:6] != '01')
    if this_year_outside_january or (code[0:6] == next_january):
        gc.append(code)
# To debug a single game, override the list, e.g.:
#gc = ['201110160was']

In [None]:
for gamecode in gc:
    # Fetch the box-score page for this game.
    url = 'https://www.pro-football-reference.com/boxscores/'+gamecode+'.htm'
    res = requests.get(url, timeout=30)
    # Stop on an HTTP error (e.g. a rate-limit response) instead of failing later with a
    # misleading "No tables found" error from the HTML parser.
    res.raise_for_status()

    table = pd.read_html(res.text,attrs={'class':'linescore nohover stats_table no_freeze'},flavor='bs4')
    table = table[0]
    # Team designation: row 0 is used as the away team and row 1 as the home team;
    # column 1 holds the full team name, which is mapped to its franchise code.
    awayteam = table.iloc[0,1]
    away = teams[awayteam]
    hometeam = table.iloc[1,1]
    home = teams[hometeam]
    # Points: take the final score from the LAST column rather than fixed position 6.
    # With four quarters the total sits at position 6, but an overtime game has an
    # extra period column that pushes the total one place to the right.
    # NOTE(review): assumes the total is always the right-most linescore column - confirm.
    away_points = int(table.iloc[0,-1])
    home_points = int(table.iloc[1,-1])
    dfyear.loc[(away,gamecode),'Points'] = away_points
    dfyear.loc[(home,gamecode),'Points'] = home_points
    dfyear.loc[(away,gamecode),'Points_Opp'] = home_points
    dfyear.loc[(home,gamecode),'Points_Opp'] = away_points

# Game Info
    # Strip the HTML comment delimiters before parsing so a commented-out table can be
    # found.  The table is indexed by its first column (the item label); the value is
    # read from the column labelled 1.
    uncommented = comm.sub("",res.text)
    table = pd.read_html(uncommented,attrs={'id':'game_info'},flavor='bs4')[0]
    table = table.set_index(table.columns[0])
    details = table[1]
    ginfo.loc[gamecode,'Surface'] = details['Surface']
    # Only the leading token of the over/under text is numeric.
    over_under = details['Over/Under'].split(' ')[0]
    ginfo.loc[gamecode,'Over/Under'] = float(over_under)
    # Weather is optional; when listed, its first token is stored as the temperature.
    if 'Weather' in details.index:
        temperature = details['Weather'].split(' ')[0]
        ginfo.loc[gamecode,'Temperature'] = int(temperature)
    
    # Team stats table: one row per statistic, column 1 = away value, column 2 = home
    # value.  Several cells pack multiple numbers into hyphen-separated text.
    # Row positions used below: 1 rushing, 2 passing, 3 sacks, 7 turnovers,
    # 8 penalties, 11 time of possession.
    # NOTE(review): these positions assume a fixed row layout - confirm for every season.
    table=pd.read_html(comm.sub("",res.text),attrs={'id':'team_stats'},flavor='bs4')
    table = table[0]

    def split_stat(cell):
        """Split a packed stat cell into its fields, keeping the sign of negative values.

        Only hyphens that directly follow a digit are separators, so '20--5-0' yields
        ['20', '-5', '0'].  A plain split('-') would produce an empty field there and
        the later int() conversion would raise ValueError.
        """
        return re.split(r'(?<=\d)-', cell)

    # Parse each team's column once.
    team_stats = {}
    for side, col in ((away,1),(home,2)):
        rushing = split_stat(table.iloc[1,col])
        passing = split_stat(table.iloc[2,col])
        clock = table.iloc[11,col].split(':')
        team_stats[side] = {
            'rush_yds': int(rushing[1]),
            'pass_yds': int(passing[2]),
            # Rushing plus passing touchdowns; defensive and return touchdowns are
            # added by the later sections of the loop.
            'td': int(rushing[2])+int(passing[3]),
            # minutes:seconds -> decimal minutes
            'possession': float(clock[0])+float(clock[1])/60.0,
            'pen_yds': int(split_stat(table.iloc[8,col])[1]),
            'sacked': int(split_stat(table.iloc[3,col])[0]),
            'turnovers': int(table.iloc[7,col]),
        }

    # Store each team's offensive numbers, and the opponent's numbers as its defensive ones.
    for side, opponent in ((away,home),(home,away)):
        own = team_stats[side]
        opp = team_stats[opponent]
        key = (side,gamecode)
        # Offensive & Defensive Rushing & Passing Yards
        dfyear.loc[key,'Yds_Off_Rush'] = own['rush_yds']
        dfyear.loc[key,'Yds_Off_Pass'] = own['pass_yds']
        dfyear.loc[key,'Yds_Def_Rush'] = opp['rush_yds']
        dfyear.loc[key,'Yds_Def_Pass'] = opp['pass_yds']
        # Time of Possession
        dfyear.loc[key,'Possession'] = own['possession']
        # Penalties
        dfyear.loc[key,'Yds_Pen'] = own['pen_yds']
        # Defensive Sacks: the number of times the opponent was sacked
        dfyear.loc[key,'Sacks_Def'] = opp['sacked']
        # Turnovers
        dfyear.loc[key,'TO_Gained'] = opp['turnovers']
        dfyear.loc[key,'TO_Lost'] = own['turnovers']
        # Touchdowns (Defensive & Special Teams to be added later)
        dfyear.loc[key,'TD'] = own['td']
    
# Defense
    table=pd.read_html(comm.sub("",res.text),attrs={'id':'player_defense'},flavor='bs4')
    table = table[0]
    # Flatten the two-level header, drop rows with no team, drop the first column, and
    # remove the repeated header row that separates the two teams.
    table = table.droplevel(level=0,axis=1)
    table = table.dropna(subset=['Tm']).iloc[:,1:]
    table = table[table.iloc[:,0]!='Tm']

    # Rename columns by position to remove column name duplication
    # (the interception and fumble groups repeat header names).
    cols = list(table.columns)
    cols[3] = 'TD_Int'
    cols[2] = 'Yds_Int'
    cols[13] = 'Yds_Fum'
    cols[14] = 'TD_Fum'
    table.columns = cols

    # Change value types to numeric types
    cols = [list(table.columns[1:6]),table.columns[6],list(table.columns[7:])]
    for grp in cols:
        table[grp]=table[grp].apply(pd.to_numeric)

    # Group by team to calculate team values.  sort=False keeps the teams in the order
    # their rows appear in the table.  The default (sort=True) orders the groups
    # alphabetically by abbreviation, so position 0 was the away team only when its
    # abbreviation happened to sort first, and the two teams' values were swapped otherwise.
    # NOTE(review): assumes the away team's players are listed first in this table - confirm.
    grouped = table.groupby(table['Tm'], sort=False)

    # Add Defensive Touchdowns (interception returns + fumble returns).
    # .iloc is used because the result is indexed by team abbreviation, not by position.
    dftd=grouped['TD_Int'].sum()+grouped['TD_Fum'].sum()
    dfyear.loc[(away,gamecode),'TD'] += dftd.iloc[0]
    dfyear.loc[(home,gamecode),'TD'] += dftd.iloc[1]
    dfyear.loc[(away,gamecode),'TD_on_Def'] = dftd.iloc[0]
    dfyear.loc[(home,gamecode),'TD_on_Def'] = dftd.iloc[1]

    # Tackles for Loss
    tfl=grouped['TFL'].sum()
    dfyear.loc[(away,gamecode),'Tackles_Loss'] = tfl.iloc[0]
    dfyear.loc[(home,gamecode),'Tackles_Loss'] = tfl.iloc[1]

# Kick/Punt Returns
    table=pd.read_html(comm.sub("",res.text),attrs={'id':'returns'},flavor='bs4')
    table = table[0]
    # Flatten the two-level header, drop rows with no team, drop the first column, and
    # remove the repeated header row that separates the two teams.
    table = table.droplevel(level=0,axis=1)
    table = table.dropna(subset=['Tm']).iloc[:,1:]
    table = table[table.iloc[:,0]!='Tm']
    # The kick-return and punt-return groups repeat header names; rename by position.
    cols = list(table.columns)
    cols[2] = 'Yds_KR'
    cols[7] = 'Yds_PR'
    cols[5] = 'Lng_KR'
    cols[10] = 'Lng_PR'
    cols[4] = 'TD_KR'
    cols[9] = 'TD_PR'
    table.columns = cols
    cols = [list(table.columns[1:3]),table.columns[3],list(table.columns[4:8]),table.columns[8],
            list(table.columns[9:])]
    for grp in cols:
        table[grp]=table[grp].apply(pd.to_numeric)

    # Per-team totals in order of first appearance.  sort=False matters: the default
    # sorts teams alphabetically, so positions 0/1 would not reliably be away/home.
    # NOTE(review): assumes the away team's rows come first in this table - confirm.
    grouped = table.groupby(table['Tm'], sort=False)
    totals = pd.DataFrame({
        'td': grouped['TD_KR'].sum()+grouped['TD_PR'].sum(),   # return touchdowns
        'kyds': grouped['Yds_KR'].sum(),                       # kick-return yards
        'knum': grouped['Rt'].sum(),                           # kick returns
        'pyds': grouped['Yds_PR'].sum(),                       # punt-return yards
        'pnum': grouped['Ret'].sum(),                          # punt returns
    })
    no_returns = dict.fromkeys(totals.columns, 0)

    if len(totals) == 0:
        # Neither team returned a kick or punt.
        a_ret = h_ret = no_returns
    elif len(totals) == 1:
        # Only one team has rows: identify it by its abbreviation.
        # NOTE(review): this assumes the table's abbreviation equals the upper-cased
        # franchise code, which may not hold for every team - verify.
        only_team = totals.index[0]
        if only_team == away.upper():
            a_ret, h_ret = totals.iloc[0], no_returns
        elif only_team == home.upper():
            a_ret, h_ret = no_returns, totals.iloc[0]
        else:
            # Previously this case fell through and silently reused the values left
            # over from the preceding game (or raised NameError on the first game).
            raise ValueError('Cannot attribute returns for team '+str(only_team)+
                             ' in game '+gamecode)
    else:
        a_ret, h_ret = totals.iloc[0], totals.iloc[1]

    for side, ret in ((away,a_ret),(home,h_ret)):
        key = (side,gamecode)
        # Return touchdowns are added to the running touchdown total.
        dfyear.loc[key,'TD'] += ret['td']
        # Average yards per return; 0.0 when the team had no returns of that kind.
        if ret['knum'] > 0:
            dfyear.loc[key,'Yds_per_Kickret'] = float(ret['kyds'])/float(ret['knum'])
        else:
            dfyear.loc[key,'Yds_per_Kickret'] = 0.0
        if ret['pnum'] > 0:
            dfyear.loc[key,'Yds_per_Puntret'] = float(ret['pyds'])/float(ret['pnum'])
        else:
            dfyear.loc[key,'Yds_per_Puntret'] = 0.0

# Field Goals
    table=pd.read_html(comm.sub("",res.text),attrs={'id':'kicking'},flavor='bs4')
    table = table[0]
    # Flatten the two-level header, drop rows with no team, drop the first column, and
    # remove the repeated header row that separates the two teams.
    table = table.droplevel(level=0,axis=1)
    table = table.dropna(subset=['Tm']).iloc[:,1:]
    table = table[table.iloc[:,0]!='Tm']
    # Convert the statistic columns to numeric types.
    cols = [list(table.columns[1:7]),table.columns[7],table.columns[8]]
    for grp in cols:
        table[grp]=table[grp].apply(pd.to_numeric)
    # sort=False keeps the teams in table order; the default alphabetical sort would
    # swap the two teams whenever the home abbreviation sorts before the away one.
    # NOTE(review): assumes the away team's rows come first in this table - confirm.
    grouped = table.groupby(table['Tm'], sort=False)
    fgm=grouped['FGM'].sum()
    fga=grouped['FGA'].sum()
    # .iloc: the sums are indexed by team abbreviation, so select by position explicitly.
    dfyear.loc[(away,gamecode),'FG_Made'] = fgm.iloc[0]
    dfyear.loc[(home,gamecode),'FG_Made'] = fgm.iloc[1]
    dfyear.loc[(away,gamecode),'FG_Att'] = fga.iloc[0]
    dfyear.loc[(home,gamecode),'FG_Att'] = fga.iloc[1]

# Total Plays & Set up Drive and Play-by-Play tables for Red Zone calculation
    # Build one cleaned drive table per team (hdrive = home, adrive = away), indexed by
    # drive number, and record each team's total number of plays.
    cols=['Quarter','Time','Side','Marker','Plays','Length','Yds','Result']
    intcols=['Plays','Yds']
    timecols=['Time','Length']
    strcols=['Quarter','Result']
    uncommented = comm.sub("",res.text)

    def to_clock(text):
        """Turn 'minutes:seconds' text into a timedelta."""
        entry = text.split(':')
        return timedelta(hours=0,minutes=int(entry[0]),seconds=int(entry[1]))

    drive_tables = {}
    for side, table_id in ((home,'home_drives'),(away,'vis_drives')):
        table = pd.read_html(uncommented,attrs={'id':table_id},flavor='bs4')
        frame = table[0].rename({'#':'Drive'},axis=1)
        frame = frame.set_index('Drive')
        dfyear.loc[(side,gamecode),'Plays'] = frame['Plays'].sum()

        # Split the line of scrimmage text into field side and yard marker.
        # A missing value is filled with the team's own 25.
        position = frame['LOS'].fillna(side.upper()+' 25').str.split(expand=True)
        # Guarantee both parts exist even if every entry has a single token.
        position = position.reindex(columns=[0,1])
        frame['Side'] = position[0]
        frame['Marker'] = position[1]
        frame = frame.reindex(columns=cols)

        # Case where ball is on 50 yard line: the text is just '50', so it landed in
        # Side.  Move it to Marker; Side is set to the home code for both teams, as
        # before (either side gives the same distance at midfield).
        midfield = frame['Side']=='50'
        frame.loc[midfield,'Marker'] = '50'
        frame.loc[midfield,'Side'] = home.upper()
        frame['Marker'] = frame['Marker'].astype('int')

        for colu in intcols:
            frame[colu] = frame[colu].astype('int')
        for colu in strcols:
            frame[colu] = frame[colu].astype('str')
        # Case of Overtime: label it as quarter '5', matching the play-by-play table,
        # because the red-zone routine converts Quarter with int().
        frame['Quarter'] = frame['Quarter'].replace('OT','5')
        for colu in timecols:
            frame[colu] = frame[colu].map(to_clock)
        drive_tables[side] = frame

    hdrive = drive_tables[home]
    adrive = drive_tables[away]

    # Play-by-play: keep the first six columns, discard incomplete rows and rows whose
    # Time is not a clock value, then convert the clock text to a Timedelta.
    table = pd.read_html(comm.sub("",res.text),attrs={'id':'pbp'},flavor='bs4')
    pbp = table[0].iloc[:,0:6].dropna()
    has_clock = pbp['Time'].apply(lambda clock: ':' in clock)
    pbp = pbp[has_clock]
    pbp['Time'] = pbp['Time'].apply(pbp_to_tdelt)

    # Split the location text into the field side (stored as Team) and the yard line.
    position = pbp['Location'].str.split(expand=True).iloc[:,0:2]
    pbp = pbp.drop('Location',axis=1)
    pbp[['Team','Location']] = position
    cols=['Quarter','Time','Down','ToGo','Team','Location','Detail']
    pbp = pbp.reindex(columns=cols,copy=False).reset_index(drop=True)

    # Case where ball is on 50 yard line: the text is just '50', so it landed in Team.
    midfield = pbp['Team']=='50'
    pbp.loc[midfield,'Location'] = '50'
    pbp.loc[midfield,'Team'] = home.upper()
    pbp['Location'] = pbp['Location'].astype('int')

    # Case of Overtime: label it as quarter '5' so Quarter can be converted with int().
    pbp.loc[pbp['Quarter']=='OT','Quarter'] = '5'

# Red Zone Routine

    # Bookkeeping for the red-zone walk, keyed by franchise code unless noted.
    team = {'H':home, 'A':away}                  # possession flag -> franchise code
    df = {home:hdrive, away:adrive}              # each team's drive table
    drive = dict.fromkeys((home, away), 1)       # next drive number to examine
    rz_att = dict.fromkeys((home, away), 0)      # red-zone attempts counted so far
    rz_conv = dict.fromkeys((home, away), 0)     # red-zone conversions counted so far
    row = 0                                      # current row in the play-by-play table

#            Red Zone starts at 20 yard line
    # Walk both teams' drives in game order.  For each drive, decide whether it reached
    # the red zone (the opponent's 20-yard line or closer) and, if so, whether it ended
    # in a touchdown.  The loop runs until each team's drive counter has passed the
    # number of rows in its drive table.
    # NOTE(review): the Side/Team comparisons below use the upper-cased franchise code;
    # confirm that it matches the abbreviation the page uses for every team.
    while ((drive[home] <= hdrive.shape[0]) | (drive[away] <= adrive.shape[0])):
        stop = False
        # Pick the team whose next unprocessed drive comes first.
        if ((drive[home] <= hdrive.shape[0]) & (drive[away] <= adrive.shape[0])):
            # Both teams still have drives: the lower Quarter goes first.  Quarter was
            # cast to str when the drive tables were built, so this compares strings.
            if (hdrive.loc[drive[home],'Quarter'] < adrive.loc[drive[away],'Quarter']):
                poss = 'H'
                starttime = hdrive.loc[drive[home],'Time']
            elif (adrive.loc[drive[away],'Quarter'] < hdrive.loc[drive[home],'Quarter']):
                poss = 'A'
                starttime = adrive.loc[drive[away],'Time']
            else:
                # Same quarter: the drive with the larger Time value goes first
                # (presumably Time is the clock remaining, so larger is earlier - confirm).
                if (hdrive.loc[drive[home],'Time'] > adrive.loc[drive[away],'Time']):
                    poss = 'H'
                    starttime = hdrive.loc[drive[home],'Time']
                else:
                    poss = 'A'
                    starttime = adrive.loc[drive[away],'Time']
        else:
            # Only one team has drives left.
            if (drive[home] <= hdrive.shape[0]):
                poss = 'H'
                starttime = hdrive.loc[drive[home],'Time']
            else:
                poss = 'A'
                starttime = adrive.loc[drive[away],'Time']
        rz = False   # Not in Red Zone
        bquar = int(df[team[poss]].loc[drive[team[poss]],'Quarter']) # Beginning quarter of drive
        # Work out the clock value and quarter at which the drive ends.
        if (df[team[poss]].loc[drive[team[poss]],'Length'] <= starttime): # Drive fits in the starting quarter
            endtime = starttime - df[team[poss]].loc[drive[team[poss]],'Length']
            equar = bquar                                            # Ending quarter of drive
        else:
            # The drive is longer than the time left: carry the remainder into the next
            # quarter.  A drive spanning more than two quarters is not handled here.
            endtime = (timedelta(hours=0,minutes=15,seconds=0) -  # Assuming 15 minute quarters
                (df[team[poss]].loc[drive[team[poss]],'Length'] - starttime))
            equar = bquar+1                                            # Ending quarter of drive

        # `start` is the distance from the drive's starting spot to the 20-yard line
        # that bounds the red zone.
        if (df[team[poss]].loc[drive[team[poss]],'Side'] != team[poss].upper()):
            # Starting on the other team's side of the field.
            if (df[team[poss]].loc[drive[team[poss]],'Marker'] <= 20):
                rz = True   # The drive starts inside the red zone
            else:
                start = df[team[poss]].loc[drive[team[poss]],'Marker'] - 20
        else:
            start = 100 - df[team[poss]].loc[drive[team[poss]],'Marker'] - 20 # Other side of the field
        if not (rz):    # If starting outside Red Zone
            end = start - df[team[poss]].loc[drive[team[poss]],'Yds']   # Calculate end of drive
            # Drives whose net yardage leaves them within 20 yards of the red-zone line
            # are checked play by play, since the drive total alone is not decisive.
            if end <= 20: # If drive ends within 20 yards of red zone to the goal line check if goes in RZ
                found = False     # Look for beginning of drive
                # Advance `row` to the first play at or after the drive's start:
                # skip earlier quarters, then plays whose Time is greater than starttime.
                while (((row+1) < pbp.shape[0]) & (not (found))):
                    if (int(pbp.loc[row,'Quarter']) < bquar):
                        row += 1
                    elif (pbp.loc[row,'Time'] <= starttime):
                        found = True
                    else:
                        row += 1
                # Scan forward from there until a play inside the red zone is found or
                # the drive's end is reached.  `row` is not reset between drives.
                while (not(stop)):
                    # Ball on the other team's side at the 20 or closer: red zone reached.
                    if ((pbp.loc[row,'Team'] != team[poss].upper()) & (pbp.loc[row,'Location'] <= 20)):
                        rz = True
                        stop = True
                    # Past the quarter in which the drive ended.
                    elif (int(pbp.loc[row,'Quarter']) > equar):
                        stop = True
                    elif (row+1) == pbp.shape[0]:                         # Failsafe in case of bad data
                        stop = True
                    # In the ending quarter, at or past the drive's end time.
                    elif ((int(pbp.loc[row,'Quarter']) == equar) & (pbp.loc[row,'Time'] <= endtime)):
                        stop = True
                    else:
                        row += 1
        if (rz):
            rz_att[team[poss]] += 1 # Increment Red Zone Attempts
            # Only a drive Result of 'Touchdown' counts as a conversion; field goals do not.
            if df[team[poss]].loc[drive[team[poss]],'Result'] in ['Touchdown']:
                rz_conv[team[poss]] += 1  # If score, increment Red Zone successes
        drive[team[poss]] += 1  # Increment drive number for team in question

    # Record the red-zone tallies: a team's own attempts and conversions are its
    # offensive figures, and the opponent's tallies are its defensive figures.
    for side, opponent in ((away, home), (home, away)):
        key = (side,gamecode)
        dfyear.loc[key,'RZ_Att'] = rz_att[side]
        dfyear.loc[key,'RZ_Conv'] = rz_conv[side]
        dfyear.loc[key,'RZ_Def_Att'] = rz_att[opponent]
        dfyear.loc[key,'RZ_Def_Conv'] = rz_conv[opponent]

In [None]:
# Scratch cell: display the away-team drive table left over from the last game processed.
adrive

In [None]:
# Scratch/debug line copied from the drive time-conversion loop.
# NOTE(review): once the red-zone setup has run, `drive` is a dict rather than a drive
# number, so this lookup presumably only works mid-debug - confirm or delete the cell.
entry = adrive.loc[drive,colu].split(':')

In [None]:
# Scratch/debug cell.
# NOTE(review): `grouped` is assigned from a groupby call in the scrape loop; confirm a
# groupby object exposes `.shape` before relying on this cell.
print(grouped.shape[0])

In [None]:
# Scratch cell: display whatever `table` was last assigned in the scrape loop.
table

In [None]:
# Scratch cell: display the away-team drive table again (duplicate of an earlier cell).
adrive

In [None]:
# Scratch cell: inspect play-by-play rows 40-79 of the last game processed.
pbp.iloc[40:80,:]

In [None]:
# Give every scraped statistic its final dtype: whole-number counts, fractional
# values, and text columns.
intcols=['Points','Points_Opp','Yds_Off_Pass','Yds_Off_Rush','Yds_Def_Pass','Yds_Def_Rush','TD','TD_on_Def',
         'FG_Made','FG_Att','RZ_Conv','RZ_Att','RZ_Def_Conv','RZ_Def_Att','Plays','TO_Gained','TO_Lost',
         'Yds_Pen','Tackles_Loss']
fltcols=['Possession','Sacks_Def','Yds_per_Kickret','Yds_per_Puntret']
strcols=['Home/Away', 'Opponent']
dtype_groups = (('int64', intcols), ('float64', fltcols), ('str', strcols))
for target_dtype, names in dtype_groups:
    for colu in names:
        dfyear[colu] = dfyear[colu].astype(target_dtype)

In [None]:
# Display the season dataframe after the dtype conversion.
dfyear

In [None]:
# Column dtypes and non-null counts - a quick check that every column was filled.
dfyear.info()

In [None]:
# Summary statistics for the first ten columns of the describe() output.
dfyear.describe().iloc[:,0:10]

In [None]:
    # Write the completed season dataframe to this step's pickle file.
    # NOTE(review): `ginfo` (surface / temperature / over-under) is filled by the scrape
    # loop but no cell visible here saves it - confirm whether it should be persisted too.
    path = 'data/df_step2_{}.data'.format(year)
    with open(path, mode='wb') as f:
        pickle.dump(dfyear, f)

In [None]:
# Number of games selected for the season.
len(gc)

In [None]:
# Position in `gc` of the game code the loop last handled - shows how far the scrape
# got if it stopped early.
gc.index(gamecode)

In [None]:
# Scratch cell: display whatever `table` was last assigned in the scrape loop.
table