In [9]:
import pandas as pd
import pickle
import requests
import re

In [225]:
# Full franchise name -> three-letter team code.  Franchises listed under two
# names (Raiders, Rams, Chargers) map both names to the same code.
teams = {
    'Atlanta Falcons': 'atl',
    'Buffalo Bills': 'buf',
    'Carolina Panthers': 'car',
    'Chicago Bears': 'chi',
    'Cincinnati Bengals': 'cin',
    'Cleveland Browns': 'cle',
    'Indianapolis Colts': 'clt',
    'Arizona Cardinals': 'crd',
    'Dallas Cowboys': 'dal',
    'Denver Broncos': 'den',
    'Detroit Lions': 'det',
    'Green Bay Packers': 'gnb',
    'Houston Texans': 'htx',
    'Jacksonville Jaguars': 'jax',
    'Kansas City Chiefs': 'kan',
    'Miami Dolphins': 'mia',
    'Minnesota Vikings': 'min',
    'New Orleans Saints': 'nor',
    'New England Patriots': 'nwe',
    'New York Giants': 'nyg',
    'New York Jets': 'nyj',
    'Tennessee Titans': 'oti',
    'Philadelphia Eagles': 'phi',
    'Pittsburgh Steelers': 'pit',
    'Oakland Raiders': 'rai',
    'Las Vegas Raiders': 'rai',
    'St. Louis Rams': 'ram',
    'Los Angeles Rams': 'ram',
    'Baltimore Ravens': 'rav',
    'San Diego Chargers': 'sdg',
    'Los Angeles Chargers': 'sdg',
    'Seattle Seahawks': 'sea',
    'San Francisco 49ers': 'sfo',
    'Tampa Bay Buccaneers': 'tam',
    'Washington Redskins': 'was',
}

# Source statistic name -> suffix used in the game-level feature names.
# Insertion order here fixes the order of `cols`, `hdictmap` and `vdictmap`.
_STAT_SUFFIX = {
    'Points': 'Pts',
    'Points_Opp': 'Pts_Opp',
    'Yds_Off_Pass': 'Off_Pass',
    'Yds_Off_Rush': 'Off_Rush',
    'Yds_Def_Pass': 'Def_Pass',
    'Yds_Def_Rush': 'Def_Rush',
    'TD': 'TD',
    'TD_on_Def': 'TD_on_Def',
    'FG_Made': 'FG_Made',
    'FG_Att': 'FG_Att',
    'RZ_Conv': 'RZ_Conv',
    'RZ_Att': 'RZ_Att',
    'RZ_Def_Conv': 'RZ_Def_Conv',
    'RZ_Def_Att': 'RZ_Def_Att',
    'Possession': 'Poss',
    'Plays': 'Plays',
    'TO_Gained': 'TO_Gain',
    'TO_Lost': 'TO_Lost',
    'Yds_Pen': 'Yds_Pen',
    'Sacks_Def': 'Sacks_Def',
    'Tackles_Loss': 'Tackles_Loss',
    'Yds_per_Kickret': 'Kickret',
    'Yds_per_Puntret': 'Puntret',
}

# Feature suffixes that are followed by a companion rank column.
_RANK_SUFFIX = {
    'Off_Pass': 'O_Pass_Rank',
    'Off_Rush': 'O_Rush_Rank',
    'Def_Pass': 'D_Pass_Rank',
    'Def_Rush': 'D_Rush_Rank',
}


def _side_columns(side, suffix):
    """Return the column name(s) for one side prefix ('H_' or 'V_') of one statistic."""
    names = [side + suffix]
    if suffix in _RANK_SUFFIX:
        names.append(side + _RANK_SUFFIX[suffix])
    return names


# All feature columns: for each statistic the H_ column(s) then the V_ column(s),
# with the rank column directly after the statistic it belongs to.
cols = [name
        for suffix in _STAT_SUFFIX.values()
        for side in ('H_', 'V_')
        for name in _side_columns(side, suffix)]

# Source statistic name -> feature column, for the H_ and the V_ side respectively.
hdictmap = {stat: 'H_' + suffix for stat, suffix in _STAT_SUFFIX.items()}
vdictmap = {stat: 'V_' + suffix for stat, suffix in _STAT_SUFFIX.items()}

In [282]:
# Seasons to process, as strings '2010' .. '2019'.
# (Narrow this to a single season, e.g. ['2018'], for a quick trial run.)
years = list(map(str, range(2010, 2020)))

In [283]:
# Empty game-level feature table: one column per entry of `cols`, index named 'Code'.
dfgame = pd.DataFrame(columns=cols).rename_axis('Code')

In [284]:
# Build the game-level feature table `dfgame` from the per-season team frames.
#
# For every season in `years` a pickled frame is loaded whose index has the
# team as first level and the game code as second level.  For each team and
# each of its games that has at least `window` earlier games, the mean of
# every statistic over those `window` earlier games is written to the H_
# column (when the game code ends with the team's own code) or to the V_
# column (otherwise).  The four yardage statistics additionally get a
# league-wide rank of that trailing mean.  A game row is kept only when every
# column is filled, i.e. both sides had enough earlier games and received ranks.

window = 5          # number of earlier games averaged into each feature
season_games = 16   # ranks are computed for a team's row positions window .. season_games-1;
                    # later rows get no rank values and are removed by dropna() below

# (source statistic, rank column name without the H/V prefix, ascending).
# With ascending=True the smallest trailing mean gets rank 1; with
# ascending=False the largest trailing mean gets rank 1.
rank_specs = [
    ('Yds_Off_Pass', '_O_Pass_Rank', True),
    ('Yds_Off_Rush', '_O_Rush_Rank', True),
    ('Yds_Def_Pass', '_D_Pass_Rank', False),
    ('Yds_Def_Rush', '_D_Rush_Rank', False),
]

# game code -> {feature column: value}.  Collected in plain dicts and turned
# into a DataFrame once, instead of enlarging the frame one cell at a time.
records = {}

for year in years:
    path = 'data/df_step2_'+year+'.data'
    # NOTE(security): pickle.load can execute arbitrary code contained in the
    # file; only load files produced by this project's own earlier steps.
    with open(path, 'rb') as f:
        dfyear = pickle.load(f)

    team_names = dfyear.index.levels[0]

    # Sort every team's games by game code ONCE and keep the sorted frame.
    # (Calling sort_index(inplace=True) on `dfyear.loc[team]` only sorts a
    # temporary object and leaves the data that is read afterwards untouched.)
    games_by_team = {team: dfyear.loc[team].sort_index() for team in team_names}

    # Positional column lookups do not change inside the loops: resolve them once.
    needed_stats = list(hdictmap) + list(vdictmap) + [spec[0] for spec in rank_specs]
    col_pos = {stat: dfyear.columns.get_loc(stat) for stat in needed_stats}

    # --- trailing means of every statistic --------------------------------
    for team, games in games_by_team.items():
        for row in range(window, len(games)):
            code = games.index[row]
            # Game code ends with this team's code -> fill the H_ columns, else the V_ columns.
            colmap = hdictmap if team == code[-3:] else vdictmap
            feature_row = records.setdefault(code, {})
            for key, target in colmap.items():
                feature_row[target] = games.iloc[row-window:row, col_pos[key]].mean()

    # --- league-wide ranks of the trailing yardage means -------------------
    for row in range(window, season_games):
        ranks = {}
        for stat, rank_name, ascending in rank_specs:
            trailing = [games.iloc[row-window:row, col_pos[stat]].mean()
                        for games in games_by_team.values()]
            ranks[rank_name] = pd.Series(trailing, index=team_names).rank(ascending=ascending)

        for team, games in games_by_team.items():
            if row >= len(games):
                continue  # this team has no game at this row position
            code = games.index[row]
            side = 'H' if team == code[-3:] else 'V'
            feature_row = records.setdefault(code, {})
            for rank_name, rank_series in ranks.items():
                feature_row[side + rank_name] = rank_series[team]

# One row per game code, columns in the order of `cols`; features that were
# never written stay NaN, and any row with a NaN is discarded.
dfgame = (
    pd.DataFrame(list(records.values()), index=list(records.keys()), columns=cols)
    .rename_axis('Code')
    .dropna()
    .astype('float64')
)

In [203]:
# Scrape the 'game_info' table of every game's box-score page and store
# surface, temperature and over/under line per game code in `ginfo`, then
# pickle the result.  This performs one HTTP request per row of `dfgame`.
ginfo = pd.DataFrame(index=dfgame.index,columns=['Surface','Temperature','Over/Under'])
codes = ginfo.index

# Removes HTML comment delimiters before parsing, so that markup placed inside
# comments is parsed too (presumably the game_info table is delivered that
# way -- confirm against a live page).  Compiled once, outside the loop.
comm = re.compile('<!--|-->')

for code in codes:
    url = 'https://www.pro-football-reference.com/boxscores/'+code+'.htm'
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() turns an HTTP error (404, 429, 5xx) into an explicit
    # exception instead of an obscure parse failure further down.
    res = requests.get(url, timeout=30)
    res.raise_for_status()

    # Game Info: first column holds the field names, column label 1 the values.
    table = pd.read_html(comm.sub("",res.text),attrs={'id':'game_info'},flavor='bs4')
    table = table[0]
    table = table.set_index(table.columns[0])

    ginfo.loc[code,'Surface'] = table.loc['Surface',1]
    # Only the first space-separated token of the field is numeric.
    ginfo.loc[code,'Over/Under'] = float(table.loc['Over/Under',1].split(' ')[0])
    # The 'Weather' row is optional; Temperature stays NaN when it is absent.
    if 'Weather' in table.index:
        ginfo.loc[code,'Temperature'] = int(table.loc['Weather',1].split(' ')[0])

path='data/df_gameinfo.pkl'
with open(path, 'wb') as f:
    pickle.dump(ginfo,f)

In [None]:
# Attach the scraped game info to the feature table and fill missing
# temperatures with the mean of the known ones.
#
# Dropping any game-info columns already present makes this cell safe to
# re-run (a second plain join would fail on overlapping column names).
dfgame = dfgame.drop(columns=ginfo.columns, errors='ignore').join(ginfo, how='left')

# `ginfo` is created without a dtype, so Temperature arrives as an object
# column; make it numeric explicitly.  The result is assigned back instead of
# using fillna(inplace=True) on a column selection, which is not guaranteed to
# modify `dfgame` itself.
temperature = pd.to_numeric(dfgame['Temperature'])
dfgame['Temperature'] = temperature.fillna(temperature.mean())

In [306]:
# Move 'Over/Under' to the first column position and keep every other column
# in its current order.  Selecting by name (rather than assuming 'Over/Under'
# is the last column) keeps the cell correct when it is run more than once.
newcols = ['Over/Under'] + [col for col in dfgame.columns if col != 'Over/Under']
dfgame = dfgame[newcols]

In [307]:
# Display the assembled game-level table (rich display of the cell's last expression).
dfgame

Unnamed: 0_level_0,Over/Under,H_Pts,V_Pts,H_Pts_Opp,V_Pts_Opp,H_Off_Pass,H_O_Pass_Rank,V_Off_Pass,V_O_Pass_Rank,H_Off_Rush,...,H_Sacks_Def,V_Sacks_Def,H_Tackles_Loss,V_Tackles_Loss,H_Kickret,V_Kickret,H_Puntret,V_Puntret,Surface,Temperature
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201010170phi,43,24.4,22.6,20.6,14.0,255.4,23.0,233.0,17.0,130.4,...,3.0,2.2,5.0,4.4,19.800000,21.300000,13.446667,7.000000,grass,66.000000
201010240atl,43,24.2,20.0,17.2,20.4,232.6,18.0,257.4,24.0,150.2,...,1.8,1.2,5.0,2.6,21.866667,25.913333,8.600000,2.433333,fieldturf,52.945669
201011070atl,45.5,23.8,19.8,22.2,28.4,247.4,19.0,252.2,21.0,136.6,...,2.0,0.4,5.0,2.4,19.923810,22.163333,6.200000,5.266667,fieldturf,52.945669
201011110atl,44,23.8,26.2,21.6,19.6,248.8,18.0,253.2,23.0,122.2,...,1.8,1.8,5.0,4.2,24.663810,21.600000,6.200000,1.066667,fieldturf,52.945669
201011210ram,43.5,16.6,25.8,22.4,23.0,196.2,2.0,257.4,20.0,114.4,...,3.8,2.0,6.0,4.4,36.000000,26.630476,8.666667,7.800000,astroplay,52.945669
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201912010rav,45,40.4,35.2,12.4,19.8,198.4,4.0,283.4,27.0,217.2,...,2.8,4.8,4.4,7.6,15.666667,12.733333,5.400000,4.633333,grass,40.000000
201911030sea,51,26.4,31.8,23.4,33.4,245.2,17.0,334.0,32.0,141.8,...,1.4,2.4,4.2,7.0,25.066667,9.100000,2.933333,4.293333,fieldturf,54.000000
201911110sfo,47,27.8,29.0,9.6,28.2,213.4,10.0,272.8,21.0,168.8,...,4.2,1.0,6.6,3.2,9.400000,30.266667,13.766667,4.866667,grass,77.000000
201912290sea,47,21.8,31.6,23.6,26.8,228.0,12.0,250.0,19.0,148.6,...,1.2,1.6,3.2,4.0,19.200000,29.200000,3.400000,8.233333,fieldturf,51.000000


In [308]:
# Persist the current `dfgame` frame to disk as a pickle file.
path='data/df_analyze.pkl'
with open(path, 'wb') as f:
    pickle.dump(dfgame,f)

In [309]:
# Count the rows of `dfgame` per distinct value of the 'Surface' column.
dfgame['Surface'].value_counts()

grass         1006
fieldturf      458
sportturf      114
matrixturf      48
a_turf          44
astroplay       36
astroturf       28
Name: Surface, dtype: int64

In [289]:
# Summary statistics (count, mean, std, quartiles, min/max) restricted to the
# columns at positions 10-19 of the describe() result.
dfgame.describe().iloc[:,10:20]

Unnamed: 0,V_Off_Rush,V_O_Rush_Rank,H_Def_Pass,H_D_Pass_Rank,V_Def_Pass,V_D_Pass_Rank,H_Def_Rush,H_D_Rush_Rank,V_Def_Rush,V_D_Rush_Rank
count,1734.0,1734.0,1734.0,1734.0,1734.0,1734.0,1734.0,1734.0,1734.0,1734.0
mean,113.112226,16.611592,250.9797,16.383218,250.011419,16.635236,113.352826,16.393887,112.572549,16.612745
std,27.052921,9.111943,36.766483,9.188263,36.782894,9.282187,25.86534,9.225832,25.956681,9.259474
min,50.2,1.0,134.8,1.0,121.4,1.0,43.8,1.0,37.2,1.0
25%,94.2,9.0,225.05,8.0,224.4,9.0,95.2,8.0,94.0,9.0
50%,110.6,17.0,250.0,16.0,248.8,17.0,111.7,17.0,111.8,16.0
75%,129.4,24.375,277.35,24.0,275.2,25.0,129.2,24.0,129.0,25.0
max,230.4,32.0,379.8,32.0,360.4,32.0,219.6,32.0,235.2,32.0


In [290]:
# Print a concise summary of `dfgame`: index range, columns, non-null counts and dtypes.
dfgame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1734 entries, 201010170phi to 201910200was
Data columns (total 54 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   H_Pts           1734 non-null   float64
 1   V_Pts           1734 non-null   float64
 2   H_Pts_Opp       1734 non-null   float64
 3   V_Pts_Opp       1734 non-null   float64
 4   H_Off_Pass      1734 non-null   float64
 5   H_O_Pass_Rank   1734 non-null   float64
 6   V_Off_Pass      1734 non-null   float64
 7   V_O_Pass_Rank   1734 non-null   float64
 8   H_Off_Rush      1734 non-null   float64
 9   H_O_Rush_Rank   1734 non-null   float64
 10  V_Off_Rush      1734 non-null   float64
 11  V_O_Rush_Rank   1734 non-null   float64
 12  H_Def_Pass      1734 non-null   float64
 13  H_D_Pass_Rank   1734 non-null   float64
 14  V_Def_Pass      1734 non-null   float64
 15  V_D_Pass_Rank   1734 non-null   float64
 16  H_Def_Rush      1734 non-null   float64
 17  H_D_Rush_Rank   173

In [136]:
# Derive, for every game code in `dfgame`, the game date and the date of the
# Thursday on or before it ('Beg_week').
#
# dayconv maps pandas' weekday number (Monday=0 ... Sunday=6) to the number of
# days elapsed since the most recent Thursday: Thursday -> 0, Friday -> 1, ...,
# Wednesday -> 6.
dayconv={3:0,4:1,5:2,6:3,0:4,1:5,2:6}

# The first eight characters of a game code are parsed as the date in YYYYMMDD
# form.  pd.to_datetime does the parsing vectorised and needs no `datetime` import.
game_codes = dfgame.index.to_series()
dfdate = pd.DataFrame({'Date': pd.to_datetime(game_codes.str[:8], format='%Y%m%d')})

# Offset back to Thursday, stored as a timedelta so it can be subtracted from the date.
dfdate['dayofweek'] = pd.to_timedelta(dfdate['Date'].dt.weekday.map(dayconv), unit='D')
dfdate['Beg_week'] = dfdate['Date'] - dfdate['dayofweek']
dfdate = dfdate.rename_axis('Code')

In [137]:
# Display the per-game date table (Date, dayofweek offset, Beg_week) indexed by game code.
dfdate

Unnamed: 0_level_0,Date,dayofweek,Beg_week
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
201010170phi,2010-10-17,3 days,2010-10-14
201010240atl,2010-10-24,3 days,2010-10-21
201011070atl,2010-11-07,3 days,2010-11-04
201011110atl,2010-11-11,0 days,2010-11-11
201011210ram,2010-11-21,3 days,2010-11-18
...,...,...,...
201912010rav,2019-12-01,3 days,2019-11-28
201911030sea,2019-11-03,3 days,2019-10-31
201911110sfo,2019-11-11,4 days,2019-11-07
201912290sea,2019-12-29,3 days,2019-12-26
