In [2]:
import pandas as pd
import numpy as np
import pickle
import requests
import re

In [50]:
# Full team name -> three-letter team code used in the game codes / data index.
# Relocated franchises (Raiders, Rams, Chargers) share one code across both names.
teams = {
    'Atlanta Falcons': 'atl',
    'Buffalo Bills': 'buf',
    'Carolina Panthers': 'car',
    'Chicago Bears': 'chi',
    'Cincinnati Bengals': 'cin',
    'Cleveland Browns': 'cle',
    'Indianapolis Colts': 'clt',
    'Arizona Cardinals': 'crd',
    'Dallas Cowboys': 'dal',
    'Denver Broncos': 'den',
    'Detroit Lions': 'det',
    'Green Bay Packers': 'gnb',
    'Houston Texans': 'htx',
    'Jacksonville Jaguars': 'jax',
    'Kansas City Chiefs': 'kan',
    'Miami Dolphins': 'mia',
    'Minnesota Vikings': 'min',
    'New Orleans Saints': 'nor',
    'New England Patriots': 'nwe',
    'New York Giants': 'nyg',
    'New York Jets': 'nyj',
    'Tennessee Titans': 'oti',
    'Philadelphia Eagles': 'phi',
    'Pittsburgh Steelers': 'pit',
    'Oakland Raiders': 'rai',
    'Las Vegas Raiders': 'rai',
    'St. Louis Rams': 'ram',
    'Los Angeles Rams': 'ram',
    'Baltimore Ravens': 'rav',
    'San Diego Chargers': 'sdg',
    'Los Angeles Chargers': 'sdg',
    'Seattle Seahawks': 'sea',
    'San Francisco 49ers': 'sfo',
    'Tampa Bay Buccaneers': 'tam',
    'Washington Redskins': 'was',
}


In [4]:
def weekofgame(game):
    """Return the 'MMDD' label of the Wednesday that opens the game's week.

    Parameters
    ----------
    game : str
        Game code whose first eight characters are the date as 'YYYYMMDD'.

    Returns
    -------
    str
        Zero-padded month and day of the most recent Wednesday on or before
        the game date (weeks run Wednesday through Tuesday).
    """
    from datetime import timedelta, datetime as dt
    game_date = dt.strptime(game[0:8],'%Y%m%d')
    # .weekday() counts Monday as 0, so Wednesday is 2; the modulo yields the
    # number of days elapsed since the latest Wednesday (0 for a Wednesday).
    days_since_wednesday = (game_date.weekday() - 2) % 7
    week_start = game_date - timedelta(days=days_since_wednesday)
    return week_start.strftime('%m%d')

In [5]:
def get_mapping(group,averaged):
    """Map each distinct week-start label of one season to a week number.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single season; must contain a 'Week_Beg' column holding
        'MMDD' strings.
    averaged : int
        Offset for the numbering; the earliest week is numbered averaged + 1.

    Returns
    -------
    dict
        {'MMDD': week_number} with consecutive integers in season order.

    Notes
    -----
    A plain lexical sort of 'MMDD' strings places weeks that begin in January
    ('01xx') ahead of September, which numbers the last week of a season as
    week 1 and shifts every other week. The sort key below keeps weeks that
    begin in the first two calendar months at the end of the season.
    February is included alongside January as an assumption about the season
    calendar (get_year only reassigns January games) - TODO confirm.
    """
    late_season_months = ('01', '02')

    def season_order(week_beg):
        # False sorts before True, so autumn weeks precede the new-year weeks.
        return (week_beg[:2] in late_season_months, week_beg)

    keys = sorted(group['Week_Beg'].unique(), key=season_order)
    return {key: week for week, key in enumerate(keys, start=averaged+1)}

In [113]:
def get_year(code):
    """Return the season year (as a string) a game code belongs to.

    The code starts with the date as 'YYYYMMDD'. Games dated in January are
    attributed to the previous calendar year; every other month keeps its own
    calendar year.
    """
    calendar_year = int(code[0:4])
    played_in_january = code[4:6] == '01'
    return str(calendar_year - 1) if played_in_january else str(calendar_year)

In [7]:
def get_codes(path):
    """Load and return the object pickled in the file at `path`.

    NOTE: unpickling can execute arbitrary code; only use this on files that
    were produced by a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)

In [8]:
def year_codes(codes,yr):
    """Return the game codes whose season year (per get_year) equals `yr`.

    `yr` is compared against the string returned by get_year, so it must be a
    string such as '2015'. The input order is preserved.
    """
    return [code for code in codes if get_year(code) == yr]

In [98]:
def normalize_weights(n,a=0):
    """Return `n` exponential weights rescaled so that they sum to `n`.

    The raw weight of position i (0-based) is exp(a * i). With the default
    a=0 every weight equals 1; a positive `a` favours later positions.
    """
    raw = np.exp(a*np.arange(n))
    total = np.sum(raw)
    return n*raw/total

In [10]:
def add_week(df,num):
    """Add an integer 'Week' column to `df` (indexed by game code) and return it.

    For every game code the week-start label (weekofgame) and season year
    (get_year) are derived; within each season the distinct week starts are
    numbered by get_mapping, beginning at num + 1. `df` is modified in place.
    """
    by_week = pd.DataFrame(index=df.index,columns=['Week_Beg','Year','Week'])
    by_week['Week_Beg'] = [weekofgame(code) for code in by_week.index]
    by_week['Year'] = [get_year(code) for code in by_week.index]
    # Number the weeks season by season so each season restarts its count.
    for _, season in by_week.groupby(by_week['Year']):
        week_numbers = get_mapping(season,num)
        for code, week_beg in season['Week_Beg'].items():
            by_week.loc[code,'Week'] = int(week_numbers[week_beg])
    by_week['Week'] = by_week['Week'].astype('int')
    df['Week'] = by_week['Week']
    return df

In [42]:
def add_scores(df,dfyear):
    """Copy team names and final scores from `dfyear` into `df`, in place.

    `dfyear` is indexed by (team code, game code); `df` is indexed by game
    code. Only game codes present in both are processed. The last three
    characters of a game code are taken as the home team's code. The home
    team's name is looked up as the 'Opponent' of the visiting team's row,
    which requires the visitor's full name to be a key of `teams`.

    Returns `df` with 'Vis_Team', 'V_Game', 'Home_Team', 'H_Game' and
    'Tot_Pts' filled for the processed games.
    """
    # Take only the predicted games from dfyear
    shared_codes = set(dfyear.index.levels[1]).intersection(df.index)
    for code in shared_codes:
        home_key = (code[-3:],code)
        visitor_name = dfyear.loc[home_key,'Opponent']
        df.loc[code,'Vis_Team'] = visitor_name
        df.loc[code,'V_Game'] = dfyear.loc[home_key,'Points_Opp']
        # The visitor's row for this game names the home team as its opponent.
        visitor_key = (teams[visitor_name],code)
        df.loc[code,'Home_Team'] = dfyear.loc[visitor_key,'Opponent']
        home_points = dfyear.loc[home_key,'Points']
        df.loc[code,'H_Game'] = home_points
        df.loc[code,'Tot_Pts'] = home_points + dfyear.loc[home_key,'Points_Opp']
    return df

In [256]:
def replace_col_name(col):
    """Return `col` with its final underscore-separated segment set to 'Pct'.

    Only the trailing segment is replaced ('H_FG_Made' -> 'H_FG_Pct'); earlier
    segments that happen to contain the same text are left untouched
    ('Conv_RZ_Conv' -> 'Conv_RZ_Pct'). A name without any underscore becomes
    'Pct'.
    """
    # rpartition splits at the LAST underscore; str.replace on the last
    # segment would rewrite every occurrence of that text in the name.
    head, sep, _last = col.rpartition('_')
    return head + sep + 'Pct'

In [257]:
def compute_pct(df,conv,att):
    """Replace conversion/attempt column pairs with a ratio column, in place.

    For each side prefix ('H_' and 'V_') the columns prefix+conv and
    prefix+att are combined into a new column named by replace_col_name
    (trailing segment -> 'Pct'): 0.0 where attempts equal zero, otherwise
    conversions divided by attempts. The two source columns are then dropped.

    Returns the same `df` object.
    """
    for side in ('H_','V_'):
        made_col = side+conv
        tried_col = side+att
        pct_col = replace_col_name(made_col)
        no_attempts = df.index[df[tried_col]==0]
        some_attempts = df.index[df[tried_col]!=0]
        # Rows without attempts get an explicit 0.0 instead of a division by zero.
        df.loc[no_attempts,pct_col] = 0.0
        df.loc[some_attempts,pct_col] = \
            df.loc[some_attempts,made_col]/df.loc[some_attempts,tried_col]
        df.drop([made_col,tried_col],axis=1,inplace=True)
    return df

In [271]:
def average_games(df,source,num,season_games=16):
    """Fill `df` with per-game features averaged over each team's prior games.

    Parameters
    ----------
    df : pandas.DataFrame
        Output frame indexed by game code; rows/cells are written in place.
    source : pandas.DataFrame
        Per-team game statistics indexed by (team code, game code). It is not
        modified; a sorted copy is used internally.
    num : int
        Number of preceding games in each averaging window.
    season_games : int, default 16
        Upper bound (exclusive) on the game position for which league ranks
        are computed. The default reproduces the previous hard-coded value.

    Returns
    -------
    pandas.DataFrame
        The same `df` object.

    Notes
    -----
    * For a team's game at position `row` (row >= num) the window is that
      team's games row-num .. row-1, multiplied by normalize_weights(num, 0)
      and averaged. Values go to 'H_*' columns when the team code equals the
      last three characters of the game code, otherwise to 'V_*' columns.
    * Ranks are computed across all teams at the same game position; offensive
      yards rank ascending, defensive yards rank descending.
    * The previous version called sort_index(inplace=True) on the temporary
      returned by source.loc[team], which never affected `source`. The frame
      is now sorted once so every team's games are in game-code order.
    """
    # Source column -> feature suffix; the 'H_'/'V_' prefix is chosen per game.
    stat_suffix = {'Points':'Pts','Points_Opp':'Pts_Opp','Yds_Off_Pass':'Off_Pass',
                   'Yds_Off_Rush':'Off_Rush','Yds_Def_Pass':'Def_Pass','Yds_Def_Rush':'Def_Rush','TD':'TD',
                   'TD_on_Def':'TD_on_Def','FG_Made':'FG_Made','FG_Att':'FG_Att','RZ_Conv':'RZ_Conv',
                   'RZ_Att':'RZ_Att','RZ_Def_Conv':'RZ_Def_Conv','RZ_Def_Att':'RZ_Def_Att',
                   'Possession':'Poss','Plays':'Plays','TO_Gained':'TO_Gain','TO_Lost':'TO_Lost',
                   'Yds_Pen':'Yds_Pen','Sacks_Def':'Sacks_Def','Tackles_Loss':'Tackles_Loss',
                   'Yds_per_Kickret':'Kickret','Yds_per_Puntret':'Puntret'}
    # (source column, rank column suffix, ascending flag passed to Series.rank)
    rank_specs = (('Yds_Off_Pass','O_Pass_Rank',True),
                  ('Yds_Off_Rush','O_Rush_Rank',True),
                  ('Yds_Def_Pass','D_Pass_Rank',False),
                  ('Yds_Def_Rush','D_Rush_Rank',False))
    wt_factor = 0
    wt = normalize_weights(num,wt_factor)

    # Sort a local copy once; the caller's frame is left untouched.
    source = source.sort_index()
    team_codes = source.index.levels[0]
    games_by_team = {team: source.loc[team] for team in team_codes}

    def side_prefix(team,code):
        # The game code ends with the home team's three-letter code.
        return 'H_' if team == code[-3:] else 'V_'

    for team in team_codes:  # Fill in mean values of statistical categories
        games = games_by_team[team]
        for row in range(num,len(games)):
            code = games.index[row]
            prefix = side_prefix(team,code)
            window = games.iloc[row-num:row]
            for stat, suffix in stat_suffix.items():
                df.loc[code,prefix+suffix] = (window[stat]*wt).mean() # Averaged over num games

    for row in range(num,season_games):       # Calculate rank categories
        ranks = {}
        for stat, suffix, ascending in rank_specs:
            window_means = [(games_by_team[team].iloc[row-num:row][stat]*wt).mean()
                            for team in team_codes]
            ranks[suffix] = pd.Series(window_means,index=team_codes).rank(ascending=ascending)
        for team in team_codes:
            games = games_by_team[team]
            if row < len(games):
                code = games.index[row]
                prefix = side_prefix(team,code)
                for suffix, rank in ranks.items():
                    df.loc[code,prefix+suffix] = rank[team]
    return df

In [281]:
def compute_stats_metrics(df):
    """Combine offence and opposing-defence ranks into matchup metrics, in place.

    Each metric is the mean of one side's offensive rank and the other side's
    defensive rank for the same play type (pass or rush). The eight rank
    columns are dropped afterwards. Returns the same `df` object.
    """
    # (new metric column, offensive rank column, opposing defensive rank column)
    matchups = (('H_Pass_Metric','H_O_Pass_Rank','V_D_Pass_Rank'),
                ('H_Rush_Metric','H_O_Rush_Rank','V_D_Rush_Rank'),
                ('V_Pass_Metric','V_O_Pass_Rank','H_D_Pass_Rank'),
                ('V_Rush_Metric','V_O_Rush_Rank','H_D_Rush_Rank'))
    for metric, offense_rank, defense_rank in matchups:
        df[metric] = (df[offense_rank] + df[defense_rank])/2.0
    rank_columns = ['H_O_Pass_Rank','H_O_Rush_Rank','V_O_Pass_Rank','V_O_Rush_Rank',
                    'V_D_Pass_Rank','V_D_Rush_Rank','H_D_Pass_Rank','H_D_Rush_Rank']
    df.drop(rank_columns,axis=1,inplace=True)
    return df

In [12]:
def add_dome(df):
    """Add a 0/1 'dome' column to `df` (indexed by game code) and return it.

    A game is flagged 1 when 'Temperature' equals 70 and 'Wind' equals 0.
    A fixed list of game codes that match that pattern is forced back to 0.
    `df` is modified in place.
    """
    at_seventy = df['Temperature'] == 70
    calm = df['Wind'] == 0
    df['dome'] = (at_seventy & calm)*1
    # Game codes that are explicitly excluded from the dome flag.
    not_dome_codes = ('201909220kan','201611060sfo','200311240tam','200810260car','200911150was')
    for code in not_dome_codes:
        if code not in df.index:
            continue
        df.loc[code,'dome'] = 0
    return df

In [13]:
def windchill(df):
    """Replace 'Temperature' with the wind-chill value where it applies.

    Rows with Temperature <= 50 and Wind > 3 are overwritten, in place, with
        35.74 + 0.6215*T - 35.75*W**0.16 + 0.4275*T*W**0.16
    (T in degrees Fahrenheit and W in mph, per the original comment).
    All other rows are left unchanged. Returns the same `df` object.
    """
    cold_and_windy = (df['Temperature'] <= 50) & (df['Wind'] > 3)
    rows = df.index[cold_and_windy]
    temp = df.loc[rows,'Temperature']
    wind = df.loc[rows,'Wind']
    df.loc[rows,'Temperature'] = (35.74 + 0.6215*temp -
                                  35.75*(wind**0.16) +
                                  0.4275*temp*(wind**0.16))
    return df

In [311]:
codepath = 'data/gamecodes.data'
gcodes = get_codes(codepath)

# Game Info: one row per game code with betting line, weather and surface.
ginfo = pd.DataFrame(index=gcodes,
                     columns=['Over_Under','Temperature','Wind','Surface','Week','Year','dome'])
for gamecode in gcodes:
    year = get_year(gamecode)
    path = 'data/raw/'+year+'/'+gamecode+'.pkl'
    # The context manager closes the file even if unpickling fails.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as f:
        pickle.load(f)          # first pickled object in the file is not used
        table = pickle.load(f)  # second object: the game-info table
    table = table.set_index(table.columns[0])
    ginfo.loc[gamecode,'Surface'] = table.loc['Surface',1]
    # Keep only the leading number of the 'Over/Under' text.
    ginfo.loc[gamecode,'Over_Under'] = float(table.loc['Over/Under',1].split(' ')[0])
    if 'Weather' in table.index:
        string = table.loc['Weather',1]
        # The weather text starts with the temperature.
        ginfo.loc[gamecode,'Temperature'] = int(string.split(' ')[0])
        if 'mph' in string:
            # Wind speed: the digits of the comma-separated part mentioning 'mph'.
            for substr in string.split(','):
                if 'mph' in substr:
                    ginfo.loc[gamecode,'Wind'] = int(''.join(filter(str.isdigit, substr)))
        else:
            ginfo.loc[gamecode,'Wind'] = 0
    else:
        # No weather entry: Temperature stays missing and is imputed below.
        ginfo.loc[gamecode,'Wind'] = 0
    ginfo.loc[gamecode,'Year'] = int(year)
# Actual game temperatures manually entered
special={'201912290jax':74,'201912080jax':68,'201912010jax':83,'201912010mia':82,
       '201911100gnb':36,'201911100cle':53,'201911030sdg':77,'201910270jax':84,
       '201909220kan':79,'201812090sdg':72,'201712100jax':55,'201610230nyj':62,
       '201512130nyj':66,'200010080mia':87,'200012100kan':25}
for key in special.keys():
    if key in ginfo.index:
        ginfo.loc[key,'Temperature'] = special[key]
# Imputed for dome games
ginfo['Temperature']=ginfo['Temperature'].fillna(70)
ginfo = windchill(ginfo)
ginfo = add_dome(ginfo)
ginfo = add_week(ginfo,0)

In [329]:
num = 5 # number of games averaged over

# Working columns filled by average_games (averages, ranks) and add_scores
# (team names and final scores).
cols = ['Home_Team','Vis_Team','H_Game','V_Game','Tot_Pts','H_Pts','H_Pts_Opp','V_Pts','V_Pts_Opp',
        'H_Off_Pass','H_O_Pass_Rank','V_Off_Pass','V_O_Pass_Rank','H_Off_Rush','H_O_Rush_Rank','V_Off_Rush',
        'V_O_Rush_Rank','H_Def_Pass','H_D_Pass_Rank','V_Def_Pass','V_D_Pass_Rank','H_Def_Rush','H_D_Rush_Rank',
        'V_Def_Rush','V_D_Rush_Rank','H_TD','V_TD','H_TD_on_Def','V_TD_on_Def','H_FG_Made','V_FG_Made',
        'H_FG_Att','V_FG_Att','H_RZ_Conv','V_RZ_Conv','H_RZ_Att','V_RZ_Att','H_RZ_Def_Conv','V_RZ_Def_Conv',
        'H_RZ_Def_Att','V_RZ_Def_Att','H_Poss','V_Poss','H_Plays','V_Plays','H_TO_Gain','V_TO_Gain','H_TO_Lost',
        'V_TO_Lost','H_Yds_Pen','V_Yds_Pen','H_Sacks_Def','V_Sacks_Def','H_Tackles_Loss','V_Tackles_Loss',
        'H_Kickret','V_Kickret','H_Puntret','V_Puntret']
dfgame=pd.DataFrame(columns=cols)
dfgame.index.set_names('Code',inplace=True)
years=[str(year) for year in range(2010,2020)]
for year in years:
    path = 'data/df_step2_'+year+'.pkl'
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as f:
        dfyear = pickle.load(f)
    dfgame = average_games(dfgame,dfyear,num)
    dfgame = add_scores(dfgame,dfyear)
# Attach weather/line/week information, then keep only fully populated games.
dfgame = dfgame.join(ginfo,how='left')
dfgame.dropna(inplace=True)
dfgame = compute_stats_metrics(dfgame)
dfgame = compute_pct(dfgame,'FG_Made','FG_Att')
dfgame = compute_pct(dfgame,'RZ_Conv','RZ_Att')
dfgame = compute_pct(dfgame,'RZ_Def_Conv','RZ_Def_Att')

# Final column order; rank and made/attempt columns were replaced above by
# the *_Metric and *_Pct columns.
new_cols = ['Home_Team','Vis_Team','H_Game','V_Game','Over_Under','Tot_Pts','H_Pts',
            'H_Pts_Opp','V_Pts','V_Pts_Opp','H_Off_Pass','H_Pass_Metric','V_Off_Pass',
            'V_Pass_Metric','H_Off_Rush','H_Rush_Metric','V_Off_Rush','V_Rush_Metric',
            'H_Def_Pass','V_Def_Pass','H_Def_Rush','V_Def_Rush','H_TD','V_TD',
            'H_TD_on_Def','V_TD_on_Def','H_FG_Pct','V_FG_Pct','H_RZ_Pct','V_RZ_Pct',
            'H_RZ_Def_Pct','V_RZ_Def_Pct','H_Poss','V_Poss','H_Plays','V_Plays',
            'H_TO_Gain','V_TO_Gain','H_TO_Lost','V_TO_Lost','H_Yds_Pen','V_Yds_Pen',
            'H_Sacks_Def','V_Sacks_Def','H_Tackles_Loss','V_Tackles_Loss','H_Kickret',
            'V_Kickret','H_Puntret','V_Puntret','Temperature','Wind','dome','Week','Year','Surface']
dfgame = dfgame[new_cols]
# One-hot encode the playing surface; 'astroturf' is dropped as the baseline.
dfgame = pd.concat([dfgame.drop('Surface', axis = 1), pd.get_dummies(dfgame.Surface)], axis = 1)
dfgame.drop('astroturf',axis=1,inplace=True)

# Every column except the two team-name columns is numeric. The previous
# positional slice (columns[4:]) also skipped the H_Game / V_Game scores and
# left them as object dtype.
numer_cols = dfgame.columns.drop(['Home_Team','Vis_Team'])
for col in numer_cols:
    dfgame[col] = pd.to_numeric(dfgame[col])
print('End')

End


In [310]:
# Persist the current feature table (dfgame) as the working dataset.
path = 'data/df_working5.pkl'
with open(path, mode='wb') as out_file:
    pickle.dump(dfgame, out_file)

In [303]:
# dfall is another name for the same dfgame object (no copy is made);
# it is written out as the complete dataset.
dfall = dfgame
path = 'data/df_complete.pkl'
with open(path, mode='wb') as out_file:
    pickle.dump(dfall, out_file)

In [322]:
# Persist the per-game information table (ginfo).
# NOTE(review): the file name suggests seasons 2000-2009; confirm that the
# loaded game codes match before overwriting.
path = 'data/df_gameinfo00_09.pkl'
with open(path, mode='wb') as out_file:
    pickle.dump(ginfo, out_file)

In [340]:
# Persist the feature table (dfgame) under the 2000-2009 file name.
# NOTE(review): the build cell shown in this notebook iterates
# range(2010, 2020); confirm dfgame was rebuilt for 2000-2009 before saving.
path = 'data/df_2000_2009.pkl'
with open(path, mode='wb') as out_file:
    pickle.dump(dfgame, out_file)