In [1]:
#pip install keras-tuner

In [2]:
import nfl_data_py as nfl
import pandas as pd
import os
import numpy as np
import urllib.request
import matplotlib.pyplot as plt
from matplotlib.offsetbox import AnnotationBbox, OffsetImage
import matplotlib.image as mpimg
import tensorflow as tf

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from scipy.stats import uniform, randint
from skopt.space import Real, Integer
from skopt import BayesSearchCV
import seaborn as sns

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.optimizers import Adam
import keras_tuner as kt

import joblib

In [3]:
'''
this section cleans the data, 
it can be easily reused'''


df_twofour = nfl.import_pbp_data([2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024], downcast=True, cache=False, alt_path=None, include_participation=False)

#regular season data only
df_twofour = df_twofour.loc[df_twofour.season_type=='REG']

'''
The below line removes kick offs, field goals, kneels, etc - only plays with passes, runs, and penalties, and we're going to leave it out because it can impact max scores
'''
#df_twofour = df_twofour.loc[(df_twofour.play_type.isin(['no_play','pass','run'])) & (df_twofour.epa.isna()==False)]

#match play call to play type, so QB scrambles still considered pass plays
df_twofour.loc[df_twofour['pass']==1, 'play_type'] = 'pass'
df_twofour.loc[df_twofour.rush==1, 'play_type'] = 'run'

#reset index to skip missing numbers
df_twofour.reset_index(drop=True, inplace=True)

df_twofour.head()


2015 done.
2016 done.
2017 done.
2018 done.
2019 done.
2020 done.
2021 done.
2022 done.
2023 done.
2024 done.
Downcasting floats.


Unnamed: 0,play_id,game_id,old_game_id,home_team,away_team,season_type,week,posteam,posteam_type,defteam,...,out_of_bounds,home_opening_kickoff,qb_epa,xyac_epa,xyac_mean_yardage,xyac_median_yardage,xyac_success,xyac_fd,xpass,pass_oe
0,1.0,2015_01_BAL_DEN,2015091309,DEN,BAL,REG,1,,,,...,0.0,0.0,-0.0,,,,,,,
1,36.0,2015_01_BAL_DEN,2015091309,DEN,BAL,REG,1,BAL,away,DEN,...,0.0,0.0,-0.0,,,,,,,
2,51.0,2015_01_BAL_DEN,2015091309,DEN,BAL,REG,1,BAL,away,DEN,...,0.0,0.0,-0.337139,0.691718,4.699278,3.0,0.678964,0.225919,0.456481,54.35191
3,75.0,2015_01_BAL_DEN,2015091309,DEN,BAL,REG,1,BAL,away,DEN,...,0.0,0.0,-0.262481,,,,,,0.545905,-54.590458
4,96.0,2015_01_BAL_DEN,2015091309,DEN,BAL,REG,1,BAL,away,DEN,...,0.0,0.0,1.661242,1.794671,4.66265,2.0,0.71235,0.71235,0.968533,3.146732


In [4]:
'''
In the following sections we're going to generate a few dataframes.

--The first will be points per game in 2024
--The second will be offensive stats
    --We'll break these up by home and away
--The third section will be defensive stats
    --Likewise, we'll break these up by home and away
--Final section will include some miscellaneous scoring such as fumbles lost/recovered etc
'''

"\nIn the following sections we're going to generate a few dataframes.\n\n--The first will be points per game in 2024\n--The second will be offensive stats\n    --We'll break these up by home and away\n--The third section will be defensive stats\n    --Likewise, we'll break these up by home and away\n--Final section will include some miscellaneous scoring such as fumbles lost/recovered etc\n"

In [5]:
'''
This is section (dataframe) 1 - Scores
This section pulls max scores, practically meaning score at the end of the game

'''

df_twofour_scores = df_twofour[['game_id', 'posteam', 'home_team', 'total_home_score', 'away_team', 'total_away_score']]
  
df_scores = (
        df_twofour_scores.groupby(['game_id', 'home_team', 'away_team'], as_index=False)
        .agg({
            'total_home_score': 'max',
            'total_away_score': 'max'
        })
        .rename(columns={
            'game_id': 'Game ID',
            'home_team': 'Home Team',
            'away_team': 'Away Team',
            'total_home_score': 'Max Home Score',
            'total_away_score': 'Max Away Score'
        })
    )
df_scores



Unnamed: 0,Game ID,Home Team,Away Team,Max Home Score,Max Away Score
0,2015_01_BAL_DEN,DEN,BAL,19.0,13.0
1,2015_01_CAR_JAX,JAX,CAR,9.0,20.0
2,2015_01_CIN_OAK,LV,CIN,13.0,33.0
3,2015_01_CLE_NYJ,NYJ,CLE,31.0,10.0
4,2015_01_DET_SD,LAC,DET,33.0,28.0
...,...,...,...,...,...
2618,2024_18_NO_TB,TB,NO,27.0,19.0
2619,2024_18_NYG_PHI,PHI,NYG,20.0,13.0
2620,2024_18_SEA_LA,LA,SEA,25.0,30.0
2621,2024_18_SF_ARI,ARI,SF,47.0,24.0


In [6]:
#get scores for home team

home_scores = df_scores[['Game ID', 'Home Team', 'Max Home Score']]

home_scores = (home_scores.rename(
    columns={
            
            'Home Team': 'Team',
            'Max Home Score': 'Score'
    })
               
)
               
home_scores

Unnamed: 0,Game ID,Team,Score
0,2015_01_BAL_DEN,DEN,19.0
1,2015_01_CAR_JAX,JAX,9.0
2,2015_01_CIN_OAK,LV,13.0
3,2015_01_CLE_NYJ,NYJ,31.0
4,2015_01_DET_SD,LAC,33.0
...,...,...,...
2618,2024_18_NO_TB,TB,27.0
2619,2024_18_NYG_PHI,PHI,20.0
2620,2024_18_SEA_LA,LA,25.0
2621,2024_18_SF_ARI,ARI,47.0


In [7]:
#get scores for away team

away_scores = df_scores[['Game ID', 'Away Team', 'Max Away Score']]

away_scores = (away_scores.rename(
    columns={
            
            'Away Team': 'Team',
            'Max Away Score': 'Score'
    })
               
)
               
away_scores

Unnamed: 0,Game ID,Team,Score
0,2015_01_BAL_DEN,BAL,13.0
1,2015_01_CAR_JAX,CAR,20.0
2,2015_01_CIN_OAK,CIN,33.0
3,2015_01_CLE_NYJ,CLE,10.0
4,2015_01_DET_SD,DET,28.0
...,...,...,...
2618,2024_18_NO_TB,NO,19.0
2619,2024_18_NYG_PHI,NYG,13.0
2620,2024_18_SEA_LA,SEA,30.0
2621,2024_18_SF_ARI,SF,24.0


In [8]:
'''
Based on how the datasets ended being shaped/prepared, I no longer think this is necessary but leaving here for posterity
'''

scores_per_game = (
    df_scores.assign(
        Teams=df_scores[["Home Team", "Away Team"]].values.tolist(),
        Scores=df_scores[["Max Home Score", "Max Away Score"]].values.tolist()
    )
    .explode(["Teams", "Scores"])  
    .reset_index(drop=True)        
    .loc[:, ["Game ID", "Teams", "Scores"]]  
    .rename(columns={"Teams": "Team", "Scores": "Score"}) 
)

scores_per_game

Unnamed: 0,Game ID,Team,Score
0,2015_01_BAL_DEN,DEN,19.0
1,2015_01_BAL_DEN,BAL,13.0
2,2015_01_CAR_JAX,JAX,9.0
3,2015_01_CAR_JAX,CAR,20.0
4,2015_01_CIN_OAK,LV,13.0
...,...,...,...
5241,2024_18_SEA_LA,SEA,30.0
5242,2024_18_SF_ARI,ARI,47.0
5243,2024_18_SF_ARI,SF,24.0
5244,2024_18_WAS_DAL,DAL,19.0


In [9]:
'''
This is section -- dataframe -- 2. It is a collection of offensive stats that I am asserting are relevant to scoring points.
'''

'\nThis is section -- dataframe -- 2. It is a collection of offensive stats that I am asserting are relevant to scoring points.\n'

In [10]:
df_off_stats = df_twofour[['game_id', 'home_team', 'away_team', 'posteam', 'pass_length', 'air_yards', 'air_epa', 'yac_epa', 'comp_air_epa', 'comp_yac_epa', 'yards_gained', 'total_home_rush_epa', 'total_away_rush_epa', 'total_home_pass_epa', 'total_away_pass_epa']]

df_off_stats.head()

Unnamed: 0,game_id,home_team,away_team,posteam,pass_length,air_yards,air_epa,yac_epa,comp_air_epa,comp_yac_epa,yards_gained,total_home_rush_epa,total_away_rush_epa,total_home_pass_epa,total_away_pass_epa
0,2015_01_BAL_DEN,DEN,BAL,,,,,,,,,0.0,0.0,0.0,0.0
1,2015_01_BAL_DEN,DEN,BAL,BAL,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2015_01_BAL_DEN,DEN,BAL,BAL,short,3.0,-0.337139,0.0,-0.337139,0.0,3.0,0.0,0.0,0.337139,-0.337139
3,2015_01_BAL_DEN,DEN,BAL,BAL,,,,,0.0,0.0,2.0,0.262481,-0.262481,0.337139,-0.337139
4,2015_01_BAL_DEN,DEN,BAL,BAL,short,4.0,-0.791399,2.452641,-0.791399,2.452641,10.0,0.262481,-0.262481,-1.324103,1.324103


In [11]:
'''This next section aggregates offensive stats by team and game_id

for each home_team

pass epa avg
rush epa avg
air yards total
yards_gained avg
total_home_pass_epa

for each away_team
epa avg
pass epa per game
rush epa per game
air yards per game
avg yards_gained per game
total_home_pass_epa                   

'''



home_off_stats = (
    df_off_stats.groupby([
        'game_id', 'posteam', 'home_team'], as_index=False)
    .agg({        
                
        'air_yards': 'sum',
        'air_epa': 'mean',
        'yac_epa': 'mean',
        'yards_gained': 'mean',
        'total_home_rush_epa': 'last',
        'total_home_pass_epa': 'last'
        
    })
)

home_off_stats = home_off_stats[home_off_stats['posteam'] == home_off_stats['home_team']]

home_off_stats = home_off_stats[['game_id', 'home_team', 'air_yards', 'air_epa', 'yac_epa', 'yards_gained', 'total_home_rush_epa', 'total_home_pass_epa']]

home_off_stats

Unnamed: 0,game_id,home_team,air_yards,air_epa,yac_epa,yards_gained,total_home_rush_epa,total_home_pass_epa
1,2015_01_BAL_DEN,DEN,288.0,0.409077,-0.563374,2.406593,-0.466295,5.530431
3,2015_01_CAR_JAX,JAX,285.0,0.534663,-0.879461,3.312500,3.448772,-18.808174
5,2015_01_CIN_OAK,LV,223.0,0.272619,-0.168092,3.113924,-8.325734,-17.605350
7,2015_01_CLE_NYJ,NYJ,254.0,0.880318,-0.450067,4.625000,7.088470,14.367256
9,2015_01_DET_SD,LAC,153.0,-0.118145,0.513762,5.250000,-7.139055,13.664722
...,...,...,...,...,...,...,...,...
5237,2024_18_NO_TB,TB,258.0,0.823625,-0.500669,4.702381,3.531472,10.484469
5239,2024_18_NYG_PHI,PHI,270.0,0.290206,-0.005745,3.695122,-12.487288,16.192055
5240,2024_18_SEA_LA,LA,221.0,0.170221,0.047322,5.037500,-3.371261,-7.761229
5242,2024_18_SF_ARI,ARI,223.0,0.203110,0.025771,4.678571,9.089952,4.701149


In [12]:
'''
add rush and pass epa for home
'''

pass_epa_home = (
    df_twofour[
        (df_twofour['pass'] == 1) &
        (df_twofour['posteam'] == df_twofour['home_team'])
    ]
    .groupby(['game_id', 'home_team'])['epa']
    .mean()
    .reset_index()
    .rename(columns={'epa': 'pass_epa',
                    })
)


rush_epa_home = (
    df_twofour[
        (df_twofour['rush'] == 1) &
        (df_twofour['posteam'] == df_twofour['home_team'])
    ]
    .groupby(['game_id', 'home_team'])['epa']
    .mean()
    .reset_index()
    .rename(columns={'epa': 'rush_epa',
                   })
)

merged_home_epa = pd.merge(pass_epa_home, rush_epa_home, on = ["game_id", "home_team"], how = "inner")

merged_home_epa


Unnamed: 0,game_id,home_team,pass_epa,rush_epa
0,2015_01_BAL_DEN,DEN,-0.240782,-0.201215
1,2015_01_CAR_JAX,JAX,-0.440445,-0.017656
2,2015_01_CIN_OAK,LV,-0.087525,-0.424873
3,2015_01_CLE_NYJ,NYJ,0.430251,-0.044048
4,2015_01_DET_SD,LAC,0.288580,-0.227229
...,...,...,...,...
2618,2024_18_NO_TB,TB,0.389386,-0.163374
2619,2024_18_NYG_PHI,PHI,0.174808,-0.426006
2620,2024_18_SEA_LA,LA,0.044681,-0.023251
2621,2024_18_SF_ARI,ARI,0.311485,0.245257


In [13]:

mgd_home_off_stats = pd.merge(home_off_stats, merged_home_epa, on = ["game_id", "home_team"], how = "inner")
mgd_home_off_stats["is_home"]=1

mgd_home_off_stats = (
        mgd_home_off_stats.rename(columns={
            'game_id': 'Game ID',
            'home_team': 'Team',
            'total_home_rush_epa': 'tot_rush_epa',
            'total_home_pass_epa': 'tot_pass_epa'
        })
    )

mgd_home_off_stats


Unnamed: 0,Game ID,Team,air_yards,air_epa,yac_epa,yards_gained,tot_rush_epa,tot_pass_epa,pass_epa,rush_epa,is_home
0,2015_01_BAL_DEN,DEN,288.0,0.409077,-0.563374,2.406593,-0.466295,5.530431,-0.240782,-0.201215,1
1,2015_01_CAR_JAX,JAX,285.0,0.534663,-0.879461,3.312500,3.448772,-18.808174,-0.440445,-0.017656,1
2,2015_01_CIN_OAK,LV,223.0,0.272619,-0.168092,3.113924,-8.325734,-17.605350,-0.087525,-0.424873,1
3,2015_01_CLE_NYJ,NYJ,254.0,0.880318,-0.450067,4.625000,7.088470,14.367256,0.430251,-0.044048,1
4,2015_01_DET_SD,LAC,153.0,-0.118145,0.513762,5.250000,-7.139055,13.664722,0.288580,-0.227229,1
...,...,...,...,...,...,...,...,...,...,...,...
2618,2024_18_NO_TB,TB,258.0,0.823625,-0.500669,4.702381,3.531472,10.484469,0.389386,-0.163374,1
2619,2024_18_NYG_PHI,PHI,270.0,0.290206,-0.005745,3.695122,-12.487288,16.192055,0.174808,-0.426006,1
2620,2024_18_SEA_LA,LA,221.0,0.170221,0.047322,5.037500,-3.371261,-7.761229,0.044681,-0.023251,1
2621,2024_18_SF_ARI,ARI,223.0,0.203110,0.025771,4.678571,9.089952,4.701149,0.311485,0.245257,1


In [14]:
away_off_stats = (
    df_off_stats.groupby([
        'game_id', 'posteam', 'away_team'], as_index=False)
    .agg({        
               
        'air_yards': 'sum',
        'air_epa': 'mean',
        'yac_epa': 'mean',
        'yards_gained': 'mean',
        'total_away_rush_epa': 'last',
        'total_away_pass_epa': 'last'
        
    })
)

away_off_stats = away_off_stats[away_off_stats['posteam'] == away_off_stats['away_team']]

away_off_stats = away_off_stats[['game_id', 'away_team', 'air_yards', 'air_epa', 'yac_epa', 'yards_gained', 'total_away_rush_epa', 'total_away_pass_epa']]

away_off_stats

Unnamed: 0,game_id,away_team,air_yards,air_epa,yac_epa,yards_gained,total_away_rush_epa,total_away_pass_epa
0,2015_01_BAL_DEN,BAL,192.0,0.483464,-0.913622,2.369863,0.466295,-5.530431
2,2015_01_CAR_JAX,CAR,255.0,0.620840,-0.523357,3.130952,-3.448772,18.808174
4,2015_01_CIN_OAK,CIN,281.0,0.519265,-0.110283,4.771084,7.602132,17.605350
6,2015_01_CLE_NYJ,CLE,242.0,0.723360,-0.511172,3.867470,-5.123583,-14.821829
8,2015_01_DET_SD,DET,203.0,0.152260,-0.106199,4.718750,7.139055,-13.664722
...,...,...,...,...,...,...,...,...
5236,2024_18_NO_TB,NO,370.0,0.570054,-0.529343,3.560976,-3.531472,-11.079356
5238,2024_18_NYG_PHI,NYG,106.0,-0.152756,-0.146393,3.500000,12.487288,-16.192055
5241,2024_18_SEA_LA,SEA,215.0,0.323960,0.205319,4.732394,3.371261,7.761229
5243,2024_18_SF_ARI,SF,335.0,0.515233,-0.329818,4.688172,-10.304694,-4.701149


In [15]:
pass_epa_away = (
    df_twofour[
        (df_twofour['pass'] == 1) &
        (df_twofour['posteam'] == df_twofour['away_team'])
    ]
    .groupby(['game_id', 'away_team'])['epa']
    .mean()
    .reset_index()
    .rename(columns={'epa': 'pass_epa',
                     
                    })
)

rush_epa_away = (
    df_twofour[
        (df_twofour['rush'] == 1) &
        (df_twofour['posteam'] == df_twofour['away_team'])
    ]
    .groupby(['game_id', 'away_team'])['epa']
    .mean()
    .reset_index()
    .rename(columns={'epa': 'rush_epa',
                    })
)

merged_away_epa = pd.merge(pass_epa_away, rush_epa_away, on = ["game_id", "away_team"], how = "inner")
merged_away_epa


Unnamed: 0,game_id,away_team,pass_epa,rush_epa
0,2015_01_BAL_DEN,BAL,-0.475791,-0.189689
1,2015_01_CAR_JAX,CAR,-0.012959,-0.181480
2,2015_01_CIN_OAK,CIN,0.392640,0.014176
3,2015_01_CLE_NYJ,CLE,-0.237395,-0.236218
4,2015_01_DET_SD,DET,-0.040509,0.085140
...,...,...,...,...
2618,2024_18_NO_TB,NO,0.032712,-0.050440
2619,2024_18_NYG_PHI,NYG,-0.299149,0.090526
2620,2024_18_SEA_LA,SEA,0.327962,0.144192
2621,2024_18_SF_ARI,SF,0.046621,-0.087529


In [16]:
mgd_away_off_stats = pd.merge(away_off_stats, merged_away_epa, on = ["game_id", "away_team"], how = "inner")

mgd_away_off_stats["is_home"]=0

mgd_away_off_stats = (
        mgd_away_off_stats.rename(columns={
            'total_away_rush_epa': 'tot_rush_epa',
            'total_away_pass_epa': 'tot_pass_epa',
            'game_id': 'Game ID',
            'away_team': 'Team'
        })
    )

mgd_away_off_stats

Unnamed: 0,Game ID,Team,air_yards,air_epa,yac_epa,yards_gained,tot_rush_epa,tot_pass_epa,pass_epa,rush_epa,is_home
0,2015_01_BAL_DEN,BAL,192.0,0.483464,-0.913622,2.369863,0.466295,-5.530431,-0.475791,-0.189689,0
1,2015_01_CAR_JAX,CAR,255.0,0.620840,-0.523357,3.130952,-3.448772,18.808174,-0.012959,-0.181480,0
2,2015_01_CIN_OAK,CIN,281.0,0.519265,-0.110283,4.771084,7.602132,17.605350,0.392640,0.014176,0
3,2015_01_CLE_NYJ,CLE,242.0,0.723360,-0.511172,3.867470,-5.123583,-14.821829,-0.237395,-0.236218,0
4,2015_01_DET_SD,DET,203.0,0.152260,-0.106199,4.718750,7.139055,-13.664722,-0.040509,0.085140,0
...,...,...,...,...,...,...,...,...,...,...,...
2618,2024_18_NO_TB,NO,370.0,0.570054,-0.529343,3.560976,-3.531472,-11.079356,0.032712,-0.050440,0
2619,2024_18_NYG_PHI,NYG,106.0,-0.152756,-0.146393,3.500000,12.487288,-16.192055,-0.299149,0.090526,0
2620,2024_18_SEA_LA,SEA,215.0,0.323960,0.205319,4.732394,3.371261,7.761229,0.327962,0.144192,0
2621,2024_18_SF_ARI,SF,335.0,0.515233,-0.329818,4.688172,-10.304694,-4.701149,0.046621,-0.087529,0


In [17]:
'''

Section Break

Moving to Defensive Stats

'''

'\n\nSection Break\n\nMoving to Defensive Stats\n\n'

In [18]:
#Rush epa/game
#home first

    
def_epa_rush_home = (
    df_twofour[(df_twofour['rush'] == 1) & (df_twofour['home_team'] ==df_twofour['defteam'])].groupby([
        'game_id', 'defteam', 'home_team'], as_index=False)
    .agg({        
        'epa': 'mean'
  
    })
)



def_epa_rush_home = (def_epa_rush_home.rename(
    columns={
            'game_id': 'Game ID',
            'home_team': 'Team',
            'epa': "Def Rush EPA"
        })
    )

def_epa_rush_home.drop(columns=['defteam'], inplace=True)

def_epa_rush_home




Unnamed: 0,Game ID,Team,Def Rush EPA
0,2015_01_BAL_DEN,DEN,-0.189689
1,2015_01_CAR_JAX,JAX,-0.181480
2,2015_01_CIN_OAK,LV,0.014176
3,2015_01_CLE_NYJ,NYJ,-0.236218
4,2015_01_DET_SD,LAC,0.085140
...,...,...,...
2618,2024_18_NO_TB,TB,-0.050440
2619,2024_18_NYG_PHI,PHI,0.090526
2620,2024_18_SEA_LA,LA,0.144192
2621,2024_18_SF_ARI,ARI,-0.087529


In [19]:
#Rush epa/game
#away 

def_epa_rush_away = (
    df_twofour[(df_twofour['rush'] == 1) & (df_twofour['away_team'] == df_twofour['defteam'])].groupby([
        'game_id', 'defteam', 'away_team'], as_index=False)
    .agg({        
        'epa': 'mean'
  
    })
)

def_epa_rush_away = (def_epa_rush_away.rename(
    columns={
            'game_id': 'Game ID',
            'away_team': 'Team',
            'epa': "Def Rush EPA"
        })
    )

def_epa_rush_away.drop(columns=['defteam'], inplace=True)

def_epa_rush_away




Unnamed: 0,Game ID,Team,Def Rush EPA
0,2015_01_BAL_DEN,BAL,-0.201215
1,2015_01_CAR_JAX,CAR,-0.017656
2,2015_01_CIN_OAK,CIN,-0.424873
3,2015_01_CLE_NYJ,CLE,-0.044048
4,2015_01_DET_SD,DET,-0.227229
...,...,...,...
2618,2024_18_NO_TB,NO,-0.163374
2619,2024_18_NYG_PHI,NYG,-0.426006
2620,2024_18_SEA_LA,SEA,-0.023251
2621,2024_18_SF_ARI,SF,0.245257


In [20]:
'''
Defensive EPA - Pass & Home Team
'''


def_epa_pass_home = (
    df_twofour[(df_twofour['pass'] == 1) & (df_twofour['home_team'] == df_twofour['defteam'])].groupby([
        'game_id', 'defteam', 'home_team'], as_index=False)
    .agg({        
        'epa': 'mean'
  
    })
)

def_epa_pass_home = (def_epa_pass_home.rename(
    columns={
            'game_id': 'Game ID',
            'home_team': 'Team',
            'epa': "Def Pass EPA"
        })
    )

def_epa_pass_home.drop(columns=['defteam'], inplace=True)

def_epa_pass_home


Unnamed: 0,Game ID,Team,Def Pass EPA
0,2015_01_BAL_DEN,DEN,-0.475791
1,2015_01_CAR_JAX,JAX,-0.012959
2,2015_01_CIN_OAK,LV,0.392640
3,2015_01_CLE_NYJ,NYJ,-0.237395
4,2015_01_DET_SD,LAC,-0.040509
...,...,...,...
2618,2024_18_NO_TB,TB,0.032712
2619,2024_18_NYG_PHI,PHI,-0.299149
2620,2024_18_SEA_LA,LA,0.327962
2621,2024_18_SF_ARI,ARI,0.046621


In [21]:
'''
Defensive EPA - Pass & Away Team
'''


def_epa_pass_away = (
    df_twofour[(df_twofour['pass'] == 1) & (df_twofour['away_team'] == df_twofour['defteam'])].groupby([
        'game_id', 'defteam', 'away_team'], as_index=False)
    .agg({        
        'epa': 'mean'
  
    })
)

def_epa_pass_away = (def_epa_pass_away.rename(
    columns={
            'game_id': 'Game ID',
            'away_team': 'Team',
            'epa': "Def Pass EPA"
        })
    )

def_epa_pass_away.drop(columns=['defteam'], inplace=True)

def_epa_pass_away

Unnamed: 0,Game ID,Team,Def Pass EPA
0,2015_01_BAL_DEN,BAL,-0.240782
1,2015_01_CAR_JAX,CAR,-0.440445
2,2015_01_CIN_OAK,CIN,-0.087525
3,2015_01_CLE_NYJ,CLE,0.430251
4,2015_01_DET_SD,DET,0.288580
...,...,...,...
2618,2024_18_NO_TB,NO,0.389386
2619,2024_18_NYG_PHI,NYG,0.174808
2620,2024_18_SEA_LA,SEA,0.044681
2621,2024_18_SF_ARI,SF,0.311485


In [22]:


def_stats_home = df_twofour[['game_id', 'defteam','home_team', 'epa', 'interception', 'fumble_forced', 'tackled_for_loss', 'sack', 'qb_hit']] 


def_stats_home = (
    def_stats_home[(df_twofour['home_team'] == df_twofour['defteam'])].groupby([
        'game_id', 'defteam', 'home_team'], as_index=False)
    .agg({
        'interception': 'sum',
        'fumble_forced': 'sum',
        'tackled_for_loss': 'sum',
        'sack': 'sum',
        'qb_hit': 'sum'})
)


def_stats_home = (def_stats_home.rename(
    columns={
            'game_id': 'Game ID',
            'home_team': 'Team'            
        })
    )

def_stats_home.drop(columns=['defteam'], inplace=True)

def_stats_home   



Unnamed: 0,Game ID,Team,interception,fumble_forced,tackled_for_loss,sack,qb_hit
0,2015_01_BAL_DEN,DEN,2.0,0.0,3.0,2.0,7.0
1,2015_01_CAR_JAX,JAX,1.0,0.0,3.0,2.0,2.0
2,2015_01_CIN_OAK,LV,0.0,0.0,2.0,0.0,2.0
3,2015_01_CLE_NYJ,NYJ,1.0,3.0,3.0,3.0,5.0
4,2015_01_DET_SD,LAC,2.0,1.0,2.0,1.0,3.0
...,...,...,...,...,...,...,...
2618,2024_18_NO_TB,TB,0.0,0.0,4.0,2.0,6.0
2619,2024_18_NYG_PHI,PHI,1.0,0.0,0.0,0.0,2.0
2620,2024_18_SEA_LA,LA,0.0,0.0,2.0,2.0,3.0
2621,2024_18_SF_ARI,ARI,2.0,4.0,2.0,2.0,4.0


In [23]:
def_stats_away = df_twofour[['game_id', 'defteam','away_team', 'epa', 'interception', 'fumble_forced', 'tackled_for_loss', 'sack', 'qb_hit']] 


def_stats_away = (
    def_stats_away[(df_twofour['away_team'] == df_twofour['defteam'])].groupby([
        'game_id', 'defteam', 'away_team'], as_index=False)
    .agg({
        'interception': 'sum',
        'fumble_forced': 'sum',
        'tackled_for_loss': 'sum',
        'sack': 'sum',
        'qb_hit': 'sum'})
)


def_stats_away = (def_stats_away.rename(
    columns={
            'game_id': 'Game ID',
            'away_team': 'Team'            
        })
    )

def_stats_away.drop(columns=['defteam'], inplace=True)

def_stats_away      




Unnamed: 0,Game ID,Team,interception,fumble_forced,tackled_for_loss,sack,qb_hit
0,2015_01_BAL_DEN,BAL,1.0,0.0,3.0,4.0,5.0
1,2015_01_CAR_JAX,CAR,2.0,1.0,3.0,5.0,7.0
2,2015_01_CIN_OAK,CIN,1.0,2.0,0.0,2.0,5.0
3,2015_01_CLE_NYJ,CLE,1.0,1.0,1.0,0.0,2.0
4,2015_01_DET_SD,DET,2.0,1.0,3.0,2.0,6.0
...,...,...,...,...,...,...,...
2618,2024_18_NO_TB,NO,1.0,0.0,3.0,1.0,1.0
2619,2024_18_NYG_PHI,NYG,0.0,0.0,3.0,2.0,7.0
2620,2024_18_SEA_LA,SEA,1.0,0.0,0.0,3.0,9.0
2621,2024_18_SF_ARI,SF,0.0,0.0,2.0,0.0,2.0


In [24]:
'''
Consolidating defense stats

first merge home: Rush EPA, Pass EPA

'''

merged_home_df = pd.merge(def_epa_rush_home, def_epa_pass_home, on = ["Game ID", "Team"], how = "inner")

merged_home_df = pd.merge(merged_home_df, def_stats_home, on = ["Game ID", "Team"], how = "inner")

#merged_home_df["is_home"]=1

merged_home_df



Unnamed: 0,Game ID,Team,Def Rush EPA,Def Pass EPA,interception,fumble_forced,tackled_for_loss,sack,qb_hit
0,2015_01_BAL_DEN,DEN,-0.189689,-0.475791,2.0,0.0,3.0,2.0,7.0
1,2015_01_CAR_JAX,JAX,-0.181480,-0.012959,1.0,0.0,3.0,2.0,2.0
2,2015_01_CIN_OAK,LV,0.014176,0.392640,0.0,0.0,2.0,0.0,2.0
3,2015_01_CLE_NYJ,NYJ,-0.236218,-0.237395,1.0,3.0,3.0,3.0,5.0
4,2015_01_DET_SD,LAC,0.085140,-0.040509,2.0,1.0,2.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...
2618,2024_18_NO_TB,TB,-0.050440,0.032712,0.0,0.0,4.0,2.0,6.0
2619,2024_18_NYG_PHI,PHI,0.090526,-0.299149,1.0,0.0,0.0,0.0,2.0
2620,2024_18_SEA_LA,LA,0.144192,0.327962,0.0,0.0,2.0,2.0,3.0
2621,2024_18_SF_ARI,ARI,-0.087529,0.046621,2.0,4.0,2.0,2.0,4.0


In [25]:
'''
now Consolidating away stats

first merge home: Rush EPA, Pass EPA

'''

merged_away_df = pd.merge(def_epa_rush_away, def_epa_pass_away, on = ["Game ID", "Team"], how = "inner")

merged_away_df = pd.merge(merged_away_df, def_stats_away, on = ["Game ID", "Team"], how = "inner")

#merged_away_df["is_home"]=0

merged_away_df

Unnamed: 0,Game ID,Team,Def Rush EPA,Def Pass EPA,interception,fumble_forced,tackled_for_loss,sack,qb_hit
0,2015_01_BAL_DEN,BAL,-0.201215,-0.240782,1.0,0.0,3.0,4.0,5.0
1,2015_01_CAR_JAX,CAR,-0.017656,-0.440445,2.0,1.0,3.0,5.0,7.0
2,2015_01_CIN_OAK,CIN,-0.424873,-0.087525,1.0,2.0,0.0,2.0,5.0
3,2015_01_CLE_NYJ,CLE,-0.044048,0.430251,1.0,1.0,1.0,0.0,2.0
4,2015_01_DET_SD,DET,-0.227229,0.288580,2.0,1.0,3.0,2.0,6.0
...,...,...,...,...,...,...,...,...,...
2618,2024_18_NO_TB,NO,-0.163374,0.389386,1.0,0.0,3.0,1.0,1.0
2619,2024_18_NYG_PHI,NYG,-0.426006,0.174808,0.0,0.0,3.0,2.0,7.0
2620,2024_18_SEA_LA,SEA,-0.023251,0.044681,1.0,0.0,0.0,3.0,9.0
2621,2024_18_SF_ARI,SF,0.245257,0.311485,0.0,0.0,2.0,0.0,2.0


In [26]:
'''
To prepare data for our model we'll now need to consolidate a Team's offensive data with the opponents defensive data.

This means Home Offense will be merged with Away Defense /// Away Offense with Home Defense.

To do this we'll first rename our defensive Column Headers with "opponent"

'''


'\nTo prepare data for our model we\'ll now need to consolidate a Team\'s offensive data with the opponents defensive data.\n\nThis means Home Offense will be merged with Away Defense /// Away Offense with Home Defense.\n\nTo do this we\'ll first rename our defensive Column Headers with "opponent"\n\n'

In [27]:
#rename home def

merged_home_df = (merged_home_df.rename(
    columns={
            'Team': 'Opp_Team',
            'interception': 'o_ints',
            'fumble_forced': 'o_forced_fumble',
            'tackled_for_loss': 'o_tac_for_loss',
            'sack': 'o_sacks',
                    
        })
    )

merged_home_df



Unnamed: 0,Game ID,Opp_Team,Def Rush EPA,Def Pass EPA,o_ints,o_forced_fumble,o_tac_for_loss,o_sacks,qb_hit
0,2015_01_BAL_DEN,DEN,-0.189689,-0.475791,2.0,0.0,3.0,2.0,7.0
1,2015_01_CAR_JAX,JAX,-0.181480,-0.012959,1.0,0.0,3.0,2.0,2.0
2,2015_01_CIN_OAK,LV,0.014176,0.392640,0.0,0.0,2.0,0.0,2.0
3,2015_01_CLE_NYJ,NYJ,-0.236218,-0.237395,1.0,3.0,3.0,3.0,5.0
4,2015_01_DET_SD,LAC,0.085140,-0.040509,2.0,1.0,2.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...
2618,2024_18_NO_TB,TB,-0.050440,0.032712,0.0,0.0,4.0,2.0,6.0
2619,2024_18_NYG_PHI,PHI,0.090526,-0.299149,1.0,0.0,0.0,0.0,2.0
2620,2024_18_SEA_LA,LA,0.144192,0.327962,0.0,0.0,2.0,2.0,3.0
2621,2024_18_SF_ARI,ARI,-0.087529,0.046621,2.0,4.0,2.0,2.0,4.0


In [28]:
#rename away_df

merged_away_df = (merged_away_df.rename(
    columns={
            
            'Team': 'Opp_Team',
            'interception': 'o_ints',
            'fumble_forced': 'o_forced_fumble',
            'tackled_for_loss': 'o_tac_for_loss',
            'sack': 'o_sacks',
                    
        })
    )



merged_away_df

Unnamed: 0,Game ID,Opp_Team,Def Rush EPA,Def Pass EPA,o_ints,o_forced_fumble,o_tac_for_loss,o_sacks,qb_hit
0,2015_01_BAL_DEN,BAL,-0.201215,-0.240782,1.0,0.0,3.0,4.0,5.0
1,2015_01_CAR_JAX,CAR,-0.017656,-0.440445,2.0,1.0,3.0,5.0,7.0
2,2015_01_CIN_OAK,CIN,-0.424873,-0.087525,1.0,2.0,0.0,2.0,5.0
3,2015_01_CLE_NYJ,CLE,-0.044048,0.430251,1.0,1.0,1.0,0.0,2.0
4,2015_01_DET_SD,DET,-0.227229,0.288580,2.0,1.0,3.0,2.0,6.0
...,...,...,...,...,...,...,...,...,...
2618,2024_18_NO_TB,NO,-0.163374,0.389386,1.0,0.0,3.0,1.0,1.0
2619,2024_18_NYG_PHI,NYG,-0.426006,0.174808,0.0,0.0,3.0,2.0,7.0
2620,2024_18_SEA_LA,SEA,-0.023251,0.044681,1.0,0.0,0.0,3.0,9.0
2621,2024_18_SF_ARI,SF,0.245257,0.311485,0.0,0.0,2.0,0.0,2.0


In [29]:
merged_home_nums = pd.merge(home_scores, mgd_home_off_stats, on = ["Game ID", "Team"], how = "inner")

merged_home_nums = pd.merge(merged_home_nums, merged_away_df, on = ["Game ID"], how = "inner")

merged_home_nums

Unnamed: 0,Game ID,Team,Score,air_yards,air_epa,yac_epa,yards_gained,tot_rush_epa,tot_pass_epa,pass_epa,rush_epa,is_home,Opp_Team,Def Rush EPA,Def Pass EPA,o_ints,o_forced_fumble,o_tac_for_loss,o_sacks,qb_hit
0,2015_01_BAL_DEN,DEN,19.0,288.0,0.409077,-0.563374,2.406593,-0.466295,5.530431,-0.240782,-0.201215,1,BAL,-0.201215,-0.240782,1.0,0.0,3.0,4.0,5.0
1,2015_01_CAR_JAX,JAX,9.0,285.0,0.534663,-0.879461,3.312500,3.448772,-18.808174,-0.440445,-0.017656,1,CAR,-0.017656,-0.440445,2.0,1.0,3.0,5.0,7.0
2,2015_01_CIN_OAK,LV,13.0,223.0,0.272619,-0.168092,3.113924,-8.325734,-17.605350,-0.087525,-0.424873,1,CIN,-0.424873,-0.087525,1.0,2.0,0.0,2.0,5.0
3,2015_01_CLE_NYJ,NYJ,31.0,254.0,0.880318,-0.450067,4.625000,7.088470,14.367256,0.430251,-0.044048,1,CLE,-0.044048,0.430251,1.0,1.0,1.0,0.0,2.0
4,2015_01_DET_SD,LAC,33.0,153.0,-0.118145,0.513762,5.250000,-7.139055,13.664722,0.288580,-0.227229,1,DET,-0.227229,0.288580,2.0,1.0,3.0,2.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2618,2024_18_NO_TB,TB,27.0,258.0,0.823625,-0.500669,4.702381,3.531472,10.484469,0.389386,-0.163374,1,NO,-0.163374,0.389386,1.0,0.0,3.0,1.0,1.0
2619,2024_18_NYG_PHI,PHI,20.0,270.0,0.290206,-0.005745,3.695122,-12.487288,16.192055,0.174808,-0.426006,1,NYG,-0.426006,0.174808,0.0,0.0,3.0,2.0,7.0
2620,2024_18_SEA_LA,LA,25.0,221.0,0.170221,0.047322,5.037500,-3.371261,-7.761229,0.044681,-0.023251,1,SEA,-0.023251,0.044681,1.0,0.0,0.0,3.0,9.0
2621,2024_18_SF_ARI,ARI,47.0,223.0,0.203110,0.025771,4.678571,9.089952,4.701149,0.311485,0.245257,1,SF,0.245257,0.311485,0.0,0.0,2.0,0.0,2.0


In [30]:
merged_away_nums = pd.merge(away_scores, mgd_away_off_stats, on = ["Game ID", "Team"], how = "inner")

merged_away_nums = pd.merge(merged_away_nums, merged_home_df, on = ["Game ID"], how = "inner")

merged_away_nums

Unnamed: 0,Game ID,Team,Score,air_yards,air_epa,yac_epa,yards_gained,tot_rush_epa,tot_pass_epa,pass_epa,rush_epa,is_home,Opp_Team,Def Rush EPA,Def Pass EPA,o_ints,o_forced_fumble,o_tac_for_loss,o_sacks,qb_hit
0,2015_01_BAL_DEN,BAL,13.0,192.0,0.483464,-0.913622,2.369863,0.466295,-5.530431,-0.475791,-0.189689,0,DEN,-0.189689,-0.475791,2.0,0.0,3.0,2.0,7.0
1,2015_01_CAR_JAX,CAR,20.0,255.0,0.620840,-0.523357,3.130952,-3.448772,18.808174,-0.012959,-0.181480,0,JAX,-0.181480,-0.012959,1.0,0.0,3.0,2.0,2.0
2,2015_01_CIN_OAK,CIN,33.0,281.0,0.519265,-0.110283,4.771084,7.602132,17.605350,0.392640,0.014176,0,LV,0.014176,0.392640,0.0,0.0,2.0,0.0,2.0
3,2015_01_CLE_NYJ,CLE,10.0,242.0,0.723360,-0.511172,3.867470,-5.123583,-14.821829,-0.237395,-0.236218,0,NYJ,-0.236218,-0.237395,1.0,3.0,3.0,3.0,5.0
4,2015_01_DET_SD,DET,28.0,203.0,0.152260,-0.106199,4.718750,7.139055,-13.664722,-0.040509,0.085140,0,LAC,0.085140,-0.040509,2.0,1.0,2.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2618,2024_18_NO_TB,NO,19.0,370.0,0.570054,-0.529343,3.560976,-3.531472,-11.079356,0.032712,-0.050440,0,TB,-0.050440,0.032712,0.0,0.0,4.0,2.0,6.0
2619,2024_18_NYG_PHI,NYG,13.0,106.0,-0.152756,-0.146393,3.500000,12.487288,-16.192055,-0.299149,0.090526,0,PHI,0.090526,-0.299149,1.0,0.0,0.0,0.0,2.0
2620,2024_18_SEA_LA,SEA,30.0,215.0,0.323960,0.205319,4.732394,3.371261,7.761229,0.327962,0.144192,0,LA,0.144192,0.327962,0.0,0.0,2.0,2.0,3.0
2621,2024_18_SF_ARI,SF,24.0,335.0,0.515233,-0.329818,4.688172,-10.304694,-4.701149,0.046621,-0.087529,0,ARI,-0.087529,0.046621,2.0,4.0,2.0,2.0,4.0


In [31]:
#all combined data concantenated to prep for model training

train_df = pd.concat([merged_home_nums, merged_away_nums], ignore_index=True)

train_df

Unnamed: 0,Game ID,Team,Score,air_yards,air_epa,yac_epa,yards_gained,tot_rush_epa,tot_pass_epa,pass_epa,rush_epa,is_home,Opp_Team,Def Rush EPA,Def Pass EPA,o_ints,o_forced_fumble,o_tac_for_loss,o_sacks,qb_hit
0,2015_01_BAL_DEN,DEN,19.0,288.0,0.409077,-0.563374,2.406593,-0.466295,5.530431,-0.240782,-0.201215,1,BAL,-0.201215,-0.240782,1.0,0.0,3.0,4.0,5.0
1,2015_01_CAR_JAX,JAX,9.0,285.0,0.534663,-0.879461,3.312500,3.448772,-18.808174,-0.440445,-0.017656,1,CAR,-0.017656,-0.440445,2.0,1.0,3.0,5.0,7.0
2,2015_01_CIN_OAK,LV,13.0,223.0,0.272619,-0.168092,3.113924,-8.325734,-17.605350,-0.087525,-0.424873,1,CIN,-0.424873,-0.087525,1.0,2.0,0.0,2.0,5.0
3,2015_01_CLE_NYJ,NYJ,31.0,254.0,0.880318,-0.450067,4.625000,7.088470,14.367256,0.430251,-0.044048,1,CLE,-0.044048,0.430251,1.0,1.0,1.0,0.0,2.0
4,2015_01_DET_SD,LAC,33.0,153.0,-0.118145,0.513762,5.250000,-7.139055,13.664722,0.288580,-0.227229,1,DET,-0.227229,0.288580,2.0,1.0,3.0,2.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5241,2024_18_NO_TB,NO,19.0,370.0,0.570054,-0.529343,3.560976,-3.531472,-11.079356,0.032712,-0.050440,0,TB,-0.050440,0.032712,0.0,0.0,4.0,2.0,6.0
5242,2024_18_NYG_PHI,NYG,13.0,106.0,-0.152756,-0.146393,3.500000,12.487288,-16.192055,-0.299149,0.090526,0,PHI,0.090526,-0.299149,1.0,0.0,0.0,0.0,2.0
5243,2024_18_SEA_LA,SEA,30.0,215.0,0.323960,0.205319,4.732394,3.371261,7.761229,0.327962,0.144192,0,LA,0.144192,0.327962,0.0,0.0,2.0,2.0,3.0
5244,2024_18_SF_ARI,SF,24.0,335.0,0.515233,-0.329818,4.688172,-10.304694,-4.701149,0.046621,-0.087529,0,ARI,-0.087529,0.046621,2.0,4.0,2.0,2.0,4.0


In [1]:
os.makedirs("../data", exist_ok=True)

train_df.to_parquet("../data/train_df.parquet")

NameError: name 'os' is not defined

In [33]:
'''
End of file
'''

#train_df.to_clipboard()

'\nEnd of file\n'