In [1]:
import nfl_data_py as nfl
import pandas as pd
import os
import numpy as np
import urllib.request
import matplotlib.pyplot as plt
from matplotlib.offsetbox import AnnotationBbox, OffsetImage
import matplotlib.image as mpimg

In [4]:
# This section loads and cleans the play-by-play data.
# It can be easily reused.

# Seasons 2015 through 2024 inclusive (the end of range() is exclusive).
df_twofour = nfl.import_pbp_data(list(range(2015, 2025)), downcast=True, cache=False, alt_path=None, include_participation=False)

# Regular season data only.
# .copy() gives an independent frame, so the play_type assignments below are made
# on their own data (guards against pandas' SettingWithCopyWarning on filtered frames).
df_twofour = df_twofour.loc[df_twofour.season_type == 'REG'].copy()

# The line below would remove kick offs, field goals, kneels, etc. and keep only
# plays with passes, runs, and penalties. It is intentionally left disabled
# because it can impact max scores.
#df_twofour = df_twofour.loc[(df_twofour.play_type.isin(['no_play','pass','run'])) & (df_twofour.epa.isna()==False)]

# Match play call to play type, so QB scrambles are still considered pass plays.
df_twofour.loc[df_twofour['pass'] == 1, 'play_type'] = 'pass'
df_twofour.loc[df_twofour['rush'] == 1, 'play_type'] = 'run'

# Reset the index to skip the numbers left missing by the filter above.
# Reassigning (rather than inplace=True) keeps the step explicit.
df_twofour = df_twofour.reset_index(drop=True)

df_twofour.head()


2015 done.
2016 done.
2017 done.
2018 done.
2019 done.
2020 done.
2021 done.
2022 done.
2023 done.
2024 done.
Downcasting floats.


Unnamed: 0,play_id,game_id,old_game_id,home_team,away_team,season_type,week,posteam,posteam_type,defteam,...,out_of_bounds,home_opening_kickoff,qb_epa,xyac_epa,xyac_mean_yardage,xyac_median_yardage,xyac_success,xyac_fd,xpass,pass_oe
0,1.0,2015_01_BAL_DEN,2015091309,DEN,BAL,REG,1,,,,...,0.0,0.0,-0.0,,,,,,,
1,36.0,2015_01_BAL_DEN,2015091309,DEN,BAL,REG,1,BAL,away,DEN,...,0.0,0.0,-0.0,,,,,,,
2,51.0,2015_01_BAL_DEN,2015091309,DEN,BAL,REG,1,BAL,away,DEN,...,0.0,0.0,-0.337139,0.691718,4.699278,3.0,0.678964,0.225919,0.456481,54.35191
3,75.0,2015_01_BAL_DEN,2015091309,DEN,BAL,REG,1,BAL,away,DEN,...,0.0,0.0,-0.262481,,,,,,0.545905,-54.590458
4,96.0,2015_01_BAL_DEN,2015091309,DEN,BAL,REG,1,BAL,away,DEN,...,0.0,0.0,1.661242,1.794671,4.66265,2.0,0.71235,0.71235,0.968533,3.146732


In [22]:
# Mean EPA per game over rows where the `pass` flag is 1 and the team in
# possession (`posteam`) is the game's home team.
pass_epa_home = (
    df_twofour
    .loc[
        df_twofour['pass'].eq(1)
        & df_twofour['posteam'].eq(df_twofour['home_team'])
    ]
    .groupby(['game_id', 'home_team'])['epa']
    .mean()
    .reset_index(name='pass_epa')        # value column is named pass_epa
    .rename(columns={'home_team': 'Team'})
)

pass_epa_home

Unnamed: 0,game_id,Team,pass_epa
0,2015_01_BAL_DEN,DEN,-0.240782
1,2015_01_CAR_JAX,JAX,-0.440445
2,2015_01_CIN_OAK,LV,-0.087525
3,2015_01_CLE_NYJ,NYJ,0.430251
4,2015_01_DET_SD,LAC,0.288580
...,...,...,...
2618,2024_18_NO_TB,TB,0.389386
2619,2024_18_NYG_PHI,PHI,0.174808
2620,2024_18_SEA_LA,LA,0.044681
2621,2024_18_SF_ARI,ARI,0.311485


In [23]:
# Mean EPA per game over rows where the `pass` flag is 1 and the team in
# possession (`posteam`) is the game's away team.
pass_epa_away = (
    df_twofour
    .loc[
        df_twofour['pass'].eq(1)
        & df_twofour['posteam'].eq(df_twofour['away_team'])
    ]
    .groupby(['game_id', 'away_team'])['epa']
    .mean()
    .reset_index(name='pass_epa')        # value column is named pass_epa
    .rename(columns={'away_team': 'Team'})
)

pass_epa_away

Unnamed: 0,game_id,Team,pass_epa
0,2015_01_BAL_DEN,BAL,-0.475791
1,2015_01_CAR_JAX,CAR,-0.012959
2,2015_01_CIN_OAK,CIN,0.392640
3,2015_01_CLE_NYJ,CLE,-0.237395
4,2015_01_DET_SD,DET,-0.040509
...,...,...,...
2618,2024_18_NO_TB,NO,0.032712
2619,2024_18_NYG_PHI,NYG,-0.299149
2620,2024_18_SEA_LA,SEA,0.327962
2621,2024_18_SF_ARI,SF,0.046621


In [24]:
# Mean EPA per game over rows where the `rush` flag is 1 and the team in
# possession (`posteam`) is the game's home team.
rush_epa_home = (
    df_twofour
    .loc[
        df_twofour['rush'].eq(1)
        & df_twofour['posteam'].eq(df_twofour['home_team'])
    ]
    .groupby(['game_id', 'home_team'])['epa']
    .mean()
    .reset_index(name='rush_epa')        # value column is named rush_epa
    .rename(columns={'home_team': 'Team'})
)


rush_epa_home

Unnamed: 0,game_id,Team,rush_epa
0,2015_01_BAL_DEN,DEN,-0.201215
1,2015_01_CAR_JAX,JAX,-0.017656
2,2015_01_CIN_OAK,LV,-0.424873
3,2015_01_CLE_NYJ,NYJ,-0.044048
4,2015_01_DET_SD,LAC,-0.227229
...,...,...,...
2618,2024_18_NO_TB,TB,-0.163374
2619,2024_18_NYG_PHI,PHI,-0.426006
2620,2024_18_SEA_LA,LA,-0.023251
2621,2024_18_SF_ARI,ARI,0.245257


In [25]:
# Mean EPA per game over rows where the `rush` flag is 1 and the team in
# possession (`posteam`) is the game's away team.
rush_epa_away = (
    df_twofour
    .loc[
        df_twofour['rush'].eq(1)
        & df_twofour['posteam'].eq(df_twofour['away_team'])
    ]
    .groupby(['game_id', 'away_team'])['epa']
    .mean()
    .reset_index(name='rush_epa')        # value column is named rush_epa
    .rename(columns={'away_team': 'Team'})
)


rush_epa_away

Unnamed: 0,game_id,Team,rush_epa
0,2015_01_BAL_DEN,BAL,-0.189689
1,2015_01_CAR_JAX,CAR,-0.181480
2,2015_01_CIN_OAK,CIN,0.014176
3,2015_01_CLE_NYJ,CLE,-0.236218
4,2015_01_DET_SD,DET,0.085140
...,...,...,...
2618,2024_18_NO_TB,NO,-0.050440
2619,2024_18_NYG_PHI,NYG,0.090526
2620,2024_18_SEA_LA,SEA,0.144192
2621,2024_18_SF_ARI,SF,-0.087529


In [33]:
# Join the home pass and rush EPA frames into one row per (game_id, Team).
# The inner join keeps only keys that appear in both inputs.
# validate="one_to_one" makes pandas raise a MergeError if either input holds a
# duplicated (game_id, Team) key, instead of silently multiplying rows.
merged_home_epa = pd.merge(
    pass_epa_home,
    rush_epa_home,
    on=["game_id", "Team"],
    how="inner",
    validate="one_to_one",
)


merged_home_epa

Unnamed: 0,game_id,Team,pass_epa,rush_epa
0,2015_01_BAL_DEN,DEN,-0.240782,-0.201215
1,2015_01_CAR_JAX,JAX,-0.440445,-0.017656
2,2015_01_CIN_OAK,LV,-0.087525,-0.424873
3,2015_01_CLE_NYJ,NYJ,0.430251,-0.044048
4,2015_01_DET_SD,LAC,0.288580,-0.227229
...,...,...,...,...
2618,2024_18_NO_TB,TB,0.389386,-0.163374
2619,2024_18_NYG_PHI,PHI,0.174808,-0.426006
2620,2024_18_SEA_LA,LA,0.044681,-0.023251
2621,2024_18_SF_ARI,ARI,0.311485,0.245257


In [34]:
# Join the away pass and rush EPA frames into one row per (game_id, Team).
# The inner join keeps only keys that appear in both inputs.
# validate="one_to_one" makes pandas raise a MergeError if either input holds a
# duplicated (game_id, Team) key, instead of silently multiplying rows.
merged_away_epa = pd.merge(
    pass_epa_away,
    rush_epa_away,
    on=["game_id", "Team"],
    how="inner",
    validate="one_to_one",
)


merged_away_epa

Unnamed: 0,game_id,Team,pass_epa,rush_epa
0,2015_01_BAL_DEN,BAL,-0.475791,-0.189689
1,2015_01_CAR_JAX,CAR,-0.012959,-0.181480
2,2015_01_CIN_OAK,CIN,0.392640,0.014176
3,2015_01_CLE_NYJ,CLE,-0.237395,-0.236218
4,2015_01_DET_SD,DET,-0.040509,0.085140
...,...,...,...,...
2618,2024_18_NO_TB,NO,0.032712,-0.050440
2619,2024_18_NYG_PHI,NYG,-0.299149,0.090526
2620,2024_18_SEA_LA,SEA,0.327962,0.144192
2621,2024_18_SF_ARI,SF,0.046621,-0.087529


In [None]:
# TODO: next step is to figure out how to merge these results with my other sheet.