In [1]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
import nfl_data_py as nfl
import ssl
# NOTE(review): this swaps the process-wide default HTTPS context factory.
# In CPython `ssl._create_stdlib_context` appears to be an alias of the
# *unverified* context factory, so this presumably turns off TLS certificate
# verification for every urllib-based download in the session — confirm this is
# intended; fixing the local CA bundle would be the safer workaround.
ssl._create_default_https_context = ssl._create_stdlib_context
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
import warnings
# Suppress every warning for the rest of the session. This also hides pandas
# deprecation and chained-assignment warnings raised by the cells below.
warnings.filterwarnings('ignore')

# nfl-data-py

In [2]:
# Year ranges used by the import cells below.
#   seasons     -> play-by-play / schedule / QBR / roster / snap-count seasons
#   draft_years -> combine and draft classes
# Original author's note: 1999-2021 is available for analysis, some properties
# are missing in earlier years, and offensive formation is available starting in 2022.
seasons = list(range(2022, 2024))
draft_years = list(range(2000, 2024))

print(seasons)
print(draft_years)

[2022, 2023]
[2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]


#### Data dictionary

https://nflreadr.nflverse.com/articles/dictionary_pbp.html

#### Import play-by-play data

In [3]:
# Download play-by-play rows for every year in `seasons`, without dtype
# downcasting and without the local cache.
play_data = nfl.import_pbp_data(
    years=seasons,
    downcast=False,
    cache=False,
)

# Run nfl_data_py's own cleaning helper over the raw download
play_df = nfl.clean_nfl_data(play_data)

play_df.shape

2022 done.
2023 done.


(79397, 384)

In [4]:
# Preliminary play_type filter: discard rows whose play_type is None or 'no_play'.
# The explicit .copy() makes play_df an independent frame, so the column
# assignments in the following cells write to it directly instead of to a slice
# of the previous frame (a chained assignment that the global warning filter hides).
excluded_play_types = [None, 'no_play']
play_df = play_df.loc[~play_df['play_type'].isin(excluded_play_types)].copy()

play_df.shape

(69048, 384)

In [5]:
# Relabel the Oakland code as Las Vegas in every team-code column so the codes
# line up with the other data sources joined later.
for team_col in ['home_team', 'away_team', 'posteam', 'defteam', 'side_of_field']:
    play_df[team_col] = np.where(play_df[team_col] == 'OAK', 'LV', play_df[team_col])

# game_id embeds the team codes as well, so apply the same relabel there
play_df['game_id'] = play_df['game_id'].str.replace('OAK', 'LV', case=True)

# Convert game_date to datetime format
play_df['game_date'] = pd.to_datetime(play_df['game_date'])

# Alternate game key: <home>_<away>_<YYYY>_<MM>_<DD>, built from the game date
game_date_key = play_df['game_date'].dt.strftime('%Y_%m_%d')
play_df['game_alt_id'] = play_df['home_team'] + '_' + play_df['away_team'] + '_' + game_date_key

play_df.sample(1)

Unnamed: 0,play_id,game_id,old_game_id,home_team,away_team,season_type,week,posteam,posteam_type,defteam,...,offense_personnel,defenders_in_box,defense_personnel,number_of_pass_rushers,players_on_play,offense_players,defense_players,n_offense,n_defense,game_alt_id
77150,4154.0,2023_11_DAL_CAR,2023111900,CAR,DAL,REG,11,DAL,away,CAR,...,"2 RB, 2 TE, 1 WR",8.0,"3 DL, 4 LB, 4 DB",,48544;45536;56450;47875;52934;54919;56457;4349...,00-0033662;00-0038731;00-0035698;00-0036032;00...,00-0035409;00-0032437;00-0038535;00-0033579;00...,11,11,CAR_DAL_2023_11_19


In [6]:
# game_alt_id was appended as the last column; move it into the fourth position
cols = list(play_df.columns)
cols.insert(3, cols.pop())
play_df = play_df[cols]

In [7]:
# Print the full list of play_df column names (no missing-value counts are produced here)
print(play_df.columns.tolist())

['play_id', 'game_id', 'old_game_id', 'game_alt_id', 'home_team', 'away_team', 'season_type', 'week', 'posteam', 'posteam_type', 'defteam', 'side_of_field', 'yardline_100', 'game_date', 'quarter_seconds_remaining', 'half_seconds_remaining', 'game_seconds_remaining', 'game_half', 'quarter_end', 'drive', 'sp', 'qtr', 'down', 'goal_to_go', 'time', 'yrdln', 'ydstogo', 'ydsnet', 'desc', 'play_type', 'yards_gained', 'shotgun', 'no_huddle', 'qb_dropback', 'qb_kneel', 'qb_spike', 'qb_scramble', 'pass_length', 'pass_location', 'air_yards', 'yards_after_catch', 'run_location', 'run_gap', 'field_goal_result', 'kick_distance', 'extra_point_result', 'two_point_conv_result', 'home_timeouts_remaining', 'away_timeouts_remaining', 'timeout', 'timeout_team', 'td_team', 'td_player_name', 'td_player_id', 'posteam_timeouts_remaining', 'defteam_timeouts_remaining', 'total_home_score', 'total_away_score', 'posteam_score', 'defteam_score', 'score_differential', 'posteam_score_post', 'defteam_score_post', 'sco

In [8]:
# Number of distinct home-team codes after the relabel
play_df['home_team'].nunique()

32

In [9]:
# Game-play sequence: 1-based running position of each row within its game_id,
# following the frame's current row order.
play_df['play'] = play_df.groupby('game_id')['play_id'].cumcount().add(1)

play_df['play'].nunique()

208

In [10]:
# Count missing values per column (every column is listed, including those with zero)
missing = play_df.isna().sum()
missing

play_id            0
game_id            0
old_game_id        0
game_alt_id        0
home_team          0
                  ..
players_on_play    0
offense_players    0
defense_players    0
n_offense          0
n_defense          0
Length: 385, dtype: int64

In [11]:
# Sanity check: rows that still have no possession team
play_check = play_df[play_df['posteam'].isna()]
play_check

Unnamed: 0,play_id,game_id,old_game_id,game_alt_id,home_team,away_team,season_type,week,posteam,posteam_type,...,offense_formation,offense_personnel,defenders_in_box,defense_personnel,number_of_pass_rushers,players_on_play,offense_players,defense_players,n_offense,n_defense


#### Import Schedule data

In [12]:
# Download the schedule for the same seasons as the play-by-play data
game_data = nfl.import_schedules(years=seasons)

# Clean with nfl_data_py's helper, then order chronologically (season, week, gameday)
game_df = (
    nfl.clean_nfl_data(game_data)
    .sort_values(by=['season', 'week', 'gameday'], ascending=True)
)

game_df.shape

(556, 46)

In [13]:
# Preview the first two rows of the sorted schedule frame
game_df.head(2)

Unnamed: 0,game_id,season,game_type,week,gameday,weekday,gametime,away_team,away_score,home_team,...,wind,away_qb_id,home_qb_id,away_qb_name,home_qb_name,away_coach,home_coach,referee,stadium_id,stadium
6137,2022_01_BUF_LA,2022,REG,1,2022-09-08,Thursday,20:20,BUF,31.0,LA,...,,00-0034857,00-0026498,Josh Allen,Matthew Stafford,Sean McDermott,Sean McVay,Carl Cheffers,LAX01,SoFi Stadium
6138,2022_01_NO_ATL,2022,REG,1,2022-09-11,Sunday,13:00,NO,27.0,ATL,...,,00-0031503,00-0032268,Jameis Winston,Marcus Mariota,Dennis Allen,Arthur Smith,Alex Kemp,ATL97,Mercedes-Benz Stadium


In [14]:
# Apply the same Oakland -> Las Vegas relabel used on the play-by-play frame,
# so team codes and game_id agree between the two sources.
for team_col in ['home_team', 'away_team']:
    game_df[team_col] = np.where(game_df[team_col] == 'OAK', 'LV', game_df[team_col])
game_df['game_id'] = game_df['game_id'].str.replace('OAK', 'LV', case=True)

In [15]:
# Number of distinct home-team codes in the schedule
game_df['home_team'].nunique()

32

In [16]:
# Number of distinct away-team codes in the schedule
game_df['away_team'].nunique()

32

In [17]:
# Build the same <home>_<away>_<YYYY>_<MM>_<DD> key that the play-by-play frame
# carries, this time from the schedule's gameday column.
gameday_key = pd.to_datetime(game_df['gameday']).dt.strftime('%Y_%m_%d')
game_df['game_alt_id'] = game_df['home_team'] + '_' + game_df['away_team'] + '_' + gameday_key

# Reduce the schedule to a game_id lookup table (joined with the QBR table below)
lookup_columns = [
    'game_id', 'game_alt_id', 'gameday', 'weekday', 'gametime',
    'under_odds', 'over_odds', 'away_rest', 'home_rest',
]
game_df = game_df[lookup_columns]

game_df.sample(2)

Unnamed: 0,game_id,game_alt_id,gameday,weekday,gametime,under_odds,over_odds,away_rest,home_rest
6168,2022_02_MIN_PHI,PHI_MIN_2022_09_19,2022-09-19,Monday,20:30,-115.0,102.0,8,8
6246,2022_08_DEN_JAX,JAX_DEN_2022_10_30,2022-10-30,Sunday,09:30,-103.0,-109.0,7,7


#### Import QBR data

In [18]:
# Download weekly, NFL-level QBR rows for the configured seasons
qbr_data = nfl.import_qbr(
    years=seasons,
    level='nfl',
    frequency='weekly',
)

# Run nfl_data_py's cleaning helper over the raw download
qbr_df = nfl.clean_nfl_data(qbr_data)

qbr_df.shape

(896, 30)

In [19]:
# Preview the last three QBR rows
qbr_df.tail(3)

Unnamed: 0,season,season_type,game_id,game_week,week_text,team_abb,player_id,name_short,rank,qbr_total,...,name_last,name_display,headshot_href,team,opp_id,opp_abb,opp_team,opp_name,week_num,qualified
9329,2023,Regular,401547551,12,Week 12,WSH,4426875,S. Howell,4.0,57.1,...,Howell,Sam Howell,https://a.espncdn.com/i/headshots/nfl/players/...,Commanders,6,DAL,Dallas Cowboys,Cowboys,12,True
9330,2023,Regular,401547552,12,Week 12,DET,3046779,J. Goff,5.0,15.2,...,Goff,Jared Goff,https://a.espncdn.com/i/headshots/nfl/players/...,Lions,9,GB,Green Bay Packers,Packers,12,True
9331,2023,Regular,401547553,12,Week 12,SEA,15864,G. Smith,6.0,15.1,...,Smith,Geno Smith,https://a.espncdn.com/i/headshots/nfl/players/...,Seahawks,25,SF,San Francisco 49ers,49ers,12,True


In [20]:
# Align QBR column names with the play-by-play frame and discard the QBR
# source's own game_id, which is replaced by the schedule's game_id below.
qbr_df = (
    qbr_df
    .rename(columns={'team_abb': 'posteam', 'opp_abb': 'defteam'})
    .drop(columns=['game_id'])
    .rename(columns={'qb_plays': 'plays'})
)

# Sequential week number: playoff weeks are numbered after the regular season
# (offset 17 before 2021, offset 18 from 2021 on); other rows keep game_week.
is_playoff_row = qbr_df['season_type'] == 'Playoffs'
qbr_df['week'] = np.select(
    [is_playoff_row & (qbr_df['season'] < 2021), is_playoff_row & (qbr_df['season'] >= 2021)],
    [17 + qbr_df['game_week'], 18 + qbr_df['game_week']],
    default=qbr_df['game_week'],
)

# Translate the QBR source's team codes to the codes used by the schedule
schedule_team_codes = {'OAK': 'LV', 'LAR': 'LA', 'WSH': 'WAS'}
for side in ['posteam', 'defteam']:
    qbr_df[side] = qbr_df[side].replace(schedule_team_codes)

# Hand-patch known bad 2018 rows (no effect unless 2018 is among the loaded seasons)
in_2018 = qbr_df['season'] == 2018
for team, opponent in [('LV', 'BAL'), ('BAL', 'LV')]:
    wrong_week = in_2018 & (qbr_df['posteam'] == team) & (qbr_df['defteam'] == opponent) & (qbr_df['week'] == 1)
    qbr_df['week'] = np.where(wrong_week, 12, qbr_df['week'])
# NOTE(review): the second pair rewrites defteam to 'SF' on a row whose posteam is
# 'CHI'; that looks like a copy-paste of the first pair — confirm which column
# was meant to be corrected.
for team, opponent in [('LV', 'CHI'), ('CHI', 'LV')]:
    wrong_opponent = in_2018 & (qbr_df['posteam'] == team) & (qbr_df['defteam'] == opponent) & (qbr_df['week'] == 9)
    qbr_df['defteam'] = np.where(wrong_opponent, 'SF', qbr_df['defteam'])

# The QBR rows do not say which side is home, so build the schedule-style
# game_id in both team orders and try each against the schedule.
season_week_prefix = qbr_df['season'].astype(str) + '_' + qbr_df['week'].astype(str).str.zfill(2) + '_'
qbr_df['game_id_1'] = season_week_prefix + qbr_df['posteam'] + '_' + qbr_df['defteam']
qbr_df['game_id_2'] = season_week_prefix + qbr_df['defteam'] + '_' + qbr_df['posteam']

# Reduce dataframe dimensions
qbr_df = qbr_df[['game_id_1', 'game_id_2', 'season', 'week', 'posteam', 'rank', 'pts_added', 'plays',
                 'qbr_raw', 'qbr_total', 'exp_sack', 'sack', 'pass', 'run']]

# Left-join the schedule once per candidate id; the schedule columns from
# attempt N are suffixed with N so the two attempts can be coalesced afterwards.
schedule_columns = ['game_id', 'game_alt_id', 'gameday', 'weekday', 'gametime',
                    'under_odds', 'over_odds', 'away_rest', 'home_rest']
for attempt in ['1', '2']:
    qbr_df = pd.merge(qbr_df, game_df, how='left', left_on=['game_id_' + attempt], right_on=['game_id'])
    qbr_df = qbr_df.rename(columns={column: column + attempt for column in schedule_columns})

# Coalesce: take the first attempt's value, falling back to the second when it is missing
for column in schedule_columns:
    first_match = qbr_df[column + '1']
    second_match = qbr_df[column + '2']
    qbr_df[column] = np.where(first_match.isnull(), second_match, first_match)

# Rows that matched neither candidate id get the literal 'NULL' game_id
qbr_df['game_id'] = qbr_df['game_id'].fillna('NULL')

# Remove the candidate ids and the suffixed helper columns
helper_columns = ['game_id_1', 'game_id_2'] + [
    column + attempt for column in schedule_columns for attempt in ['1', '2']
]
qbr_df = qbr_df.drop(columns=helper_columns)

qbr_df.shape

(896, 21)

In [21]:
# Keep only QBR rows that matched a schedule game (game_alt_id present)
has_schedule_match = qbr_df['game_alt_id'].notnull()
qbr_df = qbr_df.loc[has_schedule_match]

qbr_df.shape

(880, 21)

In [22]:
# One row per team-week: sort so the quarterback with the most plays is last
# within each (season, posteam, week), then keep only that last row.
qbr_df = (
    qbr_df
    .sort_values(by=['season', 'posteam', 'week', 'plays'], ascending=True)
    .drop_duplicates(subset=['season', 'posteam', 'week'], keep='last')
)

qbr_df.shape

(874, 21)

In [23]:
# Rolling quarterback features: for each (season, posteam), average a statistic
# over a 3-row window that is closed on the left, i.e. the current row is left
# out of its own window, so the first row of each group has no value yet.
rolling_sources = {
    'qb_rank': 'rank',
    'qb_pts': 'pts_added',
    'qb_plays': 'plays',
    'qb_qbr_raw': 'qbr_raw',
    'qb_qbr': 'qbr_total',
    'qb_pass': 'pass',
    'qb_run': 'run',
}


def _prior_rows_mean(values):
    """Mean over up to three preceding rows of `values` (current row excluded)."""
    return values.rolling(window=3, min_periods=1, closed='left', center=False).mean()


for feature_name, source_column in rolling_sources.items():
    qbr_df[feature_name] = (
        qbr_df.groupby(['season', 'posteam'])[source_column].transform(_prior_rows_mean)
    )

# Sort dataframe by gameday and posteam
qbr_df = qbr_df.sort_values(by=['gameday', 'posteam'], ascending=True)

# Keep only the join keys (posteam, game_alt_id) and the rolling features
raw_columns = ['game_id', 'season', 'week', 'rank', 'pts_added', 'plays', 'qbr_raw', 'qbr_total',
               'exp_sack', 'sack', 'pass', 'run', 'gameday', 'weekday', 'gametime',
               'under_odds', 'over_odds', 'away_rest', 'home_rest']
qbr_df = qbr_df.drop(columns=raw_columns)

In [24]:
# Fill the remaining gaps in each rolling feature with that feature's overall median
for feature_name in ['qb_rank', 'qb_pts', 'qb_plays', 'qb_qbr_raw', 'qb_qbr', 'qb_pass', 'qb_run']:
    feature_median = qbr_df[feature_name].median()
    qbr_df[feature_name] = qbr_df[feature_name].fillna(feature_median)

qbr_df.shape

(874, 9)

In [25]:
# Distinct possession teams left in the QBR feature table
print('team count:', qbr_df['posteam'].nunique())

team count: 32


In [26]:
# Preview the first three rows of the QBR feature table
qbr_df.head(3)

Unnamed: 0,posteam,game_alt_id,qb_rank,qb_pts,qb_plays,qb_qbr_raw,qb_qbr,qb_pass,qb_run
1,BUF,LA_BUF_2022_09_08,15.666667,0.266667,41.583333,52.9,52.766667,3.033333,0.366667
26,LA,LA_BUF_2022_09_08,15.666667,0.266667,41.583333,52.9,52.766667,3.033333,0.366667
14,ARI,ARI_KC_2022_09_11,15.666667,0.266667,41.583333,52.9,52.766667,3.033333,0.366667


#### Import game rosters

In [27]:
# Download rosters for the configured seasons
gr_data = nfl.import_rosters(
    years=seasons,
)

# Run nfl_data_py's cleaning helper over the raw download
gr_df = nfl.clean_nfl_data(gr_data)

gr_df.shape

(6197, 36)

In [28]:
# Distinct team codes in the roster data
print('team count:', gr_df['team'].nunique())

team count: 32


In [29]:
# Fill missing height and weight with the median of players at the same position
for measurement in ['height', 'weight']:
    gr_df[measurement] = (
        gr_df.groupby(['position'])[measurement]
        .transform(lambda x: x.fillna(x.median()))
    )

# Store weight as a 64-bit integer
gr_df['weight'] = gr_df['weight'].astype(np.int64)

gr_df.sample(1)

Unnamed: 0,season,team,position,depth_chart_position,jersey_number,status,player_name,first_name,last_name,birth_date,...,game_type,status_description_abbr,football_name,esb_id,gsis_it_id,smart_id,entry_year,rookie_year,draft_club,draft_number
2899,2023,JAX,OL,T,77.0,ACT,Anton Harrison,Anton,Harrison,2002-02-02,...,REG,A01,Anton,HAR575324,55891,32004841-5257-5324-7acd-121b32d76e46,2023,2023.0,JAX,27.0


#### Import depth charts

#### Import snap counts

In [30]:
# Download snap counts for the configured seasons
sc_data = nfl.import_snap_counts(
    years=seasons,
)

# Run nfl_data_py's cleaning helper over the raw download
sc_df = nfl.clean_nfl_data(sc_data)

sc_df.shape

(41863, 16)

In [31]:
# Distinct team codes in the snap-count data
print('team count:', sc_df['team'].nunique())

team count: 32


In [32]:
# Show two randomly chosen snap-count rows (unseeded, so the rows differ between runs)
sc_df.sample(2)

Unnamed: 0,game_id,pfr_game_id,season,game_type,week,player,pfr_player_id,position,team,opponent,offense_snaps,offense_pct,defense_snaps,defense_pct,st_snaps,st_pct
5172,2022_04_LAC_HOU,202210020htx,2022,REG,4,JK Scott,ScotJK00,P,LAC,HOU,0.0,0.0,0.0,0.0,10.0,0.34
5589,2023_04_PIT_HOU,202310010htx,2023,REG,4,Jimmie Ward,WardJi01,FS,HOU,PIT,0.0,0.0,52.0,0.88,0.0,0.0


#### Import combine data

In [33]:
# Download combine results for every draft class in `draft_years`
co_data = nfl.import_combine_data(
    years=draft_years,
)

# Run nfl_data_py's cleaning helper over the raw download
co_df = nfl.clean_nfl_data(co_data)

co_df.shape

(7999, 18)

In [34]:
# Keep only combine rows that carry a pfr_id, the key used to join with the draft data.
# The mask is computed directly rather than by filling a 'NULL' sentinel through a
# chained `co_df['pfr_id'].fillna(..., inplace=True)`: under pandas Copy-on-Write
# that call no longer modifies co_df, and the filter would then keep every missing id.
# A literal 'NULL' id is still excluded, as before.
has_pfr_id = co_df['pfr_id'].notnull() & (co_df['pfr_id'] != 'NULL')
co_df = co_df.loc[has_pfr_id]

# Prefix the combine measurements with comb_ and rename the key to pfr_player_id
co_df = co_df.rename(columns={'pfr_id':'pfr_player_id',
                              'season':'combine',
                              'player_name':'comb_name',
                              'school':'comb_school',
                              'ht':'comb_ht',
                              'wt':'comb_wt',
                              'pos':'comb_pos',
                              'forty':'comb_forty',
                              'bench':'comb_bench',
                              'vertical':'comb_vert',
                              'broad_jump':'comb_broad',
                              'cone':'comb_cone',
                              'shuttle':'comb_shut',
                              })

# Remove the combine file's draft fields and cfb_id
co_df = co_df.drop(columns=['draft_year','draft_team','draft_round','draft_ovr','cfb_id'])

co_df.shape

(6606, 13)

In [35]:
# Convert height from "<feet>-<inches>" text (e.g. "6-2", "5-11") to total inches.
# Both numbers are parsed in full: taking single characters by position reads
# "6-10" as 6 ft 1 in. Missing or unparseable heights stay NaN instead of being
# turned into 0.0, so a later median imputation can still fill them.
# The dtype guard makes the cell safe to re-run once the column is numeric.
if not pd.api.types.is_numeric_dtype(co_df['comb_ht']):
    height_parts = (
        co_df['comb_ht']
        .astype(str)
        .str.extract(r'^\s*(\d+)-(\d+)\s*$')
        .astype(float)
    )
    co_df['comb_ht'] = height_parts[0] * 12 + height_parts[1]

In [36]:
# Show one randomly chosen combine row (unseeded, so the row differs between runs)
co_df.sample(1)

Unnamed: 0,combine,pfr_player_id,comb_name,comb_pos,comb_school,comb_ht,comb_wt,comb_forty,comb_bench,comb_vert,comb_broad,comb_cone,comb_shut
3658,2011,DobbDe00,Demarcus Dobbs,DE,Georgia,74.0,281.0,4.87,,31.0,107.0,7.22,4.37


#### Import NFL draft selections

In [37]:
# Download draft selections for every draft class in `draft_years`
dr_data = nfl.import_draft_picks(
    years=draft_years,
)

# Run nfl_data_py's cleaning helper over the raw download
dr_df = nfl.clean_nfl_data(dr_data)

dr_df.shape

(6130, 36)

In [38]:
# Translate the three-letter team codes in the draft data to the abbreviations
# used by the other frames in this notebook. Codes not listed are left unchanged.
# 'LAR' maps to 'LA', the same translation the QBR cell applies; it was
# previously sent to 'LV', which filed those picks under the Raiders.
draft_team_codes = {
    'GNB': 'GB',
    'KAN': 'KC',
    # Rams
    'RAM': 'LA',
    'STL': 'LA',
    'LAR': 'LA',
    # Chargers
    'SDG': 'LAC',
    # Raiders
    'RAI': 'LV',
    'OAK': 'LV',
    'LVR': 'LV',
    'NWE': 'NE',
    'NOR': 'NO',
    'SFO': 'SF',
    'TAM': 'TB',
}
dr_df['team'] = dr_df['team'].replace(draft_team_codes)

print('team count:', dr_df.team.nunique())

team count: 32


In [39]:
# Calculate years in the NFL.
# A missing `to` is replaced by (draft season - 1), which makes nfl_years come
# out as 0 for those players. The fill is assigned back to the column rather than
# done through a chained inplace call, which stops modifying dr_df under pandas
# Copy-on-Write.
dr_df['to'] = dr_df['to'].fillna(dr_df['season'] - 1)

dr_df['nfl_years'] = dr_df['to'] - dr_df['season'] + 1

In [40]:
# Convert HOF to binary: 1 where hof is True, 0 for anything else (including missing)
is_hall_of_famer = dr_df['hof'].eq(True)
dr_df['hof'] = np.where(is_hall_of_famer, 1, 0)

In [41]:
# Keep only draft rows that carry a pfr_player_id, the key used to join with the combine data.
# The mask is computed directly rather than via a chained inplace fillna sentinel,
# which stops modifying dr_df under pandas Copy-on-Write. A literal 'NULL' id is
# still excluded, as before.
has_player_id = dr_df['pfr_player_id'].notnull() & (dr_df['pfr_player_id'] != 'NULL')
dr_df = dr_df.loc[has_player_id]

dr_df = dr_df.rename(columns={'season':'draft',
                              'team':'draft_team',
                              'pfr_player_name':'player_name',
                              'probowls':'pro_bowls'})

# Remove the other id columns, the career statistics, and `to`
# (nfl_years has already been derived from it).
dr_df = dr_df.drop(columns=['cfb_player_id','gsis_id','w_av','car_av','dr_av','pass_attempts','pass_completions','pass_yards','pass_tds','pass_ints','rush_atts','rush_yards','rush_tds','receptions','rec_yards','rec_tds','def_solo_tackles','def_ints','def_sacks','to'])

dr_df.shape

(5874, 17)

## Join draft data with combine data

In [42]:
# Row counts of the two frames about to be joined
print('combine records:', len(co_df))
print('draft records:', len(dr_df))

combine records: 6606
draft records: 5874


In [43]:
# Inner join: keep players present in both the draft and the combine data
draft_df = dr_df.merge(co_df, how='inner', on=['pfr_player_id'])

draft_df.shape

(4967, 29)

In [44]:
# Exclude the 'P', 'K' and 'LS' categories
excluded_categories = ['P', 'K', 'LS']
draft_df = draft_df.loc[~draft_df['category'].isin(excluded_categories)]

draft_df.shape

(4892, 29)

In [45]:
# Impute missing values: age and each combine measurement are filled with the
# median of the player's category.
category_median_columns = ['age', 'comb_ht', 'comb_wt', 'comb_forty', 'comb_bench',
                           'comb_vert', 'comb_broad', 'comb_cone', 'comb_shut']
for column in category_median_columns:
    category_median = draft_df.groupby('category')[column].transform('median')
    draft_df[column] = draft_df[column].fillna(category_median)

# Missing games count as 0. Assigned back to the column instead of a chained
# inplace fillna, which stops modifying draft_df under pandas Copy-on-Write.
draft_df['games'] = draft_df['games'].fillna(0)

In [46]:
# Count missing values per column after imputation
missing = draft_df.isna().sum()
missing

draft              0
round              0
pick               0
draft_team         0
pfr_player_id      0
player_name        0
hof                0
position           0
category           0
side               0
college            0
age                0
allpro             0
pro_bowls          0
seasons_started    0
games              0
nfl_years          0
combine            0
comb_name          0
comb_pos           0
comb_school        0
comb_ht            0
comb_wt            0
comb_forty         0
comb_bench         0
comb_vert          0
comb_broad         0
comb_cone          0
comb_shut          0
dtype: int64

In [47]:
# Show five randomly chosen rows of the joined draft frame (unseeded, so rows differ between runs)
draft_df.sample(5)

Unnamed: 0,draft,round,pick,draft_team,pfr_player_id,player_name,hof,position,category,side,...,comb_pos,comb_school,comb_ht,comb_wt,comb_forty,comb_bench,comb_vert,comb_broad,comb_cone,comb_shut
4297,2019,6,178,JAX,MinsGa00,Gardner Minshew II,0,QB,QB,O,...,QB,Washington State,73.0,225.0,4.97,22.0,33.5,116.0,7.14,4.45
3760,2021,3,69,CIN,OssaJo00,Joseph Ossai,0,LB,LB,D,...,LB,Texas,75.0,256.0,4.63,19.0,41.5,131.0,7.1,4.28
2769,2013,6,172,LV,KasaNi00,Nick Kasa,0,TE,TE,O,...,TE,Colorado,78.0,269.0,4.71,22.0,31.5,113.0,7.12,4.33
234,2001,2,56,DAL,DixoTo20,Tony Dixon,0,DB,DB,D,...,S,Alabama,73.0,213.0,4.63,17.0,37.0,117.0,7.38,4.17
4858,2023,3,91,BUF,WillDo04,Dorian Williams,0,LB,LB,D,...,LB,Tulane,73.0,228.0,4.49,23.0,33.5,120.0,7.1,4.28


In [48]:
# Export the joined draft + combine dataframe (the binary-classifier dataset) to CSV
# with a header row.
# NOTE(review): the destination is an absolute path under one user's home
# directory, so this cell fails on any machine where that directory does not
# exist — consider a configurable output directory.
draft_df.to_csv(r'/Users/ttas2/Documents/Python/nfl-machine-learning-models/output_files/nfl_post_processing_draft_data.csv', index=None, header=True)

## Join play-by-play data with supplemental data sources

In [49]:
# Row/column count of the play-by-play frame before the QBR features are merged in
play_df.shape

(69048, 385)

In [50]:
# List the QBR feature table's columns (join keys plus rolling features)
print(list(qbr_df.columns))

['posteam', 'game_alt_id', 'qb_rank', 'qb_pts', 'qb_plays', 'qb_qbr_raw', 'qb_qbr', 'qb_pass', 'qb_run']


In [51]:
# Row/column count of the QBR feature table about to be merged
qbr_df.shape

(874, 9)

In [52]:
# Attach the QBR rolling features to every play, matched on game and possession
# team. A left join keeps plays that have no QBR row.
df = play_df.merge(qbr_df, how='left', on=['game_alt_id', 'posteam'])

df.shape

(69048, 392)

In [53]:
# Number of distinct games in the merged frame
df['game_alt_id'].nunique()

451

In [54]:
# List every column of the merged frame
print(list(df.columns))

['play_id', 'game_id', 'old_game_id', 'game_alt_id', 'home_team', 'away_team', 'season_type', 'week', 'posteam', 'posteam_type', 'defteam', 'side_of_field', 'yardline_100', 'game_date', 'quarter_seconds_remaining', 'half_seconds_remaining', 'game_seconds_remaining', 'game_half', 'quarter_end', 'drive', 'sp', 'qtr', 'down', 'goal_to_go', 'time', 'yrdln', 'ydstogo', 'ydsnet', 'desc', 'play_type', 'yards_gained', 'shotgun', 'no_huddle', 'qb_dropback', 'qb_kneel', 'qb_spike', 'qb_scramble', 'pass_length', 'pass_location', 'air_yards', 'yards_after_catch', 'run_location', 'run_gap', 'field_goal_result', 'kick_distance', 'extra_point_result', 'two_point_conv_result', 'home_timeouts_remaining', 'away_timeouts_remaining', 'timeout', 'timeout_team', 'td_team', 'td_player_name', 'td_player_id', 'posteam_timeouts_remaining', 'defteam_timeouts_remaining', 'total_home_score', 'total_away_score', 'posteam_score', 'defteam_score', 'score_differential', 'posteam_score_post', 'defteam_score_post', 'sco

In [55]:
# Plays without a matching QBR row have no QB features after the left join;
# fill each feature with its own median over the merged frame.
# The result is assigned back to the columns: the chained
# `df[col].fillna(..., inplace=True)` form stops modifying df under pandas Copy-on-Write.
qb_feature_columns = ['qb_rank', 'qb_pts', 'qb_plays', 'qb_qbr_raw', 'qb_qbr', 'qb_pass', 'qb_run']
df[qb_feature_columns] = df[qb_feature_columns].fillna(df[qb_feature_columns].median())

# _______________
## Primary play-by-play filters

In [56]:
# Exclude rows whose description STARTS with an end-of-period or two-minute-warning marker.
# Series.str.startswith treats its argument as one literal prefix, so a
# pipe-delimited string matches nothing; str.match applies the alternation as a
# regex anchored at the start of the text. na=False keeps rows with a missing
# description instead of failing on the negation.
non_play_prefixes = r"END |END_|Two-Minute|Two Minute|Two minute|Two-minute|Two-min"
df = df.loc[~df['desc'].str.match(non_play_prefixes, na=False)]

# Exclude rows whose description CONTAINS any of these administrative phrases
# (each entry is a regex alternation)
non_play_phrases = [
    "Captains:|CAPTAINS:|Captians:|Captains #|Captians #",
    "substitution infraction|Two-Minute Warning",
    "game has been suspended|game has resumed|Game was resumed",
    "Game delayed|game delayed|Game suspended|Game was resumed",
    "no play run|Humidity|weather delay|severe weather",
]
for phrase_pattern in non_play_phrases:
    df = df.loc[~df['desc'].str.contains(phrase_pattern, na=False)]

# Keep regulation (qtr <= 4), non-deleted, non-aborted, non-special-teams,
# non-preseason plays that have a down and a drive
df = df.loc[df['play_deleted'] == 0]
df = df.loc[df['qtr'] <= 4.0]
df = df.loc[df['down'] >= 1.0]
df = df.loc[df['special_teams_play'] == 0]
df = df.loc[df['season_type'] != 'PRE']
df = df.loc[df['aborted_play'] == 0]
df = df.loc[~(df['play_type'].isin(['punt','kickoff','extra_point','field_goal','qb_kneel','qb_spike']))]
df = df.loc[~df['drive'].isnull()]

# Drop qb_kneel, qb_spike, and aborted_play columns
df = df.drop(columns=['qb_kneel','qb_spike','aborted_play'])

df.shape


(55576, 389)

In [57]:
# Report the columns that hold exactly one distinct value, then keep only the
# columns with more than one distinct value.
distinct_counts = df.nunique()
single_value_columns = distinct_counts.index[distinct_counts == 1]
df = df.loc[:, distinct_counts > 1]

print('Single value columns:', single_value_columns)
print('Remaining columns:', df.shape[1])

Single value columns: Index(['quarter_end', 'extra_point_prob', 'two_point_conversion_prob',
       'punt_blocked', 'punt_inside_twenty', 'punt_in_endzone',
       'punt_out_of_bounds', 'punt_downed', 'punt_fair_catch',
       'kickoff_inside_twenty', 'kickoff_in_endzone', 'kickoff_out_of_bounds',
       'kickoff_downed', 'kickoff_fair_catch', 'own_kickoff_recovery',
       'own_kickoff_recovery_td', 'extra_point_attempt', 'two_point_attempt',
       'field_goal_attempt', 'kickoff_attempt', 'punt_attempt',
       'tackle_with_assist_2_player_id', 'tackle_with_assist_2_player_name',
       'tackle_with_assist_2_team', 'defensive_two_point_attempt',
       'defensive_two_point_conv', 'defensive_extra_point_attempt',
       'defensive_extra_point_conv', 'play_deleted', 'special_teams_play',
       'special'],
      dtype='object')
Remaining columns: 327


In [58]:
# Replace every value pandas treats as missing with NaN
df = df.fillna(np.nan)

In [59]:
# Parse game_date as datetime so it sorts chronologically
df['game_date'] = pd.to_datetime(df['game_date'])

# Order rows by possession team, then game date, then the `play` column.
# NOTE(review): `play` looks like a flag rather than a play counter - confirm
# it is the intended tiebreaker.
sort_keys = ['posteam', 'game_date', 'play']
df = df.sort_values(sort_keys)

df.shape

(55576, 327)

## Modify dataframe

In [60]:
# Impute missing values from neighbouring rows (in the current row order).
# .ffill()/.bfill() replace the deprecated fillna(method=...), and the result
# is assigned back because fillna(inplace=True) on a column selection is
# chained assignment that stops updating the frame under copy-on-write.
# NOTE(review): the fill is positional, so a gap at a game boundary borrows
# from the adjacent game - confirm that is acceptable.
df['time_of_day'] = df['time_of_day'].ffill()

backfilled_columns = [
    'quarter_seconds_remaining',
    'half_seconds_remaining',
    'game_seconds_remaining',
    'wp',
    'def_wp',
]
for col in backfilled_columns:
    df[col] = df[col].bfill()


In [61]:
# Strip the comma separators out of the personnel grouping strings
for personnel_col in ('offense_personnel', 'defense_personnel'):
    df[personnel_col] = df[personnel_col].str.replace(',', '', regex=False)

In [62]:
# Trim leading/trailing whitespace from every text value in one pass.
# A blanket `.str.strip()` over object columns would turn any non-string
# value (numbers, booleans, ...) into NaN, so strings are stripped element by
# element and everything else is passed through unchanged.
def _strip_text(column):
    """Return `column` with string values stripped; other values are untouched."""
    if column.dtype == object:
        return column.map(lambda value: value.strip() if isinstance(value, str) else value)
    if pd.api.types.is_string_dtype(column):
        # Dedicated string dtype: every non-null value is a string.
        return column.str.strip()
    return column


df = df.apply(_strip_text)

In [63]:
# Impute missing values with fixed defaults.
# Results are assigned back to the frame: fillna(inplace=True) on a column
# selection is chained assignment, which pandas deprecates and which leaves
# the frame unchanged once copy-on-write is enabled.

# Text columns get an explicit "no play" label.
text_defaults = {
    'play_type': 'no_play',
    'play_type_nfl': 'NO_PLAY',
}
for col, default in text_defaults.items():
    df[col] = df[col].fillna(default)

# Flag, count, score and yardage columns default to 0.
zero_default_columns = [
    'sp', 'qtr', 'goal_to_go', 'ydstogo', 'ydsnet',
    'shotgun', 'no_huddle', 'qb_dropback', 'qb_scramble', 'yards_after_catch',
    'home_timeouts_remaining', 'away_timeouts_remaining', 'timeout',
    'posteam_timeouts_remaining', 'defteam_timeouts_remaining',
    'total_home_score', 'total_away_score',
    'posteam_score', 'defteam_score', 'score_differential',
    'posteam_score_post', 'defteam_score_post', 'score_differential_post',
    'first_down_rush', 'first_down_pass', 'first_down_penalty',
    'third_down_converted', 'third_down_failed',
    'fourth_down_converted', 'fourth_down_failed',
    'incomplete_pass', 'touchback', 'interception',
    'fumble_forced', 'fumble_not_forced', 'fumble_out_of_bounds',
    'solo_tackle', 'penalty', 'tackled_for_loss', 'fumble', 'fumble_lost',
    'qb_hit', 'rush_attempt', 'pass_attempt', 'sack', 'safety',
    'touchdown', 'pass_touchdown', 'rush_touchdown', 'return_touchdown',
    'complete_pass', 'assist_tackle',
    'lateral_reception', 'lateral_rush', 'lateral_return', 'lateral_recovery',
    'passing_yards', 'receiving_yards', 'rushing_yards',
    'lateral_receiving_yards', 'lateral_rushing_yards',
    'tackle_with_assist', 'return_yards', 'replay_or_challenge',
    'series_success', 'order_sequence', 'fixed_drive',
    'drive_play_count', 'drive_first_downs', 'drive_inside20',
    'drive_ended_with_score', 'drive_quarter_start', 'drive_quarter_end',
    'drive_yards_penalized', 'drive_play_id_started', 'drive_play_id_ended',
    'success', 'pass', 'rush', 'first_down', 'play',
    'out_of_bounds', 'home_opening_kickoff',
    'fumble_recovery_1_yards', 'fumble_recovery_2_yards', 'penalty_yards',
]
for col in zero_default_columns:
    df[col] = df[col].fillna(0)

In [64]:
# Cast the columns listed below to integer, one column at a time.
# ydsnet, home_rest and away_rest are intentionally excluded from the cast.
integer_columns = [
    'play_id',
    'quarter_seconds_remaining', 'half_seconds_remaining', 'game_seconds_remaining',
    'sp', 'qtr', 'goal_to_go', 'ydstogo',
    'shotgun', 'no_huddle', 'qb_dropback', 'qb_scramble', 'yards_after_catch',
    'home_timeouts_remaining', 'away_timeouts_remaining', 'timeout',
    'posteam_timeouts_remaining', 'defteam_timeouts_remaining',
    'total_home_score', 'total_away_score',
    'posteam_score', 'defteam_score', 'score_differential',
    'posteam_score_post', 'defteam_score_post', 'score_differential_post',
    'first_down_rush', 'first_down_pass', 'first_down_penalty',
    'third_down_converted', 'third_down_failed',
    'fourth_down_converted', 'fourth_down_failed',
    'incomplete_pass', 'touchback', 'interception',
    'fumble_forced', 'fumble_not_forced', 'fumble_out_of_bounds',
    'solo_tackle', 'safety', 'penalty', 'tackled_for_loss', 'fumble_lost',
    'qb_hit', 'rush_attempt', 'pass_attempt', 'sack',
    'touchdown', 'rush_touchdown', 'pass_touchdown', 'return_touchdown',
    'fumble', 'complete_pass', 'assist_tackle',
    'lateral_reception', 'lateral_rush', 'lateral_return', 'lateral_recovery',
    'passing_yards', 'receiving_yards', 'rushing_yards',
    'lateral_receiving_yards', 'lateral_rushing_yards',
    'tackle_with_assist', 'return_yards', 'replay_or_challenge',
    'series', 'series_success', 'order_sequence', 'fixed_drive',
    'drive_play_count', 'drive_first_downs', 'drive_inside20',
    'drive_ended_with_score', 'drive_quarter_start', 'drive_quarter_end',
    'drive_yards_penalized', 'drive_play_id_started', 'drive_play_id_ended',
    'success', 'pass', 'rush', 'first_down', 'play',
    'out_of_bounds', 'home_opening_kickoff',
    'fumble_recovery_1_yards', 'fumble_recovery_2_yards', 'penalty_yards',
]
for int_col in integer_columns:
    df[int_col] = df[int_col].astype(int)

df.shape

(55576, 327)

In [65]:
# Row count per play_type_nfl label
counts = df['play_type_nfl'].value_counts()
counts

play_type_nfl
PASS                            29748
RUSH                            23315
SACK                             2216
INTERCEPTION                      277
UNSPECIFIED                        12
FUMBLE_RECOVERED_BY_OPPONENT        8
Name: count, dtype: int64

In [66]:
# Row count per play_type label
counts = df['play_type'].value_counts()
counts

play_type
pass    32253
run     23323
Name: count, dtype: int64

### Impute play_type

In [67]:
# Work on an independent copy so the play_type relabelling leaves `df` untouched
df1 = df.copy(deep=True)

df1.shape

(55576, 327)

In [68]:
# Trim and lower-case the description so the substring checks below ignore case
df1['desc'] = df1['desc'].str.strip().str.lower()


def _retag_play_type(frame, current_type, fragments, new_type):
    """Relabel play_type in place.

    Rows whose play_type equals `current_type` and whose desc contains every
    string in `fragments` get play_type `new_type`; all other rows keep theirs.
    """
    hit = frame['play_type'] == current_type
    for fragment in fragments:
        hit = hit & (frame['desc'].str.find(fragment) >= 0)
    frame['play_type'] = np.where(hit, new_type, frame['play_type'])


# Rules are applied strictly in this order; each one sees the labels written by
# the rules before it. Format: (current play_type, required fragments, new play_type).
retag_rules = [
    # QB scramble
    ('no_play', ('scrambles ',), 'qb_scramble'),
    ('run', ('scrambles ',), 'qb_scramble'),
    # Kickoffs
    ('no_play', ('kicks',), 'kickoff'),
    ('no_play', ('kick formation',), 'kickoff'),
    # Field goals
    ('no_play', ('field goal',), 'field_goal'),
    # Punts
    ('no_play', (' punts',), 'punt'),
    ('no_play', (' punt is',), 'punt'),
    # Extra points (PATs) and two-point tries
    ('no_play', (' extra point', 'penalty'), 'extra_point'),
    ('no_play', ('two-point conversion',), 'two_point'),
    # Kneel
    ('no_play', (' kneels', 'penalty'), 'qb_kneel'),
    # QB spikes
    ('no_play', (' spiked', 'penalty'), 'qb_spike'),
]

# Phrases that identify a pass or a run inside an unlabelled or timeout row
pass_fragments = (
    'pass incomplete', 'pass complete', 'pass short', 'pass deep',
    'pass to', 'sacked', 'pass intended', 'pass intercepted',
)
run_fragments = (
    'left end', 'left tackle', 'left guard', 'up the middle',
    'right guard', 'right tackle', 'right end', 'end around',
)
for fragments, new_type in ((pass_fragments, 'pass'), (run_fragments, 'run')):
    for current_type in ('no_play', 'timeout'):
        retag_rules.extend((current_type, (fragment,), new_type) for fragment in fragments)

for current_type, fragments, new_type in retag_rules:
    _retag_play_type(df1, current_type, fragments, new_type)

In [69]:
# Relabel penalties, replay reviews, timeouts, clock run-offs and replayed downs.
# Each rule is (required current play_type or None for "any", fragments that
# must all appear in desc, new play_type). Rules run in order, and later rules
# may overwrite labels written by earlier ones.
penalty_keywords = (
    'too many men', 'encroachment', '12 on-field', 'unsportsmanlike',
    'interference', 'holding', 'illegal', 'offside', 'roughness',
    'chop block', 'tripping', 'roughing', 'face mask', 'personal foul',
    'disqualification', 'taunting', 'intentional grounding',
)

admin_rules = [
    # Penalties that apply whatever the current label is
    (None, ('false start',), 'penalty'),
    (None, ('delay of game',), 'penalty'),
    (None, ('penalty', 'neutral zone'), 'penalty'),
    (None, ('penalty', 'delay of kickoff'), 'penalty'),
]
# Penalties that only relabel rows still marked as no_play
admin_rules += [('no_play', ('penalty', keyword), 'penalty') for keyword in penalty_keywords]
admin_rules += [
    (None, ('penalty', 'play over the down'), 'penalty'),
    # Replay reviews
    ('no_play', ('replay was upheld',), 'replay_review'),
    ('no_play', ('replay assistant',), 'replay_review'),
    # Timeouts
    # NOTE(review): these two rules look for the literal text
    # 'passer_jersey_number' / 'rusher_jersey_number' inside desc; possibly the
    # intent was to test those columns for nulls - confirm.
    (None, ('timeout #', 'passer_jersey_number', 'rusher_jersey_number'), 'timeout'),
    (None, ('timeout at', 'passer_jersey_number', 'rusher_jersey_number'), 'timeout'),
    # Clock runoff (no play)
    (None, ('end of quarter due to 10 second clock run-off',), 'clock_runoff'),
    # Replay down (no play)
    (None, ('play over the down',), 'replay_down'),
    (None, ('play the down',), 'replay_down'),
]

for required_type, fragments, new_type in admin_rules:
    hit = np.ones(len(df1), dtype=bool)
    for fragment in fragments:
        hit &= (df1['desc'].str.find(fragment) >= 0).to_numpy()
    if required_type is not None:
        hit &= (df1['play_type'] == required_type).to_numpy()
    df1['play_type'] = np.where(hit, new_type, df1['play_type'])

In [70]:
# Bring play_type_nfl in line with the relabelled play_type; play types
# without an entry here keep their existing play_type_nfl value.
nfl_label_by_play_type = {
    'run': 'RUSH',
    'penalty': 'PENALTY',
    'extra_point': 'XP_KICK',
    'field_goal': 'FIELD_GOAL',
    'timeout': 'TIMEOUT',
    'two_point': 'PAT2',
}
mapped_label = df1['play_type'].map(nfl_label_by_play_type)
df1['play_type_nfl'] = np.where(mapped_label.notna(), mapped_label, df1['play_type_nfl'])

# Impute down property:
#   kickoff / extra_point / two_point -> 0
#   injury / replay                   -> down of the next row
#   timeout / penalty                 -> down of the previous row
# The shifts are positional in the current row order.
# NOTE(review): no visible cell assigns 'injury' or 'replay' (the relabelling
# produces 'replay_review' / 'replay_down') - confirm these labels.
play_type = df1['play_type']
no_down = play_type.isin(['kickoff', 'extra_point', 'two_point'])
use_next_down = play_type.isin(['injury', 'replay'])
use_prev_down = play_type.isin(['timeout', 'penalty'])
df1['down'] = np.where(no_down, 0,
              np.where(use_next_down, df1['down'].shift(-1),
              np.where(use_prev_down, df1['down'].shift(1),
                       df1['down'])))

# Remove kickoff from drive groupings
df1['drive'] = np.where(play_type == 'kickoff', np.nan, df1['drive'])

# Offsetting penalties: description mentions both 'offsetting' and 'no play'
df1['offsetting_penalties'] = np.where(
    (df1['desc'].str.find('offsetting') >= 0) & (df1['desc'].str.find('no play') >= 0), 1, 0)

# Impute yards_gained = 0 for offsetting penalties, timeouts, and penalties
# with no recorded yardage. Missing values are detected with isna(): a
# comparison with `== np.nan` is never True, so it cannot find them.
zero_yards = (
    (df1['offsetting_penalties'] == 1)
    | (play_type == 'timeout')
    | ((play_type == 'penalty') & df1['yards_gained'].isna())
)
df1['yards_gained'] = np.where(zero_yards, 0, df1['yards_gained'])

## PLAY FILTERS

In [106]:
# Independent copy for play filtering and feature engineering; `df1` stays as is
df2 = df1.copy(deep=True)

df2.shape

(55576, 328)

In [107]:
# Row count per play_type_nfl label after relabelling
counts = df2['play_type_nfl'].value_counts()
counts

play_type_nfl
PASS                            29744
RUSH                            23323
SACK                             2214
INTERCEPTION                      277
FUMBLE_RECOVERED_BY_OPPONENT        8
PENALTY                             6
UNSPECIFIED                         4
Name: count, dtype: int64

In [108]:
# Row count per play_type label, with missing labels counted too
counts = df2['play_type'].value_counts(dropna=False)
counts

play_type
pass           32247
run            21692
qb_scramble     1631
penalty            6
Name: count, dtype: int64

In [109]:
# Remove play types that are out of scope.
# Penalty rows are retained to capture previous penalty events.
excluded_play_types = ['punt', 'qb_kneel', 'field_goal', 'qb_scramble', 'qb_spike']
df2 = df2.loc[~df2['play_type'].isin(excluded_play_types)]

df2.shape

(53945, 328)

In [110]:
# Row count per play_type label after filtering, with missing labels counted too
counts = df2['play_type'].value_counts(dropna=False)
counts

play_type
pass       32247
run        21692
penalty        6
Name: count, dtype: int64

## FEATURE ENGINEERING

In [111]:
# Game-play sequence: 1-based position of each retained play within its game.
# The frame is sorted by posteam first, so a positional cumcount() would number
# all of one team's plays before the other's. Ranking play_id inside each game
# gives a sequence that does not depend on the current row order.
# NOTE(review): assumes play_id increases in game order - confirm.
df2['play_sequence_game'] = (
    df2.groupby('game_id')['play_id'].rank(method='first').astype(int)
)

# Game-play-drive sequence: 1-based position of each play within its drive,
# taken in current row order.
df2['play_sequence_series'] = df2.groupby(['game_id', 'drive']).cumcount() + 1

# Code turnover on play: 1 when the play has a lost fumble or an interception
df2['turnover'] = ((df2['fumble_lost'] == 1) | (df2['interception'] == 1)).astype(int)

In [112]:
# Derive a finer-grained play label. Rules are listed in priority order: the
# first rule that matches a row wins, and rows that match none keep play_type.
is_run = df2['play_type'] == 'run'
is_pass = df2['play_type'] == 'pass'
desc = df2['desc']

# Run direction phrase in the description -> inside / outside run
run_gap_details = [
    ('left end', 'run_outside'),
    ('right end', 'run_outside'),
    ('end around', 'run_outside'),
    ('left tackle', 'run_outside'),
    ('left guard', 'run_inside'),
    ('middle', 'run_inside'),
    ('right guard', 'run_inside'),
    ('right tackle', 'run_outside'),
]
detail_rules = [(is_run & (desc.str.find(gap) >= 0), detail) for gap, detail in run_gap_details]

detail_rules += [
    # Deep passes (the pass_length rules apply whatever the play_type is)
    (df2['pass_length'] == 'deep', 'pass_deep'),
    (is_pass & (df2['air_yards'] > 10), 'pass_deep'),
    (is_pass & (desc.str.find('pass deep') >= 0), 'pass_deep'),
    (is_pass & (desc.str.find('complete deep') >= 0), 'pass_deep'),
    # Short passes
    (df2['pass_length'] == 'short', 'pass_short'),
    (is_pass & (df2['air_yards'] <= 10), 'pass_short'),
    # Sacks
    (is_pass & (desc.str.find('sack') >= 0), 'pass_sack'),
    # No rule is needed for passes with missing air_yards: they already fall
    # through to the default 'pass' label. (An `air_yards == np.nan` test can
    # never be True, so such a rule would be dead code.)
    (is_pass & (desc.str.find('aborted') >= 0), 'aborted'),
]

# Apply the lowest-priority rule first so higher-priority rules overwrite it.
play_type_detail = df2['play_type'].to_numpy()
for condition, detail in reversed(detail_rules):
    play_type_detail = np.where(condition, detail, play_type_detail)
df2['play_type_detail'] = play_type_detail

counts = df2['play_type_detail'].value_counts(normalize=True)
counts

play_type_detail
pass_short     0.392826
run_inside     0.208082
run_outside    0.193660
pass_deep      0.163908
pass_sack      0.041060
run            0.000371
pass           0.000056
penalty        0.000037
Name: proportion, dtype: float64

In [113]:
# Normalise stadium names: drop ®, -, &, . and ' characters, trim, lower-case
stadium_name = df2['stadium']
for symbol in ('®', '-', '&', '.', "'"):
    stadium_name = stadium_name.str.replace(symbol, '', regex=False)
df2['stadium'] = stadium_name.str.strip().str.lower()

# Treat empty surface strings as missing
df2['surface'] = np.where(df2['surface'] != '', df2['surface'], np.nan)

# Surface type for each (normalised) stadium name
grass_stadiums = (
    'allegiant stadium',
    'azteca stadium',
    'fedexfield',
    'state farm stadium',
    'mt bank stadium',
    'bank of america stadium',
    'soldier field',
    'raymond james stadium',
    'cleveland browns stadium',
    'firstenergy stadium',
    'acrisure stadium',
    'wembley stadium',
    'empower field at mile high',
    'geha field at arrowhead stadium',
    'everbank stadium',
    'tiaa bank field',
    "levis stadium",
    'lincoln financial field',
    'hard rock stadium',
    'lambeau field',
)
turf_stadiums = (
    'mercedesbenz stadium',
    'paycor stadium',
    'metlife stadium',
    'highmark stadium',
    'att stadium',
    'ford field',
    'tottenham hotspur stadium',
    'nrg stadium',
    'lucas oil stadium',
    'lumen field',
    'sofi stadium',
    'nissan stadium',
    'gillette stadium',
    'us bank stadium',
    'caesars superdome',
    'allianz arena',
    'frankfurt stadium',
)
stadium_map = {
    **dict.fromkeys(grass_stadiums, 'grass'),
    **dict.fromkeys(turf_stadiums, 'turf'),
}

# Prefer the stadium-based surface; fall back to the recorded surface when the
# stadium is not in the map
mapped_surface = df2['stadium'].map(stadium_map)
df2['surface'] = mapped_surface.fillna(df2['surface'])

df2['surface'].value_counts(dropna=False)

surface
grass    27011
turf     26934
Name: count, dtype: int64

In [114]:
# Formation properties: 1 when the play description contains the phrase, else 0
play_text = df2['desc']
df2['pass_formation'] = play_text.str.contains('pass formation', regex=False, na=False).astype(int)
df2['run_formation'] = play_text.str.contains('run formation', regex=False, na=False).astype(int)

In [115]:
# Distinct raw formation labels before normalisation
df2['offense_formation'].unique()

array(['PISTOL', 'EMPTY', 'SHOTGUN', 'SINGLEBACK', 'WILDCAT', 'I_FORM',
       'JUMBO', nan], dtype=object)

In [116]:
# Normalise the formation labels
formation = df2['offense_formation'].str.strip().str.lower()
personnel = df2['offense_personnel']

# Fallback rules for plays with no recorded formation, in priority order:
# the first matching rule wins, and 'shotgun' is used when none match.
fallback_rules = [
    (df2['shotgun'] == 1, 'shotgun'),
    (df2['pass_formation'] == 1, 'empty'),
    (df2['run_formation'] == 1, 'jumbo'),
    (personnel.str[0:4] == '6 OL', 'jumbo'),
    (personnel.str.find('3 TE') > 0, 'jumbo'),
    (personnel.str.find('0 WR') > 0, 'jumbo'),
    (personnel.str.find('3 WR') > 0, 'shotgun'),
    (personnel.str[0:4] == '2 RB', 'shotgun'),
    (personnel.str[0:4] == '1 RB', 'singleback'),
]
rule_conditions = [condition for condition, _ in fallback_rules]
rule_labels = [label for _, label in fallback_rules]
imputed_formation = np.select(rule_conditions, rule_labels, default='shotgun')

# Recorded formations are kept; only missing ones receive the imputed label
df2['offense_formation'] = np.where(formation.isnull(), imputed_formation, formation)



In [117]:
# Frequency of each formation label after imputation (missing values included)
df2['offense_formation'].value_counts(dropna=False)

offense_formation
shotgun       30638
singleback    12500
empty          4287
i_form         3153
pistol         2547
jumbo           507
wildcat         313
Name: count, dtype: int64

In [118]:
# Offsetting penalties: description mentions both 'offsetting' and 'no play'
desc_text = df2['desc']
mentions_offsetting = desc_text.str.contains('offsetting', regex=False, na=False)
mentions_no_play = desc_text.str.contains('no play', regex=False, na=False)
df2['offsetting_penalties'] = (mentions_offsetting & mentions_no_play).astype(int)

# Set yards_gained to 0 on offsetting-penalty plays
df2['yards_gained'] = df2['yards_gained'].mask(df2['offsetting_penalties'] == 1, 0)

In [119]:
# Code yardline zones as 0/1 indicators on yardline_100 (bounds are inclusive)
yardline = df2['yardline_100']
df2['dtg_99to95'] = yardline.ge(95).astype(int)
for upper, lower in [(94, 90), (40, 31), (30, 21), (20, 11)]:
    df2[f'dtg_{upper}to{lower}'] = yardline.between(lower, upper).astype(int)
df2['dtg_10to06'] = yardline.between(6, 10).astype(int)
df2['dtg_05to01'] = yardline.le(5).astype(int)

In [120]:
# Code big gains on pass plays (>= 15 yards, no turnover)
df2['big_play_pass'] = np.where((df2['play_type']=='pass') & (df2['yards_gained']>=15) & (df2['turnover']==0), 1, 0)

# Flag from the play `lag` rows earlier, set to 0 when that row has a different
# drive number. The drive comparison uses the same lag as the value shift
# (prev3 previously compared the drive at lag 2 while shifting the value by 3).
for lag in (1, 2, 3):
    same_drive = df2['drive'].shift(lag) == df2['drive']
    df2[f'prev{lag}_big_play_pass'] = df2['big_play_pass'].shift(lag).where(same_drive, 0)

# Share of the drive's prior plays that were big pass plays.
# closed='left' leaves the current play out of the rolling sum.
prior_big_pass = df2.groupby(['game_id', 'drive'])['big_play_pass'].transform(
    lambda plays: plays.rolling(window=50, min_periods=1, closed='left').sum())
# Divide by the number of prior plays. The earlier form `sum / play_sequence_series - 1`
# divided first and subtracted 1 afterwards, which is not a share of prior plays.
# NOTE(review): assumes play_sequence_series is the 1-based position of the play within the drive - confirm.
prior_play_count = df2['play_sequence_series'] - 1
# A zero denominator yields NaN/inf; both are reported as 0
df2['drive_big_play_pass_pcnt'] = (prior_big_pass / prior_play_count).replace([np.inf, -np.inf], np.nan).fillna(0)

In [121]:
# Code big gains on run plays (>= 7 yards, no turnover)
df2['big_play_run'] = np.where((df2['play_type']=='run') & (df2['yards_gained']>=7) & (df2['turnover']==0), 1, 0)

# Flag from the play `lag` rows earlier, set to 0 when that row has a different
# drive number. The drive comparison uses the same lag as the value shift
# (prev3 previously compared the drive at lag 2 while shifting the value by 3).
for lag in (1, 2, 3):
    same_drive = df2['drive'].shift(lag) == df2['drive']
    df2[f'prev{lag}_big_play_run'] = df2['big_play_run'].shift(lag).where(same_drive, 0)

# Share of the drive's prior plays that were big run plays.
# closed='left' leaves the current play out of the rolling sum.
prior_big_run = df2.groupby(['game_id', 'drive'])['big_play_run'].transform(
    lambda plays: plays.rolling(window=50, min_periods=1, closed='left').sum())
# Divide by the number of prior plays. The earlier form `sum / play_sequence_series - 1`
# divided first and subtracted 1 afterwards, which is not a share of prior plays.
# NOTE(review): assumes play_sequence_series is the 1-based position of the play within the drive - confirm.
prior_play_count = df2['play_sequence_series'] - 1
# A zero denominator yields NaN/inf; both are reported as 0
df2['drive_big_play_run_pcnt'] = (prior_big_run / prior_play_count).replace([np.inf, -np.inf], np.nan).fillna(0)

In [122]:
# Code negative-yardage pass plays (yards_gained < 0, no turnover)
df2['negative_pass'] = np.where((df2['play_type']=='pass') & (df2['yards_gained']<0) & (df2['turnover']==0), 1, 0)

# Flag from the play `lag` rows earlier, set to 0 when that row has a different
# drive number. The drive comparison uses the same lag as the value shift
# (prev3 previously compared the drive at lag 2 while shifting the value by 3).
for lag in (1, 2, 3):
    same_drive = df2['drive'].shift(lag) == df2['drive']
    df2[f'prev{lag}_negative_pass'] = df2['negative_pass'].shift(lag).where(same_drive, 0)

In [123]:
# Code negative-yardage run plays (yards_gained < 0, no turnover)
df2['negative_run'] = np.where((df2['play_type']=='run') & (df2['yards_gained'] < 0) & (df2['turnover']==0), 1, 0)

# Flag from the play `lag` rows earlier, set to 0 when that row has a different
# drive number. The drive comparison uses the same lag as the value shift
# (prev3 previously compared the drive at lag 2 while shifting the value by 3).
for lag in (1, 2, 3):
    same_drive = df2['drive'].shift(lag) == df2['drive']
    df2[f'prev{lag}_negative_run'] = df2['negative_run'].shift(lag).where(same_drive, 0)

In [124]:
# Code offensive / defensive penalty indicators for each of the three previous plays.
# The lagged play_type and penalty_team are held in local Series rather than
# temporary dataframe columns, so nothing has to be dropped afterwards.
for lag in (1, 2, 3):
    # True when the row `lag` positions earlier carries the same drive number
    same_drive = df2['drive'].shift(lag) == df2['drive']

    # Values from the earlier row, blanked ('') when it is not in the same drive
    lagged_play_type = df2['play_type'].shift(lag).where(same_drive, '')
    lagged_penalty_team = df2['penalty_team'].shift(lag).where(same_drive, '')

    lagged_was_penalty = lagged_play_type == 'penalty'
    df2[f'prev{lag}_play_off_penalty'] = np.where(lagged_was_penalty & (lagged_penalty_team == df2['posteam']), 1, 0)
    df2[f'prev{lag}_play_def_penalty'] = np.where(lagged_was_penalty & (lagged_penalty_team == df2['defteam']), 1, 0)

In [125]:
# Code 0/1 indicators for the play_type_detail of each of the three previous plays.
# The lagged value is held in a local Series, so no temporary column is created.
detail_labels = ('run_outside', 'run_inside', 'pass_deep', 'pass_short')

for lag in (1, 2, 3):
    # True when the row `lag` positions earlier carries the same drive number
    same_drive = df2['drive'].shift(lag) == df2['drive']
    # Detail of the earlier row, blanked ('') when it is not in the same drive
    lagged_detail = df2['play_type_detail'].shift(lag).where(same_drive, '')

    for label in detail_labels:
        df2[f'prev{lag}_play_{label}'] = np.where(lagged_detail == label, 1, 0)

In [126]:
# Code incomplete passes on previous plays.
# The drive comparison uses the same lag as the value shift (prev3 previously
# compared the drive at lag 2). Missing values are filled by assignment: a chained
# df[col].fillna(..., inplace=True) does not update the dataframe under pandas Copy-on-Write.
for lag in (1, 2, 3):
    same_drive = df2['drive'].shift(lag) == df2['drive']
    df2[f'prev{lag}_incomplete_pass'] = df2['incomplete_pass'].shift(lag).where(same_drive, 0).fillna(0)

# Share of the drive's prior plays that were incomplete passes.
# closed='left' leaves the current play out of the rolling sum.
prior_incompletions = df2.groupby(['game_id', 'drive'])['incomplete_pass'].transform(
    lambda plays: plays.rolling(window=50, min_periods=1, closed='left').sum())
# Divide by the number of prior plays. The earlier form `sum / play_sequence_series - 1`
# divided first and subtracted 1 afterwards, which is not a share of prior plays.
# NOTE(review): assumes play_sequence_series is the 1-based position of the play within the drive - confirm.
prior_play_count = df2['play_sequence_series'] - 1
# A zero denominator yields NaN/inf; both are reported as 0
df2['drive_incomplete_pass_pcnt'] = (prior_incompletions / prior_play_count).replace([np.inf, -np.inf], np.nan).fillna(0)

In [127]:
# Code yards gained on previous plays.
# The drive comparison uses the same lag as the value shift (prev3 previously
# compared the drive at lag 2). Missing values are filled by assignment: a chained
# df[col].fillna(..., inplace=True) does not update the dataframe under pandas Copy-on-Write.
for lag in (1, 2, 3):
    same_drive = df2['drive'].shift(lag) == df2['drive']
    df2[f'prev{lag}_yards_gained'] = df2['yards_gained'].shift(lag).where(same_drive, 0).fillna(0)

In [128]:
# Code the `wpa` value of previous plays.
# The drive comparison uses the same lag as the value shift (prev3 previously
# compared the drive at lag 2). Missing values are filled by assignment: a chained
# df[col].fillna(..., inplace=True) does not update the dataframe under pandas Copy-on-Write.
for lag in (1, 2, 3):
    same_drive = df2['drive'].shift(lag) == df2['drive']
    df2[f'prev{lag}_wpa'] = df2['wpa'].shift(lag).where(same_drive, 0).fillna(0)

In [129]:
# Code shotgun formation of previous plays.
# The drive comparison uses the same lag as the value shift (prev3 previously
# compared the drive at lag 2). Missing values are filled by assignment: a chained
# df[col].fillna(..., inplace=True) does not update the dataframe under pandas Copy-on-Write.
for lag in (1, 2, 3):
    same_drive = df2['drive'].shift(lag) == df2['drive']
    df2[f'prev{lag}_shotgun'] = df2['shotgun'].shift(lag).where(same_drive, 0).fillna(0)

# Share of the drive's prior plays that were run from shotgun.
# closed='left' leaves the current play out of the rolling sum.
prior_shotgun = df2.groupby(['game_id', 'drive'])['shotgun'].transform(
    lambda plays: plays.rolling(window=50, min_periods=1, closed='left').sum())
# Divide by the number of prior plays. The earlier form `sum / play_sequence_series - 1`
# divided first and subtracted 1 afterwards, which is not a share of prior plays.
# NOTE(review): assumes play_sequence_series is the 1-based position of the play within the drive - confirm.
prior_play_count = df2['play_sequence_series'] - 1
# A zero denominator yields NaN/inf; both are reported as 0
df2['drive_shotgun_pcnt'] = (prior_shotgun / prior_play_count).replace([np.inf, -np.inf], np.nan).fillna(0)

In [130]:
# Code qb_hit on previous plays.
# The drive comparison uses the same lag as the value shift (prev3 previously
# compared the drive at lag 2). Missing values are filled by assignment: a chained
# df[col].fillna(..., inplace=True) does not update the dataframe under pandas Copy-on-Write.
for lag in (1, 2, 3):
    same_drive = df2['drive'].shift(lag) == df2['drive']
    df2[f'prev{lag}_qb_hit'] = df2['qb_hit'].shift(lag).where(same_drive, 0).fillna(0)

# Share of the drive's prior plays on which qb_hit was recorded.
# closed='left' leaves the current play out of the rolling sum.
prior_qb_hits = df2.groupby(['game_id', 'drive'])['qb_hit'].transform(
    lambda plays: plays.rolling(window=50, min_periods=1, closed='left').sum())
# Divide by the number of prior plays. The earlier form `sum / play_sequence_series - 1`
# divided first and subtracted 1 afterwards, which is not a share of prior plays.
# NOTE(review): assumes play_sequence_series is the 1-based position of the play within the drive - confirm.
prior_play_count = df2['play_sequence_series'] - 1
# A zero denominator yields NaN/inf; both are reported as 0
df2['drive_qb_hit_pcnt'] = (prior_qb_hits / prior_play_count).replace([np.inf, -np.inf], np.nan).fillna(0)

In [131]:
# Code no huddle of previous plays.
# The drive comparison uses the same lag as the value shift (prev3 previously
# compared the drive at lag 2). Missing values are filled by assignment: a chained
# df[col].fillna(..., inplace=True) does not update the dataframe under pandas Copy-on-Write.
for lag in (1, 2, 3):
    same_drive = df2['drive'].shift(lag) == df2['drive']
    df2[f'prev{lag}_no_huddle'] = df2['no_huddle'].shift(lag).where(same_drive, 0).fillna(0)

# Share of the drive's prior plays that were no-huddle.
# closed='left' leaves the current play out of the rolling sum.
prior_no_huddle = df2.groupby(['game_id', 'drive'])['no_huddle'].transform(
    lambda plays: plays.rolling(window=50, min_periods=1, closed='left').sum())
# Divide by the number of prior plays. The earlier form `sum / play_sequence_series - 1`
# divided first and subtracted 1 afterwards, which is not a share of prior plays.
# NOTE(review): assumes play_sequence_series is the 1-based position of the play within the drive - confirm.
prior_play_count = df2['play_sequence_series'] - 1
# A zero denominator yields NaN/inf; both are reported as 0
df2['drive_no_huddle_pcnt'] = (prior_no_huddle / prior_play_count).replace([np.inf, -np.inf], np.nan).fillna(0)

In [132]:
# Code first down achieved by pass on a previous play.
# The drive comparison uses the same lag as the value shift (prev3 previously
# compared the drive at lag 2). Missing values are filled by assignment: a chained
# df[col].fillna(..., inplace=True) does not update the dataframe under pandas Copy-on-Write.
for lag in (1, 2, 3):
    same_drive = df2['drive'].shift(lag) == df2['drive']
    df2[f'prev{lag}_first_down_pass'] = df2['first_down_pass'].shift(lag).where(same_drive, 0).fillna(0)

In [133]:
# Code first down achieved by rush on a previous play (source column: first_down_rush).
# The drive comparison uses the same lag as the value shift (prev3 previously
# compared the drive at lag 2). Missing values are filled by assignment: a chained
# df[col].fillna(..., inplace=True) does not update the dataframe under pandas Copy-on-Write.
for lag in (1, 2, 3):
    same_drive = df2['drive'].shift(lag) == df2['drive']
    df2[f'prev{lag}_first_down_run'] = df2['first_down_rush'].shift(lag).where(same_drive, 0).fillna(0)

In [134]:
# Code play efficiency: a play is efficient when there is no turnover and it gains
# at least 4 yards on 1st down, half of ydstogo on 2nd down, or all of ydstogo on 3rd/4th down
current_down = df2['down']
yards_needed = np.select(
    [current_down == 1, current_down == 2, current_down.isin([3, 4])],
    [4, df2['ydstogo'] / 2, df2['ydstogo']],
    default=np.nan,  # any other down value can never qualify
)
df2['effct_play'] = ((df2['yards_gained'] >= yards_needed) & (df2['turnover'] == 0)).astype(int)

# Efficiency flag of the play `lag` rows earlier; 0 when that row has a different drive number
for lag in (1, 2, 3):
    same_drive = df2['drive'].shift(lag) == df2['drive']
    df2[f'prev{lag}_effct_play'] = df2['effct_play'].shift(lag).where(same_drive, 0)

# Share of the drive's prior plays that were efficient.
# closed='left' leaves the current play out of the rolling sum.
prior_effct_plays = df2.groupby(['game_id', 'drive'])['effct_play'].transform(
    lambda plays: plays.rolling(window=50, min_periods=1, closed='left').sum())
# Divide by the number of prior plays. The earlier form `sum / play_sequence_series - 1`
# divided first and subtracted 1 afterwards, which is not a share of prior plays.
# NOTE(review): assumes play_sequence_series is the 1-based position of the play within the drive - confirm.
prior_play_count = df2['play_sequence_series'] - 1
# A zero denominator yields NaN/inf; both are reported as 0
df2['drive_effct_play_pcnt'] = (prior_effct_plays / prior_play_count).replace([np.inf, -np.inf], np.nan).fillna(0)

In [135]:
# Code remaining_downs: 3 on 1st down, 2 on 2nd down, 1 for every other down value
current_down = df2['down']
df2['remaining_downs'] = np.select([current_down == 1, current_down == 2], [3, 2], default=1)

# Yards still needed per remaining down
df2['remaining_yards_per_down'] = df2['ydstogo'].div(df2['remaining_downs'])

In [136]:
# Encode game_half numerically: 'Half1' -> 1, 'Half2' -> 2, anything else -> 3
half_label = df2['game_half']
df2['game_half'] = np.select([half_label == 'Half1', half_label == 'Half2'], [1, 2], default=3)

In [137]:
# Distinct raw drive_start_transition labels
df2['drive_start_transition'].unique()

array(['KICKOFF', 'PUNT', 'FUMBLE', 'DOWNS', 'INTERCEPTION', 'MISSED_FG',
       'ONSIDE_KICK', 'BLOCKED_FG_DOWNS', 'BLOCKED_PUNT', 'MUFFED_PUNT',
       'MUFFED_KICKOFF', 'BLOCKED_FG', 'BLOCKED_FG,_DOWNS',
       'BLOCKED_PUNT_DOWNS'], dtype=object)

In [138]:
df2['drive_start_transition'] = df2['drive_start_transition'].str.strip().str.upper()

# Modify drive start category: collapse the transition labels into two groups
sudden_change_transitions = [
    'INTERCEPTION', 'FUMBLE', 'MUFFED_PUNT', 'ONSIDE_KICK', 'BLOCKED_FG',
    'BLOCKED_PUNT', 'BLOCKED_FG_DOWNS', 'BLOCKED_FG,_DOWNS',
    'BLOCKED_PUNT_DOWNS', 'MUFFED_KICKOFF', 'OWN_KICKOFF', 'DOWNS',
]
transfer_poss_transitions = ['MISSED_FG', 'KICKOFF', 'PUNT']
drive_start_map = {
    **dict.fromkeys(sudden_change_transitions, 'sudden_change'),
    **dict.fromkeys(transfer_poss_transitions, 'transfer_poss'),
}

# Labels outside the map keep their own (lower-cased) value. Missing values become
# the 'NULL' sentinel; this is filled by assignment because a chained
# df[col].fillna(..., inplace=True) does not update the dataframe under pandas Copy-on-Write.
df2['drive_start'] = (
    df2['drive_start_transition']
    .map(drive_start_map)
    .fillna(df2['drive_start_transition'])
    .str.strip()
    .str.lower()
    .fillna('NULL')
)

df2['drive_start'].unique()

array(['transfer_poss', 'sudden_change'], dtype=object)

In [139]:
# 1 when 120 seconds or fewer remain in the half, else 0
df2['two_min_warning'] = df2['half_seconds_remaining'].le(120).astype(int)

In [140]:
# Ratio of ep to seconds left in the half; the +1 keeps the denominator non-zero at 0 seconds
seconds_left_plus_one = df2['half_seconds_remaining'].add(1)
df2['ep_sec_ratio'] = df2['ep'].div(seconds_left_plus_one)

In [141]:
# Key combining posteam and season, e.g. '<posteam>_<season>'
df2['posteam_season'] = df2['posteam'].str.cat(df2['season'].astype(str), sep='_')

In [142]:
# Work on an independent copy so later steps leave df2 untouched
df3 = df2.copy(deep=True)

df3.shape

(53945, 416)

In [143]:
# Rename target classes; labels not listed here are left unchanged
target_class_names = {
    'pass_short': 'short',
    'pass_deep': 'deep',
    'run_inside': 'inside',
    'run_outside': 'outside',
}
df3['play_type_detail'] = df3['play_type_detail'].replace(target_class_names)

## Impute missing values

In [144]:
# Impute missing offense_personnel with the last value that pandas' groupby
# 'last' returns for the same game and drive (it skips missing entries by default)
drive_offense_personnel = df3.groupby(['game_id', 'drive'])['offense_personnel'].transform('last')
df3['offense_personnel'] = df3['offense_personnel'].fillna(drive_offense_personnel)

df3['offense_personnel'].unique()

array(['1 RB 1 TE 3 WR', '1 RB 2 TE 2 WR', '2 RB 0 TE 3 WR',
       '1 RB 0 TE 4 WR', '2 RB 1 TE 2 WR', '1 RB 3 TE 1 WR',
       '6 OL 1 RB 2 TE 1 WR', '2 RB 2 TE 1 WR', '1 RB 2 TE 1 WR1 LB',
       '1 RB 1 TE 2 WR1 LB', '0 RB 2 TE 3 WR', '0 RB 1 TE 4 WR',
       '0 RB 2 TE 1 WR1 P5 LB1 LS1 DB', '1 RB 1 TE 2 WR1 P4 LB1 LS',
       '0 RB 3 TE 2 WR', '6 OL 2 RB 2 TE 0 WR', '3 RB 1 TE 1 WR',
       '2 QB 2 RB 1 TE 1 WR', '2 QB 1 RB 1 TE 2 WR',
       '2 QB 1 RB 0 TE 3 WR', '2 QB 2 RB 0 TE 2 WR',
       '2 QB 1 RB 2 TE 1 WR', '2 RB 3 TE 0 WR', '1 RB 4 TE 0 WR',
       '6 OL 1 RB 1 TE 2 WR', '6 OL 2 RB 1 TE 1 WR', '0 RB 2 TE 2 WR',
       '3 RB 2 TE 0 WR', '7 OL 2 RB 1 TE 0 WR', '6 OL 2 RB 0 TE 2 WR',
       '6 OL 1 RB 0 TE 3 WR', '6 OL 1 RB 3 TE 0 WR',
       '8 OL 1 RB 0 TE 1 WR', '7 OL 1 RB 2 TE 0 WR',
       '7 OL 1 RB 1 TE 1 WR', '0 RB 2 TE 1 WR1 P4 LB1 LS2 DB',
       '6 OL 0 RB 1 TE 3 WR', '1 RB 3 TE 2 WR', '1 RB 3 TE 0 WR1 LB',
       '2 RB 0 TE 4 WR', '6 OL 1 RB 2 TE 0 WR1 DL', '2 

In [145]:
# Impute missing defense_personnel with the last value that pandas' groupby
# 'last' returns for the same game and drive (it skips missing entries by default)
drive_defense_personnel = df3.groupby(['game_id', 'drive'])['defense_personnel'].transform('last')
df3['defense_personnel'] = df3['defense_personnel'].fillna(drive_defense_personnel)

df3['defense_personnel'].unique()

array(['4 DL 2 LB 5 DB', '4 DL 3 LB 4 DB', '3 DL 3 LB 5 DB',
       '3 DL 2 LB 6 DB', '4 DL 1 LB 6 DB', '3 DL 1 LB 7 DB',
       '5 DL 1 LB 5 DB', '5 DL 2 LB 4 DB', '3 DL 4 LB 4 DB',
       '2 DL 4 LB 5 DB', '2 DL 3 LB 6 DB', '5 DL 3 LB 3 DB',
       '1 DL 5 LB 5 DB', '1 DL 4 LB 6 DB', '0 DL 4 LB 7 DB',
       '1 DL 3 LB 7 DB', '0 DL 2 LB 5 DB 1 RB2 TE1 WR',
       '2 DL 4 LB 2 DB 1 RB1 TE1 WR', '2 DL 5 LB 4 DB', '6 DL 2 LB 3 DB',
       '6 DL 4 LB 1 DB', '5 DL 0 LB 6 DB', '4 DL 4 LB 3 DB',
       '4 DL 1 LB 5 DB', '3 DL 5 LB 3 DB', '2 DL 6 LB 3 DB',
       '4 DL 5 LB 2 DB', '1 DL 6 LB 4 DB', '2 DL 4 LB 4 DB',
       '5 DL 4 LB 2 DB', '0 DL 5 LB 6 DB', '5 DL 5 LB 1 DB',
       '6 DL 3 LB 2 DB', '3 DL 3 LB 4 DB', '4 DL 3 LB 3 DB',
       '6 DL 1 LB 4 DB', '0 DL 6 LB 5 DB', '2 DL 2 LB 7 DB',
       '2 DL 3 LB 5 DB', '4 DL 2 LB 4 DB 1 WR', '3 DL 2 LB 5 DB',
       '3 DL 6 LB 2 DB', '4 DL 4 LB 2 DB', '6 DL 5 LB 0 DB',
       '3 DL 1 LB 6 DB 1 WR', '4 DL 6 LB 1 DB', '0 DL 5 LB 4 DB 1 RB1 WR

In [146]:
# Impute defenders_in_box with the last value that pandas' groupby 'last'
# returns for the same game and defensive team
game_defense_box = df3.groupby(['game_id', 'defteam'])['defenders_in_box'].transform('last')
df3['defenders_in_box'] = df3['defenders_in_box'].fillna(game_defense_box)

## Additional features

In [147]:
# Combine features: ratio and product of remaining yards per down and defenders in the box
yards_per_down = df3['remaining_yards_per_down']
box_defenders = df3['defenders_in_box']
df3['remain_yds_div_def_box'] = yards_per_down.div(box_defenders)
df3['remain_yds_prod_def_box'] = yards_per_down.mul(box_defenders)

In [148]:
# Mean rush_attempt per offense, game, quarter and down
offense_keys = ['posteam', 'game_id', 'qtr', 'down']
rro_df = (
    df3.groupby(offense_keys)['rush_attempt']
       .agg(lambda attempts: attempts.astype(float).mean())
       .reset_index()
       .sort_values(by=['posteam', 'qtr', 'down'], ascending=True)
)

# Rolling mean over the three preceding rows of each offense/quarter/down group;
# closed='left' keeps the current row out of its own average
rro_df['run_ratio_off_priors'] = (
    rro_df.groupby(['posteam', 'qtr', 'down'])['rush_attempt']
          .transform(lambda rates: rates.rolling(window=3, min_periods=1, closed='left', center=False).mean())
)

# The per-game rate itself is not carried into the merge
rro_df = rro_df.drop(columns=['rush_attempt'])

# Impute missing values by the league average per qtr/down
league_off_prior = rro_df.groupby(['qtr', 'down'])['run_ratio_off_priors'].transform('mean')
rro_df['run_ratio_off_priors'] = rro_df['run_ratio_off_priors'].fillna(league_off_prior)

# Merge rro_df into df3
df3 = df3.merge(rro_df, how='left', on=offense_keys)

In [149]:
# Mean rush_attempt faced per defense, game, quarter and down
defense_keys = ['defteam', 'game_id', 'qtr', 'down']
rrd_df = (
    df3.groupby(defense_keys)['rush_attempt']
       .agg(lambda attempts: attempts.astype(float).mean())
       .reset_index()
       .sort_values(by=['defteam', 'qtr', 'down'], ascending=True)
)

# Rolling mean over the three preceding rows of each defense/quarter/down group;
# closed='left' keeps the current row out of its own average
rrd_df['run_ratio_def_priors'] = (
    rrd_df.groupby(['defteam', 'qtr', 'down'])['rush_attempt']
          .transform(lambda rates: rates.rolling(window=3, min_periods=1, closed='left', center=False).mean())
)

# The per-game rate itself is not carried into the merge
rrd_df = rrd_df.drop(columns=['rush_attempt'])

# Impute missing values by the league average per qtr/down
league_def_prior = rrd_df.groupby(['qtr', 'down'])['run_ratio_def_priors'].transform('mean')
rrd_df['run_ratio_def_priors'] = rrd_df['run_ratio_def_priors'].fillna(league_def_prior)

# Merge rrd_df into df3
df3 = df3.merge(rrd_df, how='left', on=defense_keys)

## Future features

## Play prediction filters

In [150]:
# Filters used to eliminate non-typical offensive plays / plays with ulterior incentives (preseason).
# A row is kept only when every condition below holds.
keep_play = (
    (df3['season_type'] != 'PRE')
    & df3['play_type'].isin(['pass', 'run'])
    & (df3['play_type_detail'] != 'pass_sack')
    & (df3['qb_scramble'] == 0)
    & (df3['sack'] == 0)
    & (df3['drive_start'] != 'NULL')
)
df3 = df3.loc[keep_play]

df3.shape

(51724, 420)

In [151]:
# Report the columns holding exactly one distinct value, and keep only the
# columns with more than one distinct value
distinct_counts = df3.nunique()
single_value_columns = df3.columns[(distinct_counts == 1).to_numpy()]
df3 = df3.loc[:, distinct_counts > 1]

print('Single value columns:', single_value_columns)
print('Remaining columns:', df3.shape[1])

Single value columns: Index(['qb_scramble', 'sack', 'offsetting_penalties', 'pass_formation',
       'prev3_play_def_penalty'],
      dtype='object')
Remaining columns: 407


In [152]:
# Cast yardline_100, down and drive to integer dtype
integer_columns = ['yardline_100', 'down', 'drive']
df3 = df3.astype({column: int for column in integer_columns})

In [153]:
# Absolute frequency of each target class
counts = df3['play_type_detail'].value_counts()
counts

play_type_detail
short      21187
inside     11225
outside    10447
deep        8842
run           20
pass           3
Name: count, dtype: int64

## DROP COLUMNS

In [154]:
# Drop unused features
# The listed columns are removed from df3 in place, by name (axis=1); the resulting
# frame shape is displayed at the end of the cell.
df3.drop(['old_game_id','game_alt_id','season_type','side_of_field','game_date','drive','time','yrdln','ydsnet','desc','qb_dropback','air_yards','yards_after_catch','timeout','timeout_team','td_team','td_player_name','td_player_id','total_home_score','total_away_score','posteam_score_post','defteam_score_post','score_differential_post','opp_fg_prob','opp_safety_prob','opp_td_prob','total_home_epa','total_away_epa','total_home_rush_epa','total_away_rush_epa','total_home_pass_epa','total_away_pass_epa','air_epa','yac_epa','comp_air_epa','comp_yac_epa','total_home_comp_air_epa','total_away_comp_air_epa','total_home_comp_yac_epa','total_away_comp_yac_epa','total_home_raw_air_epa','total_away_raw_air_epa','total_home_raw_yac_epa','total_away_raw_yac_epa','def_wp','home_wp','away_wp','wpa','vegas_wpa','vegas_home_wpa','home_wp_post','away_wp_post','vegas_wp','vegas_home_wp','total_home_rush_wpa','total_away_rush_wpa','total_home_pass_wpa','total_away_pass_wpa','air_wpa','yac_wpa','comp_air_wpa','comp_yac_wpa','total_home_comp_air_wpa','total_away_comp_air_wpa','total_home_comp_yac_wpa','total_away_comp_yac_wpa','total_home_raw_air_wpa','total_away_raw_air_wpa','total_home_raw_yac_wpa','total_away_raw_yac_wpa','first_down_rush','first_down_pass','first_down_penalty','third_down_converted','third_down_failed','fourth_down_converted','fourth_down_failed','incomplete_pass','touchback','interception','fumble_forced','fumble_not_forced','fumble_out_of_bounds','solo_tackle','safety','penalty','tackled_for_loss','fumble_lost','qb_hit','touchdown','pass_touchdown','rush_touchdown','return_touchdown','fumble','complete_pass','assist_tackle','lateral_reception','lateral_rush','lateral_return','lateral_recovery','passer_player_id','passer_player_name','passing_yards','receiver_player_id','receiver_player_name','receiving_yards','rusher_player_id','rusher_player_name','rushing_yards','lateral_receiver_player_id','lateral_receiver_player_name','lateral_receiving_yards','lateral_rusher_
player_id','lateral_rusher_player_name','lateral_rushing_yards','interception_player_id','interception_player_name','lateral_interception_player_id','lateral_interception_player_name','tackle_for_loss_1_player_id','tackle_for_loss_1_player_name','qb_hit_1_player_id','qb_hit_1_player_name','forced_fumble_player_1_team','forced_fumble_player_1_player_id','forced_fumble_player_1_player_name','forced_fumble_player_2_team','forced_fumble_player_2_player_id','forced_fumble_player_2_player_name','solo_tackle_1_team','solo_tackle_2_team','solo_tackle_1_player_id','solo_tackle_2_player_id','solo_tackle_1_player_name','solo_tackle_2_player_name','assist_tackle_1_player_id','assist_tackle_1_player_name','assist_tackle_1_team','assist_tackle_2_player_id','assist_tackle_2_player_name','assist_tackle_2_team','tackle_with_assist','tackle_with_assist_1_player_id','tackle_with_assist_1_player_name','tackle_with_assist_1_team','pass_defense_1_player_id','pass_defense_1_player_name','pass_defense_2_player_id','pass_defense_2_player_name','fumbled_1_team','fumbled_1_player_id','fumbled_1_player_name','fumbled_2_player_id','fumbled_2_player_name','fumbled_2_team','fumble_recovery_1_team','fumble_recovery_1_yards','fumble_recovery_1_player_id','fumble_recovery_1_player_name','fumble_recovery_2_team','fumble_recovery_2_yards','fumble_recovery_2_player_id','fumble_recovery_2_player_name','return_team','return_yards','penalty_team','penalty_player_id','penalty_player_name','penalty_yards','replay_or_challenge','replay_or_challenge_result','penalty_type','safety_player_name','safety_player_id','series','series_success','series_result','start_time','time_of_day','stadium','weather','nfl_api_id','play_clock','play_type_nfl','end_clock_time','end_yard_line','fixed_drive','fixed_drive_result','drive_real_start_time','drive_time_of_possession','drive_first_downs','drive_inside20','drive_ended_with_score','drive_quarter_start','drive_quarter_end','drive_yards_penalized','drive_end_transition','dri
ve_start_transition','drive_game_clock_start','drive_game_clock_end','drive_start_yard_line','drive_end_yard_line','drive_play_id_started','drive_play_id_ended','away_score','home_score','location','result','total','home_coach','away_coach','stadium_id','game_stadium','success','passer','passer_jersey_number','rusher','rusher_jersey_number','receiver','receiver_jersey_number','pass','rush','first_down','passer_id','rusher_id','receiver_id','name','jersey_number','id','fantasy_player_name','fantasy_player_id','fantasy','fantasy_id','out_of_bounds','home_opening_kickoff','qb_epa','xyac_epa','xyac_mean_yardage','xyac_median_yardage','xyac_success','xyac_fd','xpass','pass_oe','cp','cpoe','pass_length','pass_location','run_location','run_gap','pass_attempt','game_id','home_team','away_team','sp','yards_gained','home_timeouts_remaining','away_timeouts_remaining','order_sequence','play_id','posteam_type','epa','turnover','temp','wind','roof','effct_play','negative_run','negative_pass','big_play_run','big_play_pass','season','posteam_season','rush_attempt','nflverse_game_id','possession_team','players_on_play','offense_players','defense_players'], axis=1, inplace=True)
# NOTE(review): the names on the next line do not appear in the drop list above -
# presumably columns deliberately left in the dataset; confirm.
# 'offense_personnel', 'defense_personnel', 'number_of_pass_rushers','n_offense','n_defense','gameday','weekday','gametime','home_rest', 'away_rest'

df3.shape

(51724, 129)

In [155]:
# Columns that still contain at least one missing value
has_missing = df3.isnull().any()
null_columns = df3.columns[has_missing.to_numpy()]

# Print missing values counts for null_columns
print('Columns with missing values:')
print(df3[null_columns].isnull().sum())

Columns with missing values:
number_of_pass_rushers    21535
dtype: int64


In [156]:
# Remove every column that still contains a missing value, then report what was removed
df3 = df3.dropna(axis='columns')
print('Dropping the following columns with missing values:')
print(null_columns)
print('Remaining column count:', df3.shape[1])

Dropping the following columns with missing values:
Index(['number_of_pass_rushers'], dtype='object')
Remaining column count: 128


In [157]:
# Independent (deep) copy of df3; the binary and multi-class datasets below are both derived from df4
df4 = df3.copy(deep=True)

## VIF exclusion criteria

## Create binary classification dataset

In [158]:
# Show every column name in df4 as a plain Python list
df4.columns.tolist()

['week',
 'posteam',
 'defteam',
 'yardline_100',
 'quarter_seconds_remaining',
 'half_seconds_remaining',
 'game_seconds_remaining',
 'game_half',
 'qtr',
 'down',
 'goal_to_go',
 'ydstogo',
 'play_type',
 'shotgun',
 'no_huddle',
 'posteam_timeouts_remaining',
 'defteam_timeouts_remaining',
 'posteam_score',
 'defteam_score',
 'score_differential',
 'no_score_prob',
 'fg_prob',
 'safety_prob',
 'td_prob',
 'ep',
 'wp',
 'drive_play_count',
 'spread_line',
 'total_line',
 'div_game',
 'surface',
 'play',
 'offense_formation',
 'offense_personnel',
 'defenders_in_box',
 'defense_personnel',
 'n_offense',
 'n_defense',
 'qb_rank',
 'qb_pts',
 'qb_plays',
 'qb_qbr_raw',
 'qb_qbr',
 'qb_pass',
 'qb_run',
 'play_sequence_game',
 'play_sequence_series',
 'play_type_detail',
 'run_formation',
 'dtg_99to95',
 'dtg_94to90',
 'dtg_40to31',
 'dtg_30to21',
 'dtg_20to11',
 'dtg_10to06',
 'dtg_05to01',
 'prev1_big_play_pass',
 'prev2_big_play_pass',
 'prev3_big_play_pass',
 'drive_big_play_pass_pcn

In [159]:
# Binary classifier dataset: df4 without the detailed play-type label,
# which this classifier does not use (drop returns a new frame, df4 is untouched)
df5 = df4.drop(columns=['play_type_detail'])

# Write the binary classifier dataframe to CSV (header row, no index column)
# NOTE(review): absolute, machine-specific output path -- consider a configurable output directory
binary_csv_path = r'/Users/ttas2/Documents/Python/nfl-machine-learning-models/output_files/nfl_post_processing_run_pass_classification_data.csv'
df5.to_csv(binary_csv_path, index=False, header=True)

df5.shape

(51724, 127)

## Create multi-class classification dataset

In [160]:
# Multi-class dataset starts from df4 minus the 'play_type' column,
# which is not used by this classifier (drop returns a new frame, df4 is untouched)
df6 = df4.drop(columns=['play_type'])
df6.shape

(51724, 127)

In [161]:
# Keep only the rows whose detailed play type is not a sack
is_not_sack = df6['play_type_detail'].ne('pass_sack')
df6 = df6[is_not_sack]
df6.shape

(51724, 127)

In [162]:
# Remove passes labelled only as 'pass' (i.e. without a depth designation)
has_pass_depth = df6['play_type_detail'].ne('pass')
df6 = df6[has_pass_depth]
df6.shape

(51721, 127)

In [163]:
# Remove runs labelled only as 'run' (i.e. without a direction/landmark designation)
has_run_direction = df6['play_type_detail'].ne('run')
df6 = df6[has_run_direction]
df6.shape

(51701, 127)

In [164]:
# Promote the detailed label to the target: copy it into a 'play_type' column,
# then remove the now-duplicate 'play_type_detail' column
df6 = (
    df6
    .assign(play_type=df6['play_type_detail'])
    .drop(columns=['play_type_detail'])
)

# Relative frequency of each target class
counts = df6['play_type'].value_counts(normalize=True)
counts

play_type
short      0.409799
inside     0.217114
outside    0.202066
deep       0.171022
Name: proportion, dtype: float64

In [165]:
# Write the multi-class play classification dataframe to CSV (header row, no index column)
# NOTE(review): absolute, machine-specific output path -- consider a configurable output directory
multiclass_csv_path = r'/Users/ttas2/Documents/Python/nfl-machine-learning-models/output_files/nfl_post_processing_multiclass_play_classification_data.csv'
df6.to_csv(multiclass_csv_path, index=False, header=True)

df6.shape

(51701, 127)