# EDA - Yards Gained From Passing Play

## 1.0 Import Data

In [1]:
import nfl_data_py as nfl
import pandas as pd

## 2.0 Play Data

### 2.1 Load Data

In [2]:
years = [2020, 2021, 2022]
pbp = nfl.import_pbp_data(years, cache=False, alt_path=None)

2020 done.
2021 done.
2022 done.
Downcasting floats.


### 2.2 Data size

In [3]:
print(f"Shape: {pbp.shape}")

Shape: (149373, 384)


### 2.3 Columns

In [4]:
pbp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149373 entries, 0 to 149372
Columns: 384 entries, play_id to n_defense
dtypes: float64(204), int32(6), int64(1), object(173)
memory usage: 434.2+ MB


In [5]:
for col in nfl.see_pbp_cols():
    print(col)

play_id
game_id
old_game_id
home_team
away_team
season_type
week
posteam
posteam_type
defteam
side_of_field
yardline_100
game_date
quarter_seconds_remaining
half_seconds_remaining
game_seconds_remaining
game_half
quarter_end
drive
sp
qtr
down
goal_to_go
time
yrdln
ydstogo
ydsnet
desc
play_type
yards_gained
shotgun
no_huddle
qb_dropback
qb_kneel
qb_spike
qb_scramble
pass_length
pass_location
air_yards
yards_after_catch
run_location
run_gap
field_goal_result
kick_distance
extra_point_result
two_point_conv_result
home_timeouts_remaining
away_timeouts_remaining
timeout
timeout_team
td_team
td_player_name
td_player_id
posteam_timeouts_remaining
defteam_timeouts_remaining
total_home_score
total_away_score
posteam_score
defteam_score
score_differential
posteam_score_post
defteam_score_post
score_differential_post
no_score_prob
opp_fg_prob
opp_safety_prob
opp_td_prob
fg_prob
safety_prob
td_prob
extra_point_prob
two_point_conversion_prob
ep
epa
total_home_epa
total_away_epa
total_home_rush_epa


### 2.4 Columns of Importance
In relation to points scored by a team

In [6]:
game_info_cols = [
    'play_id', 'game_id', 'old_game_id', 'home_team', 'away_team', 'season_type', 'week', 'season',
    'posteam', 'defteam',
]

game_state_cols = [
    'game_seconds_remaining', 'qtr', 'down', 'ydstogo', 'yardline_100', 'score_differential'
]

play_result_cols = [
    'play_type', 'epa', 'ep'
]

target = 'yards_gained'

cols = game_info_cols + game_state_cols + play_result_cols + [target]

### 2.4 Random sample

In [7]:
pbp.loc[pbp['play_type'] == 'pass', cols].sample(5)

Unnamed: 0,play_id,game_id,old_game_id,home_team,away_team,season_type,week,season,posteam,defteam,game_seconds_remaining,qtr,down,ydstogo,yardline_100,score_differential,play_type,epa,ep,yards_gained
103023,955.0,2022_02_HOU_DEN,2022091810,DEN,HOU,REG,2,2022,HOU,DEN,2931.0,1.0,2.0,3.0,39.0,-3.0,pass,-0.300755,3.126375,2.0
461,1831.0,2020_01_CLE_BAL,2020091301,BAL,CLE,REG,1,2020,CLE,BAL,1850.0,2.0,3.0,2.0,23.0,-11.0,pass,-1.124036,3.533631,0.0
136321,3071.0,2022_15_ATL_NO,2022121809,NO,ATL,REG,15,2022,NO,ATL,723.0,4.0,3.0,8.0,87.0,11.0,pass,-2.118582,-0.672301,-9.0
41720,415.0,2020_16_IND_PIT,2020122709,PIT,IND,REG,16,2020,PIT,IND,3242.0,1.0,2.0,11.0,75.0,-7.0,pass,0.360012,0.600582,7.0
76738,3177.0,2021_11_IND_BUF,2021112100,BUF,IND,REG,11,2021,BUF,IND,802.0,4.0,1.0,10.0,10.0,-31.0,pass,-0.464854,5.14033,1.0


In [8]:
pbp.loc[pbp['game_date'] == '2023-02-12', 'game_id'].unique()

array(['2022_22_KC_PHI'], dtype=object)

### 2.5 Game ID == '2022_22_KC_PHI'

In [9]:
game_id = '2022_22_KC_PHI'
game_condition = pbp['game_id'] == game_id

pd.set_option('display.max_columns', None)
pbp.loc[game_condition, cols].sort_values('play_id').head(15)

Unnamed: 0,play_id,game_id,old_game_id,home_team,away_team,season_type,week,season,posteam,defteam,game_seconds_remaining,qtr,down,ydstogo,yardline_100,score_differential,play_type,epa,ep,yards_gained
149195,1.0,2022_22_KC_PHI,2023021200,PHI,KC,POST,22,2022,,,3600.0,1.0,,0.0,,,,0.0,0.930688,
149196,41.0,2022_22_KC_PHI,2023021200,PHI,KC,POST,22,2022,PHI,KC,3600.0,1.0,,0.0,35.0,0.0,kickoff,0.0,0.930688,0.0
149197,56.0,2022_22_KC_PHI,2023021200,PHI,KC,POST,22,2022,PHI,KC,3600.0,1.0,1.0,10.0,75.0,0.0,run,-1.014878,0.930688,-1.0
149198,86.0,2022_22_KC_PHI,2023021200,PHI,KC,POST,22,2022,PHI,KC,3566.0,1.0,2.0,11.0,76.0,0.0,pass,0.210721,-0.08419,6.0
149199,110.0,2022_22_KC_PHI,2023021200,PHI,KC,POST,22,2022,PHI,KC,3524.0,1.0,3.0,5.0,70.0,0.0,pass,1.893867,0.126531,12.0
149200,139.0,2022_22_KC_PHI,2023021200,PHI,KC,POST,22,2022,PHI,KC,3496.0,1.0,1.0,10.0,58.0,0.0,pass,-0.515964,2.020398,0.0
149201,161.0,2022_22_KC_PHI,2023021200,PHI,KC,POST,22,2022,PHI,KC,3490.0,1.0,2.0,10.0,58.0,0.0,run,1.347135,1.504434,11.0
149202,192.0,2022_22_KC_PHI,2023021200,PHI,KC,POST,22,2022,PHI,KC,3451.0,1.0,1.0,10.0,47.0,0.0,pass,0.828182,2.851568,13.0
149203,216.0,2022_22_KC_PHI,2023021200,PHI,KC,POST,22,2022,PHI,KC,3422.0,1.0,1.0,10.0,34.0,0.0,pass,1.53077,3.67975,23.0
149204,240.0,2022_22_KC_PHI,2023021200,PHI,KC,POST,22,2022,PHI,KC,3388.0,1.0,1.0,10.0,11.0,0.0,run,0.963857,5.21052,9.0


## 3.0 Baseline Regressor

### 3.1 Split Data

In [10]:
# Split data
pass_condition = pbp['play_type'] == 'pass'
y = pbp.loc[pass_condition, target]

print(f"y shape: {y.shape}")

y shape: (61587,)


### 3.2 Random Model

In [11]:
import random
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

predictions = []
for _ in range(len(y)):
    defensive_play = random.choice(["run", "pass"])
    if defensive_play == "pass":
        predictions.append(random.randint(-5, 5))
    else:
        predictions.append(random.randint(0, 15))


print(f"R2: {r2_score(y, predictions)}")
print(f"MAE: {mean_absolute_error(y, predictions)}")
print(f"RMSE: {mean_squared_error(y, predictions, squared=True)}")

R2: -0.38018759946285163
MAE: 8.189601701657818
RMSE: 132.66609836491466


## 4.0 Best Splits

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder

### 4.1 Split Data

In [13]:
# Bin target
bins = {
    lambda x: x < 0: "loss of yards",
    lambda x: x == 0: "no gain",
    lambda x: x > 0 and x <= 10: "short pass",
    lambda x: x > 10 and x <= 20: "medium pass",
    lambda x: x > 20: "long pass",
}
pbp[f"{target}_bin"] = pbp[target].apply(lambda x: next((v for k, v in bins.items() if k(x)), 0))

# Split data
pass_condition = pbp['play_type'] == "pass"
X = pbp.loc[pass_condition, game_state_cols]
y = pbp.loc[pass_condition, f"{target}_bin"]
label_encoder = LabelEncoder().fit(y)
y_enc = label_encoder.fit_transform(y)

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"Features: {list(X.columns)}")
print(f"Targets: {y.unique()}")

X shape: (61587, 6)
y shape: (61587,)
Features: ['game_seconds_remaining', 'qtr', 'down', 'ydstogo', 'yardline_100', 'score_differential']
Targets: ['short pass' 'medium pass' 'no gain' 'long pass' 'loss of yards']


### 4.2 Decision Tree

In [14]:
# Hypertune parameters
params = {
    'max_depth': [7, 9, 11, 13, 15, 17, 19, 21, 23, 25],
    'min_samples_leaf': [10, 25, 50, 75, 100, 125, 150, 175, 200, 225, 250, 275],
    'max_leaf_nodes': [50, 75, 100, 125, 150],
    'random_state': [42]
}

# Randomized search w/ cross validation
dt = DecisionTreeClassifier()
dt_cv = RandomizedSearchCV(
    dt, 
    params, 
    cv=5, 
    n_jobs=-1, 
    n_iter=50,
    verbose=1, 
    scoring=['accuracy', 'f1_weighted'],
    refit='f1_weighted'
)
dt_cv.fit(X, y_enc)
print(f"Best score: {dt_cv.best_score_}")
dt_cv.best_estimator_

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best score: 0.29909107943617524


## 5.0 Priors

In [15]:
# Grab needed columns
pass_condition = pbp['play_type'] == 'pass'
df = pbp.loc[pass_condition, game_state_cols + [target, 'posteam', 'defteam']]

# Apply embedding to bin columns
df['bin'] = dt_cv.best_estimator_.apply(df[game_state_cols])

# Groupby bin, poss and def
windows = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
for team in ['posteam', 'defteam']:
    for window in windows:
        df[f'prior__gs_{team}_pass_yds_gained___{window}'] = (
            df.groupby(['bin', team])[[target]]
            .rolling(window=window, min_periods=1)
            .mean()
            .reset_index(drop=True)
        )
        df[f'prior__gs_{team}_pass_yds_gained___{window}'].fillna(0, inplace=True)

print(f"Size of df: {df.shape}")
print(f"Number of NA: \n{df.isna().sum()}")

df['down'].fillna(4, inplace=True)

Size of df: (61587, 30)
Number of NA: 
game_seconds_remaining                       0
qtr                                          0
down                                       307
ydstogo                                      0
yardline_100                                 0
score_differential                           0
yards_gained                                 0
posteam                                      0
defteam                                      0
bin                                          0
prior__gs_posteam_pass_yds_gained___10       0
prior__gs_posteam_pass_yds_gained___20       0
prior__gs_posteam_pass_yds_gained___30       0
prior__gs_posteam_pass_yds_gained___40       0
prior__gs_posteam_pass_yds_gained___50       0
prior__gs_posteam_pass_yds_gained___60       0
prior__gs_posteam_pass_yds_gained___70       0
prior__gs_posteam_pass_yds_gained___80       0
prior__gs_posteam_pass_yds_gained___90       0
prior__gs_posteam_pass_yds_gained___100      0
prior__gs_defteam_pas

In [16]:
df.head()

Unnamed: 0,game_seconds_remaining,qtr,down,ydstogo,yardline_100,score_differential,yards_gained,posteam,defteam,bin,prior__gs_posteam_pass_yds_gained___10,prior__gs_posteam_pass_yds_gained___20,prior__gs_posteam_pass_yds_gained___30,prior__gs_posteam_pass_yds_gained___40,prior__gs_posteam_pass_yds_gained___50,prior__gs_posteam_pass_yds_gained___60,prior__gs_posteam_pass_yds_gained___70,prior__gs_posteam_pass_yds_gained___80,prior__gs_posteam_pass_yds_gained___90,prior__gs_posteam_pass_yds_gained___100,prior__gs_defteam_pass_yds_gained___10,prior__gs_defteam_pass_yds_gained___20,prior__gs_defteam_pass_yds_gained___30,prior__gs_defteam_pass_yds_gained___40,prior__gs_defteam_pass_yds_gained___50,prior__gs_defteam_pass_yds_gained___60,prior__gs_defteam_pass_yds_gained___70,prior__gs_defteam_pass_yds_gained___80,prior__gs_defteam_pass_yds_gained___90,prior__gs_defteam_pass_yds_gained___100
2,3600.0,1.0,1.0,10.0,75.0,0.0,5.0,SF,ARI,50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5
6,3459.0,1.0,3.0,14.0,45.0,0.0,11.0,SF,ARI,108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
9,3410.0,1.0,1.0,10.0,75.0,-3.0,3.0,ARI,SF,50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,3384.0,1.0,2.0,7.0,72.0,-3.0,7.0,ARI,SF,26,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,3360.0,1.0,1.0,10.0,65.0,-3.0,0.0,ARI,SF,118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0


## 6.0 Model

In [17]:
from xgboost import XGBRegressor

### 6.1 Split Data

In [18]:
# Split data
priors = [
    f'prior__gs_{team}_pass_yds_gained___{window}' 
    for team in ['posteam', 'defteam']
    for window in windows
]
X = df.loc[:, priors]
y = df.loc[:, target]
label_encoder = LabelEncoder().fit(y)

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"Features: {list(X.columns)}")

X shape: (61587, 20)
y shape: (61587,)
Features: ['prior__gs_posteam_pass_yds_gained___10', 'prior__gs_posteam_pass_yds_gained___20', 'prior__gs_posteam_pass_yds_gained___30', 'prior__gs_posteam_pass_yds_gained___40', 'prior__gs_posteam_pass_yds_gained___50', 'prior__gs_posteam_pass_yds_gained___60', 'prior__gs_posteam_pass_yds_gained___70', 'prior__gs_posteam_pass_yds_gained___80', 'prior__gs_posteam_pass_yds_gained___90', 'prior__gs_posteam_pass_yds_gained___100', 'prior__gs_defteam_pass_yds_gained___10', 'prior__gs_defteam_pass_yds_gained___20', 'prior__gs_defteam_pass_yds_gained___30', 'prior__gs_defteam_pass_yds_gained___40', 'prior__gs_defteam_pass_yds_gained___50', 'prior__gs_defteam_pass_yds_gained___60', 'prior__gs_defteam_pass_yds_gained___70', 'prior__gs_defteam_pass_yds_gained___80', 'prior__gs_defteam_pass_yds_gained___90', 'prior__gs_defteam_pass_yds_gained___100']


In [19]:
xgb_clf = XGBRegressor()

# Hyperparameter to search
param_dist = {
    'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450, 500],            
    'learning_rate': [0.001, 0.01, 0.05, .1],          
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],                     
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.5, 1, 1.5, 2, 5],
    'alpha': [0, 0.5, 1, 1.5, 2, 5],
    'max_leaves': [100, 200, 300, 400, 500],
}

# Step 4: Create RandomizedSearchCV and fit the data
random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_dist,
    n_iter=10,
    scoring=['r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error'],
    refit='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

random_search.fit(X, y_enc)

# Step 5: Print the best hyperparameters and their corresponding mean cross-validated score
print("Best Hyperparameters: ", random_search.best_params_)
print("Best Mean Cross-validated Score: ", random_search.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


Best Hyperparameters:  {'subsample': 0.7, 'n_estimators': 150, 'max_leaves': 400, 'max_depth': 3, 'learning_rate': 0.05, 'gamma': 0.5, 'colsample_bytree': 0.9, 'alpha': 0}
Best Mean Cross-validated Score:  -1.1974767536049906


### 6.2 Feature Importance

In [20]:
feature_importance = random_search.best_estimator_.feature_importances_
feature_names = random_search.best_estimator_.get_booster().feature_names

feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
feature_importance_df.sort_values(by='Importance', ascending=False)

Unnamed: 0,Feature,Importance
7,prior__gs_posteam_pass_yds_gained___80,0.066732
9,prior__gs_posteam_pass_yds_gained___100,0.065574
8,prior__gs_posteam_pass_yds_gained___90,0.064739
5,prior__gs_posteam_pass_yds_gained___60,0.057137
17,prior__gs_defteam_pass_yds_gained___80,0.055072
19,prior__gs_defteam_pass_yds_gained___100,0.052993
14,prior__gs_defteam_pass_yds_gained___50,0.052667
12,prior__gs_defteam_pass_yds_gained___30,0.049106
18,prior__gs_defteam_pass_yds_gained___90,0.048821
16,prior__gs_defteam_pass_yds_gained___70,0.048622


## 7.0 Save Model

In [21]:
import pickle

In [22]:
with open('/home/tylerengland/NFL/backend/models/pass_yards_gained/pass_yards_gained__00001.pkl', 'wb') as file:
    pickle.dump(random_search.best_estimator_, file)