# EDA for nfl_data_py package

## 1.0 Import Data

In [1]:
import nfl_data_py as nfl
import pandas as pd

## 2.0 Play Data

### 2.1 Load Data

In [2]:
years = [2020, 2021, 2022]
pbp = nfl.import_pbp_data(years, cache=False, alt_path=None)

2020 done.
2021 done.
2022 done.
Downcasting floats.


### 2.2 Data size

In [3]:
print(f"Shape: {pbp.shape}")

Shape: (149373, 384)


### 2.3 Columns

In [4]:
pbp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149373 entries, 0 to 149372
Columns: 384 entries, play_id to n_defense
dtypes: float64(204), int32(6), int64(1), object(173)
memory usage: 434.2+ MB


In [5]:
for col in nfl.see_pbp_cols():
    print(col)

play_id
game_id
old_game_id
home_team
away_team
season_type
week
posteam
posteam_type
defteam
side_of_field
yardline_100
game_date
quarter_seconds_remaining
half_seconds_remaining
game_seconds_remaining
game_half
quarter_end
drive
sp
qtr
down
goal_to_go
time
yrdln
ydstogo
ydsnet
desc
play_type
yards_gained
shotgun
no_huddle
qb_dropback
qb_kneel
qb_spike
qb_scramble
pass_length
pass_location
air_yards
yards_after_catch
run_location
run_gap
field_goal_result
kick_distance
extra_point_result
two_point_conv_result
home_timeouts_remaining
away_timeouts_remaining
timeout
timeout_team
td_team
td_player_name
td_player_id
posteam_timeouts_remaining
defteam_timeouts_remaining
total_home_score
total_away_score
posteam_score
defteam_score
score_differential
posteam_score_post
defteam_score_post
score_differential_post
no_score_prob
opp_fg_prob
opp_safety_prob
opp_td_prob
fg_prob
safety_prob
td_prob
extra_point_prob
two_point_conversion_prob
ep
epa
total_home_epa
total_away_epa
total_home_rush_epa


### 2.4 Columns of Importance
In relation to points scored by a team

In [6]:
game_info_cols = [
    'play_id', 'game_id', 'old_game_id', 'home_team', 'away_team', 'season_type', 'week', 'season',
    'posteam', 'defteam',
]

game_state_cols = [
    'game_seconds_remaining', 'qtr', 'down', 'ydstogo', 'yardline_100', 'score_differential'
]

play_result_cols = [
    'yards_gained'
]

target = 'play_type'

cols = game_info_cols + game_state_cols + play_result_cols +  [target]

### 2.4 Random sample

In [7]:
pbp[cols].sample(5)

Unnamed: 0,play_id,game_id,old_game_id,home_team,away_team,season_type,week,season,posteam,defteam,game_seconds_remaining,qtr,down,ydstogo,yardline_100,score_differential,yards_gained,play_type
72270,2956.0,2021_09_MIN_BAL,2021110700,BAL,MIN,REG,9,2021,BAL,MIN,1176.0,3.0,1.0,10.0,82.0,-7.0,1.0,run
79365,3400.0,2021_12_LV_DAL,2021112501,DAL,LV,REG,12,2021,LV,DAL,900.0,4.0,3.0,15.0,47.0,8.0,0.0,pass
8398,139.0,2020_03_TEN_MIN,2020092703,MIN,TEN,REG,3,2020,TEN,MIN,3508.0,1.0,1.0,20.0,36.0,0.0,-1.0,pass
54686,2501.0,2021_03_BAL_DET,2021092602,DET,BAL,REG,3,2021,DET,BAL,1636.0,3.0,1.0,10.0,75.0,-13.0,5.0,pass
39732,5046.0,2020_15_PHI_ARI,2020122011,ARI,PHI,REG,15,2020,PHI,ARI,21.0,4.0,2.0,10.0,49.0,-7.0,18.0,pass


In [8]:
pbp.loc[pbp['game_date'] == '2023-02-12', 'game_id'].unique()

array(['2022_22_KC_PHI'], dtype=object)

### 2.5 Game ID == '2022_22_KC_PHI'

In [9]:
game_id = '2022_22_KC_PHI'
game_condition = pbp['game_id'] == game_id

pd.set_option('display.max_columns', None)
pbp.loc[game_condition, cols].sort_values('play_id').head(15)

Unnamed: 0,play_id,game_id,old_game_id,home_team,away_team,season_type,week,season,posteam,defteam,game_seconds_remaining,qtr,down,ydstogo,yardline_100,score_differential,yards_gained,play_type
149195,1.0,2022_22_KC_PHI,2023021200,PHI,KC,POST,22,2022,,,3600.0,1.0,,0.0,,,,
149196,41.0,2022_22_KC_PHI,2023021200,PHI,KC,POST,22,2022,PHI,KC,3600.0,1.0,,0.0,35.0,0.0,0.0,kickoff
149197,56.0,2022_22_KC_PHI,2023021200,PHI,KC,POST,22,2022,PHI,KC,3600.0,1.0,1.0,10.0,75.0,0.0,-1.0,run
149198,86.0,2022_22_KC_PHI,2023021200,PHI,KC,POST,22,2022,PHI,KC,3566.0,1.0,2.0,11.0,76.0,0.0,6.0,pass
149199,110.0,2022_22_KC_PHI,2023021200,PHI,KC,POST,22,2022,PHI,KC,3524.0,1.0,3.0,5.0,70.0,0.0,12.0,pass
149200,139.0,2022_22_KC_PHI,2023021200,PHI,KC,POST,22,2022,PHI,KC,3496.0,1.0,1.0,10.0,58.0,0.0,0.0,pass
149201,161.0,2022_22_KC_PHI,2023021200,PHI,KC,POST,22,2022,PHI,KC,3490.0,1.0,2.0,10.0,58.0,0.0,11.0,run
149202,192.0,2022_22_KC_PHI,2023021200,PHI,KC,POST,22,2022,PHI,KC,3451.0,1.0,1.0,10.0,47.0,0.0,13.0,pass
149203,216.0,2022_22_KC_PHI,2023021200,PHI,KC,POST,22,2022,PHI,KC,3422.0,1.0,1.0,10.0,34.0,0.0,23.0,pass
149204,240.0,2022_22_KC_PHI,2023021200,PHI,KC,POST,22,2022,PHI,KC,3388.0,1.0,1.0,10.0,11.0,0.0,9.0,run


## 3.0 Baseline Classifier

In [10]:
import numpy as np
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

### 3.1 Split Data

In [11]:
# Split data
pass_or_rush_condition = pbp['play_type'].isin(['pass', 'run'])
y = pbp.loc[pass_or_rush_condition, target]
label_encoder = LabelEncoder().fit(y)
y_enc = label_encoder.fit_transform(y)

print(f"y shape: {y.shape}")
print(f"Targets: {y.unique()}")

y shape: (105434,)
Targets: ['pass' 'run']


### 3.2 Random Model

In [12]:
y_rand = np.random.randint(0, 2, len(y))
print(classification_report(y_enc, y_rand))

              precision    recall  f1-score   support

           0       0.59      0.50      0.54     61587
           1       0.42      0.51      0.46     43847

    accuracy                           0.50    105434
   macro avg       0.50      0.50      0.50    105434
weighted avg       0.52      0.50      0.51    105434



## 4.0 Best Splits

In [13]:
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder

### 4.1 Split Data

In [14]:
# Split data
pass_or_rush_condition = pbp['play_type'].isin(['pass', 'run'])
X = pbp.loc[pass_or_rush_condition, game_state_cols]
y = pbp.loc[pass_or_rush_condition, target]
label_encoder = LabelEncoder().fit(y)
y_enc = label_encoder.fit_transform(y)

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"Features: {list(X.columns)}")
print(f"Targets: {y.unique()}")

X shape: (105434, 6)
y shape: (105434,)
Features: ['game_seconds_remaining', 'qtr', 'down', 'ydstogo', 'yardline_100', 'score_differential']
Targets: ['pass' 'run']


### 4.2 Decision Tree

In [15]:
# Hypertune parameters
params = {
    'max_depth': [7, 9, 11, 13, 15, 17, 19, 21, 23, 25],
    'min_samples_leaf': [10, 25, 50, 75, 100, 125, 150, 175, 200, 225, 250, 275],
    'max_leaf_nodes': [50, 75, 100, 125, 150],
    'random_state': [42]
}

# Randomized search w/ cross validation
dt = DecisionTreeClassifier()
dt_cv = RandomizedSearchCV(
    dt, 
    params, 
    cv=5, 
    n_jobs=-1, 
    n_iter=50,
    verbose=1, 
    scoring=['accuracy', 'recall', 'precision', 'f1'],
    refit='f1'
)
dt_cv.fit(X, y_enc)
print(f"Best score: {dt_cv.best_score_}")
dt_cv.best_estimator_

Fitting 5 folds for each of 50 candidates, totalling 250 fits


Best score: 0.6228002299029154


## 5.0 Priors

In [87]:
# Grab needed columns
df = pbp.loc[pass_or_rush_condition, game_state_cols + [target, 'posteam', 'defteam']]

# Convert target to numeric columns
df['pass'] = df[target].map({'pass': 1, 'run': 0})
df['run'] = df[target].map({'pass': 0, 'run': 1})

# Apply embedding to bin columns
df['bin'] = dt_cv.best_estimator_.apply(df[game_state_cols])

# Groupby bin, poss and def;  rolling mean over 100 plays
for play_type in ['pass', 'run']:
    for team in ['posteam', 'defteam']:
        df[f'prior__gs_{team}_{play_type}___100'] = (
            df.groupby(['bin', team])[[play_type]]
            .rolling(window=1000, min_periods=1)
            .mean()
            .reset_index(drop=True)
        )

print(f"Size of df: {df.shape}")
print(f"Number of NA: \n{df.isna().sum()}")

# Fill NA with 0.5
df = df.fillna(0.5)

df.head()

Size of df: (105434, 16)
Number of NA: 
game_seconds_remaining              0
qtr                                 0
down                              424
ydstogo                             0
yardline_100                        0
score_differential                  0
play_type                           0
posteam                             0
defteam                             0
pass                                0
run                                 0
bin                                 0
prior__gs_posteam_pass___100    31041
prior__gs_defteam_pass___100    31041
prior__gs_posteam_run___100     31041
prior__gs_defteam_run___100     31041
dtype: int64


Unnamed: 0,game_seconds_remaining,qtr,down,ydstogo,yardline_100,score_differential,play_type,posteam,defteam,pass,run,bin,prior__gs_posteam_pass___100,prior__gs_defteam_pass___100,prior__gs_posteam_run___100,prior__gs_defteam_run___100
2,3600.0,1.0,1.0,10.0,75.0,0.0,pass,SF,ARI,1,0,297,0.0,0.333333,1.0,0.666667
3,3582.0,1.0,1.0,10.0,55.0,0.0,run,SF,ARI,0,1,297,0.0,0.25,1.0,0.75
4,3539.0,1.0,1.0,10.0,41.0,0.0,run,SF,ARI,0,1,297,0.0,0.4,1.0,0.6
5,3501.0,1.0,2.0,8.0,39.0,0.0,run,SF,ARI,0,1,57,0.0,0.333333,1.0,0.666667
6,3459.0,1.0,3.0,14.0,45.0,0.0,pass,SF,ARI,1,0,205,0.142857,0.428571,0.857143,0.571429


## 6.0 Model

In [88]:
from xgboost import XGBClassifier

### 6.1 Split Data

In [130]:
# Apply embeddings for probabilities
df['gs_pass_proba'] = dt_cv.best_estimator_.predict_proba(df[game_state_cols])[:, 1]

# Split data
priors = [
    f'prior__gs_{team}_{play_type}___100' 
    for team in ['posteam', 'defteam'] 
    for play_type in ['pass']
]
X = df.loc[:, game_state_cols + priors]
y = df.loc[:, target]
label_encoder = LabelEncoder().fit(y)
y_enc = label_encoder.fit_transform(y)

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"Features: {list(X.columns)}")
print(f"Targets: {y.unique()}")

X shape: (105434, 8)
y shape: (105434,)
Features: ['game_seconds_remaining', 'qtr', 'down', 'ydstogo', 'yardline_100', 'score_differential', 'prior__gs_posteam_pass___100', 'prior__gs_defteam_pass___100']
Targets: ['pass' 'run']


In [131]:
xgb_clf = XGBClassifier()

# Hyperparameter to search
param_dist = {
    'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450, 500],            
    'learning_rate': [0.001, 0.01, 0.05, .1],          
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],                     
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.5, 1, 1.5, 2, 5],
    'alpha': [0, 0.5, 1, 1.5, 2, 5],
    'max_leaves': [100, 200, 300, 400, 500],
}

# Step 4: Create RandomizedSearchCV and fit the data
random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_dist,
    n_iter=10,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
    refit='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

random_search.fit(X, y_enc)

# Step 5: Print the best hyperparameters and their corresponding mean cross-validated score
print("Best Hyperparameters: ", random_search.best_params_)
print("Best Mean Cross-validated Score: ", random_search.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Hyperparameters:  {'subsample': 0.7, 'n_estimators': 150, 'max_leaves': 200, 'max_depth': 6, 'learning_rate': 0.05, 'gamma': 5, 'colsample_bytree': 1.0, 'alpha': 0}
Best Mean Cross-validated Score:  0.6334406093702158


### 6.2 Feature Importance

In [132]:
feature_importance = random_search.best_estimator_.feature_importances_
feature_names = random_search.best_estimator_.get_booster().feature_names

feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
feature_importance_df.sort_values(by='Importance', ascending=False)

Unnamed: 0,Feature,Importance
2,down,0.319681
3,ydstogo,0.297077
0,game_seconds_remaining,0.101619
5,score_differential,0.099831
1,qtr,0.088295
4,yardline_100,0.045513
6,prior__gs_posteam_pass___100,0.024199
7,prior__gs_defteam_pass___100,0.023785


## 7.0 Save Model

In [133]:
import pickle

In [134]:
with open('/home/tylerengland/NFL/backend/models/pass_run_prob/pass_run_prob__00001.pkl', 'wb') as file:
    pickle.dump(random_search.best_estimator_, file)