# EDA - Play duration

## 1.0 Import Data

In [1]:
import nfl_data_py as nfl
import pandas as pd

## 2.0 Play Data

### 2.1 Load Data

In [2]:
years = [2020, 2021, 2022]
pbp = nfl.import_pbp_data(years, downcast=True, cache=False, alt_path=None)

2020 done.
2021 done.
2022 done.
Downcasting floats.


### 2.2 Data size

In [3]:
print(f"Shape: {pbp.shape}")

Shape: (149373, 384)


### 2.3 Columns

In [4]:
pbp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149373 entries, 0 to 149372
Columns: 384 entries, play_id to n_defense
dtypes: float64(204), int32(6), int64(1), object(173)
memory usage: 434.2+ MB


In [5]:
for col in nfl.see_pbp_cols():
    print(col)

play_id
game_id
old_game_id
home_team
away_team
season_type
week
posteam
posteam_type
defteam
side_of_field
yardline_100
game_date
quarter_seconds_remaining
half_seconds_remaining
game_seconds_remaining
game_half
quarter_end
drive
sp
qtr
down
goal_to_go
time
yrdln
ydstogo
ydsnet
desc
play_type
yards_gained
shotgun
no_huddle
qb_dropback
qb_kneel
qb_spike
qb_scramble
pass_length
pass_location
air_yards
yards_after_catch
run_location
run_gap
field_goal_result
kick_distance
extra_point_result
two_point_conv_result
home_timeouts_remaining
away_timeouts_remaining
timeout
timeout_team
td_team
td_player_name
td_player_id
posteam_timeouts_remaining
defteam_timeouts_remaining
total_home_score
total_away_score
posteam_score
defteam_score
score_differential
posteam_score_post
defteam_score_post
score_differential_post
no_score_prob
opp_fg_prob
opp_safety_prob
opp_td_prob
fg_prob
safety_prob
td_prob
extra_point_prob
two_point_conversion_prob
ep
epa
total_home_epa
total_away_epa
total_home_rush_epa


### 2.4 Columns of Importance
In relation to points scored by a team

In [6]:
pbp['play_type'].unique()

array([None, 'kickoff', 'pass', 'run', 'field_goal', 'punt',
       'extra_point', 'no_play', 'qb_spike', 'qb_kneel'], dtype=object)

In [11]:
game_info_cols = [
    'play_id', 'game_id', 'game_date', 'home_team', 'away_team', 'season_type', 'week', 'season',
    'posteam', 'defteam',
]

game_state_cols = [
    'game_seconds_remaining', 'qtr', 'down', 'ydstogo', 'yardline_100', 'score_differential'
]

# Play types
pbp['play_type_pass'] = pbp['play_type'].apply(lambda x: 1 if x == 'pass' else 0)
pbp['play_type_run'] = pbp['play_type'].apply(lambda x: 1 if x == 'run' else 0)
pbp['play_type_punt'] = pbp['play_type'].apply(lambda x: 1 if x == 'punt' else 0)
pbp['play_type_field_goal'] = pbp['play_type'].apply(lambda x: 1 if x == 'field_goal' else 0)
pbp['play_type_kickoff'] = pbp['play_type'].apply(lambda x: 1 if x == 'kickoff' else 0)
pbp['play_type_extra_point'] = pbp['play_type'].apply(lambda x: 1 if x == 'extra_point' else 0)
pbp['play_type_no_play'] = pbp['play_type'].apply(lambda x: 1 if x == 'no_play' else 0)
pbp['play_type_spike'] = pbp['play_type'].apply(lambda x: 1 if x == 'qb_kneel' else 0)
pbp['play_type_qb_kneel'] = pbp['play_type'].apply(lambda x: 1 if x == 'qb_kneel' else 0)

play_result_cols = [
    'yards_gained', 'play_type_pass', 'play_type_run', 'play_type_punt', 'play_type_field_goal',
    'play_type_kickoff', 'play_type_extra_point', 'play_type_no_play', 'play_type_spike', 'play_type_qb_kneel'
]

# Previous play outcomes
pbp['timeout_prev'] = pbp.groupby(['game_id'])['timeout'].shift(1)
pbp['out_of_bounds_prev'] = pbp.groupby(['game_id'])['out_of_bounds'].shift(1)
pbp['interception_prev'] = pbp.groupby(['game_id'])['interception'].shift(1)
pbp['fumble_prev'] = pbp.groupby(['game_id'])['fumble'].shift(1)
pbp['safety_prev'] = pbp.groupby(['game_id'])['safety'].shift(1)
pbp['touchdown_prev'] = pbp.groupby(['game_id'])['touchdown'].shift(1)

time_cols = [
    'timeout_prev', 'out_of_bounds_prev', 'interception_prev', 'fumble_prev', 'safety_prev', 'touchdown_prev',
]

# New column that takes the difference between game_seconds_remaining and previous game_seconds_remaining
pbp['play_duration'] = pbp.groupby(['game_id'])['game_seconds_remaining'].diff(-1)
pbp['play_duration'] = pbp['play_duration'].fillna(0)
pbp['play_duration'] = pbp['play_duration'].apply(lambda x: 0 if x in [-900, -600] else x)
target = 'play_duration'

cols = game_info_cols + game_state_cols + play_result_cols + time_cols + [target]

## 3.0 Split Data

In [12]:
# Split data by year
train_df = pbp.loc[pbp['season'] < 2022, :].copy()
test_df = pbp.loc[pbp['season'] >= 2022, :].copy()

# Bin target column
bins = [-1, 0, 5, 25, 35, 40, 45, 100]
train_df.loc[:, f"{target}_bin"] = pd.cut(
    train_df.loc[:, target], 
    bins=bins, 
    labels=False
)
test_df.loc[:, f"{target}_bin"] = pd.cut(
    test_df.loc[:, target], 
    bins=bins,
    labels=False
)

# Split data
cols = game_info_cols + game_state_cols + play_result_cols + time_cols
features = game_state_cols + play_result_cols + time_cols
X_train = train_df.loc[:, cols]
X_test = test_df.loc[:, cols]
y_train = train_df.loc[:, f"{target}_bin"]
y_test = test_df.loc[:, f"{target}_bin"]

# Fill missing values
X_train['down'].fillna(0, inplace=True)
X_test['down'].fillna(0, inplace=True)

# Convert game_date to datetime
X_train['game_date'] = pd.to_datetime(X_train['game_date'])
X_test['game_date'] = pd.to_datetime(X_test['game_date'])

print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")

Train size: 99226
Test size: 50147


## 4.0 Baseline Model

In [13]:
import random
from sklearn.metrics import classification_report

predictions = random.choices(y_test.unique(), k=len(y_test))

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.16      0.14      0.15      7874
           1       0.17      0.14      0.15      8313
           2       0.22      0.13      0.17     11655
           3       0.12      0.14      0.13      6109
           4       0.15      0.14      0.15      7524
           5       0.14      0.15      0.15      6896
           6       0.04      0.14      0.06      1776

    accuracy                           0.14     50147
   macro avg       0.14      0.14      0.14     50147
weighted avg       0.16      0.14      0.15     50147



## 5.0 Decision Tree

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report

# Hypertune parameters
params = {
    'max_depth': [7, 9, 11, 13, 15, 17, 19, 21, 23, 25],
    'min_samples_leaf': [10, 25, 50, 75, 100, 125, 150, 175, 200, 225, 250, 275],
    'max_leaf_nodes': [50, 75, 100, 125, 150],
    'random_state': [42]
}

# Randomized search w/ cross validation
dt = DecisionTreeClassifier()
dt_cv = RandomizedSearchCV(
    dt, 
    params, 
    cv=5, 
    n_jobs=-1, 
    n_iter=50,
    verbose=1, 
    scoring=['f1_weighted'],
    refit='f1_weighted'
)

# Fit model
dt_cv.fit(X_train.loc[:, features], y_train)
print(f"Best score: {dt_cv.best_score_}")
print(f"Best params: {dt_cv.best_params_}")

# Evaluate model
dt_cv.best_estimator_.fit(X_train.loc[:, features], y_train)
y_pred = dt_cv.best_estimator_.predict(X_test.loc[:, features])
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best score: 0.5285598392261159
Best params: {'random_state': 42, 'min_samples_leaf': 150, 'max_leaf_nodes': 125, 'max_depth': 19}
              precision    recall  f1-score   support

           0       0.85      0.96      0.90      7874
           1       0.67      0.73      0.70      8313
           2       0.62      0.52      0.56     11655
           3       0.32      0.24      0.27      6109
           4       0.34      0.50      0.41      7524
           5       0.39      0.38      0.39      6896
           6       0.24      0.02      0.04      1776

    accuracy                           0.55     50147
   macro avg       0.49      0.48      0.47     50147
weighted avg       0.54      0.55      0.54     50147



## 6.0 Save Model

In [15]:
import pickle
with open('/home/tylerengland/NFL/backend/models/play_duration/play_duration__00001.pkl', 'wb') as file:
    pickle.dump(dt_cv.best_estimator_, file)

In [16]:
import pickle
# Load model
with open('/home/tylerengland/NFL/backend/models/play_duration/play_duration__00001.pkl', 'rb') as f:
    model = pickle.load(f)
    
model