# EDA - Kickoff yards

## 1.0 Import Libraries

In [18]:
import nfl_data_py as nfl
import pandas as pd

## 2.0 Play Data

### 2.1 Load Data

In [19]:
# Seasons to pull play-by-play data for.
years = list(range(2020, 2023))

# Fetch play-by-play rows for the selected seasons with float downcasting on,
# caching off, and no alternate cache path.
pbp = nfl.import_pbp_data(
    years,
    downcast=True,
    cache=False,
    alt_path=None,
)

2020 done.
2021 done.
2022 done.
Downcasting floats.


### 2.2 Data size

In [20]:
# Report the (rows, columns) shape of the loaded play-by-play frame.
shape_msg = f"Shape: {pbp.shape}"
print(shape_msg)

Shape: (149373, 384)


### 2.3 Columns

In [21]:
# Print pandas' summary of the play-by-play frame (index range, column count,
# dtype breakdown, and memory usage).
pbp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149373 entries, 0 to 149372
Columns: 384 entries, play_id to n_defense
dtypes: float64(204), int32(6), int64(1), object(173)
memory usage: 434.2+ MB


In [22]:
# Print every column name returned by nfl.see_pbp_cols(), one per line.
for column_name in nfl.see_pbp_cols():
    print(column_name)

play_id
game_id
old_game_id
home_team
away_team
season_type
week
posteam
posteam_type
defteam
side_of_field
yardline_100
game_date
quarter_seconds_remaining
half_seconds_remaining
game_seconds_remaining
game_half
quarter_end
drive
sp
qtr
down
goal_to_go
time
yrdln
ydstogo
ydsnet
desc
play_type
yards_gained
shotgun
no_huddle
qb_dropback
qb_kneel
qb_spike
qb_scramble
pass_length
pass_location
air_yards
yards_after_catch
run_location
run_gap
field_goal_result
kick_distance
extra_point_result
two_point_conv_result
home_timeouts_remaining
away_timeouts_remaining
timeout
timeout_team
td_team
td_player_name
td_player_id
posteam_timeouts_remaining
defteam_timeouts_remaining
total_home_score
total_away_score
posteam_score
defteam_score
score_differential
posteam_score_post
defteam_score_post
score_differential_post
no_score_prob
opp_fg_prob
opp_safety_prob
opp_td_prob
fg_prob
safety_prob
td_prob
extra_point_prob
two_point_conversion_prob
ep
epa
total_home_epa
total_away_epa
total_home_rush_epa


### 2.4 Columns of Importance
Columns relevant to predicting kickoff distance (`kick_distance`), the target variable used in this notebook.

In [33]:
import numpy as np

# Identifier and matchup columns describing which game/play a row belongs to.
game_info_cols = [
    'play_id',
    'game_id',
    'game_date',
    'home_team',
    'away_team',
    'season_type',
    'week',
    'season',
    'posteam',
    'defteam',
]

# Columns describing the state of the game when the play happens.
game_state_cols = [
    'game_seconds_remaining',
    'qtr',
    'score_differential',
]

# Columns describing the kick itself and its outcome.
kick_cols = [
    'play_type',
    'kickoff_attempt',
    'kickoff_inside_twenty',
    'kickoff_out_of_bounds',
    'kickoff_in_endzone',
    'kickoff_downed',
    'kickoff_fair_catch',
    'touchback',
    'return_yards',
]

# Column the models in this notebook try to predict.
target = 'kick_distance'

# Every column of interest, with the target last.
cols = [*game_info_cols, *game_state_cols, *kick_cols, target]

## 3.0 Split Data

In [60]:
# Rows that are kickoff attempts; reused by both fill steps below.
is_kickoff = pbp['kickoff_attempt'] == 1

# Fill kick distance with 75 if the kickoff was a touchback
pbp['kick_distance'] = np.where(
    is_kickoff & (pbp['touchback'] == 1),
    75,
    pbp['kick_distance'],
)

# Fill kick distance with the difference of the next play's yardline_100 and the
# current play's return_yards if kick_distance is still null.
# The "next play" is looked up within the same game_id so that the last row of one
# game never borrows the yardline from the first row of the following game.
# NOTE(review): confirm this difference matches the intended definition of kick
# distance; the formula is kept exactly as originally written.
next_play_yardline = pbp.groupby('game_id')['yardline_100'].shift(-1)
pbp['kick_distance'] = np.where(
    is_kickoff & pbp['kick_distance'].isnull(),
    next_play_yardline - pbp['return_yards'],
    pbp['kick_distance'],
)

# Split data by year: seasons before 2022 are training data, 2022 onward is test data
train_df = pbp.loc[pbp['season'] < 2022, :].copy()
test_df = pbp.loc[pbp['season'] >= 2022, :].copy()

# Drop rows that still have no target value
train_df = train_df.dropna(subset=[target])
test_df = test_df.dropna(subset=[target])

# Bin target column; labels=False yields the integer index of each bin
bins = [-101, 30, 35, 40, 45, 50, 55, 60, 65, 100]
for split_df in (train_df, test_df):
    split_df[f"{target}_bin"] = pd.cut(split_df[target], bins=bins, labels=False)

# Split data into features / labels, keeping kickoff attempts only
cols = game_info_cols + game_state_cols + kick_cols
train_is_kickoff = train_df['kickoff_attempt'] == 1
test_is_kickoff = test_df['kickoff_attempt'] == 1

# .copy() so the game_date assignment below writes to an independent frame instead
# of a slice of train_df / test_df (avoids SettingWithCopyWarning).
X_train = train_df.loc[train_is_kickoff, cols].copy()
X_test = test_df.loc[test_is_kickoff, cols].copy()
y_train = train_df.loc[train_is_kickoff, f"{target}_bin"]
y_test = test_df.loc[test_is_kickoff, f"{target}_bin"]

# Convert game_date to datetime
X_train['game_date'] = pd.to_datetime(X_train['game_date'])
X_test['game_date'] = pd.to_datetime(X_test['game_date'])

print(f"Train size: {len(X_train)}")
print(f"Test size: {len(X_test)}")

Train size: 5796
Test size: 2838


## 4.0 Baseline Model

In [61]:
import random
from sklearn.metrics import classification_report

# Baseline: for every test kickoff, guess one of the observed bins uniformly at random.
# A dedicated, seeded generator makes the report reproducible across kernel restarts
# without touching the global `random` state.
BASELINE_SEED = 42
baseline_rng = random.Random(BASELINE_SEED)
predictions = baseline_rng.choices(y_test.unique(), k=len(y_test))

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.04      0.09      0.05       132
           1       0.00      0.00      0.00        20
           2       0.00      0.04      0.01        25
           3       0.01      0.04      0.01        49
           4       0.03      0.13      0.05        76
           5       0.04      0.12      0.06       109
           6       0.07      0.08      0.07       240
           7       0.12      0.10      0.11       413
           8       0.61      0.11      0.18      1774

    accuracy                           0.10      2838
   macro avg       0.10      0.08      0.06      2838
weighted avg       0.41      0.10      0.14      2838



## 5.0 Decision Tree

In [62]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report

# Hyperparameter search space for the decision tree
params = {
    'max_depth': [7, 9, 11, 13, 15, 17, 19, 21, 23, 25],
    'min_samples_leaf': [10, 25, 50, 75, 100, 125, 150, 175, 200, 225, 250, 275],
    'max_leaf_nodes': [50, 75, 100, 125, 150],
    'random_state': [42]
}

# Randomized search w/ cross validation.
# random_state fixes which 50 candidates are sampled from the grid, so the best
# score / best params are reproducible from run to run.
dt = DecisionTreeClassifier()
dt_cv = RandomizedSearchCV(
    dt,
    params,
    cv=5,
    n_jobs=-1,
    n_iter=50,
    verbose=1,
    scoring=['f1_weighted'],
    refit='f1_weighted',
    random_state=42,
)

# Fit model using only the game-state features
dt_cv.fit(X_train.loc[:, game_state_cols], y_train)
print(f"Best score: {dt_cv.best_score_}")
print(f"Best params: {dt_cv.best_params_}")

# Evaluate model.
# refit='f1_weighted' already retrained best_estimator_ on the full training set,
# so no additional fit call is needed before predicting.
y_pred = dt_cv.best_estimator_.predict(X_test.loc[:, game_state_cols])

# zero_division=0 reports 0.0 for classes that are never predicted, without
# emitting the undefined-metric warnings.
print(classification_report(y_test, y_pred, zero_division=0))

Fitting 5 folds for each of 50 candidates, totalling 250 fits


Best score: 0.5068184706283152
Best params: {'random_state': 42, 'min_samples_leaf': 10, 'max_leaf_nodes': 125, 'max_depth': 13}
              precision    recall  f1-score   support

           0       0.84      0.36      0.51       132
           1       0.00      0.00      0.00        20
           2       0.00      0.00      0.00        25
           3       0.00      0.00      0.00        49
           4       0.00      0.00      0.00        76
           5       0.00      0.00      0.00       109
           6       0.10      0.00      0.01       240
           7       0.12      0.03      0.04       413
           8       0.64      0.96      0.77      1774

    accuracy                           0.62      2838
   macro avg       0.19      0.15      0.15      2838
weighted avg       0.46      0.62      0.51      2838



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 6.0 Save Model

In [63]:
import pickle

# Serialize the best estimator found by the randomized search to disk.
model_path = '/home/tylerengland/NFL/backend/models/kickoff_distance/kickoff_distance__00001.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(dt_cv.best_estimator_, model_file)

In [64]:
import pickle

# Load model back from the pickle file.
# Unpickling executes code embedded in the file, so only load files you trust.
saved_model_path = '/home/tylerengland/NFL/backend/models/kickoff_distance/kickoff_distance__00001.pkl'
with open(saved_model_path, 'rb') as model_file:
    model = pickle.load(model_file)

# Show the loaded estimator as the cell output.
model