# Introduction

This model will serve as our first foray into time-series forecasting using LSTMs. We will be following [this tutorial](https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/).

The code will be broken into the following sections:

```{raw}
I. Data and Imports
II. Data Processing
    a. Cleaning data
    b. Separating data into drives (drive_id)
    c. Next-play feature
III. Model Creation
IV. Model Training
V. Model Evaluation
```

# I. Data and Imports

In [207]:
import numpy as np
import tensorflow as tf
import pandas as pd

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

from sklearn.model_selection import train_test_split

In [208]:
# Training hyperparameters
BATCH_SIZE = 100  # mini-batch size intended for training
EPOCHS = 10       # number of epochs passed to model.fit

In [209]:
# Load the raw play-by-play data.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk; the default chunked parsing is presumably what
# triggered the (mixed-dtype) warning this cell used to emit.
data = pd.read_csv("../data/NFL_Play_by_Play_2009-2018_(v5).csv", low_memory=False)

  data = pd.read_csv("../data/NFL_Play_by_Play_2009-2018_(v5).csv")


In [210]:
# List every column whose name contains "pos"
[name for name in data.columns if "pos" in name]

['posteam',
 'posteam_type',
 'posteam_timeouts_remaining',
 'posteam_score',
 'posteam_score_post',
 'defteam_score_post',
 'score_differential_post',
 'home_wp_post',
 'away_wp_post']

# II. Data Processing

## II.a Data Cleaning

In [211]:
# Keep only the rows that record an actual play (play_type is present)
data = data.dropna(subset=["play_type"])

In [212]:
# Discard sparse columns: a column survives only if it has at least 10000 non-missing values
data = data.dropna(axis="columns", thresh=10000)

In [213]:
# Restrict the frame to the columns used as model features (plus game_id and
# drive, which are needed to build the per-drive identifier).
#
# NOTE(review): "defteam_timeouts_remaining" and "defteam_score" are each listed
# twice, so the selection below carries those columns twice (see the repeated
# rows in the dtypes output). Removing the repeats would change the feature
# count that later cells rely on (35), so it has to be done together with them.
useful_columns = [
    # game / clock state
    "game_id",
    "yardline_100",
    "quarter_seconds_remaining",
    "half_seconds_remaining",
    "game_seconds_remaining",
    "quarter_end",
    "drive",
    "sp",
    "qtr",
    # down and distance
    "down",
    "goal_to_go",
    "ydstogo",
    "ydsnet",
    "yards_gained",
    # formation
    "shotgun",
    "no_huddle",
    # timeouts and score
    "home_timeouts_remaining",
    "defteam_timeouts_remaining",
    "defteam_score",
    "away_timeouts_remaining",
    "timeout",
    "defteam_timeouts_remaining",
    "total_home_score",
    "posteam_timeouts_remaining",
    "posteam_score",
    "total_away_score",
    "defteam_score",
    "score_differential",
    "defteam_score_post",
    "score_differential_post",
    "touchdown",
    # label
    "play_type",
]
data = data[useful_columns]

In [214]:
def classify_play_type(x):
    """Map a raw play_type label to a coarse integer class.

    Returns:
        0 for special teams ("kickoff", "punt", "field_goal", "extra_point"),
        1 for passes ("pass", "qb_spike"),
        2 for runs ("run", "qb_kneel"),
        3 for anything else (treated as "no play").
    """
    special_teams = ("kickoff", "punt", "field_goal", "extra_point")
    passes = ("pass", "qb_spike")
    runs = ("run", "qb_kneel")

    if x in special_teams:
        return 0
    if x in passes:
        return 1
    if x in runs:
        return 2
    return 3
    
# Replace each raw play_type label with its integer class code
data["play_type"] = data["play_type"].map(classify_play_type)

In [215]:
data.head()

Unnamed: 0,game_id,yardline_100,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,quarter_end,drive,sp,qtr,down,...,total_home_score,posteam_timeouts_remaining,posteam_score,total_away_score,defteam_score,score_differential,defteam_score_post,score_differential_post,touchdown,play_type
0,2009091000,30.0,900.0,1800.0,3600.0,0,1,0,1,,...,0,3.0,,0,,,0.0,0.0,0.0,0
1,2009091000,58.0,893.0,1793.0,3593.0,0,1,0,1,1.0,...,0,3.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1
2,2009091000,53.0,856.0,1756.0,3556.0,0,1,0,1,2.0,...,0,3.0,0.0,0,0.0,0.0,0.0,0.0,0.0,2
3,2009091000,56.0,815.0,1715.0,3515.0,0,1,0,1,3.0,...,0,3.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1
4,2009091000,56.0,807.0,1707.0,3507.0,0,1,0,1,4.0,...,0,3.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0


In [216]:
# Confirming data types are numeric
data.dtypes

game_id                         int64
yardline_100                  float64
quarter_seconds_remaining     float64
half_seconds_remaining        float64
game_seconds_remaining        float64
quarter_end                     int64
drive                           int64
sp                              int64
qtr                             int64
down                          float64
goal_to_go                    float64
ydstogo                         int64
ydsnet                          int64
yards_gained                  float64
shotgun                         int64
no_huddle                       int64
home_timeouts_remaining         int64
defteam_timeouts_remaining    float64
defteam_score                 float64
away_timeouts_remaining         int64
timeout                       float64
defteam_timeouts_remaining    float64
total_home_score                int64
posteam_timeouts_remaining    float64
posteam_score                 float64
total_away_score                int64
defteam_scor

In [217]:
# Checking missing values
data.isna().sum()

game_id                           0
yardline_100                    368
quarter_seconds_remaining         1
half_seconds_remaining           25
game_seconds_remaining           23
quarter_end                       0
drive                             0
sp                                0
qtr                               0
down                          55563
goal_to_go                        5
ydstogo                           0
ydsnet                            0
yards_gained                    213
shotgun                           0
no_huddle                         0
home_timeouts_remaining           0
defteam_timeouts_remaining        5
defteam_score                  2545
away_timeouts_remaining           0
timeout                           0
defteam_timeouts_remaining        5
total_home_score                  0
posteam_timeouts_remaining        5
posteam_score                  2545
total_away_score                  0
defteam_score                  2545
score_differential          

In [218]:
# A missing "down" most likely marks a play without a down (kickoff, extra point, etc.),
# so keep only the rows where it is present.
data = data[data["down"].notna()]

In [219]:
# Rechecking missing values
data.isna().sum()

game_id                         0
yardline_100                    0
quarter_seconds_remaining       0
half_seconds_remaining         19
game_seconds_remaining         20
quarter_end                     0
drive                           0
sp                              0
qtr                             0
down                            0
goal_to_go                      0
ydstogo                         0
ydsnet                          0
yards_gained                  213
shotgun                         0
no_huddle                       0
home_timeouts_remaining         0
defteam_timeouts_remaining      0
defteam_score                   0
away_timeouts_remaining         0
timeout                         0
defteam_timeouts_remaining      0
total_home_score                0
posteam_timeouts_remaining      0
posteam_score                   0
total_away_score                0
defteam_score                   0
score_differential              0
defteam_score_post              0
score_differen

In [220]:
# Only a few rows still have missing values, so drop every row containing one
data = data.dropna(axis="index", how="any")
data.shape

(380692, 32)

In [None]:
# Build a unique drive identifier: the game id followed by the drive number, as text.
# The two helper columns are kept here because a later cell drops them by name.
data["game_id_str"] = data["game_id"].astype(str)
data["drive_str"] = data["drive"].astype(str)

data["drive_id"] = data["game_id_str"] + data["drive_str"]

In [None]:
# Remove game_id and the two helper columns now that drive_id exists
data = data.drop(columns=["game_id", "game_id_str", "drive_str"])

In [None]:
# Move drive_id (currently the last column) to the front, keeping the rest in order.
# NOTE(review): the frame carries repeated column labels at this point, and a
# label-based selection returns every column matching each listed label; this
# appears to be why the column count grows from 32 to 36 after this cell.
col_order = ["drive_id", *data.columns[:-1]]
data = data[col_order]

In [224]:
data.head()

Unnamed: 0,drive_id,yardline_100,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,quarter_end,drive,sp,qtr,down,...,posteam_timeouts_remaining,posteam_score,total_away_score,defteam_score,defteam_score.1,score_differential,defteam_score_post,score_differential_post,touchdown,play_type
1,20090910001,58.0,893.0,1793.0,3593.0,0,1,0,1,1.0,...,3.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,20090910001,53.0,856.0,1756.0,3556.0,0,1,0,1,2.0,...,3.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,20090910001,56.0,815.0,1715.0,3515.0,0,1,0,1,3.0,...,3.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,20090910001,56.0,807.0,1707.0,3507.0,0,1,0,1,4.0,...,3.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,20090910002,98.0,796.0,1696.0,3496.0,0,2,0,1,1.0,...,3.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,2


In [None]:
# Collect the distinct drive ids and peek at the first few
ids = data["drive_id"].unique().tolist()
ids[:5]

['20090910001', '20090910002', '20090910003', '20090910004', '20090910005']

In [None]:
# Number of drives
len(ids)

58729

In [None]:
# Checking shape
data.shape

(380692, 36)

In [None]:
# Split the dataframe by drive id and store each drive as its own numpy array.
# Each drive array has shape (?, 35), where ? is the number of plays in the drive.
# drive_id is dropped from every array because it is non-numeric.
#
# A single groupby pass replaces the earlier per-id boolean mask, which rescanned
# the whole frame once per drive (that version was noted as taking ~23 minutes).
# sort=False keeps the drives in order of first appearance, i.e. the order of `ids`.

broken_data = [
    plays.drop(columns="drive_id").to_numpy()
    for _, plays in data.groupby("drive_id", sort=False)
]


# Buffering data for consistency

The longest drive was 34 plays, so each "drive" frame needs to be padded to shape (34, 35): 34 plays by 35 features (`drive_id` has already been dropped).

In [None]:
# Pad every drive to the length of the longest one and stack them into a single
# 3-D array. The result is written into a fresh array, so the per-drive arrays in
# broken_data are left untouched (aliasing the list and padding it in place would
# silently modify broken_data as well).

MAX_DRIVES = max(drive.shape[0] for drive in broken_data)   # Plays in the longest drive; in this data, MAX_DRIVES = 34
FEATURES = broken_data[0].shape[1]                          # Number of features per play: 35

# Start from all zeros with shape (# drives, MAX_DRIVES, FEATURES)
drive_data = np.zeros((len(broken_data), MAX_DRIVES, FEATURES))

# Copy each drive into the LAST rows of its slot, so the zero padding comes first
# and the real plays sit at the end of the sequence
for i, drive in enumerate(broken_data):
    rows = drive.shape[0]
    drive_data[i, MAX_DRIVES - rows:] = drive


In [None]:
# Checking that the shape is (# Drives, # Plays in each Drive, # Features) = (58729, MAX_DRIVES, FEATURES)
drive_data.shape

(58729, 34, 35)

In [None]:
# Examining data. Note the 0 padding and real data at the end.
drive_data[0]

array([[0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [5.300e+01, 8.560e+02, 1.756e+03, ..., 0.000e+00, 0.000e+00,
        2.000e+00],
       [5.600e+01, 8.150e+02, 1.715e+03, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [5.600e+01, 8.070e+02, 1.707e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00]])

In [None]:
# Build the model inputs and targets from the padded drives:
#   x -> every play of a drive except its last one
#   y -> the last play of the drive
# TODO: Explore what this would look like if we ignored the last play of the drive (i.e. punt, FG, TD):
#     x = drive_data[:, :-2]
#     y = drive_data[:, -2]

# Slice along the play axis; copy so x and y do not share memory with drive_data
x = drive_data[:, :-1].copy()
y = drive_data[:, -1].copy()

In [None]:
# Deprecated, but could be useful later.
# Requires that broken_data is a list of pd.DataFrames (one frame per drive).
# Pairs every play with the play that follows it within the same drive.

play_pairs = [
    np.array([drive.iloc[k].to_numpy(), drive.iloc[k + 1].to_numpy()])
    for drive in broken_data
    for k in range(len(drive) - 1)
]

np.array(play_pairs)

array([[['20090910001', 58.0, 893.0, ..., 0.0, 0.0, 1],
        ['20090910001', 53.0, 856.0, ..., 0.0, 0.0, 2]],

       [['20090910001', 53.0, 856.0, ..., 0.0, 0.0, 2],
        ['20090910001', 56.0, 815.0, ..., 0.0, 0.0, 1]],

       [['20090910001', 56.0, 815.0, ..., 0.0, 0.0, 1],
        ['20090910001', 56.0, 807.0, ..., 0.0, 0.0, 0]],

       ...,

       [['201812170020', 66.0, 64.0, ..., -5.0, 0.0, 1],
        ['201812170020', 66.0, 63.0, ..., -5.0, 0.0, 1]],

       [['201812170020', 66.0, 63.0, ..., -5.0, 0.0, 1],
        ['201812170020', 66.0, 58.0, ..., -5.0, 0.0, 1]],

       [['201812170020', 66.0, 58.0, ..., -5.0, 0.0, 1],
        ['201812170020', 61.0, 38.0, ..., -5.0, 0.0, 1]]], dtype=object)

In [161]:
# Convert the list of (current play, next play) pairs into one array
play_pairs = np.asarray(play_pairs)

In [162]:
play_pairs.shape

(321963, 2, 36)

In [None]:
# Input dimensions are read from the training tensor instead of being hard-coded,
# so the model always matches x, whose axes are (drives, plays per sample, features).
# With the data above, x is (58729, 33, 35): NUM_PLAYS = 33 and NUM_FEATURES = 35.
NUM_PLAYS, NUM_FEATURES = x.shape[1:]
hidden_size = 128

# Basic 2-layer LSTM followed by a linear layer that outputs one value per feature
# NOTE(review): recurrent_activation="tanh" overrides the Keras default ("sigmoid")
# for the LSTM gates - confirm this is intended.
model = Sequential([
    layers.Input((NUM_PLAYS, NUM_FEATURES)), 
    layers.LSTM(hidden_size, recurrent_activation="tanh", kernel_regularizer="l2", return_sequences=True),
    layers.LSTM(hidden_size, recurrent_activation="tanh", kernel_regularizer="l2"),
    layers.Dense(NUM_FEATURES)
])

# The targets are continuous feature vectors trained with an MSE loss, so a
# regression metric is tracked; classification metrics (accuracy, F1) are not
# meaningful for this output.
model.compile(optimizer='adam',
                loss="mean_squared_error",
                metrics=["mean_absolute_error"])

model.summary()

NameError: name 'Sequential' is not defined

In [None]:
# Fit the model. BATCH_SIZE is passed explicitly: it was defined with the other
# hyperparameters but never used, so Keras fell back to its default batch size (32).
history = model.fit(x=x, y=y, batch_size=BATCH_SIZE, epochs=EPOCHS)