# Imports and Data

In [93]:
import numpy as np
import tensorflow as tf
import pandas as pd

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

from sklearn.model_selection import train_test_split

In [94]:
data_path = "../data/cleaned_data.csv"

data = pd.read_csv(data_path, index_col=0)

In [95]:
data

Unnamed: 0,yardline_100,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,quarter_end,drive,sp,qtr,down,goal_to_go,...,return_yards,penalty_yards,replay_or_challenge,defensive_two_point_attempt,defensive_two_point_conv,defensive_extra_point_attempt,defensive_extra_point_conv,PC1,PC2,PC3
0,58.0,893.0,1793.0,3593.0,0,1,0,1,1.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,2085.346919,-393.322974,-153.988402
1,53.0,856.0,1756.0,3556.0,0,1,0,1,2.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,2035.623481,-360.113164,-130.900189
2,56.0,815.0,1715.0,3515.0,0,1,0,1,3.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,1980.530480,-323.309756,-105.318776
3,98.0,796.0,1696.0,3496.0,0,2,0,1,1.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,1955.081945,-306.474579,-93.646573
4,98.0,760.0,1660.0,3460.0,0,2,0,1,2.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,1906.702242,-274.156278,-71.177440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319364,71.0,82.0,82.0,82.0,0,20,0,4,2.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,-1839.696207,266.475571,112.730842
319365,71.0,77.0,77.0,77.0,0,20,0,4,3.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,-1846.411351,270.954912,115.844913
319366,66.0,63.0,63.0,63.0,0,20,0,4,2.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,-1865.234946,283.544464,124.600636
319367,66.0,58.0,58.0,58.0,0,20,0,4,3.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,-1871.948992,288.023649,127.713734


In [110]:
BATCH_SIZE = 200
EPOCHS = 5

# Creating a subset of the data to train on (Dimensionality Reduction)

Using domain knowledge, we reduced the number of columns to train on in order to experiment with the columns we hypothesized would be the most useful in determining play type (run/pass).

In [97]:
useful_columns = ['yardline_100', 'quarter_seconds_remaining', 'half_seconds_remaining', 'game_seconds_remaining', 'quarter_end', 'drive', 'sp', 'qtr', 'down', 'goal_to_go', 'ydstogo', 'ydsnet', 'yards_gained', 'shotgun', 'no_huddle', 'home_timeouts_remaining', 'away_timeouts_remaining', 'timeout', 'defteam_timeouts_remaining', 'total_home_score', 'total_away_score', 'defteam_score', 'score_differential', 'defteam_score_post', 'score_differential_post', 'touchdown', 'PC1', 'PC2', 'PC3', "play_type"]

In [98]:
data = data[useful_columns]

In [99]:
train_data, test_data = train, test = train_test_split(data, test_size=0.2)

In [100]:
train_y, train_x = train_data["play_type"], train_data.drop("play_type", axis=1)

In [101]:
test_y, test_x = test_data["play_type"], test_data.drop("play_type", axis=1)

# Creating a `tf.Dataset` from `pd.DataFrame`

In [102]:
# train_dataset = tf.data.Dataset.from_tensor_slices((train_x, train_y))
# test_dataset = tf.data.Dataset.from_tensor_slices((test_x, test_y))

In [103]:
# train_batches = train_dataset.shuffle(1000).batch(BATCH_SIZE).prefetch(BATCH_SIZE)
# test_batches = test_dataset.shuffle(1000).batch(BATCH_SIZE).prefetch(BATCH_SIZE)

# Model

In [130]:
normalizer = layers.Normalization(axis=1)
normalizer.adapt(np.array(train_x))

model = Sequential([
    layers.Input((29,)),
    normalizer,
    layers.Dense(128, activation="relu", kernel_regularizer="l2", kernel_initializer="he_normal"),
    # layers.Dropout(0.1),
    layers.Dense(128, activation="relu", kernel_regularizer="l2", kernel_initializer="he_normal"),
    # layers.Dropout(0.1),
    layers.Dense(128, activation="relu", kernel_regularizer="l2", kernel_initializer="he_normal"),
    # layers.Dropout(0.1),
    layers.Dense(32, activation="relu", kernel_regularizer="l2", kernel_initializer="he_normal"),
    # layers.Dropout(0.1),
    layers.Dense(1, activation="sigmoid", kernel_regularizer="l2", kernel_initializer="he_normal")
])

model.compile(optimizer='adam',
                loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                metrics=['accuracy', "f1_score"])

model.summary()

In [131]:
history = model.fit(x=train_x, y = train_y, validation_data=(test_x, test_y), epochs=EPOCHS)

Epoch 1/5
[1m7985/7985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1ms/step - accuracy: 0.7171 - f1_score: 0.7379 - loss: 1.3677 - val_accuracy: 0.7223 - val_f1_score: 0.7377 - val_loss: 0.6146
Epoch 2/5
[1m7985/7985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step - accuracy: 0.7239 - f1_score: 0.7372 - loss: 0.6154 - val_accuracy: 0.7260 - val_f1_score: 0.7377 - val_loss: 0.6141
Epoch 3/5
[1m7985/7985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step - accuracy: 0.7271 - f1_score: 0.7391 - loss: 0.6128 - val_accuracy: 0.7240 - val_f1_score: 0.7377 - val_loss: 0.6173
Epoch 4/5
[1m7985/7985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1ms/step - accuracy: 0.7273 - f1_score: 0.7380 - loss: 0.6124 - val_accuracy: 0.7272 - val_f1_score: 0.7377 - val_loss: 0.6149
Epoch 5/5
[1m7985/7985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 995us/step - accuracy: 0.7277 - f1_score: 0.7382 - loss: 0.6120 - val_accuracy: 0.7278 - val_f1_scor

In [134]:
data["play_type"].value_counts() # Pass: 1; Run: 0

play_type
1    186677
0    132692
Name: count, dtype: int64

In [133]:
total = 186677 + 132692

pass_perc = 186677/total
run_perc = 132692/total

print(pass_perc, run_perc)

0.5845182218687475 0.4154817781312526
