In [2]:
import torch
import torch.nn as nn
torch.__version__


'2.1.0'

In [3]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence



import sys
if ".." not in sys.path:
    sys.path.insert(0, "..")

from datasets import PlayByPlayDataset

dataset = PlayByPlayDataset("../data/nfl-big-data-bowl-2024/play_by_play_val.pkl")


{'offense_geometric':       35472.0                                                   \
      distance relative_speed relative_acceleration delta_angle   
 0    4.240814       6.255264              3.332859      153.53   
 1    4.471532       6.467044              4.004275      160.74   
 2    4.800021       6.663768              4.336356      166.06   
 3    5.204652       6.795110              4.853028      172.40   
 4    5.678424       6.713280              5.328899      179.56   
 5    6.205159       4.608615              1.648093      189.02   
 6    6.752725       5.714793              3.530717      202.29   
 7    7.312510       6.050045              3.180493      227.36   
 8    7.893960       6.724510              4.787880      285.12   
 9    8.498500       6.640879              2.548027      -32.53   
 10   9.101088       7.794722              3.852110      -15.58   
 11   9.695690       7.473211              2.321397       -4.59   
 12  10.301713       6.446417            

In [18]:
PAD_VALUE = -1.
GEOMETRIC_KEYS = ["offense_geometric", "defense_geometric"]
RAW_KEYS = ["offense_raw", "defense_raw"]
SPECIAL_RAW_KEYS = ["ball_carrier_raw", "tacklers_raw"]
EVENT_KEYS = ["event_timeseries"]
TIME_SERIES_KEYS = GEOMETRIC_KEYS + RAW_KEYS + EVENT_KEYS

TARGET_KEY = "yards_after_contact"
TREATMENT_KEY = "tackle_successful"
STATIC_KEYS = []  # future: play features and on-field player info

from collections import defaultdict

def create_batchdict(batch):
    batchdict = defaultdict(list)
    for item in batch:
        for k, v in item.items():
            if k in TIME_SERIES_KEYS + STATIC_KEYS:
                batchdict[k].append(torch.from_numpy(v.to_numpy()))
            elif k in [TARGET_KEY, TREATMENT_KEY]:
                batchdict[k].append(v)
    return batchdict


def collate_padded_play_data(batch):
    """
        Deprecated. Used for earlier models.
    """
    batchdict = create_batchdict(batch)
    X_padded = torch.cat([pad_sequence(batchdict[k], batch_first=True, padding_value=PAD_VALUE) for k in TIME_SERIES_KEYS], dim=2)
    X_padded_static = torch.empty((len(batch), 0))
    return {
        "time_series_features": X_padded,
        "features": X_padded_static,
        "target": torch.tensor(batchdict[TARGET_KEY], dtype=torch.float),
        "treatment": torch.tensor(batchdict[TREATMENT_KEY], dtype=torch.float),
    }

def collate_padded_play_data_geometric_only(batch):
    batchdict = create_batchdict(batch)
    print(batchdict["event_timeseries"])
    X_padded = torch.cat([pad_sequence(batchdict[k], batch_first=True, padding_value=PAD_VALUE) for k in GEOMETRIC_KEYS + EVENT_KEYS], dim=2)
    X_padded_static = torch.empty((len(batch), 0))
    return {
        "time_series_features": X_padded,
        "features": X_padded_static,
        "target": torch.tensor(batchdict[TARGET_KEY], dtype=torch.float),
        "treatment": torch.tensor(batchdict[TREATMENT_KEY], dtype=torch.float),
    }



dataloader = DataLoader(dataset, batch_size=1, shuffle=True, collate_fn=collate_padded_play_data_geometric_only)
batch = next(iter(dataloader))


[tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0

In [17]:
dataset[25]["event_timeseries"]


Unnamed: 0,first_contact,ball_snap,pass_outcome_caught,handoff,pass_arrived,out_of_bounds,run,man_in_motion,play_action,touchdown,fumble
0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,0
6,1,0,0,0,0,0,0,0,0,0,0


In [59]:
# 248 is the dimensionality of the time-series features

encoder_layer = nn.TransformerEncoderLayer(d_model=248, nhead=8, batch_first=True)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
X = batch["time_series_features"].float()
out = transformer_encoder(X, src_key_padding_mask=(X[..., 0] == PAD_VALUE))


In [60]:
out.size() # batch size, time, n_features
# in the future -- concat with batch_size, n_features' shaped things -- need to squish out the time dim


torch.Size([8, 39, 248])

In [61]:
39 * 248


9672