In [3]:
import numpy as np
import pandas as pd
import os
import dask.dataframe as dd
import torch
from torch.autograd import Variable
from torch.nn.parameter import Parameter

from pandarallel import pandarallel
pandarallel.initialize(nb_workers=6)

from IPython.display import HTML

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Preprocess Data

In [3]:
# helper functions to get nearest offensive and defensive players in grouped df
def get_nearest_player(df):
    df = df[df.event.isin(['pass_arrived'])]
    if len(df) == 0:
        return np.nan
    
    ball_end = df[df.nflId == 0][['x', 'y']].head(1)
    assert len(ball_end) == 1, print(len(ball_end))
    
    players_end = df[(df.nflId != 0) & (df.team_pos == 'OFF')][['x', 'y']]
    min_dist = np.linalg.norm(players_end.values - ball_end.values, axis=1).min()
    
    return min_dist

def get_nearest_player_off(df):
    df = df[df.event.isin(['pass_arrived'])]
    if len(df) == 0:
        return np.nan
    
    ball_end = df[df.nflId == 0][['x', 'y']].head(1)
    assert len(ball_end) == 1, print(len(ball_end))
    
    players_end = df[(df.nflId != 0) & (df.team_pos == 'OFF')][['x', 'y']]
    min_dist = np.linalg.norm(players_end.values - ball_end.values, axis=1).min()
    
    return min_dist

def get_nearest_player_def(df):
    df = df[df.event.isin(['pass_arrived'])]
    if len(df) == 0:
        return np.nan
    
    ball_end = df[df.nflId == 0][['x', 'y']].head(1)
    assert len(ball_end) == 1, print(len(ball_end))
    
    players_end = df[(df.nflId != 0) & (df.team_pos == 'DEF')][['x', 'y']]
    min_dist = np.linalg.norm(players_end.values - ball_end.values, axis=1).min()
    
    return min_dist

# get bal
min_dist_off = all_plays.groupby(['playId', 'gameId']).apply(get_nearest_player_off).reset_index()
min_dist_def = all_plays.groupby(['playId', 'gameId']).apply(get_nearest_player_def).reset_index()

NameError: name 'all_plays' is not defined

## Plays Dataset

In [4]:
class PlaysDataset(torch.utils.data.Dataset):
    def __init__(self, data_dir):
        # load csvs
        tracking_df = pd.read_csv(os.path.join(data_dir, 'week1_norm.csv'))

        # get valid frames for tuning from tracking df (consider every pass, labels are 1 if there is a player close by)
        #tracking_df = tracking_df[tracking_df['event'].isin(['pass_forward', 'pass_arrived', 
        #    'pass_outcome_caught', 'pass_outcome_incomplete', 'pass_outcome_touchdown', 'pass_outcome_intercepted'])]
        pass_forward_plays = tracking_df[tracking_df['event'] == 'pass_forward'][['gameId', 'playId']].drop_duplicates()
        pass_attempted_plays = tracking_df[tracking_df['event'] == 'pass_arrived'][['gameId', 'playId']].drop_duplicates()
        tracking_df = pass_forward_plays.merge(pass_attempted_plays.merge(tracking_df, on=['gameId', 'playId']), on=['gameId', 'playId'])
        
        # calculate ball ending position
        ball_end = tracking_df[(tracking_df.nflId == 0) & (tracking_df.event == 'pass_arrived')][['gameId', 'playId', 'x', 'y']]
        ball_end = ball_end.rename(columns={'x': 'ball_x', 'y': 'ball_y'})

        # remove plays where ball is thrown out of bounds
        ball_end = ball_end[(ball_end.ball_x <= 119.5) & (ball_end.ball_x >= 0.5) & (ball_end.ball_y <= 53.5) & (ball_end.ball_y >= -0.5)]
        
        # merge tracking_df with ball_end
        tracking_df = tracking_df[tracking_df.nflId != 0].merge(ball_end, on=['gameId', 'playId'])
                
        # for each player, label whether they reached the ball (radius of 1.5 yds)
        self.player_reached = tracking_df[tracking_df.event == 'pass_arrived'][['gameId', 'playId', 'nflId', 'x', 'y', 'ball_x', 'ball_y']]
        self.player_reached['close_to_ball'] = np.less_equal(np.linalg.norm(np.stack([self.player_reached.x.values,
                    self.player_reached.y.values], axis=-1) - np.stack([self.player_reached.ball_x.values,
                    self.player_reached.ball_y.values], axis=-1), axis=1), 1.5).astype(int)
        
        # store tracking_df
        self.all_plays = tracking_df
        
        # turn play list into np array
        self.play_list = tracking_df[['gameId', 'playId']].drop_duplicates().values
        
        # max number of players per play
        self.max_num = 17
        
    def __len__(self):
        return len(self.play_list)
    
    def __getitem__(self, idx):
        gameId = self.play_list[idx, 0]
        playId = self.play_list[idx, 1]
        
        # load frame, sigma_label, and ball_end
        frame = self.all_plays[(self.all_plays.gameId == gameId) & (self.all_plays.playId == playId)]
        sigma_label = self.player_reached[(self.player_reached.gameId == gameId) & (self.player_reached.playId == playId)][['nflId', 'close_to_ball']]
        
        try:
            ball_end = self.player_reached[(self.player_reached.gameId == gameId) & (self.player_reached.playId == playId)][['ball_x', 'ball_y']].iloc[0].values
        except IndexError:
            print(self.player_reached[(self.player_reached.gameId == gameId) & (self.player_reached.playId == playId)])
            raise IndexError
        # clean up frame (remove QB, merge with sigma_label, ball_end, remove pass_arrived event)
        frame = frame.loc[frame.position != 'QB'].merge(sigma_label, on='nflId')
        frame = frame.replace('OFF', 1)
        frame = frame.replace('DEF', 0)
        try:
            frame['tof'] = pd.to_timedelta(pd.to_datetime(frame[frame.event == 'pass_arrived'].time.iloc[0]) - pd.to_datetime(frame[frame.event == 'pass_forward'].time.iloc[0])).total_seconds()
        except IndexError:
            print(frame[frame.event == 'pass_arrived'])
            print(frame[frame.event == 'pass_forward'])
            raise IndexError
        frame['ball_x'] = ball_end[0]
        frame['ball_y'] = ball_end[1]
        frame = frame[frame.event == 'pass_forward']

        # generate data, label, fill missing data
        data = torch.tensor(frame[['nflId', 'x', 'y', 'v_x', 'v_y', 'a_x', 'a_y', 'team_pos', 'ball_x', 'ball_y', 'tof']].values).float()
        label = torch.tensor(frame['close_to_ball'].values)
        if data.size(0) < self.max_num:
            data = torch.cat([data, torch.ones([self.max_num - data.size(0), data.size(1)])], dim=0)
            label = torch.cat([label, torch.zeros([self.max_num - label.size(0)])], dim=0)
        
        return data, label.long()

## Completion Probability Model

In [19]:
class CompProbModel(torch.nn.Module):
    def __init__(self, a_max=7, s_max=9, avg_ball_speed=20, tti_sigma=0.5, tti_lambda=1.0):
        super().__init__()
        
        # define parameters and whether or not to optimize
        self.a_max = Parameter(torch.tensor([a_max]), requires_grad=False).float()
        self.s_max = Parameter(torch.tensor([s_max]), requires_grad=False).float()
        self.avg_ball_speed = Parameter(torch.tensor([avg_ball_speed]), requires_grad=False).float()
        self.tti_sigma = Parameter(torch.tensor([tti_sigma]), requires_grad=True).float()
        self.tti_lambda = Parameter(torch.tensor([tti_lambda]), requires_grad=True).float()
        self.reax_t = self.s_max / self.a_max
        self.pi = torch.tensor([3.1416])

        
        # define field grid
        self.x = torch.linspace(0.5, 119.5, 120)
        self.y = torch.linspace(-0.5, 53.5, 55)
        self.y[0] = -0.2
        xx, yy = torch.meshgrid(self.x, self.y)
        self.field_locs = Parameter(torch.flatten(torch.stack((xx, yy), dim=-1), end_dim=-2), requires_grad=False)  # (F, 2)
        self.T = Parameter(torch.linspace(0.1, 4, 40), requires_grad=False) # (T,)
    
    def forward(self, frame):
        v_x_r = frame[:, :, 5] * self.reax_t + frame[:, :, 3]
        v_y_r = frame[:, :, 6] * self.reax_t + frame[:, :, 4]
        v_r_mag = torch.norm(torch.stack([v_x_r, v_y_r], dim=-1), dim=-1)
        v_r_theta = torch.arctan(v_y_r / v_x_r)
        # fill nan
        v_r_theta[v_r_theta != v_r_theta] = 0

        x_r = frame[:, :, 1] + frame[:, :, 3] * self.reax_t + 0.5 * frame[:, :, 5] * self.reax_t**2
        y_r = frame[:, :, 2] + frame[:, :, 4] * self.reax_t + 0.5 * frame[:, :, 6] * self.reax_t**2
        
        # get each player's team, location, and velocity
        #player_teams = frame[:, :, -1] # J,
        reaction_player_locs = torch.stack([x_r, y_r], dim=-1).int() # (J, 2)
        reaction_player_vels = torch.stack([v_x_r, v_y_r], dim=-1) #(J, 2)
        
        # calculate each player's distance from each field location
        int_d_vec = self.field_locs.unsqueeze(1).unsqueeze(0) - reaction_player_locs.unsqueeze(1) #F, J, 2
        int_d_mag = torch.norm(int_d_vec, dim=-1) # F, J
        
        # take dot product of velocity and direction
        int_s0 = torch.clip(torch.sum(int_d_vec * reaction_player_vels.unsqueeze(1), dim=-1) / int_d_mag, -1 * self.s_max.item(), self.s_max.item()) #F, J
        #int_s0 = torch.sum(int_d_vec * reaction_player_vels.unsqueeze(1), dim=-1) / int_d_mag
        
        # calculate time it takes for each player to reach each field position accounting for their current velocity and acceleration
        t_lt_smax = (self.s_max - int_s0) / self.a_max  #F, J,
        d_lt_smax = t_lt_smax * ((int_s0 + self.s_max) / 2) #F, J,
        d_at_smax = int_d_mag - d_lt_smax               #F, J,
        t_at_smax = d_at_smax / self.s_max              #F, J,
        t_tot = self.reax_t + t_lt_smax + t_at_smax     # F, J,

        # subtract the arrival time (t_tot) from time of flight of ball
        int_dT = self.T.view(1, 1, -1, 1) - t_tot.unsqueeze(2)         #F, T, J
        
        # calculate interception probability for each player, field loc, time of flight (logistic function)
        p_int = 1. / (1. + torch.exp(-1 * self.pi / (torch.sqrt(torch.tensor([3.0])) / self.tti_sigma * int_dT))) #F, T, J
        
        # get p_int for actual tof
        tof = torch.round(frame[:, 0, -1] * 10).long().view(-1, 1, 1, 1).repeat(1, p_int.size(1), 1, p_int.size(-1))
        p_int = torch.gather(p_int, 2, tof).squeeze() # F, J
        
        # index into ball position
        ball_end_x = frame[:, 0, -3].int()
        ball_end_y = frame[:, 0, -2].int()
        ball_field_ind = (ball_end_y * self.x.shape[0] + ball_end_x).long().view(-1, 1, 1).repeat(1, 1, p_int.size(-1))
        
        p_int = torch.gather(p_int, 1, ball_field_ind).squeeze()
        
        return p_int
        
        # TODO(adit98) change this
        #lambda_z = torch.where((traj_locs_z<3) & (traj_locs_z>0), 1, 0) #F, T, T
        # apply lambda
        # probs = probs * lambda_z.unsqueeze(-1) * lambda_dt
        # norm_factor = torch.maximum(1., probs.sum(dim=-1))  #F, T, T
        # probs_norm = (probs / norm_factor[..., None])  #F, T, T, J
        
        # check if this is valid

        # compute reach vecs (path of ball)
        #reach_vecs = ball_start - field_locs  # (F, 2)
        
        # invert direction signs
        #dx = -1 * reach_vecs[:, 0]
        #dy = -1 * reach_vecs[:, 1]

        # calculate ball velocity for each possible time of flight (0 to 4)
        #vx = dx.unsqueeze(-1) / T.unsqueeze(0)
        #vy = dy.unsqueeze(-1) / T.unsqueeze(0)
        #g = 10.72468 # gravitation constant (yards/s^2)
        #vz_0 = (T * g) / 2

        # calculate every (x, y, z) location that ball passes through
        # note that idx (i, j, k) into below arrays is invalid when j < k
        #traj_ts = torch.tile(T, (len(field_locs), len(T), 1)) #(F, T, T)
        #traj_locs_x_idx = torch.round(torch.clip((ball_start[0]+vx.unsqueeze(-1)*T), 0, len(x)-1)) # F, T, T
        #traj_locs_y_idx = torch.round(torch.clip((ball_start[1]+vy.unsqueeze(-1)*T), 0, len(y)-1)) # F, T, T
        #traj_locs_z = 2.0+vz_0.view(1, -1, 1)*traj_ts-0.5*g*traj_ts*traj_ts #F, T, T
        
        # TODO(adit98) fix this
        #path_idxs = torch.ravel_multi_index(torch.stack((traj_locs_y_idx, traj_locs_x_idx)).reshape(2, -1), xx.shape)  # (F*T*T,)
        #traj_t_idxs = torch.round(10*traj_ts - 1).flatten()  # (F, T, T)
        #probs = p_int[path_idxs, t] # F*T*T, J
        #probs = probs.reshape((*traj_locs_x_idx.shape, len(reaction_player_locs)))  # F, T, T, J
        
        # TODO(adit98) this is where to add lambda tuning
        #lambda_dt = 1
        #probs = probs * lambda_z.unsqueeze(-1) * lambda_dt
        #norm_factor = torch.maximum(1., probs.sum(dim=-1))  #F, T, T
        #probs_norm = (probs / norm_factor[..., None])  #F, T, T, J

        #total_probs = torch.sum(probs_norm, dim=-1)  # F, T, T
        #compl_total_probs = 1 - total_probs  # F, T, T
        #remaining_compl_probs = torch.cumprod(compl_total_probs, dim=-1)  # F, T, T
        #off_probs = torch.sum(probs_norm * player_teams, dim=-1)
        #def_probs = torch.sum(probs_norm * (1 - player_teams), dim=-1)

        # maximum 0 because if it goes negative the pass has been caught by then and theres no residual probability

        #shift_compl_cumsum = torch.roll(remaining_compl_probs, 1, dim=-1)  # F, T, T
        #shift_compl_cumsum[:, :, 0] = 1
        #total_completion_prob_dt = shift_compl_cumsum * total_probs  # F, T, T
        #total_completion_prob = torch.cumsum(total_completion_prob_dt, dim=-1)  # F, T, T

        #completion_prob_off_dt = shift_compl_cumsum * off_probs  # F, T, T
        #completion_prob_def_dt = shift_compl_cumsum * def_probs  # F, T, T
        #completion_prob_off = torch.cumsum(completion_prob_off_dt, dim=-1)  # F, T, T
        #completion_prob_def = torch.cumsum(completion_prob_def_dt, dim=-1)  # F, T, T
        
        # this einsum takes the diagonal values over the last two axes
        # where T = t. this takes care of the t > T issue.
        #throw_p_int_off = torch.einsum('ijj->ij', total_completion_prob_off)  # F, T
        #throw_p_int_def = torch.einsum('ijj->ij', total_completion_prob_def)  # F, T
        #throw_p_int = torch.einsum('ijj->ij', total_completion_prob)  # F, T
        
        # below gets cutoff for combined model
        #field_p_int_off = throw_p_int_off.mean(dim=1)  # F,
        #field_p_int_def = throw_p_int_def.mean(dim=1)  # F,
        #field_p_int = throw_p_int.mean(dim=1)  # F,

        #field_p_no_int = 1-field_p_int

        #field_df = pd.DataFrame({
        #    'ball_start_x': ball_start[0],
        #    'ball_start_y': ball_start[1], 
        #    'ball_end_x': field_locs[:,0],
        #    'ball_end_y': field_locs[:,1],
        #    'p_mass_1': (((field_p_int_off-field_p_int_def)+1.)/2.).round(3),
        #    'p_mass_2': field_p_no_int.round(3),
            # 'p_mass_players': p_int_norm,
        #})



## Initialize Dataset, Model and Run Training Loop

In [20]:
ds = PlaysDataset(data_dir = '../data/')
loader = torch.utils.data.DataLoader(ds, batch_size=16, num_workers=4, shuffle=True)
data, label = ds[8]

In [21]:
%time
from tqdm import tqdm_notebook as tqdm
model = CompProbModel()
loss_fn = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.000001)
for data, target in tqdm(loader):
    output = model(data)
    print(output, target)
    loss = loss_fn(output, target.float())
    print(loss)
    loss.backward()
    optimizer.step()

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 4.53 µs


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data, target in tqdm(loader):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=49.0), HTML(value='')))

Parameter containing:
tensor([0.5000], requires_grad=True)
tensor([[0.4704, 0.4792, 0.4787, 0.4799, 0.4762, 0.4796, 0.4741, 0.4662, 0.4770,
         0.4751, 0.4799, 0.4814, 0.4792, 0.4771, 0.4700, 0.4215, 0.4215],
        [0.4636, 0.4619, 0.4666, 0.4684, 0.4620, 0.4484, 0.4427, 0.4451, 0.4385,
         0.4657, 0.4649, 0.4560, 0.4768, 0.4768, 0.4768, 0.4768, 0.4768],
        [0.4405, 0.3727, 0.3727, 0.4437, 0.4330, 0.4456, 0.4495, 0.4582, 0.4226,
         0.4384, 0.4374, 0.4615, 0.4762, 0.4762, 0.4762, 0.4762, 0.4762],
        [0.4400, 0.4522, 0.4424, 0.4694, 0.4386, 0.4513, 0.4465, 0.4650, 0.4655,
         0.4482, 0.4336, 0.4684, 0.4644, 0.4644, 0.4644, 0.4644, 0.4644],
        [0.4832, 0.4759, 0.4808, 0.4823, 0.4832, 0.4754, 0.4764, 0.4787, 0.4815,
         0.4720, 0.4795, 0.4829, 0.4394, 0.4394, 0.4394, 0.4394, 0.4394],
        [0.4377, 0.3779, 0.4466, 0.4470, 0.4484, 0.4408, 0.4708, 0.4661, 0.4552,
         0.4339, 0.4537, 0.4065, 0.4145, 0.4767, 0.4767, 0.4767, 0.4767],
        [0.

RuntimeError: all elements of input should be between 0 and 1

In [None]:
model.tti_sigma
list(model.parameters())

### Sandbox

In [None]:
# load files
data_dir = '../data/'
tracking_df = pd.read_csv('../data/week1_norm.csv')
plays_df = pd.read_csv('../data/plays.csv')

#print(tracking_df.columns)

# get valid frames for tuning from tracking df
tracking_df = tracking_df[tracking_df['event'].isin(['pass_forward', 'pass_arrived', 
    'pass_outcome_caught', 'pass_outcome_incomplete', 'pass_outcome_touchdown', 'pass_outcome_intercepted'])]
tracking_df['valid_frame'] = tracking_df['event'].str.contains('pass_forward')
tracking_df = tracking_df.groupby(['playId', 'gameId']).filter(lambda l: l['valid_frame'].any()).reset_index()

# merge tracking df and plays df
all_plays = plays_df.merge(tracking_df, how='left', on=['playId', 'gameId'])

In [87]:
play = tracking_df[(tracking_df.playId == 81) & (tracking_df.gameId == 2018090902)]
print(pd.to_timedelta(pd.to_datetime(play[play.event == 'pass_arrived'].time.iloc[0]) - pd.to_datetime(play[play.event == 'pass_forward'].time.iloc[0])).total_seconds())

2018-09-09 17:04:03+00:00
2018-09-09 17:04:04.500000+00:00
1.5


In [61]:
tracking_df[tracking_df.position != 'QB'][['gameId', 'playId', 'nflId']].drop_duplicates().groupby(['gameId', 'playId']).nflId.count().max()

17

In [58]:
min_dist_off = min_dist_off.rename(columns={0:'ball_dist'})
complete_passes = all_plays.loc[all_plays.event.isin(['pass_outcome_caught', 'pass_outcome_touchdown'])][['gameId', 'playId']]
incomplete_passes = all_plays.loc[all_plays.event.isin(['pass_outcome_incomplete'])][['gameId', 'playId']]

print(complete_passes.merge(min_dist_off).drop_duplicates().ball_dist.median())
print(incomplete_passes.merge(min_dist_off).drop_duplicates().ball_dist.mean())

min_dist_def = min_dist_def.rename(columns={0:'ball_dist'})
complete_passes = all_plays.loc[all_plays.event.isin(['pass_outcome_caught', 'pass_outcome_touchdown'])][['gameId', 'playId']]
incomplete_passes = all_plays.loc[all_plays.event.isin(['pass_outcome_incomplete'])][['gameId', 'playId']]

print(complete_passes.merge(min_dist_def).drop_duplicates().ball_dist.mean())
print(incomplete_passes.merge(min_dist_def).drop_duplicates().ball_dist.mean())

1.085102305866526
1.6439466513218899
4.790180627434475
2.435306899869004


In [None]:
all_plays.loc[(all_plays.gameId == 2018090600) & (all_plays.playId == 146)][['displayName', 'x', 'y']]
import matplotlib.pyplot as plt
%matplotlib inline

from visualize2 import AnimatePlay
np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
play_df = tracking_df[(tracking_df.gameId == 2018091000) & (tracking_df.playId == 3016)]

animated_play = AnimatePlay(play_df, 20)#play_df[play_df.frameId <= 46], 20)
HTML(animated_play.ani.to_jshtml())

In [None]:
print(get_nearest_player(play_df))