In [7]:
import numpy as np
import pandas as pd
import os
import dask.dataframe as dd
import torch
from torch.autograd import Variable

from pandarallel import pandarallel
pandarallel.initialize(nb_workers=6)

from IPython.display import HTML

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Preprocess Data

In [10]:
# helper functions to get nearest offensive and defensive players in grouped df
def get_nearest_player(df):
    df = df[df.event.isin(['pass_arrived'])]
    if len(df) == 0:
        return np.nan
    
    ball_end = df[df.nflId == 0][['x', 'y']].head(1)
    assert len(ball_end) == 1, print(len(ball_end))
    
    players_end = df[(df.nflId != 0) & (df.team_pos == 'OFF')][['x', 'y']]
    min_dist = np.linalg.norm(players_end.values - ball_end.values, axis=1).min()
    
    return min_dist

def get_nearest_player_off(df):
    df = df[df.event.isin(['pass_arrived'])]
    if len(df) == 0:
        return np.nan
    
    ball_end = df[df.nflId == 0][['x', 'y']].head(1)
    assert len(ball_end) == 1, print(len(ball_end))
    
    players_end = df[(df.nflId != 0) & (df.team_pos == 'OFF')][['x', 'y']]
    min_dist = np.linalg.norm(players_end.values - ball_end.values, axis=1).min()
    
    return min_dist

def get_nearest_player_def(df):
    df = df[df.event.isin(['pass_arrived'])]
    if len(df) == 0:
        return np.nan
    
    ball_end = df[df.nflId == 0][['x', 'y']].head(1)
    assert len(ball_end) == 1, print(len(ball_end))
    
    players_end = df[(df.nflId != 0) & (df.team_pos == 'DEF')][['x', 'y']]
    min_dist = np.linalg.norm(players_end.values - ball_end.values, axis=1).min()
    
    return min_dist

# get bal
min_dist_off = all_plays.groupby(['playId', 'gameId']).apply(get_nearest_player_off).reset_index()
min_dist_def = all_plays.groupby(['playId', 'gameId']).apply(get_nearest_player_def).reset_index()

## Plays Dataset

In [89]:
class PlaysDataset(torch.utils.data.Dataset):
    def __init__(self, data_dir):
        # load csvs
        tracking_df = pd.read_csv(os.path.join(data_dir, 'week1_norm.csv'))

        # get valid frames for tuning from tracking df
        #tracking_df = tracking_df[tracking_df['event'].isin(['pass_forward', 'pass_arrived', 
        #    'pass_outcome_caught', 'pass_outcome_incomplete', 'pass_outcome_touchdown', 'pass_outcome_intercepted'])]
        self.play_list = tracking_df[tracking_df['event'] == 'pass_arrived'][['gameId', 'playId']].drop_duplicates()
        tracking_df = self.play_list.merge(tracking_df, on=['gameId', 'playId'])
        
        # calculate ball ending position
        ball_end = tracking_df[(tracking_df.nflId == 0) & (tracking_df.event == 'pass_arrived')][['gameId', 'playId', 'x', 'y']]
        ball_end = ball_end.rename(columns={'x': 'ball_x', 'y': 'ball_y'})
        
        # merge tracking_df with ball_end
        tracking_df = tracking_df[tracking_df.nflId != 0].merge(ball_end, on=['gameId', 'playId'])
                
        # for each player, label whether they reached the ball (radius of 1.5 yds)
        self.player_reached = tracking_df[tracking_df.event == 'pass_arrived'][['gameId', 'playId', 'nflId', 'x', 'y', 'ball_x', 'ball_y']]
        self.player_reached['close_to_ball'] = np.less_equal(np.linalg.norm(np.stack([self.player_reached.x.values,
                    self.player_reached.y.values], axis=-1) - np.stack([self.player_reached.ball_x.values,
                    self.player_reached.ball_y.values], axis=-1), axis=1), 1.5).astype(int)
        
        # only keep frame when ball is passed
        self.all_plays = tracking_df[tracking_df.event == 'pass_forward']
        
        # turn play list into np array
        self.play_list = self.play_list.values
        
        # max number of players per play
        self.max_num = 17
        
    def __len__(self):
        return len(self.play_list)
    
    def __getitem__(self, idx):
        gameId, playId = self.play_list[idx]
        frame = self.all_plays[(self.all_plays.gameId == gameId) & (self.all_plays.playId == playId)]
        sigma_label = self.player_reached[(self.all_plays.gameId == gameId) & (self.all_plays.playId == playId)][['nflId', 'close_to_ball']]

        # calculate important metrics in frame
        ball_end = torch.tensor(frame[(frame.nflId == 0) & (frame.event == 'pass_arrived')][['x', 'y']].head(1))

        ball_end = torch.tensor(frame.loc[frame.position == 'QB'][['x', 
                'y']].iloc[0].round().values)
        
        # clean up frame
        frame = frame.loc[frame.position != 'QB'].merge(sigma_label, on='nflId')
        frame = frame.replace('OFF', 1)
        frame = frame.replace('DEF', 0)
        frame['tof'] = pd.to_timedelta(pd.to_datetime(frame[frame.event == 'pass_arrived'].time.iloc[0]) - pd.to_datetime(frame[frame.event == 'pass_forward'].time.iloc[0])).total_seconds()
        
        # generate data, label, fill missing data
        data = torch.tensor(frame[['nflId', 'x', 'y', 'v_x', 'v_y', 'a_x', 'a_y', 'team_pos', 'ball_x', 'ball_y', 'tof']].values).float()
        label = torch.tensor(frame['close_to_ball'].values)
        if data.size(0) < 17:
            data = torch.vstack([data, torch.ones[17 - data.size(0), data.size(1)]])
            label = torch.vstack([label, torch.zeros[17 - label.size(0)]])
        
        return data, label.long()

## Completion Probability Model

In [None]:
class CompProbModel(torch.nn.Module):
    def __init__(self, a_max=7, s_max=9, avg_ball_speed=20, tti_sigma=0.5, tti_lambda=1.0):
        # define parameters and whether or not to optimize
        self.a_max = Variable(torch.tensor([a_max]), requires_grad=False)
        self.s_max = Variable(torch.tensor([s_max]), requires_grad=False)
        self.avg_ball_speed = Variable(torch.tensor([avg_ball_speed]), requires_grad=False)
        self.tti_sigma = Variable(torch.tensor([tti_sigma.astype(torch.float32)]), requires_grad=True)
        self.tti_lambda = Variable(torch.tensor([tti_lambda.astype(torch.float32)]), requires_grad=True)
        self.reax_t = self.s_max / self.a_max
        
        # define field grid
        x = torch.linspace(0.5, 119.5, 120)
        y = torch.linspace(-0.5, 53.5, 55)
        y[0] = -0.2
        xx, yy = torch.meshgrid(x, y)
        self.field_locs = torch.flatten(torch.stack((xx, yy), dim=-1), end_dim=-2)  # (F, 2)
        self.T = torch.linspace(0.1, 4, 40) # (T,)
    
    def forward(self, frame):
        v_x_r = frame[:, 5] * self.reax_t + frame[:, 3]
        v_y_r = frame.[:, 6] * self.reax_t + frame[:, 4]
        v_r_mag = torch.norm(torch.tensor([v_x_r, v_y_r]), dim=0)
        v_r_theta = torch.arctan(v_y_r / v_x_r).nan_to_num()

        x_r = frame[:, 1] + frame[:, 3] * self.reax_t + 0.5 * frame[:, 5] * self.reax_t**2
        y_r = frame[:, 2] + frame[:, 4] * self.reax_t + 0.5 * frame[:, 6] * self.reax_t**2
        
        # get each player's team, location, and velocity
        player_teams = frame[:, -1] # J,
        reaction_player_locs = torch.stack([x_r, y_r]).int() # (J, 2)
        reaction_player_vels = torch.stack([v_x_r, v_y_r], dim=-1) #(J,2)
        
        # calculate each player's distance from each field location
        int_d_vec = self.field_locs.unsqueeze(1) - reaction_player_locs #F, J, 2
        int_d_mag = torch.norm(int_d_vec, dim=2) # F, J
        
        # take dot product of velocity and direction
        int_s0 = torch.clip(torch.sum(int_d_vec * reaction_player_vels, dim=2) / int_d_mag, -1 * self.s_max, self.s_max) #F, J

        # calculate time it takes for each player to reach each field position accounting for their current velocity and acceleration
        t_lt_smax = (self.s_max - int_s0) / self.a_max  #F, J,
        d_lt_smax = t_lt_smax * ((int_s0 + self.s_max) / 2) #F, J,
        d_at_smax = int_d_mag - d_lt_smax               #F, J,
        t_at_smax = d_at_smax / self.s_max              #F, J,
        t_tot = t_lt_smax + t_at_smax                     #F, J,

        # subtract the arrival time (t_tot) from time of flight of ball
        int_dT = self.T.view(1, -1, 1) - t_tot.unsqueeze(1)         #F, T, J
        
        # calculate interception probability for each player, field loc, time of flight (logistic function)
        p_int = 1 / (1. + torch.exp(-1 * torch.pi / torch.sqrt(3.0) / self.tti_sigma * int_dT)) #F, T, J
        
        # get p_int for actual tof
        tof = torch.round(frame[:, -1] * 10)
        p_int = p_int[:, tof, :] # F, J
        
        # index into ball position
        ball_end_x = data[:, -3]
        ball_end_y = data[:, -2]
        ball_field_ind = self.field_locs[:, 0] == 

        p_int = p_int[, :]
        
        return p_int
        
        # TODO(adit98) change this
        #lambda_z = torch.where((traj_locs_z<3) & (traj_locs_z>0), 1, 0) #F, T, T
        # apply lambda
        # probs = probs * lambda_z.unsqueeze(-1) * lambda_dt
        # norm_factor = torch.maximum(1., probs.sum(dim=-1))  #F, T, T
        # probs_norm = (probs / norm_factor[..., None])  #F, T, T, J
        
        # check if this is valid

        # compute reach vecs (path of ball)
        #reach_vecs = ball_start - field_locs  # (F, 2)
        
        # invert direction signs
        #dx = -1 * reach_vecs[:, 0]
        #dy = -1 * reach_vecs[:, 1]

        # calculate ball velocity for each possible time of flight (0 to 4)
        #vx = dx.unsqueeze(-1) / T.unsqueeze(0)
        #vy = dy.unsqueeze(-1) / T.unsqueeze(0)
        #g = 10.72468 # gravitation constant (yards/s^2)
        #vz_0 = (T * g) / 2

        # calculate every (x, y, z) location that ball passes through
        # note that idx (i, j, k) into below arrays is invalid when j < k
        #traj_ts = torch.tile(T, (len(field_locs), len(T), 1)) #(F, T, T)
        #traj_locs_x_idx = torch.round(torch.clip((ball_start[0]+vx.unsqueeze(-1)*T), 0, len(x)-1)) # F, T, T
        #traj_locs_y_idx = torch.round(torch.clip((ball_start[1]+vy.unsqueeze(-1)*T), 0, len(y)-1)) # F, T, T
        #traj_locs_z = 2.0+vz_0.view(1, -1, 1)*traj_ts-0.5*g*traj_ts*traj_ts #F, T, T
        
        # TODO(adit98) fix this
        #path_idxs = torch.ravel_multi_index(torch.stack((traj_locs_y_idx, traj_locs_x_idx)).reshape(2, -1), xx.shape)  # (F*T*T,)
        #traj_t_idxs = torch.round(10*traj_ts - 1).flatten()  # (F, T, T)
        #probs = p_int[path_idxs, t] # F*T*T, J
        #probs = probs.reshape((*traj_locs_x_idx.shape, len(reaction_player_locs)))  # F, T, T, J
        
        # TODO(adit98) this is where to add lambda tuning
        #lambda_dt = 1
        #probs = probs * lambda_z.unsqueeze(-1) * lambda_dt
        #norm_factor = torch.maximum(1., probs.sum(dim=-1))  #F, T, T
        #probs_norm = (probs / norm_factor[..., None])  #F, T, T, J

        #total_probs = torch.sum(probs_norm, dim=-1)  # F, T, T
        #compl_total_probs = 1 - total_probs  # F, T, T
        #remaining_compl_probs = torch.cumprod(compl_total_probs, dim=-1)  # F, T, T
        #off_probs = torch.sum(probs_norm * player_teams, dim=-1)
        #def_probs = torch.sum(probs_norm * (1 - player_teams), dim=-1)

        # maximum 0 because if it goes negative the pass has been caught by then and theres no residual probability

        #shift_compl_cumsum = torch.roll(remaining_compl_probs, 1, dim=-1)  # F, T, T
        #shift_compl_cumsum[:, :, 0] = 1
        #total_completion_prob_dt = shift_compl_cumsum * total_probs  # F, T, T
        #total_completion_prob = torch.cumsum(total_completion_prob_dt, dim=-1)  # F, T, T

        #completion_prob_off_dt = shift_compl_cumsum * off_probs  # F, T, T
        #completion_prob_def_dt = shift_compl_cumsum * def_probs  # F, T, T
        #completion_prob_off = torch.cumsum(completion_prob_off_dt, dim=-1)  # F, T, T
        #completion_prob_def = torch.cumsum(completion_prob_def_dt, dim=-1)  # F, T, T
        
        # this einsum takes the diagonal values over the last two axes
        # where T = t. this takes care of the t > T issue.
        #throw_p_int_off = torch.einsum('ijj->ij', total_completion_prob_off)  # F, T
        #throw_p_int_def = torch.einsum('ijj->ij', total_completion_prob_def)  # F, T
        #throw_p_int = torch.einsum('ijj->ij', total_completion_prob)  # F, T
        
        # below gets cutoff for combined model
        #field_p_int_off = throw_p_int_off.mean(dim=1)  # F,
        #field_p_int_def = throw_p_int_def.mean(dim=1)  # F,
        #field_p_int = throw_p_int.mean(dim=1)  # F,

        #field_p_no_int = 1-field_p_int

        #field_df = pd.DataFrame({
        #    'ball_start_x': ball_start[0],
        #    'ball_start_y': ball_start[1], 
        #    'ball_end_x': field_locs[:,0],
        #    'ball_end_y': field_locs[:,1],
        #    'p_mass_1': (((field_p_int_off-field_p_int_def)+1.)/2.).round(3),
        #    'p_mass_2': field_p_no_int.round(3),
            # 'p_mass_players': p_int_norm,
        #})



## Initialize Dataset, Model and Run Training Loop

In [42]:
ds = PlaysDataset(data_dir = '../data/')
print(ds[0].shape)

(779, 2)
torch.Size([12, 7])


### Sandbox

In [70]:
# load files
data_dir = '../data/'
tracking_df = pd.read_csv('../data/week1_norm.csv')
plays_df = pd.read_csv('../data/plays.csv')

#print(tracking_df.columns)

# get valid frames for tuning from tracking df
tracking_df = tracking_df[tracking_df['event'].isin(['pass_forward', 'pass_arrived', 
    'pass_outcome_caught', 'pass_outcome_incomplete', 'pass_outcome_touchdown', 'pass_outcome_intercepted'])]
tracking_df['valid_frame'] = tracking_df['event'].str.contains('pass_forward')
tracking_df = tracking_df.groupby(['playId', 'gameId']).filter(lambda l: l['valid_frame'].any()).reset_index()

# merge tracking df and plays df
all_plays = plays_df.merge(tracking_df, how='left', on=['playId', 'gameId'])

Index(['gameId', 'playId', 'frameId', 'event', 'nflId', 'displayName',
       'jerseyNumber', 'position', 'position_general', 'team', 'team_pos',
       'teamAbbr', 'route', 'time', 'los', 'x', 'y', 'dis', 'o', 's', 's_dir',
       's_dir_rad', 'v_x', 'v_y', 'v_theta', 'v_mag', 'a_old', 'a_x', 'a_y',
       'a_theta', 'a_mag'],
      dtype='object')
2018-09-09T17:04:04.500Z


In [87]:
play = tracking_df[(tracking_df.playId == 81) & (tracking_df.gameId == 2018090902)]
print(pd.to_timedelta(pd.to_datetime(play[play.event == 'pass_arrived'].time.iloc[0]) - pd.to_datetime(play[play.event == 'pass_forward'].time.iloc[0])).total_seconds())

2018-09-09 17:04:03+00:00
2018-09-09 17:04:04.500000+00:00
1.5


In [61]:
tracking_df[tracking_df.position != 'QB'][['gameId', 'playId', 'nflId']].drop_duplicates().groupby(['gameId', 'playId']).nflId.count().max()

17

In [58]:
min_dist_off = min_dist_off.rename(columns={0:'ball_dist'})
complete_passes = all_plays.loc[all_plays.event.isin(['pass_outcome_caught', 'pass_outcome_touchdown'])][['gameId', 'playId']]
incomplete_passes = all_plays.loc[all_plays.event.isin(['pass_outcome_incomplete'])][['gameId', 'playId']]

print(complete_passes.merge(min_dist_off).drop_duplicates().ball_dist.median())
print(incomplete_passes.merge(min_dist_off).drop_duplicates().ball_dist.mean())

min_dist_def = min_dist_def.rename(columns={0:'ball_dist'})
complete_passes = all_plays.loc[all_plays.event.isin(['pass_outcome_caught', 'pass_outcome_touchdown'])][['gameId', 'playId']]
incomplete_passes = all_plays.loc[all_plays.event.isin(['pass_outcome_incomplete'])][['gameId', 'playId']]

print(complete_passes.merge(min_dist_def).drop_duplicates().ball_dist.mean())
print(incomplete_passes.merge(min_dist_def).drop_duplicates().ball_dist.mean())

1.085102305866526
1.6439466513218899
4.790180627434475
2.435306899869004


In [None]:
all_plays.loc[(all_plays.gameId == 2018090600) & (all_plays.playId == 146)][['displayName', 'x', 'y']]
import matplotlib.pyplot as plt
%matplotlib inline

from visualize2 import AnimatePlay
np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
play_df = tracking_df[(tracking_df.gameId == 2018091000) & (tracking_df.playId == 3016)]

animated_play = AnimatePlay(play_df, 20)#play_df[play_df.frameId <= 46], 20)
HTML(animated_play.ani.to_jshtml())

In [None]:
print(get_nearest_player(play_df))