<a href="https://colab.research.google.com/github/therocket290/cs228-material/blob/master/Predict_Rush_Pass_and_Left_Center_Right_using_offensive_pre_snap_tracking_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_val_score

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
# Mount Google Drive so the files under /content/drive are readable.
# This import only resolves inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Folder (inside the mounted Drive) that every CSV below is read from.
path = '/content/drive/MyDrive/NFL_Big_Data_Bowl/'

In [4]:
# Load the four lookup tables that sit next to the tracking files.
games, player_play, players, plays = [
    pd.read_csv(path + table_name + '.csv')
    for table_name in ('games', 'player_play', 'players', 'plays')
]

In [5]:
# Map each player's nflId to the position listed in the players table.
player_position_dict = dict(zip(players['nflId'], players['position']))

In [6]:
def prep_training_data(df):
    """Attach player positions and keep only plays that contain a 'line_set' event.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with at least ``nflId``, ``playId`` and ``event`` columns. When a
        ``gameId`` column is present, plays are identified by the
        (gameId, playId) pair, the same key the rest of this notebook uses.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` belonging to plays with a 'line_set' event, with an
        added ``position`` column looked up in ``player_position_dict``.
        The input frame is not modified.
    """
    # assign() works on a copy, so the caller's frame (possibly itself a slice)
    # is not mutated and no SettingWithCopyWarning is triggered.
    df = df.assign(position=df.nflId.map(player_position_dict))
    line_set_rows = df[df.event == 'line_set']

    if 'gameId' in df.columns:
        # Filtering on playId alone would also keep a play from another game
        # that happens to share the same playId.
        key_cols = ['gameId', 'playId']
        line_set_keys = pd.MultiIndex.from_frame(line_set_rows[key_cols].drop_duplicates())
        keep = pd.MultiIndex.from_frame(df[key_cols]).isin(line_set_keys)
    else:
        keep = df.playId.isin(line_set_rows.playId.unique()).to_numpy()

    return df[keep]

Add 'week' to the plays dataframe.

In [7]:
# Attach each game's week to its plays.
# - A left join keeps exactly the rows already in `plays`; an outer join would also
#   add an all-NaN play row for any game that has no plays.
# - Dropping a pre-existing 'week' column first lets this cell be re-run without
#   producing 'week_x' / 'week_y'.
plays = plays.drop(columns='week', errors='ignore').merge(
    games[['gameId', 'week']], on='gameId', how='left'
)

In [8]:
# A function to get tracking data for each week to make a dataframe
def get_move_data(week, data):
    """Collect line-set-to-snap movement of the offense for every play of one week.

    Plays are taken from the notebook-level ``plays`` frame for the games that
    ``games`` lists under ``week``. A play is kept only when ``data`` contains,
    for that exact (gameId, playId), a 'line_set' event and a snap event
    ('ball_snap', 'snap_direct' or 'autoevent_ballsnap') with the line-set frame
    not after the snap frame. Plays missing either event are skipped.

    Parameters
    ----------
    week : int
        Value matched against ``games.week``.
    data : pandas.DataFrame
        Tracking rows. Columns read: gameId, playId, nflId, club, frameId,
        event, x, y, dis.

    Returns
    -------
    tuple
        Seven lists with one entry per kept play, in this order:
        game ids, play ids, per-player positions, per-player nflIds,
        per-player x traces, per-player y traces, per-player summed ``dis``.
        "Per-player" entries are lists ordered like the play's offensive players
        (rows whose ``club`` equals the play's ``possessionTeam``).
    """
    snap_events = ['ball_snap', 'snap_direct', 'autoevent_ballsnap']

    game_list = []
    play_list = []
    position_list = []
    overall_player_list = []
    overall_xlist = []
    overall_ylist = []
    overall_dist_list = []

    week_game_ids = games.loc[games.week == week, 'gameId'].unique()
    plays_df = plays[plays['gameId'].isin(week_game_ids)].reset_index(drop=True)

    # First listed position per player (same row the original per-player filter picked).
    position_by_id = players.drop_duplicates('nflId').set_index('nflId')['position'].to_dict()

    # Row positions of every (gameId, playId) in the tracking frame, computed once
    # instead of re-scanning the whole frame for each play.
    rows_by_play = data.groupby(['gameId', 'playId'], sort=False).indices

    play_keys = zip(plays_df['gameId'], plays_df['playId'], plays_df['possessionTeam'])
    for i, (game, play, offTeam) in enumerate(play_keys):
        if i % 500 == 0:
            print('Working on play ', i)

        row_positions = rows_by_play.get((game, play))
        if row_positions is None:
            continue  # no tracking rows for this play
        play_tracking = data.iloc[row_positions]

        # Both events must come from THIS play; checking the whole week's frame
        # would compare unrelated plays and let snap-less plays through.
        line_set_frames = play_tracking.loc[play_tracking.event == 'line_set', 'frameId'].values
        snap_frames = play_tracking.loc[play_tracking.event.isin(snap_events), 'frameId'].values
        if len(line_set_frames) == 0 or len(snap_frames) == 0:
            continue
        line_set = line_set_frames[0]
        snap = snap_frames[0] + 1  # exclusive upper bound, so the snap frame is included
        if line_set >= snap:
            continue

        offense = list(play_tracking[play_tracking.club == offTeam].nflId.unique())
        window = play_tracking[play_tracking.frameId.between(line_set, snap - 1)]

        positions = []
        player_list = []
        dist_list = []
        play_xlist = []
        play_ylist = []
        for player in offense:
            player_rows = window[window.nflId == player]
            player_list.append(player)
            positions.append(position_by_id[player])
            play_xlist.append(list(player_rows.x.values.astype('float')))
            play_ylist.append(list(player_rows.y.values.astype('float')))
            dist_list.append(player_rows.dis.sum())

        game_list.append(game)
        play_list.append(play)
        position_list.append(positions)
        overall_player_list.append(player_list)
        overall_xlist.append(play_xlist)
        overall_ylist.append(play_ylist)
        overall_dist_list.append(dist_list)

    return(game_list, play_list, position_list, overall_player_list, overall_xlist, overall_ylist, overall_dist_list)

In [9]:
# A function to create a pandas dataframe using the lists produced by get_move_data()
def make_move_df(game_list, play_list, position_list, overall_player_list, overall_xlist, overall_ylist, overall_dist_list, n_players=11):
    """Turn the per-play lists from get_move_data() into one row per play.

    For each player slot ``p`` in ``range(n_players)`` the row gets ten columns,
    always in this order: ``p{p}_pos``, ``_dist``, ``_xstart``, ``_xend``,
    ``_ystart``, ``_yend``, ``_xmin``, ``_xmax``, ``_ymin``, ``_ymax``.
    A slot is left as NaN when the play has fewer players than ``n_players`` or
    the player's trace is empty. Players beyond ``n_players`` are ignored.

    Parameters
    ----------
    game_list, play_list : list
        Game and play id of each play.
    position_list, overall_xlist, overall_ylist, overall_dist_list : list of list
        Per play, one entry per player: position, x trace, y trace, summed distance.
    overall_player_list : list of list
        Accepted for compatibility with get_move_data()'s output; not used here.
    n_players : int, default 11
        Number of player slots to emit.

    Returns
    -------
    pandas.DataFrame
        The per-play rows, left-joined on (playId, gameId) with selected columns
        of the notebook-level ``plays`` frame.
    """
    stat_suffixes = ['_pos', '_dist', '_xstart', '_xend', '_ystart', '_yend',
                     '_xmin', '_xmax', '_ymin', '_ymax']
    # Fix the column order up front: downstream code relies on each slot's
    # statistics directly following its '_pos' column.
    columns = ['playId', 'gameId'] + [
        'p' + str(p) + suffix for p in range(n_players) for suffix in stat_suffixes
    ]

    records = []
    for i in range(len(play_list)):
        record = {'playId': play_list[i], 'gameId': game_list[i]}
        for p in range(n_players):
            # Missing slot or empty trace: leave the keys out so they become NaN.
            if p >= len(overall_xlist[i]) or len(overall_xlist[i][p]) == 0:
                continue
            xs = overall_xlist[i][p]
            ys = overall_ylist[i][p]
            prefix = 'p' + str(p)
            record[prefix + '_pos'] = position_list[i][p]
            record[prefix + '_dist'] = overall_dist_list[i][p]
            record[prefix + '_xstart'] = xs[0]
            record[prefix + '_xend'] = xs[-1]
            record[prefix + '_ystart'] = ys[0]
            record[prefix + '_yend'] = ys[-1]
            record[prefix + '_xmin'] = np.min(xs)
            record[prefix + '_xmax'] = np.max(xs)
            record[prefix + '_ymin'] = np.min(ys)
            record[prefix + '_ymax'] = np.max(ys)
        records.append(record)

    # Build the frame once instead of writing it cell by cell with .loc.
    move_df = pd.DataFrame(records, columns=columns)

    move_df = move_df.merge(plays[['playId', 'gameId', 'quarter', 'down', 'yardsToGo', 'possessionTeam',
                     'defensiveTeam', 'yardlineSide', 'yardlineNumber', 'isDropback']], on=['playId','gameId'], how='left')
    return move_df

In [10]:
def add_pos_move(df, pos):
    """Add movement features aggregated over every player slot whose position is ``pos``.

    Player slots are the ``p{n}_pos`` columns; each slot's statistics are read
    from its sibling columns (``p{n}_dist``, ``_xstart``, ``_xend``, ``_ystart``,
    ``_yend``, ``_xmin``, ``_xmax``, ``_ymin``, ``_ymax``).

    New columns (NaN in any of them is replaced by -99):
      * ``{pos}_total_dist``, ``{pos}_total_ydisp``, ``{pos}_total_xdisp`` - sums
        over matching slots (0 when none match); displacement is end minus start.
      * ``{pos}_overall_ymin/ymax/xmin/xmax`` - extremes over matching slots.
      * ``{pos}_{j}_disp/ymin/ymax/xdisp/xmin/xmax`` - values of the j-th matching
        slot when slots are ranked by y displacement, largest first.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per play, with the slot columns described above.
    pos : str
        Position label to match against the ``p{n}_pos`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` (same rows, same index, left unmodified) plus the new
        columns. Only the new columns are filled with -99; NaN elsewhere is kept.
    """
    # Slot prefixes such as 'p0', 'p10', found by name rather than by column offset.
    slots = [
        str(col)[:-4] for col in df.columns
        if str(col).startswith('p') and str(col).endswith('_pos') and str(col)[1:-4].isdigit()
    ]
    n_rows = len(df)

    def _stat(suffix):
        # (rows x slots) float matrix of one statistic.
        if not slots:
            return np.empty((n_rows, 0))
        return df[[slot + suffix for slot in slots]].to_numpy(dtype=float)

    def _extreme(values, func):
        # min/max ignoring NaN; NaN when nothing is left.
        values = values[~np.isnan(values)]
        return func(values) if values.size else np.nan

    if slots:
        is_pos = df[[slot + '_pos' for slot in slots]].eq(pos).to_numpy(dtype=bool)
    else:
        is_pos = np.zeros((n_rows, 0), dtype=bool)

    dist = _stat('_dist')
    ydisp = _stat('_yend') - _stat('_ystart')
    xdisp = _stat('_xend') - _stat('_xstart')
    ymin, ymax = _stat('_ymin'), _stat('_ymax')
    xmin, xmax = _stat('_xmin'), _stat('_xmax')
    ranked_fields = [('_disp', ydisp), ('_ymin', ymin), ('_ymax', ymax),
                     ('_xdisp', xdisp), ('_xmin', xmin), ('_xmax', xmax)]

    records = []
    max_rank = 0
    # Rows are addressed by position only; the features are attached to df.index
    # at the end, so a non-default index cannot misalign or enlarge the frame.
    for r in range(n_rows):
        hits = np.flatnonzero(is_pos[r])
        # Largest y displacement first, NaN last, ties kept in slot order.
        ranked = hits[np.argsort(-ydisp[r, hits], kind='stable')]
        max_rank = max(max_rank, len(ranked))

        record = {
            pos + '_total_dist': np.nansum(dist[r, hits]),
            pos + '_total_ydisp': np.nansum(ydisp[r, hits]),
            pos + '_overall_ymin': _extreme(ymin[r, hits], np.min),
            pos + '_overall_ymax': _extreme(ymax[r, hits], np.max),
            pos + '_total_xdisp': np.nansum(xdisp[r, hits]),
            pos + '_overall_xmin': _extreme(xmin[r, hits], np.min),
            pos + '_overall_xmax': _extreme(xmax[r, hits], np.max),
        }
        for j, slot_idx in enumerate(ranked, start=1):
            for suffix, matrix in ranked_fields:
                record[pos + '_' + str(j) + suffix] = matrix[r, slot_idx]
        records.append(record)

    new_columns = [
        pos + '_total_dist', pos + '_total_ydisp', pos + '_overall_ymin',
        pos + '_overall_ymax', pos + '_total_xdisp', pos + '_overall_xmin',
        pos + '_overall_xmax',
    ] + [
        pos + '_' + str(j) + suffix
        for j in range(1, max_rank + 1) for suffix, _ in ranked_fields
    ]
    # Fill -99 in the new feature columns only, never in the rest of the frame.
    features = pd.DataFrame(records, index=df.index, columns=new_columns).fillna(-99)

    # Drop stale copies of these columns so re-running for the same position
    # replaces them instead of duplicating them.
    base = df.drop(columns=[col for col in new_columns if col in df.columns])
    return pd.concat([base, features], axis=1)

In [None]:
# Build the per-play movement table one weekly tracking file at a time
# (files tracking_week_1.csv ... tracking_week_9.csv), then stack the results.
move_dfs = []
for week in range(1, 10):
    # Report the same week number as the file being loaded.
    print('Working on week '+str(week)+'...')
    tracking_week = pd.read_csv(path+'tracking_week_'+str(week)+'.csv')
    move_dfs.append(make_move_df(*get_move_data(week, tracking_week)))
move_df = pd.concat(move_dfs).reset_index(drop=True)

Working on week 0...
Working on play  0
Working on play  500
Working on play  1000
Working on play  1500


## Play location information

In [None]:
# Keep plays with a recorded pass or rush location and store it in one shared
# 'locationType' column. assign() returns a new frame, so nothing is written
# into a filtered slice of `plays` (the original pattern raised SettingWithCopyWarning).
location_source_cols = ['rushLocationType', 'passLocationType']
pass_plays = (
    plays[plays.passLocationType.notna()]
    .assign(locationType=lambda frame: frame['passLocationType'])
    .drop(columns=location_source_cols)
)
rush_plays = (
    plays[plays.rushLocationType.notna()]
    .assign(locationType=lambda frame: frame['rushLocationType'])
    .drop(columns=location_source_cols)
)
marked_plays = pd.concat([rush_plays, pass_plays])
marked_plays = marked_plays[marked_plays.locationType!='UNKNOWN'].reset_index(drop=True)
# Collapse the three inside variants into a single 'INSIDE' label.
marked_plays['locationType'] = marked_plays['locationType'].replace(to_replace={'INSIDE_RIGHT':'INSIDE',
                                                                                'INSIDE_LEFT':'INSIDE',
                                                                               'INSIDE_BOX':'INSIDE'})
# Home score minus visitor score before the snap (not relative to the offense).
marked_plays['score_difference'] = marked_plays['preSnapHomeScore'] - marked_plays['preSnapVisitorScore']

In [None]:
# Number of plays per location label after the INSIDE variants were merged.
marked_plays.locationType.value_counts()

In [None]:
# Attach the location label and score context to each tracked play.
# - A left join keeps every row of move_df in its existing order; the outer join
#   used before only added label-only rows that dropna then removed again.
# - reset_index gives a contiguous 0..n-1 index after rows without a first
#   player slot are dropped, so later position-based row loops line up with labels.
move_df = (
    move_df.merge(
        marked_plays[['gameId', 'playId', 'locationType',
                      'score_difference', 'preSnapHomeTeamWinProbability']],
        on=['gameId', 'playId'],
        how='left',
    )
    .dropna(subset=['p0_pos'])
    .reset_index(drop=True)
)

In [None]:
# Add the aggregated movement features for one position group after another.
position_groups = ('WR', 'TE', 'T', 'RB', 'FB', 'G')
for position in position_groups:
    print('Working on ', position, '...')
    move_df = add_pos_move(move_df, position)

In [None]:
# Save the feature table to the current working directory (not the Drive folder).
move_df.to_csv('move_df.csv', index=False)

In [None]:
# Label encoder used below to turn the remaining text columns into integer codes.
enc = LabelEncoder()

In [None]:
# 1 when the ball is on the possession team's own side of the field, else 0.
move_df['yardlineSideOff'] = (move_df['possessionTeam'] == move_df['yardlineSide']).astype('int')
move_df = move_df.drop(columns=['yardlineSide'])
# One-hot encode the two team columns; this removes the original columns,
# so no name-based exclusion is needed in the loop below.
move_df = pd.get_dummies(move_df, columns=['possessionTeam','defensiveTeam'])
# Integer-encode every remaining text column. Each column gets its own encoder,
# kept in `label_encoders`, so codes can be mapped back with inverse_transform
# (a single shared encoder only remembers the last column it was fitted on).
label_encoders = {}
for col in list(move_df.columns):
    if move_df[col].dtype == 'object':
        label_encoders[col] = LabelEncoder()
        # astype(str) makes the values uniformly strings; LabelEncoder rejects a
        # column that mixes strings with numeric placeholders.
        move_df[col] = label_encoders[col].fit_transform(move_df[col].astype(str))

In [None]:
# Features and target for the rush-vs-pass model. The encoded table lives in
# `move_df`; `move_dfm_dum` was never defined, so the cell raised NameError.
X = move_df.drop(columns=['playId', 'gameId', 'isDropback', 'locationType'])
y = move_df['isDropback'].astype('int')

In [None]:
# LightGBM classifier with library-default hyperparameters.
tree = lgb.LGBMClassifier()

In [None]:
# 5-fold cross-validation; shuffling with a fixed seed keeps the folds reproducible.
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=290)

In [None]:
# One accuracy value per fold.
score_list = cross_val_score(tree, X, y, cv=kf, scoring='accuracy')

In [None]:
# Summarise the per-fold accuracies from cross_val_score. The scores are stored
# in `score_list`; `score_list2` was never defined, so the cell raised NameError.
average_acc = np.mean(score_list)

print(f"Accuracy Score for each fold: {[round(score, 4) for score in score_list]}")
print(f"Average accuracy across {k} folds: {average_acc:.2f}")

In [None]:
# Refit on all rows so the importance plots below describe a model trained on the full table.
tree.fit(X,y)

In [None]:
# Top 30 features ranked by importance_type="gain".
lgb.plot_importance(tree, importance_type="gain", figsize=(7,6), max_num_features=30,
                    title="Feature importance for predicting rush vs pass (Gain)")
plt.show()

In [None]:
# Top 30 features ranked by importance_type="split".
lgb.plot_importance(tree, importance_type="split", figsize=(7,6), max_num_features=30,
                    title="Feature importance for predicting rush vs pass (Split)")
plt.show()