### Explanation

This kernel uses LightGBM (LGBM) and treats the task as a regression problem. I have only done a little feature engineering so far (just transforming some date-format features into numeric ones).

The idea is:
- If we treat it as a regression problem, it is better to apply some smoothing. See this [kernel](https://www.kaggle.com/hukuda222/nfl-simple-evluation-trick).
- I used the distribution from this [kernel](https://www.kaggle.com/jpmiller/simple-distribution) as my smoothing distribution.
- The simple distribution in that [kernel](https://www.kaggle.com/jpmiller/simple-distribution) scores 1436 on the LB. If we use LGBM for the regression prediction and shift the distribution based on the yards we predict, we should get a better LB score.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
import catboost as cb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold,TimeSeriesSplit,KFold,GroupKFold
from sklearn.metrics import roc_auc_score,mean_squared_error,mean_absolute_error
import sqlite3
import xgboost as xgb
from datetime import datetime
from sklearn.linear_model import LogisticRegression
from scipy.stats import pearsonr
import gc
from sklearn.model_selection import TimeSeriesSplit
from bayes_opt import BayesianOptimization
from kaggle.competitions import nflrush
import math
import tqdm
from scipy.spatial import Delaunay, delaunay_plot_2d, Voronoi, voronoi_plot_2d, ConvexHull
env = nflrush.make_env()

In [2]:
# Load the competition training table from the Kaggle input directory.
# low_memory=False makes pandas read the file in one pass instead of chunks.
train = pd.read_csv('../input/nfl-big-data-bowl-2020/train.csv',low_memory=False)

In [3]:
# Rewrite four team abbreviations in both the visitor and home columns.
# NOTE(review): presumably this aligns them with the spelling used in
# PossessionTeam / FieldPosition -- confirm against the data.
_abbr_fixes = {"ARI": "ARZ", "BAL": "BLT", "CLE": "CLV", "HOU": "HST"}
for _old_abbr, _new_abbr in _abbr_fixes.items():
    for _abbr_col in ("VisitorTeamAbbr", "HomeTeamAbbr"):
        train.loc[train[_abbr_col] == _old_abbr, _abbr_col] = _new_abbr

# True on the row of the player whose NflId equals the play's NflIdRusher.
train['is_run'] = train.NflId == train.NflIdRusher

In [4]:
# my original idea for feature engineering---------------------------------------------------
#def strtoseconds(txt):
#    txt = txt.split(':')
#    ans = int(txt[0])*60 + int(txt[1]) + int(txt[2])/60
#    return ans

#def str_to_float(txt):
#    try:
#        return float(txt)
#    except:
#        return -1

# age #
#FMT_birth = '%m/%d/%Y'
#FMT_gamedate = '%Y-%m-%d'
#train["Age"] = train["TimeSnap"].apply(lambda t: t.split("T")[0])
#train["Age"] = train["Age"].apply(lambda t: datetime.strptime(t, FMT_gamedate))
#tmp_birth = train["PlayerBirthDate"].apply(lambda t: datetime.strptime(t, FMT_birth))
#train["Age"] = train["Age"] - tmp_birth
#train["Age"] = train["Age"].apply(lambda t: t.days//365)

# momentum: per-row product of the S and PlayerWeight columns
train["Momentum"] = train["S"] * train["PlayerWeight"]

# on offense
def func(row):
    """Return "home" if the possession team is the home team, else "away".

    ``row`` is any mapping-like object exposing the "PossessionTeam" and
    "HomeTeamAbbr" keys (e.g. a DataFrame row).
    """
    return "home" if row["PossessionTeam"] == row["HomeTeamAbbr"] else "away"
# Side ("home"/"away") of the team in possession, computed column-wise rather
# than with a Python-level call per row; the values match what func() returns.
train["OnOffense"] = np.where(train["PossessionTeam"] == train["HomeTeamAbbr"], "home", "away")
# A player is on offense when his Team value equals the possession side.
train["OnOffense"] = train["OnOffense"] == train["Team"]

#train["FieldvsPosession"] = train["FieldPosition"] == train["PossessionTeam"]
#train["Distance10"] = train["Distance"].apply(lambda x: 1 if x > 10 else 0)
#train["DownQuarter"] = train[["Down", "Quarter"]].apply(lambda x: "D{}_Q{}".format(x[0], x[1]), axis=1)

# exercise energy
#train["ExerciseEnegy"] = 0.5 * train["PlayerWeight"] * (train["S"]**2)

# Per-play rusher position and speed, looked up by PlayId and broadcast to
# every player row of the same play.  Keying on PlayId avoids depending on the
# row order of `train` or on every play having exactly 22 rows and one rusher;
# selecting the three columns before aggregating avoids averaging every column.
rusher_stats = train[train["is_run"]].groupby("PlayId")[["X", "Y", "S"]].mean()

rusher_x = np.array(train["PlayId"].map(rusher_stats["X"]))  # train["RusherX"]
rusher_y = np.array(train["PlayId"].map(rusher_stats["Y"]))  # train["RusherY"]
train["DisToRusher"] = np.sqrt((train["X"] - rusher_x) ** 2 + (train["Y"] - rusher_y) ** 2)
# Distance divided by the player's own speed; rows with S == 0 yield inf
# (or NaN when the distance is 0 as well, e.g. a stationary rusher).
train["TackleTimeToRusher"] = train["DisToRusher"] / train["S"]

rusher_s = np.array(train["PlayId"].map(rusher_stats["S"]))
train["RatioSToRusher"] = train["S"] / rusher_s

# distance without no restriction if the difference between distance is large, the player is restricted by defenders
#train["MoveDist"] = train["S"] * train["TimeFromSnapDiff"] + 0.5 * train["A"] * (train["TimeFromSnapDiff"] **2)

# ratio of real movement distance to theoretical movement distance
#train["RealToTheoryDis"] = train["Dis"] / train["MoveDist"]

print("Preprocess finished")
# my original idea end ---------------------------------------------------

Preprocess finished


In [5]:
# Keep one row per play: the rusher's row (is_run is a boolean column).
train_single = train.loc[train["is_run"]].copy()

def transform_time_quarter(str1):
    """Convert a clock string to seconds.

    Characters 0-1 are read as minutes and characters 3-4 as seconds; anything
    after that (e.g. a trailing ":00") is ignored.
    """
    minutes = int(str1[:2])
    seconds = int(str1[3:5])
    return minutes * 60 + seconds
def transform_time_all(str1, quarter):
    """Convert a clock string plus quarter number into a running seconds count.

    Quarters 1-4 are treated as 15-minute periods and quarter 5 as a 10-minute
    period; the clock value (minutes in chars 0-1, seconds in chars 3-4) is
    subtracted from the period length and 15 minutes are added for every
    earlier quarter.  Returns None for any other quarter value.
    """
    if quarter <= 4:
        period_length = 15 * 60
    elif quarter == 5:
        period_length = 10 * 60
    else:
        # No rule for this quarter value: fall through with no result.
        return None
    clock_seconds = int(str1[:2]) * 60 + int(str1[3:5])
    return period_length - clock_seconds + (quarter - 1) * 15 * 60
# Clock value in seconds, and the running seconds count across quarters.
train_single['time_quarter'] = train_single.GameClock.map(transform_time_quarter)
train_single['time_end'] = train_single.apply(
    lambda row: transform_time_all(row.loc['GameClock'], row.loc['Quarter']),
    axis=1,
)

# Both timestamp columns share the same text format.
_timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ"
for _ts_col in ('TimeHandoff', 'TimeSnap'):
    train_single[_ts_col] = train_single[_ts_col].apply(
        lambda text: datetime.strptime(text, _timestamp_format)
    )

# `.seconds` is the seconds component of the timedelta (whole seconds, days
# excluded), not total_seconds().
_handoff_delay = train_single['TimeHandoff'] - train_single['TimeSnap']
train_single['handoff_snap_diff'] = _handoff_delay.map(lambda delta: delta.seconds)
# my original idea -----
#train_single['WindSpeed'] = train_single['WindSpeed'].apply(lambda x: x.lower().replace('mph', '').strip() if not pd.isna(x) else x)
#train_single['WindSpeed'] = train_single['WindSpeed'].apply(lambda x: (int(x.split('-')[0])+int(x.split('-')[1]))/2 if not pd.isna(x) and '-' in x else x)
#train_single['WindSpeed'] = train_single['WindSpeed'].apply(lambda x: (int(x.split()[0])+int(x.split()[-1]))/2 if not pd.isna(x) and type(x)!=float and 'gusts up to' in x else x)
#train_single['WindSpeed'] = train_single['WindSpeed'].apply(str_to_float)

#train_single['WindDirection'] = train_single['WindDirection'].apply(lambda x: "north" if x == "N" or x == "FROM S"
#                                                   else ("south" if x == 'S' or x== 'FROM N'
#                                                   else ("west" if x == 'W' or x == 'FROM E'
#                                                   else ("east" if x == 'E' or x == 'FROM W'
#                                                   else ("north east" if x == 'FROM SW' or x == 'FROM SSW' or x == 'FROM WSW'
#                                                   else ("north west" if x == 'FROM SE' or x == 'FROM SSE' or x == 'FROM ESE'
#                                                   else ("south east" if x == 'FROM NW' or x == 'FROM NNW' or x == 'FROM WNW'
#                                                   else ("south west" if x == 'FROM NE' or x == 'FROM NNE' or x == 'FROM ENE'
#                                                   else ("north west" if x == 'NW' or x == 'NORTHWEST'
#                                                   else ("north east" if x == 'NE' or x == 'NORTH EAST'
#                                                   else ("south west" if x == 'SW' or x == 'SOUTHWEST'
#                                                   else ("south east" if x == 'SE' or x == 'SOUTHEAST' else "unknown"))))))))))))

#rain = ['Rainy', 'Rain Chance 40%', 'Showers', 'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.',
#          'Scattered Showers', 'Cloudy, Rain', 'Rain shower', 'Light Rain', 'Rain']
#overcast = ['Cloudy, light snow accumulating 1-3"', 'Party Cloudy', 'Cloudy, chance of rain','Coudy', 'Cloudy, 50% change of rain', 
#            'Rain likely, temps in low 40s.', 'Cloudy and cold', 'Cloudy, fog started developing in 2nd quarter', 'Partly Clouidy', 
#            '30% Chance of Rain', 'Mostly Coudy', 'Cloudy and Cool', 'cloudy', 'Partly cloudy', 'Overcast', 'Hazy', 'Mostly cloudy', 
#            'Mostly Cloudy', 'Partly Cloudy', 'Cloudy']
#clear = ['Partly clear', 'Sunny and clear', 'Sun & clouds', 'Clear and Sunny', 'Sunny and cold', 'Sunny Skies', 'Clear and Cool', 'Clear and sunny',
#        'Sunny, highs to upper 80s', 'Mostly Sunny Skies', 'Cold', 'Clear and warm', 'Sunny and warm', 'Clear and cold', 'Mostly sunny',
#        'T: 51; H: 55; W: NW 10 mph', 'Clear Skies', 'Clear skies', 'Partly sunny', 'Fair', 'Partly Sunny', 'Mostly Sunny', 'Clear', 'Sunny']
#snow = ['Heavy lake effect snow', 'Snow']
#none = ['N/A Indoor', 'Indoors', 'Indoor', 'N/A (Indoors)', 'Controlled Climate']

#train_single['GameWeather'] = train_single['GameWeather'].apply(lambda x: "rain" if x in rain 
#                                                        else ("overcast" if x in overcast
#                                                        else ("clear" if x in clear
#                                                        else ("snow" if x in snow
#                                                        else ("indoor" if x in none else "unknown")))))

#outdoor =['Outdoor', 'Outdoors', 'Cloudy', 'Heinz Field', 'Outdor', 'Ourdoor', 'Outside', 'Outddors', 'Outdoor Retr Roof-Open', 'Oudoor', 'Bowl']
#indoor_closed = ['Indoors', 'Indoor', 'Indoor, Roof Closed', 'Indoor, Roof Closed', 
#                 'Retractable Roof', 'Retr. Roof-Closed', 'Retr. Roof - Closed', 'Retr. Roof Closed']
#indoor_open = ['Indoor, Open Roof', 'Open', 'Retr. Roof-Open', 'Retr. Roof - Open']
#dome_closed = ['Dome', 'Domed, closed', 'Closed Dome', 'Domed', 'Dome, closed']
#dome_open = ['Domed, Open', 'Domed, open']

#train_single['StadiumType'] = train_single['StadiumType'].apply(lambda x: "outdoor" if x in outdoor 
#                                                        else ("indoor closed" if x in indoor_closed
#                                                        else ("indoor open" if x in indoor_open
#                                                        else ("dome_closed" if x in dome_closed
#                                                        else ("dome_open" if x in dome_open else "unknown")))))

#train_single["Stadium"] = train_single["Stadium"].map(lambda x: "Broncos Stadium at Mile High" if x=="Broncos Stadium At Mile High" 
#                                             else ("CenturyLink Field" if x == "CenturyField" or x == x=="CenturyLink"
#                                             else ("Everbank Field" if x == "EverBank Field"
#                                             else ("FirstEnergy Stadium" if x =="First Energy Stadium" or x=="FirstEnergy" or x == "FirstEnergyStadium"
#                                             else ("Lambeau Field" if x == "Lambeau field"
#                                             else ("Los Angeles Memorial Coliseum" if x == "Los Angeles Memorial Coliesum"
#                                             else ("M&T Bank Stadium" if x == "M & T Bank Stadium" or x == "M&T Stadium"
#                                             else ("Mercedes-Benz Superdome" if x == "Mercedes-Benz Dome"
#                                             else ("MetLife Stadium" if x == "MetLife" or x == "Metlife Stadium"
#                                             else ("NRG Stadium" if x == "NRG"
#                                             else ("Oakland-Alameda County Coliseum" if x == "Oakland Alameda-County Coliseum"
#                                             else ("Paul Brown Stadium" if x == "Paul Brown Stdium"
#                                             else ("Twickenham Stadium" if x == "Twickenham" else x)))))))))))))

#train_single["Location"] = train_single["Location"].map(lambda x: "Arlington, TX" if x == "Arlington, Texas"
#                        else ("Baltimore, MD" if x == "Baltimore, Maryland" or x == "Baltimore, Md."
#                        else ("Charlotte, NC" if x == "Charlotte, North Carolina"
#                        else ("Chicago, IL" if x == "Chicago. IL"
#                        else ("Cincinnati, OH" if x == "Cincinnati, Ohio"
#                        else ("Cleveland, OH" if x == "Cleveland" or x == "Cleveland Ohio" or x == "Cleveland, Ohio" or x == "Cleveland,Ohio"
#                        else ("Detroit, MI" if x == "Detroit"
#                        else ("East Rutherford, NJ" if x == "E. Rutherford, NJ" or x == "East Rutherford, N.J."
#                        else ("Foxborough, MA" if x == "Foxborough, Ma"
#                        else ("Houston, TX" if x == "Houston, Texas"
#                        else ("Jacksonville, FL" if x == "Jacksonville Florida" or x == "Jacksonville, Fl" or x == "Jacksonville, Florida"
#                        else ("London" if x == "London, England"
#                        else ("Los Angeles, CA" if x == "Los Angeles, Calif."
#                        else ("Miami Gardens, FLA" if x == "Miami Gardens, Fla."
#                        else ("New Orleans, LA" if x == "New Orleans" or x == "New Orleans, La."
#                        else ("Orchard Park, NY" if x == "Orchard Park NY"
#                        else ("Philadelphia, PA" if x == "Philadelphia, Pa."
#                        else ("Pittsburgh, PA" if x == "Pittsburgh"
#                        else ("Seattle, WA" if x == "Seattle" else x)))))))))))))))))))

#train_single["Turf"] = train_single["Turf"].map(lambda x: "Artificial" if x == "Artifical"
#                                       else ("Field Turf" if x == "FieldTurf" or x == "Field turf"
#                                       else ("FieldTurf 360" if x == "FieldTurf360"
#                                       else ("Natural Grass" if x == "natural grass" or x == "Naturall Grass" or x == "Natural grass" or x == "Natural"
#                                       else ("Grass" if x == "grass"
#                                       else ("UBU Speed Series-S5-M" if x == "UBU Sports Speed S5-M" else x))))))

# Missing formations become their own "Unknown" category.
train_single["OffenseFormation"] = train_single["OffenseFormation"].fillna("Unknown")
# Defenders in the box per yard of Distance.
train_single['DefendersInTheBox_vs_Distance'] = train_single['DefendersInTheBox'].div(train_single['Distance'])

# defense personnel -----
#arr = [[int(s[0]) for s in t.split(", ")] for t in train_single["DefensePersonnel"]]
#train_single["DefenseDL"] = np.array([a[0] for a in arr])
#train_single["DefenseLB"] = np.array([a[1] for a in arr])
#train_single["DefenseDB"] = np.array([a[2] for a in arr])
#train_single["DefenseOL"] = np.array([a[3] if len(a) == 4 else 0 for a in arr])

# offense personnel -----
#train_single["OffenseRB"] = train_single["OffensePersonnel"].apply(lambda x: 
#                        int(x.replace(",", "").split(" RB")[0][-1]) if "RB" in x else 0)
#train_single["OffenseTE"] = train_single["OffensePersonnel"].apply(lambda x: 
#                        int(x.replace(",", "").split(" TE")[0][-1]) if "TE" in x else 0)
#train_single["OffenseWR"] = train_single["OffensePersonnel"].apply(lambda x: 
#                        int(x.replace(",", "").split(" WR")[0][-1]) if "WR" in x else 0)
#train_single["OffenseOL"] = train_single["OffensePersonnel"].apply(lambda x: 
#                        int(x.replace(",", "").split(" OL")[0][-1]) if "OL" in x else 0)
#train_single["OffenseDL"] = train_single["OffensePersonnel"].apply(lambda x: 
#                        int(x.replace(",", "").split(" DL")[0][-1]) if "DL" in x else 0)
#train_single["OffenseQB"] = train_single["OffensePersonnel"].apply(lambda x: 
#                        int(x.replace(",", "").split(" QB")[0][-1]) if "QB" in x else 0)

# necessary yard per remaining down: Distance divided by (5 - Down)
train_single["NecDisPerDown"] = train_single["Distance"].div(5 - train_single["Down"])

# Score margin from the possession team's point of view: the home lead when
# the home team has the ball, otherwise its negation (the visitor lead).
_home_lead = train_single["HomeScoreBeforePlay"] - train_single["VisitorScoreBeforePlay"]
_home_has_ball = train_single["PossessionTeam"] == train_single["HomeTeamAbbr"]
train_single["Margin"] = _home_lead.where(_home_has_ball, -_home_lead)
# my original idea ----

In [6]:
# Columns dropped from the rusher table before modelling.  The list is kept as
# a module-level name (same contents and order as before) because other cells
# may reuse it.
remove_features = [
    'GameId', 'PlayId', 'DisplayName', 'GameClock', 'TimeHandoff', 'TimeSnap',
    'Stadium', 'StadiumType', 'Location', 'PlayerCollegeName', 'Week',
    'WindDirection', 'WindSpeed', 'GameWeather', 'Turf',
    'Humidity', 'Temperature', 'JerseyNumber', 'Season', 'Position', 'Quarter',
    'NflId', 'HomeScoreBeforePlay', "VisitorScoreBeforePlay", 'Down',
    'OffensePersonnel', 'DefensePersonnel', 'PlayerWeight',  # , 'X', 'Y', 'Dir', "YardLine"
    'HomeTeamAbbr', 'VisitorTeamAbbr', 'PlayerBirthDate', 'is_run',
    'PossessionTeam', 'FieldPosition', 'PlayerHeight', 'NflIdRusher',
    'date_game', 'own_field',
]

# Game date = first eight characters (YYYYMMDD) of GameId, parsed in one
# vectorised call instead of one pd.to_datetime call per row.
train_single['date_game'] = pd.to_datetime(train_single.GameId.astype(str).str[:8], format='%Y%m%d')
#train_single['runner_age'] = (train_single.date_game.map(pd.to_datetime) - train_single.PlayerBirthDate.map(pd.to_datetime)).map(lambda x:x.days)/365
#def transform_height(te):
#    return (int(te.split('-')[0])*12 + int(te.split('-')[1]))*2.54/100
#train_single['runner_height'] = train_single.PlayerHeight.map(transform_height)

# 1 when the FieldPosition value equals the possession team, else 0.
train_single['own_field'] = (train_single['FieldPosition'] == train_single['PossessionTeam']).astype(int)
# 100 - YardLine when own_field == 1, otherwise YardLine unchanged.  Computed
# before the drop below because own_field is removed there; the result keeps
# train_single's index so it can be sliced positionally alongside X_train.
dist_to_end_train = train_single['YardLine'].where(
    train_single['own_field'] != 1, 100 - train_single['YardLine']
)

train_single.drop(remove_features,axis=1,inplace=True)
# -999 is the missing-value sentinel used throughout the notebook.
train_single.fillna(-999,inplace=True)

In [7]:
# Target and feature matrix.
y_train = train_single['Yards']
X_train = train_single.drop(columns=['Yards'])

# Integer-encode every object-typed column.  -999 is appended to the fitted
# values so the missing-value sentinel always has a class of its own.
_object_columns = [name for name in X_train.columns if X_train[name].dtype == 'object']
for f in _object_columns:
    _column_values = list(X_train[f])
    lbl = preprocessing.LabelEncoder()
    lbl.fit(_column_values + [-999])
    X_train[f] = lbl.transform(_column_values)

In [8]:
# additional feature engineering -----------------------------------------------------------
def voronoi_volumes(points, selected_index):
    """Return the size of the Voronoi cell that belongs to one input point.

    Parameters
    ----------
    points : array-like of shape (n_points, n_dims)
        Coordinates handed to ``scipy.spatial.Voronoi``.
    selected_index : int
        Row of ``points`` whose cell is measured (e.g. the rusher).

    Returns
    -------
    float
        ``ConvexHull(...).volume`` of the cell's vertices (the area for 2-D
        input), or -999 when the cell is open, i.e. its vertex list contains
        the -1 marker.

    Only the requested cell is measured; the other cells of the diagram are
    not needed for the result.
    """
    v = Voronoi(points)
    region_vertices = v.regions[v.point_region[selected_index]]
    if -1 in region_vertices:  # some regions can be opened
        return -999.0  # missing-value sentinel for an open cell
    return ConvexHull(v.vertices[region_vertices]).volume

# Every aggregate below is taken per (PlayId, OnOffense) group.  In the sorted
# result the OnOffense == False row of a play comes first, so [0::2] selects
# the defense rows and [1::2] the offense rows.  This assumes every play has
# players on both sides and that X_train rows are in the same play order.
side_groups = train.groupby(["PlayId", "OnOffense"])

# Columns are selected before aggregating so only the needed ones are averaged.
tmp = side_groups[["S", "X", "Y"]].mean()
X_train["DefenseAveSpeed"] = np.array(tmp[0::2]["S"])
X_train["OffenseAveSpeed"] = np.array(tmp[1::2]["S"])

X_train["DefenseAveX"] = np.array(tmp[0::2]["X"])
X_train["OffenseAveX"] = np.array(tmp[1::2]["X"])

X_train["DefenseAveY"] = np.array(tmp[0::2]["Y"])
X_train["OffenseAveY"] = np.array(tmp[1::2]["Y"])

#X_train["DefenseAveAge"] = np.array(tmp[0::2]["Age"])
#X_train["OffenseAveAge"] = np.array(tmp[1::2]["Age"])

# .std() on the two selected columns yields plain 1-D columns per side.
tmp = side_groups[["X", "Y"]].std()
X_train["DefenseStdX"] = np.array(tmp[0::2]["X"])
X_train["OffenseStdX"] = np.array(tmp[1::2]["X"])

X_train["DefenseStdY"] = np.array(tmp[0::2]["Y"])
X_train["OffenseStdY"] = np.array(tmp[1::2]["Y"])

# Euclidean distance from the rusher to each side's mean position.
X_train["RunnerToDefenseCentoid"] = np.sqrt((X_train["X"] - X_train["DefenseAveX"]) ** 2 + (X_train["Y"] - X_train["DefenseAveY"]) ** 2)
X_train["RunnerToOffenseCentoid"] = np.sqrt((X_train["X"] - X_train["OffenseAveX"]) ** 2 + (X_train["Y"] - X_train["OffenseAveY"]) ** 2)

# defense x spread, offense x spread
tmp_max = side_groups["X"].max()
tmp_min = side_groups["X"].min()
X_train["DefenseSpreadX"] = np.array(tmp_max[0::2]- tmp_min[0::2])
X_train["OffenseSpreadX"] = np.array(tmp_max[1::2]- tmp_min[1::2])

X_train["RunnerToScrimmage"] = X_train["X"] - X_train["YardLine"]

# runner horizontal and vertical speed
radian_angle = (90 - X_train['Dir']) * np.pi / 180.0
X_train['v_horizontal'] = np.abs(X_train['S'] * np.cos(radian_angle))
X_train['v_vertical'] = np.abs(X_train['S'] * np.sin(radian_angle))

# runner horizontal and vertical momentum
X_train['m_horizontal'] = np.abs(X_train['Momentum'] * np.cos(radian_angle))
X_train['m_vertical'] = np.abs(X_train['Momentum'] * np.sin(radian_angle))

# minimum distance to rusher from defenders
X_train["MinDisFromRushToDef"] = np.array(side_groups["DisToRusher"].min()[0::2])

# tackle time from closest defender to rusher
X_train["MinTackleTime"] = np.array(side_groups["TackleTimeToRusher"].min()[0::2])

# average tackle time from all defenders to rusher
X_train["AveTackleTime"] = np.array(side_groups["TackleTimeToRusher"].mean()[0::2])

# Rows of the defender closest to the rusher in each play, looked up once and
# reused for the speed ratio, the momentum and the Voronoi index below.
closest_def_rows = train.loc[side_groups["DisToRusher"].idxmin()[0::2]]

# runner vs 1st defender speed: runner's velocity divided by closest defender's speed
X_train["RatioSRusherToCloseDef"] = np.array(closest_def_rows["RatioSToRusher"])

# runner horizontal and vertical distance
X_train["dis_horizontal"] = np.abs(X_train['Dis'] * np.cos(radian_angle))
X_train["dis_vertical"] = np.abs(X_train['Dis'] * np.sin(radian_angle))
X_train["RunnerMoveRatio"] = X_train["dis_horizontal"] / X_train["dis_vertical"]

# the momentum of 1st closest defender to rusher, horizontal momentum, vertical momentum
# NOTE(review): the split below uses radian_angle, which is built from the
# rusher's Dir, not the defender's own Dir -- confirm this is intended.
X_train["DefMomentumCloToRusher"] = np.array(closest_def_rows["Momentum"])
X_train["DefMomentumCloToRusher_horizontal"] = np.abs(X_train['DefMomentumCloToRusher'] * np.cos(radian_angle))
X_train["DefMomentumCloToRusher_vertical"] = np.abs(X_train['DefMomentumCloToRusher'] * np.sin(radian_angle))

# the horizontal, vertical mometum of rusher
X_train["RusherMomentum_horizontal"] =  np.abs(X_train['Momentum'] * np.cos(radian_angle))
X_train["RusherMomentum_vertical"] =  np.abs(X_train['Momentum'] * np.sin(radian_angle))

# difference of horizontal momentum
X_train["hMomentum_rusher_vs_defender"] = X_train["RusherMomentum_horizontal"] - X_train["DefMomentumCloToRusher_horizontal"]

# voronoi area
# The reshape treats `train` as consecutive chunks of 22 rows, one chunk per play.
pts = np.array(train[["X", "Y"]]).reshape(train.shape[0]//22, 22, 2) # plays * players * (X, Y)
# index of row where rusher data is included when separated by each play
# (index % 22 is the position inside the play only if the index is 0..n-1)
rusher_index = list(train[train.is_run==True].index % 22)
closest_def_index = list(closest_def_rows.index % 22)
rusher_voronoi = []
closest_def_voronoi = []
for play_points, rusher_pos, defender_pos in zip(pts, rusher_index, closest_def_index):
    rusher_voronoi.append(voronoi_volumes(play_points, rusher_pos))
    closest_def_voronoi.append(voronoi_volumes(play_points, defender_pos))
X_train["RusherVoronoi"] = rusher_voronoi
X_train["FirstDefenderVoronoi"] = closest_def_voronoi
# NOTE(review): this returns a new frame and does not modify X_train, so
# 'OnOffense' stays in the feature matrix.  Left as-is so the training columns
# keep matching whatever the test-time pipeline produces -- confirm before
# turning it into an assignment.
X_train.drop('OnOffense', axis=1)
# additional feature engineering end ---------------------------------------------------------

Unnamed: 0,Team,X,Y,S,A,Dis,Orientation,Dir,YardLine,Distance,...,dis_vertical,RunnerMoveRatio,DefMomentumCloToRusher,DefMomentumCloToRusher_horizontal,DefMomentumCloToRusher_vertical,RusherMomentum_horizontal,RusherMomentum_vertical,hMomentum_rusher_vs_defender,RusherVoronoi,FirstDefenderVoronoi
18,2,78.75,30.53,3.63,3.35,0.38,161.98,245.74,35,2,...,0.156134,2.218883,120.96,110.278064,49.699800,678.434371,305.754848,568.156307,25.077409,0.639328
40,2,71.07,27.16,3.06,2.41,0.34,210.70,312.20,43,10,...,0.228385,1.102846,369.60,273.801379,248.267930,464.706723,421.370326,190.905344,14.993787,4.604930
62,2,48.66,19.11,5.77,2.42,0.60,140.82,221.96,35,10,...,0.446167,0.899141,757.68,506.593660,563.419778,790.867267,879.581201,284.273607,477.126397,3.577924
84,2,15.53,25.36,4.45,3.20,0.46,186.22,275.44,2,2,...,0.043610,10.500645,331.10,329.608733,31.389380,930.291033,88.593706,600.682300,20.176370,0.743314
98,1,29.99,27.12,3.90,2.53,0.44,34.27,157.92,25,10,...,0.407730,0.405651,844.80,317.561013,782.842285,316.658851,780.618301,-0.902162,12.513534,4.556508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509657,1,49.57,32.27,3.99,3.38,0.39,287.44,290.23,35,10,...,0.134858,2.713535,902.70,847.014329,312.144223,838.625819,309.052864,-8.388510,-999.000000,5.563660
509691,2,31.00,30.99,4.18,2.95,0.40,122.97,150.98,25,10,...,0.349780,0.554765,706.85,342.903460,618.105282,448.139434,807.799812,105.235973,11.897363,3.743602
509713,2,39.93,29.20,4.67,3.68,0.45,63.11,41.25,34,1,...,0.338328,0.876976,808.96,533.384391,608.208331,680.491035,775.951310,147.106645,51.931095,12.563757
509739,2,81.19,23.75,4.23,2.43,0.41,66.34,41.24,25,10,...,0.308301,0.876668,737.28,486.025729,554.401289,619.042047,706.130744,133.016319,10.231950,5.013091


In [9]:
def get_cdf_df(yards_array):
    """Build the empirical cumulative distribution of a set of yardages.

    The values are histogrammed into 199 unit-wide bins covering [-99, 100],
    the normalised histogram is accumulated and clipped to [0, 1], and the
    result is returned as a one-row DataFrame whose columns are named
    'Yards-99' ... 'Yards99'.
    """
    first_yard, last_edge = -99, 100
    density, _edges = np.histogram(
        yards_array, bins=199, range=(first_yard, last_edge), density=True
    )
    cumulative = np.clip(np.cumsum(density), 0, 1)
    column_names = ['Yards' + str(yard) for yard in range(first_yard, last_edge)]
    return pd.DataFrame(data=cumulative[np.newaxis, :], columns=column_names)
# Empirical CDF of the training target, flattened from the one-row frame into
# a 1-D array (one entry per yardage column); used as the template that the
# scoring helpers shift around each prediction.
cdf = get_cdf_df(y_train).values.reshape(-1,)

# Layout of the yardage grid shared by the CDF arrays: 199 one-yard slots,
# index 0 <-> -99 yards and index 198 <-> +99 yards.
_N_YARD_BINS = 199
_YARD_OFFSET = 99


def get_score(y_pred,cdf,w,dist_to_end):
    """Shift a template CDF so that it follows a point prediction.

    Parameters
    ----------
    y_pred : number
        Predicted yards; truncated toward zero with ``int()``.
    cdf : numpy array of length 199
        Template cumulative distribution on the -99..99 yard grid.
    w : int
        Prediction value for which the template is used unshifted; the
        template is moved by ``int(y_pred) - w`` slots.
    dist_to_end : int
        Every slot from index ``dist_to_end + 99`` onwards is forced to 1.

    Returns
    -------
    numpy array of length 199; ``cdf`` itself is never modified.
    """
    shift = int(y_pred) - w
    if shift > 0:
        # Move right: the vacated low-yardage slots get probability 0.
        y_pred_array = np.zeros(_N_YARD_BINS)
        y_pred_array[shift:] = cdf[:-shift]
    elif shift < 0:
        # Move left: the vacated high-yardage slots get probability 1.
        y_pred_array = np.ones(_N_YARD_BINS)
        y_pred_array[:shift] = cdf[-shift:]
    else:
        y_pred_array = cdf.copy()
    y_pred_array[-1] = 1
    y_pred_array[(dist_to_end + _YARD_OFFSET):] = 1
    return y_pred_array


def get_score_pingyi1(y_pred,y_true,cdf,w,dist_to_end):
    """Mean squared difference between the shifted CDF and the true step CDF.

    The predicted CDF comes from ``get_score``; the true CDF is 0 below index
    ``y_true + 99`` and 1 from there on.  The mean is taken over the 199 slots.
    """
    y_pred_array = get_score(y_pred, cdf, w, dist_to_end)
    y_true_array = np.zeros(_N_YARD_BINS)
    y_true_array[(y_true + _YARD_OFFSET):] = 1
    return np.mean((y_pred_array - y_true_array) ** 2)


def CRPS_pingyi1(y_preds,y_trues,w,cdf,dist_to_ends):
    """Average the per-play score of ``get_score_pingyi1`` over a set of plays.

    ``y_preds``, ``y_trues`` and ``dist_to_ends`` are iterated in lockstep;
    ``w`` and ``cdf`` are passed through unchanged.  If the predictions and
    targets differ in length a message is printed and None is returned (the
    length of ``dist_to_ends`` is not checked).
    """
    if len(y_preds) != len(y_trues):
        print('length does not match')
        return None
    per_play_scores = [
        get_score_pingyi1(pred, true, cdf, w, dist_to_end)
        for pred, true, dist_to_end in zip(y_preds, y_trues, dist_to_ends)
    ]
    return np.mean(per_play_scores)

In [10]:
# Number of CV folds; also the divisor for the running fold averages below,
# so the two can no longer drift apart.
n_splits = 5
# Offset `w` handed to the CRPS helpers: the prediction value at which the
# template CDF is used unshifted.
crps_w = 4

kf=KFold(n_splits = n_splits)
resu1 = 0        # mean of the per-fold MSE
impor1 = 0       # mean of the per-fold feature importances
resu2_cprs = 0   # mean of the per-fold CRPS
resu3_mae=0      # mean of the per-fold MAE
##y_pred = 0
stack_train = np.zeros([X_train.shape[0],])  # out-of-fold predictions
models = []      # fitted models whose fold CRPS passes the threshold below
for train_index, test_index in kf.split(X_train, y_train):
    X_train2= X_train.iloc[train_index,:]
    y_train2= y_train.iloc[train_index]
    X_test2= X_train.iloc[test_index,:]
    y_test2= y_train.iloc[test_index]
#     clf = lgb.LGBMRegressor(n_estimators=10000, random_state=47,subsample=0.7,
#                              colsample_bytree=0.7,learning_rate=0.005,importance_type = 'gain',
#                      max_depth = -1, num_leaves = 100,min_child_samples=20,min_split_gain = 0.001,
#                        bagging_freq=1,reg_alpha = 0,reg_lambda = 0,n_jobs = -1)
    clf = lgb.LGBMRegressor(n_estimators=10000, random_state=47,learning_rate=0.005,importance_type = 'gain',
                     n_jobs = -1,metric='mae')

    # Both the training part and the held-out part are evaluated each round;
    # training stops after 200 rounds without improvement.
    clf.fit(X_train2,y_train2,eval_set = [(X_train2,y_train2),(X_test2,y_test2)],early_stopping_rounds=200,verbose=50)

    temp_predict = clf.predict(X_test2)
    stack_train[test_index] = temp_predict
    ##y_pred += clf.predict(X_test)/5
    mse = mean_squared_error(y_test2, temp_predict)
    crps = CRPS_pingyi1(temp_predict,y_test2,crps_w,cdf,dist_to_end_train.iloc[test_index])
    mae = mean_absolute_error(y_test2, temp_predict)
    print(crps)
    # Only folds scoring below this CRPS are kept; `models` can end up empty
    # if no fold passes.
    if crps < 0.013:
        models.append(clf)

    resu1 += mse/n_splits
    resu2_cprs += crps/n_splits
    resu3_mae += mae/n_splits
    impor1 += clf.feature_importances_/n_splits
    gc.collect()
print('mean mse:',resu1)
print('oof mse:',mean_squared_error(y_train,stack_train))
print('mean mae:',resu3_mae)
print('oof mae:',mean_absolute_error(y_train,stack_train))
print('mean cprs:',resu2_cprs)
print('oof cprs:',CRPS_pingyi1(stack_train,y_train,crps_w,cdf,dist_to_end_train))

Training until validation scores don't improve for 200 rounds.
[50]	training's l1: 3.79147	valid_1's l1: 3.72998
[100]	training's l1: 3.71995	valid_1's l1: 3.6356
[150]	training's l1: 3.66501	valid_1's l1: 3.56691
[200]	training's l1: 3.6224	valid_1's l1: 3.52126
[250]	training's l1: 3.58856	valid_1's l1: 3.4872
[300]	training's l1: 3.56017	valid_1's l1: 3.46145
[350]	training's l1: 3.53396	valid_1's l1: 3.44088
[400]	training's l1: 3.5093	valid_1's l1: 3.42245
[450]	training's l1: 3.48705	valid_1's l1: 3.4092
[500]	training's l1: 3.46646	valid_1's l1: 3.40031
[550]	training's l1: 3.4473	valid_1's l1: 3.39358
[600]	training's l1: 3.42901	valid_1's l1: 3.38664
[650]	training's l1: 3.41212	valid_1's l1: 3.38003
[700]	training's l1: 3.39584	valid_1's l1: 3.37568
[750]	training's l1: 3.3802	valid_1's l1: 3.37293
[800]	training's l1: 3.36475	valid_1's l1: 3.37078
[850]	training's l1: 3.34953	valid_1's l1: 3.36902
[900]	training's l1: 3.33488	valid_1's l1: 3.367
[950]	training's l1: 3.32118	

In [11]:
def transform_test(test: pd.DataFrame) -> pd.DataFrame:
    """Build the one-row-per-play feature frame for a batch of test rows.

    ``test`` is modified in place: the home/visitor team abbreviations are
    normalised and the per-player columns ``is_run``, ``Momentum``,
    ``OnOffense``, ``DisToRusher``, ``TackleTimeToRusher`` and
    ``RatioSToRusher`` are appended (in that order), so the caller can keep
    aggregating over all players after this function returns.

    Age, wind, weather, stadium, location, turf and personnel features are
    not computed here.

    Args:
        test: Player-level rows; must contain the raw columns referenced
            below (``PlayId``, ``NflId``, ``NflIdRusher``, ``X``, ``Y``,
            ``S``, ``PlayerWeight``, ...).

    Returns:
        A copy of the ball-carrier rows (``is_run`` true) with play-level
        features added and the columns named in the module-level
        ``remove_features`` dropped.
    """
    # Normalise abbreviations in the home/visitor columns (the same kind of
    # fix is applied to the training frame at the top of the notebook).
    abbr_fixes = {"ARI": "ARZ", "BAL": "BLT", "CLE": "CLV", "HOU": "HST"}
    for old_abbr, new_abbr in abbr_fixes.items():
        for team_col in ("VisitorTeamAbbr", "HomeTeamAbbr"):
            test.loc[test[team_col] == old_abbr, team_col] = new_abbr

    # True on the row of the player who carries the ball.
    test['is_run'] = test.NflId == test.NflIdRusher

    # momentum
    test["Momentum"] = test["S"] * test["PlayerWeight"]

    # on offense: `func` (defined elsewhere in the notebook) turns
    # (PossessionTeam, HomeTeamAbbr) into a value comparable with `Team`.
    test["OnOffense"] = test[["PossessionTeam", "HomeTeamAbbr"]].apply(func, axis=1)
    test["OnOffense"] = test["OnOffense"] == test["Team"]

    # Ball-carrier position and speed, broadcast to every player of the same
    # play via PlayId.  Keying on PlayId (instead of striding a grouped
    # result and repeating it 22 times) stays correct even if the rows are
    # not sorted by play or a play does not have exactly 22 rows.
    rusher = test.loc[test["is_run"]].groupby("PlayId")[["X", "Y", "S"]].mean()
    rusher_x = test["PlayId"].map(rusher["X"])
    rusher_y = test["PlayId"].map(rusher["Y"])
    rusher_s = test["PlayId"].map(rusher["S"])

    test["DisToRusher"] = np.sqrt((test["X"] - rusher_x) ** 2 + (test["Y"] - rusher_y) ** 2)
    # NOTE: players with S == 0 get an infinite tackle time here.
    test["TackleTimeToRusher"] = test["DisToRusher"] / test["S"]
    test["RatioSToRusher"] = test["S"] / rusher_s

    # ---- play-level features, computed on the ball-carrier rows only ----
    test_single = test[test["is_run"]].copy()
    test_single['time_quarter'] = test_single.GameClock.map(transform_time_quarter)
    test_single['time_end'] = test_single.apply(
        lambda row: transform_time_all(row.loc['GameClock'], row.loc['Quarter']), axis=1)

    timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ"
    for time_col in ('TimeHandoff', 'TimeSnap'):
        test_single[time_col] = test_single[time_col].apply(
            lambda stamp: datetime.strptime(stamp, timestamp_format))
    # Seconds component of the snap -> handoff delay.
    test_single['handoff_snap_diff'] = (
        test_single['TimeHandoff'] - test_single['TimeSnap']
    ).map(lambda delta: delta.seconds)

    test_single["OffenseFormation"] = test_single["OffenseFormation"].fillna("Unknown")
    test_single['DefendersInTheBox_vs_Distance'] = (
        test_single['DefendersInTheBox'] / test_single['Distance'])

    # Distance still needed, spread over the downs left before a turnover.
    test_single["NecDisPerDown"] = test_single["Distance"] / (5 - test_single["Down"])

    # Score margin seen from the team in possession: home - visitor when the
    # home team has the ball, visitor - home otherwise.
    home_has_ball = (test_single["PossessionTeam"] == test_single["HomeTeamAbbr"]).astype(int)
    home_lead = test_single["HomeScoreBeforePlay"] - test_single["VisitorScoreBeforePlay"]
    test_single["Margin"] = (2 * home_has_ball - 1) * home_lead

    # The first eight digits of GameId are parsed as the game date.
    test_single['date_game'] = test_single.GameId.map(lambda x: pd.to_datetime(str(x)[:8]))

    return test_single.drop(remove_features, axis=1)

In [12]:
# Inference loop: build features for each batch served by the competition
# environment, average the per-fold regressors, turn the predicted yardage
# into a 199-bin distribution with get_score() and submit it.

# Per-column caches for the categorical encoding step.  X_train does not
# change while iterating over the test set, so the set of known categories
# and the fitted LabelEncoder of a column only need to be built once instead
# of once per value / once per play.
known_categories = {}
label_encoders = {}

# Row strides into a result grouped by ["PlayId", "OnOffense"]: within each
# play the boolean key sorts False (defense) before True (offense).
defense_rows = slice(0, None, 2)
offense_rows = slice(1, None, 2)

for (test_df, sample_prediction_df) in env.iter_test():
    test_df['own_field'] = (test_df['FieldPosition'] == test_df['PossessionTeam']).astype(int)
    # Yards to the end zone: 100 - YardLine on the offense's own half,
    # YardLine otherwise.
    dist_to_end_test = test_df['YardLine'].where(test_df['own_field'] != 1,
                                                 100 - test_df['YardLine'])

    # transform_test() also adds the per-player columns (OnOffense,
    # DisToRusher, ...) to test_df that the aggregations below rely on.
    X_test = transform_test(test_df)
    X_test.fillna(-999, inplace=True)

    # additional feature engineering -----------------------------------------------------------
    by_side = test_df.groupby(["PlayId", "OnOffense"])

    # Select the needed columns before aggregating so non-numeric columns
    # are never averaged.
    side_mean = by_side[["S", "X", "Y"]].mean()
    X_test["DefenseAveSpeed"] = np.array(side_mean.iloc[defense_rows]["S"])
    X_test["OffenseAveSpeed"] = np.array(side_mean.iloc[offense_rows]["S"])

    X_test["DefenseAveX"] = np.array(side_mean.iloc[defense_rows]["X"])
    X_test["OffenseAveX"] = np.array(side_mean.iloc[offense_rows]["X"])

    X_test["DefenseAveY"] = np.array(side_mean.iloc[defense_rows]["Y"])
    X_test["OffenseAveY"] = np.array(side_mean.iloc[offense_rows]["Y"])

    side_std = by_side[["X", "Y"]].std()
    X_test["DefenseStdX"] = np.array(side_std.iloc[defense_rows]["X"])
    X_test["OffenseStdX"] = np.array(side_std.iloc[offense_rows]["X"])

    X_test["DefenseStdY"] = np.array(side_std.iloc[defense_rows]["Y"])
    X_test["OffenseStdY"] = np.array(side_std.iloc[offense_rows]["Y"])

    # distance from the runner to the centroid of each side
    X_test["RunnerToDefenseCentoid"] = np.sqrt((X_test["X"] - X_test["DefenseAveX"]) ** 2 + (X_test["Y"] - X_test["DefenseAveY"]) ** 2)
    X_test["RunnerToOffenseCentoid"] = np.sqrt((X_test["X"] - X_test["OffenseAveX"]) ** 2 + (X_test["Y"] - X_test["OffenseAveY"]) ** 2)

    # defense x spread, offense x spread
    side_max_x = by_side["X"].max()
    side_min_x = by_side["X"].min()
    X_test["DefenseSpreadX"] = np.array(side_max_x.iloc[defense_rows] - side_min_x.iloc[defense_rows])
    X_test["OffenseSpreadX"] = np.array(side_max_x.iloc[offense_rows] - side_min_x.iloc[offense_rows])

    X_test["RunnerToScrimmage"] = X_test["X"] - X_test["YardLine"]

    # runner horizontal and vertical speed
    radian_angle = (90 - X_test['Dir']) * np.pi / 180.0
    cos_dir = np.cos(radian_angle)
    sin_dir = np.sin(radian_angle)
    X_test['v_horizontal'] = np.abs(X_test['S'] * cos_dir)
    X_test['v_vertical'] = np.abs(X_test['S'] * sin_dir)

    # runner horizontal and vertical momentum
    X_test['m_horizontal'] = np.abs(X_test['Momentum'] * cos_dir)
    X_test['m_vertical'] = np.abs(X_test['Momentum'] * sin_dir)

    # minimum distance to rusher from defenders
    X_test["MinDisFromRushToDef"] = np.array(by_side["DisToRusher"].min().iloc[defense_rows])

    # tackle time from closest defender to rusher
    X_test["MinTackleTime"] = np.array(by_side["TackleTimeToRusher"].min().iloc[defense_rows])

    # average tackle time from all defenders to rusher
    X_test["AveTackleTime"] = np.array(by_side["TackleTimeToRusher"].mean().iloc[defense_rows])

    # Row of the defender closest to the rusher, one per play; reused for
    # the speed ratio, momentum and Voronoi features below.
    closest_def = test_df.loc[by_side["DisToRusher"].idxmin().iloc[defense_rows]]

    # closest defender's RatioSToRusher (that defender's S divided by the
    # rusher's S, as computed in transform_test)
    X_test["RatioSRusherToCloseDef"] = np.array(closest_def["RatioSToRusher"])

    # runner horizontal and vertical distance
    X_test["dis_horizontal"] = np.abs(X_test['Dis'] * cos_dir)
    X_test["dis_vertical"] = np.abs(X_test['Dis'] * sin_dir)
    X_test["RunnerMoveRatio"] = X_test["dis_horizontal"] / X_test["dis_vertical"]

    # momentum of the closest defender to the rusher, with horizontal and
    # vertical components.
    # NOTE(review): the components are projected with the *rusher's*
    # direction angle, not the defender's - kept as is; confirm it matches
    # how the training features were built.
    X_test["DefMomentumCloToRusher"] = np.array(closest_def["Momentum"])
    X_test["DefMomentumCloToRusher_horizontal"] = np.abs(X_test['DefMomentumCloToRusher'] * cos_dir)
    X_test["DefMomentumCloToRusher_vertical"] = np.abs(X_test['DefMomentumCloToRusher'] * sin_dir)

    # the horizontal, vertical momentum of rusher (same values as
    # m_horizontal / m_vertical; both pairs are kept so the column layout
    # is unchanged)
    X_test["RusherMomentum_horizontal"] = np.abs(X_test['Momentum'] * cos_dir)
    X_test["RusherMomentum_vertical"] = np.abs(X_test['Momentum'] * sin_dir)

    # difference of horizontal momentum
    X_test["hMomentum_rusher_vs_defender"] = X_test["RusherMomentum_horizontal"] - X_test["DefMomentumCloToRusher_horizontal"]

    # voronoi area
    n_plays = test_df.shape[0] // 22
    pts = np.array(test_df[["X", "Y"]]).reshape(n_plays, 22, 2)
    # position (0-21) of the rusher / closest defender inside each play;
    # assumes the index of test_df counts rows in blocks of 22 per play
    rusher_index = list(test_df[test_df["is_run"]].index % 22)
    closest_def_index = list(closest_def.index % 22)
    X_test["RusherVoronoi"] = [
        voronoi_volumes(play_pts, idx) for play_pts, idx in zip(pts, rusher_index)]
    X_test["FirstDefenderVoronoi"] = [
        voronoi_volumes(play_pts, idx) for play_pts, idx in zip(pts, closest_def_index)]

    # ------------------------------------------------------
    # Categorical columns: values not present in X_train become -999, then
    # the column is label-encoded with an encoder fitted on X_train + [-999].
    for f in X_test.columns:
        if X_test[f].dtype != 'object':
            continue
        if f not in known_categories:
            known_categories[f] = set(X_train[f])
        known = known_categories[f]
        X_test[f] = X_test[f].map(lambda x: x if x in known else -999)
        # The dtype is checked again on purpose: a column whose values were
        # all replaced by -999 is no longer 'object' and is left unencoded.
        if X_test[f].dtype == 'object':
            if f not in label_encoders:
                lbl = preprocessing.LabelEncoder()
                lbl.fit(list(X_train[f]) + [-999])
                label_encoders[f] = lbl
            X_test[f] = label_encoders[f].transform(list(X_test[f]))

    # Average over the models that were actually kept.  `models` only holds
    # the folds that passed the CRPS threshold during training, so dividing
    # by a fixed 5 would shrink the prediction whenever a fold was rejected.
    fold_predictions = [model.predict(X_test)[0] for model in models]
    pred_value = np.mean(fold_predictions) if fold_predictions else 0

    # 4 is the same constant that is passed to CRPS_pingyi1 during validation.
    pred_data = list(get_score(pred_value, cdf, 4, dist_to_end_test.values[0]))
    pred_data = np.array(pred_data).reshape(1, 199)
    pred_target = pd.DataFrame(index=sample_prediction_df.index,
                               columns=sample_prediction_df.columns,
                               data=pred_data)
    env.predict(pred_target)
env.write_submission_file()

Your submission file has been saved!  Once you `Commit` your Notebook and it finishes running, you can submit the file to the competition from the Notebook Viewer `Output` tab.
