In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
import catboost as cb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold,TimeSeriesSplit,KFold,GroupKFold
from sklearn.metrics import roc_auc_score,mean_squared_error,mean_absolute_error
import xgboost as xgb
from datetime import datetime
from scipy.stats import pearsonr
import gc
from bayes_opt import BayesianOptimization
from kaggle.competitions import nflrush
import math
import tqdm
from scipy.spatial import Delaunay, delaunay_plot_2d, Voronoi, voronoi_plot_2d, ConvexHull
env = nflrush.make_env()

In [2]:
train = pd.read_csv('../input/nfl-big-data-bowl-2020/train.csv',low_memory=False)

# preprocess and feature engineering 

In [3]:
train.loc[train.VisitorTeamAbbr == "ARI",'VisitorTeamAbbr'] = "ARZ"
train.loc[train.HomeTeamAbbr == "ARI",'HomeTeamAbbr'] = "ARZ"

train.loc[train.VisitorTeamAbbr == "BAL",'VisitorTeamAbbr'] = "BLT"
train.loc[train.HomeTeamAbbr == "BAL",'HomeTeamAbbr'] = "BLT"

train.loc[train.VisitorTeamAbbr == "CLE",'VisitorTeamAbbr'] = "CLV"
train.loc[train.HomeTeamAbbr == "CLE",'HomeTeamAbbr'] = "CLV"

train.loc[train.VisitorTeamAbbr == "HOU",'VisitorTeamAbbr'] = "HST"
train.loc[train.HomeTeamAbbr == "HOU",'HomeTeamAbbr'] = "HST"

train['is_run'] = train.NflId == train.NflIdRusher

In [4]:
train['ToLeft'] = train.PlayDirection == "left"
train['TeamOnOffense'] = "home"
train.loc[train.PossessionTeam != train.HomeTeamAbbr, 'TeamOnOffense'] = "away"
train['OnOffense'] = train.Team == train.TeamOnOffense # Is player on offense?
train['YardLine_std'] = 100 - train.YardLine.copy()
train.loc[train.FieldPosition.fillna('') == train.PossessionTeam,  
            'YardLine_std'
             ] = train.loc[train.FieldPosition.fillna('') == train.PossessionTeam,  
              'YardLine']
train['X_std'] = train.X.copy()
train.loc[train.ToLeft, 'X_std'] = 120 - train.loc[train.ToLeft, 'X'] 
train['Y_std'] = train.Y.copy()
train.loc[train.ToLeft, 'Y_std'] = 53.3 - train.loc[train.ToLeft, 'Y'] 
train['Orientation_std'] = train.Orientation.copy()
train.loc[train.ToLeft, 'Orientation_std'] = np.mod(180 + train.loc[train.ToLeft, 'Orientation_std'], 360)
train['Dir_std'] = train.Dir.copy()
train.loc[train.ToLeft, 'Dir_std'] = np.mod(180 + train.loc[train.ToLeft, 'Dir_std'], 360)
train.loc[train['Season'] == 2017, 'Orientation'] = np.mod(90 + train.loc[train['Season'] == 2017, 'Orientation'], 360) 
train.drop(["X", "Y", "Orientation", "YardLine", "Dir", "ToLeft"], axis=1, inplace=True)
train.rename(columns={'X_std': 'X', 'Y_std': 'Y', 'Orientation_std': 'Orientation', 'Dir_std': 'Dir', "YardLine_std": "YardLine"}, inplace=True)

In [5]:
# age #
FMT_birth = '%m/%d/%Y'
FMT_gamedate = '%Y-%m-%d'
train["Age"] = train["TimeSnap"].apply(lambda t: t.split("T")[0])
train["Age"] = train["Age"].apply(lambda t: datetime.strptime(t, FMT_gamedate))
tmp_birth = train["PlayerBirthDate"].apply(lambda t: datetime.strptime(t, FMT_birth))
train["Age"] = train["Age"] - tmp_birth
train["Age"] = train["Age"].apply(lambda t: t.days//365)

# momentum 
train["Momentum"] = train["S"] * train["PlayerWeight"]

rusher_x = np.array(train.groupby(["PlayId", "is_run"])["X"].agg(np.mean)[1::2])
rusher_x = np.repeat(rusher_x, 22) # repeat each elemnt 22 times train["RusherX"]
rusher_y = np.array(train.groupby(["PlayId", "is_run"])["Y"].agg(np.mean)[1::2])
rusher_y = np.repeat(rusher_y, 22) # train["RusherY"]
train["DisToRusher"] = np.sqrt((train["X"] - rusher_x) ** 2 + (train["Y"] - rusher_y) ** 2)
train["TackleTimeToRusher"] = train["DisToRusher"] / train["S"] 

rusher_s = np.array(train.groupby(["PlayId", "is_run"]).agg(np.mean)["S"][1::2])
rusher_s = np.repeat(rusher_s, 22)
train["RatioSToRusher"] = train["S"] / rusher_s

In [6]:
train_single = train[train.is_run==True].copy()

def transform_time_quarter(str1):
    return int(str1[:2])*60 + int(str1[3:5])
def transform_time_all(str1,quarter):
    if quarter<=4:
        return 15*60 - (int(str1[:2])*60 + int(str1[3:5])) + (quarter-1)*15*60
    if quarter ==5:
        return 10*60 - (int(str1[:2])*60 + int(str1[3:5])) + (quarter-1)*15*60
train_single['time_quarter'] = train_single.GameClock.map(lambda x:transform_time_quarter(x))
train_single['time_end'] = train_single.apply(lambda x:transform_time_all(x.loc['GameClock'],x.loc['Quarter']),axis=1)

train_single['TimeHandoff'] = train_single['TimeHandoff'].apply(lambda x: datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ"))
train_single['TimeSnap'] = train_single['TimeSnap'].apply(lambda x: datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ"))
train_single['handoff_snap_diff'] = (train_single['TimeHandoff'] - train_single['TimeSnap']).map(lambda x:x.seconds)

train_single["Stadium"] = train_single["Stadium"].map(lambda x: "Broncos Stadium at Mile High" if x=="Broncos Stadium At Mile High" 
                                             else ("CenturyLink Field" if x == "CenturyField" or x == x=="CenturyLink"
                                             else ("Everbank Field" if x == "EverBank Field"
                                             else ("FirstEnergy Stadium" if x =="First Energy Stadium" or x=="FirstEnergy" or x == "FirstEnergyStadium"
                                             else ("Lambeau Field" if x == "Lambeau field"
                                             else ("Los Angeles Memorial Coliseum" if x == "Los Angeles Memorial Coliesum"
                                             else ("M&T Bank Stadium" if x == "M & T Bank Stadium" or x == "M&T Stadium"
                                             else ("Mercedes-Benz Superdome" if x == "Mercedes-Benz Dome"
                                             else ("MetLife Stadium" if x == "MetLife" or x == "Metlife Stadium"
                                             else ("NRG Stadium" if x == "NRG"
                                             else ("Oakland-Alameda County Coliseum" if x == "Oakland Alameda-County Coliseum"
                                             else ("Paul Brown Stadium" if x == "Paul Brown Stdium"
                                             else ("Twickenham Stadium" if x == "Twickenham" else x)))))))))))))

train_single["Location"] = train_single["Location"].map(lambda x: "Arlington, TX" if x == "Arlington, Texas"
                        else ("Baltimore, MD" if x == "Baltimore, Maryland" or x == "Baltimore, Md."
                        else ("Charlotte, NC" if x == "Charlotte, North Carolina"
                        else ("Chicago, IL" if x == "Chicago. IL"
                        else ("Cincinnati, OH" if x == "Cincinnati, Ohio"
                        else ("Cleveland, OH" if x == "Cleveland" or x == "Cleveland Ohio" or x == "Cleveland, Ohio" or x == "Cleveland,Ohio"
                        else ("Detroit, MI" if x == "Detroit"
                        else ("East Rutherford, NJ" if x == "E. Rutherford, NJ" or x == "East Rutherford, N.J."
                        else ("Foxborough, MA" if x == "Foxborough, Ma"
                        else ("Houston, TX" if x == "Houston, Texas"
                        else ("Jacksonville, FL" if x == "Jacksonville Florida" or x == "Jacksonville, Fl" or x == "Jacksonville, Florida"
                        else ("London" if x == "London, England"
                        else ("Los Angeles, CA" if x == "Los Angeles, Calif."
                        else ("Miami Gardens, FLA" if x == "Miami Gardens, Fla."
                        else ("New Orleans, LA" if x == "New Orleans" or x == "New Orleans, La."
                        else ("Orchard Park, NY" if x == "Orchard Park NY"
                        else ("Philadelphia, PA" if x == "Philadelphia, Pa."
                        else ("Pittsburgh, PA" if x == "Pittsburgh"
                        else ("Seattle, WA" if x == "Seattle" else x)))))))))))))))))))

grass_labels = ['grass', 'natural grass', 'natural', 'naturall grass']
train_single['Grass'] = np.where(train_single.Turf.str.lower().isin([grass_labels]), "Natural", "Artificial")

train_single["OffenseFormation"] = train_single["OffenseFormation"].fillna("Unknown") 
train_single['DefendersInTheBox_vs_Distance'] = train_single['DefendersInTheBox'] / train_single['Distance']

TypeError: unhashable type: 'list'

In [7]:
remove_features = ['GameId','PlayId','DisplayName','GameClock','TimeHandoff','TimeSnap']
train_single['date_game'] = train_single.GameId.map(lambda x:pd.to_datetime(str(x)[:8]))
train_single['runner_age'] = (train_single.date_game.map(pd.to_datetime) - train_single.PlayerBirthDate.map(pd.to_datetime)).map(lambda x:x.days)/365
remove_features.append('HomeTeamAbbr')
remove_features.append('VisitorTeamAbbr')
remove_features.append('PlayerBirthDate')
remove_features.append('is_run')
def transform_height(te):
    return (int(te.split('-')[0])*12 + int(te.split('-')[1]))*2.54/100
train_single['runner_height'] = train_single.PlayerHeight.map(transform_height)
remove_features.append('PossessionTeam')
remove_features.append('FieldPosition')
remove_features.append('PlayerHeight')
remove_features.append('NflIdRusher')
remove_features.append('date_game')
train_single['own_field'] = (train_single['FieldPosition'] == train_single['PossessionTeam']).astype(int)
dist_to_end_train = train_single.apply(lambda x:(100 - x.loc['YardLine']) if x.loc['own_field']==1 else x.loc['YardLine'],axis=1)
remove_features.append('own_field')
train_single.drop(remove_features,axis=1,inplace=True)
train_single.fillna(-999,inplace=True)

In [8]:
y_train = train_single.Yards
X_train = train_single.drop(['Yards'],axis=1)
for f in X_train.columns:
    if X_train[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(X_train[f])+[-999])
        X_train[f] = lbl.transform(list(X_train[f]))

In [9]:
def voronoi_volumes(points, selected_index):
    v = Voronoi(points)
    vol = np.zeros(v.npoints)
    
    for i, reg_num in enumerate(v.point_region):
        indices = v.regions[reg_num]
        if -1 in indices: # some regions can be opened
            vol[i] = -999 ## insert missing value when the area is open
        else:
            vol[i] = ConvexHull(v.vertices[indices]).volume
        
        if reg_num == v.point_region[selected_index]: # in the case of rusher or 1st defender etc...
            index = i
            rusher_reg_num = reg_num         
        
    return vol[index]

tmp = train.groupby(["PlayId", "OnOffense"]).agg(np.mean)[["S", "X", "Y", "Age"]]
X_train["DefenseAveX"] = np.array(tmp[0::2]["X"])
X_train["OffenseAveX"] = np.array(tmp[1::2]["X"])

X_train["DefenseAveY"] = np.array(tmp[0::2]["Y"]) 
X_train["OffenseAveY"] = np.array(tmp[1::2]["Y"]) 

tmp = train.groupby(["PlayId", "OnOffense"]).agg(["std"])[["X", "Y"]]
X_train["DefenseStdX"] = np.array(tmp[0::2]["X"])
X_train["OffenseStdX"] = np.array(tmp[1::2]["X"])

X_train["DefenseStdY"] = np.array(tmp[0::2]["Y"])
X_train["OffenseStdY"] = np.array(tmp[1::2]["Y"])

X_train["RunnerToDefenseCentoid"] = np.sqrt((X_train["X"] - X_train["DefenseAveX"]) ** 2 + (X_train["Y"] - X_train["DefenseAveY"]) ** 2)
X_train["RunnerToOffenseCentoid"] = np.sqrt((X_train["X"] - X_train["OffenseAveX"]) ** 2 + (X_train["Y"] - X_train["OffenseAveY"]) ** 2)

# defense x spread, offense x spread
tmp_max = train.groupby(["PlayId", "OnOffense"])["X"].max()
tmp_min = train.groupby(["PlayId", "OnOffense"])["X"].min()
X_train["DefenseSpreadX"] = np.array(tmp_max[0::2]- tmp_min[0::2])
X_train["OffenseSpreadX"] = np.array(tmp_max[1::2]- tmp_min[1::2])

# voronoi area
pts = np.array(train[["X", "Y"]]).reshape(train.shape[0]//22, 22, 2) # plays * players * (X, Y, rusher)
# index of row where rusher data is included when separated by each play
rusher_index = list(train[train.is_run==True].index % 22) 
closest_def_index = list(train.loc[train.groupby(["PlayId", "OnOffense"])["DisToRusher"].idxmin()[0::2]].index % 22)
rusher_voronoi = []
closest_def_voronoi = []
for i in range(0, train.shape[0] //22):
    rusher_voronoi.append(voronoi_volumes(pts[i], rusher_index[i]))
    closest_def_voronoi.append(voronoi_volumes(pts[i], closest_def_index[i]))
X_train["RusherVoronoi"] = rusher_voronoi    
X_train["FirstDefenderVoronoi"] = closest_def_voronoi 

In [10]:
def get_cdf_df(yards_array):
    pdf, edges = np.histogram(yards_array, bins=199,
                 range=(-99,100), density=True)
    cdf = pdf.cumsum().clip(0, 1)
    cdf_df = pd.DataFrame(data=cdf.reshape(-1, 1).T, 
                            columns=['Yards'+str(i) for i in range(-99,100)])
    return cdf_df
cdf = get_cdf_df(y_train).values.reshape(-1,)

def get_score(y_pred,cdf,w,dist_to_end):
    y_pred = int(y_pred)
    if y_pred ==w:
        y_pred_array = cdf.copy()
    elif y_pred - w >0:
        y_pred_array = np.zeros(199)
        y_pred_array[(y_pred-w):] = cdf[:(-(y_pred-w))].copy()
    elif w - y_pred >0:
        y_pred_array = np.ones(199)
        y_pred_array[:(y_pred-w)] = cdf[(w-y_pred):].copy()
    y_pred_array[-1]=1
    y_pred_array[(dist_to_end+99):]=1
    return y_pred_array    

def get_score_pingyi1(y_pred,y_true,cdf,w,dist_to_end):
    y_pred = int(y_pred)
    if y_pred ==w:
        y_pred_array = cdf.copy()
    elif y_pred - w >0:
        y_pred_array = np.zeros(199)
        y_pred_array[(y_pred-w):] = cdf[:(-(y_pred-w))].copy()
    elif w - y_pred >0:
        y_pred_array = np.ones(199)
        y_pred_array[:(y_pred-w)] = cdf[(w-y_pred):].copy()
    y_pred_array[-1]=1
    y_pred_array[(dist_to_end+99):]=1
    y_true_array = np.zeros(199)
    y_true_array[(y_true+99):]=1
    return np.mean((y_pred_array - y_true_array)**2)


def CRPS_pingyi1(y_preds,y_trues,w,cdf,dist_to_ends):
    if len(y_preds) != len(y_trues):
        print('length does not match')
        return None
    n = len(y_preds)
    tmp = []
    for a,b,c in zip(y_preds, y_trues,dist_to_ends):
        tmp.append(get_score_pingyi1(a,b,cdf,w,c))
    return np.mean(tmp)

# modelling

In [11]:
kf=KFold(n_splits = 5)
resu1 = 0
impor1 = 0
resu2_cprs = 0
resu3_mae=0
##y_pred = 0
stack_train = np.zeros([X_train.shape[0],])
models = []
lgbm_params = {
    "objective" : "regression",
    "metric" : "mae", 
    "tree_learner": "serial",
    "max_depth" : -1,
    "boosting": 'gbdt',
    #"num_leaves" : 13,
    "learning_rate" : 0.1,
    #"bagging_freq": 5,
    #"bagging_fraction" : 0.4,
    #"feature_fraction" : 0.05,
    #"min_data_in_leaf": 80,
}
for train_index, test_index in kf.split(X_train, y_train):
    X_train2= X_train.iloc[train_index,:]
    y_train2= y_train.iloc[train_index]
    X_test2= X_train.iloc[test_index,:]
    y_test2= y_train.iloc[test_index]
    lgb_train = lgb.Dataset(X_train2, y_train2)
    lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
    
    clf = lgb.train(
        lgbm_params, lgb_train,
        valid_sets=lgb_eval,
        num_boost_round=100000,
        early_stopping_rounds=100,
    )
    
    models.append(clf)
    temp_predict = clf.predict(X_test2)
    stack_train[test_index] = temp_predict
    mse = mean_squared_error(y_test2, temp_predict)
    crps = CRPS_pingyi1(temp_predict,y_test2,4,cdf,dist_to_end_train.iloc[test_index])
    mae = mean_absolute_error(y_test2, temp_predict)
    print(crps)
    
    resu1 += mse/5
    resu2_cprs += crps/5
    resu3_mae += mae/5 
    impor1 += clf.feature_importance() /5
    gc.collect()
print('mean mse:',resu1)
print('oof mse:',mean_squared_error(y_train,stack_train))
print('mean mae:',resu3_mae)
print('oof mae:',mean_absolute_error(y_train,stack_train))
print('mean cprs:',resu2_cprs)
print('oof cprs:',CRPS_pingyi1(stack_train,y_train,4,cdf,dist_to_end_train))

[1]	valid_0's l1: 3.7941
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l1: 3.73496
[3]	valid_0's l1: 3.68208
[4]	valid_0's l1: 3.64107
[5]	valid_0's l1: 3.61338
[6]	valid_0's l1: 3.58602
[7]	valid_0's l1: 3.55607
[8]	valid_0's l1: 3.53989
[9]	valid_0's l1: 3.51174
[10]	valid_0's l1: 3.49469
[11]	valid_0's l1: 3.48346
[12]	valid_0's l1: 3.46466
[13]	valid_0's l1: 3.45942
[14]	valid_0's l1: 3.45286
[15]	valid_0's l1: 3.44774
[16]	valid_0's l1: 3.44391
[17]	valid_0's l1: 3.43189
[18]	valid_0's l1: 3.42756
[19]	valid_0's l1: 3.42621
[20]	valid_0's l1: 3.42272
[21]	valid_0's l1: 3.41438
[22]	valid_0's l1: 3.40796
[23]	valid_0's l1: 3.40379
[24]	valid_0's l1: 3.40118
[25]	valid_0's l1: 3.40026
[26]	valid_0's l1: 3.39592
[27]	valid_0's l1: 3.39309
[28]	valid_0's l1: 3.39165
[29]	valid_0's l1: 3.39038
[30]	valid_0's l1: 3.39009
[31]	valid_0's l1: 3.38753
[32]	valid_0's l1: 3.38721
[33]	valid_0's l1: 3.38815
[34]	valid_0's l1: 3.38697
[35]	valid_0's l1: 3.38525
[36

# prediction

In [12]:
def transform_test(test):
    test.loc[test.VisitorTeamAbbr == "ARI",'VisitorTeamAbbr'] = "ARZ"
    test.loc[test.HomeTeamAbbr == "ARI",'HomeTeamAbbr'] = "ARZ"

    test.loc[test.VisitorTeamAbbr == "BAL",'VisitorTeamAbbr'] = "BLT"
    test.loc[test.HomeTeamAbbr == "BAL",'HomeTeamAbbr'] = "BLT"

    test.loc[test.VisitorTeamAbbr == "CLE",'VisitorTeamAbbr'] = "CLV"
    test.loc[test.HomeTeamAbbr == "CLE",'HomeTeamAbbr'] = "CLV"

    test.loc[test.VisitorTeamAbbr == "HOU",'VisitorTeamAbbr'] = "HST"
    test.loc[test.HomeTeamAbbr == "HOU",'HomeTeamAbbr'] = "HST"
    
    test['is_run'] = test.NflId == test.NflIdRusher
    
    test['ToLeft'] = test.PlayDirection == "left"
    test['TeamOnOffense'] = "home"
    test.loc[test.PossessionTeam != test.HomeTeamAbbr, 'TeamOnOffense'] = "away"
    test['OnOffense'] = test.Team == test.TeamOnOffense # Is player on offense?
    test['YardLine_std'] = 100 - test.YardLine.copy()
    test.loc[test.FieldPosition.fillna('') == test.PossessionTeam,  
            'YardLine_std'
             ] = test.loc[test.FieldPosition.fillna('') == test.PossessionTeam,  
              'YardLine']
    test['X_std'] = test.X.copy()
    test.loc[test.ToLeft, 'X_std'] = 120 - test.loc[test.ToLeft, 'X'] 
    test['Y_std'] = test.Y.copy()
    test.loc[test.ToLeft, 'Y_std'] = 53.3 - test.loc[test.ToLeft, 'Y'] 
    test['Orientation_std'] = test.Orientation.copy()
    test.loc[test.ToLeft, 'Orientation_std'] = np.mod(180 + test.loc[test.ToLeft, 'Orientation_std'], 360)
    test['Dir_std'] = test.Dir.copy()
    test.loc[test.ToLeft, 'Dir_std'] = np.mod(180 + test.loc[test.ToLeft, 'Dir_std'], 360)
    test.loc[test['Season'] == 2017, 'Orientation'] = np.mod(90 + test.loc[test['Season'] == 2017, 'Orientation'], 360) 
    test.drop(["X", "Y", "Orientation", "YardLine", "Dir", "ToLeft"], axis=1, inplace=True)
    test.rename(columns={'X_std': 'X', 'Y_std': 'Y', 'Orientation_std': 'Orientation', 'Dir_std': 'Dir', "YardLine_std": "YardLine"}, inplace=True)
    
    # age #
    FMT_birth = '%m/%d/%Y'
    FMT_gamedate = '%Y-%m-%d'
    test["Age"] = test["TimeSnap"].apply(lambda t: t.split("T")[0])
    test["Age"] = test["Age"].apply(lambda t: datetime.strptime(t, FMT_gamedate))
    tmp_birth = test["PlayerBirthDate"].apply(lambda t: datetime.strptime(t, FMT_birth))
    test["Age"] = test["Age"] - tmp_birth
    test["Age"] = test["Age"].apply(lambda t: t.days//365)

    # momentum 
    test["Momentum"] = test["S"] * test["PlayerWeight"]

    rusher_x = np.array(test.groupby(["PlayId", "is_run"])["X"].agg(np.mean)[1::2])
    rusher_x = np.repeat(rusher_x, 22) # repeat each elemnt 22 times test["RusherX"]
    rusher_y = np.array(test.groupby(["PlayId", "is_run"])["Y"].agg(np.mean)[1::2])
    rusher_y = np.repeat(rusher_y, 22) # train["RusherY"]
    test["DisToRusher"] = np.sqrt((test["X"] - rusher_x) ** 2 + (test["Y"] - rusher_y) ** 2)
    test["TackleTimeToRusher"] = test["DisToRusher"] / test["S"] 

    rusher_s = np.array(test.groupby(["PlayId", "is_run"]).agg(np.mean)["S"][1::2])
    rusher_s = np.repeat(rusher_s, 22)
    test["RatioSToRusher"] = test["S"] / rusher_s
    
    test_single = test[test.is_run==True].copy()

    test_single['time_quarter'] = test_single.GameClock.map(lambda x:transform_time_quarter(x))
    test_single['time_end'] = test_single.apply(lambda x:transform_time_all(x.loc['GameClock'],x.loc['Quarter']),axis=1)

    test_single['TimeHandoff'] = test_single['TimeHandoff'].apply(lambda x: datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ"))
    test_single['TimeSnap'] = test_single['TimeSnap'].apply(lambda x: datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ"))
    test_single['handoff_snap_diff'] = (test_single['TimeHandoff'] - test_single['TimeSnap']).map(lambda x:x.seconds)
    
    test_single["Stadium"] = test_single["Stadium"].map(lambda x: "Broncos Stadium at Mile High" if x=="Broncos Stadium At Mile High" 
                                             else ("CenturyLink Field" if x == "CenturyField" or x == x=="CenturyLink"
                                             else ("Everbank Field" if x == "EverBank Field"
                                             else ("FirstEnergy Stadium" if x =="First Energy Stadium" or x=="FirstEnergy" or x == "FirstEnergyStadium"
                                             else ("Lambeau Field" if x == "Lambeau field"
                                             else ("Los Angeles Memorial Coliseum" if x == "Los Angeles Memorial Coliesum"
                                             else ("M&T Bank Stadium" if x == "M & T Bank Stadium" or x == "M&T Stadium"
                                             else ("Mercedes-Benz Superdome" if x == "Mercedes-Benz Dome"
                                             else ("MetLife Stadium" if x == "MetLife" or x == "Metlife Stadium"
                                             else ("NRG Stadium" if x == "NRG"
                                             else ("Oakland-Alameda County Coliseum" if x == "Oakland Alameda-County Coliseum"
                                             else ("Paul Brown Stadium" if x == "Paul Brown Stdium"
                                             else ("Twickenham Stadium" if x == "Twickenham" else x)))))))))))))
    
    test_single["Location"] = test_single["Location"].map(lambda x: "Arlington, TX" if x == "Arlington, Texas"
                        else ("Baltimore, MD" if x == "Baltimore, Maryland" or x == "Baltimore, Md."
                        else ("Charlotte, NC" if x == "Charlotte, North Carolina"
                        else ("Chicago, IL" if x == "Chicago. IL"
                        else ("Cincinnati, OH" if x == "Cincinnati, Ohio"
                        else ("Cleveland, OH" if x == "Cleveland" or x == "Cleveland Ohio" or x == "Cleveland, Ohio" or x == "Cleveland,Ohio"
                        else ("Detroit, MI" if x == "Detroit"
                        else ("East Rutherford, NJ" if x == "E. Rutherford, NJ" or x == "East Rutherford, N.J."
                        else ("Foxborough, MA" if x == "Foxborough, Ma"
                        else ("Houston, TX" if x == "Houston, Texas"
                        else ("Jacksonville, FL" if x == "Jacksonville Florida" or x == "Jacksonville, Fl" or x == "Jacksonville, Florida"
                        else ("London" if x == "London, England"
                        else ("Los Angeles, CA" if x == "Los Angeles, Calif."
                        else ("Miami Gardens, FLA" if x == "Miami Gardens, Fla."
                        else ("New Orleans, LA" if x == "New Orleans" or x == "New Orleans, La."
                        else ("Orchard Park, NY" if x == "Orchard Park NY"
                        else ("Philadelphia, PA" if x == "Philadelphia, Pa."
                        else ("Pittsburgh, PA" if x == "Pittsburgh"
                        else ("Seattle, WA" if x == "Seattle" else x)))))))))))))))))))
    
    test_single['Grass'] = np.where(test_single.Turf.str.lower().isin([grass_labels]), "Natural", "Artificial")

    test_single["OffenseFormation"] = test_single["OffenseFormation"].fillna("Unknown") 
    test_single['DefendersInTheBox_vs_Distance'] = test_single['DefendersInTheBox'] / test_single['Distance']
    
    test_single['date_game'] = test_single.GameId.map(lambda x:pd.to_datetime(str(x)[:8]))
    test_single['runner_age'] = (test_single.date_game.map(pd.to_datetime) - test_single.PlayerBirthDate.map(pd.to_datetime)).map(lambda x:x.days)/365
    test_single['runner_height'] = test_single.PlayerHeight.map(transform_height)
    test_single.drop(remove_features,axis=1,inplace=True)
    
    tmp = test.groupby(["PlayId", "OnOffense"]).agg(np.mean)[["S", "X", "Y", "Age"]]
    test_single["DefenseAveX"] = np.array(tmp[0::2]["X"])
    test_single["OffenseAveX"] = np.array(tmp[1::2]["X"])

    test_single["DefenseAveY"] = np.array(tmp[0::2]["Y"]) 
    test_single["OffenseAveY"] = np.array(tmp[1::2]["Y"]) 

    tmp = test.groupby(["PlayId", "OnOffense"]).agg(["std"])[["X", "Y"]]
    test_single["DefenseStdX"] = np.array(tmp[0::2]["X"])
    test_single["OffenseStdX"] = np.array(tmp[1::2]["X"])

    test_single["DefenseStdY"] = np.array(tmp[0::2]["Y"])
    test_single["OffenseStdY"] = np.array(tmp[1::2]["Y"])

    test_single["RunnerToDefenseCentoid"] = np.sqrt((test_single["X"] - test_single["DefenseAveX"]) ** 2 + (test_single["Y"] - test_single["DefenseAveY"]) ** 2)
    test_single["RunnerToOffenseCentoid"] = np.sqrt((test_single["X"] - test_single["OffenseAveX"]) ** 2 + (test_single["Y"] - test_single["OffenseAveY"]) ** 2)

    # defense x spread, offense x spread
    tmp_max = test.groupby(["PlayId", "OnOffense"])["X"].max()
    tmp_min = test.groupby(["PlayId", "OnOffense"])["X"].min()
    test_single["DefenseSpreadX"] = np.array(tmp_max[0::2]- tmp_min[0::2])
    test_single["OffenseSpreadX"] = np.array(tmp_max[1::2]- tmp_min[1::2])
    
    # voronoi area
    pts = np.array(test[["X", "Y"]]).reshape(test.shape[0]//22, 22, 2) # plays * players * (X, Y, rusher)
    # index of row where rusher data is included when separated by each play
    rusher_index = list(test[test.is_run==True].index % 22) 
    closest_def_index = list(test.loc[test.groupby(["PlayId", "OnOffense"])["DisToRusher"].idxmin()[0::2]].index % 22)
    rusher_voronoi = []
    closest_def_voronoi = []
    for i in range(0, test.shape[0] //22):
        rusher_voronoi.append(voronoi_volumes(pts[i], rusher_index[i]))
        closest_def_voronoi.append(voronoi_volumes(pts[i], closest_def_index[i]))
    test_single["RusherVoronoi"] = rusher_voronoi    
    test_single["FirstDefenderVoronoi"] = closest_def_voronoi 
    test_single.fillna(-999,inplace=True)
    return test_single

In [13]:
for (test_df, sample_prediction_df) in env.iter_test():
    test_df['own_field'] = (test_df['FieldPosition'] == test_df['PossessionTeam']).astype(int)
    dist_to_end_test = test_df.apply(lambda x:(100 - x.loc['YardLine']) if x.loc['own_field']==1 else x.loc['YardLine'],axis=1)
    X_test = transform_test(test_df)
    for f in X_test.columns:
        if X_test[f].dtype=='object':
            X_test[f] = X_test[f].map(lambda x:x if x in set(X_train[f]) else -999)
    for f in X_test.columns:
        if X_test[f].dtype=='object': 
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(X_train[f])+[-999])
            X_test[f] = lbl.transform(list(X_test[f])) 
    pred_value = 0
    for model in models:
        pred_value += model.predict(X_test)[0]/5
    pred_data = list(get_score(pred_value,cdf,4,dist_to_end_test.values[0]))
    pred_data = np.array(pred_data).reshape(1,199)
    pred_target = pd.DataFrame(index = sample_prediction_df.index, \
                               columns = sample_prediction_df.columns, \
                               #data = np.array(pred_data))
                               data = pred_data)
    env.predict(pred_target)
env.write_submission_file()

TypeError: unhashable type: 'list'