In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
import catboost as cb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold,TimeSeriesSplit,KFold,GroupKFold
from sklearn.metrics import roc_auc_score,mean_squared_error,mean_absolute_error
import xgboost as xgb
from datetime import datetime
import gc
from bayes_opt import BayesianOptimization
from kaggle.competitions import nflrush
import math
import tqdm
from scipy.spatial import Delaunay, delaunay_plot_2d, Voronoi, voronoi_plot_2d, ConvexHull
# Create the competition environment; the prediction cell at the end of the
# notebook pulls test plays from it (env.iter_test), submits each prediction
# (env.predict) and finally writes the submission file.
env = nflrush.make_env()

In [2]:
def get_cdf_df(yards_array):
    """Return the empirical CDF of ``yards_array`` as a one-row DataFrame.

    The values are histogrammed into 199 unit-wide bins covering -99..100,
    the normalised bin densities are accumulated and clipped to [0, 1], and
    the result is labelled with the columns ``Yards-99`` ... ``Yards99``.
    """
    column_names = ['Yards' + str(yard) for yard in range(-99, 100)]
    density, _ = np.histogram(yards_array, bins=199, range=(-99, 100), density=True)
    # Bins are one yard wide, so the running sum of densities is the CDF itself.
    cumulative = np.clip(np.cumsum(density), 0, 1)
    return pd.DataFrame(data=cumulative[np.newaxis, :], columns=column_names)

def get_score(y_pred,cdf,w,dist_to_end):
    """Turn a point prediction into a 199-step cumulative distribution.

    ``cdf`` (length 199, index 0 == -99 yards) is slid by ``int(y_pred) - w``
    positions: to the right (zero padded) when the prediction exceeds ``w``,
    to the left (one padded) when it is below. The last entry, and every
    entry from ``dist_to_end`` yards onwards, is then forced to 1.
    """
    offset = int(y_pred) - w
    if offset == 0:
        shifted = cdf.copy()
    elif offset > 0:
        shifted = np.zeros(199)
        shifted[offset:] = cdf[:-offset]
    else:
        shifted = np.ones(199)
        shifted[:offset] = cdf[-offset:]
    shifted[-1] = 1
    # A run cannot gain more than the distance left to the end zone.
    shifted[(dist_to_end + 99):] = 1
    return shifted

def get_score_pingyi1(y_pred,y_true,cdf,w,dist_to_end):
    """Mean squared gap between the shifted CDF and the true step function.

    The predicted distribution is built by sliding ``cdf`` (length 199,
    index 0 == -99 yards) by ``int(y_pred) - w`` positions and forcing it to
    1 at the last index and from ``dist_to_end`` yards onwards. The truth is
    a step that is 0 below ``y_true`` yards and 1 from there on. The return
    value is the mean of the squared differences over the 199 positions.
    """
    offset = int(y_pred) - w
    if offset == 0:
        predicted = cdf.copy()
    elif offset > 0:
        predicted = np.zeros(199)
        predicted[offset:] = cdf[:-offset]
    else:
        predicted = np.ones(199)
        predicted[:offset] = cdf[-offset:]
    predicted[-1] = 1
    predicted[(dist_to_end + 99):] = 1

    actual = np.zeros(199)
    actual[(y_true + 99):] = 1

    squared_gap = (predicted - actual) ** 2
    return np.mean(squared_gap)

def CRPS_pingyi1(y_preds,y_trues,w,cdf,dist_to_ends):
    """Average ``get_score_pingyi1`` over a batch of plays.

    Prints a message and returns ``None`` when ``y_preds`` and ``y_trues``
    differ in length; otherwise returns the mean of the per-play scores.
    """
    if len(y_preds) != len(y_trues):
        print('length does not match')
        return None
    per_play = [
        get_score_pingyi1(pred, true, cdf, w, to_end)
        for pred, true, to_end in zip(y_preds, y_trues, dist_to_ends)
    ]
    return np.mean(per_play)

In [3]:
# Load the player-level training data; low_memory=False makes pandas parse the
# file in one pass instead of chunk by chunk when inferring column dtypes.
train = pd.read_csv('../input/nfl-big-data-bowl-2020/train.csv',low_memory=False)

# Preprocessing and feature engineering

In [4]:
def transform_time_quarter(str1):
    """Convert a clock string whose first five characters are ``MM:SS`` to seconds."""
    minutes = int(str1[:2])
    seconds = int(str1[3:5])
    return minutes * 60 + seconds
  
def transform_time_all(str1,quarter):
    """Seconds elapsed since kick-off for clock string ``str1`` in ``quarter``.

    Quarters 1-4 are treated as 15 minutes long and quarter 5 as a 10-minute
    period that starts after four full quarters. Any other quarter value
    yields ``None``.
    """
    if quarter <= 4:
        period_length = 15 * 60
    elif quarter == 5:
        period_length = 10 * 60
    else:
        return None
    clock_seconds = int(str1[:2]) * 60 + int(str1[3:5])
    return period_length - clock_seconds + (quarter - 1) * 15 * 60
      
def back_direction(orientation):
    """Return 1 when ``orientation`` is strictly greater than 180 degrees, else 0."""
    return 1 if orientation > 180.0 else 0
      
def transform_height(te):
    """Convert a ``feet-inches`` string such as ``"6-2"`` to metres."""
    parts = te.split('-')
    total_inches = int(parts[0]) * 12 + int(parts[1])
    return total_inches * 2.54 / 100

def voronoi_volumes(points, selected_index):
    """Size of the Voronoi cell that belongs to ``points[selected_index]``.

    Parameters
    ----------
    points : array-like of point coordinates, passed straight to
        ``scipy.spatial.Voronoi``.
    selected_index : int
        Position in ``points`` of the point whose cell is measured.

    Returns
    -------
    The ``volume`` of the convex hull of the cell's vertices (for 2-D input
    this is the cell's area), or ``-999`` as a missing-value marker when the
    cell is open, i.e. extends to infinity.
    """
    v = Voronoi(points)
    # point_region maps each input point to its region, so the cell can be
    # looked up directly instead of scanning every point for a match.
    indices = v.regions[v.point_region[selected_index]]
    if -1 in indices:  # a -1 vertex marks an unbounded (open) region
        return -999
    return ConvexHull(v.vertices[indices]).volume

In [5]:
# Columns dropped from the per-play (rusher) frame inside transform_data before
# modelling. The list mixes raw input columns with helper columns that
# transform_data itself creates (is_run, TeamOnOffense, date_game, own_field).
remove_features = ['GameId','PlayId','DisplayName','GameClock','TimeHandoff','TimeSnap', 'PlayDirection', 'TeamOnOffense', 
                   'Turf', 'PlayerBirthDate', 'is_run', 'NflIdRusher', 'date_game', 'PossessionTeam', 'FieldPosition', 
                   'HomeTeamAbbr', 'VisitorTeamAbbr', 'PlayerHeight', 'own_field']

In [6]:
# Spelling variants of stadium names mapped to one canonical name each.
_STADIUM_FIXES = {
    "Broncos Stadium At Mile High": "Broncos Stadium at Mile High",
    "CenturyField": "CenturyLink Field",
    "CenturyLink": "CenturyLink Field",
    "EverBank Field": "Everbank Field",
    "First Energy Stadium": "FirstEnergy Stadium",
    "FirstEnergy": "FirstEnergy Stadium",
    "FirstEnergyStadium": "FirstEnergy Stadium",
    "Lambeau field": "Lambeau Field",
    "Los Angeles Memorial Coliesum": "Los Angeles Memorial Coliseum",
    "M & T Bank Stadium": "M&T Bank Stadium",
    "M&T Stadium": "M&T Bank Stadium",
    "Mercedes-Benz Dome": "Mercedes-Benz Superdome",
    "MetLife": "MetLife Stadium",
    "Metlife Stadium": "MetLife Stadium",
    "NRG": "NRG Stadium",
    "Oakland Alameda-County Coliseum": "Oakland-Alameda County Coliseum",
    "Paul Brown Stdium": "Paul Brown Stadium",
    "Twickenham": "Twickenham Stadium",
}

# Spelling variants of game locations mapped to one canonical name each.
_LOCATION_FIXES = {
    "Arlington, Texas": "Arlington, TX",
    "Baltimore, Maryland": "Baltimore, MD",
    "Baltimore, Md.": "Baltimore, MD",
    "Charlotte, North Carolina": "Charlotte, NC",
    "Chicago. IL": "Chicago, IL",
    "Cincinnati, Ohio": "Cincinnati, OH",
    "Cleveland": "Cleveland, OH",
    "Cleveland Ohio": "Cleveland, OH",
    "Cleveland, Ohio": "Cleveland, OH",
    "Cleveland,Ohio": "Cleveland, OH",
    "Detroit": "Detroit, MI",
    "E. Rutherford, NJ": "East Rutherford, NJ",
    "East Rutherford, N.J.": "East Rutherford, NJ",
    "Foxborough, Ma": "Foxborough, MA",
    "Houston, Texas": "Houston, TX",
    "Jacksonville Florida": "Jacksonville, FL",
    "Jacksonville, Fl": "Jacksonville, FL",
    "Jacksonville, Florida": "Jacksonville, FL",
    "London, England": "London",
    "Los Angeles, Calif.": "Los Angeles, CA",
    "Miami Gardens, Fla.": "Miami Gardens, FLA",
    "New Orleans": "New Orleans, LA",
    "New Orleans, La.": "New Orleans, LA",
    "Orchard Park NY": "Orchard Park, NY",
    "Philadelphia, Pa.": "Philadelphia, PA",
    "Pittsburgh": "Pittsburgh, PA",
    "Seattle": "Seattle, WA",
}


def transform_data(df):
    """Build one feature row per play from player-level tracking data.

    Parameters
    ----------
    df : pandas.DataFrame
        Player-level rows. The positional logic below (``[1::2]`` slices,
        ``np.repeat(..., 22)``, ``reshape(-1, 22, 2)`` and ``index % 22``)
        assumes 22 consecutive rows per play, plays in ascending ``PlayId``
        order, and a 0-based integer index -- TODO confirm for every caller.
        The frame is copied first, so the caller's object is left untouched
        and the function can safely be re-run on the same input.

    Returns
    -------
    (df_single, dist_to_end)
        ``df_single`` holds the rusher's row of every play plus the engineered
        features, with missing values replaced by -999. ``dist_to_end`` is a
        Series (same index) with the yards between the line of scrimmage and
        the end zone the offence is attacking.
    """
    df = df.copy()

    # Align the home/visitor abbreviations with the spellings used by PossessionTeam.
    abbr_fixes = {"ARI": "ARZ", "BAL": "BLT", "CLE": "CLV", "HOU": "HST"}
    for old_abbr, new_abbr in abbr_fixes.items():
        for col in ("VisitorTeamAbbr", "HomeTeamAbbr"):
            df.loc[df[col] == old_abbr, col] = new_abbr

    df['is_run'] = df.NflId == df.NflIdRusher

    # Rescale the 2017 speed values with a fixed linear transform.
    is_2017 = df['Season'] == 2017
    if is_2017.any():
        df.loc[is_2017, 'S'] = (df.loc[is_2017, 'S'] - 2.4355) / 1.2930 * 1.4551 + 2.7570

    to_left = df.PlayDirection == "left"
    df['TeamOnOffense'] = "home"
    df.loc[df.PossessionTeam != df.HomeTeamAbbr, 'TeamOnOffense'] = "away"
    df['OnOffense'] = df.Team == df.TeamOnOffense # Is player on offense?

    # Standardised yard line: yards from the offence's own goal line.
    own_side = df.FieldPosition.fillna('') == df.PossessionTeam
    df['YardLine_std'] = 100 - df.YardLine
    df.loc[own_side, 'YardLine_std'] = df.loc[own_side, 'YardLine']

    # Mirror plays that run to the left so that every play has the same direction.
    df['X_std'] = df.X.copy()
    df.loc[to_left, 'X_std'] = 120 - df.loc[to_left, 'X']
    df['Y_std'] = df.Y.copy()
    df.loc[to_left, 'Y_std'] = 53.3 - df.loc[to_left, 'Y']
    df['Orientation_std'] = df.Orientation.copy()
    df.loc[to_left, 'Orientation_std'] = np.mod(180 + df.loc[to_left, 'Orientation_std'], 360)
    df['Dir_std'] = df.Dir.copy()
    df.loc[to_left, 'Dir_std'] = np.mod(180 + df.loc[to_left, 'Dir_std'], 360)
    # The 2017 orientation values are rotated by a further 90 degrees.
    df.loc[is_2017, 'Orientation_std'] = np.mod(90 + df.loc[is_2017, 'Orientation_std'], 360)
    df = df.drop(columns=["X", "Y", "Orientation", "YardLine", "Dir"])
    df = df.rename(columns={'X_std': 'X', 'Y_std': 'Y', 'Orientation_std': 'Orientation',
                            'Dir_std': 'Dir', "YardLine_std": "YardLine"})

    # The first eight digits of GameId are parsed as the game date (YYYYMMDD).
    df['date_game'] = pd.to_datetime(df.GameId.astype(str).str[:8], format="%Y%m%d")
    df['age'] = (df.date_game - pd.to_datetime(df.PlayerBirthDate)).dt.days / 365

    df["Momentum"] = df["S"] * df["PlayerWeight"]

    # Rusher values per play: within each play the is_run=True group sorts
    # second, hence [1::2]; each value is then repeated for the play's 22 rows.
    rusher_means = df.groupby(["PlayId", "is_run"])[["X", "Y", "S"]].mean()[1::2]
    rusher_x = np.repeat(np.array(rusher_means["X"]), 22)
    rusher_y = np.repeat(np.array(rusher_means["Y"]), 22)
    rusher_s = np.repeat(np.array(rusher_means["S"]), 22)
    df["DisToRusher"] = np.sqrt((df["X"] - rusher_x) ** 2 + (df["Y"] - rusher_y) ** 2)
    df["TackleTimeToRusher"] = df["DisToRusher"] / df["S"]

    dir_radians = df["Dir"] / 360 * 2 * np.pi
    df["Dir_sin"] = np.sin(dir_radians)
    df["Dir_cos"] = np.cos(dir_radians)

    df["Momentum_cos"] = df["Momentum"] * df["Dir_cos"]
    df["Momentum_sin"] = df["Momentum"] * df["Dir_sin"]

    df["RatioSToRusher"] = df["S"] / rusher_s

    # ------------------------------------------------------------------
    # One row per play: the rusher's row.
    # ------------------------------------------------------------------
    df_single = df[df.is_run].copy()

    df_single["NecDisPerDown"] = df_single["Distance"] / (5 - df_single["Down"])

    df_single['time_quarter'] = df_single.GameClock.map(transform_time_quarter)
    df_single['time_end'] = df_single.apply(
        lambda row: transform_time_all(row.loc['GameClock'], row.loc['Quarter']), axis=1)

    timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ"
    handoff_time = df_single['TimeHandoff'].apply(lambda x: datetime.strptime(x, timestamp_format))
    snap_time = df_single['TimeSnap'].apply(lambda x: datetime.strptime(x, timestamp_format))
    df_single['handoff_snap_diff'] = (handoff_time - snap_time).map(lambda x: x.seconds)

    # Collapse spelling variants; values without an entry are kept unchanged.
    df_single["Stadium"] = df_single["Stadium"].map(lambda x: _STADIUM_FIXES.get(x, x))
    df_single["Location"] = df_single["Location"].map(lambda x: _LOCATION_FIXES.get(x, x))

    grass_labels = ['grass', 'natural grass', 'natural', 'naturall grass']
    df_single['Grass'] = np.where(df_single.Turf.str.lower().isin(grass_labels), "Natural", "Artificial")

    df_single["OffenseFormation"] = df_single["OffenseFormation"].fillna("Unknown")
    df_single['DefendersInTheBox_vs_Distance'] = df_single['DefendersInTheBox'] / df_single['Distance']

    # Mean distance from the QB/C rows to the rusher, matched by PlayId so that
    # a play without such a row yields a missing value instead of a length error.
    qb_c_distance = df[df.Position.isin(["QB", "C"])].groupby("PlayId")["DisToRusher"].mean()
    df_single["DisToQB"] = df_single["PlayId"].map(qb_c_distance)

    df_single["OffenseFormation"] = df_single["OffenseFormation"].apply(lambda x: "SHOTGUN" if x == "ACE" else x)

    # Score margin from the rusher's team's point of view.
    home_margin = df_single["HomeScoreBeforePlay"] - df_single["VisitorScoreBeforePlay"]
    df_single["Margin"] = np.where(df_single["Team"] == "away", -home_margin, home_margin)

    df_single['runner_height'] = df_single.PlayerHeight.map(transform_height)
    # own_field is listed in remove_features, so it has to exist before the drop below.
    df_single['own_field'] = (df_single['FieldPosition'] == df_single['PossessionTeam']).astype(int)
    # YardLine is already standardised to "yards from own goal line", so the
    # distance to the attacked end zone is always 100 - YardLine, regardless of
    # which half the ball is in.
    dist_to_end = 100 - df_single['YardLine']
    df_single = df_single.drop(columns=remove_features)

    # Per-play, per-side aggregates. Within each play the OnOffense=False
    # (defence) group sorts first: [0::2] is defence, [1::2] is offence.
    side_groups = df.groupby(["PlayId", "OnOffense"])

    side_mean = side_groups[["X", "Y", "age"]].mean()
    df_single["DefenseAveX"] = np.array(side_mean[0::2]["X"])
    df_single["OffenseAveX"] = np.array(side_mean[1::2]["X"])

    df_single["DefenseAveY"] = np.array(side_mean[0::2]["Y"])
    df_single["OffenseAveY"] = np.array(side_mean[1::2]["Y"])

    df_single["DefenseAveAge"] = np.array(side_mean[0::2]["age"])
    df_single["OffenseAveAge"] = np.array(side_mean[1::2]["age"])

    side_std = side_groups[["X", "Y"]].std()
    df_single["DefenseStdX"] = np.array(side_std[0::2]["X"])
    df_single["OffenseStdX"] = np.array(side_std[1::2]["X"])

    df_single["DefenseStdY"] = np.array(side_std[0::2]["Y"])
    df_single["OffenseStdY"] = np.array(side_std[1::2]["Y"])

    df_single["RunnerToDefenseCentoid"] = np.sqrt((df_single["X"] - df_single["DefenseAveX"]) ** 2 + (df_single["Y"] - df_single["DefenseAveY"]) ** 2)
    df_single["RunnerToOffenseCentoid"] = np.sqrt((df_single["X"] - df_single["OffenseAveX"]) ** 2 + (df_single["Y"] - df_single["OffenseAveY"]) ** 2)

    x_max = side_groups["X"].max()
    x_min = side_groups["X"].min()
    df_single["DefenseSpreadX"] = np.array(x_max[0::2] - x_min[0::2])
    df_single["OffenseSpreadX"] = np.array(x_max[1::2] - x_min[1::2])

    df_single["RunnerToScrimmage"] = df_single["X"] - df_single["YardLine"]

    df_single["MinTackleTime"] = np.array(side_groups["TackleTimeToRusher"].min()[0::2])

    # Row of the defender closest to the rusher in every play (looked up once).
    closest_defender = df.loc[side_groups["DisToRusher"].idxmin()[0::2]]
    df_single["1stDefender_Momentum_cos"] = np.array(closest_defender["Momentum_cos"])
    df_single["1stDefender_Momentum_sin"] = np.array(closest_defender["Momentum_sin"])

    # Voronoi cell sizes of the rusher and of the closest defender.
    n_plays = df.shape[0] // 22
    pts = np.array(df[["X", "Y"]]).reshape(n_plays, 22, 2) # plays * players * (X, Y)
    rusher_index = list(df[df.is_run].index % 22)
    closest_def_index = list(closest_defender.index % 22)
    rusher_voronoi = []
    closest_def_voronoi = []
    for i in range(n_plays):
        rusher_voronoi.append(voronoi_volumes(pts[i], rusher_index[i]))
        closest_def_voronoi.append(voronoi_volumes(pts[i], closest_def_index[i]))
    df_single["RusherVoronoi"] = rusher_voronoi
    df_single["FirstDefenderVoronoi"] = closest_def_voronoi

    df_single = df_single.fillna(-999)
    remove_features2 = ["OnOffense", "DisToRusher", "TackleTimeToRusher", "RatioSToRusher"]
    df_single = df_single.drop(columns=remove_features2)

    return df_single, dist_to_end

In [7]:
# Build the per-play training table, then split it into target and features.
train_single, dist_to_end_train = transform_data(train)
y_train = train_single['Yards']
X_train = train_single.drop(columns=['Yards'])

# Integer-encode every string column; -999 is added to the fitted classes
# alongside the values actually present in the column.
object_columns = [col for col in X_train.columns if X_train[col].dtype == 'object']
for col in object_columns:
    raw_values = list(X_train[col])
    encoder = preprocessing.LabelEncoder()
    encoder.fit(raw_values + [-999])
    X_train[col] = encoder.transform(raw_values)

# Empirical CDF of the training target as a flat array of 199 values.
cdf = get_cdf_df(y_train).values.ravel()

# modelling

In [8]:
n_folds=5
# shuffle stays at its default (False), so the folds are contiguous blocks of
# rows and a random_state would have no effect; recent scikit-learn releases
# raise a ValueError when random_state is passed together with shuffle=False.
kf=KFold(n_splits = n_folds)
resu1 = 0
resu2_cprs = 0
resu3_mae=0
# Out-of-fold predictions for every training play.
stack_train = np.zeros([X_train.shape[0],])
# One fitted booster per fold; the prediction cell averages over them.
models = []
lgbm_params = {
    "objective" : "regression",
    "metric" : "mae", 
    "tree_learner": "serial",
    "max_depth" : -1,
    "boosting": 'gbdt',
    #"num_leaves" : 13,
    "learning_rate" : 0.1,
    #"bagging_freq": 5,
    #"bagging_fraction" : 0.4,
    #"feature_fraction" : 0.05,
    #"min_data_in_leaf": 80,
}
feature_importance_df = pd.DataFrame(list(X_train.columns), columns=["Feature"])
for i , (train_index, test_index) in enumerate(kf.split(X_train, y_train)):
    X_train2= X_train.iloc[train_index,:]
    y_train2= y_train.iloc[train_index]
    X_test2= X_train.iloc[test_index,:]
    y_test2= y_train.iloc[test_index]
    lgb_train = lgb.Dataset(X_train2, y_train2)
    lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
    
    # Early stopping is requested through the callback API, which works on
    # LightGBM releases both before and after the early_stopping_rounds
    # keyword was removed from lgb.train.
    clf = lgb.train(
        lgbm_params, lgb_train,
        valid_sets=lgb_eval,
        num_boost_round=100000,
        callbacks=[lgb.early_stopping(stopping_rounds=100)],
    )
    
    models.append(clf)
    temp_predict = clf.predict(X_test2)
    stack_train[test_index] = temp_predict
    mse = mean_squared_error(y_test2, temp_predict)
    # 4 is the reference yardage (w) that the empirical CDF is shifted around.
    crps = CRPS_pingyi1(temp_predict,y_test2,4,cdf,dist_to_end_train.iloc[test_index])
    mae = mean_absolute_error(y_test2, temp_predict)
    print(crps)
    
    resu1 += mse/n_folds
    resu2_cprs += crps/n_folds
    resu3_mae += mae/n_folds
    feature_importance_df["Fold_"+str(i+1)] = clf.feature_importance()
    gc.collect()
print('mean mse:',resu1)
print('oof mse:',mean_squared_error(y_train,stack_train))
print('mean mae:',resu3_mae)
print('oof mae:',mean_absolute_error(y_train,stack_train))
print('mean cprs:',resu2_cprs)
print('oof cprs:',CRPS_pingyi1(stack_train,y_train,4,cdf,dist_to_end_train))
# Summary statistics of the per-fold importances (columns 1..n_folds).
feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]

[1]	valid_0's l1: 3.78102
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l1: 3.71288
[3]	valid_0's l1: 3.66943
[4]	valid_0's l1: 3.62034
[5]	valid_0's l1: 3.5908
[6]	valid_0's l1: 3.56188
[7]	valid_0's l1: 3.53963
[8]	valid_0's l1: 3.51059
[9]	valid_0's l1: 3.49567
[10]	valid_0's l1: 3.47071
[11]	valid_0's l1: 3.46051
[12]	valid_0's l1: 3.44433
[13]	valid_0's l1: 3.4369
[14]	valid_0's l1: 3.42547
[15]	valid_0's l1: 3.41977
[16]	valid_0's l1: 3.41384
[17]	valid_0's l1: 3.41146
[18]	valid_0's l1: 3.40594
[19]	valid_0's l1: 3.39911
[20]	valid_0's l1: 3.39587
[21]	valid_0's l1: 3.39396
[22]	valid_0's l1: 3.39023
[23]	valid_0's l1: 3.38301
[24]	valid_0's l1: 3.37968
[25]	valid_0's l1: 3.37666
[26]	valid_0's l1: 3.37843
[27]	valid_0's l1: 3.37619
[28]	valid_0's l1: 3.37236
[29]	valid_0's l1: 3.36855
[30]	valid_0's l1: 3.36836
[31]	valid_0's l1: 3.37018
[32]	valid_0's l1: 3.36973
[33]	valid_0's l1: 3.36743
[34]	valid_0's l1: 3.3658
[35]	valid_0's l1: 3.36584
[36]	

In [9]:
# Display the feature columns that are fed to the model.
X_train.columns

Index(['Team', 'S', 'A', 'Dis', 'NflId', 'JerseyNumber', 'Season', 'Quarter',
       'Down', 'Distance', 'HomeScoreBeforePlay', 'VisitorScoreBeforePlay',
       'OffenseFormation', 'OffensePersonnel', 'DefendersInTheBox',
       'DefensePersonnel', 'PlayerWeight', 'PlayerCollegeName', 'Position',
       'Week', 'Stadium', 'Location', 'StadiumType', 'GameWeather',
       'Temperature', 'Humidity', 'WindSpeed', 'WindDirection', 'YardLine',
       'X', 'Y', 'Orientation', 'Dir', 'age', 'Momentum', 'Dir_sin', 'Dir_cos',
       'Momentum_cos', 'Momentum_sin', 'NecDisPerDown', 'time_quarter',
       'time_end', 'handoff_snap_diff', 'Grass',
       'DefendersInTheBox_vs_Distance', 'DisToQB', 'Margin', 'runner_height',
       'DefenseAveX', 'OffenseAveX', 'DefenseAveY', 'OffenseAveY',
       'DefenseAveAge', 'OffenseAveAge', 'DefenseStdX', 'OffenseStdX',
       'DefenseStdY', 'OffenseStdY', 'RunnerToDefenseCentoid',
       'RunnerToOffenseCentoid', 'DefenseSpreadX', 'OffenseSpreadX',
       'R

In [10]:
# Show the 30 features with the lowest average importance across the folds.
feature_importance_df.sort_values("Average").head(30)

Unnamed: 0,Feature,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,Average,Std,Cv
7,Quarter,0,1,1,1,0,0.6,0.489898,0.816497
42,handoff_snap_diff,0,2,4,0,0,1.2,1.6,1.333333
43,Grass,0,1,3,2,1,1.4,1.019804,0.728431
9,Distance,2,3,0,2,0,1.4,1.2,0.857143
18,Position,0,1,3,4,0,1.6,1.624808,1.015505
39,NecDisPerDown,0,1,5,2,1,1.8,1.720465,0.955814
47,runner_height,2,3,3,3,0,2.2,1.16619,0.530087
12,OffenseFormation,1,2,4,5,0,2.4,1.854724,0.772802
0,Team,2,5,2,5,1,3.0,1.67332,0.557773
8,Down,3,3,3,5,1,3.0,1.264911,0.421637


# prediction

In [11]:
# X_train was label-encoded in place during training, so it no longer holds the
# original category strings. Re-fit the encoders on train_single (which still
# does) exactly as the training cell did, so every category gets the same
# integer code at prediction time as it had during training.
cat_encoders = {}
for f in X_train.columns:
    if train_single[f].dtype=='object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_single[f])+[-999])
        cat_encoders[f] = (lbl, set(lbl.classes_))

for (test_df, sample_prediction_df) in env.iter_test():
    X_test, dist_to_end_test = transform_data(test_df)
    # Same columns, in the same order, as the models were trained on.
    X_test = X_test[X_train.columns].copy()
    for f, (lbl, known_labels) in cat_encoders.items():
        # The fitted classes are strings (the -999 placeholder included), so
        # compare on the string form and send unseen categories to '-999'.
        labels = [str(v) if str(v) in known_labels else '-999' for v in X_test[f]]
        X_test[f] = lbl.transform(labels)
    # Average the fold models' predictions for this play.
    pred_value = 0
    for model in models:
        pred_value += model.predict(X_test)[0]/len(models)
    # 4 is the reference yardage (w) that the empirical CDF is shifted around.
    pred_data = np.array(get_score(pred_value,cdf,4,dist_to_end_test.values[0])).reshape(1,199)
    pred_target = pd.DataFrame(index = sample_prediction_df.index,
                               columns = sample_prediction_df.columns,
                               data = pred_data)
    env.predict(pred_target)
env.write_submission_file()

Your submission file has been saved!  Once you `Commit` your Notebook and it finishes running, you can submit the file to the competition from the Notebook Viewer `Output` tab.
