In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import datetime
#from kaggle.competitions import nflrush
import tqdm
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
import keras

from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings('ignore')

sns.set_style('darkgrid')
mpl.rcParams['figure.figsize'] = [15,10]

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
train = pd.read_csv(r'\\Neptune\RiskMgmtArchive\3 Temp Hold\Temp_Jason\research\train.csv')
print(train.shape)
train = train.sort_values(['GameId','PlayId','Team','Y'])

(509762, 49)


In [3]:
def strtoseconds(txt):
    txt = txt.split(':')
    ans = int(txt[0])*60 + int(txt[1]) + int(txt[2])/60
    return ans

def strtofloat(x):
    try:
        return float(x)
    except:
        return -1

def map_weather(txt):
    ans = 1
    if pd.isna(txt):
        return 0
    if 'partly' in txt:
        ans*=0.5
    if 'climate controlled' in txt or 'indoor' in txt:
        return ans*3
    if 'sunny' in txt or 'sun' in txt:
        return ans*2
    if 'clear' in txt:
        return ans
    if 'cloudy' in txt:
        return -ans
    if 'rain' in txt or 'rainy' in txt:
        return -2*ans
    if 'snow' in txt:
        return -3*ans
    return 0

def OffensePersonnelSplit(x):
    dic = {'DB' : 0, 'DL' : 0, 'LB' : 0, 'OL' : 0, 'QB' : 0, 'RB' : 0, 'TE' : 0, 'WR' : 0}
    for xx in x.split(","):
        xxs = xx.split(" ")
        dic[xxs[-1]] = int(xxs[-2])
    return dic

def DefensePersonnelSplit(x):
    dic = {'DB' : 0, 'DL' : 0, 'LB' : 0, 'OL' : 0}
    for xx in x.split(","):
        xxs = xx.split(" ")
        dic[xxs[-1]] = int(xxs[-2])
    return dic

def orientation_to_cat(x):
    x = np.clip(x, 0, 360 - 1)
    try:
        return str(int(x/15))
    except:
        return "nan"

In [4]:
def preprocess(train):
    ## GameClock
    train['GameClock_sec'] = train['GameClock'].apply(strtoseconds)
    train["GameClock_minute"] = train["GameClock"].apply(lambda x : x.split(":")[0]).astype("object")

    ## Height
    train['PlayerHeight_dense'] = train['PlayerHeight'].apply(lambda x: 12*int(x.split('-')[0])+int(x.split('-')[1]))

    ## Time
    train['TimeHandoff'] = train['TimeHandoff'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ"))
    train['TimeSnap'] = train['TimeSnap'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ"))

    train['TimeDelta'] = train.apply(lambda row: (row['TimeHandoff'] - row['TimeSnap']).total_seconds(), axis=1)
    train['PlayerBirthDate'] = train['PlayerBirthDate'].apply(lambda x: datetime.datetime.strptime(x, "%m/%d/%Y"))

    ## Age
    seconds_in_year = 60*60*24*365.25
    train['PlayerAge'] = train.apply(lambda row: (row['TimeHandoff']-row['PlayerBirthDate']).total_seconds()/seconds_in_year, axis=1)
    train["PlayerAge_ob"] = train['PlayerAge'].astype(np.int).astype("object")

    ## WindSpeed
    train['WindSpeed_ob'] = train['WindSpeed'].apply(lambda x: str(x).lower().replace('mph', '').strip() if not pd.isna(x) else x)
    train['WindSpeed_ob'] = train['WindSpeed_ob'].apply(lambda x: (int(x.split('-')[0])+int(x.split('-')[1]))/2 if not pd.isna(x) and '-' in x else x)
    train['WindSpeed_ob'] = train['WindSpeed_ob'].apply(lambda x: (int(x.split()[0])+int(x.split()[-1]))/2 if not pd.isna(x) and type(x)!=float and 'gusts up to' in x else x)
    train['WindSpeed_dense'] = train['WindSpeed_ob'].apply(strtofloat)

    ## Weather
    train['GameWeather_process'] = train['GameWeather'].str.lower()
    train['GameWeather_process'] = train['GameWeather_process'].apply(lambda x: "indoor" if not pd.isna(x) and "indoor" in x else x)
    train['GameWeather_process'] = train['GameWeather_process'].apply(lambda x: x.replace('coudy', 'cloudy').replace('clouidy', 'cloudy').replace('party', 'partly') if not pd.isna(x) else x)
    train['GameWeather_process'] = train['GameWeather_process'].apply(lambda x: x.replace('clear and sunny', 'sunny and clear') if not pd.isna(x) else x)
    train['GameWeather_process'] = train['GameWeather_process'].apply(lambda x: x.replace('skies', '').replace("mostly", "").strip() if not pd.isna(x) else x)
    train['GameWeather_dense'] = train['GameWeather_process'].apply(map_weather)

    ## Rusher
    train['IsRusher'] = (train['NflId'] == train['NflIdRusher'])
    train['IsRusher_ob'] = (train['NflId'] == train['NflIdRusher']).astype("object")
    temp = train[train["IsRusher"]][["Team", "PlayId"]].rename(columns={"Team":"RusherTeam"})
    train = train.merge(temp, on = "PlayId")
    train["IsRusherTeam"] = train["Team"] == train["RusherTeam"]

    ## dense -> categorical
    train["Quarter_ob"] = train["Quarter"].astype("object")
    train["Down_ob"] = train["Down"].astype("object")
    train["JerseyNumber_ob"] = train["JerseyNumber"].astype("object")
    train["YardLine_ob"] = train["YardLine"].astype("object")
    # train["DefendersInTheBox_ob"] = train["DefendersInTheBox"].astype("object")
    # train["Week_ob"] = train["Week"].astype("object")
    # train["TimeDelta_ob"] = train["TimeDelta"].astype("object")


    ## Orientation and Dir
    train["Orientation_ob"] = train["Orientation"].apply(lambda x : orientation_to_cat(x)).astype("object")
    train["Dir_ob"] = train["Dir"].apply(lambda x : orientation_to_cat(x)).astype("object")

    train["Orientation"] = train.loc[train['Season']==2017,"Orientation"] + 90
    train["Orientation_sin"] = train["Orientation"].apply(lambda x : np.sin(x/360 * 2 * np.pi))
    train["Orientation_cos"] = train["Orientation"].apply(lambda x : np.cos(x/360 * 2 * np.pi))
    train["Dir_sin"] = train["Dir"].apply(lambda x : np.sin(x/360 * 2 * np.pi))
    train["Dir_cos"] = train["Dir"].apply(lambda x : np.cos(x/360 * 2 * np.pi))

    ## diff Score
    train["diffScoreBeforePlay"] = train["HomeScoreBeforePlay"] - train["VisitorScoreBeforePlay"]
    train["diffScoreBeforePlay_binary_ob"] = (train["HomeScoreBeforePlay"] > train["VisitorScoreBeforePlay"]).astype("object")

    ## Turf
    Turf = {'Field Turf':'Artificial', 'A-Turf Titan':'Artificial', 'Grass':'Natural', 'UBU Sports Speed S5-M':'Artificial', 'Artificial':'Artificial', 'DD GrassMaster':'Artificial', 'Natural Grass':'Natural', 'UBU Speed Series-S5-M':'Artificial', 'FieldTurf':'Artificial', 'FieldTurf 360':'Artificial', 'Natural grass':'Natural', 'grass':'Natural', 'Natural':'Natural', 'Artifical':'Artificial', 'FieldTurf360':'Artificial', 'Naturall Grass':'Natural', 'Field turf':'Artificial', 'SISGrass':'Artificial', 'Twenty-Four/Seven Turf':'Artificial', 'natural grass':'Natural'} 
    train['Turf'] = train['Turf'].map(Turf)

    ## OffensePersonnel
    temp = train["OffensePersonnel"].iloc[np.arange(0, len(train), 22)].apply(lambda x : pd.Series(OffensePersonnelSplit(x)))
    temp.columns = ["Offense" + c for c in temp.columns]
    temp["PlayId"] = train["PlayId"].iloc[np.arange(0, len(train), 22)]
    train = train.merge(temp, on = "PlayId")

    ## DefensePersonnel
    temp = train["DefensePersonnel"].iloc[np.arange(0, len(train), 22)].apply(lambda x : pd.Series(DefensePersonnelSplit(x)))
    temp.columns = ["Defense" + c for c in temp.columns]
    temp["PlayId"] = train["PlayId"].iloc[np.arange(0, len(train), 22)]
    train = train.merge(temp, on = "PlayId")

    ## sort
#     train = train.sort_values(by = ['X']).sort_values(by = ['Dis']).sort_values(by=['PlayId', 'Team', 'IsRusher']).reset_index(drop = True)
    #train = train.sort_values(by = ['X']).sort_values(by = ['Dis']).sort_values(by=['PlayId', 'IsRusherTeam', 'IsRusher']).reset_index(drop = True)
    return train

In [5]:
%%time
train = preprocess(train)

Wall time: 1min 20s


In [6]:
## DisplayName remove Outlier
v = train["DisplayName"].value_counts()
missing_values = list(v[v < 5].index)
train["DisplayName"] = train["DisplayName"].where(~train["DisplayName"].isin(missing_values), "nan")

## PlayerCollegeName remove Outlier
v = train["PlayerCollegeName"].value_counts()
missing_values = list(v[v < 10].index)
train["PlayerCollegeName"] = train["PlayerCollegeName"].where(~train["PlayerCollegeName"].isin(missing_values), "nan")

In [7]:
#add distance to rusher
Rusher =train.loc[train['IsRusher'],['PlayId','X','Y']].rename(columns={"X":"Rusher_X","Y":"Rusher_Y"})
train = train.merge(Rusher,how = 'left',on = 'PlayId')
train['distance_to_rusher'] = ((train['X']- train['Rusher_X'])**2 + (train['Y']- train['Rusher_Y'])**2)**0.5
train = train.drop(['Rusher_X','Rusher_X'],axis=1)

In [8]:
pd.to_pickle(train, "train.pkl")

In [9]:
def drop(train):
    drop_cols = ["GameId", "GameWeather", "NflId", "Season", "NflIdRusher"] 
    drop_cols += ['TimeHandoff', 'TimeSnap', 'PlayerBirthDate']
    drop_cols += ["Orientation", "Dir", 'WindSpeed', "GameClock"]
    # drop_cols += ["DefensePersonnel","OffensePersonnel"]
    train = train.drop(drop_cols, axis = 1)
    return train

In [10]:
train = drop(train)

In [11]:
cat_features = []
dense_features = []
for col in train.columns:
    if train[col].dtype =='object':
        cat_features.append(col)
        print("*cat*", col, len(train[col].unique()))
    else:
        dense_features.append(col)
        print("!dense!", col, len(train[col].unique()))
dense_features.remove("PlayId")
dense_features.remove("Yards")

!dense! PlayId 23171
*cat* Team 2
!dense! X 10890
!dense! Y 4339
!dense! S 884
!dense! A 903
!dense! Dis 105
*cat* DisplayName 2128
!dense! JerseyNumber 99
!dense! YardLine 50
!dense! Quarter 5
*cat* PossessionTeam 32
!dense! Down 4
!dense! Distance 35
*cat* FieldPosition 33
!dense! HomeScoreBeforePlay 50
!dense! VisitorScoreBeforePlay 45
*cat* OffenseFormation 9
*cat* OffensePersonnel 56
!dense! DefendersInTheBox 12
*cat* DefensePersonnel 38
*cat* PlayDirection 2
!dense! Yards 94
*cat* PlayerHeight 16
!dense! PlayerWeight 182
*cat* PlayerCollegeName 291
*cat* Position 25
*cat* HomeTeamAbbr 32
*cat* VisitorTeamAbbr 32
!dense! Week 17
*cat* Stadium 55
*cat* Location 60
*cat* StadiumType 30
*cat* Turf 2
!dense! Temperature 79
!dense! Humidity 87
*cat* WindDirection 54
!dense! GameClock_sec 901
*cat* GameClock_minute 16
!dense! PlayerHeight_dense 16
!dense! TimeDelta 7
!dense! PlayerAge 506270
*cat* PlayerAge_ob 22
*cat* WindSpeed_ob 52
!dense! WindSpeed_dense 27
*cat* GameWeather_process

In [12]:
train_cat = train[cat_features]
categories = []
most_appear_each_categories = {}
for col in tqdm_notebook(train_cat.columns):
    train_cat.loc[:,col] = train_cat[col].fillna("nan")
    train_cat.loc[:,col] = col + "__" + train_cat[col].astype(str)
    most_appear_each_categories[col] = list(train_cat[col].value_counts().index)[0]
    categories.append(train_cat[col].unique())
categories = np.hstack(categories)
print(len(categories))

HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


3240


In [13]:
train_cat

Unnamed: 0,Team,DisplayName,PossessionTeam,FieldPosition,OffenseFormation,OffensePersonnel,DefensePersonnel,PlayDirection,PlayerHeight,PlayerCollegeName,...,GameWeather_process,IsRusher_ob,RusherTeam,Quarter_ob,Down_ob,JerseyNumber_ob,YardLine_ob,Orientation_ob,Dir_ob,diffScoreBeforePlay_binary_ob
0,Team__away,DisplayName__Terrance Mitchell,PossessionTeam__NE,FieldPosition__NE,OffenseFormation__SHOTGUN,"OffensePersonnel__1 RB, 1 TE, 3 WR","DefensePersonnel__2 DL, 3 LB, 6 DB",PlayDirection__left,PlayerHeight__5-11,PlayerCollegeName__Oregon,...,GameWeather_process__clear and warm,IsRusher_ob__False,RusherTeam__home,Quarter_ob__1,Down_ob__3,JerseyNumber_ob__39,YardLine_ob__35,Orientation_ob__23,Dir_ob__21,diffScoreBeforePlay_binary_ob__False
1,Team__away,DisplayName__Phillip Gaines,PossessionTeam__NE,FieldPosition__NE,OffenseFormation__SHOTGUN,"OffensePersonnel__1 RB, 1 TE, 3 WR","DefensePersonnel__2 DL, 3 LB, 6 DB",PlayDirection__left,PlayerHeight__6-0,PlayerCollegeName__Rice,...,GameWeather_process__clear and warm,IsRusher_ob__False,RusherTeam__home,Quarter_ob__1,Down_ob__3,JerseyNumber_ob__23,YardLine_ob__35,Orientation_ob__21,Dir_ob__18,diffScoreBeforePlay_binary_ob__False
2,Team__away,DisplayName__Dee Ford,PossessionTeam__NE,FieldPosition__NE,OffenseFormation__SHOTGUN,"OffensePersonnel__1 RB, 1 TE, 3 WR","DefensePersonnel__2 DL, 3 LB, 6 DB",PlayDirection__left,PlayerHeight__6-2,PlayerCollegeName__Auburn,...,GameWeather_process__clear and warm,IsRusher_ob__False,RusherTeam__home,Quarter_ob__1,Down_ob__3,JerseyNumber_ob__55,YardLine_ob__35,Orientation_ob__20,Dir_ob__6,diffScoreBeforePlay_binary_ob__False
3,Team__away,DisplayName__Daniel Sorensen,PossessionTeam__NE,FieldPosition__NE,OffenseFormation__SHOTGUN,"OffensePersonnel__1 RB, 1 TE, 3 WR","DefensePersonnel__2 DL, 3 LB, 6 DB",PlayDirection__left,PlayerHeight__6-2,PlayerCollegeName__Brigham Young,...,GameWeather_process__clear and warm,IsRusher_ob__False,RusherTeam__home,Quarter_ob__1,Down_ob__3,JerseyNumber_ob__49,YardLine_ob__35,Orientation_ob__22,Dir_ob__3,diffScoreBeforePlay_binary_ob__False
4,Team__away,DisplayName__Derrick Johnson,PossessionTeam__NE,FieldPosition__NE,OffenseFormation__SHOTGUN,"OffensePersonnel__1 RB, 1 TE, 3 WR","DefensePersonnel__2 DL, 3 LB, 6 DB",PlayDirection__left,PlayerHeight__6-3,PlayerCollegeName__Texas,...,GameWeather_process__clear and warm,IsRusher_ob__False,RusherTeam__home,Quarter_ob__1,Down_ob__3,JerseyNumber_ob__56,YardLine_ob__35,Orientation_ob__23,Dir_ob__7,diffScoreBeforePlay_binary_ob__False
5,Team__away,DisplayName__Chris Jones,PossessionTeam__NE,FieldPosition__NE,OffenseFormation__SHOTGUN,"OffensePersonnel__1 RB, 1 TE, 3 WR","DefensePersonnel__2 DL, 3 LB, 6 DB",PlayDirection__left,PlayerHeight__6-6,PlayerCollegeName__Mississippi State,...,GameWeather_process__clear and warm,IsRusher_ob__False,RusherTeam__home,Quarter_ob__1,Down_ob__3,JerseyNumber_ob__95,YardLine_ob__35,Orientation_ob__22,Dir_ob__18,diffScoreBeforePlay_binary_ob__False
6,Team__away,DisplayName__Allen Bailey,PossessionTeam__NE,FieldPosition__NE,OffenseFormation__SHOTGUN,"OffensePersonnel__1 RB, 1 TE, 3 WR","DefensePersonnel__2 DL, 3 LB, 6 DB",PlayDirection__left,PlayerHeight__6-3,PlayerCollegeName__Miami,...,GameWeather_process__clear and warm,IsRusher_ob__False,RusherTeam__home,Quarter_ob__1,Down_ob__3,JerseyNumber_ob__97,YardLine_ob__35,Orientation_ob__1,Dir_ob__13,diffScoreBeforePlay_binary_ob__False
7,Team__away,DisplayName__Justin Houston,PossessionTeam__NE,FieldPosition__NE,OffenseFormation__SHOTGUN,"OffensePersonnel__1 RB, 1 TE, 3 WR","DefensePersonnel__2 DL, 3 LB, 6 DB",PlayDirection__left,PlayerHeight__6-3,PlayerCollegeName__Georgia,...,GameWeather_process__clear and warm,IsRusher_ob__False,RusherTeam__home,Quarter_ob__1,Down_ob__3,JerseyNumber_ob__50,YardLine_ob__35,Orientation_ob__0,Dir_ob__13,diffScoreBeforePlay_binary_ob__False
8,Team__away,DisplayName__Eric Berry,PossessionTeam__NE,FieldPosition__NE,OffenseFormation__SHOTGUN,"OffensePersonnel__1 RB, 1 TE, 3 WR","DefensePersonnel__2 DL, 3 LB, 6 DB",PlayDirection__left,PlayerHeight__6-0,PlayerCollegeName__Tennessee,...,GameWeather_process__clear and warm,IsRusher_ob__False,RusherTeam__home,Quarter_ob__1,Down_ob__3,JerseyNumber_ob__29,YardLine_ob__35,Orientation_ob__5,Dir_ob__11,diffScoreBeforePlay_binary_ob__False
9,Team__away,DisplayName__Ron Parker,PossessionTeam__NE,FieldPosition__NE,OffenseFormation__SHOTGUN,"OffensePersonnel__1 RB, 1 TE, 3 WR","DefensePersonnel__2 DL, 3 LB, 6 DB",PlayDirection__left,PlayerHeight__6-0,PlayerCollegeName__Newberry,...,GameWeather_process__clear and warm,IsRusher_ob__False,RusherTeam__home,Quarter_ob__1,Down_ob__3,JerseyNumber_ob__38,YardLine_ob__35,Orientation_ob__0,Dir_ob__10,diffScoreBeforePlay_binary_ob__False


In [14]:
categories

array(['Team__away', 'Team__home', 'DisplayName__Terrance Mitchell', ...,
       'Dir_ob__nan', 'diffScoreBeforePlay_binary_ob__False',
       'diffScoreBeforePlay_binary_ob__True'], dtype=object)

In [15]:
most_appear_each_categories

{'Team': 'Team__away',
 'DisplayName': 'DisplayName__Michael Thomas',
 'PossessionTeam': 'PossessionTeam__NE',
 'FieldPosition': 'FieldPosition__BUF',
 'OffenseFormation': 'OffenseFormation__SINGLEBACK',
 'OffensePersonnel': 'OffensePersonnel__1 RB, 1 TE, 3 WR',
 'DefensePersonnel': 'DefensePersonnel__4 DL, 2 LB, 5 DB',
 'PlayDirection': 'PlayDirection__left',
 'PlayerHeight': 'PlayerHeight__6-3',
 'PlayerCollegeName': 'PlayerCollegeName__Alabama',
 'Position': 'Position__CB',
 'HomeTeamAbbr': 'HomeTeamAbbr__SF',
 'VisitorTeamAbbr': 'VisitorTeamAbbr__LA',
 'Stadium': 'Stadium__MetLife Stadium',
 'Location': 'Location__East Rutherford, NJ',
 'StadiumType': 'StadiumType__Outdoor',
 'Turf': 'Turf__Natural',
 'WindDirection': 'WindDirection__nan',
 'GameClock_minute': 'GameClock_minute__02',
 'PlayerAge_ob': 'PlayerAge_ob__25',
 'WindSpeed_ob': 'WindSpeed_ob__nan',
 'GameWeather_process': 'GameWeather_process__cloudy',
 'IsRusher_ob': 'IsRusher_ob__False',
 'RusherTeam': 'RusherTeam__home'

In [16]:
le = LabelEncoder()
le.fit(categories)
for col in tqdm_notebook(train_cat.columns):
    train_cat.loc[:, col] = le.transform(train_cat[col])
num_classes = len(le.classes_)

HBox(children=(IntProgress(value=0, max=31), HTML(value='')))




In [17]:
train_dense = train[dense_features]
sss = {}
medians = {}
for col in tqdm_notebook(train_dense.columns):
    print(col)
    medians[col] = np.nanmedian(train_dense[col])
    train_dense.loc[:, col] = train_dense[col].fillna(medians[col])
    ss = StandardScaler()
    train_dense.loc[:, col] = ss.fit_transform(train_dense[col].values[:,None])
    sss[col] = ss

HBox(children=(IntProgress(value=0, max=44), HTML(value='')))

X
Y
S
A
Dis
JerseyNumber
YardLine
Quarter
Down
Distance
HomeScoreBeforePlay
VisitorScoreBeforePlay
DefendersInTheBox
PlayerWeight
Week
Temperature
Humidity
GameClock_sec
PlayerHeight_dense
TimeDelta
PlayerAge
WindSpeed_dense
GameWeather_dense
IsRusher
IsRusherTeam
Orientation_sin
Orientation_cos
Dir_sin
Dir_cos
diffScoreBeforePlay
OffenseDB
OffenseDL
OffenseLB
OffenseOL
OffenseQB
OffenseRB
OffenseTE
OffenseWR
DefenseDB
DefenseDL
DefenseLB
DefenseOL
Rusher_Y
distance_to_rusher



In [18]:
## dense features for play
dense_game_features = train_dense.columns[train_dense[:22].std() == 0]
## dense features for each player
dense_player_features = train_dense.columns[train_dense[:22].std() != 0]
## categorical features for play
cat_game_features = train_cat.columns[train_cat[:22].std() == 0]
## categorical features for each player
cat_player_features = train_cat.columns[train_cat[:22].std() != 0]

In [19]:
train_dense_game = train_dense[dense_game_features].iloc[np.arange(0, len(train), 22)].reset_index(drop = True).values
train_dense_game = np.hstack([train_dense_game, train_dense[dense_player_features][train_dense["IsRusher"] > 0]]) ## with rusher player feature

train_dense_players = [train_dense[dense_player_features].iloc[np.arange(k, len(train), 22)].reset_index(drop = True) for k in range(22)]
train_dense_players = np.stack([t.values for t in train_dense_players]).transpose(1, 0, 2)

train_cat_game = train_cat[cat_game_features].iloc[np.arange(0, len(train), 22)].reset_index(drop = True).values
train_cat_game = np.hstack([train_cat_game, train_cat[cat_player_features][train_dense["IsRusher"] > 0]]) ## with rusher player feature

train_cat_players = [train_cat[cat_player_features].iloc[np.arange(k, len(train), 22)].reset_index(drop = True) for k in range(22)]
train_cat_players = np.stack([t.values for t in train_cat_players]).transpose(1, 0, 2)

In [20]:
def return_step(x):
    temp = np.zeros(199)
    temp[x + 99:] = 1
    return temp

train_y_raw = train["Yards"].iloc[np.arange(0, len(train), 22)].reset_index(drop = True)
train_y = np.vstack(train_y_raw.apply(return_step).values)

In [21]:
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from keras import regularizers
import tensorflow as tf

In [22]:
keras.backend.clear_session()
def crps(y_true, y_pred):
    loss = K.mean((K.cumsum(y_pred, axis = 1) - y_true)**2)
    return loss

def get_model(batch_size = 32, epochs = 10):
    
    ## inputs
    input_dense_game = keras.layers.Input(shape=(train_dense_game.shape[1],), name = "numerical_general_inputs")
    input_dense_players = keras.layers.Input(shape=(train_dense_players.shape[1],train_dense_players.shape[2]), name = "numerical_players_inputs")
    input_cat_game = keras.layers.Input(shape=(train_cat_game.shape[1], ), name = "categorical_general_inputs")
    input_cat_players = keras.layers.Input(shape=(train_cat_players.shape[1], train_cat_players.shape[2]), name = "categorical_players_input")
    
    ## embedding
    embedding = keras.layers.Embedding(num_classes, 4, embeddings_regularizer=regularizers.l2(1e-4))
    emb_cat_game = embedding(input_cat_game)
    emb_cat_game = keras.layers.Flatten()(emb_cat_game)
    emb_cat_players = embedding(input_cat_players)
    emb_cat_players = keras.layers.Reshape((int(emb_cat_players.shape[1]), int(emb_cat_players.shape[2]) * int(emb_cat_players.shape[3])))(emb_cat_players)
    
    ## general game features
    game = keras.layers.Concatenate(name = "general_features")([input_dense_game, emb_cat_game])
    game = keras.layers.Dense(32, activation="relu")(game)
    game = keras.layers.Dropout(0.5)(game)
    
    ## players features
    players = keras.layers.Concatenate(name = "players_features")([input_dense_players, emb_cat_players])
    n_unit = 16
    players_aves = []
    for k in range(3):
        players = keras.layers.Dense(16, activation=None)(players)
        players_aves.append(keras.layers.GlobalAveragePooling1D()(players))
        players = keras.layers.Activation("relu")(players)
    players = keras.layers.Concatenate(name = "deep_players_features")(players_aves)
    players = keras.layers.Dropout(0.5)(players)

    ### concat all
    x_concat = keras.layers.Concatenate(name = "general_and_players")([game, players])
    x_concats = []
    n_unit = 128
    decay_rate = 0.5
    for k in range(3):
        x_concat = keras.layers.Dense(n_unit, activation="relu")(x_concat)
        x_concats.append(x_concat)
        n_unit = int(n_unit * decay_rate)
    x_concat = keras.layers.Concatenate(name = "deep_features")(x_concats)
    x_concat = keras.layers.Dropout(0.5)(x_concat)
    
    ## concat
    x_concat = keras.layers.Concatenate(name = "all_concat")([game, players, x_concat])
    out_soft = keras.layers.Dense(199, activation="softmax", name = "out_soft")(x_concat)
    out_reg = keras.layers.Dense(1, activation=None, name = "out_reg")(x_concat)
    model = keras.models.Model(inputs = [input_dense_game, input_dense_players, input_cat_game, input_cat_players],
                               outputs = [out_soft, out_reg])

    ## compile
    model.compile(loss=[crps, keras.losses.mae],
                  loss_weights=[1.0, 0.01],
                  optimizer=keras.optimizers.Adam(decay = 1e-4))

    ## train
    tr_x = [train_dense_game[tr_inds], train_dense_players[tr_inds], train_cat_game[tr_inds], train_cat_players[tr_inds]]
    tr_y = [train_y[tr_inds], train_y_raw[tr_inds]/100]
    val_x = [train_dense_game[val_inds], train_dense_players[val_inds], train_cat_game[val_inds], train_cat_players[val_inds]]
    val_y = [train_y[val_inds], train_y_raw[val_inds]/100]
    model.fit(tr_x,
              tr_y,
              batch_size=batch_size,
              epochs=epochs,
              verbose=1,
              validation_data=(val_x, val_y))
    loss = model.history.history["val_out_soft_loss"][-1]
    return model, loss

In [23]:
from sklearn.model_selection import train_test_split, KFold
losses = []
models = []
for k in range(2):
    kfold = KFold(5, random_state = 42 + k, shuffle = True)
    for k_fold, (tr_inds, val_inds) in enumerate(kfold.split(train_y)):
        print("-----------")
        print("-----------")
        model, loss = get_model(32, 20)
        models.append(model)
        print(k_fold, loss)
        losses.append(loss)
print("-------")
print(losses)
print(np.mean(losses))

-----------
-----------
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Train on 18536 samples, validate on 4635 samples
Epoch 1/20
Epoch 2/20


Epoch 3/20
Epoch 4/20
Epoch 5/20


Epoch 6/20
Epoch 7/20


Epoch 8/20
Epoch 9/20


Epoch 10/20
Epoch 11/20


Epoch 12/20
Epoch 13/20


Epoch 14/20
Epoch 15/20


Epoch 16/20
Epoch 17/20


Epoch 18/20
Epoch 19/20


Epoch 20/20
0 0.013109257587401484
-----------
-----------
Train on 18537 samples, validate on 4634 samples
Epoch 1/20


Epoch 2/20
Epoch 3/20


Epoch 4/20
Epoch 5/20


Epoch 6/20
Epoch 7/20


Epoch 8/20
Epoch 9/20


Epoch 10/20
Epoch 11/20


Epoch 12/20
Epoch 13/20


Epoch 14/20
Epoch 15/20


Epoch 16/20
Epoch 17/20


Epoch 18/20
Epoch 19/20


Epoch 20/20
1 0.013119747141921989
-----------
-----------
Train on 18537 samples, validate on 4634 samples
Epoch 1/20


Epoch 2/20
Epoch 3/20


Epoch 4/20
Epoch 5/20


Epoch 6/20
Epoch 7/20


Epoch 8/20
Epoch 9/20


Epoch 10/20
Epoch 11/20


Epoch 12/20
Epoch 13/20


Epoch 14/20
Epoch 15/20


Epoch 16/20
Epoch 17/20


Epoch 18/20
Epoch 19/20


Epoch 20/20
2 0.013000701369221709
-----------
-----------
Train on 18537 samples, validate on 4634 samples
Epoch 1/20


Epoch 2/20
Epoch 3/20


Epoch 4/20
Epoch 5/20


Epoch 6/20
Epoch 7/20


Epoch 8/20
Epoch 9/20


Epoch 10/20
Epoch 11/20


Epoch 12/20
Epoch 13/20


Epoch 14/20
Epoch 15/20


Epoch 16/20
Epoch 17/20


Epoch 18/20
Epoch 19/20


Epoch 20/20
3 0.013042630750748635
-----------
-----------
Train on 18537 samples, validate on 4634 samples
Epoch 1/20


Epoch 2/20
Epoch 3/20


Epoch 4/20
Epoch 5/20


Epoch 6/20
Epoch 7/20


Epoch 8/20
Epoch 9/20


Epoch 10/20
Epoch 11/20


Epoch 12/20
Epoch 13/20


Epoch 14/20
Epoch 15/20


Epoch 16/20
Epoch 17/20


Epoch 18/20
Epoch 19/20


Epoch 20/20
4 0.013388753668065466
-----------
-----------
Train on 18536 samples, validate on 4635 samples
Epoch 1/20


Epoch 2/20
Epoch 3/20


Epoch 4/20
Epoch 5/20


Epoch 6/20
Epoch 7/20


Epoch 8/20
Epoch 9/20


Epoch 10/20
Epoch 11/20


Epoch 12/20
Epoch 13/20


Epoch 14/20
Epoch 15/20


Epoch 16/20
Epoch 17/20


Epoch 18/20
Epoch 19/20


Epoch 20/20
0 0.01279393205722591
-----------
-----------
Train on 18537 samples, validate on 4634 samples
Epoch 1/20


Epoch 2/20
Epoch 3/20


Epoch 4/20
Epoch 5/20


Epoch 6/20
Epoch 7/20


Epoch 8/20
Epoch 9/20


Epoch 10/20
Epoch 11/20


Epoch 12/20
Epoch 13/20


Epoch 14/20
Epoch 15/20


Epoch 16/20
Epoch 17/20


Epoch 18/20
Epoch 19/20


Epoch 20/20
1 0.013365416393651883
-----------
-----------
Train on 18537 samples, validate on 4634 samples
Epoch 1/20


Epoch 2/20
Epoch 3/20


Epoch 4/20
Epoch 5/20


Epoch 6/20
Epoch 7/20


Epoch 8/20
Epoch 9/20


Epoch 10/20
Epoch 11/20


Epoch 12/20
Epoch 13/20


Epoch 14/20
Epoch 15/20


Epoch 16/20
Epoch 17/20


Epoch 18/20
Epoch 19/20


Epoch 20/20
2 0.01277012339389988
-----------
-----------
Train on 18537 samples, validate on 4634 samples
Epoch 1/20


Epoch 2/20
Epoch 3/20


Epoch 4/20
Epoch 5/20


Epoch 6/20
Epoch 7/20


Epoch 8/20
Epoch 9/20


Epoch 10/20
Epoch 11/20


Epoch 12/20
Epoch 13/20


Epoch 14/20
Epoch 15/20


Epoch 16/20
Epoch 17/20


Epoch 18/20
Epoch 19/20


Epoch 20/20
3 0.013687539671257904
-----------
-----------
Train on 18537 samples, validate on 4634 samples
Epoch 1/20


Epoch 2/20
Epoch 3/20


Epoch 4/20
Epoch 5/20


Epoch 6/20
Epoch 7/20


Epoch 8/20
Epoch 9/20


Epoch 10/20
Epoch 11/20


Epoch 12/20
Epoch 13/20


Epoch 14/20
Epoch 15/20


Epoch 16/20


Epoch 17/20
Epoch 18/20
Epoch 19/20


Epoch 20/20
4 0.012882046704974394
-------


[0.013109257587401484, 0.013119747141921989, 0.013000701369221709, 0.013042630750748635, 0.013388753668065466, 0.01279393205722591, 0.013365416393651883, 0.01277012339389988, 0.013687539671257904, 0.012882046704974394]
0.013116014873836925
