In [1]:
import os
import pandas as pd
from kaggle.competitions import nflrush
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import KFold, RepeatedKFold
import lightgbm as lgb
import gc
import pickle
import tqdm
import category_encoders as ce
from datetime import datetime
pd.set_option("display.max_columns",1000)

In [2]:
env = nflrush.make_env()

In [3]:
train_df = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2020/train.csv', low_memory=False)

In [4]:
unused_columns = ["GameId","PlayId","Team","Yards","TimeHandoff","TimeSnap", "DefensePersonnel", "OffensePersonnel", "GameClock",
                  "PlayerBirthDate", "Location", "Week", "DisplayName", "NflIdRusher"]

In [5]:
unique_columns = []
for c in train_df.columns:
    if c not in unused_columns and len(set(train_df[c][:22]))!= 1:
        unique_columns.append(c)
unique_columns+=["IsRusher"]
print(unique_columns)

['X', 'Y', 'S', 'A', 'Dis', 'Orientation', 'Dir', 'NflId', 'JerseyNumber', 'PlayerHeight', 'PlayerWeight', 'PlayerCollegeName', 'Position', 'IsRusher']


In [6]:
ok = True
for i in range(0,509762,22):
    p=train_df["PlayId"][i]
    for j in range(1,22):
        if(p!=train_df["PlayId"][i+j]):
            ok=False
            break
print("train data is sorted by PlayId." if ok else "train data is not sorted by PlayId.")
ok = True
for i in range(0,509762,11):
    p=train_df["Team"][i]
    for j in range(1,11):
        if(p!=train_df["Team"][i+j]):
            ok=False
            break
print("train data is sorted by Team." if ok else "train data is not sorted by Team.")

train data is sorted by PlayId.
train data is sorted by Team.


In [7]:
# data cleaning and preprocessing
def strtoseconds(txt):
    txt = txt.split(':')
    ans = int(txt[0])*60 + int(txt[1]) + int(txt[2])/60
    return ans

def str_to_float(txt):
    try:
        return float(txt)
    except:
        return -1
    
map_abbr = {'ARI': 'ARZ', 'BAL': 'BLT', 'CLE': 'CLV', 'HOU': 'HST'}
for abb in train_df['PossessionTeam'].unique():
    map_abbr[abb] = abb
train_df['PossessionTeam'] = train_df['PossessionTeam'].map(map_abbr)
train_df['HomeTeamAbbr'] = train_df['HomeTeamAbbr'].map(map_abbr)
train_df['VisitorTeamAbbr'] = train_df['VisitorTeamAbbr'].map(map_abbr)

# offense formation
train_df["OffenseFormation"] = train_df["OffenseFormation"].fillna("Unknown") 

# Stadium
train_df["Stadium"] = train_df["Stadium"].map(lambda x: "Broncos Stadium at Mile High" if x=="Broncos Stadium At Mile High" 
                                             else ("CenturyLink Field" if x == "CenturyField" or x == x=="CenturyLink"
                                             else ("Everbank Field" if x == "EverBank Field"
                                             else ("FirstEnergy Stadium" if x =="First Energy Stadium" or x=="FirstEnergy" or x == "FirstEnergyStadium"
                                             else ("Lambeau Field" if x == "Lambeau field"
                                             else ("Los Angeles Memorial Coliseum" if x == "Los Angeles Memorial Coliesum"
                                             else ("M&T Bank Stadium" if x == "M & T Bank Stadium" or x == "M&T Stadium"
                                             else ("Mercedes-Benz Superdome" if x == "Mercedes-Benz Dome"
                                             else ("MetLife Stadium" if x == "MetLife" or x == "Metlife Stadium"
                                             else ("NRG Stadium" if x == "NRG"
                                             else ("Oakland-Alameda County Coliseum" if x == "Oakland Alameda-County Coliseum"
                                             else ("Paul Brown Stadium" if x == "Paul Brown Stdium"
                                             else ("Twickenham Stadium" if x == "Twickenham" else x)))))))))))))

# Location
train_df["Location"] = train_df["Location"].map(lambda x: "Arlington, TX" if x == "Arlington, Texas"
                        else ("Baltimore, MD" if x == "Baltimore, Maryland" or x == "Baltimore, Md."
                        else ("Charlotte, NC" if x == "Charlotte, North Carolina"
                        else ("Chicago, IL" if x == "Chicago. IL"
                        else ("Cincinnati, OH" if x == "Cincinnati, Ohio"
                        else ("Cleveland, OH" if x == "Cleveland" or x == "Cleveland Ohio" or x == "Cleveland, Ohio" or x == "Cleveland,Ohio"
                        else ("Detroit, MI" if x == "Detroit"
                        else ("East Rutherford, NJ" if x == "E. Rutherford, NJ" or x == "East Rutherford, N.J."
                        else ("Foxborough, MA" if x == "Foxborough, Ma"
                        else ("Houston, TX" if x == "Houston, Texas"
                        else ("Jacksonville, FL" if x == "Jacksonville Florida" or x == "Jacksonville, Fl" or x == "Jacksonville, Florida"
                        else ("London" if x == "London, England"
                        else ("Los Angeles, CA" if x == "Los Angeles, Calif."
                        else ("Miami Gardens, FLA" if x == "Miami Gardens, Fla."
                        else ("New Orleans, LA" if x == "New Orleans" or x == "New Orleans, La."
                        else ("Orchard Park, NY" if x == "Orchard Park NY"
                        else ("Philadelphia, PA" if x == "Philadelphia, Pa."
                        else ("Pittsburgh, PA" if x == "Pittsburgh"
                        else ("Seattle, WA" if x == "Seattle" else x)))))))))))))))))))

# Turf
train_df["Turf"] = train_df["Turf"].map(lambda x: "Artificial" if x == "Artifical"
                                       else ("Field Turf" if x == "FieldTurf" or x == "Field turf"
                                       else ("FieldTurf 360" if x == "FieldTurf360"
                                       else ("Natural Grass" if x == "natural grass" or x == "Naturall Grass" or x == "Natural grass" or x == "Natural"
                                       else ("Grass" if x == "grass"
                                       else ("UBU Speed Series-S5-M" if x == "UBU Sports Speed S5-M" else x))))))

# PlayerHeight
train_df['PlayerHeight'] = train_df['PlayerHeight'].apply(lambda x: 12*int(x.split('-')[0])+int(x.split('-')[1]))

# stadium type
outdoor =['Outdoor', 'Outdoors', 'Cloudy', 'Heinz Field', 'Outdor', 'Ourdoor', 'Outside', 'Outddors', 'Outdoor Retr Roof-Open', 'Oudoor', 'Bowl']
indoor_closed = ['Indoors', 'Indoor', 'Indoor, Roof Closed', 'Indoor, Roof Closed', 
                 'Retractable Roof', 'Retr. Roof-Closed', 'Retr. Roof - Closed', 'Retr. Roof Closed']
indoor_open = ['Indoor, Open Roof', 'Open', 'Retr. Roof-Open', 'Retr. Roof - Open']
dome_closed = ['Dome', 'Domed, closed', 'Closed Dome', 'Domed', 'Dome, closed']
dome_open = ['Domed, Open', 'Domed, open']

train_df['StadiumType'] = train_df['StadiumType'].apply(lambda x: "outdoor" if x in outdoor 
                                                         else ("indoor closed" if x in indoor_closed
                                                        else ("indoor open" if x in indoor_open
                                                        else ("dome_closed" if x in dome_closed
                                                        else ("dome_open" if x in dome_open else "unknown")))))

# Game weather
rain = ['Rainy', 'Rain Chance 40%', 'Showers', 'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.',
          'Scattered Showers', 'Cloudy, Rain', 'Rain shower', 'Light Rain', 'Rain']
overcast = ['Cloudy, light snow accumulating 1-3"', 'Party Cloudy', 'Cloudy, chance of rain','Coudy', 'Cloudy, 50% change of rain', 
            'Rain likely, temps in low 40s.', 'Cloudy and cold', 'Cloudy, fog started developing in 2nd quarter', 'Partly Clouidy', 
            '30% Chance of Rain', 'Mostly Coudy', 'Cloudy and Cool', 'cloudy', 'Partly cloudy', 'Overcast', 'Hazy', 'Mostly cloudy', 
            'Mostly Cloudy', 'Partly Cloudy', 'Cloudy']
clear = ['Partly clear', 'Sunny and clear', 'Sun & clouds', 'Clear and Sunny', 'Sunny and cold', 'Sunny Skies', 'Clear and Cool', 'Clear and sunny',
        'Sunny, highs to upper 80s', 'Mostly Sunny Skies', 'Cold', 'Clear and warm', 'Sunny and warm', 'Clear and cold', 'Mostly sunny',
        'T: 51; H: 55; W: NW 10 mph', 'Clear Skies', 'Clear skies', 'Partly sunny', 'Fair', 'Partly Sunny', 'Mostly Sunny', 'Clear', 'Sunny']
snow = ['Heavy lake effect snow', 'Snow']
none = ['N/A Indoor', 'Indoors', 'Indoor', 'N/A (Indoors)', 'Controlled Climate']

train_df['GameWeather'] = train_df['GameWeather'].apply(lambda x: "rain" if x in rain 
                                                         else ("overcast" if x in overcast
                                                        else ("clear" if x in clear
                                                        else ("snow" if x in snow
                                                        else ("indoor" if x in none else "unknown")))))

# wind speed
train_df['WindSpeed'] = train_df['WindSpeed'].apply(lambda x: x.lower().replace('mph', '').strip() if not pd.isna(x) else x)
train_df['WindSpeed'] = train_df['WindSpeed'].apply(lambda x: (int(x.split('-')[0])+int(x.split('-')[1]))/2 if not pd.isna(x) and '-' in x else x)
train_df['WindSpeed'] = train_df['WindSpeed'].apply(lambda x: (int(x.split()[0])+int(x.split()[-1]))/2 if not pd.isna(x) and type(x)!=float and 'gusts up to' in x else x)
train_df['WindSpeed'] = train_df['WindSpeed'].apply(str_to_float)

# wind direction
train_df['WindDirection'] = train_df['WindDirection'].apply(lambda x: "north" if x == "N" or x == "FROM S"
                                                   else ("south" if x == 'S' or x== 'FROM N'
                                                   else ("west" if x == 'W' or x == 'FROM E'
                                                   else ("east" if x == 'E' or x == 'FROM W'
                                                   else ("north east" if x == 'FROM SW' or x == 'FROM SSW' or x == 'FROM WSW'
                                                   else ("north west" if x == 'FROM SE' or x == 'FROM SSE' or x == 'FROM ESE'
                                                   else ("south east" if x == 'FROM NW' or x == 'FROM NNW' or x == 'FROM WNW'
                                                   else ("south west" if x == 'FROM NE' or x == 'FROM NNE' or x == 'FROM ENE'
                                                   else ("north west" if x == 'NW' or x == 'NORTHWEST'
                                                   else ("north east" if x == 'NE' or x == 'NORTH EAST'
                                                   else ("south west" if x == 'SW' or x == 'SOUTHWEST'
                                                   else ("south east" if x == 'SE' or x == 'SOUTHEAST' else "unknown"))))))))))))


# create new features
train_df['DefendersInTheBox_vs_Distance'] = train_df['DefendersInTheBox'] / train_df['Distance']
train_df['IsRusher'] = train_df['NflId'] == train_df['NflIdRusher']
train_df['TimeLeft'] = train_df['GameClock'].apply(strtoseconds)
train_df["Margin"] = (train_df["HomeScoreBeforePlay"] - train_df["VisitorScoreBeforePlay"]) + 2 * (1 - (train_df["PossessionTeam"] == train_df["HomeTeamAbbr"]).astype(int)) * (train_df["VisitorScoreBeforePlay"] - train_df["HomeScoreBeforePlay"])
train_df["IfPossess"] = train_df["FieldPosition"] == train_df["PossessionTeam"]

arr = [[int(s[0]) for s in t.split(", ")] for t in train_df["DefensePersonnel"]]
train_df["DefenseDL"] = np.array([a[0] for a in arr])
train_df["DefenseLB"] = np.array([a[1] for a in arr])
train_df["DefenseDB"] = np.array([a[2] for a in arr])
train_df["DefenseOL"] = np.array([a[3] if len(a) == 4 else 0 for a in arr])

# time from snap to handoff
FMT = '%Y-%m-%d %H:%M:%S'
tmp_time1 = [[s for s in t.split(".")][0].replace("T", " ") for t in train_df["TimeHandoff"]]
tmp_time2 = [[s for s in t.split(".")][0].replace("T", " ")  for t in train_df["TimeSnap"]]
time_diff = []
for i in range(len(tmp_time1)):
    tdelta = datetime.strptime(tmp_time1[i], FMT) - datetime.strptime(tmp_time2[i], FMT)
    time_diff.append(tdelta.seconds)
train_df["TimeFromSnapDiff"] = pd.DataFrame(time_diff)

# age
FMT_birth = '%m/%d/%Y'
FMT_gamedate = '%Y-%m-%d'
gamedate = [[s for s in t.split("T")][0] for t in train_df["TimeSnap"]]
age = []
for i in tqdm.tqdm(range(train_df.shape[0])):
    tdelta = datetime.strptime(gamedate[i], FMT_gamedate) - datetime.strptime(train_df.iloc[i]["PlayerBirthDate"], FMT_birth)
    age.append(np.floor(tdelta.days/365))
train_df["Age"] = age

arr = [[s for s in t.replace(" ", "").split(",")] for t in train_df["OffensePersonnel"]]
# RB, TE, WR, OL, DL, QB
RB_count = np.zeros(len(arr))
TE_count = np.zeros(len(arr))
WR_count = np.zeros(len(arr))
OL_count = np.zeros(len(arr))
DL_count = np.zeros(len(arr))
QB_count = np.zeros(len(arr))

for i in tqdm.tqdm(range(len(arr))):
    for j in range(len(arr[i])):
        if "RB" in arr[i][j]:
            RB_count[i] = int(arr[i][j][0])
        elif "TE" in arr[i][j]:
            TE_count[i] = int(arr[i][j][0])
        elif "WR" in arr[i][j]:
            WR_count[i] = int(arr[i][j][0])
        elif "OL" in arr[i][j]:
            OL_count[i] = int(arr[i][j][0])
        elif "DL" in arr[i][j]:
            DL_count[i] = int(arr[i][j][0])
        elif "QB" in arr[i][j]:
            QB_count[i] = int(arr[i][j][0])
train_df["OffenseRB"] = RB_count
train_df["OffenseTE"] = TE_count
train_df["OffenseWR"] = WR_count
train_df["OffenseOL"] = OL_count
train_df["OffenseDL"] = DL_count
train_df["OffenseQB"] = QB_count

train_df["Distance10"] = train_df["Distance"].apply(lambda x: 1 if x > 10 else 0)

# combine down and quarter
train_df["DownQuarter"] = train_df[["Down", "Quarter"]].apply(lambda x: "D{}_Q{}".format(x[0], x[1]), axis=1)

# rusher speed 
rusher_speed = np.zeros(train_df.shape[0])
game_id = list(train_df["GameId"].unique())
count_index = 0
for i in game_id:
    tmp_df = train_df[train_df["GameId"]==i]
    game_length = tmp_df.shape[0]
    rusher_speed[count_index: count_index+game_length] = np.sum(tmp_df["IsRusher"] * tmp_df["S"])
    count_index += game_length
train_df["RusherSpeed"] = rusher_speed
print("Precrocessing finish")

100%|██████████| 509762/509762 [04:31<00:00, 1874.54it/s]
100%|██████████| 509762/509762 [00:02<00:00, 188670.74it/s]


Precrocessing finish


In [8]:
all_columns = []
for c in train_df.columns:
    if c not in unique_columns and c not in unused_columns:
        all_columns.append(c)
for c in unique_columns:
    for i in range(22):
        all_columns.append(c+str(i))

In [9]:
category_change = [i for i in train_df.columns if train_df[i].dtype=='object' and i in all_columns]
category_change.append("Position")
category_change.append("PlayerCollegeName")
ce_oe = ce.OrdinalEncoder(cols=category_change, handle_unknown="impute")
train_df = ce_oe.fit_transform(train_df)

In [10]:
train_data=np.zeros((509762//22,len(all_columns)))
for i in tqdm.tqdm(range(0,509762,22)):
    count=0
    for c in all_columns:
        if c in train_df:
            train_data[i//22][count] = train_df[c][i]
            count+=1
    for c in unique_columns:
        for j in range(22):
            train_data[i//22][count] = train_df[c][i+j]
            count+=1        

100%|██████████| 23171/23171 [03:43<00:00, 103.53it/s]


In [11]:
y_train_ = np.array([train_df["Yards"][i] for i in range(0,509762,22)])
X_train = pd.DataFrame(data=train_data,columns=all_columns)
print(list(X_train.columns))

['Season', 'YardLine', 'Quarter', 'PossessionTeam', 'Down', 'Distance', 'FieldPosition', 'HomeScoreBeforePlay', 'VisitorScoreBeforePlay', 'OffenseFormation', 'DefendersInTheBox', 'PlayDirection', 'HomeTeamAbbr', 'VisitorTeamAbbr', 'Stadium', 'StadiumType', 'Turf', 'GameWeather', 'Temperature', 'Humidity', 'WindSpeed', 'WindDirection', 'DefendersInTheBox_vs_Distance', 'TimeLeft', 'Margin', 'IfPossess', 'DefenseDL', 'DefenseLB', 'DefenseDB', 'DefenseOL', 'TimeFromSnapDiff', 'Age', 'OffenseRB', 'OffenseTE', 'OffenseWR', 'OffenseOL', 'OffenseDL', 'OffenseQB', 'Distance10', 'DownQuarter', 'RusherSpeed', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'Y0', 'Y1', 'Y2', 'Y3', 'Y4', 'Y5', 'Y6', 'Y7', 'Y8', 'Y9', 'Y10', 'Y11', 'Y12', 'Y13', 'Y14', 'Y15', 'Y16', 'Y17', 'Y18', 'Y19', 'Y20', 'Y21', 'S0', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'S10', 'S11', 'S12', 'S13', 'S14', 'S15', 'S16

In [12]:
# Since the variance is small, I standardized the objective variable.
# scaler = preprocessing.StandardScaler()
# scaler.fit([[y] for y in y_train_])
# y_train = np.array([y[0] for y in scaler.transform([[y] for y in y_train_])])
scaler = preprocessing.StandardScaler()
scaler.fit(y_train_.reshape(-1, 1))
y_train = scaler.transform(y_train_.reshape(-1, 1)).flatten()

## train

In [13]:
# I wanted to use multi-class classification, but the number of datasets was small and it was difficult to split them including all labels.
folds = 5
seed = 222
kf = KFold(n_splits = folds, shuffle = True, random_state=seed)
y_valid_pred = np.zeros(X_train.shape[0])
models = []

for tr_idx, val_idx in kf.split(X_train, y_train):
    tr_x, tr_y = X_train.iloc[tr_idx,:], y_train[tr_idx]
    vl_x, vl_y = X_train.iloc[val_idx,:], y_train[val_idx]
            
    print(len(tr_x),len(vl_x))
    tr_data = lgb.Dataset(tr_x, label=tr_y)
    vl_data = lgb.Dataset(vl_x, label=vl_y)  
    clf = lgb.LGBMRegressor(n_estimators=10000, random_state=47,learning_rate=0.005,importance_type = 'gain',
                     n_jobs = -1,metric='mae')
    clf.fit(tr_x, tr_y,
        eval_set=[(vl_x, vl_y)],
        early_stopping_rounds=20,
        verbose=False)
    y_valid_pred[val_idx] += clf.predict(vl_x, num_iteration=clf.best_iteration_)
    models.append(clf)

gc.collect()

18536 4635
18537 4634
18537 4634
18537 4634
18537 4634


255

## evaluation

In [14]:
y_pred = np.zeros((509762//22,199))
y_ans = np.zeros((509762//22,199))

for i,p in enumerate(np.round(scaler.inverse_transform(y_valid_pred))):
    p+=99
    for j in range(199):
        if j>=p+10:
            y_pred[i][j]=1.0
        elif j>=p-10:
            y_pred[i][j]=(j+10-p)*0.05

for i,p in enumerate(scaler.inverse_transform(y_train)):
    p+=99
    for j in range(199):
        if j>=p:
            y_ans[i][j]=1.0

print("validation score:",np.sum(np.power(y_pred-y_ans,2))/(199*(509762//22)))

validation score: 0.015559894548483662


## make submission

In [15]:
# When there is a label that does not exist in the training data, it is handled as nan.
# If you can check the error one by one and complement it, you will get better score.
index = 0
for (test_df, sample_prediction_df) in tqdm.tqdm(env.iter_test()):
    map_abbr = {'ARI': 'ARZ', 'BAL': 'BLT', 'CLE': 'CLV', 'HOU': 'HST'}
    for abb in test_df['PossessionTeam'].unique():
        map_abbr[abb] = abb
    test_df['PossessionTeam'] = test_df['PossessionTeam'].map(map_abbr)
    test_df['HomeTeamAbbr'] = test_df['HomeTeamAbbr'].map(map_abbr)
    test_df['VisitorTeamAbbr'] = test_df['VisitorTeamAbbr'].map(map_abbr)  
    
    # offense formation
    test_df["OffenseFormation"] = train_df["OffenseFormation"].fillna("Unknown") 

    # Stadium
    test_df["Stadium"] = test_df["Stadium"].map(lambda x: "Broncos Stadium at Mile High" if x=="Broncos Stadium At Mile High" 
                                             else ("CenturyLink Field" if x == "CenturyField" or x == x=="CenturyLink"
                                             else ("Everbank Field" if x == "EverBank Field"
                                             else ("FirstEnergy Stadium" if x =="First Energy Stadium" or x=="FirstEnergy" or x == "FirstEnergyStadium"
                                             else ("Lambeau Field" if x == "Lambeau field"
                                             else ("Los Angeles Memorial Coliseum" if x == "Los Angeles Memorial Coliesum"
                                             else ("M&T Bank Stadium" if x == "M & T Bank Stadium" or x == "M&T Stadium"
                                             else ("Mercedes-Benz Superdome" if x == "Mercedes-Benz Dome"
                                             else ("MetLife Stadium" if x == "MetLife" or x == "Metlife Stadium"
                                             else ("NRG Stadium" if x == "NRG"
                                             else ("Oakland-Alameda County Coliseum" if x == "Oakland Alameda-County Coliseum"
                                             else ("Paul Brown Stadium" if x == "Paul Brown Stdium"
                                             else ("Twickenham Stadium" if x == "Twickenham" else x)))))))))))))
    
    test_df["Location"] = test_df["Location"].map(lambda x: "Arlington, TX" if x == "Arlington, Texas"
                        else ("Baltimore, MD" if x == "Baltimore, Maryland" or x == "Baltimore, Md."
                        else ("Charlotte, NC" if x == "Charlotte, North Carolina"
                        else ("Chicago, IL" if x == "Chicago. IL"
                        else ("Cincinnati, OH" if x == "Cincinnati, Ohio"
                        else ("Cleveland, OH" if x == "Cleveland" or x == "Cleveland Ohio" or x == "Cleveland, Ohio" or x == "Cleveland,Ohio"
                        else ("Detroit, MI" if x == "Detroit"
                        else ("East Rutherford, NJ" if x == "E. Rutherford, NJ" or x == "East Rutherford, N.J."
                        else ("Foxborough, MA" if x == "Foxborough, Ma"
                        else ("Houston, TX" if x == "Houston, Texas"
                        else ("Jacksonville, FL" if x == "Jacksonville Florida" or x == "Jacksonville, Fl" or x == "Jacksonville, Florida"
                        else ("London" if x == "London, England"
                        else ("Los Angeles, CA" if x == "Los Angeles, Calif."
                        else ("Miami Gardens, FLA" if x == "Miami Gardens, Fla."
                        else ("New Orleans, LA" if x == "New Orleans" or x == "New Orleans, La."
                        else ("Orchard Park, NY" if x == "Orchard Park NY"
                        else ("Philadelphia, PA" if x == "Philadelphia, Pa."
                        else ("Pittsburgh, PA" if x == "Pittsburgh"
                        else ("Seattle, WA" if x == "Seattle" else x)))))))))))))))))))
    
    test_df["Turf"] = test_df["Turf"].map(lambda x: "Artificial" if x == "Artifical"
                                       else ("Field Turf" if x == "FieldTurf" or x == "Field turf"
                                       else ("FieldTurf 360" if x == "FieldTurf360"
                                       else ("Natural Grass" if x == "natural grass" or x == "Naturall Grass" or x == "Natural grass" or x == "Natural"
                                       else ("Grass" if x == "grass"
                                       else ("UBU Speed Series-S5-M" if x == "UBU Sports Speed S5-M" else x))))))
   
    # PlayerHeight
    test_df['PlayerHeight'] = test_df['PlayerHeight'].apply(lambda x: 12*int(x.split('-')[0])+int(x.split('-')[1]))

    # stadium type
    outdoor =['Outdoor', 'Outdoors', 'Cloudy', 'Heinz Field', 'Outdor', 'Ourdoor', 'Outside', 'Outddors', 
             'Outdoor Retr Roof-Open', 'Oudoor', 'Bowl']
    indoor_closed = ['Indoors', 'Indoor', 'Indoor, Roof Closed', 'Indoor, Roof Closed',
                   'Retractable Roof', 'Retr. Roof-Closed', 'Retr. Roof - Closed', 'Retr. Roof Closed']
    indoor_open = ['Indoor, Open Roof', 'Open', 'Retr. Roof-Open', 'Retr. Roof - Open']
    dome_closed = ['Dome', 'Domed, closed', 'Closed Dome', 'Domed', 'Dome, closed']
    dome_open = ['Domed, Open', 'Domed, open']

    test_df['StadiumType'] = test_df['StadiumType'].apply(lambda x: "outdoor" if x in outdoor 
                                                         else ("indoor closed" if x in indoor_closed
                                                        else ("indoor open" if x in indoor_open
                                                        else ("dome_closed" if x in dome_closed
                                                        else ("dome_open" if x in dome_open else "unknown")))))

    # Game weather
    rain = ['Rainy', 'Rain Chance 40%', 'Showers',
            'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.',
              'Scattered Showers', 'Cloudy, Rain', 'Rain shower', 'Light Rain', 'Rain']
    overcast = ['Cloudy, light snow accumulating 1-3"', 'Party Cloudy', 'Cloudy, chance of rain',
                  'Coudy', 'Cloudy, 50% change of rain', 'Rain likely, temps in low 40s.',
                  'Cloudy and cold', 'Cloudy, fog started developing in 2nd quarter',
                  'Partly Clouidy', '30% Chance of Rain', 'Mostly Coudy', 'Cloudy and Cool',
                  'cloudy', 'Partly cloudy', 'Overcast', 'Hazy', 'Mostly cloudy', 'Mostly Cloudy',
                  'Partly Cloudy', 'Cloudy']
    clear = ['Partly clear', 'Sunny and clear', 'Sun & clouds', 'Clear and Sunny',
               'Sunny and cold', 'Sunny Skies', 'Clear and Cool', 'Clear and sunny',
               'Sunny, highs to upper 80s', 'Mostly Sunny Skies', 'Cold',
               'Clear and warm', 'Sunny and warm', 'Clear and cold', 'Mostly sunny',
               'T: 51; H: 55; W: NW 10 mph', 'Clear Skies', 'Clear skies', 'Partly sunny',
               'Fair', 'Partly Sunny', 'Mostly Sunny', 'Clear', 'Sunny']
    snow = ['Heavy lake effect snow', 'Snow']
    none = ['N/A Indoor', 'Indoors', 'Indoor', 'N/A (Indoors)', 'Controlled Climate']

    test_df['GameWeather'] = test_df['GameWeather'].apply(lambda x: "rain" if x in rain 
                                                         else ("overcast" if x in overcast
                                                        else ("clear" if x in clear
                                                        else ("snow" if x in snow
                                                        else ("indoor" if x in none else "unknown")))))

    # wind speed
    test_df['WindSpeed'] = test_df['WindSpeed'].apply(lambda x: x.lower().replace('mph', '').strip() if not pd.isna(x) else x)
    test_df['WindSpeed'] = test_df['WindSpeed'].apply(lambda x: (int(x.split('-')[0])+int(x.split('-')[1]))/2 if not pd.isna(x) and '-' in x else x)
    test_df['WindSpeed'] = test_df['WindSpeed'].apply(lambda x: (int(x.split()[0])+int(x.split()[-1]))/2 if not pd.isna(x) and type(x)!=float and 'gusts up to' in x else x)
    test_df['WindSpeed'] = test_df['WindSpeed'].apply(str_to_float)

    # wind direction
    test_df['WindDirection'] = test_df['WindDirection'].apply(lambda x: "north" if x == "N" or x == "FROM S"
                                                   else ("south" if x == 'S' or x== 'FROM N'
                                                   else ("west" if x == 'W' or x == 'FROM E'
                                                   else ("east" if x == 'E' or x == 'FROM W'
                                                   else ("north east" if x == 'FROM SW' or x == 'FROM SSW' or x == 'FROM WSW'
                                                   else ("north west" if x == 'FROM SE' or x == 'FROM SSE' or x == 'FROM ESE'
                                                   else ("south east" if x == 'FROM NW' or x == 'FROM NNW' or x == 'FROM WNW'
                                                   else ("south west" if x == 'FROM NE' or x == 'FROM NNE' or x == 'FROM ENE'
                                                   else ("north west" if x == 'NW' or x == 'NORTHWEST'
                                                   else ("north east" if x == 'NE' or x == 'NORTH EAST'
                                                   else ("south west" if x == 'SW' or x == 'SOUTHWEST'
                                                   else ("south east" if x == 'SE' or x == 'SOUTHEAST' else "unknown"))))))))))))

    test_df['DefendersInTheBox_vs_Distance'] = test_df['DefendersInTheBox'] / test_df['Distance']
    test_df['IsRusher'] = test_df['NflId'] == test_df['NflIdRusher']
    test_df['TimeLeft'] = test_df['GameClock'].apply(strtoseconds)
    test_df["Margin"] = (test_df["HomeScoreBeforePlay"] - test_df["VisitorScoreBeforePlay"]) + 2 * (1 - (test_df["PossessionTeam"] == test_df["HomeTeamAbbr"]).astype(int)) * (test_df["VisitorScoreBeforePlay"] - test_df["HomeScoreBeforePlay"])
    test_df["IfPossess"] = test_df["FieldPosition"] == test_df["PossessionTeam"]
    
    # time from snap to handoff
    FMT = '%Y-%m-%d %H:%M:%S'
    tmp_time1 = [[s for s in t.split(".")][0].replace("T", " ") for t in test_df["TimeHandoff"]]
    tmp_time2 = [[s for s in t.split(".")][0].replace("T", " ")  for t in test_df["TimeSnap"]]
    time_diff = []
    for i in range(len(tmp_time1)):
        tdelta = datetime.strptime(tmp_time1[i], FMT) - datetime.strptime(tmp_time2[i], FMT)
        time_diff.append(tdelta.seconds)
    test_df["TimeFromSnapDiff"] = pd.DataFrame(time_diff)
    
    # age
    FMT_birth = '%m/%d/%Y'
    FMT_gamedate = '%Y-%m-%d'
    gamedate = [[s for s in t.split("T")][0] for t in test_df["TimeSnap"]]
    age = []
    for i in tqdm.tqdm(range(test_df.shape[0])):
        tdelta = datetime.strptime(gamedate[i], FMT_gamedate) - datetime.strptime(test_df.iloc[i]["PlayerBirthDate"], FMT_birth)
        age.append(np.floor(tdelta.days/365))
    test_df["Age"] = age
    
    # RB, TE, WR, OL, DL, QB
    arr = [[s for s in t.replace(" ", "").split(",")] for t in test_df["OffensePersonnel"]]
    RB_count = np.zeros(len(arr))
    TE_count = np.zeros(len(arr))
    WR_count = np.zeros(len(arr))
    OL_count = np.zeros(len(arr))
    DL_count = np.zeros(len(arr))
    QB_count = np.zeros(len(arr))

    for i in range(len(arr)):
        for j in range(len(arr[i])):
            if "RB" in arr[i][j]:
                RB_count[i] = int(arr[i][j][0])
            elif "TE" in arr[i][j]:
                TE_count[i] = int(arr[i][j][0])
            elif "WR" in arr[i][j]:
                WR_count[i] = int(arr[i][j][0])
            elif "OL" in arr[i][j]:
                OL_count[i] = int(arr[i][j][0])
            elif "DL" in arr[i][j]:
                DL_count[i] = int(arr[i][j][0])
            elif "QB" in arr[i][j]:
                QB_count[i] = int(arr[i][j][0])
    test_df["OffenseRB"] = RB_count
    test_df["OffenseTE"] = TE_count
    test_df["OffenseWR"] = WR_count
    test_df["OffenseOL"] = OL_count
    test_df["OffenseDL"] = DL_count
    test_df["OffenseQB"] = QB_count
    
    test_df["Distance10"] = test_df["Distance"].apply(lambda x: 1 if x > 10 else 0)

    # combine down and quarter
    test_df["DownQuarter"] = test_df[["Down", "Quarter"]].apply(lambda x: "D{}_Q{}".format(x[0], x[1]), axis=1)

    # rusher speed 
    rusher_speed = np.zeros(test_df.shape[0])
    game_id = list(test_df["GameId"].unique())
    count_index = 0
    for i in game_id:
        tmp_df = test_df[test_df["GameId"]==i]
        game_length = tmp_df.shape[0]
        rusher_speed[count_index: count_index+game_length] = np.sum(tmp_df["IsRusher"] * tmp_df["S"])
        count_index += game_length
    test_df["RusherSpeed"] = rusher_speed
    
    for c in test_df.columns:
        if c == "DefensePersonnel":
            try:
                arr = [[int(s[0]) for s in t.split(", ")] for t in test_df["DefensePersonnel"]]
                test_df["DefenseDL"] = np.array([a[0] for a in arr])
                test_df["DefenseLB"] = np.array([a[1] for a in arr])
                test_df["DefenseDB"] = np.array([a[2] for a in arr])
                test_df["DefenseOL"] = np.array([a[3] if len(a) == 4 else 0 for a in arr])
            except:
                test_df["DefenseDL"] = [np.nan for i in range(22)]
                test_df["DefenseLB"] = [np.nan for i in range(22)]
                test_df["DefenseDB"] = [np.nan for i in range(22)]
                test_df["DefenseOL"] = [np.nan for i in range(22)]       
    test_df = ce_oe.fit_transform(test_df)            
    count=0
    test_data = np.zeros((1,len(all_columns)))

    for c in all_columns:
        if c in test_df:
            try:
                test_data[0][count] = test_df[c][index]
            except:
                test_data[0][count] = np.nan
            count+=1
    for c in unique_columns:
        for j in range(22):
            try:
                test_data[0][count] = test_df[c][index + j]
            except:
                test_data[0][count] = np.nan
            count+=1        
    y_pred = np.zeros(199)        
    y_pred_p = np.sum(np.round(scaler.inverse_transform(
        [model.predict(test_data)[0] for model in models])))/folds
    y_pred_p += 99
    for j in range(199):
        if j>=y_pred_p+10:
            y_pred[j]=1.0
        elif j>=y_pred_p-10:
            y_pred[j]=(j+10-y_pred_p)*0.05
    env.predict(pd.DataFrame(data=[y_pred],columns=sample_prediction_df.columns))
    index += 22
env.write_submission_file()

0it [00:00, ?it/s]
100%|██████████| 22/22 [00:00<00:00, 1675.07it/s]
1it [00:01,  1.99s/it]
100%|██████████| 22/22 [00:00<00:00, 1891.30it/s]
2it [00:02,  1.45s/it]
100%|██████████| 22/22 [00:00<00:00, 1825.45it/s]
3it [00:02,  1.07s/it]
100%|██████████| 22/22 [00:00<00:00, 1750.08it/s]
4it [00:02,  1.25it/s]
100%|██████████| 22/22 [00:00<00:00, 1851.68it/s]
5it [00:02,  1.63it/s]
100%|██████████| 22/22 [00:00<00:00, 1647.23it/s]
6it [00:02,  2.07it/s]
100%|██████████| 22/22 [00:00<00:00, 1869.91it/s]
7it [00:03,  2.56it/s]
100%|██████████| 22/22 [00:00<00:00, 1876.07it/s]
8it [00:03,  3.05it/s]
100%|██████████| 22/22 [00:00<00:00, 1801.04it/s]
9it [00:03,  3.56it/s]
100%|██████████| 22/22 [00:00<00:00, 1780.88it/s]
10it [00:03,  3.98it/s]
100%|██████████| 22/22 [00:00<00:00, 1801.78it/s]
11it [00:03,  4.31it/s]
100%|██████████| 22/22 [00:00<00:00, 1814.93it/s]
12it [00:03,  4.63it/s]
100%|██████████| 22/22 [00:00<00:00, 1664.65it/s]
13it [00:04,  4.85it/s]
100%|██████████| 22/22 [00:0

The organizers seemed to expect to predict one by one, so I did. However, it seems that it is likely to be faster to predict at once after all the evaluation data is acquired by dummy input.This model is a simple one that has not been tuned, so I think we can still expect a better score.