In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import colors

import json
import pickle
import random
import datetime as dt

In [2]:
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder



each row represents a player.<br>
labels = digital engagement (there are 4 labels) on a particular day<br>
features = player's status (match result, number hits, trading information etc)

<br><br>
For prediction, we need a dataset where each row represents a unique player&day.  

In [3]:
df = pd.read_csv("../input/mlb-player-digital-engagement-forecasting/train_updated.csv")

In [4]:
df.shape

(1308, 12)

- each record is for a particular day. 
- For each column, it shows data for that column for all players (as a list of dictionary)
- nextDayPlayerEngagement stores the labels. 
- for each day, "roster" shows where in which team the player is, for each player.

In [5]:
df.head()

Unnamed: 0,date,nextDayPlayerEngagement,games,rosters,playerBoxScores,teamBoxScores,transactions,standings,awards,events,playerTwitterFollowers,teamTwitterFollowers
0,20180101,"[{""engagementMetricsDate"":""2018-01-02"",""player...",,"[{""playerId"":400121,""gameDate"":""2018-01-01"",""t...",,,"[{""transactionId"":340732,""playerId"":547348,""pl...",,,,"[{""date"":""2018-01-01"",""playerId"":545361,""playe...","[{""date"":""2018-01-01"",""teamId"":147,""teamName"":..."
1,20180102,"[{""engagementMetricsDate"":""2018-01-03"",""player...",,"[{""playerId"":134181,""gameDate"":""2018-01-02"",""t...",,,"[{""transactionId"":339458,""playerId"":621173,""pl...",,,,,
2,20180103,"[{""engagementMetricsDate"":""2018-01-04"",""player...",,"[{""playerId"":425492,""gameDate"":""2018-01-03"",""t...",,,"[{""transactionId"":347527,""playerId"":572389,""pl...",,,,,
3,20180104,"[{""engagementMetricsDate"":""2018-01-05"",""player...",,"[{""playerId"":282332,""gameDate"":""2018-01-04"",""t...",,,"[{""transactionId"":339549,""playerId"":545343,""pl...",,,,,
4,20180105,"[{""engagementMetricsDate"":""2018-01-06"",""player...",,"[{""playerId"":282332,""gameDate"":""2018-01-05"",""t...",,,"[{""transactionId"":341195,""playerId"":628336,""pl...",,,,,


In [6]:
def extract_json2(col_name):
    col = df[col_name]
    
    json_oups = []
    for row in range(len(col)):
        json_oups.append(pd.read_json(col[row]))
    
    final_oup = pd.concat(json_oups, axis=0)
    
    identifier_col = final_oup["engagementMetricsDate"] + "_" + final_oup["playerId"].astype(str)
    final_oup["date_playerId"] = identifier_col
    
    return final_oup

In [7]:
df_engagement = extract_json2("nextDayPlayerEngagement")
df_engagement

Unnamed: 0,engagementMetricsDate,playerId,target1,target2,target3,target4,date_playerId
0,2018-01-02,628317,0.011167,4.474708,0.005168,5.735294,2018-01-02_628317
1,2018-01-02,547989,0.042993,5.593385,0.045033,2.794118,2018-01-02_547989
2,2018-01-02,519317,0.974327,56.177043,13.693746,64.166667,2018-01-02_519317
3,2018-01-02,607625,0.006700,2.675097,0.005168,1.862745,2018-01-02_607625
4,2018-01-02,592547,0.001117,0.632296,0.002953,0.931373,2018-01-02_592547
...,...,...,...,...,...,...,...
2056,2021-08-01,527055,0.000000,0.027442,0.000000,0.157926,2021-08-01_527055
2057,2021-08-01,542963,0.000834,0.137212,0.000000,0.146908,2021-08-01_542963
2058,2021-08-01,622259,0.000119,0.102909,0.000000,0.058763,2021-08-01_622259
2059,2021-08-01,642840,0.000119,0.048024,0.000000,0.036727,2021-08-01_642840


In [8]:
# add date, day_of_week
df_engagement["date"] = pd.to_datetime(df_engagement["engagementMetricsDate"], format="%Y-%m-%d")
df_engagement["weekday"] = df_engagement["date"].dt.weekday
df_engagement["yearmonth"] = df_engagement["date"].astype(str).apply(lambda x: x[:7])

creating a few other features

In [9]:
df_engagement

Unnamed: 0,engagementMetricsDate,playerId,target1,target2,target3,target4,date_playerId,date,weekday,yearmonth
0,2018-01-02,628317,0.011167,4.474708,0.005168,5.735294,2018-01-02_628317,2018-01-02,1,2018-01
1,2018-01-02,547989,0.042993,5.593385,0.045033,2.794118,2018-01-02_547989,2018-01-02,1,2018-01
2,2018-01-02,519317,0.974327,56.177043,13.693746,64.166667,2018-01-02_519317,2018-01-02,1,2018-01
3,2018-01-02,607625,0.006700,2.675097,0.005168,1.862745,2018-01-02_607625,2018-01-02,1,2018-01
4,2018-01-02,592547,0.001117,0.632296,0.002953,0.931373,2018-01-02_592547,2018-01-02,1,2018-01
...,...,...,...,...,...,...,...,...,...,...
2056,2021-08-01,527055,0.000000,0.027442,0.000000,0.157926,2021-08-01_527055,2021-08-01,6,2021-08
2057,2021-08-01,542963,0.000834,0.137212,0.000000,0.146908,2021-08-01_542963,2021-08-01,6,2021-08
2058,2021-08-01,622259,0.000119,0.102909,0.000000,0.058763,2021-08-01_622259,2021-08-01,6,2021-08
2059,2021-08-01,642840,0.000119,0.048024,0.000000,0.036727,2021-08-01_642840,2021-08-01,6,2021-08


We will also take a look at player.csv.

In [10]:
df_players = pd.read_csv("../input/mlb-player-digital-engagement-forecasting/players.csv")

In [11]:
df_players.head()

Unnamed: 0,playerId,playerName,DOB,mlbDebutDate,birthCity,birthStateProvince,birthCountry,heightInches,weight,primaryPositionCode,primaryPositionName,playerForTestSetAndFuturePreds
0,665482,Gilberto Celestino,1999-02-13,2021-06-02,Santo Domingo,,Dominican Republic,72,170,8,Outfielder,False
1,593590,Webster Rivas,1990-08-08,2021-05-28,Nagua,,Dominican Republic,73,219,3,First Base,True
2,661269,Vladimir Gutierrez,1995-09-18,2021-05-28,Havana,,Cuba,73,190,1,Pitcher,True
3,669212,Eli Morgan,1996-05-13,2021-05-28,Rancho Palos Verdes,CA,USA,70,190,1,Pitcher,True
4,666201,Alek Manoah,1998-01-09,2021-05-27,Homestead,FL,USA,78,260,1,Pitcher,True


In [12]:
df_engagement = pd.merge(df_engagement, df_players, on=["playerId"], how="left")

In [13]:
df_engagement.head()

Unnamed: 0,engagementMetricsDate,playerId,target1,target2,target3,target4,date_playerId,date,weekday,yearmonth,...,DOB,mlbDebutDate,birthCity,birthStateProvince,birthCountry,heightInches,weight,primaryPositionCode,primaryPositionName,playerForTestSetAndFuturePreds
0,2018-01-02,628317,0.011167,4.474708,0.005168,5.735294,2018-01-02_628317,2018-01-02,1,2018-01,...,1988-04-11,2016-04-06,Osaka,,Japan,73,185,1,Pitcher,True
1,2018-01-02,547989,0.042993,5.593385,0.045033,2.794118,2018-01-02_547989,2018-01-02,1,2018-01,...,1987-01-29,2014-03-31,Cienfuegos,,Cuba,75,235,3,First Base,True
2,2018-01-02,519317,0.974327,56.177043,13.693746,64.166667,2018-01-02_519317,2018-01-02,1,2018-01,...,1989-11-08,2010-06-08,Panorama,CA,USA,78,245,10,Designated Hitter,True
3,2018-01-02,607625,0.0067,2.675097,0.005168,1.862745,2018-01-02_607625,2018-01-02,1,2018-01,...,1989-11-17,2016-07-01,Shreveport,LA,USA,76,225,1,Pitcher,True
4,2018-01-02,592547,0.001117,0.632296,0.002953,0.931373,2018-01-02_592547,2018-01-02,1,2018-01,...,1989-06-23,2017-09-12,Richmond,VA,USA,78,220,1,Pitcher,False


In [48]:
id_cols = ["engagementMetricsDate", "playerId", "date_playerId", "date", "yearmonth", "playerForTestSetAndFuturePreds"]
targets = ["target1", "target2", "target3", "target4"]
not_needed = ["DOB", "playerName", "mlbDebutDate"]
x_train = df_engagement.drop(id_cols+targets+not_needed, axis=1)
y_train = df_engagement[targets]
id_train = df_engagement[id_cols]

In [47]:
class OriginalLabelEncoder:
    def __init__(self):
        self.label_to_int = {}
        self.int_to_label = {}
        self.current_int = 0
    
    def fit(self, labels):
        for label in labels:
            if label not in self.label_to_int:
                self.label_to_int[label] = self.current_int
                self.int_to_label[self.current_int] = label
                self.current_int += 1
    
    def transform(self, labels):
        transformed_labels = np.zeros((len(labels)), dtype=np.int16)
        for i, label in enumerate(labels):
            if label not in self.label_to_int:
                transformed_labels[i] = random.randint(0, self.current_int-1)
            else:
                transformed_labels[i] = self.label_to_int[label]
        return transformed_labels
    
    def fit_transform(self, labels):
        self.fit(labels)
        return self.transform(labels)

In [49]:
# use label encoder to convert the categorical features to integer types (they will be handles as categorical types in lightgbm, if you specify so)
cat_cols_names = ["primaryPositionCode", "primaryPositionName", "birthCountry", "birthStateProvince", "birthCity"]
label_encoders = []

for col_name in cat_cols_names:
    le = OriginalLabelEncoder()
    new_col = le.fit_transform(x_train[col_name])
    label_encoders.append(le)
    
    x_train[col_name] = new_col

## Designing Validation Dataset

In [59]:
list_cv_months = [
    [["2020-05", "2020-06", "2020-07", "2020-08", "2020-09", "2020-10", "2020-11",
      "2020-12", "2021-01", "2021-02", "2021-03", "2021-04"], "2021-05"],
    [["2020-06", "2020-07", "2020-08", "2020-09", "2020-10", "2020-11","2020-12",
      "2021-01", "2021-02", "2021-03", "2021-04", "2021-05"], "2021-06"],
    [["2020-07", "2020-08", "2020-09", "2020-10", "2020-11","2020-12", "2021-01",
      "2021-02", "2021-03", "2021-04", "2021-05", "2021-06"], "2021-07"]
]

In [60]:
folds = []
for train_months, val_month in list_cv_months:
    folds.append([
        id_train.index[id_train["yearmonth"].isin(train_months)],
        id_train.index[(id_train["yearmonth"] == val_month) & (id_train["playerForTestSetAndFuturePreds"] == True)]
    ])

In [75]:
folds

[[[Int64Index([1751850, 1751851, 1751852, 1751853, 1751854, 1751855, 1751856,
               1751857, 1751858, 1751859,
               ...
               2504105, 2504106, 2504107, 2504108, 2504109, 2504110, 2504111,
               2504112, 2504113, 2504114],
              dtype='int64', length=752265)],
  [Int64Index([2504115, 2504116, 2504117, 2504118, 2504120, 2504121, 2504122,
               2504124, 2504125, 2504126,
               ...
               2567962, 2567965, 2567966, 2567971, 2567979, 2567980, 2567981,
               2567983, 2567984, 2567985],
              dtype='int64', length=36797)]],
 [[Int64Index([1815741, 1815742, 1815743, 1815744, 1815745, 1815746, 1815747,
               1815748, 1815749, 1815750,
               ...
               2567996, 2567997, 2567998, 2567999, 2568000, 2568001, 2568002,
               2568003, 2568004, 2568005],
              dtype='int64', length=752265)],
  [Int64Index([2568006, 2568007, 2568008, 2568010, 2568011, 2568012, 2568013,
    

## Implementing Baseline Model

In [61]:
def train(x_train, y_train, cat_col_names, folds, params, dryrun=False):
    metrics = np.zeros((4, len(folds)))
    imps = np.zeros((4, len(x_train.columns)))

    for idx_fold, fold in enumerate(folds):
        # get fold
        train_idx, valid_idx = fold

        if dryrun:
            # sample from the train_idx, valid_idx
            random_idxs_train = [random.randint(0, len(train_idx)-1) for _ in range(10)]
            random_idxs_valid = [random.randint(0, len(valid_idx)-1) for _ in range(5)]
            train_idx, valid_idx = train_idx[random_idxs_train], valid_idx[random_idxs_valid]
            
        for idx_target, target in enumerate(["target1", "target2", "target3", "target4"]):
            x_train_current, y_train_current = x_train.loc[train_idx,:], y_train.loc[train_idx, target]
            x_valid_current, y_valid_current = x_train.loc[valid_idx,:], y_train.loc[valid_idx, target]

            # create a lgm dateset
            train_current = lgb.Dataset(x_train_current, label=y_train_current,
                                        feature_name=list(x_train_current.columns),
                                        categorical_feature=cat_col_names)
            valid_current = lgb.Dataset(x_valid_current, label=y_valid_current,
                                        feature_name=list(x_train_current.columns),
                                        categorical_feature=cat_col_names)

            # create lgm model
            num_round = 100
            model = lgb.train(params, train_current, num_round, valid_sets=[valid_current],
                              callbacks=[lgb.early_stopping(stopping_rounds=30)])

            with open(f"model_{target}_{idx_fold}.pickle", "wb") as file:
                pickle.dump(model, file)

            # predict on validation data & calculate metrics
            pred = model.predict(x_valid_current)
            mae = mean_absolute_error(y_valid_current, pred)
            metrics[idx_target, idx_fold] = mae

            # get importance metrics
            imps[idx_target, :] = model.feature_importance() / len(folds)
    
    # calculate mean mae
    mean_mae = metrics.sum(axis=1) / 4
    
    return mean_mae, metrics, imps

In [62]:
params = {
    "boosting_type": "gbdt",
    "objective": "regression_l1",
    "metric": "mean_absolute_error",
    "learning_rate": 0.05,
    "num_leaves": 32,
    "subsample": 0.7,
    "subsample_freq": 1,
    "feature_fraction": 0.8,
    "min_data_in_leaf": 50,
    "min_sum_hessian_in_leaf": 50,
    "n_estimators": 1000,
    "random_state": 123,
    "importance_type": "gain"
}

mean_mae, metrics, imps = train(x_train, y_train, cat_cols_names, folds, params, dryrun=False)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1261
[LightGBM] [Info] Number of data points in the train set: 752265, number of used features: 8
[LightGBM] [Info] Start training from score 0.001289




Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 1.30993




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1261
[LightGBM] [Info] Number of data points in the train set: 752265, number of used features: 8
[LightGBM] [Info] Start training from score 0.603622




Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[917]	valid_0's l1: 2.46885




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1261
[LightGBM] [Info] Number of data points in the train set: 752265, number of used features: 8
[LightGBM] [Info] Start training from score 0.002005




Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[68]	valid_0's l1: 0.872016




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1261
[LightGBM] [Info] Number of data points in the train set: 752265, number of used features: 8
[LightGBM] [Info] Start training from score 0.218103




Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 1.29623




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1261
[LightGBM] [Info] Number of data points in the train set: 752265, number of used features: 8
[LightGBM] [Info] Start training from score 0.001164




Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[793]	valid_0's l1: 1.20171




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1261
[LightGBM] [Info] Number of data points in the train set: 752265, number of used features: 8
[LightGBM] [Info] Start training from score 0.488334




Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[765]	valid_0's l1: 2.18764




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1261
[LightGBM] [Info] Number of data points in the train set: 752265, number of used features: 8
[LightGBM] [Info] Start training from score 0.001965




Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[90]	valid_0's l1: 0.839552




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1261
[LightGBM] [Info] Number of data points in the train set: 752265, number of used features: 8
[LightGBM] [Info] Start training from score 0.236174




Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 1.71698




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1261
[LightGBM] [Info] Number of data points in the train set: 752265, number of used features: 8
[LightGBM] [Info] Start training from score 0.001065




Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[656]	valid_0's l1: 1.12242




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1261
[LightGBM] [Info] Number of data points in the train set: 752265, number of used features: 8
[LightGBM] [Info] Start training from score 0.400566




Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[320]	valid_0's l1: 1.80124




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1261
[LightGBM] [Info] Number of data points in the train set: 752265, number of used features: 8
[LightGBM] [Info] Start training from score 0.002010




Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[930]	valid_0's l1: 0.770289




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1261
[LightGBM] [Info] Number of data points in the train set: 752265, number of used features: 8
[LightGBM] [Info] Start training from score 0.258303




Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[478]	valid_0's l1: 0.865989


## Checking our Model

In [64]:
mean_mae

array([0.90851796, 1.61443203, 0.62046415, 0.96979883])

In [65]:
imps

array([[   0.        , 3537.66666667,  603.33333333,  228.33333333,
         568.33333333,  947.        ,  719.33333333,  174.66666667],
       [  31.        , 1507.        ,  244.66666667,   97.33333333,
         380.66666667,  605.33333333,  355.        ,   85.66666667],
       [   0.        , 4766.66666667, 1025.33333333,  310.66666667,
         709.33333333, 1338.66666667, 1159.66666667,  299.66666667],
       [ 214.33333333, 2159.        ,  360.66666667,  110.66666667,
         572.66666667,  889.33333333,  519.66666667,  113.        ]])

From the visualization below, we see that some important variables are birthCity, weight, and heightInches.

In [66]:
imp_df = pd.DataFrame({"target1": imps[0], "target2": imps[1], "target3": imps[2], "target4": imps[3]}, index=x_train.columns)

norm = colors.Normalize(vmin=imp_df.min().min(), vmax=imp_df.max().max())
colormap = plt.cm.YlOrRd

def gradient_color(val):
    color = colormap(norm(val))
    return f'background-color: {colors.rgb2hex(color)}'

styled_imp_df = imp_df.style.applymap(gradient_color)
styled_imp_df

Unnamed: 0,target1,target2,target3,target4
weekday,0.0,31.0,0.0,214.333333
birthCity,3537.666667,1507.0,4766.666667,2159.0
birthStateProvince,603.333333,244.666667,1025.333333,360.666667
birthCountry,228.333333,97.333333,310.666667,110.666667
heightInches,568.333333,380.666667,709.333333,572.666667
weight,947.0,605.333333,1338.666667,889.333333
primaryPositionCode,719.333333,355.0,1159.666667,519.666667
primaryPositionName,174.666667,85.666667,299.666667,113.0


## Prediction

In [57]:
test_df = df_engagement.loc[df_engagement["date"]==20210426, :]
display(test_df.head())

prediction_df = df_engagement.loc[df_engagement["date"]=="2021-04-26", ["date", "date_playerId"]].reset_index(drop=False)
prediction_df["date"] = prediction_df["date"].apply(lambda x: int(str(x).replace("-","")[:8]))
for col in ["target1", "target2", "target3", "target4"]:
    prediction_df[col] = 0
display(prediction_df.head())

Unnamed: 0,engagementMetricsDate,playerId,target1,target2,target3,target4,date_playerId,date,weekday,yearmonth,...,DOB,mlbDebutDate,birthCity,birthStateProvince,birthCountry,heightInches,weight,primaryPositionCode,primaryPositionName,playerForTestSetAndFuturePreds


Unnamed: 0,index,date,date_playerId,target1,target2,target3,target4
0,2493810,20210426,2021-04-26_543606,0,0,0,0
1,2493811,20210426,2021-04-26_458708,0,0,0,0
2,2493812,20210426,2021-04-26_545121,0,0,0,0
3,2493813,20210426,2021-04-26_518813,0,0,0,0
4,2493814,20210426,2021-04-26_665947,0,0,0,0


In [51]:
def create_test_dataset(df_test, input_prediction, df_player, lab_encs):
    test = df_test.copy()  # we don't use this yet
    prediction = input_prediction.copy()
    
    prediction["date"] = pd.to_datetime(prediction["date"], format="%Y%m%d")
    prediction["engagementMetricsDate"] = prediction["date_playerId"].apply(lambda x: x[:8])
    prediction["engagementMetricsDate"] = pd.to_datetime(prediction["engagementMetricsDate"], format="%Y%m%d")
    prediction["playerId"] = prediction["date_playerId"].apply(lambda x: int(x[9:]))
    
    prediction["weekday"] = prediction["date"].dt.dayofweek
    prediction["yearmonth"] = prediction["date"].astype(str).apply(lambda x: x[:7])
    
    
    df_test = pd.merge(prediction, df_player, on="playerId", how="left")
    
    id_cols = ["engagementMetricsDate", "playerId", "date_playerId", "date", "yearmonth", "playerForTestSetAndFuturePreds"]
    targets = ["target1", "target2", "target3", "target4"]
    not_needed = ["DOB", "playerName", "mlbDebutDate"]
    x_test = df_test.drop(id_cols+targets+not_needed, axis=1)
    y_test = df_test[targets]
    id_test = df_test[id_cols]
    
    cat_cols_names = ["primaryPositionCode", "primaryPositionName", "birthCountry", "birthStateProvince", "birthCity"]

    for lab_enc, col_name in zip(lab_encs, cat_cols_names):
        new_col = le.transform(x_test[col_name])
        x_test[col_name] = new_col
    
    return x_test, id_test

In [76]:
def predict(x_test, cat_col_names, n_folds):
    preds = np.zeros((len(x_test), 4))
    
    
    for target_idx, target in enumerate(["target1", "target2", "target3", "target4"]):
        
        # load the model
        
        for fold_idx in range(n_folds):
            with open(f"model_{target}_{fold_idx}.pickle", "rb") as file:
                model = pickle.load(file)
                pred = model.predict(x_test)
                preds[:, target_idx] += pred / n_folds
    
    return preds

In [84]:
def preds_to_dataframe(preds, id_test):
    df_output = pd.DataFrame(preds)
    target_col_names = ["target1", "target2", "target3", "target4"]
    df_output.columns = target_col_names
    df_output["date_playerId"] = id_test["date_playerId"]
    df_output = df_output.reindex(["date_playerId"] + target_col_names, axis=1)
    return df_output

delete the below cell when submitting the code

In [22]:
# df_test = pd.read_csv("../input/mlb-player-digital-engagement-forecasting/example_test.csv")
# df_prediction = pd.read_csv("../input/mlb-player-digital-engagement-forecasting/example_sample_submission.csv")
# x_test, id_test = create_test_dataset(df_test, df_prediction, df_players, label_encoders)
# preds = predict(x_test, cat_cols_names, 3)
# df_output = preds_to_dataframe(preds, id_test)

In [86]:
# df_output

Unnamed: 0,date_playerId,target1,target2,target3,target4
0,20210427_656669,0.110753,1.745119,0.011048,0.602071
1,20210427_543475,0.013672,1.315646,0.006961,0.455145
2,20210427_592866,0.020442,1.221870,0.005025,0.882427
3,20210427_452678,0.020864,1.479299,0.003331,0.667692
4,20210427_570257,0.001498,0.294598,0.002539,0.176086
...,...,...,...,...,...
5930,20210501_596049,0.007072,0.380612,0.000637,0.219379
5931,20210501_642851,0.001799,0.334169,0.016888,0.107578
5932,20210501_596071,0.038199,1.548090,0.021006,0.409881
5933,20210501_664901,0.008283,0.552284,0.000845,0.096057


In [91]:
!pip install mlb



In [99]:
import mlb
env = mlb.make_env()
iter_test = env.iter_test()

for (df_test, df_prediction) in iter_test:
    test = df_test.copy()
    prediction = df_prediction.copy()
    prediction = prediction.reset_index(drop=False)
    
    print("date:", prediction["date"][0])
    
    x_test, id_test = create_test_dataset(test, prediction, df_players, label_encoders)
    preds = predict(x_test, cat_cols_names, 3)
    df_submit = preds_to_dataframe(preds, id_test)
    
    df_submit.fillna(0.)
    df_submit.clip(0, 100)
    
    env.predict(df_submit)
    
print("Done")

ModuleNotFoundError: No module named 'mlb.competition'

In [89]:
import os
print(os.listdir('/kaggle/input/mlb-player-digital-engagement-forecasting/'))

['players.csv', 'example_sample_submission.csv', 'teams.csv', 'seasons.csv', 'example_test.csv', 'train_updated.csv', 'train.csv', 'awards.csv', 'mlb']
