In [75]:
import torch
import pandas as pd
import matplotlib.pyplot as plt
from torch import nn
import numpy as np

import json
import pickle
import random
import datetime as dt

## Creating a LSTM model

In [46]:
class MLBModel(torch.nn.Module):
    def __init__(self, n_time, cat_var_sizes, cat_embedding_sizes, num_numerical_vars, n_dense_output, hidden_size):
        super(MLBModel, self).__init__()
        self.dense_layers = nn.ModuleList()
        self.embedding_layers = nn.ModuleList()
        
        # Prepare embedding layers
        for unique_cats, embedding_dim in zip(cat_var_sizes, cat_embedding_sizes):
            embedding = nn.Embedding(unique_cats, embedding_dim)
            self.embedding_layers.append(embedding)
        
        n_dense_input = sum(cat_embedding_sizes) + num_numerical_vars
        
        # Prepare dense layers
        for _ in range(n_time):
            dense = nn.Linear(n_dense_input, n_dense_output)
            self.dense_layers.append(dense)
        print(n_dense_output)
        # Prepare LSTM
        self.lstm = nn.LSTM(n_dense_output, hidden_size)
        self.output_layer = nn.Linear(hidden_size, 4)
    
    def forward(self, categorical_inps, numerical_inps):
        # Processing categorical vars
        cat_oups = [embedding_layer(categorical_inp) for categorical_inp, embedding_layer in zip(categorical_inps, self.embedding_layers)]
        
        # Create a list of inputs for dense layers
        denses_inps = [torch.cat((numerical_inp, cat_oup), dim=1) for numerical_inp, cat_oup in zip(numerical_inps, cat_oups)]
        
        # Process through dense layers and concatenate
        denses_oups = [dense_layer(dense_inp) for dense_inp, dense_layer in zip(denses_inps, self.dense_layers)]
        denses_oups = torch.cat(denses_oups, dim=1)

        
        # Pass through LSTM
        lstm_output, _ = self.lstm(denses_oups.unsqueeze(0))  # unsqueeze to add batch dimension: note LSTM accepts (L, N, X)
        lstm_output = lstm_output[:, -1, :]  # only get the hidden state from the last layer
        predictions = self.output_layer(lstm_output.squeeze(0))
        
        return predictions

Testing the above model with a toy data

In [47]:
# Example usage with one categorical variable
n_time = 5
cat_var_sizes = [5]
cat_embedding_sizes = [10]
num_numerical_vars = 2
n_dense_output = 32
hidden_size = 64

# Create an instance of the model
model = MLBModel(n_time, cat_var_sizes, cat_embedding_sizes, num_numerical_vars, n_dense_output, hidden_size)

# Example input data with one categorical variable
categorical_inputs = [torch.LongTensor([0, 2, 1, 4, 3])]  # Provide input for one categorical variable
numerical_inputs = [torch.FloatTensor([[1.0, 2.0],
                                       [2.0, 3.0],
                                       [0.5, 1.5],
                                       [1.2, 0.8],
                                       [2.0, 1.0]])]

# Forward pass through the model
output = model(categorical_inputs, numerical_inputs)

print("LSTM Output:")
print(output)

32
LSTM Output:
tensor([ 0.0858, -0.0203,  0.0584,  0.0163], grad_fn=<AddBackward0>)


In [32]:
df_train = pd.read_csv("/kaggle/input/mlb-player-digital-engagement-forecasting/train.csv")

In [48]:
df_players = pd.read_csv("/kaggle/input/mlb-player-digital-engagement-forecasting/players.csv")

In [55]:
def extract_json2(df, col_name):
    col = df[col_name]
    
    json_oups = []
    for row in range(len(col)):
        json_oups.append(pd.read_json(col[row]))
    
    final_oup = pd.concat(json_oups, axis=0)
    
    identifier_col = final_oup["engagementMetricsDate"] + "_" + final_oup["playerId"].astype(str)
    final_oup["date_playerId"] = identifier_col
    
    return final_oup

In [50]:
class OriginalLabelEncoder:
    def __init__(self):
        self.label_to_int = {}
        self.int_to_label = {}
        self.current_int = 0
    
    def fit(self, labels):
        for label in labels:
            if label not in self.label_to_int:
                self.label_to_int[label] = self.current_int
                self.int_to_label[self.current_int] = label
                self.current_int += 1
    
    def transform(self, labels):
        transformed_labels = np.zeros((len(labels)), dtype=np.int16)
        for i, label in enumerate(labels):
            if label not in self.label_to_int:
                transformed_labels[i] = random.randint(0, self.current_int-1)
            else:
                transformed_labels[i] = self.label_to_int[label]
        return transformed_labels
    
    def fit_transform(self, labels):
        self.fit(labels)
        return self.transform(labels)

We create a simple preprocessor of our data

In [79]:
def preprocess(df):
    df_extracted = extract_json2(df, "nextDayPlayerEngagement")
    df_extracted["date"] = pd.to_datetime(df_extracted["engagementMetricsDate"], format="%Y-%m-%d")    
    df_extracted["weekday"] = df_extracted["date"].dt.dayofweek
    df_extracted["yearmonth"] = df_extracted["date"].astype(str).apply(lambda x: x[:7])
    
    
    df_train = pd.merge(df_extracted, df_players, on="playerId", how="left")
    
    id_cols = ["engagementMetricsDate", "playerId", "date_playerId", "date", "yearmonth", "playerForTestSetAndFuturePreds"]
    targets = ["target1", "target2", "target3", "target4"]
    not_needed = ["DOB", "playerName", "mlbDebutDate"]
    x_train = df_train.drop(id_cols+targets+not_needed, axis=1)
    y_train = df_train[targets]
    id_test = df_train[id_cols]
    
    cat_cols_names = ["primaryPositionCode", "primaryPositionName", "birthCountry", "birthStateProvince", "birthCity"]
    label_encoders = []
    
    x_train_cat = pd.DataFrame()

    for col_name in cat_cols_names:
        le = OriginalLabelEncoder()
        new_col = le.fit_transform(x_train[col_name])
        label_encoders.append(le)

        x_train_cat[col_name] = new_col
    
    return x_train, x_train_cat, y_train, id_test, label_encoders

In [80]:
x_train, x_train_cat, y_train, id_test, label_encoders = preprocess(df_train)

## Validation designing

In [None]:
list_cv_months = [
    [["2020-05", "2020-06", "2020-07", "2020-08", "2020-09", "2020-10", "2020-11",
      "2020-12", "2021-01", "2021-02", "2021-03", "2021-04"], "2021-05"],
    [["2020-06", "2020-07", "2020-08", "2020-09", "2020-10", "2020-11","2020-12",
      "2021-01", "2021-02", "2021-03", "2021-04", "2021-05"], "2021-06"],
    [["2020-07", "2020-08", "2020-09", "2020-10", "2020-11","2020-12", "2021-01",
      "2021-02", "2021-03", "2021-04", "2021-05", "2021-06"], "2021-07"]
]
folds = []
for train_months, val_month in list_cv_months:
    folds.append([
        id_train.index[id_train["yearmonth"].isin(train_months)],
        id_train.index[(id_train["yearmonth"] == val_month) & (id_train["playerForTestSetAndFuturePreds"] == True)]
    ])