In [None]:
# Predicting 2024 ERA (basic, Full-tft sl=4)

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, LayerNormalization, Flatten, MultiHeadAttention
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.callbacks import EarlyStopping

# Load dataset
file_path = 'data/basic.csv'
data = pd.read_csv(file_path)

# Remove unnecessary columns
data = data.drop(columns=['last_name, first_name'])

# Handle missing values (replace with 0)
data = data.fillna(0)

# Use data from 2020 to 2023 only
years = [2020, 2021, 2022, 2023]
data = data[data['year'].isin(years)]

# Collect, per season, the set of player_ids that have a row in that season,
# then keep only the players present in all four seasons.
player_ids_2020, player_ids_2021, player_ids_2022, player_ids_2023 = (
    set(data[data['year'] == year]['player_id'].unique()) for year in years
)
common_player_ids = player_ids_2020 & player_ids_2021 & player_ids_2022 & player_ids_2023

# Extract data corresponding to common player_id, ordered by player and then season
common_data = data[data['player_id'].isin(common_player_ids)]
common_data = common_data.sort_values(by=['player_id', 'year'])

# The sequence builder below slices the sorted rows in fixed strides of
# len(years) rows, so every player must contribute exactly one row per season.
# Presence in all seasons is guaranteed by the intersection above; a duplicated
# (player_id, year) row would shift every following player's window and
# misalign it with its label and player_id, so fail loudly instead.
duplicated_rows = common_data.duplicated(subset=['player_id', 'year'], keep=False)
if duplicated_rows.any():
    offending_ids = sorted(common_data.loc[duplicated_rows, 'player_id'].unique().tolist())
    raise ValueError(
        "Expected exactly one row per player per year, but found duplicate "
        f"(player_id, year) rows for player_id(s): {offending_ids}"
    )

# Select necessary columns (everything except the identifiers and the target)
features = [col for col in common_data.columns if col not in ['player_id', 'year', 'p_era']]
target = 'p_era'

# Separate independent and dependent variables
X = common_data[features].values
y = common_data[target].values

# Data scaling: features and target get independent min-max scalers so the
# target scaler can later be used on its own to invert predictions.
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))
# Convert to time series data format
def create_sequences(X, seq_length):
    """
    Cut ``X`` into consecutive, non-overlapping windows of ``seq_length`` rows.

    Window ``i`` is ``X[i * seq_length:(i + 1) * seq_length]``; rows left over
    after the last complete window are dropped. The caller passes each
    player's 2020~2023 rows back to back, so one window corresponds to one
    player. The windows are stacked with ``np.array`` and returned.
    """
    window_count = len(X) // seq_length
    windows = [
        X[offset:offset + seq_length]
        for offset in range(0, window_count * seq_length, seq_length)
    ]
    return np.array(windows)

# Window length: one row per season, 2020-2023
seq_length = 4
X_seq = create_sequences(X_scaled, seq_length)
# Label of a window is the scaled p_era on its last row
# (rows seq_length - 1, 2 * seq_length - 1, ...)
y_seq = y_scaled[slice(seq_length - 1, None, seq_length)]

# Every window is used as training data
X_train = X_seq
y_train = y_seq

# Define TFT model
class GatedResidualNetwork(tf.keras.layers.Layer):
    """
    Gated feed-forward block with a skip connection and layer normalization.

    Computes ``layer_norm(dense2(dropout(dense1(inputs))) * gate(inputs) + inputs)``.

    Args:
        input_dim: number of units of the output projection and of the gate;
            expected to match the last dimension of the inputs so the skip
            connection can be added.
        state_size: number of units of the hidden ReLU layer.
        dropout_rate: rate of the dropout applied after the hidden layer.
    """

    def __init__(self, input_dim, state_size, dropout_rate):
        super().__init__()
        # Hidden ReLU projection, then a linear projection to `input_dim` units
        self.dense1 = Dense(state_size, activation="relu")
        self.dense2 = Dense(input_dim)
        # Sigmoid gate computed directly from the block input
        self.gate = Dense(input_dim, activation="sigmoid")
        self.layer_norm = LayerNormalization()
        self.dropout = Dropout(dropout_rate)

    def call(self, inputs):
        """Apply the gated transformation to `inputs` and return the normalized result."""
        hidden = self.dropout(self.dense1(inputs))
        candidate = self.dense2(hidden)
        gate_values = self.gate(inputs)
        # Scale the candidate by the gate, then add the untouched input back
        return self.layer_norm(candidate * gate_values + inputs)

class TFTModel(Model):
    """
    Self-attention + gated-residual regressor that outputs one value per sequence.

    Pipeline: multi-head self-attention with a residual connection and layer
    normalization, two `GatedResidualNetwork` blocks, flattening, and a
    Dense(state_size) -> Dropout -> Dense(state_size // 4) -> Dense(1) head.

    Args:
        seq_length: accepted for interface compatibility; not used by the layers.
        feature_dim: used as the attention `key_dim` and as the width of both
            gated residual blocks.
        num_heads: number of attention heads.
        ff_dim: accepted for interface compatibility; not used by the layers.
        state_size: hidden width of the gated residual blocks and of the first
            dense layer of the head.
        dropout_rate: dropout rate used in the gated residual blocks and the head.
    """

    def __init__(self, seq_length, feature_dim, num_heads, ff_dim, state_size, dropout_rate):
        super().__init__()
        # Attention stage
        self.multi_head_attention = MultiHeadAttention(num_heads=num_heads, key_dim=feature_dim)
        self.layer_norm1 = LayerNormalization()
        # Two stacked gated residual blocks
        self.grn1 = GatedResidualNetwork(feature_dim, state_size, dropout_rate)
        self.grn2 = GatedResidualNetwork(feature_dim, state_size, dropout_rate)
        # Regression head
        self.flatten = Flatten()
        self.dense1 = Dense(state_size, activation="relu")
        self.dropout1 = Dropout(dropout_rate)
        self.dense2 = Dense(state_size // 4, activation="relu")
        self.dense3 = Dense(1, activation="linear")

    def call(self, inputs):
        """Run the forward pass and return the single linear output per sequence."""
        # Self-attention: the inputs act as both query and value
        attended = self.multi_head_attention(inputs, inputs)
        encoded = self.layer_norm1(inputs + attended)
        encoded = self.grn2(self.grn1(encoded))
        # Collapse the time and feature axes before the dense head
        head = self.dense1(self.flatten(encoded))
        head = self.dense2(self.dropout1(head))
        return self.dense3(head)

# Set model with fixed hyperparameters
state_size = 80
dropout_rate = 0.2
minibatch_size = 128        # passed to fit() as batch_size
learning_rate = 0.01
max_gradient_norm = 0.01    # passed to Adam as clipnorm
num_heads = 1

# One prediction vector per independently trained model
predictions = []

# The inference windows are identical for every run, so build them once
# instead of once per iteration.
# NOTE(review): these are the same 2020-2023 windows the model is trained on,
# and the training label is the p_era of each window's last row (2023), so the
# "2024" output is the model's fit for each player's latest window. Confirm
# that this is the intended forecasting setup.
X_2024_seq = create_sequences(X_scaled, seq_length)

for iteration in range(5):  # Repeat 5 times
    tf.keras.backend.clear_session()
    model = TFTModel(seq_length=seq_length, feature_dim=X_train.shape[2], num_heads=num_heads, ff_dim=32, state_size=state_size, dropout_rate=dropout_rate)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, clipnorm=max_gradient_norm)
    model.compile(loss="mse", optimizer=optimizer, metrics=["mse", "mae"])

    # Early stopping callback: stop after 10 epochs without val_loss improvement
    # and roll back to the best weights seen.
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # validation_split holds out the LAST 20% of the samples, taken before any
    # shuffling. The windows are ordered by player_id, so without reordering
    # the hold-out would always be the players with the highest ids. Shuffle
    # with a per-run seed so each run validates on a reproducible random subset.
    order = np.random.default_rng(iteration).permutation(len(X_train))

    # Train model
    model.fit(X_train[order], y_train[order], epochs=500, batch_size=minibatch_size, validation_split=0.2, verbose=0, callbacks=[early_stopping])

    # Predict 2024 data (windows kept in player_id order so rows line up with player_ids below)
    y_pred_scaled = model.predict(X_2024_seq)
    predictions.append(scaler_y.inverse_transform(y_pred_scaled).flatten())  # Restore scale and save

# Calculate the average of 5 predictions
average_prediction = np.mean(predictions, axis=0)

# Match with player_id and output results.
# common_data is sorted by player_id, so unique() yields ids in window order.
player_ids = common_data['player_id'].unique()
df_results = pd.DataFrame({
    'player_id': player_ids,
    'predicted_era_2024': average_prediction
})

print(df_results)


    player_id  predicted_era_2024
0      425794            5.223461
1      425844            5.048904
2      448179            5.146493
3      450203            4.051329
4      453286            3.585796
..        ...                 ...
69     666200            4.168821
70     668678            3.646165
71     669203            3.192470
72     669456            3.726211
73     670950            4.784991

[74 rows x 2 columns]


In [None]:
# Predicting 2024 ERA (statcast, Full-tft sl=4)

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, LayerNormalization, Flatten, MultiHeadAttention
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.callbacks import EarlyStopping

# Load dataset
file_path = 'data/statcast.csv'
data = pd.read_csv(file_path)

# Drop unnecessary columns
data = data.drop(columns=['last_name, first_name'])

# Handle missing values (replace with 0)
data = data.fillna(0)

# One-hot encode the pitching hand when the column is present.
# dtype=float keeps the dummy columns numeric: get_dummies may otherwise emit
# bool columns (the default in recent pandas), which would turn the mixed
# feature matrix taken with `.values` below into an object array.
if 'pitch_hand' in data:
    data = pd.get_dummies(data, columns=['pitch_hand'], drop_first=True, dtype=float)

# Use only data from 2020 to 2023
years = [2020, 2021, 2022, 2023]
data = data[data['year'].isin(years)]

# Collect, per season, the set of player_ids that have a row in that season,
# then keep only the players present in all four seasons.
player_ids_2020, player_ids_2021, player_ids_2022, player_ids_2023 = (
    set(data[data['year'] == year]['player_id'].unique()) for year in years
)
common_player_ids = player_ids_2020 & player_ids_2021 & player_ids_2022 & player_ids_2023

# Extract data corresponding to common player_id, ordered by player and then season
common_data = data[data['player_id'].isin(common_player_ids)]
common_data = common_data.sort_values(by=['player_id', 'year'])

# The sequence builder below slices the sorted rows in fixed strides of
# len(years) rows, so every player must contribute exactly one row per season.
# Presence in all seasons is guaranteed by the intersection above; a duplicated
# (player_id, year) row would shift every following player's window and
# misalign it with its label and player_id, so fail loudly instead.
duplicated_rows = common_data.duplicated(subset=['player_id', 'year'], keep=False)
if duplicated_rows.any():
    offending_ids = sorted(common_data.loc[duplicated_rows, 'player_id'].unique().tolist())
    raise ValueError(
        "Expected exactly one row per player per year, but found duplicate "
        f"(player_id, year) rows for player_id(s): {offending_ids}"
    )

# Select necessary columns (everything except the identifiers and the target)
features = [col for col in common_data.columns if col not in ['player_id', 'year', 'p_era']]
target = 'p_era'

# Separate independent and dependent variables
X = common_data[features].values
y = common_data[target].values

# Data scaling: features and target get independent min-max scalers so the
# target scaler can later be used on its own to invert predictions.
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

# Convert to time series data
def create_sequences(X, seq_length):
    """
    Group the rows of ``X`` into back-to-back blocks of ``seq_length`` rows.

    Block ``k`` covers rows ``k * seq_length`` up to (not including)
    ``(k + 1) * seq_length``; a trailing partial block is discarded. The caller
    supplies each player's 2020-2023 rows contiguously, so one block is one
    player's sequence. Returns the blocks stacked via ``np.array``.
    """
    n_blocks = len(X) // seq_length
    block_starts = (k * seq_length for k in range(n_blocks))
    return np.array([X[begin:begin + seq_length] for begin in block_starts])

# Window length: one row per season, 2020-2023
seq_length = 4
X_seq = create_sequences(X_scaled, seq_length)
# Label of a window is the scaled p_era on its last row
# (rows seq_length - 1, 2 * seq_length - 1, ...)
y_seq = y_scaled[slice(seq_length - 1, None, seq_length)]

# Every window is used as training data
X_train = X_seq
y_train = y_seq

# Define TFT model
class GatedResidualNetwork(tf.keras.layers.Layer):
    """
    Gated feed-forward block with a skip connection and layer normalization.

    Computes ``layer_norm(dense2(dropout(dense1(inputs))) * gate(inputs) + inputs)``.

    Args:
        input_dim: number of units of the output projection and of the gate;
            expected to match the last dimension of the inputs so the skip
            connection can be added.
        state_size: number of units of the hidden ReLU layer.
        dropout_rate: rate of the dropout applied after the hidden layer.
    """

    def __init__(self, input_dim, state_size, dropout_rate):
        super().__init__()
        # Hidden ReLU projection, then a linear projection to `input_dim` units
        self.dense1 = Dense(state_size, activation="relu")
        self.dense2 = Dense(input_dim)
        # Sigmoid gate computed directly from the block input
        self.gate = Dense(input_dim, activation="sigmoid")
        self.layer_norm = LayerNormalization()
        self.dropout = Dropout(dropout_rate)

    def call(self, inputs):
        """Apply the gated transformation to `inputs` and return the normalized result."""
        hidden = self.dropout(self.dense1(inputs))
        candidate = self.dense2(hidden)
        gate_values = self.gate(inputs)
        # Scale the candidate by the gate, then add the untouched input back
        return self.layer_norm(candidate * gate_values + inputs)

class TFTModel(Model):
    """
    Self-attention + gated-residual regressor that outputs one value per sequence.

    Pipeline: multi-head self-attention with a residual connection and layer
    normalization, two `GatedResidualNetwork` blocks, flattening, and a
    Dense(state_size) -> Dropout -> Dense(state_size // 4) -> Dense(1) head.

    Args:
        seq_length: accepted for interface compatibility; not used by the layers.
        feature_dim: used as the attention `key_dim` and as the width of both
            gated residual blocks.
        num_heads: number of attention heads.
        ff_dim: accepted for interface compatibility; not used by the layers.
        state_size: hidden width of the gated residual blocks and of the first
            dense layer of the head.
        dropout_rate: dropout rate used in the gated residual blocks and the head.
    """

    def __init__(self, seq_length, feature_dim, num_heads, ff_dim, state_size, dropout_rate):
        super().__init__()
        # Attention stage
        self.multi_head_attention = MultiHeadAttention(num_heads=num_heads, key_dim=feature_dim)
        self.layer_norm1 = LayerNormalization()
        # Two stacked gated residual blocks
        self.grn1 = GatedResidualNetwork(feature_dim, state_size, dropout_rate)
        self.grn2 = GatedResidualNetwork(feature_dim, state_size, dropout_rate)
        # Regression head
        self.flatten = Flatten()
        self.dense1 = Dense(state_size, activation="relu")
        self.dropout1 = Dropout(dropout_rate)
        self.dense2 = Dense(state_size // 4, activation="relu")
        self.dense3 = Dense(1, activation="linear")

    def call(self, inputs):
        """Run the forward pass and return the single linear output per sequence."""
        # Self-attention: the inputs act as both query and value
        attended = self.multi_head_attention(inputs, inputs)
        encoded = self.layer_norm1(inputs + attended)
        encoded = self.grn2(self.grn1(encoded))
        # Collapse the time and feature axes before the dense head
        head = self.dense1(self.flatten(encoded))
        head = self.dense2(self.dropout1(head))
        return self.dense3(head)

# Set model with fixed hyperparameters
state_size = 320
dropout_rate = 0.4
minibatch_size = 64         # passed to fit() as batch_size
learning_rate = 0.001
max_gradient_norm = 0.01    # passed to Adam as clipnorm
num_heads = 4

# One prediction vector per independently trained model
predictions = []

# The inference windows are identical for every run, so build them once
# instead of once per iteration.
# NOTE(review): these are the same 2020-2023 windows the model is trained on,
# and the training label is the p_era of each window's last row (2023), so the
# "2024" output is the model's fit for each player's latest window. Confirm
# that this is the intended forecasting setup.
X_2024_seq = create_sequences(X_scaled, seq_length)

for iteration in range(5):  # Repeat 5 times
    tf.keras.backend.clear_session()
    model = TFTModel(seq_length=seq_length, feature_dim=X_train.shape[2], num_heads=num_heads, ff_dim=32, state_size=state_size, dropout_rate=dropout_rate)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, clipnorm=max_gradient_norm)
    model.compile(loss="mse", optimizer=optimizer, metrics=["mse", "mae"])

    # Early stopping callback: stop after 10 epochs without val_loss improvement
    # and roll back to the best weights seen.
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # validation_split holds out the LAST 20% of the samples, taken before any
    # shuffling. The windows are ordered by player_id, so without reordering
    # the hold-out would always be the players with the highest ids. Shuffle
    # with a per-run seed so each run validates on a reproducible random subset.
    order = np.random.default_rng(iteration).permutation(len(X_train))

    # Train model
    model.fit(X_train[order], y_train[order], epochs=500, batch_size=minibatch_size, validation_split=0.2, verbose=0, callbacks=[early_stopping])

    # Predict 2024 data (windows kept in player_id order so rows line up with player_ids below)
    y_pred_scaled = model.predict(X_2024_seq)
    predictions.append(scaler_y.inverse_transform(y_pred_scaled).flatten())  # Restore scale and save

# Calculate average of 5 predictions
average_prediction = np.mean(predictions, axis=0)

# Match with player_id and output results.
# common_data is sorted by player_id, so unique() yields ids in window order.
player_ids = common_data['player_id'].unique()
df_results_mps = pd.DataFrame({
    'player_id': player_ids,
    'predicted_era_2024': average_prediction
})

print(df_results_mps)


    player_id  predicted_era_2024
0      425794            6.882403
1      425844            5.394585
2      448179            5.186832
3      450203            4.358220
4      453286            3.927478
..        ...                 ...
69     666200            4.312869
70     668678            3.677975
71     669203            3.637825
72     669456            3.955359
73     670950            4.200325

[74 rows x 2 columns]
