In [1]:
# GRU

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout, Flatten
from tensorflow.keras import optimizers
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from tensorflow.keras.callbacks import EarlyStopping

# Load dataset
file_path = 'data/statcast.csv'
data = pd.read_csv(file_path)

# Drop unnecessary columns
data = data.drop(columns=['last_name, first_name'])

# Fill missing values with 0
data = data.fillna(0)

if 'pitch_hand' in data:
    data = pd.get_dummies(data, columns=['pitch_hand'], drop_first=True)

# Extract player_ids that exist in all years (2019, 2020, 2021, 2022, 2023)
data_2019 = data[data['year'] == 2019]
data_2020 = data[data['year'] == 2020]
data_2021 = data[data['year'] == 2021]
data_2022 = data[data['year'] == 2022]
data_2023 = data[data['year'] == 2023]

player_ids_2019 = set(data_2019['player_id'].unique())
player_ids_2020 = set(data_2020['player_id'].unique())
player_ids_2021 = set(data_2021['player_id'].unique())
player_ids_2022 = set(data_2022['player_id'].unique())
player_ids_2023 = set(data_2023['player_id'].unique())

common_player_ids = player_ids_2019 & player_ids_2020 & player_ids_2021 & player_ids_2022 & player_ids_2023

# Extract data for common player_ids
common_data = data[data['player_id'].isin(common_player_ids)]

# Extract data for the years 2019, 2020, 2021, 2022
final = common_data[common_data['year'].isin([2019, 2020, 2021, 2022])]
final = final.sort_values(by=['player_id', 'year'])

# Select necessary columns (excluding year)
features = [col for col in final.columns if col not in ['player_id', 'year', 'p_era']]
target = 'p_era'

# Separate independent and dependent variables
X = final[features].values
y = final[target].values

# Data scaling
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

# Convert to time series data
def create_sequences(X, y, seq_length):
    X_seq, y_seq = [], []
    for i in range(len(X)):
        seq_x = X[max(0, i - seq_length + 1):i + 1]
        seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
        seq_y = y[i]
        X_seq.append(seq_x)
        y_seq.append(seq_y)
    return np.array(X_seq), np.array(y_seq)

seq_length = 4  # Set sequence length
X_seq, y_seq = create_sequences(X_scaled, y_scaled, seq_length)

# Set training data to be the same as the entire data
X_train, y_train = X_seq, y_seq

# Initialize lists to store metrics
iterations = 5
rmse_list = []
mae_list = []
mape_list = []

for i in range(iterations):
    # Define GRU model
    model_GRU = Sequential()
    model_GRU.add(GRU(64, input_shape=(seq_length, X_train.shape[2]), return_sequences=True))
    model_GRU.add(GRU(64, return_sequences=True))
    model_GRU.add(Dropout(rate=0.5))
    model_GRU.add(Flatten())
    model_GRU.add(Dense(512, activation="relu"))
    model_GRU.add(Dropout(rate=0.5))
    model_GRU.add(Dense(64, activation="relu"))
    model_GRU.add(Dense(1, activation='relu'))

    # Compile
    adam = optimizers.Adam(learning_rate=0.001)
    model_GRU.compile(loss="mse", optimizer=adam, metrics=["accuracy"])

    # Early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Train model
    history = model_GRU.fit(X_train, y_train, epochs=500, batch_size=64, validation_split=0.2, verbose=0, callbacks=[early_stopping])

    # Filter 2023 data
    data_23 = common_data[common_data['year'] == 2023]

    # Scale 2023 data
    X_2023_scaled = scaler_X.transform(data_23[features].values)

    # Convert to time series data (use 2019~2022 data to predict 2023)
    def create_sequences_for_prediction(X, seq_length):
        X_seq = []
        for i in range(len(X)):
            seq_x = X[max(0, i - seq_length + 1):i + 1]
            seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
            X_seq.append(seq_x)
        return np.array(X_seq)

    X_2023_seq = create_sequences_for_prediction(X_2023_scaled, seq_length)

    # Predict 2023 data
    y_pred_scaled = model_GRU.predict(X_2023_seq)

    # Inverse scale
    y_pred = scaler_y.inverse_transform(y_pred_scaled)

    # Restore actual 2023 p_era values for comparison
    y_test_actual = data_23[target].values

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_test_actual[:len(y_pred)], y_pred))
    rmse_list.append(rmse)

    # Calculate MAE
    mae = mean_absolute_error(y_test_actual[:len(y_pred)], y_pred)
    mae_list.append(mae)

    # Calculate MAPE
    mape = mean_absolute_percentage_error(y_test_actual[:len(y_pred)], y_pred)
    mape_list.append(mape)

    print(f'Iteration {i+1}/{iterations} - RMSE: {rmse}, MAE: {mae}, MAPE: {mape}')

# Calculate average RMSE, MAE, MAPE
avg_rmse = np.mean(rmse_list)
avg_mae = np.mean(mae_list)
avg_mape = np.mean(mape_list)

print(f'Average RMSE: {avg_rmse}')
print(f'Average MAE: {avg_mae}')
print(f'Average MAPE: {avg_mape}')

Iteration 1/5 - RMSE: 0.9560276570535653, MAE: 0.7419234699673122, MAPE: 0.1608599672082652
Iteration 2/5 - RMSE: 0.7374423225844757, MAE: 0.5773915800972591, MAPE: 0.1284125915067521
Iteration 3/5 - RMSE: 0.821518730841329, MAE: 0.6317891681005086, MAPE: 0.13700516759923587
Iteration 4/5 - RMSE: 0.7394755412656008, MAE: 0.5792420300983249, MAPE: 0.12761320697851947
Iteration 5/5 - RMSE: 1.2562089910994154, MAE: 0.9501905084034755, MAPE: 0.21161174181160855
Average RMSE: 0.9021346485688774
Average MAE: 0.696107351333376
Average MAPE: 0.15310053502087623


In [2]:
# Bi-GRU

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout, Flatten, Bidirectional
from tensorflow.keras import optimizers
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from tensorflow.keras.callbacks import EarlyStopping

# Load dataset
file_path = 'data/statcast.csv'
data = pd.read_csv(file_path)

# Drop unnecessary columns
data = data.drop(columns=['last_name, first_name'])

# Fill missing values with 0
data = data.fillna(0)

if 'pitch_hand' in data:
    data = pd.get_dummies(data, columns=['pitch_hand'], drop_first=True)

# Extract player_ids that exist in 2019, 2020, 2021, 2022, and 2023
data_2019 = data[data['year'] == 2019]
data_2020 = data[data['year'] == 2020]
data_2021 = data[data['year'] == 2021]
data_2022 = data[data['year'] == 2022]
data_2023 = data[data['year'] == 2023]

player_ids_2019 = set(data_2019['player_id'].unique())
player_ids_2020 = set(data_2020['player_id'].unique())
player_ids_2021 = set(data_2021['player_id'].unique())
player_ids_2022 = set(data_2022['player_id'].unique())
player_ids_2023 = set(data_2023['player_id'].unique())

common_player_ids = player_ids_2019 & player_ids_2020 & player_ids_2021 & player_ids_2022 & player_ids_2023

# Extract data for common player_ids
common_data = data[data['player_id'].isin(common_player_ids)]

# Extract data for the years 2019, 2020, 2021, and 2022
final = common_data[common_data['year'].isin([2019, 2020, 2021, 2022])]
final = final.sort_values(by=['player_id', 'year'])
# Select necessary columns (excluding year)
features = [col for col in final.columns if col not in ['player_id', 'year', 'p_era']]
target = 'p_era'

# Separate independent and dependent variables
X = final[features].values
y = final[target].values

# Data scaling
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

# Convert to time series data format
def create_sequences(X, y, seq_length):
    X_seq, y_seq = [], []
    for i in range(len(X)):
        seq_x = X[max(0, i - seq_length + 1):i + 1]
        seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
        seq_y = y[i]
        X_seq.append(seq_x)
        y_seq.append(seq_y)
    return np.array(X_seq), np.array(y_seq)

seq_length = 4  # Set sequence length
X_seq, y_seq = create_sequences(X_scaled, y_scaled, seq_length)

# Set training data to be the same as the entire data
X_train, y_train = X_seq, y_seq

# Initialize lists to store metrics
iterations = 5
rmse_list = []
mae_list = []
mape_list = []

for i in range(iterations):
    # Define Bidirectional GRU model
    model_BiGRU = Sequential()
    model_BiGRU.add(Bidirectional(GRU(64, return_sequences=True), input_shape=(seq_length, X_train.shape[2])))
    model_BiGRU.add(Bidirectional(GRU(64, return_sequences=True)))
    model_BiGRU.add(Dropout(rate=0.5))
    model_BiGRU.add(Flatten())
    model_BiGRU.add(Dense(512, activation="relu"))
    model_BiGRU.add(Dropout(rate=0.5))
    model_BiGRU.add(Dense(64, activation="relu"))
    model_BiGRU.add(Dense(1, activation='relu'))

    # Compile
    adam = optimizers.Adam(learning_rate=0.001)
    model_BiGRU.compile(loss="mse", optimizer=adam, metrics=["accuracy"])

    # Early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Train model
    history = model_BiGRU.fit(X_train, y_train, epochs=500, batch_size=64, validation_split=0.2, verbose=0, callbacks=[early_stopping])

    # Filter 2023 data
    data_23 = common_data[common_data['year'] == 2023]

    # Scale 2023 data
    X_2023_scaled = scaler_X.transform(data_23[features].values)
    
    def create_sequences_for_prediction(X, seq_length):
        X_seq = []
        for i in range(len(X)):
            seq_x = X[max(0, i - seq_length + 1):i + 1]
            seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
            X_seq.append(seq_x)
        return np.array(X_seq)

    # Convert to time series data format (using 2022, 2021, 2020, 2019 data to predict 2023)
    X_2023_seq = create_sequences_for_prediction(X_2023_scaled, seq_length)

    # Predict 2023 data
    y_pred_scaled = model_BiGRU.predict(X_2023_seq)

    # Restore scale
    y_pred = scaler_y.inverse_transform(y_pred_scaled)

    # Restore actual 2023 p_era values for comparison
    y_test_actual = data_23[target].values

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_test_actual[:len(y_pred)], y_pred))
    rmse_list.append(rmse)

    # Calculate MAE
    mae = mean_absolute_error(y_test_actual[:len(y_pred)], y_pred)
    mae_list.append(mae)

    # Calculate MAPE
    mape = mean_absolute_percentage_error(y_test_actual[:len(y_pred)], y_pred)
    mape_list.append(mape)

    print(f'Iteration {i+1}/{iterations} - RMSE: {rmse}, MAE: {mae}, MAPE: {mape}')

# Calculate average RMSE, MAE, MAPE
avg_rmse = np.mean(rmse_list)
avg_mae = np.mean(mae_list)
avg_mape = np.mean(mape_list)

print(f'Average RMSE: {avg_rmse}')
print(f'Average MAE: {avg_mae}')
print(f'Average MAPE: {avg_mape}')

Iteration 1/5 - RMSE: 0.7931281723594807, MAE: 0.629649693625314, MAPE: 0.1423023958899047
Iteration 2/5 - RMSE: 0.7299522165602768, MAE: 0.5534959970201765, MAPE: 0.12294412624929221
Iteration 3/5 - RMSE: 1.071830346899878, MAE: 0.812652264246865, MAPE: 0.1779589862626963
Iteration 4/5 - RMSE: 0.8691483939366246, MAE: 0.6831056764390736, MAPE: 0.14924783356663113
Iteration 5/5 - RMSE: 1.0570913337836403, MAE: 0.8410065548003668, MAPE: 0.18484117194270883
Average RMSE: 0.90423009270798
Average MAE: 0.7039820372263591
Average MAPE: 0.1554589027822466


In [3]:
# biattention GRU

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import GRU, Dense, Dropout, Flatten, Bidirectional, Input, Layer, dot, concatenate, Activation
from tensorflow.keras import optimizers
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow.keras.backend as K

# Load dataset
file_path = 'data/statcast.csv'
data = pd.read_csv(file_path)

# Drop unnecessary columns
data = data.drop(columns=['last_name, first_name'])

# Fill missing values with 0
data = data.fillna(0)

if 'pitch_hand' in data:
    data = pd.get_dummies(data, columns=['pitch_hand'], drop_first=True)

# Extract player_ids that exist in 2019, 2020, 2021, 2022, and 2023
data_2019 = data[data['year'] == 2019]
data_2020 = data[data['year'] == 2020]
data_2021 = data[data['year'] == 2021]
data_2022 = data[data['year'] == 2022]
data_2023 = data[data['year'] == 2023]

player_ids_2019 = set(data_2019['player_id'].unique())
player_ids_2020 = set(data_2020['player_id'].unique())
player_ids_2021 = set(data_2021['player_id'].unique())
player_ids_2022 = set(data_2022['player_id'].unique())
player_ids_2023 = set(data_2023['player_id'].unique())

common_player_ids = player_ids_2019 & player_ids_2020 & player_ids_2021 & player_ids_2022 & player_ids_2023

# Extract data for common player_ids
common_data = data[data['player_id'].isin(common_player_ids)]

# Extract data for the years 2019, 2020, 2021, and 2022
final = common_data[common_data['year'].isin([2019, 2020, 2021, 2022])]
final = final.sort_values(by=['player_id', 'year'])

# Select necessary columns (excluding year)
features = [col for col in final.columns if col not in ['player_id', 'year', 'p_era']]
target = 'p_era'

# Separate independent and dependent variables
X = final[features].values
y = final[target].values

# Data scaling
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

# Convert to time series data
def create_sequences(X, y, seq_length):
    X_seq, y_seq = [], []
    for i in range(len(X)):
        seq_x = X[max(0, i - seq_length + 1):i + 1]
        seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
        seq_y = y[i]
        X_seq.append(seq_x)
        y_seq.append(seq_y)
    return np.array(X_seq), np.array(y_seq)

seq_length = 4  # Set sequence length
X_seq, y_seq = create_sequences(X_scaled, y_scaled, seq_length)

# Set training data to be the same as the entire data
X_train, y_train = X_seq, y_seq

# Define custom Attention Layer
class Attention(Layer):
    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], input_shape[-1]),
                                 initializer="glorot_uniform", trainable=True)
        self.b = self.add_weight(name="att_bias", shape=(input_shape[-1],),
                                 initializer="glorot_uniform", trainable=True)
        super(Attention, self).build(input_shape)

    def call(self, x):
        e = K.tanh(K.dot(x, self.W) + self.b)
        a = K.softmax(e, axis=1)
        output = x * a
        return K.sum(output, axis=1)

# Initialize lists to store metrics
iterations = 5
rmse_list = []
mae_list = []
mape_list = []

for i in range(iterations):
    # Define BiAttention GRU model
    input_seq = Input(shape=(seq_length, X_train.shape[2]))
    
    # Bidirectional GRU
    x = Bidirectional(GRU(64, return_sequences=True))(input_seq)
    x = Bidirectional(GRU(64, return_sequences=True))(x)
    
    # Apply Attention Layer
    attn_out = Attention()(x)
    
    # Fully connected layers
    x = Dense(512, activation="relu")(attn_out)
    x = Dropout(rate=0.5)(x)
    x = Dense(64, activation="relu")(x)
    output = Dense(1, activation='linear')(x)
    
    model_BiAttGRU = Model(inputs=input_seq, outputs=output)

    # Compile
    adam = optimizers.Adam(learning_rate=0.001)
    model_BiAttGRU.compile(loss="mse", optimizer=adam, metrics=["accuracy"])

    # Early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Train model
    history = model_BiAttGRU.fit(X_train, y_train, epochs=500, batch_size=64, validation_split=0.2, verbose=0, callbacks=[early_stopping])

    # Filter 2023 data
    data_23 = common_data[common_data['year'] == 2023]

    # Scale 2023 data
    X_2023_scaled = scaler_X.transform(data_23[features].values)
    
    def create_sequences_for_prediction(X, seq_length):
        X_seq = []
        for i in range(len(X)):
            seq_x = X[max(0, i - seq_length + 1):i + 1]
            seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
            X_seq.append(seq_x)
        return np.array(X_seq)

    # Convert to time series data (using 2022, 2021, 2020, 2019 data to predict 2023)
    X_2023_seq = create_sequences_for_prediction(X_2023_scaled, seq_length)

    # Predict 2023 data
    y_pred_scaled = model_BiAttGRU.predict(X_2023_seq)

    # Inverse scale
    y_pred = scaler_y.inverse_transform(y_pred_scaled)

    # Restore actual 2023 p_era values for comparison
    y_test_actual = data_23[target].values

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_test_actual[:len(y_pred)], y_pred))
    rmse_list.append(rmse)

    # Calculate MAE
    mae = mean_absolute_error(y_test_actual[:len(y_pred)], y_pred)
    mae_list.append(mae)

    # Calculate MAPE
    mape = mean_absolute_percentage_error(y_test_actual[:len(y_pred)], y_pred)
    mape_list.append(mape)

    print(f'Iteration {i+1}/{iterations} - RMSE: {rmse}, MAE: {mae}, MAPE: {mape}')

# Calculate average RMSE, MAE, MAPE
avg_rmse = np.mean(rmse_list)
avg_mae = np.mean(mae_list)
avg_mape = np.mean(mape_list)

print(f'Average RMSE: {avg_rmse}')
print(f'Average MAE: {avg_mae}')
print(f'Average MAPE: {avg_mape}')

Iteration 1/5 - RMSE: 0.6728226222799613, MAE: 0.5427144294314915, MAPE: 0.13178833969495776
Iteration 2/5 - RMSE: 0.6679693451546063, MAE: 0.5444474429175966, MAPE: 0.1264926413289803
Iteration 3/5 - RMSE: 0.75527691619252, MAE: 0.5784723277319046, MAPE: 0.13671451777896154
Iteration 4/5 - RMSE: 0.7608847236833624, MAE: 0.59522155943371, MAPE: 0.14035239088587442
Iteration 5/5 - RMSE: 0.6912876747879596, MAE: 0.5544204695262607, MAPE: 0.13248522468930424
Average RMSE: 0.7096482564196819
Average MAE: 0.5630552458081928
Average MAPE: 0.13356662287561566
