In [1]:
# BiLSTM

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout, Flatten
from tensorflow.keras import optimizers
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from tensorflow.keras.callbacks import EarlyStopping

# Load dataset
file_path = 'data/basic.csv'
data = pd.read_csv(file_path)

# Drop unnecessary columns
data = data.drop(columns=['last_name, first_name'])

# Fill missing values with 0
data = data.fillna(0)

# Extract player_ids that exist in 2016, 2017, 2018, and 2019
data_2016 = data[data['year'] == 2016]
data_2017 = data[data['year'] == 2017]
data_2018 = data[data['year'] == 2018]
data_2019 = data[data['year'] == 2019]

player_ids_2016 = set(data_2016['player_id'].unique())
player_ids_2017 = set(data_2017['player_id'].unique())
player_ids_2018 = set(data_2018['player_id'].unique())
player_ids_2019 = set(data_2019['player_id'].unique())

common_player_ids = player_ids_2016 & player_ids_2017 & player_ids_2018 & player_ids_2019

# Extract data corresponding to common player_ids
common_data = data[data['player_id'].isin(common_player_ids)]

# Extract data corresponding to 2016, 2017, 2018
final = common_data[common_data['year'].isin([2016, 2017, 2018])]
final = final.sort_values(by=['player_id', 'year'])

# Select necessary columns (excluding year)
features = [col for col in final.columns if col not in ['player_id', 'year', 'p_era']]
target = 'p_era'

# Split independent and dependent variables
X = final[features].values
y = final[target].values

# Scale the data
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

# Function to create sequences for time series data
def create_sequences(X, y, seq_length):
    X_seq, y_seq = [], []
    for i in range(len(X)):
        seq_x = X[max(0, i - seq_length + 1):i + 1]
        seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
        seq_y = y[i]
        X_seq.append(seq_x)
        y_seq.append(seq_y)
    return np.array(X_seq), np.array(y_seq)

seq_length = 3  # Set sequence length
X_seq, y_seq = create_sequences(X_scaled, y_scaled, seq_length)

# Set training data
X_train, y_train = X_seq, y_seq

# Initialize lists to store metrics
iterations = 5
rmse_list = []
mae_list = []
mape_list = []

for i in range(iterations):
    # Initialize the BiLSTM model
    model_BiLSTM = Sequential()
    model_BiLSTM.add(Bidirectional(LSTM(64, return_sequences=True), input_shape=(seq_length, X_train.shape[2])))
    model_BiLSTM.add(Bidirectional(LSTM(64, return_sequences=True)))
    model_BiLSTM.add(Dropout(rate=0.5))
    model_BiLSTM.add(Flatten())
    model_BiLSTM.add(Dense(512, activation="relu"))
    model_BiLSTM.add(Dropout(rate=0.5))
    model_BiLSTM.add(Dense(64, activation="relu"))
    model_BiLSTM.add(Dense(1, activation='relu'))

    # Compile the model
    adam = optimizers.Adam(learning_rate=0.001)
    model_BiLSTM.compile(loss="mse", optimizer=adam, metrics=["accuracy"])

    # Early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Train the model
    history_BiLSTM = model_BiLSTM.fit(X_train, y_train, epochs=500, batch_size=64, validation_split=0.2, verbose=0, callbacks=[early_stopping])

    # Filter 2019 data
    data_19 = common_data[common_data['year'] == 2019]

    # Scale 2019 data
    X_2019_scaled = scaler_X.transform(data_19[features].values)

    # Function to create sequences for prediction
    def create_sequences_for_prediction(X, seq_length):
        X_seq = []
        for i in range(len(X)):
            seq_x = X[max(0, i - seq_length + 1):i + 1]
            seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
            X_seq.append(seq_x)
        return np.array(X_seq)

    X_2019_seq = create_sequences_for_prediction(X_2019_scaled, seq_length)

    # Predict 2019 data
    y_pred_scaled_BiLSTM = model_BiLSTM.predict(X_2019_seq)

    # Inverse scale the predictions
    y_pred_BiLSTM = scaler_y.inverse_transform(y_pred_scaled_BiLSTM)

    # Actual 2019 p_era values
    y_test_actual = data_19[target].values

    # Calculate RMSE
    rmse_BiLSTM = np.sqrt(mean_squared_error(y_test_actual, y_pred_BiLSTM))
    rmse_list.append(rmse_BiLSTM)

    # Calculate MAE
    mae_BiLSTM = mean_absolute_error(y_test_actual, y_pred_BiLSTM)
    mae_list.append(mae_BiLSTM)
    
    # Calculate MAPE
    mape_BiLSTM = mean_absolute_percentage_error(y_test_actual, y_pred_BiLSTM)
    mape_list.append(mape_BiLSTM)

    print(f'Iteration {i+1}/{iterations} - RMSE: {rmse_BiLSTM}, MAE: {mae_BiLSTM}, MAPE: {mape_BiLSTM}')

# Calculate average metrics
avg_rmse = np.mean(rmse_list)
avg_mae = np.mean(mae_list)
avg_mape = np.mean(mape_list)

print(f'Average RMSE: {avg_rmse}')
print(f'Average MAE: {avg_mae}')
print(f'Average MAPE: {avg_mape}')

Iteration 1/5 - RMSE: 0.662454275640596, MAE: 0.4997886935869853, MAPE: 0.12612836504215238
Iteration 2/5 - RMSE: 0.6885740569393176, MAE: 0.5076321281137921, MAPE: 0.12584478848315314
Iteration 3/5 - RMSE: 0.6573902796347724, MAE: 0.4949054245721727, MAPE: 0.12404524726482763
Iteration 4/5 - RMSE: 0.6402646799452496, MAE: 0.4821146085148766, MAPE: 0.11961239170502991
Iteration 5/5 - RMSE: 0.6443625789074132, MAE: 0.48374063429378333, MAPE: 0.12124944605537043
Average RMSE: 0.6586091742134698
Average MAE: 0.49363629781632207
Average MAPE: 0.12337604771010671


In [2]:
# CNN-BiLSTM

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout, Flatten, Conv1D, MaxPooling1D, Reshape
from tensorflow.keras import optimizers
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from tensorflow.keras.callbacks import EarlyStopping

# Load the dataset
file_path = 'data/basic.csv'
data = pd.read_csv(file_path)

# Drop unnecessary columns
data = data.drop(columns=['last_name, first_name'])

# Fill missing values with 0
data = data.fillna(0)

# Extract player_ids that exist in 2016, 2017, 2018, and 2019
data_2016 = data[data['year'] == 2016]
data_2017 = data[data['year'] == 2017]
data_2018 = data[data['year'] == 2018]
data_2019 = data[data['year'] == 2019]

player_ids_2016 = set(data_2016['player_id'].unique())
player_ids_2017 = set(data_2017['player_id'].unique())
player_ids_2018 = set(data_2018['player_id'].unique())
player_ids_2019 = set(data_2019['player_id'].unique())

common_player_ids = player_ids_2016 & player_ids_2017 & player_ids_2018 & player_ids_2019

# Extract data corresponding to common player_ids
common_data = data[data['player_id'].isin(common_player_ids)]

# Extract data corresponding to the years 2016, 2017, and 2018
final = common_data[common_data['year'].isin([2016, 2017, 2018])]
final = final.sort_values(by=['player_id', 'year'])

# Select necessary columns (excluding year)
features = [col for col in final.columns if col not in ['player_id', 'year', 'p_era']]
target = 'p_era'

# Split independent and dependent variables
X = final[features].values
y = final[target].values

# Scale the data
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

# Function to create sequences for time series data
def create_sequences(X, y, seq_length):
    X_seq, y_seq = [], []
    for i in range(len(X)):
        seq_x = X[max(0, i - seq_length + 1):i + 1]
        seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
        seq_y = y[i]
        X_seq.append(seq_x)
        y_seq.append(seq_y)
    return np.array(X_seq), np.array(y_seq)

seq_length = 3  # Set sequence length
X_seq, y_seq = create_sequences(X_scaled, y_scaled, seq_length)

# Set training data
X_train, y_train = X_seq, y_seq

# Initialize lists to store metrics
iterations = 5
rmse_list = []
mae_list = []
mape_list = []

for i in range(iterations):
    # Initialize the CNN-BiLSTM model
    model_CNN_BiLSTM = Sequential()
    model_CNN_BiLSTM.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(seq_length, X_train.shape[2])))
    model_CNN_BiLSTM.add(Flatten())
    model_CNN_BiLSTM.add(Dense(64, activation='relu'))
    model_CNN_BiLSTM.add(Reshape((1, 64)))
    model_CNN_BiLSTM.add(Bidirectional(LSTM(64, return_sequences=True)))
    model_CNN_BiLSTM.add(Bidirectional(LSTM(64)))
    model_CNN_BiLSTM.add(Dense(1))

    # Compile the model
    adam = optimizers.Adam(learning_rate=0.001)
    model_CNN_BiLSTM.compile(loss="mse", optimizer=adam, metrics=["accuracy"])

    # Early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Train the model
    history_CNN_BiLSTM = model_CNN_BiLSTM.fit(X_train, y_train, epochs=500, batch_size=64, validation_split=0.2, verbose=0, callbacks=[early_stopping])

    # Filter 2019 data
    data_19 = common_data[common_data['year'] == 2019]

    # Scale 2019 data
    X_2019_scaled = scaler_X.transform(data_19[features].values)

    # Function to create sequences for prediction
    def create_sequences_for_prediction(X, seq_length):
        X_seq = []
        for i in range(len(X)):
            seq_x = X[max(0, i - seq_length + 1):i + 1]
            seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
            X_seq.append(seq_x)
        return np.array(X_seq)

    X_2019_seq = create_sequences_for_prediction(X_2019_scaled, seq_length)

    # Predict 2019 data
    y_pred_scaled_CNN_BiLSTM = model_CNN_BiLSTM.predict(X_2019_seq)

    # Inverse scale the predictions
    y_pred_CNN_BiLSTM = scaler_y.inverse_transform(y_pred_scaled_CNN_BiLSTM)

    # Actual 2019 p_era values
    y_test_actual = data_19[target].values

    # Calculate RMSE
    rmse_CNN_BiLSTM = np.sqrt(mean_squared_error(y_test_actual, y_pred_CNN_BiLSTM))
    rmse_list.append(rmse_CNN_BiLSTM)

    # Calculate MAE
    mae_CNN_BiLSTM = mean_absolute_error(y_test_actual, y_pred_CNN_BiLSTM)
    mae_list.append(mae_CNN_BiLSTM)
    
    # Calculate MAPE
    mape_CNN_BiLSTM = mean_absolute_percentage_error(y_test_actual, y_pred_CNN_BiLSTM)
    mape_list.append(mape_CNN_BiLSTM)

    print(f'Iteration {i+1}/{iterations} - RMSE: {rmse_CNN_BiLSTM}, MAE: {mae_CNN_BiLSTM}, MAPE: {mape_CNN_BiLSTM}')

# Calculate average metrics
avg_rmse = np.mean(rmse_list)
avg_mae = np.mean(mae_list)
avg_mape = np.mean(mape_list)

print(f'Average RMSE: {avg_rmse}')
print(f'Average MAE: {avg_mae}')
print(f'Average MAPE: {avg_mape}')

Iteration 1/5 - RMSE: 0.6931566370931409, MAE: 0.5304834375778834, MAPE: 0.13284108749605708
Iteration 2/5 - RMSE: 0.6866524138542723, MAE: 0.5218686689365478, MAPE: 0.12957913403714583
Iteration 3/5 - RMSE: 0.654233870918899, MAE: 0.48372688722042817, MAPE: 0.12065783014902304
Iteration 4/5 - RMSE: 0.6524609789832243, MAE: 0.48428567562784475, MAPE: 0.12156705243817038
Iteration 5/5 - RMSE: 0.6739845199168804, MAE: 0.5107364993435995, MAPE: 0.12679892909087215
Average RMSE: 0.6720976841532833
Average MAE: 0.5062202337412607
Average MAPE: 0.1262888066422537


In [3]:
# BiLSTM-ED

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout, Flatten, RepeatVector, TimeDistributed
from tensorflow.keras import optimizers
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from tensorflow.keras.callbacks import EarlyStopping

# Load the dataset
file_path = 'data/basic.csv'
data = pd.read_csv(file_path)

# Drop unnecessary columns
data = data.drop(columns=['last_name, first_name'])

# Fill missing values with 0
data = data.fillna(0)

# Extract player_ids that exist in 2016, 2017, 2018, and 2019
data_2016 = data[data['year'] == 2016]
data_2017 = data[data['year'] == 2017]
data_2018 = data[data['year'] == 2018]
data_2019 = data[data['year'] == 2019]

player_ids_2016 = set(data_2016['player_id'].unique())
player_ids_2017 = set(data_2017['player_id'].unique())
player_ids_2018 = set(data_2018['player_id'].unique())
player_ids_2019 = set(data_2019['player_id'].unique())

common_player_ids = player_ids_2016 & player_ids_2017 & player_ids_2018 & player_ids_2019

# Extract data corresponding to common player_ids
common_data = data[data['player_id'].isin(common_player_ids)]

# Extract data corresponding to 2016, 2017, 2018
final = common_data[common_data['year'].isin([2016, 2017, 2018])]
final = final.sort_values(by=['player_id', 'year'])

# Select necessary columns (excluding year)
features = [col for col in final.columns if col not in ['player_id', 'year', 'p_era']]
target = 'p_era'

# Split independent and dependent variables
X = final[features].values
y = final[target].values

# Scale the data
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

# Function to create sequences for time series data
def create_sequences(X, y, seq_length):
    X_seq, y_seq = [], []
    for i in range(len(X)):
        seq_x = X[max(0, i - seq_length + 1):i + 1]
        seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
        seq_y = y[i]
        X_seq.append(seq_x)
        y_seq.append(seq_y)
    return np.array(X_seq), np.array(y_seq)

seq_length = 3  # Set sequence length
X_seq, y_seq = create_sequences(X_scaled, y_scaled, seq_length)

# Set training data
X_train, y_train = X_seq, y_seq

# Initialize lists to store metrics
iterations = 5
rmse_list = []
mae_list = []
mape_list = []

for i in range(iterations):
    # Initialize the BiLSTM-ED model
    model_BiLSTM_ED = Sequential()
    model_BiLSTM_ED.add(Bidirectional(LSTM(64, return_sequences=False), input_shape=(seq_length, X_train.shape[2])))
    model_BiLSTM_ED.add(RepeatVector(seq_length))
    model_BiLSTM_ED.add(Bidirectional(LSTM(64, return_sequences=True)))
    model_BiLSTM_ED.add(TimeDistributed(Dense(1)))

    # Compile the model
    adam = optimizers.Adam(learning_rate=0.001)
    model_BiLSTM_ED.compile(loss="mse", optimizer=adam, metrics=["accuracy"])

    # Early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Train the model
    history_BiLSTM_ED = model_BiLSTM_ED.fit(X_train, y_train, epochs=500, batch_size=64, validation_split=0.2, verbose=0, callbacks=[early_stopping])

    # Filter 2019 data
    data_19 = common_data[common_data['year'] == 2019]

    # Scale 2019 data
    X_2019_scaled = scaler_X.transform(data_19[features].values)

    # Function to create sequences for prediction
    def create_sequences_for_prediction(X, seq_length):
        X_seq = []
        for i in range(len(X)):
            seq_x = X[max(0, i - seq_length + 1):i + 1]
            seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
            X_seq.append(seq_x)
        return np.array(X_seq)

    X_2019_seq = create_sequences_for_prediction(X_2019_scaled, seq_length)

    # Predict 2019 data
    y_pred_scaled_BiLSTM_ED = model_BiLSTM_ED.predict(X_2019_seq)

    # Inverse scale the predictions
    y_pred_BiLSTM_ED = scaler_y.inverse_transform(y_pred_scaled_BiLSTM_ED[:, -1, :])  # Take the last time step

    # Actual 2019 p_era values
    y_test_actual = data_19[target].values

    # Calculate RMSE
    rmse_BiLSTM_ED = np.sqrt(mean_squared_error(y_test_actual, y_pred_BiLSTM_ED))
    rmse_list.append(rmse_BiLSTM_ED)

    # Calculate MAE
    mae_BiLSTM_ED = mean_absolute_error(y_test_actual, y_pred_BiLSTM_ED)
    mae_list.append(mae_BiLSTM_ED)
    
    # Calculate MAPE
    mape_BiLSTM_ED = mean_absolute_percentage_error(y_test_actual, y_pred_BiLSTM_ED)
    mape_list.append(mape_BiLSTM_ED)

    print(f'Iteration {i+1}/{iterations} - RMSE: {rmse_BiLSTM_ED}, MAE: {mae_BiLSTM_ED}, MAPE: {mape_BiLSTM_ED}')

# Calculate average metrics
avg_rmse = np.mean(rmse_list)
avg_mae = np.mean(mae_list)
avg_mape = np.mean(mape_list)

print(f'Average RMSE: {avg_rmse}')
print(f'Average MAE: {avg_mae}')
print(f'Average MAPE: {avg_mape}')

Iteration 1/5 - RMSE: 0.6336219688468926, MAE: 0.4814413395098278, MAPE: 0.1185051208490851
Iteration 2/5 - RMSE: 0.6146225474459316, MAE: 0.46058686472120736, MAPE: 0.11433076538987386
Iteration 3/5 - RMSE: 0.6431848818715695, MAE: 0.4819822298345112, MAPE: 0.11781800630584134
Iteration 4/5 - RMSE: 0.6086321905042628, MAE: 0.45778338091714044, MAPE: 0.11472520468636052
Iteration 5/5 - RMSE: 0.6109915283931701, MAE: 0.4515072388592221, MAPE: 0.11543070096256981
Average RMSE: 0.6222106234123653
Average MAE: 0.46666021076838177
Average MAPE: 0.11616195963874612
