In [1]:
# multi-head attention TFT

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, LayerNormalization, Flatten, MultiHeadAttention
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from tensorflow.keras.callbacks import EarlyStopping
from itertools import product
import gc

# 데이터셋 불러오기
file_path = 'C:\\Users\\dssal\\OneDrive\\바탕 화면\\이원병\\mp_statcast.csv'
data = pd.read_csv(file_path)

# Drop unnecessary columns
data = data.drop(columns=['last_name, first_name'])

# Fill missing values with 0
data = data.fillna(0)

if 'pitch_hand' in data:
    data = pd.get_dummies(data, columns=['pitch_hand'], drop_first=True)

# 2020, 2021, 2022년에 모두 존재하는 player_id 추출
data_2015 = data[data['year'] == 2015]
data_2016 = data[data['year'] == 2016]
data_2017 = data[data['year'] == 2017]
data_2018 = data[data['year'] == 2018]
data_2019 = data[data['year'] == 2019]

player_ids_2015 = set(data_2015['player_id'].unique())
player_ids_2016 = set(data_2016['player_id'].unique())
player_ids_2017 = set(data_2017['player_id'].unique())
player_ids_2018 = set(data_2018['player_id'].unique())
player_ids_2019 = set(data_2019['player_id'].unique())

common_player_ids = player_ids_2015 & player_ids_2016 & player_ids_2017 & player_ids_2018 & player_ids_2019

# 공통 player_id에 해당하는 데이터 추출
common_data = data[data['player_id'].isin(common_player_ids)]

# 2020, 2021, 2022년에 해당하는 데이터만 추출
final = common_data[common_data['year'].isin([2015, 2016, 2017, 2018])]
final = final.sort_values(by=['player_id', 'year'])

# 필요한 컬럼 선택 (year 제외)
features = [col for col in final.columns if col not in ['player_id', 'year', 'p_era']]
target = 'p_era'

# 독립변수와 종속변수 분리
X = final[features].values
y = final[target].values

# 데이터 스케일링
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

# 시계열 데이터 형태로 변환
def create_sequences(X, y, seq_length):
    X_seq, y_seq = [], []
    for i in range(len(X)):
        seq_x = X[max(0, i - seq_length + 1):i + 1]
        seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
        seq_y = y[i]
        X_seq.append(seq_x)
        y_seq.append(seq_y)
    return np.array(X_seq), np.array(y_seq)

seq_length = 4  # 시퀀스 길이 설정
X_seq, y_seq = create_sequences(X_scaled, y_scaled, seq_length)

# 학습 데이터와 전체 데이터를 동일하게 설정
X_train, y_train = X_seq, y_seq

# 하이퍼파라미터 설정
state_sizes = [10, 20, 40, 80, 160, 240, 320]
dropout_rates = [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.9]
minibatch_sizes = [32, 64, 128]
learning_rates = [0.0001, 0.001, 0.01]
max_gradient_norms = [0.01, 1, 100.0]
num_heads = [1, 2, 3, 4]

# 하이퍼파라미터 조합 생성
hyperparameter_combinations = list(product(state_sizes, dropout_rates, minibatch_sizes, learning_rates, max_gradient_norms, num_heads))

best_rmse = float('inf')
best_mae = float('inf')
best_mape = float('inf')
best_params = None

# TFT 모델 정의
class TFTModel(Model):
    def __init__(self, seq_length, feature_dim, num_heads, ff_dim, state_size, dropout_rate):
        super(TFTModel, self).__init__()
        self.multi_head_attention1 = MultiHeadAttention(num_heads=num_heads, key_dim=feature_dim)
        self.layer_norm1 = LayerNormalization()
        self.multi_head_attention2 = MultiHeadAttention(num_heads=num_heads, key_dim=feature_dim)
        self.layer_norm2 = LayerNormalization()
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(feature_dim)
        ])
        self.flatten = Flatten()
        self.dense1 = Dense(state_size, activation="relu")
        self.dropout1 = Dropout(dropout_rate)
        self.dense2 = Dense(state_size // 4, activation="relu")
        self.dense3 = Dense(1, activation="linear")

    def call(self, inputs):
        attn_output1 = self.multi_head_attention1(inputs, inputs)
        out1 = self.layer_norm1(inputs + attn_output1)
        attn_output2 = self.multi_head_attention2(out1, out1)
        out2 = self.layer_norm2(out1 + attn_output2)
        ffn_output = self.ffn(out2)
        flat_output = self.flatten(ffn_output)
        dense_output1 = self.dense1(flat_output)
        drop_output1 = self.dropout1(dense_output1)
        dense_output2 = self.dense2(drop_output1)
        return self.dense3(dense_output2)

# 하이퍼파라미터 튜닝
for state_size, dropout_rate, minibatch_size, learning_rate, max_gradient_norm, num_heads in hyperparameter_combinations:
    tf.keras.backend.clear_session()
    
    # 모델 인스턴스 생성
    model = TFTModel(seq_length=seq_length, feature_dim=X_train.shape[2], num_heads=num_heads, ff_dim=32, state_size=state_size, dropout_rate=dropout_rate)
    
    # 컴파일
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, clipnorm=max_gradient_norm)
    model.compile(
        loss="mse",
        optimizer=optimizer,
        metrics=["accuracy"]
    )

    # 조기 종료 콜백
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    
    # 모델 학습
    history = model.fit(X_train, y_train, epochs=500, batch_size=minibatch_size, validation_split=0.2, verbose=0, callbacks=[early_stopping])
    
    # 2023년 데이터 필터링
    data_19 = common_data[common_data['year'] == 2019]

    # 2023년 데이터 스케일링
    X_2019_scaled = scaler_X.transform(data_19[features].values)

    # 시계열 데이터 형태로 변환 (2022년, 2021년, 2020년 데이터를 사용하여 2023년 예측)
    def create_sequences_for_prediction(X, seq_length):
        X_seq = []
        for i in range(len(X)):
            seq_x = X[max(0, i - seq_length + 1):i + 1]
            seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
            X_seq.append(seq_x)
        return np.array(X_seq)

    X_2019_seq = create_sequences_for_prediction(X_2019_scaled, seq_length)

    # 2023년 데이터 예측
    y_pred_scaled = model.predict(X_2019_seq)

    # 스케일 복원
    y_pred = scaler_y.inverse_transform(y_pred_scaled)

    # 실제 값과 예측 값 비교를 위해 실제 2023년 p_era 값도 복원
    y_test_actual = data_19[target].values

    # RMSE 계산
    rmse = np.sqrt(mean_squared_error(y_test_actual[:len(y_pred)], y_pred))

    # MAE 계산
    mae = mean_absolute_error(y_test_actual[:len(y_pred)], y_pred)
    
    # MAPE 계산
    mape = mean_absolute_percentage_error(y_test_actual[:len(y_pred)], y_pred)

    if rmse < best_rmse:
        best_rmse = rmse
        best_mae = mae
        best_mape = mape
        best_params = (state_size, dropout_rate, minibatch_size, learning_rate, max_gradient_norm, num_heads)

    print(f"Params: {state_size, dropout_rate, minibatch_size, learning_rate, max_gradient_norm, num_heads}, RMSE: {rmse}, MAE: {mae}, MAPE: {mape}")

    del model, history, y_pred_scaled, y_pred  # 필요한 변수를 명시적으로 삭제
    tf.keras.backend.clear_session()
    gc.collect()

print(f"Best RMSE: {best_rmse}, Best MAE: {best_mae}, Best MAPE: {best_mape}, Best Params: {best_params}")

Params: (10, 0.1, 32, 0.0001, 0.01, 1), RMSE: 1.0105415259504928, MAE: 0.7567863706380379, MAPE: 0.18175039194652232
Params: (10, 0.1, 32, 0.0001, 0.01, 2), RMSE: 1.4676638266114173, MAE: 1.1223001854359604, MAPE: 0.2669463684130409
Params: (10, 0.1, 32, 0.0001, 0.01, 3), RMSE: 1.0377797948789727, MAE: 0.793915309345021, MAPE: 0.18588388766976507
Params: (10, 0.1, 32, 0.0001, 0.01, 4), RMSE: 0.8284287472327642, MAE: 0.60915088373072, MAPE: 0.14065228235194677
Params: (10, 0.1, 32, 0.0001, 1, 1), RMSE: 1.0597759297319944, MAE: 0.813505641392299, MAPE: 0.1810012412667025
Params: (10, 0.1, 32, 0.0001, 1, 2), RMSE: 0.7751586956479596, MAE: 0.6119171606592771, MAPE: 0.14463710816550646
Params: (10, 0.1, 32, 0.0001, 1, 3), RMSE: 1.2851599860709029, MAE: 0.9490804650603223, MAPE: 0.21402694717238555
Params: (10, 0.1, 32, 0.0001, 1, 4), RMSE: 1.5235249027684086, MAE: 1.1695972491512778, MAPE: 0.27015064895199276
Params: (10, 0.1, 32, 0.0001, 100.0, 1), RMSE: 0.958503435523496, MAE: 0.742408007

In [2]:
# basic TFT

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, LayerNormalization, Flatten
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from tensorflow.keras.callbacks import EarlyStopping
from itertools import product
import gc

# 데이터셋 불러오기
file_path = 'C:\\Users\\dssal\\OneDrive\\바탕 화면\\이원병\\mp_statcast.csv'
data = pd.read_csv(file_path)

# Drop unnecessary columns
data = data.drop(columns=['last_name, first_name'])

# Fill missing values with 0
data = data.fillna(0)

if 'pitch_hand' in data:
    data = pd.get_dummies(data, columns=['pitch_hand'], drop_first=True)

# 2020, 2021, 2022년에 모두 존재하는 player_id 추출
data_2015 = data[data['year'] == 2015]
data_2016 = data[data['year'] == 2016]
data_2017 = data[data['year'] == 2017]
data_2018 = data[data['year'] == 2018]
data_2019 = data[data['year'] == 2019]

player_ids_2015 = set(data_2015['player_id'].unique())
player_ids_2016 = set(data_2016['player_id'].unique())
player_ids_2017 = set(data_2017['player_id'].unique())
player_ids_2018 = set(data_2018['player_id'].unique())
player_ids_2019 = set(data_2019['player_id'].unique())

common_player_ids = player_ids_2015 & player_ids_2016 & player_ids_2017 & player_ids_2018 & player_ids_2019

# 공통 player_id에 해당하는 데이터 추출
common_data = data[data['player_id'].isin(common_player_ids)]

# 2020, 2021, 2022년에 해당하는 데이터만 추출
final = common_data[common_data['year'].isin([2015, 2016, 2017, 2018])]
final = final.sort_values(by=['player_id', 'year'])

# 필요한 컬럼 선택 (year 제외)
features = [col for col in final.columns if col not in ['player_id', 'year', 'p_era']]
target = 'p_era'

# 독립변수와 종속변수 분리
X = final[features].values
y = final[target].values

# 데이터 스케일링
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

# 시계열 데이터 형태로 변환
def create_sequences(X, y, seq_length):
    X_seq, y_seq = [], []
    for i in range(len(X)):
        seq_x = X[max(0, i - seq_length + 1):i + 1]
        seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
        seq_y = y[i]
        X_seq.append(seq_x)
        y_seq.append(seq_y)
    return np.array(X_seq), np.array(y_seq)

seq_length = 4  # 시퀀스 길이 설정
X_seq, y_seq = create_sequences(X_scaled, y_scaled, seq_length)

# 학습 데이터와 전체 데이터를 동일하게 설정
X_train, y_train = X_seq, y_seq

# 하이퍼파라미터 설정
state_sizes = [10, 20, 40, 80, 160, 240, 320]
dropout_rates = [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.9]
minibatch_sizes = [32, 64, 128]
learning_rates = [0.0001, 0.001, 0.01]
max_gradient_norms = [0.01, 1, 100.0]
num_heads = [1, 2, 3, 4]

# 하이퍼파라미터 조합 생성
hyperparameter_combinations = list(product(state_sizes, dropout_rates, minibatch_sizes, learning_rates, max_gradient_norms, num_heads))

best_rmse = float('inf')
best_mae = float('inf')
best_mape = float('inf')
best_params = None

# 모델 정의
class BasicTFTModel(Model):
    def __init__(self, seq_length, feature_dim, ff_dim, state_size, dropout_rate):
        super(BasicTFTModel, self).__init__()
        self.layer_norm1 = LayerNormalization()
        self.ffn1 = Dense(ff_dim, activation="relu")
        self.layer_norm2 = LayerNormalization()
        self.ffn2 = Dense(feature_dim)
        self.flatten = Flatten()
        self.dense1 = Dense(state_size, activation="relu")
        self.dropout1 = Dropout(dropout_rate)
        self.dense2 = Dense(state_size // 4, activation="relu")
        self.dense3 = Dense(1, activation="linear")

    def call(self, inputs):
        x = self.layer_norm1(inputs)
        x = self.ffn1(x)
        x = self.layer_norm2(x)
        x = self.ffn2(x)
        flat_output = self.flatten(x)
        dense_output1 = self.dense1(flat_output)
        drop_output1 = self.dropout1(dense_output1)
        dense_output2 = self.dense2(drop_output1)
        return self.dense3(dense_output2)

# 하이퍼파라미터 튜닝
for state_size, dropout_rate, minibatch_size, learning_rate, max_gradient_norm, num_heads in hyperparameter_combinations:
    tf.keras.backend.clear_session()
    
    # 모델 인스턴스 생성
    model = BasicTFTModel(seq_length=seq_length, feature_dim=X_train.shape[2], ff_dim=32, state_size=state_size, dropout_rate=dropout_rate)
    
    # 컴파일
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, clipnorm=max_gradient_norm)
    model.compile(
        loss="mse",
        optimizer=optimizer,
        metrics=["accuracy"]
    )

    # 조기 종료 콜백
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    
    # 모델 학습
    history = model.fit(X_train, y_train, epochs=500, batch_size=minibatch_size, validation_split=0.2, verbose=0, callbacks=[early_stopping])
    
    # 2023년 데이터 필터링
    data_19 = common_data[common_data['year'] == 2019]

    # 2023년 데이터 스케일링
    X_2019_scaled = scaler_X.transform(data_19[features].values)

    # 시계열 데이터 형태로 변환 (2022년, 2021년, 2020년 데이터를 사용하여 2023년 예측)
    def create_sequences_for_prediction(X, seq_length):
        X_seq = []
        for i in range(len(X)):
            seq_x = X[max(0, i - seq_length + 1):i + 1]
            seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
            X_seq.append(seq_x)
        return np.array(X_seq)

    X_2019_seq = create_sequences_for_prediction(X_2019_scaled, seq_length)

    # 2023년 데이터 예측
    y_pred_scaled = model.predict(X_2019_seq)

    # 스케일 복원
    y_pred = scaler_y.inverse_transform(y_pred_scaled)

    # 실제 값과 예측 값 비교를 위해 실제 2023년 p_era 값도 복원
    y_test_actual = data_19[target].values

    # RMSE 계산
    rmse = np.sqrt(mean_squared_error(y_test_actual[:len(y_pred)], y_pred))

    # MAE 계산
    mae = mean_absolute_error(y_test_actual[:len(y_pred)], y_pred)
    
    # MAPE 계산
    mape = mean_absolute_percentage_error(y_test_actual[:len(y_pred)], y_pred)

    if rmse < best_rmse:
        best_rmse = rmse
        best_mae = mae
        best_mape = mape
        best_params = (state_size, dropout_rate, minibatch_size, learning_rate, max_gradient_norm, num_heads)

    print(f"Params: {state_size, dropout_rate, minibatch_size, learning_rate, max_gradient_norm, num_heads}, RMSE: {rmse}, MAE: {mae}, MAPE: {mape}")

    del model, history, y_pred_scaled, y_pred  # 필요한 변수를 명시적으로 삭제
    tf.keras.backend.clear_session()
    gc.collect()

print(f"Best RMSE: {best_rmse}, Best MAE: {best_mae}, Best MAPE: {best_mape}, Best Params: {best_params}")



Params: (10, 0.1, 32, 0.0001, 0.01, 1), RMSE: 1.2032712284214218, MAE: 0.9282617026417194, MAPE: 0.21336464988249645
Params: (10, 0.1, 32, 0.0001, 0.01, 2), RMSE: 1.62066946629485, MAE: 1.270893403742494, MAPE: 0.31612818529571723
Params: (10, 0.1, 32, 0.0001, 0.01, 3), RMSE: 1.469660562444251, MAE: 1.1239956841348602, MAPE: 0.26703073591706833
Params: (10, 0.1, 32, 0.0001, 0.01, 4), RMSE: 1.5907812692885583, MAE: 1.3003099478393043, MAPE: 0.33047108075592807
Params: (10, 0.1, 32, 0.0001, 1, 1), RMSE: 1.3103256521067703, MAE: 0.9351888719125956, MAPE: 0.21241688401524214
Params: (10, 0.1, 32, 0.0001, 1, 2), RMSE: 1.7958499572772433, MAE: 1.3501862889027396, MAPE: 0.30037767238619345
Params: (10, 0.1, 32, 0.0001, 1, 3), RMSE: 1.7591560014043024, MAE: 1.3658516092059991, MAPE: 0.31899333911712835
Params: (10, 0.1, 32, 0.0001, 1, 4), RMSE: 1.4880408670772747, MAE: 1.1218343921068334, MAPE: 0.24116915005173808
Params: (10, 0.1, 32, 0.0001, 100.0, 1), RMSE: 1.541599085584661, MAE: 1.2162092

In [2]:
# Full TFT

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, LayerNormalization, Flatten, MultiHeadAttention
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from tensorflow.keras.callbacks import EarlyStopping
from itertools import product
import gc

# 데이터셋 불러오기
file_path = 'C:\\Users\\dssal\\OneDrive\\바탕 화면\\이원병\\mp_statcast.csv'
data = pd.read_csv(file_path)

# Drop unnecessary columns
data = data.drop(columns=['last_name, first_name'])

# Fill missing values with 0
data = data.fillna(0)

if 'pitch_hand' in data:
    data = pd.get_dummies(data, columns=['pitch_hand'], drop_first=True)

# 특정 연도에 모두 존재하는 player_id 추출
data_2015 = data[data['year'] == 2015]
data_2016 = data[data['year'] == 2016]
data_2017 = data[data['year'] == 2017]
data_2018 = data[data['year'] == 2018]
data_2019 = data[data['year'] == 2019]

player_ids_2015 = set(data_2015['player_id'].unique())
player_ids_2016 = set(data_2016['player_id'].unique())
player_ids_2017 = set(data_2017['player_id'].unique())
player_ids_2018 = set(data_2018['player_id'].unique())
player_ids_2019 = set(data_2019['player_id'].unique())

common_player_ids = player_ids_2015 & player_ids_2016 & player_ids_2017 & player_ids_2018 & player_ids_2019

# 공통 player_id에 해당하는 데이터 추출
common_data = data[data['player_id'].isin(common_player_ids)]

# 2020, 2021, 2022년에 해당하는 데이터만 추출
final = common_data[common_data['year'].isin([2015, 2016, 2017, 2018])]
final = final.sort_values(by=['player_id', 'year'])

# 필요한 컬럼 선택 (year 제외)
features = [col for col in final.columns if col not in ['player_id', 'year', 'p_era']]
target = 'p_era'

# 독립변수와 종속변수 분리
X = final[features].values
y = final[target].values

# 데이터 스케일링
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

# 시계열 데이터 형태로 변환
def create_sequences(X, y, seq_length):
    X_seq, y_seq = [], []
    for i in range(len(X)):
        seq_x = X[max(0, i - seq_length + 1):i + 1]
        seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
        seq_y = y[i]
        X_seq.append(seq_x)
        y_seq.append(seq_y)
    return np.array(X_seq), np.array(y_seq)

seq_length = 4  # 시퀀스 길이 설정
X_seq, y_seq = create_sequences(X_scaled, y_scaled, seq_length)

# 학습 데이터와 전체 데이터를 동일하게 설정
X_train, y_train = X_seq, y_seq

# 하이퍼파라미터 설정
state_sizes = [10, 20, 40, 80, 160, 240, 320]
dropout_rates = [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.9]
minibatch_sizes = [32, 64, 128]
learning_rates = [0.0001, 0.001, 0.01]
max_gradient_norms = [0.01, 1, 100.0]
num_heads = [1, 2, 3, 4]

# 하이퍼파라미터 조합 생성
hyperparameter_combinations = list(product(state_sizes, dropout_rates, minibatch_sizes, learning_rates, max_gradient_norms, num_heads))

best_rmse = float('inf')
best_mae = float('inf')
best_mape = float('inf')
best_params = None

# TFT 모델 정의 (정적 특성 제거)
class GatedResidualNetwork(tf.keras.layers.Layer):
    def __init__(self, input_dim, state_size, dropout_rate):
        super(GatedResidualNetwork, self).__init__()
        self.dense1 = Dense(state_size, activation="relu")
        self.dense2 = Dense(input_dim)  # 입력 차원에 맞춰 조정
        self.gate = Dense(input_dim, activation="sigmoid")
        self.layer_norm = LayerNormalization()
        self.dropout = Dropout(dropout_rate)

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dropout(x)
        x = self.dense2(x)
        gate_output = self.gate(inputs)
        gated_output = x * gate_output + inputs
        return self.layer_norm(gated_output)

# TFT 모델 정의 시 GatedResidualNetwork에 input_dim을 전달하도록 수정
class TFTModel(Model):
    def __init__(self, seq_length, feature_dim, num_heads, ff_dim, state_size, dropout_rate):
        super(TFTModel, self).__init__()

        # Temporal Attention Layers
        self.multi_head_attention1 = MultiHeadAttention(num_heads=num_heads, key_dim=feature_dim)
        self.layer_norm1 = LayerNormalization()

        self.multi_head_attention2 = MultiHeadAttention(num_heads=num_heads, key_dim=feature_dim)
        self.layer_norm2 = LayerNormalization()

        # Gated Residual Networks for gating mechanisms
        self.grn1 = GatedResidualNetwork(feature_dim, state_size, dropout_rate)
        self.grn2 = GatedResidualNetwork(feature_dim, state_size, dropout_rate)

        # Feed Forward Network
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(feature_dim)
        ])

        # Prediction Layers
        self.flatten = Flatten()
        self.dense1 = Dense(state_size, activation="relu")
        self.dropout1 = Dropout(dropout_rate)
        self.dense2 = Dense(state_size // 4, activation="relu")
        self.dense3 = Dense(1, activation="linear")

    def call(self, inputs):
        # Temporal Attention Layer 1 with Residual Connection and Gating
        attn_output1 = self.multi_head_attention1(inputs, inputs)
        out1 = self.layer_norm1(inputs + attn_output1)
        out1 = self.grn1(out1)

        # Temporal Attention Layer 2 with Residual Connection and Gating
        attn_output2 = self.multi_head_attention2(out1, out1)
        out2 = self.layer_norm2(out1 + attn_output2)
        out2 = self.grn2(out2)

        # Feed-Forward Network
        ffn_output = self.ffn(out2)

        # Flatten and predict
        flat_output = self.flatten(ffn_output)

        # Final Dense Layers
        dense_output1 = self.dense1(flat_output)
        drop_output1 = self.dropout1(dense_output1)
        dense_output2 = self.dense2(drop_output1)

        return self.dense3(dense_output2)

# 하이퍼파라미터 튜닝 및 학습
for state_size, dropout_rate, minibatch_size, learning_rate, max_gradient_norm, num_heads in hyperparameter_combinations:
    tf.keras.backend.clear_session()
    
    # 모델 인스턴스 생성
    model = TFTModel(seq_length=seq_length, feature_dim=X_train.shape[2], num_heads=num_heads, ff_dim=32, state_size=state_size, dropout_rate=dropout_rate)
    
    # 컴파일
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, clipnorm=max_gradient_norm)
    model.compile(
        loss="mse",
        optimizer=optimizer,
        metrics=["accuracy"]
    )

    # 조기 종료 콜백
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    
    # 모델 학습
    history = model.fit(X_train, y_train, epochs=500, batch_size=minibatch_size, validation_split=0.2, verbose=0, callbacks=[early_stopping])
    
    # 2019년 데이터 필터링 및 예측
    data_19 = common_data[common_data['year'] == 2019]
    X_2019_scaled = scaler_X.transform(data_19[features].values)
    X_2019_seq, _ = create_sequences(X_2019_scaled, np.zeros(len(X_2019_scaled)), seq_length)
    
    # 예측
    y_pred_scaled = model.predict(X_2019_seq)
    y_pred = scaler_y.inverse_transform(y_pred_scaled)

    # 실제 값과 비교
    y_test_actual = data_19[target].values
    rmse = np.sqrt(mean_squared_error(y_test_actual[:len(y_pred)], y_pred))
    mae = mean_absolute_error(y_test_actual[:len(y_pred)], y_pred)
    mape = mean_absolute_percentage_error(y_test_actual[:len(y_pred)], y_pred)

    if rmse < best_rmse:
        best_rmse = rmse
        best_mae = mae
        best_mape = mape
        best_params = (state_size, dropout_rate, minibatch_size, learning_rate, max_gradient_norm, num_heads)

    print(f"Params: {state_size, dropout_rate, minibatch_size, learning_rate, max_gradient_norm, num_heads}, RMSE: {rmse}, MAE: {mae}, MAPE: {mape}")

    del model, history, y_pred_scaled, y_pred  # 필요한 변수를 명시적으로 삭제
    tf.keras.backend.clear_session()
    gc.collect()
    
print(f"Best RMSE: {best_rmse}, Best MAE: {best_mae}, Best MAPE: {best_mape}, Best Params: {best_params}")

Params: (10, 0.1, 32, 0.0001, 0.01, 1), RMSE: 0.9025450338350964, MAE: 0.6560983909078004, MAPE: 0.15258616629841443
Params: (10, 0.1, 32, 0.0001, 0.01, 2), RMSE: 0.8445683933547644, MAE: 0.6453948687705672, MAPE: 0.15190608515488466
Params: (10, 0.1, 32, 0.0001, 0.01, 3), RMSE: 1.067448651599777, MAE: 0.7988348751709241, MAPE: 0.19182593986072907
Params: (10, 0.1, 32, 0.0001, 0.01, 4), RMSE: 1.472287127129282, MAE: 1.126206263133458, MAPE: 0.2671407336615007
Params: (10, 0.1, 32, 0.0001, 1, 1), RMSE: 0.8733309800589424, MAE: 0.6613393999949222, MAPE: 0.15205634242163726
Params: (10, 0.1, 32, 0.0001, 1, 2), RMSE: 1.1395036199846713, MAE: 0.8436833749899343, MAPE: 0.1969356403599439
Params: (10, 0.1, 32, 0.0001, 1, 3), RMSE: 1.3946474086885938, MAE: 1.0567734548023768, MAPE: 0.2587870805146984
Params: (10, 0.1, 32, 0.0001, 1, 4), RMSE: 0.9011536228134561, MAE: 0.663477598478814, MAPE: 0.15259101229805783
Params: (10, 0.1, 32, 0.0001, 100.0, 1), RMSE: 1.1412936193411494, MAE: 0.845191031