In [1]:
# multi-head attention TFT

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, LayerNormalization, Flatten, MultiHeadAttention
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from tensorflow.keras.callbacks import EarlyStopping
from itertools import product
import gc

# 데이터셋 불러오기
file_path = 'C:\\Users\\dssal\\OneDrive\\바탕 화면\\이원병\\mp1.csv'
data = pd.read_csv(file_path)

# 필요 없는 컬럼 제거
data = data.drop(columns=['last_name, first_name'])

# 결측치 처리 (예: 0으로 대체)
data = data.fillna(0)

# 2020, 2021, 2022년에 모두 존재하는 player_id 추출
data_2021 = data[data['year'] == 2021]
data_2022 = data[data['year'] == 2022]
data_2023 = data[data['year'] == 2023]

player_ids_2021 = set(data_2021['player_id'].unique())
player_ids_2022 = set(data_2022['player_id'].unique())
player_ids_2023 = set(data_2023['player_id'].unique())

common_player_ids = player_ids_2021 & player_ids_2022 & player_ids_2023

# 공통 player_id에 해당하는 데이터 추출
common_data = data[data['player_id'].isin(common_player_ids)]

# 2020, 2021, 2022년에 해당하는 데이터만 추출
final = common_data[common_data['year'].isin([2021, 2022])]
final = final.sort_values(by=['player_id', 'year'])

# 필요한 컬럼 선택 (year 제외)
features = [col for col in final.columns if col not in ['player_id', 'year', 'p_era']]
target = 'p_era'

# 독립변수와 종속변수 분리
X = final[features].values
y = final[target].values

# 데이터 스케일링
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

# 시계열 데이터 형태로 변환
def create_sequences(X, y, seq_length):
    X_seq, y_seq = [], []
    for i in range(len(X)):
        seq_x = X[max(0, i - seq_length + 1):i + 1]
        seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
        seq_y = y[i]
        X_seq.append(seq_x)
        y_seq.append(seq_y)
    return np.array(X_seq), np.array(y_seq)

seq_length = 2  # 시퀀스 길이 설정
X_seq, y_seq = create_sequences(X_scaled, y_scaled, seq_length)

# 학습 데이터와 전체 데이터를 동일하게 설정
X_train, y_train = X_seq, y_seq

# 하이퍼파라미터 설정
state_sizes = [10, 20, 40, 80, 160, 240, 320]
dropout_rates = [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.9]
minibatch_sizes = [32, 64, 128]
learning_rates = [0.0001, 0.001, 0.01]
max_gradient_norms = [0.01, 1, 100.0]
num_heads = [1, 2, 3, 4]

# 하이퍼파라미터 조합 생성
hyperparameter_combinations = list(product(state_sizes, dropout_rates, minibatch_sizes, learning_rates, max_gradient_norms, num_heads))

best_rmse = float('inf')
best_mae = float('inf')
best_mape = float('inf')
best_params = None

# TFT 모델 정의
class TFTModel(Model):
    def __init__(self, seq_length, feature_dim, num_heads, ff_dim, state_size, dropout_rate):
        super(TFTModel, self).__init__()
        self.multi_head_attention1 = MultiHeadAttention(num_heads=num_heads, key_dim=feature_dim)
        self.layer_norm1 = LayerNormalization()
        self.multi_head_attention2 = MultiHeadAttention(num_heads=num_heads, key_dim=feature_dim)
        self.layer_norm2 = LayerNormalization()
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(feature_dim)
        ])
        self.flatten = Flatten()
        self.dense1 = Dense(state_size, activation="relu")
        self.dropout1 = Dropout(dropout_rate)
        self.dense2 = Dense(state_size // 4, activation="relu")
        self.dense3 = Dense(1, activation="linear")

    def call(self, inputs):
        attn_output1 = self.multi_head_attention1(inputs, inputs)
        out1 = self.layer_norm1(inputs + attn_output1)
        attn_output2 = self.multi_head_attention2(out1, out1)
        out2 = self.layer_norm2(out1 + attn_output2)
        ffn_output = self.ffn(out2)
        flat_output = self.flatten(ffn_output)
        dense_output1 = self.dense1(flat_output)
        drop_output1 = self.dropout1(dense_output1)
        dense_output2 = self.dense2(drop_output1)
        return self.dense3(dense_output2)

# 하이퍼파라미터 튜닝
for state_size, dropout_rate, minibatch_size, learning_rate, max_gradient_norm, num_heads in hyperparameter_combinations:
    tf.keras.backend.clear_session()
    
    # 모델 인스턴스 생성
    model = TFTModel(seq_length=seq_length, feature_dim=X_train.shape[2], num_heads=num_heads, ff_dim=32, state_size=state_size, dropout_rate=dropout_rate)
    
    # 컴파일
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, clipnorm=max_gradient_norm)
    model.compile(
        loss="mse",
        optimizer=optimizer,
        metrics=["accuracy"]
    )

    # 조기 종료 콜백
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    
    # 모델 학습
    history = model.fit(X_train, y_train, epochs=500, batch_size=minibatch_size, validation_split=0.2, verbose=0, callbacks=[early_stopping])
    
    # 2023년 데이터 필터링
    data_23 = common_data[common_data['year'] == 2023]

    # 2023년 데이터 스케일링
    X_2023_scaled = scaler_X.transform(data_23[features].values)

    # 시계열 데이터 형태로 변환 (2022년, 2021년, 2020년 데이터를 사용하여 2023년 예측)
    def create_sequences_for_prediction(X, seq_length):
        X_seq = []
        for i in range(len(X)):
            seq_x = X[max(0, i - seq_length + 1):i + 1]
            seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
            X_seq.append(seq_x)
        return np.array(X_seq)

    X_2023_seq = create_sequences_for_prediction(X_2023_scaled, seq_length)

    # 2023년 데이터 예측
    y_pred_scaled = model.predict(X_2023_seq)

    # 스케일 복원
    y_pred = scaler_y.inverse_transform(y_pred_scaled)

    # 실제 값과 예측 값 비교를 위해 실제 2023년 p_era 값도 복원
    y_test_actual = data_23[target].values

    # RMSE 계산
    rmse = np.sqrt(mean_squared_error(y_test_actual[:len(y_pred)], y_pred))

    # MAE 계산
    mae = mean_absolute_error(y_test_actual[:len(y_pred)], y_pred)
    
    # MAPE 계산
    mape = mean_absolute_percentage_error(y_test_actual[:len(y_pred)], y_pred)

    if rmse < best_rmse:
        best_rmse = rmse
        best_mae = mae
        best_mape = mape
        best_params = (state_size, dropout_rate, minibatch_size, learning_rate, max_gradient_norm, num_heads)

    print(f"Params: {state_size, dropout_rate, minibatch_size, learning_rate, max_gradient_norm, num_heads}, RMSE: {rmse}, MAE: {mae}, MAPE: {mape}")

    del model, history, y_pred_scaled, y_pred  # 필요한 변수를 명시적으로 삭제
    tf.keras.backend.clear_session()
    gc.collect()
    
print(f"Best RMSE: {best_rmse}, Best MAE: {best_mae}, Best MAPE: {best_mape}, Best Params: {best_params}")

Params: (10, 0.1, 32, 0.0001, 0.01, 1), RMSE: 1.324380964164905, MAE: 1.0038815007693525, MAPE: 0.2747654264209429
Params: (10, 0.1, 32, 0.0001, 0.01, 2), RMSE: 1.3271512753634382, MAE: 1.0055115227077318, MAPE: 0.27445355236477786
Params: (10, 0.1, 32, 0.0001, 0.01, 3), RMSE: 0.9252169938169674, MAE: 0.6983390066128422, MAPE: 0.19152629777401403
Params: (10, 0.1, 32, 0.0001, 0.01, 4), RMSE: 0.6835537436790717, MAE: 0.5382475184703219, MAPE: 0.1483008825248013
Params: (10, 0.1, 32, 0.0001, 1, 1), RMSE: 1.3490538062142972, MAE: 1.0198583302060187, MAPE: 0.27294017580722235
Params: (10, 0.1, 32, 0.0001, 1, 2), RMSE: 0.7920472260092012, MAE: 0.621324519097517, MAPE: 0.16502793399829074
Params: (10, 0.1, 32, 0.0001, 1, 3), RMSE: 0.9585648521717184, MAE: 0.6844291298400951, MAPE: 0.20266839050973398
Params: (10, 0.1, 32, 0.0001, 1, 4), RMSE: 0.6976111035687614, MAE: 0.5354165409843702, MAPE: 0.1546243584096838
Params: (10, 0.1, 32, 0.0001, 100.0, 1), RMSE: 0.8301706989133076, MAE: 0.6322303

In [2]:
# basic TFT

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, LayerNormalization, Flatten
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from tensorflow.keras.callbacks import EarlyStopping
from itertools import product

# 데이터셋 불러오기
file_path = 'C:\\Users\\co279\\mp1.csv'
data = pd.read_csv(file_path)

# 필요 없는 컬럼 제거
data = data.drop(columns=['last_name, first_name'])

# 결측치 처리 (예: 0으로 대체)
data = data.fillna(0)

# 2020, 2021, 2022년에 모두 존재하는 player_id 추출
data_2021 = data[data['year'] == 2021]
data_2022 = data[data['year'] == 2022]
data_2023 = data[data['year'] == 2023]

player_ids_2021 = set(data_2021['player_id'].unique())
player_ids_2022 = set(data_2022['player_id'].unique())
player_ids_2023 = set(data_2023['player_id'].unique())

common_player_ids = player_ids_2021 & player_ids_2022 & player_ids_2023

# 공통 player_id에 해당하는 데이터 추출
common_data = data[data['player_id'].isin(common_player_ids)]

# 2020, 2021, 2022년에 해당하는 데이터만 추출
final = common_data[common_data['year'].isin([2021, 2022])]
final = final.sort_values(by=['player_id', 'year'])

# 필요한 컬럼 선택 (year 제외)
features = [col for col in final.columns if col not in ['player_id', 'year', 'p_era']]
target = 'p_era'

# 독립변수와 종속변수 분리
X = final[features].values
y = final[target].values

# 데이터 스케일링
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

# 시계열 데이터 형태로 변환
def create_sequences(X, y, seq_length):
    X_seq, y_seq = [], []
    for i in range(len(X)):
        seq_x = X[max(0, i - seq_length + 1):i + 1]
        seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
        seq_y = y[i]
        X_seq.append(seq_x)
        y_seq.append(seq_y)
    return np.array(X_seq), np.array(y_seq)

seq_length = 2  # 시퀀스 길이 설정
X_seq, y_seq = create_sequences(X_scaled, y_scaled, seq_length)

# 학습 데이터와 전체 데이터를 동일하게 설정
X_train, y_train = X_seq, y_seq

# 하이퍼파라미터 설정
state_sizes = [10, 20, 40, 80, 160, 240, 320]
dropout_rates = [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.9]
minibatch_sizes = [32, 64, 128]
learning_rates = [0.0001, 0.001, 0.01]
max_gradient_norms = [0.01, 1, 100.0]
num_heads = [1, 2, 3, 4]

# 하이퍼파라미터 조합 생성
hyperparameter_combinations = list(product(state_sizes, dropout_rates, minibatch_sizes, learning_rates, max_gradient_norms, num_heads))

best_rmse = float('inf')
best_mae = float('inf')
best_mape = float('inf')
best_params = None

# 모델 정의
class BasicTFTModel(Model):
    def __init__(self, seq_length, feature_dim, ff_dim, state_size, dropout_rate):
        super(BasicTFTModel, self).__init__()
        self.layer_norm1 = LayerNormalization()
        self.ffn1 = Dense(ff_dim, activation="relu")
        self.layer_norm2 = LayerNormalization()
        self.ffn2 = Dense(feature_dim)
        self.flatten = Flatten()
        self.dense1 = Dense(state_size, activation="relu")
        self.dropout1 = Dropout(dropout_rate)
        self.dense2 = Dense(state_size // 4, activation="relu")
        self.dense3 = Dense(1, activation="linear")

    def call(self, inputs):
        x = self.layer_norm1(inputs)
        x = self.ffn1(x)
        x = self.layer_norm2(x)
        x = self.ffn2(x)
        flat_output = self.flatten(x)
        dense_output1 = self.dense1(flat_output)
        drop_output1 = self.dropout1(dense_output1)
        dense_output2 = self.dense2(drop_output1)
        return self.dense3(dense_output2)

# 하이퍼파라미터 튜닝
for state_size, dropout_rate, minibatch_size, learning_rate, max_gradient_norm, num_heads in hyperparameter_combinations:
    tf.keras.backend.clear_session()
    
    # 모델 인스턴스 생성
    model = BasicTFTModel(seq_length=seq_length, feature_dim=X_train.shape[2], ff_dim=32, state_size=state_size, dropout_rate=dropout_rate)
    
    # 컴파일
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, clipnorm=max_gradient_norm)
    model.compile(
        loss="mse",
        optimizer=optimizer,
        metrics=["accuracy"]
    )

    # 조기 종료 콜백
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    
    # 모델 학습
    history = model.fit(X_train, y_train, epochs=500, batch_size=minibatch_size, validation_split=0.2, verbose=0, callbacks=[early_stopping])
    
    # 2023년 데이터 필터링
    data_23 = common_data[common_data['year'] == 2023]

    # 2023년 데이터 스케일링
    X_2023_scaled = scaler_X.transform(data_23[features].values)

    # 시계열 데이터 형태로 변환 (2022년, 2021년, 2020년 데이터를 사용하여 2023년 예측)
    def create_sequences_for_prediction(X, seq_length):
        X_seq = []
        for i in range(len(X)):
            seq_x = X[max(0, i - seq_length + 1):i + 1]
            seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
            X_seq.append(seq_x)
        return np.array(X_seq)

    X_2023_seq = create_sequences_for_prediction(X_2023_scaled, seq_length)

    # 2023년 데이터 예측
    y_pred_scaled = model.predict(X_2023_seq)

    # 스케일 복원
    y_pred = scaler_y.inverse_transform(y_pred_scaled)

    # 실제 값과 예측 값 비교를 위해 실제 2023년 p_era 값도 복원
    y_test_actual = data_23[target].values

    # RMSE 계산
    rmse = np.sqrt(mean_squared_error(y_test_actual[:len(y_pred)], y_pred))

    # MAE 계산
    mae = mean_absolute_error(y_test_actual[:len(y_pred)], y_pred)
    
    # MAPE 계산
    mape = mean_absolute_percentage_error(y_test_actual[:len(y_pred)], y_pred)

    if rmse < best_rmse:
        best_rmse = rmse
        best_mae = mae
        best_mape = mape
        best_params = (state_size, dropout_rate, minibatch_size, learning_rate, max_gradient_norm, num_heads)

    print(f"Params: {state_size, dropout_rate, minibatch_size, learning_rate, max_gradient_norm, num_heads}, RMSE: {rmse}, MAE: {mae}, MAPE: {mape}")

print(f"Best RMSE: {best_rmse}, Best MAE: {best_mae}, Best MAPE: {best_mape}, Best Params: {best_params}")



Params: (10, 0.1, 32, 0.0001, 0.01, 1), RMSE: 0.8160301219159812, MAE: 0.6402566931443514, MAPE: 0.181404016880695
Params: (10, 0.1, 32, 0.0001, 0.01, 2), RMSE: 1.091973447983296, MAE: 0.704434143738902, MAPE: 0.2115864925258478
Params: (10, 0.1, 32, 0.0001, 0.01, 3), RMSE: 1.2058466247125574, MAE: 0.9521406999475138, MAPE: 0.2563773044209574
Params: (10, 0.1, 32, 0.0001, 0.01, 4), RMSE: 0.9582061074094803, MAE: 0.7519918044868874, MAPE: 0.19897605295784163
Params: (10, 0.1, 32, 0.0001, 1, 1), RMSE: 0.877315848593236, MAE: 0.6815529330341137, MAPE: 0.19281597197665695
Params: (10, 0.1, 32, 0.0001, 1, 2), RMSE: 1.0636259711562597, MAE: 0.792907833845719, MAPE: 0.2264666929457936
Params: (10, 0.1, 32, 0.0001, 1, 3), RMSE: 1.360308341386191, MAE: 1.0329631666745538, MAPE: 0.27061678739395456
Params: (10, 0.1, 32, 0.0001, 1, 4), RMSE: 1.542602905460849, MAE: 1.11205577739051, MAPE: 0.3022113706576185
Params: (10, 0.1, 32, 0.0001, 100.0, 1), RMSE: 1.316285025240575, MAE: 0.9649400949017437,

In [2]:
# Full TFT

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, LayerNormalization, Flatten, MultiHeadAttention
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from tensorflow.keras.callbacks import EarlyStopping
from itertools import product
import gc

# 데이터셋 불러오기
file_path = 'C:\\Users\\dssal\\OneDrive\\바탕 화면\\이원병\\mp1.csv'
data = pd.read_csv(file_path)

# 필요 없는 컬럼 제거
data = data.drop(columns=['last_name, first_name'])

# 결측치 처리 (예: 0으로 대체)
data = data.fillna(0)

# 2020, 2021, 2022년에 모두 존재하는 player_id 추출
data_2021 = data[data['year'] == 2021]
data_2022 = data[data['year'] == 2022]
data_2023 = data[data['year'] == 2023]

player_ids_2021 = set(data_2021['player_id'].unique())
player_ids_2022 = set(data_2022['player_id'].unique())
player_ids_2023 = set(data_2023['player_id'].unique())

common_player_ids = player_ids_2021 & player_ids_2022 & player_ids_2023

# 공통 player_id에 해당하는 데이터 추출
common_data = data[data['player_id'].isin(common_player_ids)]

# 2020, 2021, 2022년에 해당하는 데이터만 추출
final = common_data[common_data['year'].isin([2021, 2022])]
final = final.sort_values(by=['player_id', 'year'])

# 필요한 컬럼 선택 (year 제외)
features = [col for col in final.columns if col not in ['player_id', 'year', 'p_era']]
target = 'p_era'

# 독립변수와 종속변수 분리
X = final[features].values
y = final[target].values

# 데이터 스케일링
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

# 시계열 데이터 형태로 변환
def create_sequences(X, y, seq_length):
    X_seq, y_seq = [], []
    for i in range(len(X)):
        seq_x = X[max(0, i - seq_length + 1):i + 1]
        seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
        seq_y = y[i]
        X_seq.append(seq_x)
        y_seq.append(seq_y)
    return np.array(X_seq), np.array(y_seq)

seq_length = 2  # 시퀀스 길이 설정
X_seq, y_seq = create_sequences(X_scaled, y_scaled, seq_length)

# 학습 데이터와 전체 데이터를 동일하게 설정
X_train, y_train = X_seq, y_seq

# 하이퍼파라미터 설정
state_sizes = [10, 20, 40, 80, 160, 240, 320]
dropout_rates = [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.9]
minibatch_sizes = [32, 64, 128]
learning_rates = [0.0001, 0.001, 0.01]
max_gradient_norms = [0.01, 1, 100.0]
num_heads = [1, 2, 3, 4]

# 하이퍼파라미터 조합 생성
hyperparameter_combinations = list(product(state_sizes, dropout_rates, minibatch_sizes, learning_rates, max_gradient_norms, num_heads))

best_rmse = float('inf')
best_mae = float('inf')
best_mape = float('inf')
best_params = None

# TFT 모델 정의 (정적 특성 제거)
class GatedResidualNetwork(tf.keras.layers.Layer):
    def __init__(self, input_dim, state_size, dropout_rate):
        super(GatedResidualNetwork, self).__init__()
        self.dense1 = Dense(state_size, activation="relu")
        self.dense2 = Dense(input_dim)  # 입력 차원에 맞춰 조정
        self.gate = Dense(input_dim, activation="sigmoid")
        self.layer_norm = LayerNormalization()
        self.dropout = Dropout(dropout_rate)

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dropout(x)
        x = self.dense2(x)
        gate_output = self.gate(inputs)
        gated_output = x * gate_output + inputs
        return self.layer_norm(gated_output)

# TFT 모델 정의 시 GatedResidualNetwork에 input_dim을 전달하도록 수정
class TFTModel(Model):
    def __init__(self, seq_length, feature_dim, num_heads, ff_dim, state_size, dropout_rate):
        super(TFTModel, self).__init__()

        # Temporal Attention Layers
        self.multi_head_attention1 = MultiHeadAttention(num_heads=num_heads, key_dim=feature_dim)
        self.layer_norm1 = LayerNormalization()

        self.multi_head_attention2 = MultiHeadAttention(num_heads=num_heads, key_dim=feature_dim)
        self.layer_norm2 = LayerNormalization()

        # Gated Residual Networks for gating mechanisms
        self.grn1 = GatedResidualNetwork(feature_dim, state_size, dropout_rate)
        self.grn2 = GatedResidualNetwork(feature_dim, state_size, dropout_rate)

        # Feed Forward Network
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(feature_dim)
        ])

        # Prediction Layers
        self.flatten = Flatten()
        self.dense1 = Dense(state_size, activation="relu")
        self.dropout1 = Dropout(dropout_rate)
        self.dense2 = Dense(state_size // 4, activation="relu")
        self.dense3 = Dense(1, activation="linear")

    def call(self, inputs):
        # Temporal Attention Layer 1 with Residual Connection and Gating
        attn_output1 = self.multi_head_attention1(inputs, inputs)
        out1 = self.layer_norm1(inputs + attn_output1)
        out1 = self.grn1(out1)

        # Temporal Attention Layer 2 with Residual Connection and Gating
        attn_output2 = self.multi_head_attention2(out1, out1)
        out2 = self.layer_norm2(out1 + attn_output2)
        out2 = self.grn2(out2)

        # Feed-Forward Network
        ffn_output = self.ffn(out2)

        # Flatten and predict
        flat_output = self.flatten(ffn_output)

        # Final Dense Layers
        dense_output1 = self.dense1(flat_output)
        drop_output1 = self.dropout1(dense_output1)
        dense_output2 = self.dense2(drop_output1)

        return self.dense3(dense_output2)

# 하이퍼파라미터 튜닝 및 학습
for state_size, dropout_rate, minibatch_size, learning_rate, max_gradient_norm, num_heads in hyperparameter_combinations:
    tf.keras.backend.clear_session()
    
    # 모델 인스턴스 생성
    model = TFTModel(seq_length=seq_length, feature_dim=X_train.shape[2], num_heads=num_heads, ff_dim=32, state_size=state_size, dropout_rate=dropout_rate)
    
    # 컴파일
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, clipnorm=max_gradient_norm)
    model.compile(
        loss="mse",
        optimizer=optimizer,
        metrics=["accuracy"]
    )

    # 조기 종료 콜백
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    
    # 모델 학습
    history = model.fit(X_train, y_train, epochs=500, batch_size=minibatch_size, validation_split=0.2, verbose=0, callbacks=[early_stopping])
    
    # 2023년 데이터 필터링
    data_23 = common_data[common_data['year'] == 2023]

    # 2023년 데이터 스케일링
    X_2023_scaled = scaler_X.transform(data_23[features].values)

    # 시계열 데이터 형태로 변환 (2022년, 2021년, 2020년 데이터를 사용하여 2023년 예측)
    def create_sequences_for_prediction(X, seq_length):
        X_seq = []
        for i in range(len(X)):
            seq_x = X[max(0, i - seq_length + 1):i + 1]
            seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
            X_seq.append(seq_x)
        return np.array(X_seq)

    X_2023_seq = create_sequences_for_prediction(X_2023_scaled, seq_length)

    # 2023년 데이터 예측
    y_pred_scaled = model.predict(X_2023_seq)

    # 스케일 복원
    y_pred = scaler_y.inverse_transform(y_pred_scaled)

    # 실제 값과 예측 값 비교를 위해 실제 2023년 p_era 값도 복원
    y_test_actual = data_23[target].values

    # RMSE 계산
    rmse = np.sqrt(mean_squared_error(y_test_actual[:len(y_pred)], y_pred))

    # MAE 계산
    mae = mean_absolute_error(y_test_actual[:len(y_pred)], y_pred)
    
    # MAPE 계산
    mape = mean_absolute_percentage_error(y_test_actual[:len(y_pred)], y_pred)

    if rmse < best_rmse:
        best_rmse = rmse
        best_mae = mae
        best_mape = mape
        best_params = (state_size, dropout_rate, minibatch_size, learning_rate, max_gradient_norm, num_heads)

    print(f"Params: {state_size, dropout_rate, minibatch_size, learning_rate, max_gradient_norm, num_heads}, RMSE: {rmse}, MAE: {mae}, MAPE: {mape}")

    del model, history, y_pred_scaled, y_pred  # 필요한 변수를 명시적으로 삭제
    tf.keras.backend.clear_session()
    gc.collect()
    
print(f"Best RMSE: {best_rmse}, Best MAE: {best_mae}, Best MAPE: {best_mape}, Best Params: {best_params}")

Params: (10, 0.1, 32, 0.0001, 0.01, 1), RMSE: 1.3221048596686582, MAE: 1.0025090424910834, MAPE: 0.2750281144684063
Params: (10, 0.1, 32, 0.0001, 0.01, 2), RMSE: 0.7173707254718158, MAE: 0.548587448378116, MAPE: 0.16080100006738773
Params: (10, 0.1, 32, 0.0001, 0.01, 3), RMSE: 1.3216790226727102, MAE: 1.00224872050078, MAPE: 0.27507794000877234
Params: (10, 0.1, 32, 0.0001, 0.01, 4), RMSE: 0.6968199108877784, MAE: 0.5417541302114293, MAPE: 0.1595838529793355
Params: (10, 0.1, 32, 0.0001, 1, 1), RMSE: 0.7976981586726041, MAE: 0.621293065190891, MAPE: 0.17405921594122423
Params: (10, 0.1, 32, 0.0001, 1, 2), RMSE: 0.8275498081461363, MAE: 0.6479120078063817, MAPE: 0.17307847303803256
Params: (10, 0.1, 32, 0.0001, 1, 3), RMSE: 0.7521380975330724, MAE: 0.5879522520332521, MAPE: 0.17410976891111862
Params: (10, 0.1, 32, 0.0001, 1, 4), RMSE: 0.784977023529066, MAE: 0.6187938043806287, MAPE: 0.16700629524101435
Params: (10, 0.1, 32, 0.0001, 100.0, 1), RMSE: 0.7950098541821042, MAE: 0.623491954