In [1]:
# GRU

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout, Flatten
from tensorflow.keras import optimizers
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from tensorflow.keras.callbacks import EarlyStopping

# 데이터셋 불러오기
file_path = 'C:\\Users\\co279\\mp1.csv'

# Read the table, discard the 'last_name, first_name' column and replace
# every missing value with 0.
data = (
    pd.read_csv(file_path)
    .drop(columns=['last_name, first_name'])
    .fillna(0)
)

# One frame per season, 2016 through 2019.
data_2016, data_2017, data_2018, data_2019 = (
    data[data['year'] == season] for season in (2016, 2017, 2018, 2019)
)

# Distinct player ids seen in each of those seasons.
player_ids_2016, player_ids_2017, player_ids_2018, player_ids_2019 = (
    set(season_frame['player_id'].unique())
    for season_frame in (data_2016, data_2017, data_2018, data_2019)
)

# Players that have at least one row in every season 2016-2019.
common_player_ids = set.intersection(
    player_ids_2016, player_ids_2017, player_ids_2018, player_ids_2019
)

# All rows (any season) that belong to those players.
common_data = data[data['player_id'].isin(common_player_ids)]

# Modelling table: seasons 2016-2018 only, ordered by player and then season.
final = (
    common_data[common_data['year'].isin([2016, 2017, 2018])]
    .sort_values(by=['player_id', 'year'])
)

# The target is p_era; every other column except the player id and the
# season is used as an input feature.
target = 'p_era'
features = [name for name in final.columns if name not in ('player_id', 'year', target)]

# Inputs and target as NumPy arrays.
X = final[features].values
y = final[target].values

# Min-max scale inputs and target independently; both scalers are fitted on
# the 2016-2018 rows only.
scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X)

scaler_y = MinMaxScaler()
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

# 시계열 데이터 형태로 변환
def create_sequences(X, y, seq_length, groups=None):
    """Turn row-wise data into fixed-length, left-zero-padded windows.

    Window ``i`` ends at row ``i`` and contains up to ``seq_length`` rows
    that precede and include it. When fewer rows are available the window
    is padded with zero rows at the front. The label of window ``i`` is
    ``y[i]``.

    Args:
        X: 2-D array of shape (n_rows, n_features).
        y: Array with one label per row of ``X``.
        seq_length: Number of rows in every window; must be >= 1.
        groups: Optional sequence with one group key per row (for example a
            player id). When given, a window never reaches back past the
            first row of the current run of equal keys, so rows of different
            groups are not mixed. Rows of one group must be contiguous.
            ``None`` (default) keeps the plain sliding window over all rows.

    Returns:
        Tuple ``(X_seq, y_seq)`` where ``X_seq`` has shape
        (n_rows, seq_length, n_features) and ``y_seq`` holds ``y[i]`` for
        every window.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1 or ``groups`` does
            not have one entry per row of ``X``.
    """
    if seq_length < 1:
        raise ValueError("seq_length must be a positive integer")
    if groups is not None:
        # Positional indexing below must not depend on a pandas index.
        groups = np.asarray(groups)
        if len(groups) != len(X):
            raise ValueError("groups must have the same length as X")

    X_seq, y_seq = [], []
    group_start = 0  # index of the first row of the current group
    for i in range(len(X)):
        if groups is not None and i > 0 and groups[i] != groups[i - 1]:
            group_start = i
        start = max(group_start, i - seq_length + 1)
        seq_x = X[start:i + 1]
        # Left-pad with zero rows so every window has exactly seq_length rows.
        seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
        X_seq.append(seq_x)
        y_seq.append(y[i])
    return np.array(X_seq), np.array(y_seq)

seq_length = 3  # number of rows per input window

# NOTE(review): the windows slide over the row order of `final` (sorted by
# player_id, then year), so a window can contain rows of the preceding
# player -- confirm this is intended.
X_seq, y_seq = create_sequences(X_scaled, y_scaled, seq_length)

# Every 2016-2018 window is used for fitting. Keras takes the validation data
# from the last 20% of these samples (validation_split=0.2 in fit() below).
X_train, y_train = X_seq, y_seq

# Per-run metric storage
iterations = 5
rmse_list = []
mae_list = []
mape_list = []


def create_sequences_for_prediction(X, seq_length):
    """Build one left-zero-padded window of ``seq_length`` rows per row of X.

    Window ``i`` ends at row ``i`` and holds up to ``seq_length`` rows that
    precede and include it; missing leading rows are filled with zeros.

    Args:
        X: 2-D array of shape (n_rows, n_features).
        seq_length: Number of rows in each window.

    Returns:
        Array of shape (n_rows, seq_length, n_features).
    """
    X_seq = []
    for i in range(len(X)):
        seq_x = X[max(0, i - seq_length + 1):i + 1]
        seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
        X_seq.append(seq_x)
    return np.array(X_seq)


# 2019 evaluation set. It is identical for every run, so it is prepared once
# here instead of inside the loop.
data_19 = common_data[common_data['year'] == 2019]

# Reuse the scaler fitted on 2016-2018 (transform only, no refit).
X_2019_scaled = scaler_X.transform(data_19[features].values)

# NOTE(review): these windows are built from 2019 rows only (neighbouring
# rows of data_19), not from each player's 2016-2018 history -- confirm this
# matches the intended evaluation.
X_2019_seq = create_sequences_for_prediction(X_2019_scaled, seq_length)

# Ground-truth 2019 p_era in original (unscaled) units.
y_test_actual = data_19[target].values

for i in range(iterations):
    # Two stacked GRU layers followed by a dense regression head.
    model_GRU = Sequential()
    model_GRU.add(GRU(64, input_shape=(seq_length, X_train.shape[2]), return_sequences=True))
    model_GRU.add(GRU(64, return_sequences=True))
    model_GRU.add(Dropout(rate=0.5))
    model_GRU.add(Flatten())
    model_GRU.add(Dense(512, activation="relu"))
    model_GRU.add(Dropout(rate=0.5))
    model_GRU.add(Dense(64, activation="relu"))
    # NOTE(review): ReLU on the output clamps scaled predictions at 0, while
    # the attention model in this notebook uses a linear output -- confirm the
    # difference is intentional before comparing the models.
    model_GRU.add(Dense(1, activation='relu'))

    # Compile. No `metrics` are requested: "accuracy" is a classification
    # metric and carries no meaning for this MSE regression.
    adam = optimizers.Adam(learning_rate=0.001)
    model_GRU.compile(loss="mse", optimizer=adam)

    # Stop once val_loss has not improved for 10 epochs and roll back to the
    # best weights seen.
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Fit
    history = model_GRU.fit(X_train, y_train, epochs=500, batch_size=64, validation_split=0.2, verbose=0, callbacks=[early_stopping])

    # Predict 2019 and map the predictions back to original p_era units.
    y_pred_scaled = model_GRU.predict(X_2019_seq)
    y_pred = scaler_y.inverse_transform(y_pred_scaled)

    # RMSE
    rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred))
    rmse_list.append(rmse)

    # MAE
    mae = mean_absolute_error(y_test_actual, y_pred)
    mae_list.append(mae)

    # MAPE (scikit-learn returns a fraction, not a percentage)
    mape = mean_absolute_percentage_error(y_test_actual, y_pred)
    mape_list.append(mape)

    print(f'Iteration {i+1}/{iterations} - RMSE: {rmse}, MAE: {mae}, MAPE: {mape}')

# Mean of each metric over all runs
avg_rmse = np.mean(rmse_list)
avg_mae = np.mean(mae_list)
avg_mape = np.mean(mape_list)

print(f'Average RMSE: {avg_rmse}')
print(f'Average MAE: {avg_mae}')
print(f'Average MAPE: {avg_mape}')

Iteration 1/5 - RMSE: 0.7751222239459999, MAE: 0.5790493988990784, MAPE: 0.1351094788103509
Iteration 2/5 - RMSE: 0.7101336212187516, MAE: 0.5419024290357317, MAPE: 0.1320106906486919
Iteration 3/5 - RMSE: 0.6683934385503512, MAE: 0.5140756978307452, MAPE: 0.12772206925945456
Iteration 4/5 - RMSE: 0.746154047758108, MAE: 0.5658378768534887, MAPE: 0.13272217652079346
Iteration 5/5 - RMSE: 0.753607788677392, MAE: 0.5715446783531279, MAPE: 0.13666552035739124
Average RMSE: 0.7306822240301205
Average MAE: 0.5544820161944344
Average MAPE: 0.13284598711933643


In [2]:
# Bi-GRU

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout, Flatten, Bidirectional
from tensorflow.keras import optimizers
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from tensorflow.keras.callbacks import EarlyStopping

# 데이터셋 불러오기
file_path = 'C:\\Users\\co279\\mp1.csv'

# Read the table, discard the 'last_name, first_name' column and replace
# every missing value with 0.
data = (
    pd.read_csv(file_path)
    .drop(columns=['last_name, first_name'])
    .fillna(0)
)

# One frame per season, 2016 through 2019.
data_2016, data_2017, data_2018, data_2019 = (
    data[data['year'] == season] for season in (2016, 2017, 2018, 2019)
)

# Distinct player ids seen in each of those seasons.
player_ids_2016, player_ids_2017, player_ids_2018, player_ids_2019 = (
    set(season_frame['player_id'].unique())
    for season_frame in (data_2016, data_2017, data_2018, data_2019)
)

# Players that have at least one row in every season 2016-2019.
common_player_ids = set.intersection(
    player_ids_2016, player_ids_2017, player_ids_2018, player_ids_2019
)

# All rows (any season) that belong to those players.
common_data = data[data['player_id'].isin(common_player_ids)]

# Modelling table: seasons 2016-2018 only, ordered by player and then season.
final = (
    common_data[common_data['year'].isin([2016, 2017, 2018])]
    .sort_values(by=['player_id', 'year'])
)

# The target is p_era; every other column except the player id and the
# season is used as an input feature.
target = 'p_era'
features = [name for name in final.columns if name not in ('player_id', 'year', target)]

# Inputs and target as NumPy arrays.
X = final[features].values
y = final[target].values

# Min-max scale inputs and target independently; both scalers are fitted on
# the 2016-2018 rows only.
scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X)

scaler_y = MinMaxScaler()
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

# 시계열 데이터 형태로 변환
def create_sequences(X, y, seq_length, groups=None):
    """Turn row-wise data into fixed-length, left-zero-padded windows.

    Window ``i`` ends at row ``i`` and contains up to ``seq_length`` rows
    that precede and include it. When fewer rows are available the window
    is padded with zero rows at the front. The label of window ``i`` is
    ``y[i]``.

    Args:
        X: 2-D array of shape (n_rows, n_features).
        y: Array with one label per row of ``X``.
        seq_length: Number of rows in every window; must be >= 1.
        groups: Optional sequence with one group key per row (for example a
            player id). When given, a window never reaches back past the
            first row of the current run of equal keys, so rows of different
            groups are not mixed. Rows of one group must be contiguous.
            ``None`` (default) keeps the plain sliding window over all rows.

    Returns:
        Tuple ``(X_seq, y_seq)`` where ``X_seq`` has shape
        (n_rows, seq_length, n_features) and ``y_seq`` holds ``y[i]`` for
        every window.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1 or ``groups`` does
            not have one entry per row of ``X``.
    """
    if seq_length < 1:
        raise ValueError("seq_length must be a positive integer")
    if groups is not None:
        # Positional indexing below must not depend on a pandas index.
        groups = np.asarray(groups)
        if len(groups) != len(X):
            raise ValueError("groups must have the same length as X")

    X_seq, y_seq = [], []
    group_start = 0  # index of the first row of the current group
    for i in range(len(X)):
        if groups is not None and i > 0 and groups[i] != groups[i - 1]:
            group_start = i
        start = max(group_start, i - seq_length + 1)
        seq_x = X[start:i + 1]
        # Left-pad with zero rows so every window has exactly seq_length rows.
        seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
        X_seq.append(seq_x)
        y_seq.append(y[i])
    return np.array(X_seq), np.array(y_seq)

seq_length = 3  # number of rows per input window

# NOTE(review): the windows slide over the row order of `final` (sorted by
# player_id, then year), so a window can contain rows of the preceding
# player -- confirm this is intended.
X_seq, y_seq = create_sequences(X_scaled, y_scaled, seq_length)

# Every 2016-2018 window is used for fitting. Keras takes the validation data
# from the last 20% of these samples (validation_split=0.2 in fit() below).
X_train, y_train = X_seq, y_seq

# Per-run metric storage
iterations = 5
rmse_list = []
mae_list = []
mape_list = []


def create_sequences_for_prediction(X, seq_length):
    """Build one left-zero-padded window of ``seq_length`` rows per row of X.

    Window ``i`` ends at row ``i`` and holds up to ``seq_length`` rows that
    precede and include it; missing leading rows are filled with zeros.

    Args:
        X: 2-D array of shape (n_rows, n_features).
        seq_length: Number of rows in each window.

    Returns:
        Array of shape (n_rows, seq_length, n_features).
    """
    X_seq = []
    for i in range(len(X)):
        seq_x = X[max(0, i - seq_length + 1):i + 1]
        seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
        X_seq.append(seq_x)
    return np.array(X_seq)


# 2019 evaluation set. It is identical for every run, so it is prepared once
# here instead of inside the loop.
data_19 = common_data[common_data['year'] == 2019]

# Reuse the scaler fitted on 2016-2018 (transform only, no refit).
X_2019_scaled = scaler_X.transform(data_19[features].values)

# NOTE(review): these windows are built from 2019 rows only (neighbouring
# rows of data_19), not from each player's 2016-2018 history -- confirm this
# matches the intended evaluation.
X_2019_seq = create_sequences_for_prediction(X_2019_scaled, seq_length)

# Ground-truth 2019 p_era in original (unscaled) units.
y_test_actual = data_19[target].values

for i in range(iterations):
    # Two stacked bidirectional GRU layers followed by a dense regression head.
    model_BiGRU = Sequential()
    model_BiGRU.add(Bidirectional(GRU(64, return_sequences=True), input_shape=(seq_length, X_train.shape[2])))
    model_BiGRU.add(Bidirectional(GRU(64, return_sequences=True)))
    model_BiGRU.add(Dropout(rate=0.5))
    model_BiGRU.add(Flatten())
    model_BiGRU.add(Dense(512, activation="relu"))
    model_BiGRU.add(Dropout(rate=0.5))
    model_BiGRU.add(Dense(64, activation="relu"))
    # NOTE(review): ReLU on the output clamps scaled predictions at 0, while
    # the attention model in this notebook uses a linear output -- confirm the
    # difference is intentional before comparing the models.
    model_BiGRU.add(Dense(1, activation='relu'))

    # Compile. No `metrics` are requested: "accuracy" is a classification
    # metric and carries no meaning for this MSE regression.
    adam = optimizers.Adam(learning_rate=0.001)
    model_BiGRU.compile(loss="mse", optimizer=adam)

    # Stop once val_loss has not improved for 10 epochs and roll back to the
    # best weights seen.
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Fit
    history = model_BiGRU.fit(X_train, y_train, epochs=500, batch_size=64, validation_split=0.2, verbose=0, callbacks=[early_stopping])

    # Predict 2019 and map the predictions back to original p_era units.
    y_pred_scaled = model_BiGRU.predict(X_2019_seq)
    y_pred = scaler_y.inverse_transform(y_pred_scaled)

    # RMSE
    rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred))
    rmse_list.append(rmse)

    # MAE
    mae = mean_absolute_error(y_test_actual, y_pred)
    mae_list.append(mae)

    # MAPE (scikit-learn returns a fraction, not a percentage)
    mape = mean_absolute_percentage_error(y_test_actual, y_pred)
    mape_list.append(mape)

    print(f'Iteration {i+1}/{iterations} - RMSE: {rmse}, MAE: {mae}, MAPE: {mape}')

# Mean of each metric over all runs
avg_rmse = np.mean(rmse_list)
avg_mae = np.mean(mae_list)
avg_mape = np.mean(mape_list)

print(f'Average RMSE: {avg_rmse}')
print(f'Average MAE: {avg_mae}')
print(f'Average MAPE: {avg_mape}')

Iteration 1/5 - RMSE: 0.8398388781997836, MAE: 0.6235268998997553, MAPE: 0.14303214962831806
Iteration 2/5 - RMSE: 0.7913474811968171, MAE: 0.5994638861361005, MAPE: 0.1361016836652324
Iteration 3/5 - RMSE: 0.8174344523780651, MAE: 0.6217391989344643, MAPE: 0.1432395909687949
Iteration 4/5 - RMSE: 0.8059546149966071, MAE: 0.6055848322028206, MAPE: 0.14097528528328607
Iteration 5/5 - RMSE: 0.7414282608847068, MAE: 0.5668557698953719, MAPE: 0.1319639744541068
Average RMSE: 0.7992007375311959
Average MAE: 0.6034341174137025
Average MAPE: 0.13906253679994765


In [3]:
# biattention GRU

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import GRU, Dense, Dropout, Flatten, Bidirectional, Input, Layer, dot, concatenate, Activation
from tensorflow.keras import optimizers
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow.keras.backend as K

# 데이터셋 불러오기
file_path = 'C:\\Users\\co279\\mp1.csv'

# Read the table, discard the 'last_name, first_name' column and replace
# every missing value with 0.
data = (
    pd.read_csv(file_path)
    .drop(columns=['last_name, first_name'])
    .fillna(0)
)

# One frame per season, 2016 through 2019.
data_2016, data_2017, data_2018, data_2019 = (
    data[data['year'] == season] for season in (2016, 2017, 2018, 2019)
)

# Distinct player ids seen in each of those seasons.
player_ids_2016, player_ids_2017, player_ids_2018, player_ids_2019 = (
    set(season_frame['player_id'].unique())
    for season_frame in (data_2016, data_2017, data_2018, data_2019)
)

# Players that have at least one row in every season 2016-2019.
common_player_ids = set.intersection(
    player_ids_2016, player_ids_2017, player_ids_2018, player_ids_2019
)

# All rows (any season) that belong to those players.
common_data = data[data['player_id'].isin(common_player_ids)]

# Modelling table: seasons 2016-2018 only, ordered by player and then season.
final = (
    common_data[common_data['year'].isin([2016, 2017, 2018])]
    .sort_values(by=['player_id', 'year'])
)

# The target is p_era; every other column except the player id and the
# season is used as an input feature.
target = 'p_era'
features = [name for name in final.columns if name not in ('player_id', 'year', target)]

# Inputs and target as NumPy arrays.
X = final[features].values
y = final[target].values

# Min-max scale inputs and target independently; both scalers are fitted on
# the 2016-2018 rows only.
scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X)

scaler_y = MinMaxScaler()
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

# 시계열 데이터 형태로 변환
def create_sequences(X, y, seq_length, groups=None):
    """Turn row-wise data into fixed-length, left-zero-padded windows.

    Window ``i`` ends at row ``i`` and contains up to ``seq_length`` rows
    that precede and include it. When fewer rows are available the window
    is padded with zero rows at the front. The label of window ``i`` is
    ``y[i]``.

    Args:
        X: 2-D array of shape (n_rows, n_features).
        y: Array with one label per row of ``X``.
        seq_length: Number of rows in every window; must be >= 1.
        groups: Optional sequence with one group key per row (for example a
            player id). When given, a window never reaches back past the
            first row of the current run of equal keys, so rows of different
            groups are not mixed. Rows of one group must be contiguous.
            ``None`` (default) keeps the plain sliding window over all rows.

    Returns:
        Tuple ``(X_seq, y_seq)`` where ``X_seq`` has shape
        (n_rows, seq_length, n_features) and ``y_seq`` holds ``y[i]`` for
        every window.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1 or ``groups`` does
            not have one entry per row of ``X``.
    """
    if seq_length < 1:
        raise ValueError("seq_length must be a positive integer")
    if groups is not None:
        # Positional indexing below must not depend on a pandas index.
        groups = np.asarray(groups)
        if len(groups) != len(X):
            raise ValueError("groups must have the same length as X")

    X_seq, y_seq = [], []
    group_start = 0  # index of the first row of the current group
    for i in range(len(X)):
        if groups is not None and i > 0 and groups[i] != groups[i - 1]:
            group_start = i
        start = max(group_start, i - seq_length + 1)
        seq_x = X[start:i + 1]
        # Left-pad with zero rows so every window has exactly seq_length rows.
        seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
        X_seq.append(seq_x)
        y_seq.append(y[i])
    return np.array(X_seq), np.array(y_seq)

# Window length: each sample holds this many consecutive rows.
seq_length = 3

# Build one window per row of the scaled 2016-2018 table and use all of them
# for fitting -- X_train / y_train are the same objects as X_seq / y_seq.
# NOTE(review): the windows follow the row order of `final` (sorted by
# player_id, then year), so a window can contain rows of the preceding
# player -- confirm this is intended.
(X_seq, y_seq) = create_sequences(X_scaled, y_scaled, seq_length)
X_train = X_seq
y_train = y_seq

# 커스텀 Attention Layer 정의
class Attention(Layer):
    """Attention pooling that collapses axis 1 of a 3-D input.

    The input is projected with a learned square matrix plus bias and passed
    through tanh to obtain scores. The scores are soft-maxed along axis 1
    (the time-step axis for the Bi-GRU output it is applied to in this
    notebook) and used to weight the input element-wise before the input is
    summed over that axis.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the projection matrix and bias, sized by the last input dim."""
        n_units = input_shape[-1]
        self.W = self.add_weight(
            name="att_weight",
            shape=(n_units, n_units),
            initializer="glorot_uniform",
            trainable=True,
        )
        self.b = self.add_weight(
            name="att_bias",
            shape=(n_units,),
            initializer="glorot_uniform",
            trainable=True,
        )
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        scores = K.tanh(K.dot(x, self.W) + self.b)
        weights = K.softmax(scores, axis=1)
        return K.sum(x * weights, axis=1)

# Per-run metric storage
iterations = 5
rmse_list = []
mae_list = []
mape_list = []


def create_sequences_for_prediction(X, seq_length):
    """Build one left-zero-padded window of ``seq_length`` rows per row of X.

    Window ``i`` ends at row ``i`` and holds up to ``seq_length`` rows that
    precede and include it; missing leading rows are filled with zeros.

    Args:
        X: 2-D array of shape (n_rows, n_features).
        seq_length: Number of rows in each window.

    Returns:
        Array of shape (n_rows, seq_length, n_features).
    """
    X_seq = []
    for i in range(len(X)):
        seq_x = X[max(0, i - seq_length + 1):i + 1]
        seq_x = np.pad(seq_x, ((seq_length - len(seq_x), 0), (0, 0)), 'constant')
        X_seq.append(seq_x)
    return np.array(X_seq)


# 2019 evaluation set. It is identical for every run, so it is prepared once
# here instead of inside the loop.
data_19 = common_data[common_data['year'] == 2019]

# Reuse the scaler fitted on 2016-2018 (transform only, no refit).
X_2019_scaled = scaler_X.transform(data_19[features].values)

# NOTE(review): these windows are built from 2019 rows only (neighbouring
# rows of data_19), not from each player's 2016-2018 history -- confirm this
# matches the intended evaluation.
X_2019_seq = create_sequences_for_prediction(X_2019_scaled, seq_length)

# Ground-truth 2019 p_era in original (unscaled) units.
y_test_actual = data_19[target].values

for i in range(iterations):
    # Bi-GRU + attention model (functional API)
    input_seq = Input(shape=(seq_length, X_train.shape[2]))

    # Two stacked bidirectional GRU layers, both returning full sequences
    x = Bidirectional(GRU(64, return_sequences=True))(input_seq)
    x = Bidirectional(GRU(64, return_sequences=True))(x)

    # Attention pooling collapses the sequence axis
    attn_out = Attention()(x)

    # Dense regression head with a linear output
    x = Dense(512, activation="relu")(attn_out)
    x = Dropout(rate=0.5)(x)
    x = Dense(64, activation="relu")(x)
    output = Dense(1, activation='linear')(x)

    model_BiAttGRU = Model(inputs=input_seq, outputs=output)

    # Compile. No `metrics` are requested: "accuracy" is a classification
    # metric and carries no meaning for this MSE regression.
    adam = optimizers.Adam(learning_rate=0.001)
    model_BiAttGRU.compile(loss="mse", optimizer=adam)

    # Stop once val_loss has not improved for 10 epochs and roll back to the
    # best weights seen.
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Fit
    history = model_BiAttGRU.fit(X_train, y_train, epochs=500, batch_size=64, validation_split=0.2, verbose=0, callbacks=[early_stopping])

    # Predict 2019 and map the predictions back to original p_era units.
    y_pred_scaled = model_BiAttGRU.predict(X_2019_seq)
    y_pred = scaler_y.inverse_transform(y_pred_scaled)

    # RMSE
    rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred))
    rmse_list.append(rmse)

    # MAE
    mae = mean_absolute_error(y_test_actual, y_pred)
    mae_list.append(mae)

    # MAPE (scikit-learn returns a fraction, not a percentage)
    mape = mean_absolute_percentage_error(y_test_actual, y_pred)
    mape_list.append(mape)

    print(f'Iteration {i+1}/{iterations} - RMSE: {rmse}, MAE: {mae}, MAPE: {mape}')

# Mean of each metric over all runs
avg_rmse = np.mean(rmse_list)
avg_mae = np.mean(mae_list)
avg_mape = np.mean(mape_list)

print(f'Average RMSE: {avg_rmse}')
print(f'Average MAE: {avg_mae}')
print(f'Average MAPE: {avg_mape}')

Iteration 1/5 - RMSE: 0.649530487951905, MAE: 0.4852788228931881, MAPE: 0.1178634009560572
Iteration 2/5 - RMSE: 0.6859619014243299, MAE: 0.5162494223458426, MAPE: 0.1322029444537617
Iteration 3/5 - RMSE: 0.6409130186313422, MAE: 0.48341486956392016, MAPE: 0.11771695127051149
Iteration 4/5 - RMSE: 0.6516920951924791, MAE: 0.49454890611625857, MAPE: 0.12085999302210812
Iteration 5/5 - RMSE: 0.6384837455314137, MAE: 0.4734307493766149, MAPE: 0.11748786105557951
Average RMSE: 0.653316249746294
Average MAE: 0.4905845540591649
Average MAPE: 0.12122623015160361
