

> 모델 학습



In [None]:
# Model training and testing
import pandas as pd
# Load the full dataset from the Excel workbook.
data = pd.read_excel('mlb_dp_20.xlsx')


from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_predict
# Shuffled 5-fold splitter with a fixed seed; used by the cross-validation loop.
kfold = KFold( n_splits=5, shuffle=True, random_state = 1234)

# Features: every column except the two targets, "theta_n" and "Unnamed: 0"
# (presumably a saved index column — verify against the workbook).
X = data.drop(columns=["theta_p", "distance", "theta_n", "Unnamed: 0"])
# Targets: the model predicts "theta_p" and "distance" jointly.
y = data[["theta_p", "distance"]]

# Step 1: split into Train+Validation (80%) and Test (20%)
x_train_val, x_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: split Train+Validation again into Train (60% of all rows) and
# Validation (20% of all rows); 0.25 of the remaining 80% equals 20% overall
x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val, test_size=0.25, random_state=42)

# XGBoost multi-output regression model (one XGBRegressor per target column)
xgb_model = MultiOutputRegressor( XGBRegressor(random_state=1234, n_jobs=-1))
# NOTE(review): this fit is overwritten later — the cross-validation loop
# refits the same xgb_model object on every fold.
xgb_model.fit(x_train, y_train)

# Replace the shuffled row labels with a fresh 0..n-1 index (old index dropped).
x_train = x_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`.

    This is the square root of scikit-learn's ``mean_squared_error`` called
    with its default settings.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

def mse(y_true, y_pred):
    """Return the mean squared error between `y_true` and `y_pred`.

    Thin wrapper around scikit-learn's ``mean_squared_error`` with default
    settings.
    """
    return mean_squared_error(y_true=y_true, y_pred=y_pred)

def mae(y_true, y_pred):
    """Return the mean absolute error between `y_true` and `y_pred`.

    Thin wrapper around scikit-learn's ``mean_absolute_error`` with default
    settings.
    """
    return mean_absolute_error(y_true=y_true, y_pred=y_pred)

def mape(y_true, y_pred):
    """Return the mean absolute percentage error, in percent, as one float.

    Both inputs are converted to float NumPy arrays before the arithmetic, so
    the result is a single number averaged over every element no matter
    whether the caller passes lists, arrays, Series or DataFrames. (Calling
    ``np.mean`` directly on a DataFrame can yield a per-column Series
    depending on the pandas version, which cannot be formatted with ``:.4f``.)

    Args:
        y_true: Ground-truth values (array-like, any shape).
        y_pred: Predicted values, same shape as `y_true`.

    Returns:
        float: ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Note:
        Elements where `y_true` is 0 divide by zero, so the result becomes
        ``inf`` or ``nan``; no zero-guard is applied.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    relative_error = np.abs((actual - predicted) / actual)
    return float(np.mean(relative_error) * 100)  # convert to a percentage

# Per-fold metric histories; one entry is appended per fold.
msel = []
mael = []
rmsel = []
mapel = []
# K-fold cross-validation over the training split. enumerate() supplies the
# 1-based fold number instead of a hand-maintained counter.
for fold, (train_index, val_index) in enumerate(kfold.split(x_train), start=1):
    X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Train the model (the same xgb_model object is refit on every fold)
    xgb_model.fit(X_train_fold, y_train_fold)

    # Predict on the held-out part of this fold
    y_pred = xgb_model.predict(X_val_fold)

    # Compute the evaluation metrics
    fold_mse = mse(y_val_fold, y_pred)
    fold_mae = mae(y_val_fold, y_pred)
    fold_rmse = rmse(y_val_fold, y_pred)
    fold_mape = mape(y_val_fold, y_pred)

    msel.append(fold_mse)
    mael.append(fold_mae)
    rmsel.append(fold_rmse)
    mapel.append(fold_mape)

    # Print the results for this fold
    print(f"Fold {fold}:")
    print(f"  MSE: {fold_mse:.4f}")
    print(f"  MAE: {fold_mae:.4f}")
    print(f"  RMSE: {fold_rmse:.4f}")
    print(f"  MAPE: {fold_mape:.4f}%\n")

import numpy as np
# Print the across-fold mean of each metric, in the order MSE, MAE, RMSE, MAPE.
for fold_scores in (msel, mael, rmsel, mapel):
    print(np.mean(fold_scores))



> 하이퍼 파라미터 최적화



In [None]:
# HPT (Optuna)
import optuna
import numpy as np
import pickle
from optuna.samplers import TPESampler
from sklearn.model_selection import KFold

from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error


# RMSE 및 MAPE 계산 함수 정의
from sklearn.metrics import mean_squared_error, mean_absolute_error

def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`.

    This is the square root of scikit-learn's ``mean_squared_error`` called
    with its default settings.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

def mape(y_true, y_pred):
    """Return the mean absolute percentage error, in percent, as one float.

    Both inputs are converted to float NumPy arrays before the arithmetic, so
    the result is a single number averaged over every element no matter
    whether the caller passes lists, arrays, Series or DataFrames. (Calling
    ``np.mean`` directly on a DataFrame can yield a per-column Series
    depending on the pandas version.)

    Args:
        y_true: Ground-truth values (array-like, any shape).
        y_pred: Predicted values, same shape as `y_true`.

    Returns:
        float: ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Note:
        Elements where `y_true` is 0 divide by zero, so the result becomes
        ``inf`` or ``nan``; no zero-guard is applied.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    relative_error = np.abs((actual - predicted) / actual)
    return float(np.mean(relative_error) * 100)

# KFold setup: 5 shuffled folds with a fixed seed, shared by every Optuna trial.
kfold = KFold(n_splits=5, shuffle=True, random_state=1234)

# Optuna objective function
def objective(trial):
    """Return the cross-validated score that Optuna minimizes for one trial.

    Samples a set of XGBoost hyperparameters from `trial`, trains a
    multi-output XGBoost regressor with them on each fold of the module-level
    `kfold` splitter, and combines the mean fold RMSE and MAPE into a single
    weighted value.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        ``0.7 * mean_rmse + 0.3 * (mean_mape / 100)`` over the folds.
    """
    # suggest_float replaces the deprecated suggest_uniform/suggest_loguniform
    # calls; log=True keeps the log-scaled sampling of the original ranges.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 500, 1000, step=50),
        "max_depth": trial.suggest_int("max_depth", 10, 30),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 0.9),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-4, 5.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 5.0, log=True),
        "min_child_weight": trial.suggest_int("min_child_weight", 7, 12),
        "gamma": trial.suggest_float("gamma", 1e-2, 7.0, log=True),
    }

    X = data.drop(columns=["theta_p", "distance", "theta_n", "Unnamed: 0"])
    y = data[["theta_p", "distance"]]

    # Step 1: split into Train+Validation (80%) and Test (20%); the test part
    # is not used during tuning.
    x_train_val, _, y_train_val, _ = train_test_split(X, y, test_size=0.2, random_state=42)

    # Step 2: split Train+Validation again into Train (60% of all rows) and
    # Validation (20% of all rows); only the Train part feeds the folds.
    x_train, _, y_train, _ = train_test_split(x_train_val, y_train_val, test_size=0.25, random_state=42)

    # The sampled hyperparameters must be passed to the estimator; without
    # **params every trial would train the same default model.
    xgb_model = MultiOutputRegressor(XGBRegressor(random_state=1234, n_jobs=-1, **params))

    rmse_scores, mape_scores = [], []

    for train_idx, val_idx in kfold.split(x_train):
        X_t, X_v = x_train.iloc[train_idx], x_train.iloc[val_idx]
        y_t, y_v = y_train.iloc[train_idx], y_train.iloc[val_idx]

        xgb_model.fit(X_t, y_t)
        y_pred = xgb_model.predict(X_v)

        rmse_scores.append(rmse(y_v, y_pred))
        mape_scores.append(mape(y_v, y_pred))

    # Optimize RMSE and MAPE together through a weighted sum of their means.
    rmse_mean = np.mean(rmse_scores)
    mape_mean = np.mean(mape_scores)

    # MAPE is converted from percent to a fraction before weighting.
    return 0.7 * rmse_mean + 0.3 * (mape_mean / 100)

# Run the Optuna study (seeded TPE sampler, 50 trials, minimizing the objective)
xgb_study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=1234))
xgb_study.optimize(objective, n_trials=50)

# Print the best hyperparameters and the best objective value
print(" 최적 하이퍼파라미터 (Optuna):", xgb_study.best_params)
print(" 최적 RMSE + MAPE 가중합 (Optuna):", xgb_study.best_trial.value)

# Save the model configured with the best hyperparameters.
# NOTE: the estimator is pickled *unfitted*; whoever loads it must call fit().
# n_jobs=-1 matches every other XGBRegressor constructed in this file.
opt_xgb = MultiOutputRegressor(
    XGBRegressor(random_state=1234, n_jobs=-1, **xgb_study.best_params)
)
file_name = "xgb_model.pkl"
with open(file_name, "wb") as f:
    pickle.dump(opt_xgb, f)
print(f" 모델이 {file_name}에 저장되었습니다.")



> 최종 모델 성능



In [None]:
# Final model performance
from joblib import load
# Load the estimator saved by the tuning cell (written there with pickle.dump,
# before any fitting), so it still has to be trained below.
xgb_model = load("xgb_model.pkl")

X = data.drop(columns=["theta_p", "distance", "theta_n", "Unnamed: 0"])
y = data[["theta_p", "distance"]]

# Step 1: split into Train+Validation (80%) and Test (20%)
x_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: split Train+Validation again into Train (60% of all rows) and
# Validation (20% of all rows)
X_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val, test_size=0.25, random_state=42)

xgb_model.fit(X_train, y_train)
# Predict on the test set once and reuse the result for both metrics.
y_test_pred = xgb_model.predict(X_test)
print("XGBoost RMSE : ", rmse(y_test, y_test_pred))
print("XGBoost MAPE : ", mape(y_test, y_test_pred))