In [None]:
# KNN

from sklearn.neighbors import KNeighborsRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Load the dataset; the two regression targets are theta_p and distance
data = pd.read_excel(r"C:\Users\user\Desktop\팀프로젝트\mlb_dp_20.xlsx")
y = data[["theta_p", "distance"]]
X = data.drop(columns=["theta_p", "theta_n", "distance"])

# Stage 1: hold out 20% of the rows as the test set (80% stays in train+validation)
from sklearn.model_selection import train_test_split
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Stage 2: split the remaining 80% again; 25% of it becomes validation,
# which gives 60% train / 20% validation of the full dataset
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    random_state=42,
    test_size=0.25,
)

# Base KNN regressor
knn = KNeighborsRegressor(n_neighbors=10)

# Multi-output wrapper around the base regressor
multi_target_knn = MultiOutputRegressor(estimator=knn)

# Shuffled 5-fold cross-validation splitter
kfold = KFold(random_state=1234, shuffle=True, n_splits=5)

# 평가 지표 계산 함수 정의
def rmse(y_true, y_pred):
    """Return the square root of sklearn's mean squared error for the given targets and predictions."""
    squared_error_mean = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error_mean)

def mse(y_true, y_pred):
    """Return sklearn's mean squared error for the given targets and predictions."""
    score = mean_squared_error(y_true, y_pred)
    return score

def mae(y_true, y_pred):
    """Return sklearn's mean absolute error for the given targets and predictions."""
    score = mean_absolute_error(y_true, y_pred)
    return score

def mape(y_true, y_pred):
    """Return the mean absolute percentage error, in percent, as a single float.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values (list, ndarray, Series or DataFrame).
    y_pred : array-like
        Predicted values with the same shape as ``y_true``.

    Returns
    -------
    float
        Mean of ``|y_true - y_pred| / |y_true|`` over every element, times 100.
    """
    # Work on plain arrays so the mean is always taken over every element.
    # With a DataFrame input, np.mean can return a per-column Series on older
    # pandas versions, which cannot be formatted with ":.4f".
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)

    # Clamp the denominator by magnitude: adding a positive epsilon to y_true
    # does not protect negative targets (e.g. -1e-10 + 1e-10 == 0).
    denom = np.maximum(np.abs(true_arr), 1e-10)

    return float(np.mean(np.abs(true_arr - pred_arr) / denom) * 100)  # as a percentage

# Reset both indices to a fresh 0..n-1 range before applying KFold
y_train = y_train.reset_index(drop=True)
X_train = X_train.reset_index(drop=True)

# Cross-validate and print MSE, MAE, RMSE and MAPE for every fold
fold = 1
for train_index, val_index in kfold.split(X_train):
    # Positional slices for this fold: rows to fit on and rows held out
    X_train_fold = X_train.iloc[train_index]
    y_train_fold = y_train.iloc[train_index]
    X_val_fold = X_train.iloc[val_index]
    y_val_fold = y_train.iloc[val_index]

    # Fit on the fold's training rows, then predict the held-out rows
    y_pred = multi_target_knn.fit(X_train_fold, y_train_fold).predict(X_val_fold)

    # Each metric is computed over both outputs at once
    fold_mape = mape(y_val_fold, y_pred)
    fold_rmse = rmse(y_val_fold, y_pred)
    fold_mae = mae(y_val_fold, y_pred)
    fold_mse = mse(y_val_fold, y_pred)

    # Per-fold report
    print(f"Fold {fold}:")
    print(f"  MSE: {fold_mse:.4f}")
    print(f"  MAE: {fold_mae:.4f}")
    print(f"  RMSE: {fold_rmse:.4f}")
    print(f"  MAPE: {fold_mape:.4f}%\n")

    fold += 1


In [None]:
# optuna

!pip install optuna
import optuna
from sklearn.neighbors import KNeighborsRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Load the dataset; the two regression targets are theta_p and distance
data = pd.read_excel(r"C:\Users\user\Desktop\팀프로젝트\mlb_dp_20.xlsx")
y = data[["theta_p", "distance"]]
X = data.drop(columns=["theta_p", "theta_n", "distance"])

# Stage 1: hold out 20% of the rows as the test set (80% stays in train+validation)
from sklearn.model_selection import train_test_split
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    random_state=1234,
    test_size=0.2,
)

# Stage 2: split the remaining 80% again; 25% of it becomes validation,
# which gives 60% train / 20% validation of the full dataset
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    random_state=1234,
    test_size=0.25,
)

# Shuffled 5-fold cross-validation splitter
kfold = KFold(random_state=1234, shuffle=True, n_splits=5)

# 평가 지표 계산 함수 정의
def rmse(y_true, y_pred):
    """Return the square root of sklearn's mean squared error for the given targets and predictions."""
    squared_error_mean = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error_mean)

def mape(y_true, y_pred):
    """Return the mean absolute percentage error, in percent, as a single float.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values (list, ndarray, Series or DataFrame).
    y_pred : array-like
        Predicted values with the same shape as ``y_true``.

    Returns
    -------
    float
        Mean of ``|y_true - y_pred| / |y_true|`` over every element, times 100.
    """
    # Work on plain arrays so the mean is always taken over every element.
    # With a DataFrame input, np.mean can return a per-column Series on older
    # pandas versions instead of one scalar.
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)

    # Clamp the denominator by magnitude: adding a positive epsilon to y_true
    # does not protect negative targets (e.g. -1e-10 + 1e-10 == 0).
    denom = np.maximum(np.abs(true_arr), 1e-10)

    return float(np.mean(np.abs(true_arr - pred_arr) / denom) * 100)  # as a percentage

# Optuna 목적 함수 정의
def objective(trial):
    """Optuna objective: cross-validated score of a multi-output KNN regressor.

    Samples KNN hyperparameters from ``trial``, evaluates the resulting model
    on every split produced by the module-level ``kfold`` over ``X_train`` /
    ``y_train``, and returns ``0.7 * mean RMSE + 0.3 * (mean MAPE / 100)``.
    """
    # Sample the hyperparameters (dict entries are evaluated top to bottom)
    knn_params = {
        'n_neighbors': trial.suggest_int('n_neighbors', 8, 22),
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
        'algorithm': trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']),
        'leaf_size': trial.suggest_int('leaf_size', 10, 50),
        'p': trial.suggest_int('p', 1, 2),
        'metric': trial.suggest_categorical('metric', ['minkowski', 'euclidean', 'manhattan', 'chebyshev']),
    }

    # Multi-output wrapper around a KNN regressor built from the sampled values
    model = MultiOutputRegressor(KNeighborsRegressor(**knn_params))

    # Collect RMSE and MAPE for every cross-validation fold
    rmse_scores = []
    mape_scores = []
    for fit_idx, holdout_idx in kfold.split(X_train):
        # Fit on this fold's training rows
        model.fit(X_train.iloc[fit_idx], y_train.iloc[fit_idx])

        # Predict the held-out rows and score them
        holdout_true = y_train.iloc[holdout_idx]
        holdout_pred = model.predict(X_train.iloc[holdout_idx])
        rmse_scores.append(rmse(holdout_true, holdout_pred))
        mape_scores.append(mape(holdout_true, holdout_pred))

    # Average each metric across folds
    mean_rmse = np.mean(rmse_scores)
    mean_mape = np.mean(mape_scores)

    # Weighted sum of both metrics; MAPE is converted from percent to a fraction first
    return 0.7 * mean_rmse + 0.3 * (mean_mape / 100)

# Run the Optuna search (the objective is minimised).
# The sampler is seeded so repeated runs suggest the same sequence of trials;
# without a seed the best parameters can differ on every execution even though
# the data splits and KFold are seeded.
sampler = optuna.samplers.TPESampler(seed=1234)
study = optuna.create_study(direction='minimize', sampler=sampler)
study.optimize(objective, n_trials=50)

# Show the best hyperparameters found
print(f"Best trial: {study.best_trial.params}")


In [None]:
# 최적 모델 저장

import pickle

# Rebuild the estimator from the best hyperparameters found by the study
opt_knn = KNeighborsRegressor(**study.best_params)

# Fit before saving: an unfitted estimator cannot predict after being loaded.
# Only the training split is used, so the validation and test sets stay held out.
# KNeighborsRegressor accepts a two-column target directly.
opt_knn.fit(X_train, y_train)

file_name = "KNN_model.pkl"
with open(file_name, "wb") as f:
    pickle.dump(opt_knn, f)
print(f"✅ 모델이 {file_name}에 저장되었습니다.")