In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, ShuffleSplit, GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import time
import joblib  # 모델 저장을 위한 라이브러리
from itertools import combinations

# 한글 폰트 설정
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

In [38]:
# 데이터 불러오기 및 기본 확인
file_path = '../../data/features/final_oneHot/광어_price_features_notnull.csv'
data = pd.read_csv(file_path)

# 날짜 데이터 변환
data['date'] = pd.to_datetime(data['date'])
data.set_index('date', inplace=True)

# 불필요한 'item' 컬럼 제거
data.drop(columns=['item'], inplace=True)

# 결측치 처리 - 수온의 평균값으로 대체
data['광어_수온_22107_79'].fillna(data['광어_수온_22107_79'].mean(), inplace=True)


In [52]:
# 선택할 피처 리스트
def train_xgb_model(data, target_col='avgPrice', features=None, test_size=0.2):
    if features is None:
        features = [col for col in data.columns if col != target_col]

    # 데이터 분할
    X = data[features]
    y = data[target_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    model = XGBRegressor(n_estimators = 100, learning_rate=0.1, random_state=42)
    model.fit(X_train, y_train)

    # 예측 및 성능 평가
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    print(f'RMSE: {rmse:.2f}')
    print(f'R2 Score: {r2:.4f}')
    print(f'MAE: {mae:.2f}')

    # # 모델 저장
    # joblib.dump(best_model, 'xgb_model.joblib')
    # print("학습된 모델이 'xgb_model.joblib'로 저장되었습니다.")

    return model    #, rmse


# 피처 조합 탐색 및 최적 조합 출력
def feature_selection_experiment(data, target_col='avgPrice'):
    all_features = [col for col in data.columns if col != target_col]
    best_results = []
    select_columns = [
        '광어_20대_1', '광어_20대_250', '광어_30대_1', '광어_30대_317', '광어_40대_1',
       '광어_40대_330', '광어_50대_1', '광어_50대_395', '광어_60대이상_1', '광어_60대이상_339',
       '광어_Gold_314', '광어_KOSPI_136', '광어_MOVE_18', '광어_Silver_238',
       '광어_USD/KRW_1', '광어_VIX_399', '광어_WTI_1', '광어_기온_22105_97',
       '광어_수온_22107_79', '광어_습도_22186_349', '광어_파주기_22190_103'
    ]
    fixed_columns = ['m_가락시장', 'm_강서농수산물시장', 'm_구리농수산물시장', 'm_노량진 1층', 'm_노량진 2층',
       'm_마포농수산물시장', 'm_부산민락어민활어직판장', 'm_소래포구종합어시장', 'm_수원농수산물시장',
       'm_안양평촌농수산물시장', 'm_인천종합연안부두어시장', 'avgPrice', 'avgPrice_lag_1'
       ]
    
    for i in range(20, len(select_columns)):
        for combo in combinations(select_columns,i):
            final_columns = fixed_columns+list(combo)
            model, rmse = train_xgb_model(data, target_col=target_col, features=final_columns)
            # y_test = data[target_col].iloc[-len(model.predict(data[final_columns])):]  # 테스트 데이터
            # y_pred = model.predict(data[final_columns])
            # rmse = mean_squared_error(y_test, y_pred, squared=False)
            best_results.append((rmse, combo))

    # for i in range(1, len(all_features) + 1):
    #     for combo in combinations(all_features, i):
    #         model = train_xgb_model(data, target_col=target_col, features=list(combo))
    #         y_test = data[target_col].iloc[-len(model.predict(data[list(combo)])):]  # 테스트 데이터
    #         y_pred = model.predict(data[list(combo)])
    #         rmse = mean_squared_error(y_test, y_pred, squared=False)
    #         best_results.append((rmse, combo))

    best_results.sort()
    top_5 = best_results[:5]
    for rank, (rmse, features) in enumerate(top_5, 1):
        print(f'Top {rank}: RMSE = {rmse:.2f}, Features = {features}')

In [53]:
# # 모델 학습 및 평가 실행
print("\n--- 모델 학습 및 평가 ---")
model = train_xgb_model(data, target_col='avgPrice')

# 피처 조합 탐색 실행
print("\n--- 피처 조합 탐색 ---")
feature_selection_experiment(data, target_col='avgPrice')


--- 모델 학습 및 평가 ---




RMSE: 3.84
R2 Score: 0.9083
MAE: 2.95

--- 피처 조합 탐색 ---


KeyError: "['m_수원농수산물시장', 'avgPrice_lag_1'] not in index"

In [27]:
a = [1,2,3]
b = [1,2,3,4]
print(list(set(b)-set(a)))

[4]
