# 머신러닝 모델

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, roc_auc_score
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
import time
from itertools import product

## 데이터 로드

In [3]:
# Load the one-hot encoded price-feature table.
df = pd.read_csv('../../data/features/final_oneHot/광어_price_features_oneHot.csv')

# 'avgPrice' is the regression target. Every other column except 'date' and
# 'item' is used as a model feature.
# NOTE(review): the time-series split used later assumes the rows are already
# in chronological order — confirm against the CSV.
y = df['avgPrice']
X = df.drop(columns=['date', 'item', 'avgPrice'])

In [6]:
# Evaluation-metric helper
def calculate_metrics(y_true, y_pred, training_time):
    """Compute regression error metrics for one set of predictions.

    Args:
        y_true: Observed target values (array-like, e.g. a pandas Series).
        y_pred: Predicted values, same length as ``y_true``.
        training_time: Value stored unchanged under ``'Training_Time'``
            (callers in this file pass elapsed wall-clock seconds).

    Returns:
        dict with keys ``'MAE'``, ``'MSE'``, ``'RMSE'``, ``'R2'``,
        ``'RMSLE'``, ``'MAPE'`` and ``'Training_Time'``.

        * ``'RMSLE'`` is NaN when any target or prediction is <= -1, because
          ``log1p`` is undefined there.
        * ``'MAPE'`` (in percent) is averaged over rows whose target is
          non-zero; it is NaN when every target is zero.
    """
    # Plain float arrays make the element-wise arithmetic positional, so a
    # Series with a non-default index cannot be mis-aligned with predictions.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    # log1p(x) is -inf at x == -1 and NaN below it; feeding that to
    # mean_squared_error would raise and abort the whole evaluation run.
    if (y_true > -1).all() and (y_pred > -1).all():
        rmsle = np.sqrt(mean_squared_error(np.log1p(y_true), np.log1p(y_pred)))
    else:
        rmsle = float('nan')

    # Skip zero targets: dividing by them would turn the mean into inf/NaN.
    nonzero = y_true != 0
    if nonzero.any():
        relative_error = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
        mape = np.mean(np.abs(relative_error)) * 100
    else:
        mape = float('nan')

    return {
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'R2': r2,
        'RMSLE': rmsle,
        'MAPE': mape,
        'Training_Time': training_time,
    }

# Hyperparameter search grids, one per model family.
# Each maps a constructor argument name to the list of values to try.
rf_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5, 10],
)

lgb_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30],
    learning_rate=[0.01, 0.1],
)

cat_params = dict(
    iterations=[100, 200, 300],
    depth=[6, 8, 10],
    learning_rate=[0.01, 0.1],
)

xgb_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[6, 8, 10],
    learning_rate=[0.01, 0.1],
)

In [7]:

# Time-series evaluation.
# The outer splitter yields 5 chronological folds; each test window holds
# len(X)//10 rows and the training window is every row before it.
tscv = TimeSeriesSplit(n_splits=5, test_size=len(X)//10)

# Inner splitter for hyperparameter selection. An integer `cv` would use
# unshuffled K-fold, where one fold trains on later rows and validates on
# earlier ones; a chronological split keeps tuning free of look-ahead.
# n_splits=2 keeps the same number of fits per candidate as the former cv=2.
inner_cv = TimeSeriesSplit(n_splits=2)

models = {
    # Fixed seed so the bootstrap sampling, and therefore the reported
    # metrics, are reproducible between runs.
    'RandomForest': (RandomForestRegressor(random_state=42), rf_params),
    'LightGBM': (lgb.LGBMRegressor(), lgb_params),
    # NOTE(review): the recorded run shows some CatBoost fits failing inside
    # the parallel grid search. Workers share CatBoost's default output
    # directory, so file writing is disabled here — confirm that this
    # removes the failures.
    'CatBoost': (
        CatBoostRegressor(verbose=False, allow_writing_files=False),
        cat_params,
    ),
    'XGBoost': (xgb.XGBRegressor(), xgb_params),
}

results = {}
for name, (model, params) in models.items():
    print(f"\nTraining {name}...")
    fold_metrics = []

    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Grid search on the training window only; the measured time covers
        # the whole search, not a single model fit.
        start_time = time.time()
        grid_search = GridSearchCV(model, params, cv=inner_cv, n_jobs=-1)
        grid_search.fit(X_train, y_train)
        training_time = time.time() - start_time

        # Predict the held-out window with the selected estimator and score it.
        y_pred = grid_search.predict(X_test)
        metrics = calculate_metrics(y_test, y_pred, training_time)
        fold_metrics.append(metrics)

    # Average every metric across the outer folds.
    avg_metrics = {
        metric: np.mean([fold[metric] for fold in fold_metrics])
        for metric in fold_metrics[0]
    }
    results[name] = avg_metrics


Training RandomForest...

Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000877 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5501
[LightGBM] [Info] Number of data points in the train set: 14692, number of used features: 32
[LightGBM] [Info] Start training from score 32186.028791
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001014 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5551
[LightGBM] [Info] Number of data points in the train set: 17629, number of used features: 32
[LightGBM] [Info] Start training from score 32807.715696
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000946 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5564
[LightGBM] [Info] Number of data points in the train set: 20566,

2 fits failed out of a total of 36.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\user\miniconda3\envs\env311_cu121\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\user\miniconda3\envs\env311_cu121\Lib\site-packages\catboost\core.py", line 5873, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\u


Training XGBoost...


In [8]:

# Random-split evaluation: hold out a shuffled 10% of the rows as the test
# set, ignoring time order.
random_results = {}
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

for name, estimator_and_grid in models.items():
    model, params = estimator_and_grid
    print(f"\nTraining {name} (Random Split)...")

    # Time the complete 3-fold grid search on the training portion.
    start_time = time.time()
    grid_search = GridSearchCV(estimator=model, param_grid=params, cv=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    training_time = time.time() - start_time

    # Score the selected estimator on the held-out rows.
    y_pred = grid_search.predict(X_test)
    metrics = calculate_metrics(y_test, y_pred, training_time)
    random_results[name] = metrics




Training RandomForest (Random Split)...

Training LightGBM (Random Split)...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001414 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5568
[LightGBM] [Info] Number of data points in the train set: 26439, number of used features: 32
[LightGBM] [Info] Start training from score 35557.104467

Training CatBoost (Random Split)...

Training XGBoost (Random Split)...


In [None]:
# Save the results: one column per model, one row per metric.
time_series_results = pd.DataFrame(results)
random_split_results = pd.DataFrame(random_results)

# Write each table to its own CSV in the current working directory.
for table, filename in (
    (time_series_results, 'time_series_evaluation.csv'),
    (random_split_results, 'random_split_evaluation.csv'),
):
    table.to_csv(filename)