<a href="https://colab.research.google.com/github/yeonyeo/LG_Aimers/blob/main/test1_RandomizedSearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import random
import glob
import re

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler

import torch
import torch.nn as nn
from tqdm import tqdm


!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  fonts-nanum
0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.
Need to get 10.3 MB of archives.
After this operation, 34.1 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 fonts-nanum all 20200506-1 [10.3 MB]
Fetched 10.3 MB in 1s (19.6 MB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package fonts-nanum.
(Reading database ... 126284 files and direc

In [2]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

set_seed(42)

In [3]:
LOOKBACK, PREDICT, BATCH_SIZE, EPOCHS = 28, 7, 16, 50
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
train = pd.read_csv('/content/drive/MyDrive/open/train/train.csv')

In [7]:
# 결측치 확인
print(train.isna().sum())

# 음수 판매량 개수 확인
print("음수 매출 개수:", (train['매출수량']<0).sum())

영업일자        0
영업장명_메뉴명    0
매출수량        0
dtype: int64
음수 매출 개수: 14


In [8]:
# 음수 이상치 0으로 처리
train['매출수량'] = train['매출수량'].clip(lower=0)

In [9]:
# 결측치 확인
print(train.isna().sum())

# 음수 판매량 개수 확인
print("음수 매출 개수:", (train['매출수량']<0).sum())

영업일자        0
영업장명_메뉴명    0
매출수량        0
dtype: int64
음수 매출 개수: 0


In [10]:
from sklearn.preprocessing import LabelEncoder
# 카테고리 인코딩
# 업장명, 메뉴명 분리
train[['업장명', '메뉴명']] = train['영업장명_메뉴명'].str.split('_', expand=True)
# 각각 인코딩
le1 = LabelEncoder()
le2 = LabelEncoder()
train['업장명'] = le1.fit_transform(train['업장명'])
train['메뉴명'] = le2.fit_transform(train['메뉴명'])

In [11]:
# 스케일링
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
train[['매출수량']] = scaler.fit_transform(train[['매출수량']])

In [12]:
# 피처엔지니링
train['영업일자'] = pd.to_datetime(train['영업일자'])
# 요일 컬럼 생성
train['요일'] = pd.to_datetime(train['영업일자']).dt.dayofweek  # 0=월~6=일
# 주말 플래그
train['is_weekend'] = train['요일'].isin([5, 6]).astype(int)
# 금요일, 토요일, 일요일 개별 플래그
train['is_friday'] = (train['요일'] == 4).astype(int)
train['is_saturday'] = (train['요일'] == 5).astype(int)
train['is_sunday'] = (train['요일'] == 6).astype(int)
# 공휴일 플래그
holidays = ['2023-05-05', '2023-08-15', '2023-09-28', ...]  # 직접 입력
train['is_holiday'] = train['영업일자'].astype(str).isin(holidays).astype(int)

# 월, 계절
train['월'] = train['영업일자'].dt.month
train['계절'] = train['월'].map({12:'겨울', 1:'겨울', 2:'겨울', 3:'봄', 4:'봄', 5:'봄', 6:'여름', 7:'여름', 8:'여름', 9:'가을', 10:'가을', 11:'가을'})
train = pd.get_dummies(train, columns=['계절'])
# 최근 N일(7, 14, 28)간 매출 평균
for window in [7, 14, 28]:
    train[f'rolling_mean_{window}'] = train.groupby('영업장명_메뉴명')['매출수량']\
        .transform(lambda x: x.rolling(window, min_periods=1).mean().shift(1))

In [13]:
from sklearn.preprocessing import LabelEncoder

for col in ['영업장명_메뉴명', '요일']:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])

In [14]:
split_day = train['영업일자'].sort_values().unique()[-7]  # 마지막 7일 기준
train_set = train[train['영업일자'] < split_day]
valid_set = train[train['영업일자'] >= split_day]

X_train = train_set.drop(['매출수량', '영업일자'], axis=1)
y_train = train_set['매출수량']
X_valid = valid_set.drop(['매출수량', '영업일자'], axis=1)
y_valid = valid_set['매출수량']

In [15]:
# NaN 처리
train = train.fillna(0)
# 숫자형만 남기고 결측값 채우기
X_train = X_train.select_dtypes(include=[np.number]).fillna(0)
X_valid = X_valid.select_dtypes(include=[np.number]).fillna(0)

In [17]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV

# 파라미터 후보군 (적당히 좁게, n_iter=20 정도부터 시작)
param_dist = {
    'n_estimators': [100, 300, 500, 700, 1000],
    'max_depth': [4, 6, 8, 10, 12],
    'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.1],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.01, 0.05, 0.1, 0.5],
    'reg_lambda': [0, 0.01, 0.05, 0.1, 0.5],
}

lgbm = LGBMRegressor(random_state=42)
random_search = RandomizedSearchCV(
    lgbm,
    param_distributions=param_dist,
    n_iter=20,              # 20회만 시도, Colab이면 n_iter=10~20 추천
    scoring='neg_mean_absolute_error',  # 회귀면 MAE가 빠르고 실용적
    cv=3,
    verbose=1,
    n_jobs=-1
)
random_search.fit(X_train, y_train)

print("Best Params:", random_search.best_params_)
print("Best Score (neg MAE):", random_search.best_score_)
# 평가
def smape(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    mask = y_true != 0
    y_true = y_true[mask]
    y_pred = y_pred[mask]
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    smape_vals = np.abs(y_true - y_pred) / denominator
    return np.mean(smape_vals) * 100
# 튜닝된 모델로 검증셋 예측
best_lgbm = random_search.best_estimator_
lgbm_pred = best_lgbm.predict(X_valid)
print("튜닝 후 LightGBM SMAPE:", smape(y_valid, lgbm_pred))

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005146 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1173
[LightGBM] [Info] Number of data points in the train set: 101325, number of used features: 13
[LightGBM] [Info] Start training from score 0.007801
Best Params: {'subsample': 0.7, 'reg_lambda': 0.5, 'reg_alpha': 0.5, 'n_estimators': 700, 'max_depth': 8, 'learning_rate': 0.05, 'colsample_bytree': 0.7}
Best Score (neg MAE): -0.006048595046891985
튜닝 후 LightGBM SMAPE: 66.29621524987893


In [18]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'n_estimators': [100, 300, 500, 700, 1000],
    'max_depth': [4, 6, 8, 10, 12],
    'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.1],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.01, 0.05, 0.1, 0.5],
    'reg_lambda': [0, 0.01, 0.05, 0.1, 0.5],
}

xgb = XGBRegressor(tree_method='hist', random_state=42)
random_search = RandomizedSearchCV(
    xgb,
    param_distributions=param_dist,
    n_iter=20,
    scoring='neg_mean_absolute_error',
    cv=3,
    verbose=1,
    n_jobs=-1
)
random_search.fit(X_train, y_train)

print("Best Params:", random_search.best_params_)
xgb_pred = random_search.best_estimator_.predict(X_valid)
print("튜닝 후 XGBoost SMAPE:", smape(y_valid, xgb_pred))


Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Params: {'subsample': 0.9, 'reg_lambda': 0.1, 'reg_alpha': 0.5, 'n_estimators': 1000, 'max_depth': 6, 'learning_rate': 0.03, 'colsample_bytree': 1.0}
튜닝 후 XGBoost SMAPE: 67.14715924293961
