<a href="https://colab.research.google.com/github/yeonyeo/LG_Aimers/blob/main/test1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
import random
import glob
import re

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler

import torch
import torch.nn as nn
from tqdm import tqdm


# Shell setup: install the `fonts-nanum` package, rebuild the system font
# cache, then delete matplotlib's cache directory.
# NOTE(review): presumably this is so Korean text renders in matplotlib
# figures; no plotting code is visible in this part of the notebook — confirm
# it is still needed.
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  fonts-nanum
0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.
Need to get 10.3 MB of archives.
After this operation, 34.1 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 fonts-nanum all 20200506-1 [10.3 MB]
Fetched 10.3 MB in 1s (11.3 MB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package fonts-nanum.
(Reading database ... 126284 files and direc

In [3]:
def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch's CPU
    RNG, and stores the seed in the ``PYTHONHASHSEED`` environment variable.
    When CUDA is available, the CUDA RNGs are seeded as well and cuDNN is
    switched to deterministic mode with benchmarking turned off.

    Args:
        seed: Integer seed applied to all generators (default 42).

    Returns:
        None.
    """
    # NOTE(review): writing PYTHONHASHSEED here only updates os.environ; it
    # most likely does not change hashing in the already-running interpreter.
    os.environ['PYTHONHASHSEED'] = str(seed)

    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Nothing more to do on CPU-only machines.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_seed(42)

In [4]:
# Run configuration.
LOOKBACK = 28
PREDICT = 7
BATCH_SIZE = 16
EPOCHS = 50

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [5]:
# Load the training data from a CSV file stored on Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive — no mount
# call is visible in this part of the notebook; confirm before Run All.
train = pd.read_csv('/content/drive/MyDrive/open/train/train.csv')

In [7]:
# Missing-value count for every column
print(train.isnull().sum())

# Number of rows whose sales quantity is negative
print("음수 매출 개수:", train['매출수량'].lt(0).sum())

영업일자        0
영업장명_메뉴명    0
매출수량        0
dtype: int64
음수 매출 개수: 14


In [8]:
# Treat negative sales quantities as outliers and clamp them to 0.
train['매출수량'] = train['매출수량'].clip(lower=0)

In [9]:
# Missing-value count for every column
print(train.isnull().sum())

# Number of rows whose sales quantity is negative
print("음수 매출 개수:", train['매출수량'].lt(0).sum())

영업일자        0
영업장명_메뉴명    0
매출수량        0
dtype: int64
음수 매출 개수: 0


In [11]:
from sklearn.preprocessing import LabelEncoder
# Categorical encoding.
# Split the combined "<store>_<menu>" key into a store column and a menu column.
# n=1 splits on the first underscore only: without it, a menu name that itself
# contains an underscore produces more than two columns and the two-column
# assignment below raises a ValueError.
train[['업장명', '메뉴명']] = train['영업장명_메뉴명'].str.split('_', n=1, expand=True)
# Encode each column with its own encoder. le1 (store) and le2 (menu) stay
# bound so the integer codes can later be mapped back to the original labels.
le1 = LabelEncoder()
le2 = LabelEncoder()
train['업장명'] = le1.fit_transform(train['업장명'])
train['메뉴명'] = le2.fit_transform(train['메뉴명'])

In [12]:
# Scaling: min-max scale the sales-quantity column, overwriting it in `train`.
# NOTE(review): the scaler is fit on every row of `train`, i.e. before the
# train/validation split performed later in the notebook, so the validation
# rows contribute to the fitted min/max. The targets used downstream are in
# scaled units; `scaler` is kept for converting back.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
train[['매출수량']] = scaler.fit_transform(train[['매출수량']])

In [15]:
# Feature engineering.
# Parse the business-date column into datetime values.
train['영업일자'] = pd.to_datetime(train['영업일자'])

# English day name -> one-character Korean weekday label (Monday first).
weekday_map = dict(zip(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    ['월', '화', '수', '목', '금', '토', '일'],
))

# Weekday column derived from the parsed date.
train['요일'] = train['영업일자'].dt.day_name().map(weekday_map)

In [18]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the combined store/menu key and the weekday label.
# Each fitted encoder is stored under its column name. Previously the loop
# variable `le` was overwritten on every iteration, so only the last encoder
# survived and the key encoding could not be inverted or applied to other data.
label_encoders = {}
for col in ['영업장명_메뉴명', '요일']:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    label_encoders[col] = le

In [19]:
# Time-based hold-out: the 7th-from-last distinct business date is the cut-off,
# so the final 7 distinct dates form the validation set.
split_day = train['영업일자'].sort_values().unique()[-7]
train_set = train[train['영업일자'].lt(split_day)]
valid_set = train[train['영업일자'].ge(split_day)]

# Features are every column except the target and the raw date.
X_train = train_set.drop(columns=['매출수량', '영업일자'])
X_valid = valid_set.drop(columns=['매출수량', '영업일자'])

# Target: the sales-quantity column.
y_train = train_set['매출수량']
y_valid = valid_set['매출수량']

In [20]:
# XGBoost model test
from xgboost import XGBRegressor

# Hyper-parameters collected in one place, then unpacked into the estimator.
xgb_params = dict(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
    tree_method='hist',
)
xgb = XGBRegressor(**xgb_params)

# Fit on the training split and predict the validation split.
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_valid)

# Evaluation
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, as a percentage.

    Entries whose true value is exactly 0 are dropped before averaging.

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predictions, aligned element-wise with y_true.

    Returns:
        Mean of |y_true - y_pred| / ((|y_true| + |y_pred|) / 2) over the
        retained entries, multiplied by 100.
    """
    actual = np.asarray(y_true)
    forecast = np.asarray(y_pred)

    # Keep only positions with a non-zero actual value.
    keep = actual != 0
    actual, forecast = actual[keep], forecast[keep]

    abs_error = np.abs(actual - forecast)
    half_sum = (np.abs(actual) + np.abs(forecast)) / 2.0
    return np.mean(abs_error / half_sum) * 100

# Report the validation SMAPE of the XGBoost predictions.
print("XGBoost SMAPE:", smape(y_valid, xgb_pred))

XGBoost SMAPE: 71.54822865391725


In [21]:
# LightGBM test
from lightgbm import LGBMRegressor

# Hyper-parameters collected in one place, then unpacked into the estimator.
lgbm_params = {
    'n_estimators': 100,
    'learning_rate': 0.05,
    'max_depth': 6,
    'random_state': 42,
}
lgbm = LGBMRegressor(**lgbm_params)

# Fit on the training split and predict the validation split.
lgbm.fit(X_train, y_train)
lgbm_pred = lgbm.predict(X_valid)

# Report the validation SMAPE of the LightGBM predictions.
print("LightGBM SMAPE:", smape(y_valid, lgbm_pred))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002505 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 385
[LightGBM] [Info] Number of data points in the train set: 101325, number of used features: 4
[LightGBM] [Info] Start training from score 0.007801
LightGBM SMAPE: 73.45586497412998
