In [15]:
#!pip install gluonts==0.14.4
#!pip install 'gluonts[torch]'
#!pip install --upgrade gluonts
#!pip install --upgrade transformers

In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder
import holidays
from transformers import PatchTSTConfig, PatchTSTForPrediction, TrainingArguments, Trainer

# 💡 라이브러리 변경: tsfm_public의 도구들을 가져옵니다.
from tsfm_public.toolkit.time_series_preprocessor import TimeSeriesPreprocessor
from tsfm_public.toolkit.dataset import ForecastDFDataset

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

  from .autonotebook import tqdm as notebook_tqdm


cuda


In [2]:
import torch
import torch.nn.functional as F
from transformers.models.patchtst.modeling_patchtst import PatchTSTForPrediction

TARGET_CH = 0  # sales_log 채널 인덱스(보통 0)

class PatchTSTSalesOnly(torch.nn.Module):
    def __init__(self, base: PatchTSTForPrediction, target_ch: int = TARGET_CH):
        super().__init__()
        self.base = base
        self.target_ch = target_ch
        self.config = base.config  # HF가 참조

    # ★ Trainer가 state_dict 저장할 때 base만 저장되도록
    def state_dict(self, *args, **kwargs):
        return self.base.state_dict(*args, **kwargs)

    # ★ 로드 시에도 base로 로드되도록
    def load_state_dict(self, state_dict, strict=True):
        return self.base.load_state_dict(state_dict, strict)

    # ★ 명시적으로 HuggingFace 형식으로 저장하고 싶을 때
    def save_pretrained(self, save_directory):
        self.base.save_pretrained(save_directory)

    @staticmethod
    def _extract_preds_from_output(out, pred_len: int, num_in_ch: int):
        # dict-like
        if hasattr(out, "keys"):
            for k in ["logits", "predictions", "prediction_outputs", "y_hat", "yhat", "forecast"]:
                v = out.get(k, None)
                if isinstance(v, torch.Tensor):
                    return v
        # attribute
        for k in ["logits", "predictions", "prediction_outputs", "y_hat", "yhat", "forecast"]:
            v = getattr(out, k, None)
            if isinstance(v, torch.Tensor):
                return v
        # tuple/list
        if isinstance(out, (tuple, list)):
            cand = [t for t in out if isinstance(t, torch.Tensor)]
            for t in cand:
                if t.ndim == 3 and t.shape[-2] == pred_len and (t.shape[-1] in (1, num_in_ch)):
                    return t
            for t in cand:
                if t.ndim == 2 and t.shape[-1] == pred_len:
                    return t
            if cand:
                return max(cand, key=lambda x: x.numel())
        # fallback to tuple conversion
        try:
            tup = out.to_tuple()
            for t in tup:
                if isinstance(t, torch.Tensor) and t.ndim >= 2:
                    return t
        except Exception:
            pass
        raise AttributeError("예측 텐서를 출력에서 찾지 못했습니다.")

    def forward(self, past_values, past_observed_mask=None, future_values=None, **kwargs):
        # 내부 기본 loss는 피하고 예측만 얻기 위해 future_values=None으로 호출
        base_out = self.base(
            past_values=past_values,
            past_observed_mask=past_observed_mask,
            future_values=None,
            **kwargs
        )

        # 예측 텐서 추출
        preds_all = self._extract_preds_from_output(
            base_out,
            pred_len=self.config.prediction_length,
            num_in_ch=self.config.num_input_channels,
        )  # (B, pred_len, C) or (B, pred_len)

        # 타깃 채널만 선택
        if preds_all.ndim == 3:
            preds_target = preds_all[..., self.target_ch]  # (B, pred_len)
        else:
            preds_target = preds_all  # 이미 (B, pred_len)

        # 라벨도 타깃 채널만으로 맞춰서 손실 계산
        loss = None
        if future_values is not None:
            fv = future_values
            if fv.ndim == 3 and fv.shape[-1] == self.config.num_input_channels:
                target = fv[..., self.target_ch].float()      # (B, pred_len)
            elif fv.ndim == 3 and fv.shape[-1] == 1:
                target = fv.squeeze(-1).float()               # (B, pred_len)
            elif fv.ndim == 2:
                target = fv.float()
            else:
                raise RuntimeError(f"future_values shape 예상 밖: {fv.shape}")
            loss = F.mse_loss(preds_target.float(), target)

        # HF Trainer가 인식하는 dict 반환 (loss/logits 필수)
        ret = {
            "logits": preds_target,           # predict/eval에서 사용
            "predictions": preds_target,      # predict() 시 편의
        }
        if loss is not None:
            ret["loss"] = loss
        # 필요하면 loc/scale도 패스스루
        for k in ["loc", "scale"]:
            v = getattr(base_out, k, None) if not isinstance(base_out, dict) else base_out.get(k, None)
            if isinstance(v, torch.Tensor):
                ret[k] = v
        return ret


## 학습 데이터 준비

In [3]:
# ==============================================
# 1. 데이터 로드 및 전처리 (사용자 코드 유지)
# ==============================================
df = pd.read_csv("./dataset/train/train.csv")
df.columns = ["date", "store_menu", "sales"]
df["date"] = pd.to_datetime(df["date"])

df.loc[df['sales'] < 0, 'sales'] = 0
df["sales"] = df["sales"].astype(float)
df["sales_log"] = np.log1p(df["sales"])     # target은 이제 sales_log

# entity embedding용 ID 인코딩
# LabelEncoder 객체를 저장해두면 나중에 원래 이름으로 복원할 때 유용합니다.
encoder = LabelEncoder()
df["store_menu_id"] = encoder.fit_transform(df["store_menu"])
num_entities = df["store_menu_id"].nunique() # 고유 ID 개수 저장

# feature 추가
kr_holidays = holidays.KR(years=df['date'].dt.year.unique())
df["is_holiday"] = df["date"].isin(kr_holidays).astype(int)
df["is_weekend"] = df["date"].dt.day_of_week.isin([5, 6]).astype(int)
df["is_ski_season"] = df["date"].dt.month.isin([12, 1, 2]).astype(int)

print("데이터 전처리 완료. DataFrame 샘플:")
print(df.head())

데이터 전처리 완료. DataFrame 샘플:
        date          store_menu  sales  sales_log  store_menu_id  is_holiday  \
0 2023-01-01  느티나무 셀프BBQ_1인 수저세트    0.0        0.0              0           1   
1 2023-01-02  느티나무 셀프BBQ_1인 수저세트    0.0        0.0              0           0   
2 2023-01-03  느티나무 셀프BBQ_1인 수저세트    0.0        0.0              0           0   
3 2023-01-04  느티나무 셀프BBQ_1인 수저세트    0.0        0.0              0           0   
4 2023-01-05  느티나무 셀프BBQ_1인 수저세트    0.0        0.0              0           0   

   is_weekend  is_ski_season  
0           1              1  
1           0              1  
2           0              1  
3           0              1  
4           0              1  


  df["is_holiday"] = df["date"].isin(kr_holidays).astype(int)


In [4]:
# ==============================================
# 2. ForecastDFDataset으로 변환
# ==============================================
forecast_horizon = 7
context_length = 28

# 학습/검증 데이터 분리
split_date = df['date'].max() - pd.Timedelta(days=forecast_horizon * 2)
train_data = df[df['date'] < split_date]
valid_data = df  # 검증 데이터는 전체 사용

# ForecastDFDataset 생성
train_dataset = ForecastDFDataset(
    train_data,
    id_columns=["store_menu_id"],
    timestamp_column="date",
    target_columns=["sales_log"],
    control_columns=["is_holiday", "is_weekend", "is_ski_season"],
    context_length=context_length,
    prediction_length=forecast_horizon,
)

valid_dataset = ForecastDFDataset(
    valid_data,
    id_columns=["store_menu_id"],
    timestamp_column="date",
    target_columns=["sales_log"],
    control_columns=["is_holiday", "is_weekend", "is_ski_season"],
    context_length=context_length,
    prediction_length=forecast_horizon,
)

print("데이터셋 변환 완료 ✅")
print("train_dataset 길이:", len(train_dataset))
print("valid_dataset 길이:", len(valid_dataset))

데이터셋 변환 완료 ✅
train_dataset 길이: 93219
valid_dataset 길이: 96114


## 모델 및 학습 설정

In [5]:
# ==============================================
# 3. PatchTST 모델 및 학습 설정 (Hugging Face 코드)
# ==============================================
config = PatchTSTConfig(
    # --- 데이터 관련 설정 ---
    num_input_channels=4, # sales + 3 known covariates
    context_length=context_length,
    prediction_length=forecast_horizon,
    # 💡 시간에 따라 변하는 외부 변수의 개수
    num_time_varying_known_reals=3, # is_holiday, is_weekend, is_ski_season

    # --- Entity Embedding 관련 설정 ---
    # 💡 고유 ID를 embedding 하기 위한 설정
    num_static_categorical_features=1, # store_menu_id 1개
    cardinality=[num_entities],      # store_menu_id의 고유값 개수
    embedding_dimension=[32],        # store_menu_id를 32차원으로 임베딩

    # --- 모델 구조 설정 ---
    patch_length=8,
    patch_stride=8,
    d_model=128,
    num_attention_heads=16,
    num_hidden_layers=3,
    ffn_dim=256,
    dropout=0.2,
    head_dropout=0.2,
    scaling="std",
    loss="mse",
)

#model = PatchTSTForPrediction(config)
# 기존 구성
base_model = PatchTSTForPrediction(config)

# 래핑
model = PatchTSTSalesOnly(base_model, target_ch=0)

training_args = TrainingArguments(
    output_dir="./patchtst_sales_forecast",
    overwrite_output_dir=True,
    num_train_epochs=50, # 예시로 에폭 수 줄임
    do_eval=True,
    eval_strategy="epoch",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    label_names=["future_values"],
    dataloader_pin_memory=True,
    use_mps_device=False,
)

# 그대로 Hugging Face Trainer 사용
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

### Optuna

In [8]:
import optuna
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from transformers.models.patchtst import PatchTSTConfig, PatchTSTForPrediction
import os, optuna

STUDY_NAME = "patchtst_sales_forecast"  # 원하는 이름 (기존과 동일해야 이어짐)
STORAGE = f"sqlite:///{os.path.abspath('./patchtst_sales_forecast/optuna.sqlite3')}"

# ---- 1) trial=None 안전한 helper ----
def s_cat(trial, name, choices, default):
    return trial.suggest_categorical(name, choices) if trial else default

def s_int(trial, name, low, high, default):
    return trial.suggest_int(name, low, high) if trial else default

def s_float(trial, name, low, high, default, log=False):
    return trial.suggest_float(name, low, high, log=log) if trial else default


# --- 1) 모델 생성 함수: trial로부터 아키텍처/하이퍼파라미터를 받아서 모델 구성 ---
def model_init(trial):
    # [디버깅] 이 함수가 호출될 때마다 실제 사용되는 값을 출력합니다.
    print(f"--- Optuna Trial: Creating model with context={context_length}, horizon={forecast_horizon} ---")
    # ⬇︎ 아키텍처 탐색 공간 (필요한 것만 남기고/늘려도 됨)
    d_model  = s_cat(trial, "d_model", [64, 128, 256], 128)
    # d_model로 나누어떨어지는 head만 허용
    heads_cand = [h for h in [4, 8, 16] if d_model % h == 0]
    num_heads = s_cat(trial, "num_attention_heads", heads_cand, heads_cand[0])
    num_layers = s_int(trial, "num_hidden_layers", 2, 4, 3)
    ffn_dim   = s_cat(trial, "ffn_dim", [128, 256, 512], 256)
    dropout   = s_float(trial, "dropout", 0.0, 0.3, 0.2)
    head_do   = s_float(trial, "head_dropout", 0.0, 0.3, 0.2)
    patch_choices = [1, 7]
    patch_len = s_cat(trial, "patch_length", patch_choices, 7)
    patch_str = patch_len  # stride=length 고정

    cfg = PatchTSTConfig(
        # --- 고정 (네 파이프라인) ---
        num_input_channels=4,
        context_length=context_length,
        prediction_length=forecast_horizon,
        num_time_varying_known_reals=3,
        num_static_categorical_features=1,
        cardinality=[num_entities],
        embedding_dimension=[32],
        scaling="std",
        loss="mse",
        # --- 탐색 대상 ---
        d_model=d_model,
        num_attention_heads=num_heads,
        num_hidden_layers=num_layers,
        ffn_dim=ffn_dim,
        dropout=dropout,
        head_dropout=head_do,
        patch_length=patch_len,
        patch_stride=patch_str,
    )
    import math
    def _eff(L,p,s): return p * math.ceil(L / s)
    def assert_no_padding(cfg):
        ec = _eff(cfg.context_length, cfg.patch_length, cfg.patch_stride)
        ep = _eff(cfg.prediction_length, cfg.patch_length, cfg.patch_stride)
        if (ec, ep) != (cfg.context_length, cfg.prediction_length):
            raise ValueError(f"padding: ctx {cfg.context_length}->{ec}, pred {cfg.prediction_length}->{ep} "
                            f"(p={cfg.patch_length}, s={cfg.patch_stride})")
    # model_init 내부에서 cfg 만든 직후 호출
    assert_no_padding(cfg)

    base = PatchTSTForPrediction(cfg)
    # sales 채널만 loss/예측하도록 만든 래퍼
    return PatchTSTSalesOnly(base, target_ch=0)

# --- 2) 학습 세팅 쪽 탐색 공간 (TrainingArguments) ---
def hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True),
        "weight_decay":  trial.suggest_float("weight_decay", 0.0, 0.1),
        "warmup_ratio":  trial.suggest_float("warmup_ratio", 0.0, 0.2),
        "lr_scheduler_type": trial.suggest_categorical(
            "lr_scheduler_type", ["linear", "cosine", "cosine_with_restarts", "polynomial"]
        ),
        # 필요시 배치/에폭도 탐색
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [32, 64, 96]),
        "per_device_eval_batch_size":  trial.suggest_categorical("per_device_eval_batch_size",  [32, 64, 96]),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 10, 40),
    }

# --- 3) 목표 메트릭 (작을수록 좋게) ---
def compute_objective(metrics):
    # eval_loss만 최소화
    return metrics["eval_loss"]

# --- 4) HPO용 트레이너: model이 아니라 model_init를 넘겨야 함! ---
trainer_hpo = Trainer(
    model_init=model_init,
    args=training_args,                # 네 기존 args (eval_strategy="epoch" 등 포함)
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.0)],
)

# --- 5) 탐색 실행 ---
best_run = trainer_hpo.hyperparameter_search(
    direction="minimize",
    backend="optuna",
    n_trials=30,                # 리소스에 맞게 늘리기/줄이기
    hp_space=hp_space,
    study_name=STUDY_NAME,
    storage=STORAGE,
    load_if_exists=True,
    compute_objective=compute_objective,
)
print("BEST:", best_run)
print("BEST params:", best_run.hyperparameters)


--- Optuna Trial: Creating model with context=28, horizon=7 ---


[I 2025-08-22 04:34:36,906] A new study created in RDB with name: patchtst_sales_forecast


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.6971,0.700241
2,0.6804,0.671609
3,0.6478,0.651422
4,0.6225,0.643771
5,0.6245,0.62693
6,0.6189,0.617874
7,0.6155,0.612385
8,0.5957,0.605289
9,0.6145,0.599652
10,0.6051,0.593043


[I 2025-08-22 04:59:00,551] Trial 0 finished with value: 0.5285714864730835 and parameters: {'learning_rate': 0.00010793239216264518, 'weight_decay': 0.03292088060709223, 'warmup_ratio': 0.04209295418061774, 'lr_scheduler_type': 'polynomial', 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 64, 'num_train_epochs': 25, 'd_model': 256, 'num_attention_heads': 8, 'num_hidden_layers': 2, 'ffn_dim': 128, 'dropout': 0.28414348884929624, 'head_dropout': 0.07745562098589157, 'patch_length': 1}. Best is trial 0 with value: 0.5285714864730835.


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.8541,0.802091
2,0.7973,0.802132
3,0.803,0.795645
4,0.7893,0.759923
5,0.7376,0.698147
6,0.698,0.69156
7,0.6919,0.692828
8,0.6959,0.694297


[I 2025-08-22 05:04:23,243] Trial 1 finished with value: 0.6942970156669617 and parameters: {'learning_rate': 4.3996753134659594e-05, 'weight_decay': 0.04991938585704693, 'warmup_ratio': 0.0005219290583927228, 'lr_scheduler_type': 'polynomial', 'per_device_train_batch_size': 96, 'per_device_eval_batch_size': 96, 'num_train_epochs': 33, 'd_model': 64, 'num_attention_heads': 4, 'num_hidden_layers': 2, 'ffn_dim': 256, 'dropout': 0.16749997123468874, 'head_dropout': 0.28141954238719524, 'patch_length': 1}. Best is trial 0 with value: 0.5285714864730835.


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.6731,0.667829
2,0.6732,0.654978
3,0.6507,0.648486
4,0.6356,0.64279
5,0.6389,0.638531
6,0.6362,0.637543
7,0.6393,0.63481
8,0.618,0.628272
9,0.6431,0.625247
10,0.6379,0.62243


[I 2025-08-22 05:39:05,143] Trial 2 finished with value: 0.5882025957107544 and parameters: {'learning_rate': 0.00016786953982498686, 'weight_decay': 0.06676438444844562, 'warmup_ratio': 0.0008439437765151103, 'lr_scheduler_type': 'polynomial', 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32, 'num_train_epochs': 25, 'd_model': 64, 'num_attention_heads': 4, 'num_hidden_layers': 4, 'ffn_dim': 512, 'dropout': 0.19787695048229345, 'head_dropout': 0.04941986357298593, 'patch_length': 7}. Best is trial 0 with value: 0.5285714864730835.


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.8033,0.80338
2,0.7343,0.704949
3,0.6911,0.697798
4,0.6741,0.684363
5,0.6784,0.674262
6,0.6698,0.673136
7,0.6677,0.669068
8,0.6463,0.660094
9,0.6707,0.650964
10,0.6576,0.649759


[I 2025-08-22 06:08:02,518] Trial 3 finished with value: 0.598393976688385 and parameters: {'learning_rate': 3.416390967184238e-05, 'weight_decay': 0.07941039674171749, 'warmup_ratio': 0.1467171600093583, 'lr_scheduler_type': 'cosine', 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 96, 'num_train_epochs': 30, 'd_model': 128, 'num_attention_heads': 8, 'num_hidden_layers': 2, 'ffn_dim': 256, 'dropout': 0.23949996716017746, 'head_dropout': 0.10244688278209647, 'patch_length': 1}. Best is trial 0 with value: 0.5285714864730835.


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.6793,0.669005
2,0.6579,0.65025
3,0.6295,0.633815
4,0.6357,0.625567
5,0.6288,0.609897
6,0.599,0.597087
7,0.6057,0.587092
8,0.5828,0.575296
9,0.5823,0.565579
10,0.5661,0.560395


[I 2025-08-22 06:16:48,854] Trial 4 finished with value: 0.5603946447372437 and parameters: {'learning_rate': 9.807525908257408e-05, 'weight_decay': 0.01953736085700001, 'warmup_ratio': 0.08017548092920589, 'lr_scheduler_type': 'polynomial', 'per_device_train_batch_size': 64, 'per_device_eval_batch_size': 64, 'num_train_epochs': 10, 'd_model': 256, 'num_attention_heads': 4, 'num_hidden_layers': 3, 'ffn_dim': 512, 'dropout': 0.2267275045821478, 'head_dropout': 0.029226778263806284, 'patch_length': 7}. Best is trial 0 with value: 0.5285714864730835.


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.6945,0.681628
2,0.6875,0.664072


[I 2025-08-22 06:18:45,471] Trial 5 pruned. 


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.9041,0.77119


[I 2025-08-22 06:19:36,348] Trial 6 pruned. 


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.7065,0.705405


[I 2025-08-22 06:20:35,021] Trial 7 pruned. 


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.7167,0.701302


[I 2025-08-22 06:21:25,648] Trial 8 pruned. 


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.686,0.672234


[I 2025-08-22 06:22:22,550] Trial 9 pruned. 


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.8606,0.717744
2,0.6903,0.697824
3,0.6796,0.690459
4,0.6594,0.648446
5,0.6478,0.636571
6,0.6347,0.627982
7,0.623,0.61879
8,0.6232,0.613101
9,0.6159,0.610492
10,0.594,0.606807


[I 2025-08-22 06:52:57,580] Trial 10 finished with value: 0.47738322615623474 and parameters: {'learning_rate': 0.00017345720094921798, 'weight_decay': 0.005301742879223668, 'warmup_ratio': 0.06832212087096515, 'lr_scheduler_type': 'polynomial', 'per_device_train_batch_size': 96, 'per_device_eval_batch_size': 32, 'num_train_epochs': 40, 'd_model': 256, 'num_attention_heads': 8, 'num_hidden_layers': 2, 'ffn_dim': 128, 'dropout': 0.2772002338612267, 'head_dropout': 0.10466284091613975, 'patch_length': 1}. Best is trial 10 with value: 0.47738322615623474.


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.8297,0.724937
2,0.688,0.683605
3,0.6671,0.676372
4,0.6443,0.635629
5,0.6369,0.626288
6,0.6247,0.615874
7,0.6128,0.611035
8,0.6135,0.605328
9,0.6067,0.600608
10,0.5838,0.5864


[I 2025-08-22 07:06:41,309] Trial 11 finished with value: 0.5499080419540405 and parameters: {'learning_rate': 0.00014071518301440942, 'weight_decay': 0.001046185997828207, 'warmup_ratio': 0.06916905989382355, 'lr_scheduler_type': 'polynomial', 'per_device_train_batch_size': 96, 'per_device_eval_batch_size': 32, 'num_train_epochs': 18, 'd_model': 256, 'num_attention_heads': 8, 'num_hidden_layers': 2, 'ffn_dim': 128, 'dropout': 0.2975755701467377, 'head_dropout': 0.09595622656965941, 'patch_length': 1}. Best is trial 10 with value: 0.47738322615623474.


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.8726,0.710243
2,0.6913,0.699942


[I 2025-08-22 07:08:14,478] Trial 12 pruned. 


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.7926,0.706969
2,0.6936,0.690428
3,0.6793,0.677782
4,0.6474,0.635964
5,0.639,0.633105
6,0.6254,0.622331
7,0.6136,0.607284
8,0.612,0.607183
9,0.6062,0.599517
10,0.5836,0.586377


[I 2025-08-22 07:40:30,448] Trial 13 finished with value: 0.4165944457054138 and parameters: {'learning_rate': 0.00023695284163224964, 'weight_decay': 0.02514135162873802, 'warmup_ratio': 0.05613506650397164, 'lr_scheduler_type': 'polynomial', 'per_device_train_batch_size': 96, 'per_device_eval_batch_size': 32, 'num_train_epochs': 38, 'd_model': 256, 'num_attention_heads': 8, 'num_hidden_layers': 3, 'ffn_dim': 128, 'dropout': 0.26321548724481086, 'head_dropout': 0.001565806978548051, 'patch_length': 1}. Best is trial 13 with value: 0.4165944457054138.


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.8203,0.714568
2,0.6898,0.694639


[I 2025-08-22 07:42:11,105] Trial 14 pruned. 


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.7943,0.719646
2,0.6947,0.701884


[I 2025-08-22 07:43:55,146] Trial 15 pruned. 


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.8986,0.713865
2,0.699,0.69203
3,0.6923,0.685917


[I 2025-08-22 07:46:40,944] Trial 16 pruned. 


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.8613,0.699288
2,0.6882,0.690864
3,0.6804,0.678848
4,0.6531,0.640322
5,0.6439,0.639923


[I 2025-08-22 07:51:03,308] Trial 17 pruned. 


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.946,0.768812


[I 2025-08-22 07:52:00,223] Trial 18 pruned. 


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.8573,0.729964


[I 2025-08-22 07:52:51,298] Trial 19 pruned. 


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.7625,0.734432


[I 2025-08-22 07:53:49,513] Trial 20 pruned. 


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.6961,0.687578


[I 2025-08-22 07:54:49,766] Trial 21 pruned. 


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.7033,0.722637


[I 2025-08-22 07:55:47,641] Trial 22 pruned. 


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.6977,0.692528


[I 2025-08-22 07:56:45,527] Trial 23 pruned. 


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.888,0.707333
2,0.6951,0.701137


[I 2025-08-22 07:58:19,070] Trial 24 pruned. 


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.7071,0.7489


[I 2025-08-22 07:59:26,966] Trial 25 pruned. 


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.8053,0.672881
2,0.6631,0.655926
3,0.6562,0.637411
4,0.6374,0.638001
5,0.6361,0.62033
6,0.6247,0.617106
7,0.6126,0.609648
8,0.6127,0.599591
9,0.6064,0.592296
10,0.5821,0.583395


[I 2025-08-22 08:20:03,453] Trial 26 finished with value: 0.4912770688533783 and parameters: {'learning_rate': 0.0001869249738107361, 'weight_decay': 0.059760763227940494, 'warmup_ratio': 0.04606620346311894, 'lr_scheduler_type': 'cosine', 'per_device_train_batch_size': 96, 'per_device_eval_batch_size': 32, 'num_train_epochs': 27, 'd_model': 256, 'num_attention_heads': 8, 'num_hidden_layers': 2, 'ffn_dim': 128, 'dropout': 0.2774114755291166, 'head_dropout': 0.0406579817457119, 'patch_length': 7}. Best is trial 13 with value: 0.4165944457054138.


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.8141,0.674277
2,0.6611,0.661028
3,0.6607,0.649148
4,0.6474,0.646006


[I 2025-08-22 08:23:36,927] Trial 27 pruned. 


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.8144,0.679086
2,0.6603,0.651571
3,0.6536,0.63932
4,0.6372,0.634629
5,0.6375,0.623367
6,0.6271,0.619032
7,0.6175,0.613741


[I 2025-08-22 08:29:14,350] Trial 28 pruned. 


--- Optuna Trial: Creating model with context=28, horizon=7 ---


Epoch,Training Loss,Validation Loss
1,0.8789,0.678473
2,0.6665,0.663906
3,0.6648,0.653196
4,0.6515,0.649212


[I 2025-08-22 08:32:21,596] Trial 29 pruned. 


BEST: BestRun(run_id='13', objective=0.4165944457054138, hyperparameters={'learning_rate': 0.00023695284163224964, 'weight_decay': 0.02514135162873802, 'warmup_ratio': 0.05613506650397164, 'lr_scheduler_type': 'polynomial', 'per_device_train_batch_size': 96, 'per_device_eval_batch_size': 32, 'num_train_epochs': 38, 'd_model': 256, 'num_attention_heads': 8, 'num_hidden_layers': 3, 'ffn_dim': 128, 'dropout': 0.26321548724481086, 'head_dropout': 0.001565806978548051, 'patch_length': 1}, run_summary=None)
BEST params: {'learning_rate': 0.00023695284163224964, 'weight_decay': 0.02514135162873802, 'warmup_ratio': 0.05613506650397164, 'lr_scheduler_type': 'polynomial', 'per_device_train_batch_size': 96, 'per_device_eval_batch_size': 32, 'num_train_epochs': 38, 'd_model': 256, 'num_attention_heads': 8, 'num_hidden_layers': 3, 'ffn_dim': 128, 'dropout': 0.26321548724481086, 'head_dropout': 0.001565806978548051, 'patch_length': 1}


### 최종 train

In [9]:
best = best_run.hyperparameters

# TrainingArguments 반영

args_dict = training_args.to_dict()
for k, v in best.items():
    if k in args_dict:
        args_dict[k] = v
best_args = TrainingArguments(**args_dict)

def model_init_best():
    # best 값으로 동일하게 구성
    trial_like = None
    # 그냥 model_init(None) 쓰면 기본값이 들어가므로,
    # 아래처럼 직접 config를 만드는 게 안전. (간단히는 best를 model_init에서 읽도록 바꿔도 OK)
    patch_length = best["patch_length"]
    cfg_best = PatchTSTConfig(
        num_input_channels=4,
        context_length=context_length,
        prediction_length=forecast_horizon,
        num_time_varying_known_reals=3,
        num_static_categorical_features=1,
        cardinality=[num_entities],
        embedding_dimension=[32],
        d_model=best["d_model"],
        num_attention_heads=best["num_attention_heads"],
        num_hidden_layers=best["num_hidden_layers"],
        ffn_dim=best["ffn_dim"],
        dropout=best["dropout"],
        head_dropout=best["head_dropout"],
        patch_length=patch_length,
        patch_stride=patch_length,
        scaling="std",
        loss="mse",
    )

    # 안전가드
    import math
    def _eff(L,p,s): return p * math.ceil(L/s)
    assert _eff(cfg_best.context_length, cfg_best.patch_length, cfg_best.patch_stride) == cfg_best.context_length
    assert _eff(cfg_best.prediction_length, cfg_best.patch_length, cfg_best.patch_stride) == cfg_best.prediction_length

    base = PatchTSTForPrediction(cfg_best)
    return PatchTSTSalesOnly(base, target_ch=0)

final_trainer = Trainer(
    model_init=model_init_best,
    args=best_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)
final_trainer.train()

# 훈련 직후
SAVE_DIR = "./patchtst_sales_forecast_best/base"   # 새 폴더
final_trainer.model.base.save_pretrained(SAVE_DIR) # ★ config.json까지 생성됨
print("saved to:", SAVE_DIR)



Epoch,Training Loss,Validation Loss
1,0.7926,0.706969
2,0.6936,0.690428
3,0.6793,0.677782
4,0.6474,0.635964
5,0.639,0.633105
6,0.6254,0.622331
7,0.6136,0.607284
8,0.612,0.607183
9,0.6062,0.599517
10,0.5836,0.586377


saved to: ./patchtst_sales_forecast_best/base


In [10]:
best.items()

dict_items([('learning_rate', 0.00023695284163224964), ('weight_decay', 0.02514135162873802), ('warmup_ratio', 0.05613506650397164), ('lr_scheduler_type', 'polynomial'), ('per_device_train_batch_size', 96), ('per_device_eval_batch_size', 32), ('num_train_epochs', 38), ('d_model', 256), ('num_attention_heads', 8), ('num_hidden_layers', 3), ('ffn_dim', 128), ('dropout', 0.26321548724481086), ('head_dropout', 0.001565806978548051), ('patch_length', 1)])

## Predict

In [None]:
import numpy as np
import torch

def _to_numpy(x):
    return x.detach().cpu().numpy() if isinstance(x, torch.Tensor) else x

def _pick_pred_array(preds, horizon=forecast_horizon, target_ch=0):
    """
    pred_output.predictions가 tuple/list/dict/object ndarray인 다양한 경우를 모두 커버해서
    (N, horizon) 형태의 sales 채널만 꺼내 반환.
    """
    # dict-like
    if isinstance(preds, dict):
        for k in ["predictions", "logits", "prediction_outputs", "y_hat", "forecast"]:
            if k in preds:
                arr = _to_numpy(preds[k])
                if isinstance(arr, np.ndarray):
                    return arr

    # tuple/list
    if isinstance(preds, (list, tuple)):
        for x in preds:
            arr = _to_numpy(x)
            if isinstance(arr, np.ndarray) and arr.ndim >= 2:
                return arr

    # numpy object 배열 (ragged)
    if isinstance(preds, np.ndarray) and preds.dtype == object:
        for x in preds.tolist():
            arr = _to_numpy(x)
            if isinstance(arr, np.ndarray) and arr.ndim >= 2:
                return arr

    # 이미 ndarray인 경우
    if isinstance(preds, np.ndarray):
        return preds

    # 마지막 수단
    arr = np.asarray(preds, dtype=object)
    raise ValueError(f"예측 배열을 추출하지 못함: type={type(preds)}, dtype={getattr(arr,'dtype',None)}")

'''
🏷️ 매출 예측이라면?
소수점 0.5 기준 반올림 (np.rint) 이 가장 많이 씁니다.
다만 0.07, 0.08 같은 작은 값들이 실제로는 “0건 매출”인 경우가 많기 때문에, 
임계값(threshold) 규칙을 추가하면 더 좋아요.
'''

def round_with_threshold(x, threshold=0.3):
    if x < threshold:
        return 0
    return int(np.rint(x))

#df_result["y_pred_int"] = df_result["y_pred"].apply(round_with_threshold)

In [12]:
import pandas as pd
import numpy as np
import torch
from transformers import Trainer, AutoConfig
from transformers.models.patchtst.modeling_patchtst import PatchTSTForPrediction


# 1. 학습 시 TrainingArguments의 output_dir에 저장된 'best' 모델 경로를 지정합니다.
#    보통 output_dir 내부에 'checkpoint-...' 형태의 폴더로 저장됩니다.
MODEL_PATH = "./patchtst_sales_forecast_best/base"

# 2. 저장된 경로에서 config.json을 명시적으로 먼저 불러옵니다.
print(f"Loading configuration from: {MODEL_PATH}")
config = AutoConfig.from_pretrained(MODEL_PATH)

# 3. 위에서 불러온 config 객체를 사용하여 모델을 생성합니다.
# 이렇게 하면 context_length 등이 올바르게 설정됩니다.
print("Loading model with specified configuration...")
model = PatchTSTForPrediction.from_pretrained(MODEL_PATH, config=config)

# 3. 예측 전용 Trainer를 생성합니다.
trainer = Trainer(model=model)

print(f"✅ 모델 로드 완료: {MODEL_PATH}")

# 2) 모델이 기대하는 길이 읽기
CTX = getattr(model.config, "context_length", None) or getattr(model.config, "sequence_length", None)
H   = getattr(model.config, "prediction_length", None)
print(f"model expects context_length={CTX}, prediction_length={H}, num_input_channels={model.config.num_input_channels}")

Loading configuration from: ./patchtst_sales_forecast_best/base
Loading model with specified configuration...
✅ 모델 로드 완료: ./patchtst_sales_forecast_best/base
model expects context_length=28, prediction_length=7, num_input_channels=4


In [None]:
import os

path = "./dataset/test"
files = os.listdir(path)
rows = []

for file in files:

    test_df = pd.read_csv(os.path.join(path, file))
    test_df.columns = ["date", "store_menu", "sales"]
    test_df["date"] = pd.to_datetime(test_df["date"])

    test_df.loc[test_df['sales'] < 0, 'sales'] = 0
    test_df["sales"] = test_df["sales"].astype(float)
    test_df["sales_log"] = np.log1p(test_df["sales"])     # target은 이제 sales_log

    # 기존 인코더 사용 (encoder는 train 단계에서 fit된 걸 그대로 써야 consistency 보장)
    test_df["store_menu_id"] = encoder.transform(test_df["store_menu"])

    # 동일한 feature 생성
    kr_holidays = holidays.KR(years=test_df['date'].dt.year.unique())
    test_df["is_holiday"] = test_df["date"].isin(kr_holidays).astype(int)
    test_df["is_weekend"] = test_df["date"].dt.day_of_week.isin([5, 6]).astype(int)
    test_df["is_ski_season"] = test_df["date"].dt.month.isin([12, 1, 2]).astype(int)

    print(f"{file}테스트 데이터 전처리 완료")

    # ==============================================
    # 2. ForecastDFDataset 변환
    # ==============================================
    test_dataset = ForecastDFDataset(
        test_df,
        id_columns=["store_menu_id"],
        timestamp_column="date",
        target_columns=["sales_log"],
        control_columns=["is_holiday", "is_weekend", "is_ski_season"],
        context_length=context_length,
        prediction_length=forecast_horizon,
    )

    print("테스트 데이터셋 길이:", len(test_dataset))

    # ==============================================
    # 3. 예측 실행 (견고 추출 버전)
    # ==============================================
    pred_output = trainer.predict(test_dataset)
    preds_raw = pred_output.predictions  # 컨테이너일 수 있음


    arr = _pick_pred_array(preds_raw, horizon=forecast_horizon, target_ch=0)

    # ---- (N, 7)로 정규화 ----
    if arr.ndim == 3:
        # 흔한 케이스 1: (N, horizon, C)
        if arr.shape[-2] == forecast_horizon:
            arr = arr[..., 0]                 # sales 채널만
        # 흔한 케이스 2: (N, C, horizon)
        elif arr.shape[-1] == forecast_horizon:
            arr = arr[:, 0, :]                # sales 채널만
        # 백업: 두 번째 축이 horizon이면 3번째 축을 잘라본다
        elif arr.shape[1] == forecast_horizon:
            arr = arr[:, :, 0]
        else:
            raise ValueError(f"예상 밖 3D shape: {arr.shape}")
    elif arr.ndim == 2:
        # (horizon, N) 이면 전치
        if arr.shape[0] == forecast_horizon and arr.shape[1] != forecast_horizon:
            arr = arr.T

    # 이제 (N, 7)이어야 정상
    assert arr.ndim == 2 and arr.shape[1] == forecast_horizon, f"정규화 실패: {arr.shape}"

    # 로그 역변환 + 음수 방지
    y_pred_log = arr
    y_pred_sales = np.expm1(y_pred_log)
    y_pred_sales = np.clip(y_pred_sales, 0, None)

    # 7일 미래 날짜
    last_date = test_df["date"].max()
    future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1),
                                periods=forecast_horizon)

    # 매장명 순서 고정 (id 정렬)
    keys_df = (
        test_df.sort_values(["store_menu_id", "date"])
            .drop_duplicates("store_menu_id")[["store_menu_id", "store_menu"]]
    )
    store_names = keys_df["store_menu"].to_numpy()

    # N 검증
    assert y_pred_sales.shape[0] == len(store_names), (
        f"N 불일치: preds={y_pred_sales.shape[0]} vs stores={len(store_names)}"
    )

    # 매장×7일 테이블
    file_name = file.split(".c")[0]
    for store_name, pred_row in zip(store_names, y_pred_sales):   # pred_row: (7,)
        for day_num, (d, yhat) in enumerate(zip(future_dates, pred_row), start=1):
            date_str = f"{file_name}+{day_num}일"
            rows.append({"date": date_str, "store_menu": store_name, "y_pred": float(yhat)})

df_result = pd.DataFrame(rows).pivot(index="date", columns="store_menu", values="y_pred")
#df_result["y_pred_int"] = df_result["y_pred"].apply(round_with_threshold)  # 결과 정수화를 원할 경우 사용
df_result.index.name = "영업일자"
df_result.to_csv("test_predictions.csv", index=True, encoding="utf-8-sig")

print("예측 테이블 shape:", df_result.shape)  # (7, 매장수)
print(df_result.head(7))


  test_df["is_holiday"] = test_df["date"].isin(kr_holidays).astype(int)


TEST_06.csv테스트 데이터 전처리 완료


테스트 데이터셋 길이: 193


TEST_05.csv테스트 데이터 전처리 완료


  test_df["is_holiday"] = test_df["date"].isin(kr_holidays).astype(int)


테스트 데이터셋 길이: 193


TEST_04.csv테스트 데이터 전처리 완료


  test_df["is_holiday"] = test_df["date"].isin(kr_holidays).astype(int)


테스트 데이터셋 길이: 193


TEST_09.csv테스트 데이터 전처리 완료


  test_df["is_holiday"] = test_df["date"].isin(kr_holidays).astype(int)


테스트 데이터셋 길이: 193


TEST_08.csv테스트 데이터 전처리 완료


  test_df["is_holiday"] = test_df["date"].isin(kr_holidays).astype(int)


테스트 데이터셋 길이: 193


TEST_07.csv테스트 데이터 전처리 완료


  test_df["is_holiday"] = test_df["date"].isin(kr_holidays).astype(int)


테스트 데이터셋 길이: 193


TEST_02.csv테스트 데이터 전처리 완료


  test_df["is_holiday"] = test_df["date"].isin(kr_holidays).astype(int)


테스트 데이터셋 길이: 193


TEST_03.csv테스트 데이터 전처리 완료


  test_df["is_holiday"] = test_df["date"].isin(kr_holidays).astype(int)


테스트 데이터셋 길이: 193


TEST_01.csv테스트 데이터 전처리 완료


  test_df["is_holiday"] = test_df["date"].isin(kr_holidays).astype(int)


테스트 데이터셋 길이: 193


TEST_00.csv테스트 데이터 전처리 완료


  test_df["is_holiday"] = test_df["date"].isin(kr_holidays).astype(int)


테스트 데이터셋 길이: 193


예측 테이블 shape: (70, 193)
store_menu  느티나무 셀프BBQ_1인 수저세트  느티나무 셀프BBQ_BBQ55(단체)  느티나무 셀프BBQ_대여료 30,000원  \
영업일자                                                                           
TEST_00+1일            1.723788              0.234112                1.796696   
TEST_00+2일            1.250817             12.516625                0.828242   
TEST_00+3일            1.545145              5.365510                1.399596   
TEST_00+4일            1.616241              4.484167                1.634028   
TEST_00+5일            2.740002             36.775391                1.350462   
TEST_00+6일            4.593117             17.783175                5.138574   
TEST_00+7일            6.441693              0.000000                7.540094   

store_menu  느티나무 셀프BBQ_대여료 60,000원  느티나무 셀프BBQ_대여료 90,000원  \
영업일자                                                         
TEST_00+1일                2.869678                0.587767   
TEST_00+2일                0.902708                0.178363   
TEST_00