## Import

In [None]:
import os
import random
import glob
import re

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler

import torch
import torch.nn as nn
from tqdm import tqdm


## Fix Random Seed & Set Hyperparameters

In [None]:
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Writes the seed to the ``PYTHONHASHSEED`` environment variable and seeds
    Python's ``random`` module, NumPy's global RNG and PyTorch. When CUDA is
    available, the CUDA generators are seeded as well and cuDNN is switched
    to deterministic mode with benchmarking disabled.

    Args:
        seed: Integer seed shared by all generators. Defaults to 42.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Nothing GPU-specific to configure on a CPU-only runtime.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


# Seed everything once, ahead of the data-loading and training cells.
set_seed(42)

In [None]:
# Windowing and training-schedule constants shared by the training and prediction cells.
LOOKBACK = 28    # number of consecutive past rows fed to the model per sample
PREDICT = 7      # number of future steps the model outputs at once
BATCH_SIZE = 16  # mini-batch size used in the training loop
EPOCHS = 50      # passes over each series' training windows

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device('cpu' if not torch.cuda.is_available() else 'cuda')

## Data Load

In [None]:
# Colab-only step: upload train.csv into the runtime's working directory.
# `files` is reused by the later upload cells, so this cell must run first.
from google.colab import files
uploaded = files.upload()

Saving train.csv to train.csv


In [None]:
train = pd.read_csv('train.csv')

# Parse the business-date column so the .dt accessors below can be used.
train['영업일자'] = pd.to_datetime(train['영업일자'])

# Calendar features derived from the business date.
train = train.assign(
    year=lambda df: df['영업일자'].dt.year,
    month=lambda df: df['영업일자'].dt.month,
    day=lambda df: df['영업일자'].dt.day,
    dayofweek=lambda df: df['영업일자'].dt.dayofweek,
    # dayofweek values 5 and 6 are flagged as weekend.
    is_weekend=lambda df: df['dayofweek'].isin([5, 6]).astype(int),
    weekofyear=lambda df: df['영업일자'].dt.isocalendar().week.astype(int),
)

# Replace negative sales quantities with zero.
train.loc[train['매출수량'] < 0, '매출수량'] = 0

## Define Model

In [None]:
class MultiOutputLSTM(nn.Module):
    """Stacked LSTM followed by a linear head that emits all outputs at once.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_dim: Number of values produced per input sequence.
    """

    def __init__(self, input_dim=3, hidden_dim=64, num_layers=2, output_dim=7):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, time, feature).
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Map a batch of sequences to ``output_dim`` values per sequence.

        Only the LSTM output at the last time step is passed to the linear head.
        """
        sequence_out = self.lstm(x)[0]
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

## Train

In [None]:
def train_lstm(train_df):
    """Train one ``MultiOutputLSTM`` per store/menu series.

    For every group of ``train_df`` keyed by ``영업장명_메뉴명`` the rows are
    sorted by ``영업일자``, the three input features are min-max scaled per
    series, and sliding windows of ``LOOKBACK`` rows are paired with the
    ``PREDICT`` scaled sales quantities that follow them. A fresh model is
    then trained on those windows for ``EPOCHS`` epochs with Adam (lr=0.001)
    and MSE loss, using shuffled mini-batches of ``BATCH_SIZE``.

    Args:
        train_df: DataFrame with the columns ``영업장명_메뉴명``, ``영업일자``,
            ``매출수량``, ``dayofweek`` and ``is_weekend``.

    Returns:
        dict mapping each group key, exactly as ``groupby`` yields it, to a
        dict with:
          * ``'model'``: the trained model, switched to eval mode,
          * ``'scaler'``: the ``MinMaxScaler`` fitted on that series,
          * ``'last_sequence'``: the last ``LOOKBACK`` scaled feature rows.
        Series with fewer than ``LOOKBACK + PREDICT`` rows are skipped.
    """
    # Column 0 (sales quantity) is the prediction target; all three columns
    # are model inputs.
    features = ['매출수량', 'dayofweek', 'is_weekend']
    trained_models = {}

    # NOTE: grouping by a one-element list is kept on purpose. predict_lstm
    # groups the same way, so the keys line up (the notebook output shows them
    # as 1-tuples such as ('store_menu',)).
    for store_menu, group in tqdm(train_df.groupby(['영업장명_메뉴명']), desc ='Training LSTM'):
        store_train = group.sort_values('영업일자')
        if len(store_train) < LOOKBACK + PREDICT:
            continue

        scaler = MinMaxScaler()
        # Fit on the DataFrame slice so the scaler records the feature names.
        train_vals = np.asarray(scaler.fit_transform(store_train[features]))  # shape: (N, len(features))

        # Build the sliding windows: LOOKBACK rows of every feature as input,
        # the next PREDICT scaled sales quantities (column 0) as target.
        # The length check above guarantees at least one window.
        n_windows = len(train_vals) - LOOKBACK - PREDICT + 1
        X_np = np.stack([train_vals[i:i + LOOKBACK, :] for i in range(n_windows)])
        y_np = np.stack([
            train_vals[i + LOOKBACK:i + LOOKBACK + PREDICT, 0]
            for i in range(n_windows)
        ])

        # Convert from a single ndarray: building a tensor from a Python list
        # of ndarrays is the slow path PyTorch warns about.
        X_train = torch.as_tensor(X_np, dtype=torch.float32, device=DEVICE)
        y_train = torch.as_tensor(y_np, dtype=torch.float32, device=DEVICE)

        model = MultiOutputLSTM(input_dim=len(features), output_dim=PREDICT).to(DEVICE)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        criterion = nn.MSELoss()

        model.train()
        for epoch in range(EPOCHS):
            # Fresh shuffle of the window order every epoch.
            idx = torch.randperm(len(X_train))
            for i in range(0, len(X_train), BATCH_SIZE):
                batch_idx = idx[i:i+BATCH_SIZE]
                X_batch, y_batch = X_train[batch_idx], y_train[batch_idx]
                output = model(X_batch)
                loss = criterion(output, y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        trained_models[store_menu] = {
            'model': model.eval(),
            'scaler': scaler,
            'last_sequence': train_vals[-LOOKBACK:, :]
        }

    return trained_models

In [None]:
# Train one model per store/menu series.
# The recorded run below took about 13 minutes for 193 series.
trained_models = train_lstm(train)

  X_train = torch.tensor(X_train).float().to(DEVICE)
Training LSTM: 100%|██████████| 193/193 [13:03<00:00,  4.06s/it]


## Prediction

In [None]:
def predict_lstm(test_df, trained_models, test_prefix: str):
    """Forecast the next ``PREDICT`` sales quantities for each store/menu series.

    For every group of ``test_df`` keyed by ``영업장명_메뉴명`` that has a trained
    model, the last ``LOOKBACK`` rows (ordered by ``영업일자``) are scaled with
    that series' scaler and fed to the model. The predictions are mapped back
    to the original sales scale and negative values are clamped to zero.

    Args:
        test_df: DataFrame with the columns ``영업장명_메뉴명``, ``영업일자``,
            ``매출수량``, ``dayofweek`` and ``is_weekend``.
        trained_models: Mapping returned by ``train_lstm``; each entry holds
            at least ``'model'`` and ``'scaler'``.
        test_prefix: Label prefix such as ``"TEST_00"`` used to build the
            ``"<prefix>+<k>일"`` date labels.

    Returns:
        DataFrame with the columns ``영업일자`` (date label),
        ``영업장명_메뉴명`` (group key as yielded by ``groupby``) and ``매출수량``
        (restored prediction). Series without a trained model or with fewer
        than ``LOOKBACK`` rows are skipped; the frame is empty if every series
        is skipped.
    """
    features = ['매출수량', 'dayofweek', 'is_weekend']
    # Prediction labels: "<prefix>+1일" ... "<prefix>+<PREDICT>일"; identical
    # for every series, so build them once.
    pred_dates = [f"{test_prefix}+{i+1}일" for i in range(PREDICT)]
    results = []

    for store_menu, store_test in test_df.groupby(['영업장명_메뉴명']):
        # Depending on the pandas version, grouping by a one-element list
        # yields either 'name' or ('name',). Try the key as-is first, then
        # the other spelling, so the lookup does not depend on that detail.
        entry = trained_models.get(store_menu)
        if entry is None:
            if isinstance(store_menu, tuple) and len(store_menu) == 1:
                alt_key = store_menu[0]
            else:
                alt_key = (store_menu,)
            entry = trained_models.get(alt_key)
        if entry is None:
            continue

        model = entry['model']
        scaler = entry['scaler']

        store_test_sorted = store_test.sort_values('영업일자')
        if len(store_test_sorted) < LOOKBACK:
            continue

        recent_df = store_test_sorted[features].iloc[-LOOKBACK:]

        # Scale with the per-series scaler. Passing the DataFrame (not
        # .values) keeps the feature names the scaler was fitted with.
        recent_vals_scaled = np.asarray(scaler.transform(recent_df))
        # Add the batch axis on the ndarray before converting to a tensor.
        x_input = torch.as_tensor(
            recent_vals_scaled[np.newaxis, ...], dtype=torch.float32, device=DEVICE
        )

        with torch.no_grad():
            # Index the single batch element instead of squeeze() so the
            # result stays 1-D even when PREDICT == 1.
            pred_scaled = model(x_input)[0].cpu().numpy()

        # Inverse scaling: the scaler expects every feature column, so put
        # the predictions in column 0 (sales quantity) of a zero matrix and
        # read that column back. Negative results are clamped to zero.
        dummy_input = np.zeros((PREDICT, len(features)))
        dummy_input[:, 0] = pred_scaled
        restored = np.clip(scaler.inverse_transform(dummy_input)[:, 0], 0, None)

        for d, val in zip(pred_dates, restored):
            results.append({
                '영업일자': d,
                '영업장명_메뉴명': store_menu,
                '매출수량': val
            })

    return pd.DataFrame(results)


In [None]:
# Upload the evaluation files (TEST_00.csv ... TEST_09.csv in the recorded run).
uploaded = files.upload()

Saving TEST_00.csv to TEST_00.csv
Saving TEST_01.csv to TEST_01.csv
Saving TEST_02.csv to TEST_02.csv
Saving TEST_03.csv to TEST_03.csv
Saving TEST_04.csv to TEST_04.csv
Saving TEST_05.csv to TEST_05.csv
Saving TEST_06.csv to TEST_06.csv
Saving TEST_07.csv to TEST_07.csv
Saving TEST_08.csv to TEST_08.csv
Saving TEST_09.csv to TEST_09.csv


In [None]:
import shutil

# Collect the uploaded TEST_0x.csv files under ./test.
os.makedirs('test', exist_ok=True)

for i in range(10):
    src = f'TEST_0{i}.csv'
    dst = f'test/TEST_0{i}.csv'
    if os.path.exists(src):
        shutil.move(src, dst)
    elif not os.path.exists(dst):
        # Neither uploaded nor already moved: the file is genuinely missing.
        raise FileNotFoundError(f'{src} was not uploaded and {dst} does not exist')
    # Otherwise the file was moved by an earlier run of this cell, so
    # re-running the cell is a no-op instead of an error.

test_files = sorted(glob.glob('./test/TEST_*.csv'))
print(test_files)  # sanity check: the list should contain every test file

['./test/TEST_00.csv', './test/TEST_01.csv', './test/TEST_02.csv', './test/TEST_03.csv', './test/TEST_04.csv', './test/TEST_05.csv', './test/TEST_06.csv', './test/TEST_07.csv', './test/TEST_08.csv', './test/TEST_09.csv']


In [None]:
# Run the trained models over every test file and stack the forecasts.
all_preds = []
test_files = sorted(glob.glob('./test/TEST_*.csv'))

for path in test_files:
    test_df = pd.read_csv(path)
    test_df['영업일자'] = pd.to_datetime(test_df['영업일자'])

    # Same calendar features as the training frame.
    test_df = test_df.assign(
        year=lambda df: df['영업일자'].dt.year,
        month=lambda df: df['영업일자'].dt.month,
        day=lambda df: df['영업일자'].dt.day,
        dayofweek=lambda df: df['영업일자'].dt.dayofweek,
        is_weekend=lambda df: df['dayofweek'].isin([5, 6]).astype(int),
        weekofyear=lambda df: df['영업일자'].dt.isocalendar().week.astype(int),
    )

    # Replace negative sales quantities with zero.
    test_df.loc[test_df['매출수량'] < 0, '매출수량'] = 0

    # Extract the label prefix from the file name (e.g. TEST_00).
    filename = os.path.basename(path)
    prefix_match = re.search(r'(TEST_\d+)', filename)
    test_prefix = prefix_match.group(1)

    pred_df = predict_lstm(test_df, trained_models, test_prefix)
    all_preds.append(pred_df)

full_pred_df = pd.concat(all_preds, ignore_index=True)



In [None]:
# Quick sanity check of the stacked predictions: first rows, column names,
# and the number of missing values per column.
print(full_pred_df.head())
print(full_pred_df.columns)
print(full_pred_df.isna().sum())

         영업일자               영업장명_메뉴명      매출수량
0  TEST_00+1일  (느티나무 셀프BBQ_1인 수저세트,)  7.393426
1  TEST_00+2일  (느티나무 셀프BBQ_1인 수저세트,)  1.613394
2  TEST_00+3일  (느티나무 셀프BBQ_1인 수저세트,)  2.793552
3  TEST_00+4일  (느티나무 셀프BBQ_1인 수저세트,)  4.292424
4  TEST_00+5일  (느티나무 셀프BBQ_1인 수저세트,)  5.814300
Index(['영업일자', '영업장명_메뉴명', '매출수량'], dtype='object')
영업일자        0
영업장명_메뉴명    0
매출수량        0
dtype: int64


In [None]:
def convert_to_submission_format(pred_df: pd.DataFrame, sample_submission: pd.DataFrame):
    """Fill a copy of the sample submission with the long-format predictions.

    Args:
        pred_df: Long-format predictions with the columns ``영업일자`` (date
            label), ``영업장명_메뉴명`` (series name, either a plain string or a
            1-tuple wrapping it) and ``매출수량`` (predicted quantity).
        sample_submission: Wide template whose first column holds the date
            labels and whose remaining columns are named after the series.

    Returns:
        A copy of ``sample_submission`` in which every (date, series) cell
        holds the prediction truncated to ``int``, or 0 when no prediction
        exists for that cell. ``sample_submission`` itself is not modified.
    """
    def _series_name(key):
        # Series names may arrive as 1-tuples such as ('store_menu',), which is
        # how groupby over a one-element list reports them. Unwrap them so
        # they compare equal to the template's column names.
        if isinstance(key, tuple) and len(key) == 1:
            return key[0]
        return key

    # (date label, series name) -> predicted quantity
    pred_dict = {
        (date, _series_name(menu)): quantity
        for date, menu, quantity in zip(
            pred_df['영업일자'], pred_df['영업장명_메뉴명'], pred_df['매출수량']
        )
    }

    final_df = sample_submission.copy()

    for row_idx in final_df.index:
        date = final_df.loc[row_idx, '영업일자']
        for col in final_df.columns[1:]:  # every column after the date is a series
            value = pred_dict.get((date, col), 0)
            # int() truncates toward zero, matching the established output format.
            final_df.loc[row_idx, col] = int(value)

    return final_df

In [None]:
# Upload sample_submission.csv, the template read by the next cell.
uploaded = files.upload()

Saving sample_submission.csv to sample_submission.csv


In [None]:
# Load the submission template, fill it with the predictions and save it.
sample_submission = pd.read_csv('./sample_submission.csv')
submission = convert_to_submission_format(full_pred_df, sample_submission)
# NOTE(review): 'utf-8-sig' is presumably chosen so the Korean headers open
# correctly in spreadsheet tools — confirm against the submission requirements.
submission.to_csv('0809_try2_submission.csv', index=False, encoding='utf-8-sig')

In [None]:
# Peek at the first template columns: the date column followed by series names.
print(sample_submission.columns[:5])

Index(['영업일자', '느티나무 셀프BBQ_1인 수저세트', '느티나무 셀프BBQ_BBQ55(단체)',
       '느티나무 셀프BBQ_대여료 30,000원', '느티나무 셀프BBQ_대여료 60,000원'],
      dtype='object')


In [None]:
# Only the last expression of a cell is rendered, so the head() result is
# discarded here and the output below shows the dtypes only.
submission.head()
submission.dtypes

Unnamed: 0,0
영업일자,object
느티나무 셀프BBQ_1인 수저세트,int64
느티나무 셀프BBQ_BBQ55(단체),int64
"느티나무 셀프BBQ_대여료 30,000원",int64
"느티나무 셀프BBQ_대여료 60,000원",int64
...,...
화담숲카페_메밀미숫가루,int64
화담숲카페_아메리카노 HOT,int64
화담숲카페_아메리카노 ICE,int64
화담숲카페_카페라떼 ICE,int64


In [None]:
# 1. Inspect the columns of the submission frame and of the prediction frame
print("샘플 제출 파일 컬럼:", submission.columns.tolist())
print("예측 결과 컬럼:", full_pred_df.columns.tolist())

# 2. Check that the predictions are not all zero
print(full_pred_df.describe())

# 3. Copy the predictions into `submission` and inspect the result.
# Map (date label, series name) -> predicted quantity. The series names in
# full_pred_df appear as 1-tuples such as ('store_menu',) in the output above,
# while the submission columns are plain strings. Unwrap the tuples so the
# lookups below actually match; without this every cell falls back to 0.
pred_dict = {
    (date, menu[0] if isinstance(menu, tuple) and len(menu) == 1 else menu): quantity
    for date, menu, quantity in zip(
        full_pred_df['영업일자'], full_pred_df['영업장명_메뉴명'], full_pred_df['매출수량']
    )
}

# Fill every (date, series) cell of the submission frame in place.
for row_idx in submission.index:
    date = submission.loc[row_idx, '영업일자']
    for col in submission.columns[1:]:  # every column after the date is a series
        # Default to 0 when no prediction exists for this cell.
        val = pred_dict.get((date, col), 0)
        # int() truncates toward zero.
        submission.loc[row_idx, col] = int(val)

print(submission.head())

샘플 제출 파일 컬럼: ['영업일자', '느티나무 셀프BBQ_1인 수저세트', '느티나무 셀프BBQ_BBQ55(단체)', '느티나무 셀프BBQ_대여료 30,000원', '느티나무 셀프BBQ_대여료 60,000원', '느티나무 셀프BBQ_대여료 90,000원', '느티나무 셀프BBQ_본삼겹 (단품,실내)', '느티나무 셀프BBQ_스프라이트 (단체)', '느티나무 셀프BBQ_신라면', '느티나무 셀프BBQ_쌈야채세트', '느티나무 셀프BBQ_쌈장', '느티나무 셀프BBQ_육개장 사발면', '느티나무 셀프BBQ_일회용 소주컵', '느티나무 셀프BBQ_일회용 종이컵', '느티나무 셀프BBQ_잔디그늘집 대여료 (12인석)', '느티나무 셀프BBQ_잔디그늘집 대여료 (6인석)', '느티나무 셀프BBQ_잔디그늘집 의자 추가', '느티나무 셀프BBQ_참이슬 (단체)', '느티나무 셀프BBQ_친환경 접시 14cm', '느티나무 셀프BBQ_친환경 접시 23cm', '느티나무 셀프BBQ_카스 병(단체)', '느티나무 셀프BBQ_콜라 (단체)', '느티나무 셀프BBQ_햇반', '느티나무 셀프BBQ_허브솔트', '담하_(단체) 공깃밥', '담하_(단체) 생목살 김치전골 2.0', '담하_(단체) 은이버섯 갈비탕', '담하_(단체) 한우 우거지 국밥', '담하_(단체) 황태해장국 3/27까지', '담하_(정식) 된장찌개', '담하_(정식) 물냉면 ', '담하_(정식) 비빔냉면', '담하_(후식) 된장찌개', '담하_(후식) 물냉면', '담하_(후식) 비빔냉면', '담하_갑오징어 비빔밥', '담하_갱시기', '담하_공깃밥', '담하_꼬막 비빔밥', '담하_느린마을 막걸리', '담하_담하 한우 불고기', '담하_담하 한우 불고기 정식', '담하_더덕 한우 지짐', '담하_들깨 양지탕', '담하_라면사리', '담하_룸 이용료', '담하_메밀면 사리', '담하_명인안동소주', '담하_명태회 비빔냉면', '담하_문막 복분자 칵테일', '담하_봉평메밀 물냉면', '담하_생목살 김치찌개', '담하