In [12]:
import os
import sys
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
torch.set_float32_matmul_precision('high')

# Darts
from darts import TimeSeries
from darts.dataprocessing.transformers import Scaler
from darts.metrics import mape, rmse, smape
from darts.models import TFTModel
from darts.utils.likelihood_models import QuantileRegression
from darts.utils.timeseries_generation import datetime_attribute_timeseries
from darts.utils.timeseries_generation import holidays_timeseries

# ------------------------------------------------------------
# 0) Configuration
# ------------------------------------------------------------

# Input file paths
train_path = "DATA/train.csv"
test_path = "DATA/test.csv"

# Building number
building_num = 1

# Target column (original Korean header) and the raw / English column lists
target_col = "전력소비량(kWh)"
df_cols = ['건물번호', '일시', '기온(°C)', '강수량(mm)', '풍속(m/s)', '습도(%)', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)']
col_names = ["building_num", "datetime", "temp", "rain", "wind", "humidity", "sun_hr", "sun_mj", "power"]

# Data frequency and window sizes (in time steps of FREQ)
FREQ = "h"
INPUT_CHUNK = 168        # look-back window: past 7 days
OUTPUT_CHUNK = 24        # forecast 24 hours
USE_GPU = True

# Model parameters
QUANTILES = [0.1, 0.5, 0.9]
N_EPOCHS = 50
BATCH_SIZE = 256    # batch size is counted in windows
PRED_H = 168

# Quantile-regression likelihood built from QUANTILES
likelihood = QuantileRegression(quantiles=QUANTILES)


### 1. 기본 유틸함수

In [13]:
# ------------------------------------------------------------
# 1) 기본 유틸리티 함수
# ------------------------------------------------------------

def load_data(file_path: str):
    """Read a CSV file into a DataFrame.

    The first row is taken as the header and the first column becomes the
    index of the returned frame.
    """
    return pd.read_csv(file_path, index_col=0, header=0)


def split_by_building(df: pd.DataFrame, building_num: int):
    """Return the rows of ``df`` that belong to ``building_num``.

    The building identifier is looked up first as a column (Korean header,
    then English header) and finally as the index name.

    Raises:
        KeyError: if neither a column nor the index carries the building number.
    """
    candidates = ("건물번호", "building_num")

    # Column lookup, in the same priority order as the candidate tuple.
    for key in candidates:
        if key in df.columns:
            return df[df[key] == building_num]

    # Fall back to an index that is named like the building column.
    if df.index.name in candidates:
        return df[df.index == building_num]

    raise KeyError("건물번호 컬럼/인덱스를 찾을 수 없습니다.")


def eng_col_name(df: pd.DataFrame, col_names: dict = None):
    """Rename the Korean dataset headers to English column names.

    Args:
        df: frame whose columns should be renamed; it is not modified in place.
        col_names: optional ``{'old_name': 'english_name', ...}`` mapping that
            is merged over the built-in mapping (user entries win).

    Returns:
        A new DataFrame; columns without a mapping entry keep their name.
    """
    mapping = {
        '건물번호': 'building_num',
        '일시': 'datetime',
        '기온(°C)': 'temp',
        '강수량(mm)': 'rain',
        '풍속(m/s)': 'wind',
        '습도(%)': 'humidity',
        '일조(hr)': 'sun_hr',
        '일사(MJ/m2)': 'sun_mj',
        '전력소비량(kWh)': 'power'
    }
    # Caller-supplied entries override the defaults.
    if col_names is not None:
        mapping.update(col_names)

    # Only rename the columns that are actually present in this frame.
    present = {}
    for old_name, new_name in mapping.items():
        if old_name in df.columns:
            present[old_name] = new_name
    return df.rename(columns=present)


def plot_data(df: pd.DataFrame, target_col: str, building_num: int):
    """Plot one column of ``df`` on a 15x5 figure titled with the building number.

    The matplotlib font family is set to NanumGothic before plotting.
    """
    plt.rcParams['font.family'] = 'NanumGothic'
    plt.figure(figsize=(15, 5))
    column = df[target_col]
    chart_title = f"건물번호 {building_num} - {target_col}"
    column.plot(title=chart_title)
    plt.show()


In [14]:
# ------------------------------------------------------------
# 1.2 ) building_info.csv 정적 공변량 준비
# ------------------------------------------------------------
from sklearn.preprocessing import StandardScaler

def load_building_static(info_csv_path: str):
    """Build the standardized static-covariate table from the building info CSV.

    Steps: rename the Korean headers, coerce the capacity/area columns to
    numbers, derive ``cool_ratio`` and the ``has_pv`` / ``has_ess`` / ``has_pcs``
    flags, one-hot encode the building type, replace inf/NaN with 0, index by
    ``building_num`` and standardize every column.

    Returns:
        (bi_scaled, scaler_static): the float32 frame indexed by building
        number, and the fitted ``StandardScaler``.
    """
    renames = {
        '건물번호':'building_num',
        '건물유형':'bldg_type',
        '연면적(m2)':'gross_area',
        '냉방면적(m2)':'cool_area',
        '태양광용량(kW)':'pv_kw',
        'ESS저장용량(kWh)':'ess_kwh',
        'PCS용량(kW)':'pcs_kw'
    }
    info = pd.read_csv(info_csv_path).rename(columns=renames)

    # Non-numeric entries become NaN.
    numeric_cols = ['pv_kw','ess_kwh','pcs_kw','gross_area','cool_area']
    for name in numeric_cols:
        info[name] = pd.to_numeric(info[name], errors='coerce')

    # Derived features; a zero gross area would give +/-inf, mapped to NaN here.
    ratio = info['cool_area'] / info['gross_area']
    info['cool_ratio'] = ratio.replace([np.inf, -np.inf], np.nan)
    flag_sources = (('has_pv', 'pv_kw'), ('has_ess', 'ess_kwh'), ('has_pcs', 'pcs_kw'))
    for flag, source in flag_sources:
        info[flag] = (info[source].fillna(0) > 0).astype(int)

    # One-hot encode the building type as floats so every column is numeric.
    info = pd.get_dummies(info, columns=['bldg_type'], prefix='type', dtype=float)

    # Remaining inf/NaN values are filled with 0.
    info = info.replace([np.inf, -np.inf], np.nan).fillna(0.0)

    info = info.set_index('building_num').sort_index()

    # All columns (one-hot included) go through the same StandardScaler.
    scaler_static = StandardScaler()
    scaled_values = scaler_static.fit_transform(info.values)
    bi_scaled = pd.DataFrame(
        scaled_values, index=info.index, columns=info.columns
    ).astype(np.float32)

    return bi_scaled, scaler_static

# Load the standardized static covariates (and their fitted scaler) once for the notebook.
STATIC_DF, STATIC_SCALER = load_building_static("DATA/building_info.csv")

def attach_static(series, building_num: int):
    """Return ``series`` with the building's ``STATIC_DF`` row attached as static covariates."""
    # Double brackets keep a one-row DataFrame; the cast guarantees float32 columns.
    static_row = STATIC_DF.loc[[building_num]]
    return series.with_static_covariates(static_row.astype(np.float32))

# Debug: verify up front that every static-covariate column has a numeric dtype
assert all(np.issubdtype(dt, np.number) for dt in STATIC_DF.dtypes), "정적 공변량에 비수치 열 존재"

### 2. Darts 유틸리티 함수

In [15]:
# ------------------------------------------------------------
# 2) Darts 유틸리티 함수
# ------------------------------------------------------------
def to_timeseries(df: pd.DataFrame, value_cols: list[str], time_col: str, freq: str = "h", **kwargs) -> TimeSeries:
    """Convert ``df`` into a Darts ``TimeSeries``.

    The frame is sorted by ``time_col`` first; any extra keyword arguments are
    forwarded to ``TimeSeries.from_dataframe``.
    """
    ordered = df.sort_values(time_col)
    return TimeSeries.from_dataframe(
        ordered,
        time_col=time_col,
        value_cols=value_cols,
        freq=freq,
        **kwargs
    )


### 2.2 데이터 적용 

In [16]:
# Test Code: run the helpers end to end for a single building
df = load_data("DATA/train.csv")    # load the data
df_cols = df.columns.tolist(); print(df_cols) # inspect the columns
df_01 = split_by_building(df, 1) # extract the rows of building 1
df_01 = eng_col_name(df_01); print(df_01.columns.tolist()) # rename the columns to English

# print(df_01.head())
ts = to_timeseries(
    df = df_01, 
    value_cols = ['power'],
    time_col = "datetime", 
    freq = "h"
)
ts

['건물번호', '일시', '기온(°C)', '강수량(mm)', '풍속(m/s)', '습도(%)', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)']
['building_num', 'datetime', 'temp', 'rain', 'wind', 'humidity', 'sun_hr', 'sun_mj', 'power']


### 3. 공변량 추가

In [17]:
# ------------------------------------------------------------
# 3) 공변량 추가 
# - 타깃 시계열 : power
# - 과거 공변량 : temp, rain, wind, humidity, sun_hr, sun_mj
# - 캘린더 공변량 : hour, dayofweek, month, holidays
# - 
# ------------------------------------------------------------

def build_timeseries(df: pd.DataFrame) -> "tuple[TimeSeries, TimeSeries, TimeSeries]":
    """Build the target, past-covariate and calendar series for one building.

    Args:
        df: frame with English column names; must contain ``datetime``,
            ``power`` and the six weather columns used below.

    Returns:
        (series, past_cov, future_cov_calendar):
        - series: the ``power`` target at hourly frequency,
        - past_cov: the weather columns at hourly frequency,
        - future_cov_calendar: one-hot hour / day-of-week / month plus a
          Korean-holiday indicator, built on the target's time index.
    """
    # Work on a copy so the caller's frame is not modified; parse and sort by time.
    df = df.copy()
    df["datetime"] = pd.to_datetime(df["datetime"])
    df = df.sort_values("datetime")

    # Target series
    series = TimeSeries.from_dataframe(
        df, time_col="datetime", value_cols=["power"], freq="h"
    )

    # Past covariates (weather data)
    past_cov = TimeSeries.from_dataframe(
        df, time_col="datetime", value_cols=["temp", "rain", "wind", "humidity", "sun_hr", "sun_mj"], freq="h"
    )

    # Calendar covariates, all generated on the target's time index
    hour = datetime_attribute_timeseries(series.time_index, attribute="hour", one_hot=True)
    dow = datetime_attribute_timeseries(series.time_index, attribute="dayofweek", one_hot=True)
    month = datetime_attribute_timeseries(series.time_index, attribute="month", one_hot=True)
    # Public holidays (Korea)
    hol = holidays_timeseries(series.time_index, country_code="KR")
    future_cov_calendar = hour.stack(dow).stack(month).stack(hol)

    return series, past_cov, future_cov_calendar

# Build the three series for building 1 (df_01 comes from the test cell above).
series, past_cov, future_cov_calendar = build_timeseries(df_01)

# Sanity check: target, weather and calendar series must cover the same time span.
assert series.start_time() == past_cov.start_time() == future_cov_calendar.start_time()
assert series.end_time()   == past_cov.end_time()   == future_cov_calendar.end_time()

# Display the calendar component names.
future_cov_calendar.components

Index(['hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6',
       'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12',
       'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18',
       'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23', 'dayofweek_0',
       'dayofweek_1', 'dayofweek_2', 'dayofweek_3', 'dayofweek_4',
       'dayofweek_5', 'dayofweek_6', 'month_0', 'month_1', 'month_2',
       'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8',
       'month_9', 'month_10', 'month_11', 'holidays'],
      dtype='object')

### 3.3 건물번호 공변량 생성 및 부착

In [18]:
# -------------------------------
# Build the series/covariates for every building and attach static covariates
# -------------------------------
train_all = load_data(train_path)
train_all = eng_col_name(train_all)
# Sorted building numbers; the three lists below follow this order.
ALL_BUILDINGS = sorted(train_all["building_num"].unique().tolist())

series_list, past_list, future_list = [], [], []
for b in ALL_BUILDINGS:
    df_b = split_by_building(train_all, b)
    s, p, f = build_timeseries(df_b)  # existing helper (includes calendar/weather series)
    s = attach_static(s, b)           # attach the static covariates
    series_list.append(s)
    past_list.append(p)
    future_list.append(f)

### 4. 스케일링

In [19]:
# ------------------------------------------------------------
# 4) 스케일링
# ------------------------------------------------------------
from sklearn.preprocessing import StandardScaler
from darts.dataprocessing.transformers import Scaler

# Separate scalers for the target and for the weather covariates.
scaler_y = Scaler(scaler=StandardScaler())
scaler_cov = Scaler(scaler=StandardScaler())

# Both scalers are fitted on the full per-building lists.
# NOTE(review): with Darts' default settings each series in the list presumably
# gets its own fitted scaler, so later transform/inverse_transform calls should
# pass lists in this same order — confirm against the Darts Scaler docs.
series_sc_list = scaler_y.fit_transform(series_list)
past_sc_list   = scaler_cov.fit_transform(past_list)

In [20]:

# Inverse-transform check (1:1 matching over the whole list)
# 1) Inverse-transform the list as a whole
recovered_list = scaler_y.inverse_transform(series_sc_list)

# 2) Verify that every recovered series matches its original
for i, (orig, rec) in enumerate(zip(series_list, recovered_list)):
    # Time axis and component names must match.
    assert rec.time_index.equals(orig.time_index), f"[{i}] time_index 불일치"
    # Index.equals returns a single bool; `==` on a pandas Index is element-wise
    # and would be ambiguous inside `assert` for series with several components.
    assert rec.components.equals(orig.components), f"[{i}] components 불일치: {rec.components} vs {orig.components}"

    # Compare the values after masking NaNs in the original
    a = orig.values(copy=False)
    b = rec.values(copy=False)
    mask = ~np.isnan(a)
    # Tolerance is slightly relaxed
    assert np.allclose(a[mask], b[mask], rtol=1e-6, atol=1e-8), f"[{i}] 값 불일치"

print("스케일러 역변환 검증: OK")

# print(series)
# print(series_sc)

스케일러 역변환 검증: OK


### 5. 학습/예측 설정

In [21]:
# ------------------------------------------------------------
# 5) Training / prediction settings
# ------------------------------------------------------------
# The self-assignments below are no-ops: the values were defined in the
# configuration cell and are only restated here before being printed.
INPUT_CHUNK = INPUT_CHUNK   # defined earlier
OUTPUT_CHUNK = OUTPUT_CHUNK 
PRED_H = PRED_H
QUANTILES = QUANTILES

likelihood = likelihood

print(f"INPUT_CHUNK: {INPUT_CHUNK}")
print(f"OUTPUT_CHUNK: {OUTPUT_CHUNK}")
print(f"PRED_H: {PRED_H}")
print(f"QUANTILES: {QUANTILES}")
print(f"likelihood: {likelihood}")

INPUT_CHUNK: 168
OUTPUT_CHUNK: 24
PRED_H: 168
QUANTILES: [0.1, 0.5, 0.9]
likelihood: QuantileRegression(quantiles=[0.1, 0.5, 0.9], prior_strength=1.0)


### 6. 모델 정의

In [22]:
# ------------------------------------------------------------
# 6) 모델 정의 & 학습
# ------------------------------------------------------------
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

print(f"N_EPOCHS: {N_EPOCHS}")
print(f"BATCH_SIZE: {BATCH_SIZE}")

# Extra arguments forwarded to the PyTorch Lightning trainer.
pl_kwargs = {
    "callbacks": [
        # fit() in this notebook is called without a validation set, so
        # "val_loss" is never logged and an EarlyStopping callback monitoring
        # it would fail at the first check. Monitor the training loss instead.
        # Switch back to "val_loss" if a val_series is passed to fit().
        EarlyStopping(monitor="train_loss", mode="min", patience=5),
    ],
    "precision": "64-true",
    # Gradients are accumulated over 4 batches:
    # effective batch = BATCH_SIZE × 4 (1024 with BATCH_SIZE=256).
    "accumulate_grad_batches": 4,
}

model = TFTModel(
    input_chunk_length=INPUT_CHUNK,     # 168 recommended
    output_chunk_length=OUTPUT_CHUNK,   # 24; longer horizons (n=168) are predicted auto-regressively
    hidden_size=64,
    lstm_layers=2,
    dropout=0.1,
    num_attention_heads=4,
    add_relative_index=True,
    likelihood=likelihood,
    batch_size=BATCH_SIZE,
    n_epochs=N_EPOCHS,
    random_state=42,
    pl_trainer_kwargs=pl_kwargs,
)

N_EPOCHS: 50
BATCH_SIZE: 256


### 7. 학습 

In [None]:
# ------------------------------------------------------------
# 7) 학습
# ------------------------------------------------------------
# Quick-run override. The model was already constructed with the earlier
# N_EPOCHS value, so re-binding the name alone does not change how long it
# trains; the value is therefore passed to fit() explicitly through `epochs`.
N_EPOCHS = 1

model.fit(
    series=series_sc_list,
    past_covariates=past_sc_list,
    future_covariates=future_list,
    epochs=N_EPOCHS,
    verbose=True,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name                              | Type                             | Params | Mode 
------------------------------------------------------------------------------------------------
0  | train_metrics                     | MetricCollection                 | 0      | train
1  | val_metrics                       | MetricCollection                 | 0      | train
2  | input_embeddings                  | _MultiEmbedding                  | 0      | train
3  | static_covariates_vsn             | _VariableSelectionNetwork        | 35.1 K | train
4  | encoder_vsn                       | _VariableSelectionNetwork        | 118 K  | train
5  | decoder_vsn                       | _VariableSelectionNetwork        | 99.0 K | train
6  | static_context_grn                | _GatedResidualNetwork            | 16.8 K | train
7  | static_cont

Epoch 0:  19%|█▉        | 139/723 [09:26<39:42,  0.25it/s, train_loss=0.634]

### 8.1 예측 준비 

In [None]:
# ------------------------------------------------------------
# 8.1) 예측 준비
# - 전제: 아래 객체들이 이미 준비되어 있어야 함
#   model, series_sc_list, past_sc_list, future_list, scaler_y, scaler_cov
#   ALL_BUILDINGS (정렬된 건물번호 리스트), FREQ, PRED_H, test_path
# ------------------------------------------------------------
from darts.utils.timeseries_generation import datetime_attribute_timeseries, holidays_timeseries

def predict_all_buildings(
    model,
    series_sc_list,
    past_cov_sc_list,
    future_calendar_list,
    scaler_y,
    scaler_cov,
    test_path: str,
    all_buildings: list[int],
    pred_h: int = 168,
    freq: str = "h",
    num_samples: int = 300,
):
    """Forecast ``pred_h`` steps for every building and return p10/p50/p90.

    Args:
        model: fitted forecasting model exposing ``predict``.
        series_sc_list: scaled target series, one per building.
        past_cov_sc_list: scaled weather covariates over the training period.
        future_calendar_list: calendar covariates over the training period.
        scaler_y: scaler fitted on the list of target series.
        scaler_cov: scaler fitted on the list of weather series.
        test_path: CSV with the weather values of the forecast period.
        all_buildings: building numbers, in the same order as the lists above
            (and as the lists the scalers were fitted on).
        pred_h: number of steps to forecast.
        freq: frequency of the series.
        num_samples: number of samples drawn per forecast.

    Returns:
        DataFrame with columns building_num, datetime, p10, p50, p90.

    Raises:
        ValueError: if the per-building lists do not all have the same length
            as ``all_buildings``.
    """
    n_buildings = len(all_buildings)
    if not (len(series_sc_list) == len(past_cov_sc_list) == len(future_calendar_list) == n_buildings):
        raise ValueError(
            "series_sc_list, past_cov_sc_list, future_calendar_list and "
            "all_buildings must have the same length"
        )

    # 1) Load test.csv and normalise its column names
    test_df = load_data(test_path)
    test_df = eng_col_name(test_df)
    # Sunshine columns that are absent from the test file are filled with 0.
    for col in ("sun_hr", "sun_mj"):
        if col not in test_df.columns:
            test_df[col] = 0

    # 2) Weather covariates for the forecast period.
    #    They are transformed as ONE list so that the i-th series is scaled
    #    with the i-th fitted scaler. Transforming a single series with a
    #    scaler that was fitted on a list would apply the first building's
    #    parameters to every building.
    weather_cols = ["temp", "rain", "wind", "humidity", "sun_hr", "sun_mj"]
    future_weather_raw = [
        to_timeseries(
            df=split_by_building(test_df, b),
            value_cols=weather_cols,
            time_col="datetime",
            freq=freq,
        )
        for b in all_buildings
    ]
    future_weather_sc = scaler_cov.transform(future_weather_raw)
    past_cov_full_sc_list = [
        history.append(future)
        for history, future in zip(past_cov_sc_list, future_weather_sc)
    ]

    # 3) Calendar covariates: extend each training calendar by pred_h steps
    calendar_full_list = []
    for series_sc, calendar_hist in zip(series_sc_list, future_calendar_list):
        last_time = series_sc.end_time()
        future_index = pd.date_range(start=last_time + pd.Timedelta(hours=1), periods=pred_h, freq=freq)
        hour_f = datetime_attribute_timeseries(future_index, attribute="hour", one_hot=True)
        dow_f  = datetime_attribute_timeseries(future_index, attribute="dayofweek", one_hot=True)
        mon_f  = datetime_attribute_timeseries(future_index, attribute="month", one_hot=True)
        hol_f  = holidays_timeseries(future_index, country_code="KR")
        future_calendar = hour_f.stack(dow_f).stack(mon_f).stack(hol_f)
        calendar_full_list.append(calendar_hist.append(future_calendar))

    # 4) Run the prediction on the whole list
    forecast_sc_list = model.predict(
        n=pred_h,
        series=series_sc_list,
        past_covariates=past_cov_full_sc_list,
        future_covariates=calendar_full_list,
        num_samples=num_samples,
        show_warnings=False,
    )

    # 5) Inverse-transform the whole list (same per-building scaler matching
    #    as in step 2), then collect p10/p50/p90 per building
    forecast_list = scaler_y.inverse_transform(forecast_sc_list)

    pred_rows = []
    for b, f in zip(all_buildings, forecast_list):
        pred_rows.append(pd.DataFrame({
            "building_num": b,
            "datetime": f.time_index,  # pandas DatetimeIndex
            "p10": f.quantile(0.1).values().flatten(),
            "p50": f.quantile(0.5).values().flatten(),
            "p90": f.quantile(0.9).values().flatten(),
        }))

    df_pred = pd.concat(pred_rows, ignore_index=True)
    return df_pred



In [None]:
# # ------------------------------------------------------------
# # 8.1) 예측 준비 - test.csv 의 날씨 데이터 활용 
# # ------------------------------------------------------------
# # test.csv 데이터 로드 및 전처리
# test_df = load_data(test_path)
# test_df_01 = split_by_building(test_df, building_num)
# test_df_01 = eng_col_name(test_df_01)

# # 일조, 일사 데이터 추가(=0)
# if "sun_hr" not in test_df_01.columns:
#     test_df_01["sun_hr"] = 0
# if "sun_mj" not in test_df_01.columns:
#     test_df_01["sun_mj"] = 0

# # 예측할 데이터에 대한 미래 공변량 추가
# future_past_cov_raw = to_timeseries(
#     df=test_df_01,
#     value_cols=["temp", "rain", "wind", "humidity", "sun_hr", "sun_mj"],
#     time_col="datetime",
#     freq="h"
# )

# # 스케일링
# future_past_cov_sc = scaler_cov.transform(future_past_cov_raw)

# # 과거 공변량 + 미래 공변량
# past_cov_full_sc = past_cov_sc.append(future_past_cov_sc)

# print("학습 과거 공변량 종료 시점:", past_cov_sc.end_time())
# print("미래 과거 공변량 시작 시점:", future_past_cov_sc.start_time())
# print("전체 과거 공변량 종료 시점:", past_cov_full_sc.end_time())

In [None]:
# # ------------------------------------------------------------ 
# # 8.2) 예측 준비 - 캘린더 공변량 생성 
# # - 마지막 시점 이후 168시간의 future_covariates 생성
# # - 캘린더 공변량을 예측 창까지 확장
# # ------------------------------------------------------------
# last_time = series.end_time()   # 마지막 시점
# future_index = pd.date_range(start=last_time + pd.Timedelta(hours=1), periods=PRED_H, freq=FREQ)   # 미래 인덱스

# # 캘린더 공변량 생성 (미래)
# hour_f = datetime_attribute_timeseries(future_index, attribute="hour", one_hot=True)
# dow_f = datetime_attribute_timeseries(future_index, attribute="dayofweek", one_hot=True)
# month_f = datetime_attribute_timeseries(future_index, attribute="month", one_hot=True)
# hol_f = holidays_timeseries(future_index, country_code="KR")
# future_calendar = hour_f.stack(dow_f).stack(month_f).stack(hol_f)

# # # 기존 calendar 스케일러로 변환  >> Scaler 불필요
# # future_calendar_sc = scaler_cov.transform(future_calendar)

# # 전체 예측구간을 커버하도록 future_coveriates 확장
# calendar_full = future_cov_calendar.append(future_calendar)

### 8.3 예측

In [None]:
# ------------------------------------------------------------
# 8.3) Prediction
# ------------------------------------------------------------
# Forecast PRED_H steps for every building in ALL_BUILDINGS; the result holds
# building_num, datetime and the p10/p50/p90 columns.
df_pred = predict_all_buildings(
    model=model,
    series_sc_list=series_sc_list,
    past_cov_sc_list=past_sc_list,
    future_calendar_list=future_list,
    scaler_y=scaler_y,
    scaler_cov=scaler_cov,
    test_path="DATA/test.csv",
    all_buildings=ALL_BUILDINGS,
    pred_h=PRED_H,
    freq=FREQ,
    num_samples=300,
)

In [None]:
# # ------------------------------------------------------------
# # 8.3) 예측
# # ------------------------------------------------------------
# forecast_sc = model.predict(
#     n=PRED_H,
#     series=series_sc,
#     past_covariates=past_cov_full_sc,   # 과거 공변량 + 미래 공변량
#     future_covariates=calendar_full,   # 예측 창 전체 커버
#     num_samples=300,
# )

# # 예측 직후/역변환 직후 범위 점검
# print("forecast_sc stats:", float(forecast_sc.mean().values()[0,0]),
#       float(forecast_sc.std().values()[0,0]))
# forecast = scaler_y.inverse_transform(forecast_sc)
# print("forecast stats:", float(forecast.mean().values()[0,0]),
#       float(forecast.std().values()[0,0]))


# # 결과 요약
# p10 = forecast.quantile(0.1)
# p50 = forecast.quantile(0.5)
# p90 = forecast.quantile(0.9)

# # print(f"p10: {p10}")
# # print(f"p50: {p50}")
# # print(f"p90: {p90}")

# print(f"예측 시작: {forecast.start_time()}, 종료: {forecast.end_time()}")
# print(f"P10 첫/마지막: {float(p10.values()[0,0])}, {float(p10.values()[-1,0])}")
# print(f"P50 첫/마지막: {float(p50.values()[0,0])}, {float(p50.values()[-1,0])}")
# print(f"P90 첫/마지막: {float(p90.values()[0,0])}, {float(p90.values()[-1,0])}")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 1/1 [00:08<00:00,  0.12it/s]
forecast_sc stats: 683.4168111605328 3.737133733433828
forecast stats: 677268.2156519562 3674.308816620659
예측 시작: 2024-08-25 00:00:00, 종료: 2024-08-31 23:00:00
P10 첫/마지막: 674157.957663396, 674059.809665997
P50 첫/마지막: 675023.7016922327, 675026.6177229614
P90 첫/마지막: 684301.9772741183, 683673.5433544621


### 9. 데이터프레임 변환

In [None]:
# # 필요 시 DataFrame으로 저장
# df_forecast = pd.DataFrame({
#     "datetime": forecast.time_index,
#     "p10": p10.values().flatten(),
#     "p50": p50.values().flatten(),
#     "p90": p90.values().flatten(),
# })
# print(df_forecast.head())
# print(df_forecast.tail())

             datetime           p10           p50           p90
0 2024-08-25 00:00:00  4.237451e+06  4.243156e+06  4.299706e+06
1 2024-08-25 01:00:00  4.237296e+06  4.243155e+06  4.297500e+06
2 2024-08-25 02:00:00  4.237528e+06  4.243158e+06  4.300864e+06
3 2024-08-25 03:00:00  4.238187e+06  4.243156e+06  4.301973e+06
4 2024-08-25 04:00:00  4.237985e+06  4.243158e+06  4.298452e+06
               datetime           p10           p50           p90
163 2024-08-31 19:00:00  4.237823e+06  4.243171e+06  4.301968e+06
164 2024-08-31 20:00:00  4.237823e+06  4.243171e+06  4.301969e+06
165 2024-08-31 21:00:00  4.237568e+06  4.243174e+06  4.301968e+06
166 2024-08-31 22:00:00  4.237469e+06  4.243173e+06  4.301516e+06
167 2024-08-31 23:00:00  4.237663e+06  4.243174e+06  4.301968e+06


### 10. 실제 값과 비교

In [None]:
# ------------------------------------------------------------
# 10) 실제 값과 비교
# ------------------------------------------------------------
def smape_numpy(y_true, y_pred, eps=1e-8):
    """Return the symmetric MAPE (in percent, 0-200) between two array-likes.

    ``eps`` is added to the denominator so that pairs where both values are
    zero do not divide by zero.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    abs_error = np.abs(forecast - actual)
    scale = np.abs(actual) + np.abs(forecast) + eps
    return 200.0 * np.mean(abs_error / scale)

def evaluate_vs_highest(sub_df: pd.DataFrame, highest_path: str, per_building: bool = True):
    """Print and return the SMAPE of a submission against a reference submission.

    Args:
        sub_df: submission frame with ``num_date_time`` and ``answer`` columns.
        highest_path: CSV of the reference submission (same two columns).
        per_building: when True, also print per-building scores (first and
            last five buildings).

    Returns:
        The overall SMAPE over the rows present in both frames.
    """
    reference = pd.read_csv(highest_path)  # num_date_time, answer
    m = sub_df.merge(reference, on="num_date_time", how="inner", suffixes=("_pred", "_ref"))
    overall = smape_numpy(m["answer_ref"].values, m["answer_pred"].values)
    print(f"SMAPE(전체): {overall:.4f}")

    if not per_building:
        return overall

    # The building number is the part of the key before the first underscore.
    key_parts = m["num_date_time"].str.split("_", n=1, expand=True)
    m["building_num"] = key_parts[0].astype(int)
    stats = sorted(
        (
            (b, smape_numpy(g["answer_ref"].values, g["answer_pred"].values))
            for b, g in m.groupby("building_num")
        ),
        key=lambda pair: pair[0],
    )
    # Only show a handful of entries from each end
    print("건물별 SMAPE (일부):", stats[:5], "...", stats[-5:])
    return overall

In [None]:
# # ------------------------------------------------------------
# # 9) 실제 값과 비교
# # ------------------------------------------------------------
# highest_result = pd.read_csv("DATA/highest_submission.csv")
# # highest_result.head()
# y_true = highest_result["answer"][:168]

# # 수정 후 코드
# def evaluation(df_forecast, y_true):
#     if not isinstance(y_true, TimeSeries):
#         y_true = TimeSeries.from_series(y_true)
#     # Darts의 smape 함수를 명시적으로 불러옵니다 (가장 확실한 방법)
#     from darts.metrics import smape as smape_metric
    
#     for p in [0.1, 0.5, 0.9]:
#         y_pred = df_forecast[f'p{int(p*100)}']        
#         if not isinstance(y_pred, TimeSeries):
#             y_pred = TimeSeries.from_series(y_pred)
            
#         # [수정] 변수 이름을 함수 이름과 다르게 변경합니다.
#         smape_score = smape_metric(y_true, y_pred) 
#         print(f"{p} SMAPE: {smape_score:.4f}")

# # 함수 호출
# evaluation(df_forecast, y_true)

0.1 SMAPE: 199.4920
0.5 SMAPE: 199.4926
0.9 SMAPE: 199.4994


### 11. 제출 파일 생성

In [None]:
# ------------------------------------------------------------
# 11) 제출 파일 생성: sample_submission.csv의 순서/키를 그대로 따름
# - df_pred: predict_all_buildings()가 반환한 DataFrame
# - sample_path: "DATA/sample_submission.csv"
# - out_path: 저장 경로 (예: "output/my_submission.csv")
# ------------------------------------------------------------
def build_submission(df_pred: pd.DataFrame, sample_path: str, out_path: str, which: str = "p50"):
    """Write a submission CSV that follows the row order/keys of the sample file.

    Args:
        df_pred: predictions with ``building_num``, ``datetime`` and the
            quantile column named by ``which``.
        sample_path: path of the sample submission; its ``num_date_time`` keys
            look like ``"{building}_{YYYYMMDD HH}"`` (e.g. ``"1_20240825 00"``).
        out_path: destination CSV; missing parent directories are created.
        which: name of the prediction column written out as ``answer``.

    Returns:
        The saved frame with columns ``num_date_time`` and ``answer``.
    """
    sub = pd.read_csv(sample_path)
    # If the sample already carries an "answer" column (the usual template
    # layout), drop it: otherwise the merge below would yield
    # answer_x / answer_y and no "answer" column at all.
    sub = sub.drop(columns=["answer"], errors="ignore")

    # Split the key into building number and timestamp.
    parts = sub["num_date_time"].str.split("_", n=1, expand=True)
    sub["building_num"] = parts[0].astype(int)
    sub["datetime"] = pd.to_datetime(parts[1], format="%Y%m%d %H")

    # Left-merge keeps every sample row, in the sample's order.
    key_cols = ["building_num", "datetime"]
    pred_sel = df_pred[key_cols + [which]].rename(columns={which: "answer"})
    merged = sub.merge(pred_sel, on=key_cols, how="left")

    # Rows without a prediction are filled with 0 (with a warning).
    miss = merged["answer"].isna().sum()
    if miss > 0:
        print(f"[경고] 예측 누락 {miss}건을 0으로 대체합니다.")
        merged["answer"] = merged["answer"].fillna(0.0)

    # Keep the submission format and make sure the target directory exists.
    out_df = merged[["num_date_time", "answer"]].copy()
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    out_df.to_csv(out_path, index=False)
    print(f"저장 완료: {out_path} (rows={len(out_df)})")
    return out_df

In [None]:
# Build the submission file from the median (p50) forecast.
submission_df = build_submission(
    df_pred=df_pred,
    sample_path="DATA/sample_submission.csv",
    out_path="output/all_buildings_submission.csv",
    which="p50",
)

# 3) Compare against the best previous submission (SMAPE)
_ = evaluate_vs_highest(submission_df, highest_path="DATA/highest_submission.csv", per_building=True)

# 🧪🧪🧪🧪🧪🧪🧪🧪🧪🧪🧪🧪🧪🧪