In [4]:
import pandas as pd

# 파일 경로
train_path = r"C:\Users\user\Downloads\open (1)\train.csv"
building_info_path = r"C:\Users\user\Downloads\open (1)\building_info.csv"

# CSV 불러오기
train_df = pd.read_csv(train_path)
building_info_df = pd.read_csv(building_info_path)

# 병합 (건물번호 기준)
merged_df = pd.merge(train_df, building_info_df, on='건물번호', how='left')

# 결과 저장
merged_df.to_csv(r"C:\Users\user\Downloads\open (1)\merged_train.csv", index=False, encoding = 'cp949')

print("병합 완료! merged_train.csv로 저장됨")


병합 완료! merged_train.csv로 저장됨


In [5]:
def read_csv_smart(path):
    import pandas as pd
    for enc in ['cp949', 'utf-8-sig', 'utf-8', 'euc-kr']:
        try:
            return pd.read_csv(path, encoding=enc)
        except UnicodeDecodeError:
            continue
    # 최후의 보루: 깨지는 글자는 � 로 대체
    return pd.read_csv(path, encoding='utf-8', errors='replace')

In [6]:
# CSV 불러오기 (예시)
df = read_csv_smart("C:\\Users\\user\\Downloads\\open (1)\\merged_train.csv")


# '일시'를 문자열로 변환 후 날짜와 시간 분리
df['일시'] = df['일시'].astype(str)

# 날짜(YYYYMMDD)와 시간(HH) 분리
df['날짜'] = df['일시'].str.slice(0, 8)     # 앞 8자리 → 날짜
df['시간'] = df['일시'].str.slice(9, 11)    # 9~10번째 자리 → 시간

# 날짜를 datetime 형식으로 변환
df['날짜'] = pd.to_datetime(df['날짜'], format='%Y%m%d')
df['시간'] = df['시간'].astype(int)

# 확인
print(df[['일시', '날짜', '시간']].head())



            일시         날짜  시간
0  20240601 00 2024-06-01   0
1  20240601 01 2024-06-01   1
2  20240601 02 2024-06-01   2
3  20240601 03 2024-06-01   3
4  20240601 04 2024-06-01   4


In [7]:
df

Unnamed: 0,num_date_time,건물번호,일시,기온(°C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh),건물유형,연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW),날짜,시간
0,1_20240601 00,1,20240601 00,18.3,0.0,2.6,82.0,0.0,0.00,5794.80,호텔,82912.71,77586.0,-,-,-,2024-06-01,0
1,1_20240601 01,1,20240601 01,18.3,0.0,2.7,82.0,0.0,0.00,5591.85,호텔,82912.71,77586.0,-,-,-,2024-06-01,1
2,1_20240601 02,1,20240601 02,18.1,0.0,2.6,80.0,0.0,0.00,5338.17,호텔,82912.71,77586.0,-,-,-,2024-06-01,2
3,1_20240601 03,1,20240601 03,18.0,0.0,2.6,81.0,0.0,0.00,4554.42,호텔,82912.71,77586.0,-,-,-,2024-06-01,3
4,1_20240601 04,1,20240601 04,17.8,0.0,1.3,81.0,0.0,0.00,3602.25,호텔,82912.71,77586.0,-,-,-,2024-06-01,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100_20240824 19,100,20240824 19,29.1,0.0,4.4,76.0,0.4,0.18,3276.00,호텔,162070.24,152943.0,-,-,-,2024-08-24,19
203996,100_20240824 20,100,20240824 20,28.6,0.0,3.7,74.0,0.0,0.00,3197.52,호텔,162070.24,152943.0,-,-,-,2024-08-24,20
203997,100_20240824 21,100,20240824 21,28.3,0.0,2.9,74.0,0.0,0.00,3006.60,호텔,162070.24,152943.0,-,-,-,2024-08-24,21
203998,100_20240824 22,100,20240824 22,28.0,0.0,1.7,76.0,0.0,0.00,2649.72,호텔,162070.24,152943.0,-,-,-,2024-08-24,22


In [8]:
# 결측치 대체할 컬럼 목록
cols = ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']

# '-'를 0으로 바꾸고 숫자형으로 변환
for col in cols:
    df[col] = df[col].replace('-', 0).astype(float)

In [9]:
df = df.drop(columns=['num_date_time', '일시'])

In [10]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['건물유형'] = le.fit_transform(df['건물유형'])
df['날짜'] = pd.to_datetime(df['날짜'])

In [11]:
import pandas as pd
import numpy as np

def make_features(df: pd.DataFrame) -> pd.DataFrame:
    # 0) 기본 정렬 & datetime 만들기
    #    (이미 df['날짜']와 df['시간']이 있다면 그대로 쓰되, 한 줄짜리 datetime을 만들어두면 편함)
    df = df.copy()
    df['날짜'] = pd.to_datetime(df['날짜'])
    df['dt'] = df['날짜'] + pd.to_timedelta(df['시간'], unit='h')
    df = df.sort_values(['건물번호', 'dt']).reset_index(drop=True)

    # -------------------------------------------------------
    # 1) 최근 24시간 평균, 최근 7일(같은 시각) 평균  → 전부 "과거만" 보도록 shift 사용
    # -------------------------------------------------------
    grp = df.groupby('건물번호', group_keys=False)

    # (a) 최근 24시간 평균 (전력소비량 기준)
    #  - window=24, past-only를 위해 shift(1) 후 rolling
    df['cons_lag1'] = grp['전력소비량(kWh)'].shift(1)
    df['cons_mean_24h'] = grp['cons_lag1'].rolling(window=24, min_periods=1).mean()

    # (b) 최근 7일 같은 시각 평균 (24시간 간격으로 7개)
    #  - 1일 전 같은 시각부터 7일 전 같은 시각까지 평균
    same_hour_lag = grp['전력소비량(kWh)'].shift(24)
    df['cons_samehour_mean_7d'] = same_hour_lag.rolling(window=7, min_periods=1).mean()

    # 참고로 모델에 바로 쓰진 않아도 되는 추가 라그들(원하면 활성화)
    df['cons_lag_24h'] = grp['전력소비량(kWh)'].shift(24)
    df['cons_lag_48h'] = grp['전력소비량(kWh)'].shift(48)
    df['cons_lag_72h'] = grp['전력소비량(kWh)'].shift(72)
    df['cons_lag_168h'] = grp['전력소비량(kWh)'].shift(168)  # 7일 전 같은 시각

    # -------------------------------------------------------
    # 2) 기온·일사 기반 냉방 수요 지표 (CDD류)
    # -------------------------------------------------------
    # 한국 여름 기준 base temp 24°C 가합리(필요시 23~26으로 튜닝)
    base_temp = 24.0
    # ‘냉방도수’(Cooling Degree) 시간 단위
    df['CDD'] = (df['기온(°C)'] - base_temp).clip(lower=0)

    # 일사량(MJ/m2)과의 상호작용: 햇볕이 강할수록 체감 부하↑
    # 일사량이 0~상위 99퍼센타일 사이로 정규화(robust)
    q99 = df['일사(MJ/m2)'].quantile(0.99)
    rad_norm = (df['일사(MJ/m2)'].clip(upper=q99) / (q99 + 1e-6))
    df['CDD_x_rad'] = df['CDD'] * rad_norm

    # 습도(%)와의 상호작용: 습도가 높으면 동일 온도에서도 냉방 부하↑
    # 간단히 (1 + 습도/100*알파) 가중. 알파=0.3 정도로 시작(튜닝 가능)
    alpha = 0.3
    df['CDD_humid_adj'] = df['CDD'] * (1 + alpha * (df['습도(%)'] / 100.0))

    # -------------------------------------------------------
    # 3) 주말/평일, 공휴일
    # -------------------------------------------------------
    df['weekday'] = df['dt'].dt.weekday  # 월=0 ... 일=6
    df['is_weekend'] = (df['weekday'] >= 5).astype(int)

    # 2024-06~08 사이 한국 공휴일: 현충일(6/6), 광복절(8/15)
    kr_holidays = {
        pd.Timestamp(2024, 6, 6),  # 현충일
        pd.Timestamp(2024, 8, 15), # 광복절
    }
    df['is_holiday'] = df['날짜'].isin(kr_holidays).astype(int)

    # -------------------------------------------------------
    # 4) 태양광·ESS·PCS 용량 대비 “동작 가능성” 지표
    #    (실제 제어 로그가 없으니 ‘가능성/잠재력’을 피처로 넣는다)
    # -------------------------------------------------------
    # 설비 유무 이진
    df['has_pv'] = (df['태양광용량(kW)'] > 0).astype(int)
    df['has_ess'] = (df['ESS저장용량(kWh)'] > 0).astype(int)
    df['has_pcs'] = (df['PCS용량(kW)'] > 0).astype(int)

    # 낮/밤 플래그 (대략 일사량>0이면 주간으로 간주)
    df['is_daylight'] = (df['일사(MJ/m2)'] > 0).astype(int)

    # 태양광 ‘동작 가능성’ (설비 있고 + 주간/일사>0)
    df['pv_active_potential'] = ((df['has_pv'] == 1) & (df['is_daylight'] == 1)).astype(int)

    # 피크/오프피크 (현실 요금제와 다를 수 있지만 합리적 초기값)
    df['is_offpeak'] = df['시간'].isin([0,1,2,3,4,5,6,23]).astype(int)
    df['is_peak']    = df['시간'].isin([13,14,15,16,17]).astype(int)

    # ESS 충방전 ‘가능성’ 피처
    df['ess_charge_potential']   = ((df['has_ess']==1) & (df['is_offpeak']==1)).astype(int)
    df['ess_discharge_potential'] = ((df['has_ess']==1) & (df['is_peak']==1)).astype(int)

    # 용량 스케일 자체도 피처로 사용(로그 스케일로 완만화; 0은 0으로 유지)
    for c in ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']:
        df[f'log1p_{c}'] = np.log1p(df[c])

    # 누설 방지: 타깃 기반 비율은 과거 라그로만 계산
    # ESS 대비 부하 비율(전일 같은 시각 소비량 사용)
    df['ess_to_load_lag_ratio'] = np.where(
        df['cons_lag_24h'].notna(),
        df['ESS저장용량(kWh)'] / (df['cons_lag_24h'] + 1e-6),
        np.nan
    )

    # -------------------------------------------------------
    # 5) 기타 유틸리티 파생
    # -------------------------------------------------------
    df['month'] = df['dt'].dt.month
    df['hour']  = df['시간']  # 가독성용 복사
    df['dayofyear'] = df['dt'].dt.dayofyear

    # 모델 입력 전에 의미 없는 원본(또는 중복) 컬럼 정리 원하면 아래 사용
    # drop_cols = ['dt']  # 학습 시 굳이 안 써도 되면 제거
    # df = df.drop(columns=drop_cols)

    return df

# 사용 예시:
# df_feat = make_features(df)
# df_feat.head()


In [12]:
# =========================
# 0. 라이브러리 & 경로 설정
# =========================
import os
import numpy as np
import pandas as pd

# pip install lightgbm 먼저 (처음 1번만)
# pip install lightgbm
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error

DATA_DIR = r"C:\Users\user\Downloads\open (1)"
TRAIN_MERGED_PATH = os.path.join(DATA_DIR, "merged_train.csv")
TEST_PATH  = os.path.join(DATA_DIR, "test.csv")
BUILD_PATH = os.path.join(DATA_DIR, "building_info.csv")
SAMPLE_SUB = os.path.join(DATA_DIR, "sample_submission.csv")
OUT_SUB    = os.path.join(DATA_DIR, "baseline_lgbm_submission.csv")


In [16]:
# ===== IMPORTS (필요시 중복 있어도 무방) =====
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
from pandas.api.types import is_numeric_dtype, is_categorical_dtype

# ===== 피처 선택 헬퍼: 숫자/카테고리만 남기기 + 불필요 컬럼 드롭 =====
def get_feature_cols(df: pd.DataFrame) -> list:
    base_drop = ['전력소비량(kWh)', 'dt', '날짜', '시간', '일시', 'num_date_time']
    cols = [c for c in df.columns if c not in base_drop]
    cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]
    return cols

# =========================
# 1. 유틸 함수들
# =========================
def ensure_datetime_cols(df: pd.DataFrame) -> pd.DataFrame:
    """일시 → 날짜/시간 분리(또는 이미 분리돼 있으면 그대로) + dt 생성"""
    df = df.copy()
    if '날짜' not in df.columns or '시간' not in df.columns:
        if '일시' in df.columns:
            s = df['일시'].astype(str)
            df['날짜'] = pd.to_datetime(s.str.slice(0, 8), format='%Y%m%d')
            df['시간'] = s.str.slice(9, 11).astype(int)
        else:
            raise ValueError("날짜/시간 정보가 없습니다. ('일시' 또는 '날짜','시간' 필요)")
    else:
        df['날짜'] = pd.to_datetime(df['날짜'])
        df['시간'] = df['시간'].astype(int)
    df['dt'] = df['날짜'] + pd.to_timedelta(df['시간'], unit='h')
    return df

def clean_capacity_fields(df: pd.DataFrame) -> pd.DataFrame:
    """설비 용량에 '-'가 있으면 0으로 치환 후 float 변환"""
    df = df.copy()
    cols = ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']
    for c in cols:
        if c in df.columns:
            df[c] = df[c].replace('-', 0).astype(float)
    return df

def rmse(y_true, y_pred):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.sqrt(mean_squared_error(y_true, y_pred))

# =========================
# 2. 특징 엔지니어링(수정 버전)
# =========================
def make_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    - 미래정보 누설 방지: 전부 shift/rolling(=transform)로 과거만 사용
    - groupby().rolling() 대신 groupby().transform(...) 사용
    """
    df = df.copy()

    # 일사/일조 안전 처리(테스트에 없을 수 있음)
    if '일사(MJ/m2)' not in df.columns:
        df['일사(MJ/m2)'] = 0.0
    if '일조(hr)' not in df.columns:
        df['일조(hr)'] = 0.0

    # 정렬
    df = df.sort_values(['건물번호', 'dt']).reset_index(drop=True)

    # 그룹 객체
    grp = df.groupby('건물번호', sort=False)

    # ---------- 타깃 라그 ----------
    if '전력소비량(kWh)' in df.columns:
        df['cons_lag1']     = grp['전력소비량(kWh)'].shift(1)
        df['cons_lag_24h']  = grp['전력소비량(kWh)'].shift(24)
        df['cons_lag_48h']  = grp['전력소비량(kWh)'].shift(48)
        df['cons_lag_72h']  = grp['전력소비량(kWh)'].shift(72)
        df['cons_lag_168h'] = grp['전력소비량(kWh)'].shift(168)  # 7일 전
        # 최근 24시간 평균 (과거만 참고)
        df['cons_mean_24h'] = grp['전력소비량(kWh)'] \
            .transform(lambda s: s.shift(1).rolling(window=24, min_periods=1).mean())
        # 최근 7일 같은 시각 평균
        df['cons_samehour_mean_7d'] = grp['전력소비량(kWh)'] \
            .transform(lambda s: s.shift(24).rolling(window=7, min_periods=1).mean())
    else:
        for c in ['cons_lag1','cons_lag_24h','cons_lag_48h','cons_lag_72h','cons_lag_168h',
                  'cons_mean_24h','cons_samehour_mean_7d']:
            df[c] = np.nan

    # ---------- 냉방 수요 지표 ----------
    base_temp = 24.0
    df['CDD'] = (df['기온(°C)'] - base_temp).clip(lower=0)

    q99 = df['일사(MJ/m2)'].quantile(0.99) + 1e-6
    rad_norm = (df['일사(MJ/m2)'].clip(upper=q99) / q99)
    df['CDD_x_rad'] = df['CDD'] * rad_norm

    alpha = 0.3
    df['CDD_humid_adj'] = df['CDD'] * (1 + alpha * (df['습도(%)'] / 100.0))

    # ---------- 달력 피처 ----------
    df['weekday']    = df['dt'].dt.weekday
    df['is_weekend'] = (df['weekday'] >= 5).astype(int)
    kr_holidays = {pd.Timestamp(2024,6,6), pd.Timestamp(2024,8,15)}
    df['is_holiday'] = df['날짜'].isin(kr_holidays).astype(int)

    # ---------- 설비 가능성 피처 ----------
    df['has_pv']  = (df['태양광용량(kW)']  > 0).astype(int)
    df['has_ess'] = (df['ESS저장용량(kWh)'] > 0).astype(int)
    df['has_pcs'] = (df['PCS용량(kW)']    > 0).astype(int)
    df['is_daylight'] = (df['일사(MJ/m2)'] > 0).astype(int)

    df['is_offpeak'] = df['시간'].isin([0,1,2,3,4,5,6,23]).astype(int)
    df['is_peak']    = df['시간'].isin([13,14,15,16,17]).astype(int)

    df['ess_charge_potential']    = ((df['has_ess']==1) & (df['is_offpeak']==1)).astype(int)
    df['ess_discharge_potential'] = ((df['has_ess']==1) & (df['is_peak']==1)).astype(int)

    for c in ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']:
        df[f'log1p_{c}'] = np.log1p(df[c])

    df['ess_to_load_lag_ratio'] = np.where(
        df.get('cons_lag_24h', pd.Series(index=df.index)).notna(),
        df['ESS저장용량(kWh)'] / (df['cons_lag_24h'] + 1e-6),
        np.nan
    )

    # ---------- 기타 ----------
    df['month'] = df['dt'].dt.month
    df['hour']  = df['시간']
    df['dayofyear'] = df['dt'].dt.dayofyear

    return df




In [14]:
def backfill_solar_by_time(all_df: pd.DataFrame) -> pd.DataFrame:
    # dt/시간 보장
    if 'dt' not in all_df.columns:
        raise ValueError("dt 없으면 ensure_datetime_cols 먼저 호출")
    all_df = all_df.copy()
    all_df['month'] = all_df['dt'].dt.month
    all_df['hour']  = all_df['시간']

    # train/test 구분: 타깃 존재 여부로 판별
    is_train = all_df['전력소비량(kWh)'].notna() if '전력소비량(kWh)' in all_df.columns else pd.Series(False, index=all_df.index)

    # 기준 통계 (train에서만)
    rad_ref = all_df.loc[is_train].groupby(['month','hour'])['일사(MJ/m2)'].median()
    sun_ref = all_df.loc[is_train].groupby(['month','hour'])['일조(hr)'].median()

    # index 매칭해서 test 행만 채움
    idx = pd.MultiIndex.from_frame(all_df[['month','hour']])
    fill_rad = rad_ref.reindex(idx).values
    fill_sun = sun_ref.reindex(idx).values

    need_fill_rad = (~is_train) & (all_df['일사(MJ/m2)'].isna() | (all_df['일사(MJ/m2)'] == 0))
    need_fill_sun = (~is_train) & (all_df['일조(hr)'].isna()     | (all_df['일조(hr)'] == 0))

    all_df.loc[need_fill_rad, '일사(MJ/m2)'] = fill_rad[need_fill_rad.values]
    all_df.loc[need_fill_sun, '일조(hr)']    = fill_sun[need_fill_sun.values]

    # 혹시라도 남은 결측은 0으로
    all_df['일사(MJ/m2)'] = all_df['일사(MJ/m2)'].fillna(0)
    all_df['일조(hr)']    = all_df['일조(hr)'].fillna(0)
    return all_df


In [None]:
def make_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    - 미래정보 누설 방지: shift/rolling 모두 과거만 사용
    - groupby().transform(...) 으로 인덱스 정렬 유지
    """
    df = df.copy()

    # 일사/일조 안전 처리(테스트에 없을 수 있음)
    if '일사(MJ/m2)' not in df.columns:
        df['일사(MJ/m2)'] = 0.0
    if '일조(hr)' not in df.columns:
        df['일조(hr)'] = 0.0

    # 정렬 후 그룹 객체 생성
    df = df.sort_values(['건물번호', 'dt']).reset_index(drop=True)
    grp = df.groupby('건물번호', sort=False)  # ← 여기서 grp 정의

    # ---------- 타깃 라그 & 롤링 ----------
    if '전력소비량(kWh)' in df.columns:
        df['cons_lag1']     = grp['전력소비량(kWh)'].shift(1)
        df['cons_lag_24h']  = grp['전력소비량(kWh)'].shift(24)
        df['cons_lag_48h']  = grp['전력소비량(kWh)'].shift(48)
        df['cons_lag_72h']  = grp['전력소비량(kWh)'].shift(72)
        df['cons_lag_168h'] = grp['전력소비량(kWh)'].shift(168)  # 7일 전

        # 최근 24시간 평균(과거만; shift(1) 후 rolling)
        df['cons_mean_24h'] = grp['전력소비량(kWh)'] \
            .transform(lambda s: s.shift(1).rolling(window=24, min_periods=1).mean())

        # 최근 7일 같은 시각 평균(24시간 간격 7개)
        df['cons_samehour_mean_7d'] = grp['전력소비량(kWh)'] \
            .transform(lambda s: s.shift(24).rolling(window=7, min_periods=1).mean())

        # ✅ 최근 24시간 표준편차(변동성) — cons_lag1(과거값) 기반
        df['cons_std_24h'] = grp['cons_lag1'] \
            .transform(lambda s: s.rolling(window=24, min_periods=6).std())
    else:
        for c in ['cons_lag1','cons_lag_24h','cons_lag_48h','cons_lag_72h','cons_lag_168h',
                  'cons_mean_24h','cons_samehour_mean_7d','cons_std_24h']:
            df[c] = np.nan

    # ---------- 냉방 수요 지표 ----------
    base_temp = 24.0
    df['CDD'] = (df['기온(°C)'] - base_temp).clip(lower=0)

    q99 = df['일사(MJ/m2)'].quantile(0.99) + 1e-6
    rad_norm = (df['일사(MJ/m2)'].clip(upper=q99) / q99)
    df['CDD_x_rad'] = df['CDD'] * rad_norm

    alpha = 0.3
    df['CDD_humid_adj'] = df['CDD'] * (1 + alpha * (df['습도(%)'] / 100.0))

    # ---------- 달력 피처 ----------
    df['weekday']    = df['dt'].dt.weekday
    df['is_weekend'] = (df['weekday'] >= 5).astype(int)
    kr_holidays = {pd.Timestamp(2024,6,6), pd.Timestamp(2024,8,15)}
    df['is_holiday'] = df['날짜'].isin(kr_holidays).astype(int)

    # ---------- 설비 가능성 피처 ----------
    df['has_pv']  = (df['태양광용량(kW)']  > 0).astype(int)
    df['has_ess'] = (df['ESS저장용량(kWh)'] > 0).astype(int)
    df['has_pcs'] = (df['PCS용량(kW)']    > 0).astype(int)
    df['is_daylight'] = (df['일사(MJ/m2)'] > 0).astype(int)
    df['is_offpeak'] = df['시간'].isin([0,1,2,3,4,5,6,23]).astype(int)
    df['is_peak']    = df['시간'].isin([13,14,15,16,17]).astype(int)
    df['ess_charge_potential']    = ((df['has_ess']==1) & (df['is_offpeak']==1)).astype(int)
    df['ess_discharge_potential'] = ((df['has_ess']==1) & (df['is_peak']==1)).astype(int)

    for c in ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']:
        df[f'log1p_{c}'] = np.log1p(df[c])

    df['ess_to_load_lag_ratio'] = np.where(
        df.get('cons_lag_24h', pd.Series(index=df.index)).notna(),
        df['ESS저장용량(kWh)'] / (df['cons_lag_24h'] + 1e-6),
        np.nan
    )

    # ---------- 기타 ----------
    df['month'] = df['dt'].dt.month
    df['hour']  = df['시간']
    df['dayofyear'] = df['dt'].dt.dayofyear

    # (옵션) 시간 사이클릭 인코딩 원하면 활성화
    # df['hour_sin'] = np.sin(2*np.pi*df['hour']/24)
    # df['hour_cos'] = np.cos(2*np.pi*df['hour']/24)

    return df




In [None]:
params = {
    'objective': 'tweedie',
    'tweedie_variance_power': 1.4,  # 1.2~1.6 사이 튜닝
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 64,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 1,
    'min_data_in_leaf': 50,
    'lambda_l2': 1.0,
    'seed': 42,
    'verbosity': -1
}


In [None]:
MAX_ABS = 1e12  # 과도한 값 하드 클립 임계

def sanitize_matrix(X: pd.DataFrame) -> pd.DataFrame:
    """Inf 제거 + 과대값 클립 (LightGBM은 NaN은 허용, Inf는 불가)"""
    X = X.copy()
    num_cols = X.select_dtypes(include=[np.number]).columns
    X[num_cols] = X[num_cols].replace([np.inf, -np.inf], np.nan)
    X[num_cols] = X[num_cols].clip(lower=-MAX_ABS, upper=MAX_ABS)
    return X

def safe_log1p_vec(a):
    """음수/비정상값 방지 후 log1p"""
    a = np.asarray(a, dtype=float)
    # 비정상(y에 NaN/inf) → 0으로 대체 (혹은 np.nan 유지하고 마스킹하려면 전략 바꿔도 됨)
    a = np.where(np.isfinite(a), a, 0.0)
    a = np.clip(a, 0, None)  # -0 방지
    return np.log1p(a)



In [None]:
# ----- a, b 동시 추정(최소제곱) -----
pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)
true_resid_va = (y_va - baseline_va).values

A = np.column_stack([pred_va_resid, np.ones_like(pred_va_resid)])
a, b = np.linalg.lstsq(A, true_resid_va, rcond=None)[0]
a = float(np.clip(a, 0.0, 1.5))  # 안정화
b = float(b)

pred_va = baseline_va.values + a * pred_va_resid + b

print(f"[Blending] a={a:.3f}, b={b:.3f}")
print("Baseline-only  RMSE:", rmse(y_va, baseline_va), "MAE:", mean_absolute_error(y_va, baseline_va))
print("BLENDED       RMSE:", rmse(y_va, pred_va), "MAE:", mean_absolute_error(y_va, pred_va))


In [None]:
# 잔차 예측
test_pred_resid = final_model.predict(X_te, num_iteration=final_model.best_iteration)

# 검증에서 구한 a,b로 복원
a_use = a if 'a' in globals() else 1.0
b_use = b if 'b' in globals() else 0.0
test_pred = baseline_te.values + a_use * test_pred_resid + b_use


In [None]:
HARD_BLD = {10,79,3,45,23,1,12,69,64,34}
w_tr = np.ones(len(train_part), dtype=float)
w_tr[train_part['건물번호'].isin(HARD_BLD).values] = 1.8  # 1.5~2.0 사이 시도

lgb_train = lgb.Dataset(X_tr, label=y_tr_resid, weight=w_tr, categorical_feature=cat_cols or None)


In [None]:
# 1) make_features() 셀(패치 포함) 실행

# 2) 피처 다시 생성
train_feat = make_features(train_df)

# 3) 확인
print({
  'cons_lag1': 'cons_lag1' in train_feat.columns,
  'cons_lag_24h': 'cons_lag_24h' in train_feat.columns,
  'cons_lag_168h': 'cons_lag_168h' in train_feat.columns,
  'delta_1h': 'delta_1h' in train_feat.columns,
  'delta_7d': 'delta_7d' in train_feat.columns
})


In [18]:
# =========================================
# 0) 설정 (경로만 네 환경에 맞게 바꿔)
# =========================================
import os, numpy as np, pandas as pd, lightgbm as lgb
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

# ▶ 필요 시 네 경로로 교체
TRAIN_MERGED_PATH = r"C:\Users\user\Downloads\open (1)\merged_train.csv"
TEST_MERGED_PATH  = r"C:\Users\user\Downloads\open (1)\merged_test.csv"      # test+building 병합본
SAMPLE_SUB        = r"C:\Users\user\Downloads\open (1)\sample_submission.csv"
OUT_SUB           = r"C:\Users\user\Downloads\open (1)\submission_rmse_residual.csv"
os.makedirs(os.path.dirname(OUT_SUB), exist_ok=True)

VAL_START = pd.Timestamp(2024,8,17,0)   # 시계열 홀드아웃
VAL_END   = pd.Timestamp(2024,8,24,23)

# (어제 쓰던 Day1 안전 파라미터)
LGBM_PARAMS = {
    'objective': 'regression', 'metric': 'rmse', 'learning_rate': 0.05,
    'num_leaves': 48, 'feature_fraction': 0.85, 'bagging_fraction': 0.85,
    'bagging_freq': 1, 'min_data_in_leaf': 150, 'lambda_l2': 3.0,
    'seed': 42, 'verbosity': -1, 'num_threads': 4
}

# 하드 빌딩 가중(있으면 조금 더 학습)
HARD_BLD = {64, 1, 34, 3, 6, 10, 45, 79, 54, 23}
HARD_BLD_WEIGHT = 1.8

BASE_W7, BASE_W24 = 0.70, 0.30  # baseline 혼합 비율

In [19]:
# =========================================
# 1) 유틸
# =========================================
def read_csv_smart(path):
    try:
        return pd.read_csv(path, encoding='utf-8-sig')
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding='cp949')

def ensure_datetime_cols(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if '날짜' not in df.columns or '시간' not in df.columns:
        if '일시' not in df.columns:
            raise ValueError("일시 또는 (날짜,시간) 필요")
        s = df['일시'].astype(str)
        df['날짜'] = pd.to_datetime(s.str.slice(0,8), format='%Y%m%d')
        df['시간'] = s.str.slice(9,11).astype(int)
    else:
        df['날짜'] = pd.to_datetime(df['날짜'])
        df['시간'] = df['시간'].astype(int)
    df['dt'] = df['날짜'] + pd.to_timedelta(df['시간'], unit='h')
    return df

def clean_capacity_fields(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for c in ['태양광용량(kW)','ESS저장용량(kWh)','PCS용량(kW)']:
        if c in df.columns:
            df[c] = df[c].replace('-', 0).astype(float)
    return df

def add_safe_weather_cols(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if '일사(MJ/m2)' not in df.columns: df['일사(MJ/m2)'] = 0.0
    if '일조(hr)'   not in df.columns: df['일조(hr)']   = 0.0
    return df

def make_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df = add_safe_weather_cols(df)
    df = df.sort_values(['건물번호','dt']).reset_index(drop=True)
    grp = df.groupby('건물번호', sort=False)

    # 기본 라그/롤링 (과거만)
    if '전력소비량(kWh)' in df.columns:
        df['cons_lag1']     = grp['전력소비량(kWh)'].shift(1)
        df['cons_lag_24h']  = grp['전력소비량(kWh)'].shift(24)
        df['cons_lag_48h']  = grp['전력소비량(kWh)'].shift(48)
        df['cons_lag_72h']  = grp['전력소비량(kWh)'].shift(72)
        df['cons_lag_168h'] = grp['전력소비량(kWh)'].shift(168)
        df['cons_mean_24h'] = grp['전력소비량(kWh)'].transform(lambda s: s.shift(1).rolling(24, min_periods=1).mean())
        df['cons_samehour_mean_7d'] = grp['전력소비량(kWh)'].transform(lambda s: s.shift(24).rolling(7, min_periods=1).mean())
        df['cons_std_24h'] = grp['cons_lag1'].transform(lambda s: s.rolling(24, min_periods=6).std())
    else:
        for c in ['cons_lag1','cons_lag_24h','cons_lag_48h','cons_lag_72h','cons_lag_168h',
                  'cons_mean_24h','cons_samehour_mean_7d','cons_std_24h']:
            df[c] = np.nan

    # 냉방 수요 지표
    base_temp = 24.0
    df['CDD'] = (df['기온(°C)'] - base_temp).clip(lower=0)
    q99 = df['일사(MJ/m2)'].quantile(0.99) + 1e-6
    rad_norm = (df['일사(MJ/m2)'].clip(upper=q99) / q99)
    df['CDD_x_rad'] = df['CDD'] * rad_norm
    df['CDD_humid_adj'] = df['CDD'] * (1 + 0.3 * (df['습도(%)'] / 100.0))

    # 달력/시간
    df['weekday']    = df['dt'].dt.weekday
    df['is_weekend'] = (df['weekday'] >= 5).astype(int)
    kr_holidays = {pd.Timestamp(2024,6,6), pd.Timestamp(2024,8,15)}
    df['is_holiday'] = df['날짜'].isin(kr_holidays).astype(int)
    df['month']      = df['dt'].dt.month
    df['hour']       = df['시간']
    df['dayofyear']  = df['dt'].dt.dayofyear

    # 설비·운영 힌트
    for c in ['태양광용량(kW)','ESS저장용량(kWh)','PCS용량(kW)']:
        if c in df.columns: df[f'log1p_{c}'] = np.log1p(df[c])
    df['has_pv']  = (df.get('태양광용량(kW)',0)  > 0).astype(int)
    df['has_ess'] = (df.get('ESS저장용량(kWh)',0) > 0).astype(int)
    df['has_pcs'] = (df.get('PCS용량(kW)',0)    > 0).astype(int)
    df['is_offpeak'] = df['hour'].isin([0,1,2,3,4,5,6,23]).astype(int)
    df['is_peak']    = df['hour'].isin([13,14,15,16,17]).astype(int)
    df['ess_charge_potential']    = ((df['has_ess']==1) & (df['is_offpeak']==1)).astype(int)
    df['ess_discharge_potential'] = ((df['has_ess']==1) & (df['is_peak']==1)).astype(int)
    df['ess_to_load_lag_ratio'] = np.where(df['cons_lag_24h'].notna(),
                                           df.get('ESS저장용량(kWh)',0.0) / (df['cons_lag_24h'] + 1e-6),
                                           np.nan)
    return df

def build_baseline(df: pd.DataFrame) -> pd.Series:
    """same-hour 7d mean과 24h mean 혼합. 결측은 건물 평균으로 백업"""
    s7  = df['cons_samehour_mean_7d']
    s24 = df['cons_mean_24h']
    base = BASE_W7 * s7 + BASE_W24 * s24
    # 백업: 건물별 과거 평균
    bld_mean = df.groupby('건물번호')['전력소비량(kWh)'].transform('mean') if '전력소비량(kWh)' in df.columns else 0.0
    return base.fillna(bld_mean).fillna(0.0)

def align_for_lgb(df_tr, df_va_or_te, feature_list, cat_cols):
    X1 = df_tr.reindex(columns=feature_list).copy()
    X2 = df_va_or_te.reindex(columns=feature_list).copy()
    for c in (cat_cols or []):
        X1[c] = X1[c].astype('category')
        X2[c] = X2[c].astype('category').cat.set_categories(X1[c].cat.categories)
    num_cols = [c for c in feature_list if c not in (cat_cols or [])]
    X1[num_cols] = X1[num_cols].replace([np.inf,-np.inf], np.nan).astype(float).fillna(0.0)
    X2[num_cols] = X2[num_cols].replace([np.inf,-np.inf], np.nan).astype(float).fillna(0.0)
    return X1, X2

def RMSE(y, yhat): return float(np.sqrt(mean_squared_error(y, yhat)))

In [18]:
# =========================
# 0) 기본 세팅 & 메트릭
# =========================
import numpy as np
import pandas as pd

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.inspection import permutation_importance
from sklearn.ensemble import HistGradientBoostingRegressor

# ---- 구성 (필요시 바꿔줘) ----
CONFIG = {
    "ts_col": "timestamp",         # 시간 컬럼
    "group_col": "building_id",    # 그룹(건물) 키
    "y_col": "TARGET",             # 타깃
    "weather": {                   # 기상 컬럼(없으면 None로)
        "temp": "T",
        "rh":   "RH",
        "ws":   "WS",
    },
    "log1p_target": False,         # 분산이 크면 True로 바꿔서 log1p 변환
    "train_days": 60,              # 한 fold의 학습 기간(일)
    "val_days": 14,                # 한 fold의 검증 기간(일)
    "n_folds": 4,                  # 폴드 수
}


In [19]:
# =========================
# 1) 전처리 유틸 함수들
# =========================
def add_time_features(df, ts_col):
    df = df.copy()
    df[ts_col] = pd.to_datetime(df[ts_col])
    df['hour'] = df[ts_col].dt.hour
    df['dow'] = df[ts_col].dt.dayofweek
    df['month'] = df[ts_col].dt.month
    df['is_weekend'] = (df['dow'] >= 5).astype(int)
    # 사이클릭(주기형, 모델이 시간 경계(23→0시)에서 끊기지 않게 함)
    df['hour_sin'] = np.sin(2*np.pi*df['hour']/24)
    df['hour_cos'] = np.cos(2*np.pi*df['hour']/24)
    df['dow_sin']  = np.sin(2*np.pi*df['dow']/7)
    df['dow_cos']  = np.cos(2*np.pi*df['dow']/7)
    return df

def add_target_lags_rolls(df, y_col, group_col, ts_col):
    # 미래 정보가 새지 않게 lag/rolling 전에 정렬 + shift(1) 사용
    df = df.sort_values([group_col, ts_col]).copy()
    def _by_group(g):
        g[f'{y_col}_lag1']   = g[y_col].shift(1)
        g[f'{y_col}_lag24']  = g[y_col].shift(24)
        g[f'{y_col}_lag168'] = g[y_col].shift(168)
        for w in [3, 24, 168]:
            g[f'{y_col}_roll{w}_mean'] = g[y_col].shift(1).rolling(w, min_periods=2).mean()
            g[f'{y_col}_roll{w}_std']  = g[y_col].shift(1).rolling(w, min_periods=2).std()
        # 지수가중 이동평균 (최근치에 더 가중)
        g[f'{y_col}_ewm'] = g[y_col].shift(1).ewm(alpha=0.3, adjust=False).mean()
        return g
    return df.groupby(group_col, group_keys=False).apply(_by_group)

def add_weather_features(df, weather, ts_col):
    # 기상 파생: CDD/HDD, lag/roll, 상호작용(시간×온도)
    df = df.copy()
    temp = weather.get("temp")
    rh   = weather.get("rh")
    ws   = weather.get("ws")

    if temp and temp in df:
        df['CDD'] = (df[temp] - 23).clip(lower=0)   # 기준온도는 튜닝 포인트
        df['HDD'] = (18 - df[temp]).clip(lower=0)
        for lag in [1, 24]:
            df[f'{temp}_lag{lag}'] = df[temp].shift(lag)
        for w in [3, 24]:
            df[f'{temp}_roll{w}'] = df[temp].rolling(w, min_periods=2).mean()
    if rh and rh in df:
        df[f'{rh}_roll24'] = df[rh].rolling(24, min_periods=2).mean()
    if ws and ws in df:
        df[f'{ws}_roll24'] = df[ws].rolling(24, min_periods=2).mean()

    # 시간 주기와 온도의 상호작용(주기적 패턴이 온도에 따라 달라질 때 유용)
    if temp and ('hour_sin' in df):
        df['T_x_hour'] = df[temp] * df['hour_sin']
    return df

def impute_and_cap_target(df, y_col, group_col, ts_col):
    # TARGET 결측 보간 → 이상치 윈저라이즈(1~99 분위)로 안정화
    df = df.sort_values([group_col, ts_col]).copy()

    # 1) 그룹 내부 선형 대체(앞뒤 채움)
    df[y_col] = df.groupby(group_col)[y_col].apply(lambda s: s.ffill().bfill())

    # 2) 그래도 남으면 시간대별 중앙값(hour×dow)로 2차 보간
    if 'hour' not in df or 'dow' not in df:
        raise ValueError("add_time_features를 먼저 호출해서 hour/dow를 만들어줘.")
    medmap = df.groupby(['hour','dow'])[y_col].median()
    null_idx = df[y_col].isna()
    if null_idx.any():
        df.loc[null_idx, y_col] = df.loc[null_idx, ['hour','dow']].apply(
            lambda r: medmap.get((r['hour'], r['dow']), np.nan), axis=1
        )

    # 3) 이상치 캡핑(각 그룹마다 1~99% 범위로 자름)
    def _winsor(s):
        lo, hi = s.quantile(0.01), s.quantile(0.99)
        return s.clip(lo, hi)
    df[y_col] = df.groupby(group_col)[y_col].transform(_winsor)
    return df

def group_standardize(df, cols, group_col):
    # 그룹별로 평균/표준편차가 다른 경우 분포를 맞춰줌
    df = df.copy()
    for c in cols:
        mu = df.groupby(group_col)[c].tra_


In [20]:
# =========================
# 2) 전체 피처 빌더
# =========================
def build_features(df, cfg=CONFIG):
    ts, grp, y = cfg["ts_col"], cfg["group_col"], cfg["y_col"]
    df_ = df.copy()
    # 시간/주기
    df_ = add_time_features(df_, ts)
    # 타깃 결측/이상치 정리(먼저 해두면 뒤의 롤링/라그 계산이 안정적)
    df_ = impute_and_cap_target(df_, y, grp, ts)
    # 라그/롤링
    df_ = add_target_lags_rolls(df_, y, grp, ts)
    # 기상 파생
    df_ = add_weather_features(df_, cfg["weather"], ts)
    # (선택) 그룹 표준화 — 필요한 컬럼만 넣자. 여기선 y만 예시.
    # df_ = group_standardize(df_, [y], grp)

    # 라그/롤링으로 생긴 초반 NaN 제거
    df_ = df_.dropna(subset=[y]).copy()
    return df_


In [21]:
# =========================
# 3) 롤링 폴드 분할
# =========================
def make_rolling_folds(df, cfg=CONFIG):
    ts = cfg["ts_col"]
    n_folds = cfg["n_folds"]
    train_days = cfg["train_days"]
    val_days = cfg["val_days"]

    d = df.copy()
    d[ts] = pd.to_datetime(d[ts])
    uniq = d[ts].sort_values().drop_duplicates().tolist()
    need = (train_days + val_days) * 24
    assert len(uniq) >= need, "데이터가 부족해. train_days/val_days를 줄여줘."

    folds = []
    # 뒤에서부터 자르되, 반환은 오래된 fold부터
    for k in range(n_folds):
        val_end = len(uniq) - (k * val_days * 24)
        val_start = val_end - val_days * 24
        train_end = val_start
        train_start = max(0, train_end - train_days * 24)
        tr = (uniq[train_start], uniq[train_end-1])
        va = (uniq[val_start],   uniq[val_end-1])
        folds.append((tr, va))
    return folds[::-1]  # 오래된→최신


In [25]:
df

Unnamed: 0,건물번호,기온(°C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh),건물유형,연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW),날짜,시간
0,1,18.3,0.0,2.6,82.0,0.0,0.00,5794.80,9,82912.71,77586.0,0.0,0.0,0.0,2024-06-01,0
1,1,18.3,0.0,2.7,82.0,0.0,0.00,5591.85,9,82912.71,77586.0,0.0,0.0,0.0,2024-06-01,1
2,1,18.1,0.0,2.6,80.0,0.0,0.00,5338.17,9,82912.71,77586.0,0.0,0.0,0.0,2024-06-01,2
3,1,18.0,0.0,2.6,81.0,0.0,0.00,4554.42,9,82912.71,77586.0,0.0,0.0,0.0,2024-06-01,3
4,1,17.8,0.0,1.3,81.0,0.0,0.00,3602.25,9,82912.71,77586.0,0.0,0.0,0.0,2024-06-01,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100,29.1,0.0,4.4,76.0,0.4,0.18,3276.00,9,162070.24,152943.0,0.0,0.0,0.0,2024-08-24,19
203996,100,28.6,0.0,3.7,74.0,0.0,0.00,3197.52,9,162070.24,152943.0,0.0,0.0,0.0,2024-08-24,20
203997,100,28.3,0.0,2.9,74.0,0.0,0.00,3006.60,9,162070.24,152943.0,0.0,0.0,0.0,2024-08-24,21
203998,100,28.0,0.0,1.7,76.0,0.0,0.00,2649.72,9,162070.24,152943.0,0.0,0.0,0.0,2024-08-24,22


In [22]:
# =========================
# 4) 학습/평가 루프
# =========================
def _select_feature_columns(df, cfg=CONFIG):
    # 모델 입력: 수치형만 선택하고, ID/시간/타깃/명백한 누수 컬럼 제외
    ts, grp, y = cfg["ts_col"], cfg["group_col"], cfg["y_col"]
    drop_like = {ts, grp, y}
    cols = []
    for c in df.columns:
        if c in drop_like:
            continue
        if pd.api.types.is_numeric_dtype(df[c]):
            cols.append(c)
    return cols

def _metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    # MAPE는 0 나눔 방지용 epsilon
    eps = 1e-6
    mape = np.mean(np.abs((y_true - y_pred) / np.maximum(eps, np.abs(y_true)))) * 100
    r2 = r2_score(y_true, y_pred)
    return {"MAE": mae, "RMSE": rmse, "MAPE(%)": mape, "R2": r2}

def run_cv_training(df_feat, cfg=CONFIG, model_params=None, verbose=True):
    ts, y = cfg["ts_col"], cfg["y_col"]
    folds = make_rolling_folds(df_feat, cfg)
    feat_cols = _select_feature_columns(df_feat, cfg)

    if model_params is None:
        model_params = dict(
            max_depth=None,      # 자동
            learning_rate=0.06,  # 살짝 낮춰 안정성↑
            max_bins=255,
            min_samples_leaf=20,
            l2_regularization=0.0,
            random_state=42,
        )

    fold_reports = []
    oof_pred = pd.Series(index=df_feat.index, dtype=float)
    models = []

    # (선택) 타깃 로그 변환
    use_log = cfg.get("log1p_target", False)

    for i, ((tr_start, tr_end), (va_start, va_end)) in enumerate(folds, 1):
        tr_mask = (df_feat[ts] >= tr_start) & (df_feat[ts] <= tr_end)
        va_mask = (df_feat[ts] >= va_start) & (df_feat[ts] <= va_end)

        tr_df = df_feat.loc[tr_mask]
        va_df = df_feat.loc[va_mask]

        X_tr = tr_df[feat_cols]
        y_tr = tr_df[y].values
        X_va = va_df[feat_cols]
        y_va = va_df[y].values

        if use_log:
            y_tr_fit = np.log1p(y_tr)
        else:
            y_tr_fit = y_tr

        model = HistGradientBoostingRegressor(**model_params)
        model.fit(X_tr, y_tr_fit)
        models.append(model)

        # 예측 & 역변환
        yhat_va = model.predict(X_va)
        if use_log:
            yhat_va = np.expm1(yhat_va)

        oof_pred.loc[va_df.index] = yhat_va
        rep = _metrics(y_va, yhat_va)
        rep["fold"] = i
        rep["train_range"] = (tr_start, tr_end)
        rep["val_range"] = (va_start, va_end)
        fold_reports.append(rep)
        if verbose:
            print(f"[Fold {i}] {rep}")

    # 전체 OOF 점수
    whole = _metrics(df_feat[y].loc[oof_pred.index].values, oof_pred.values)
    if verbose:
        print("\n[OOF] ", whole)

    rep_df = pd.DataFrame(fold_reports)
    return {
        "feature_cols": feat_cols,
        "fold_reports": rep_df,
        "oof_metrics": whole,
        "oof_pred": oof_pred,
        "models": models,
    }


In [26]:
import numpy as np
import pandas as pd

# 0-1) 너 데이터 컬럼명에 맞춘 CONFIG
CONFIG = {
    "ts_col": "timestamp",           # 아래에서 '날짜'+'시간' 합쳐 만들어줄 컬럼
    "group_col": "건물번호",          # 그룹 키
    "y_col": "전력소비량(kWh)",       # 타깃
    "weather": {                     # 기상 컬럼 매핑
        "temp": "기온(°C)",
        "rh":   "습도(%)",
        "ws":   "풍속(m/s)",
        # 아래 3개는 있으면 자동으로 같이 씀(없으면 무시됨)
        "radiation": "일사(MJ/m2)",
        "sunshine":  "일조(hr)",
        "precip":    "강수량(mm)",
    },
    "log1p_target": False,           # 타깃 로그 변환이 필요하면 True
    "train_days": 60,
    "val_days": 14,
    "n_folds": 4,
}

# 0-2) '날짜' + '시간' → timestamp 생성 (형식이 '13', '13:00', '13시' 등이어도 웬만해선 처리)
def make_timestamp(df, date_col="날짜", time_col="시간", out_col="timestamp"):
    d = pd.to_datetime(df[date_col], errors="coerce")

    t_raw = df[time_col].astype(str).str.strip()
    # (1) '13시', '13:00' 등에서 시간 숫자만 뽑기
    hh = pd.to_numeric(t_raw.str.extract(r"(\d{1,2})")[0], errors="coerce")

    ts = d + pd.to_timedelta(hh.fillna(0), unit="h")  # 우선 시간만 붙임
    # (2) 만약 일부가 'HH:MM'처럼 들어와서 위에서 NaN이면, 날짜 문자열과 합쳐서 직접 파싱
    mask = hh.isna()
    if mask.any():
        ts2 = pd.to_datetime(
            d[mask].dt.strftime("%Y-%m-%d") + " " + t_raw[mask],
            errors="coerce"
        )
        ts[mask] = ts2

    df = df.copy()
    df[out_col] = ts
    return df

# 사용: df = make_timestamp(df, "날짜", "시간")


In [27]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import HistGradientBoostingRegressor

def add_time_features(df, ts_col):
    df = df.copy()
    df[ts_col] = pd.to_datetime(df[ts_col])
    df['hour'] = df[ts_col].dt.hour
    df['dow'] = df[ts_col].dt.dayofweek
    df['month'] = df[ts_col].dt.month
    df['is_weekend'] = (df['dow'] >= 5).astype(int)
    # 주기형 인코딩(23→0 경계 끊김 방지)
    df['hour_sin'] = np.sin(2*np.pi*df['hour']/24)
    df['hour_cos'] = np.cos(2*np.pi*df['hour']/24)
    df['dow_sin']  = np.sin(2*np.pi*df['dow']/7)
    df['dow_cos']  = np.cos(2*np.pi*df['dow']/7)
    return df

def add_target_lags_rolls(df, y_col, group_col, ts_col):
    # 미래 정보 누수 방지: 정렬 + shift(1) 후 롤링
    df = df.sort_values([group_col, ts_col]).copy()
    def _by_group(g):
        g[f'{y_col}_lag1']   = g[y_col].shift(1)
        g[f'{y_col}_lag24']  = g[y_col].shift(24)
        g[f'{y_col}_lag168'] = g[y_col].shift(168)
        for w in [3, 24, 168]:
            g[f'{y_col}_roll{w}_mean'] = g[y_col].shift(1).rolling(w, min_periods=2).mean()
            g[f'{y_col}_roll{w}_std']  = g[y_col].shift(1).rolling(w, min_periods=2).std()
        g[f'{y_col}_ewm'] = g[y_col].shift(1).ewm(alpha=0.3, adjust=False).mean()
        return g
    return df.groupby(group_col, group_keys=False).apply(_by_group)

def add_weather_features(df, weather, ts_col):
    # 기상 파생: CDD/HDD, 온도/습도/풍속 lag/roll, + (선택) 일사/일조/강수 파생
    df = df.copy()
    temp = weather.get("temp")
    rh   = weather.get("rh")
    ws   = weather.get("ws")
    rad  = weather.get("radiation")
    sun  = weather.get("sunshine")
    prcp = weather.get("precip")

    if temp and temp in df:
        df['CDD'] = (df[temp] - 23).clip(lower=0)   # 기준온도는 튜닝 포인트
        df['HDD'] = (18 - df[temp]).clip(lower=0)
        for lag in [1, 24]:
            df[f'{temp}_lag{lag}'] = df[temp].shift(lag)
        for w in [3, 24]:
            df[f'{temp}_roll{w}'] = df[temp].rolling(w, min_periods=2).mean()
    if rh and rh in df:
        df[f'{rh}_roll24'] = df[rh].rolling(24, min_periods=2).mean()
    if ws and ws in df:
        df[f'{ws}_roll24'] = df[ws].rolling(24, min_periods=2).mean()

    # 일사/일조/강수는 그대로 써도 되지만, 약간의 파생이 도움이 될 때가 있음
    if rad and rad in df:
        df[f'{rad}_roll24'] = df[rad].rolling(24, min_periods=2).mean()
    if sun and sun in df:
        df[f'{sun}_roll24'] = df[sun].rolling(24, min_periods=2).mean()
    if prcp and prcp in df:
        df[f'{prcp}_roll24'] = df[prcp].rolling(24, min_periods=2).sum()  # 누적강수량 합계
        df['is_rain'] = (df[prcp] > 0).astype(int)

    # 시간 주기 × 온도 상호작용 (패턴이 온도에 따라 달라질 때 유용)
    if temp and ('hour_sin' in df):
        df['T_x_hour'] = df[temp] * df['hour_sin']
    return df

def impute_and_cap_target(df, y_col, group_col, ts_col):
    # TARGET 결측 보간 → 이상치 winsorize(1~99%)로 안정화
    df = df.sort_values([group_col, ts_col]).copy()

    # 1) 그룹 내부 forward/back fill
    df[y_col] = df.groupby(group_col)[y_col].apply(lambda s: s.ffill().bfill())

    # 2) 여전히 NaN이면 시간대별(hour×dow) 중앙값으로 2차 보간
    if 'hour' not in df or 'dow' not in df:
        raise ValueError("add_time_features를 먼저 호출해서 hour/dow를 만들어줘.")
    medmap = df.groupby(['hour','dow'])[y_col].median()
    null_idx = df[y_col].isna()
    if null_idx.any():
        df.loc[null_idx, y_col] = df.loc[null_idx, ['hour','dow']].apply(
            lambda r: medmap.get((r['hour'], r['dow']), np.nan), axis=1
        )

    # 3) 이상치 캡핑(각 그룹에서 1~99% 범위로 자름)
    def _winsor(s):
        lo, hi = s.quantile(0.01), s.quantile(0.99)
        return s.clip(lo, hi)
    df[y_col] = df.groupby(group_col)[y_col].transform(_winsor)
    return df

def group_standardize(df, cols, group_col):
    # 그룹별 평균/표준편차가 크게 다를 때 분포 맞춤
    df = df.copy()
    for c in cols:
        mu = df.groupby(group_col)[c].transform('mean')
        sd = df.groupby(group_col)[c].transform('std').replace(0, np.nan)
        df[f'{c}_z'] = (df[c] - mu) / sd
    return df


In [62]:
# === 한국 공휴일/브릿지데이 (2024년 여름 구간) ===
def add_korean_holidays_2024(df, ts_col="timestamp"):
    df = df.copy()
    ts = pd.to_datetime(df[ts_col])
    d = ts.dt.normalize()

    # 2024-06-06 (현충일), 2024-08-15 (광복절)
    holi = pd.to_datetime(["2024-06-06", "2024-08-15"])
    df["is_holiday"] = d.isin(holi).astype(int)

    # 브릿지데이: 휴일과 주말 사이의 평일(금/월 등)
    # 간단히: 휴일 전날/다음날이 평일이면 bridge=1
    weekday = ts.dt.dayofweek  # 0=월 ... 6=일
    prev_is_holi = d.shift(1).isin(holi)
    next_is_holi = d.shift(-1).isin(holi)
    is_weekend = (weekday >= 5)
    df["is_bridge"] = (
        ((prev_is_holi | next_is_holi) & ~is_weekend).astype(int)
    )

    return df


In [109]:
# === 확장 휴일 피처 (2024 여름 예시) ===
def add_korean_holidays_extended(df, ts_col="timestamp", pre_days=1, post_days=1):
    df = df.copy()
    ts = pd.to_datetime(df[ts_col]); d = ts.dt.normalize()

    # 필요한 날짜만 깔끔히: 6~8월
    holi = pd.to_datetime(["2024-06-06", "2024-08-15"])

    is_holiday = d.isin(holi)
    df["is_holiday"] = is_holiday.astype(int)

    # 전후 k일 윈도
    win = set()
    for h in holi:
        for k in range(-pre_days, post_days+1):
            win.add((h + pd.Timedelta(days=k)).normalize())
    df["is_holi_window"] = d.isin(list(win)).astype(int)

    # 브릿지데이(휴일과 주말 사이의 평일)
    w = ts.dt.dayofweek  # 0=월..6=일
    prev_h = d.shift(1).isin(holi); next_h = d.shift(-1).isin(holi)
    is_weekend = (w >= 5)
    df["is_bridge_strong"] = (((prev_h | next_h) & ~is_weekend)).astype(int)

    # 휴일 전날/다음날 더미(직접)
    df["is_before_holi"] = d.shift(-1).isin(holi).astype(int)
    df["is_after_holi"]  = d.shift(1).isin(holi).astype(int)
    return df

In [119]:
def build_features(df, cfg=CONFIG):
    ts, grp, y = cfg["ts_col"], cfg["group_col"], cfg["y_col"]
    df_ = df.copy()

    # (A) timestamp 만들기
    if ts not in df_:
        df_ = make_timestamp(df_, "날짜", "시간", out_col=ts)

    # (B) 숫자형 변환(문자 섞여도 안전)
    numeric_candidates = [
        y, "연면적(m2)", "냉방면적(m2)", "태양광용량(kW)",
        "ESS저장용량(kWh)", "PCS용량(kW)",
        cfg["weather"].get("temp"), cfg["weather"].get("rh"),
        cfg["weather"].get("ws"), cfg["weather"].get("radiation"),
        cfg["weather"].get("sunshine"), cfg["weather"].get("precip"),
    ]
    for c in set([c for c in numeric_candidates if c and c in df_.columns]):
        df_[c] = pd.to_numeric(df_[c], errors="coerce")

    # (C) 시간/주기
    df_ = add_korean_holidays_extended(df_, ts, pre_days=1, post_days=1)
    df_ = add_korean_holidays_2024(df_, ts)   # ← 요 줄 추가
    # (D) 타깃 결측/이상치
    df_ = impute_and_cap_target(df_, y, grp, ts)
    # (E) 라그/롤링
    df_ = add_target_lags_rolls(df_, y, grp, ts)

    # (+) 같은 시간대 과거 평균 & 안전한 델타/비율
    df_ = add_same_how_history(df_, y, grp, ts)
    df_ = add_safe_deltas(df_, y)

    # (+) 건물 장기 히스토리 피처  ← 새로 추가
    df_ = add_building_history_features(df_, y, grp, ts)

    # (F) 기상 파생 후
    df_ = add_weather_features(df_, cfg["weather"], ts)
    df_ = add_cdd_hdd_aggregates(df_)   # 기존
    df_ = add_piecewise_temp_features(df_, temp_col=cfg["weather"].get("temp"))  # ← 추가


    # (+) CDD/HDD 누적 합계  ← 새로 추가
    df_ = add_cdd_hdd_aggregates(df_)

    # (+) 고급 파생(이슬점/비선형 등)
    df_ = add_extra_features_v2(df_, cfg)
    df_ = add_type_interactions_selective(df_)


    # (G) 범주 → 원-핫
    if "건물유형" in df_:
        df_ = pd.concat([df_, pd.get_dummies(df_["건물유형"], prefix="건물유형", dummy_na=True)], axis=1)

    # (H) 라그/롤링 앞부분 NaN 정리
    df_ = df_.dropna(subset=[y]).reset_index(drop=True)

    # 🔴 반드시 반환!
    return df_


In [30]:
def make_rolling_folds(df, cfg=CONFIG):
    ts = cfg["ts_col"]
    n_folds = cfg["n_folds"]
    train_days = cfg["train_days"]
    val_days = cfg["val_days"]

    d = df.copy()
    d[ts] = pd.to_datetime(d[ts])
    uniq = d[ts].sort_values().drop_duplicates().tolist()
    need = (train_days + val_days) * 24
    assert len(uniq) >= need, "데이터가 부족해. train_days/val_days를 줄여줘."

    folds = []
    for k in range(n_folds):
        val_end = len(uniq) - (k * val_days * 24)
        val_start = val_end - val_days * 24
        train_end = val_start
        train_start = max(0, train_end - train_days * 24)
        tr = (uniq[train_start], uniq[train_end-1])
        va = (uniq[val_start],   uniq[val_end-1])
        folds.append((tr, va))
    return folds[::-1]

def _select_feature_columns(df, cfg=CONFIG):
    ts, grp, y = cfg["ts_col"], cfg["group_col"], cfg["y_col"]
    drop_like = {ts, grp, y, "날짜", "시간", "건물유형"}  # 명백한 식별자/원본 범주 제거
    cols = []
    for c in df.columns:
        if c in drop_like:
            continue
        if pd.api.types.is_numeric_dtype(df[c]):
            cols.append(c)
    return cols

def _metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    eps = 1e-6
    mape = np.mean(np.abs((y_true - y_pred) / np.maximum(eps, np.abs(y_true)))) * 100
    r2 = r2_score(y_true, y_pred)
    return {"MAE": mae, "RMSE": rmse, "MAPE(%)": mape, "R2": r2}

def run_cv_training(df_feat, cfg=CONFIG, model_params=None, verbose=True):
    ts, y = cfg["ts_col"], cfg["y_col"]
    folds = make_rolling_folds(df_feat, cfg)
    feat_cols = _select_feature_columns(df_feat, cfg)

    if model_params is None:
        model_params = dict(
            max_depth=None,
            learning_rate=0.06,
            max_bins=255,
            min_samples_leaf=20,
            l2_regularization=0.0,
            random_state=42,
        )

    oof_pred = pd.Series(index=df_feat.index, dtype=float)
    reports = []
    models = []
    use_log = cfg.get("log1p_target", False)

    for i, ((tr_start, tr_end), (va_start, va_end)) in enumerate(folds, 1):
        tr_mask = (df_feat[ts] >= tr_start) & (df_feat[ts] <= tr_end)
        va_mask = (df_feat[ts] >= va_start) & (df_feat[ts] <= va_end)

        X_tr = df_feat.loc[tr_mask, feat_cols]
        y_tr = df_feat.loc[tr_mask, y].values
        X_va = df_feat.loc[va_mask, feat_cols]
        y_va = df_feat.loc[va_mask, y].values

        y_fit = np.log1p(y_tr) if use_log else y_tr
        model = HistGradientBoostingRegressor(**model_params).fit(X_tr, y_fit)
        models.append(model)

        yhat_va = model.predict(X_va)
        if use_log:
            yhat_va = np.expm1(yhat_va)

        oof_pred.loc[X_va.index] = yhat_va
        rep = _metrics(y_va, yhat_va)
        rep["fold"] = i
        rep["train_range"] = (tr_start, tr_end)
        rep["val_range"] = (va_start, va_end)
        reports.append(rep)
        if verbose:
            print(f"[Fold {i}] {rep}")

    whole = _metrics(df_feat[y].loc[oof_pred.index].values, oof_pred.values)
    if verbose:
        print("\n[OOF] ", whole)

    return {
        "feature_cols": feat_cols,
        "fold_reports": pd.DataFrame(reports),
        "oof_metrics": whole,
        "oof_pred": oof_pred,
        "models": models,
    }


In [53]:
def impute_and_cap_target(df, y_col, group_col, ts_col):
    """
    1) 건물(그룹) 내부에서 TARGET을 ffill/bfill로 보간
    2) 남은 NaN은 hour×dow 중앙값으로 채움
    3) 그룹별로 1~99 분위 클리핑(윈저라이즈)로 이상치 완화
    ※ 전부 transform 기반이라 인덱스가 깨지지 않음
    """
    df = df.sort_values([group_col, ts_col]).copy()

    # 1) 그룹 내 시계열 보간 (인덱스 보존)
    df[y_col] = df.groupby(group_col)[y_col].transform(lambda s: s.ffill().bfill())

    # 2) hour×dow 중앙값으로 2차 보간 (apply 대신 transform으로 한 방에 정렬)
    if 'hour' not in df or 'dow' not in df:
        raise ValueError("add_time_features를 먼저 호출해서 hour/dow를 만들어줘.")
    hour_dow_median = df.groupby(['hour','dow'])[y_col].transform('median')
    df[y_col] = df[y_col].fillna(hour_dow_median)

    # 3) 이상치 윈저라이즈 (그룹별 분위수 기반 클리핑, 역시 transform)
    def _winsor(s):
        lo, hi = s.quantile(0.01), s.quantile(0.99)
        return s.clip(lo, hi)
    df[y_col] = df.groupby(group_col)[y_col].transform(_winsor)

    return df


In [40]:
def add_target_lags_rolls(df, y_col, group_col, ts_col):
    """
    그룹별 타깃 lag/rolling/EWM 생성 (누수 방지).
    groupby.apply 대신 shift/rolling/transform을 조합해서 인덱스 보존 + 경고 제거.
    """
    df = df.sort_values([group_col, ts_col]).copy()
    g = df.groupby(group_col)

    # Lags
    df[f'{y_col}_lag1']   = g[y_col].shift(1)
    df[f'{y_col}_lag24']  = g[y_col].shift(24)
    df[f'{y_col}_lag168'] = g[y_col].shift(168)

    # Rolling stats (shift(1) 뒤 rolling, 멀티인덱스 → 원래 인덱스로 평탄화)
    for w in [3, 24, 168]:
        roll_mean = (g[y_col].shift(1).rolling(w, min_periods=2).mean()
                     .reset_index(level=0, drop=True))
        roll_std  = (g[y_col].shift(1).rolling(w, min_periods=2).std()
                     .reset_index(level=0, drop=True))
        df[f'{y_col}_roll{w}_mean'] = roll_mean
        df[f'{y_col}_roll{w}_std']  = roll_std

    # EWM
    df[f'{y_col}_ewm'] = g[y_col].transform(lambda s: s.shift(1).ewm(alpha=0.3, adjust=False).mean())
    return df

def impute_and_cap_target(df, y_col, group_col, ts_col):
    """
    1) 그룹 내 ffill/bfill
    2) 남은 NaN은 hour×dow 중앙값으로 대체
    3) 그룹별 1~99 분위 윈저라이즈
    (모두 transform 기반 → 인덱스 안정)
    """
    df = df.sort_values([group_col, ts_col]).copy()
    # 1) 그룹 내부 보간
    df[y_col] = df.groupby(group_col)[y_col].transform(lambda s: s.ffill().bfill())
    # 2) hour×dow 중앙값
    if 'hour' not in df or 'dow' not in df:
        raise ValueError("add_time_features를 먼저 호출해서 hour/dow를 만들어줘.")
    hour_dow_median = df.groupby(['hour','dow'])[y_col].transform('median')
    df[y_col] = df[y_col].fillna(hour_dow_median)
    # 3) 윈저라이즈
    def _winsor(s):
        lo, hi = s.quantile(0.01), s.quantile(0.99)
        return s.clip(lo, hi)
    df[y_col] = df.groupby(group_col)[y_col].transform(_winsor)
    return df


In [80]:
def make_rolling_folds(df, cfg=CONFIG):
    ts = cfg["ts_col"]
    n_folds = cfg["n_folds"]
    train_days = cfg["train_days"]
    val_days = cfg["val_days"]

    if df is None:
        raise ValueError("make_rolling_folds: df가 None이야. build_features가 반환했는지 확인해줘.")

    d = df.copy()
    d[ts] = pd.to_datetime(d[ts])
    uniq = d[ts].sort_values().drop_duplicates().tolist()
    need = (train_days + val_days) * 24
    if len(uniq) < need:
        raise ValueError(f"시계열 고유 타임스텝이 부족해요. 필요={need}, 현재={len(uniq)}. "
                         "train_days/val_days를 줄여보거나 데이터 기간을 늘려줘.")

    folds = []
    for k in range(n_folds):
        val_end = len(uniq) - (k * val_days * 24)
        val_start = val_end - val_days * 24
        train_end = val_start
        train_start = max(0, train_end - train_days * 24)
        tr = (uniq[train_start], uniq[train_end-1])
        va = (uniq[val_start],   uniq[val_end-1])
        folds.append((tr, va))
    return folds[::-1]

# ===== 패치: gamma 손실을 위한 안전한 시프트/복원 로직 =====
def run_cv_training(df_feat, cfg=CONFIG, model_params=None, verbose=True):
    if df_feat is None:
        raise ValueError("run_cv_training: df_feat가 None이야. build_features() 마지막에 return이 있는지 확인해줘.")

    ts, y = cfg["ts_col"], cfg["y_col"]
    folds = make_rolling_folds(df_feat, cfg)
    feat_cols = _select_feature_columns(df_feat, cfg)

    if model_params is None:
        model_params = dict(
            max_depth=None, learning_rate=0.06, max_bins=255,
            min_samples_leaf=20, l2_regularization=0.0, random_state=42
        )

    loss_name = str(model_params.get("loss", "squared_error")).lower()
    use_gamma = (loss_name == "gamma")
    use_poisson = (loss_name == "poisson")

    # gamma는 y>0 필수. 데이터에 0이 있으면 작은 eps로 올려 학습 후 복원
    # eps는 데이터 스케일에 비례하게 아주 작게 설정
    global_mean = float(np.nanmean(df_feat[y].values))
    gamma_eps = max(1e-6, 1e-3 * global_mean) if use_gamma else 0.0

    # log1p_target은 gamma/poisson과 같이 쓰지 않도록 강제 비활성화
    use_log = bool(cfg.get("log1p_target", False))
    if use_gamma or use_poisson:
        if use_log and verbose:
            print("[주의] gamma/poisson 손실에서는 log1p_target을 무시할게.")
        use_log = False

    oof_pred = pd.Series(index=df_feat.index, dtype=float)
    reports, models = [], []

    for i, ((tr_start, tr_end), (va_start, va_end)) in enumerate(folds, 1):
        tr_mask = (df_feat[ts] >= tr_start) & (df_feat[ts] <= tr_end)
        va_mask = (df_feat[ts] >= va_start) & (df_feat[ts] <= va_end)

        X_tr = df_feat.loc[tr_mask, feat_cols]
        y_tr = df_feat.loc[tr_mask, y].values
        X_va = df_feat.loc[va_mask, feat_cols]
        y_va = df_feat.loc[va_mask, y].values

        if len(X_va) == 0 or len(X_tr) == 0:
            if verbose:
                print(f"[Fold {i}] 건너뜀 (train {len(X_tr)}, val {len(X_va)})")
            continue

        # 타깃 변환
        y_fit = np.log1p(y_tr) if use_log else y_tr
        if use_gamma:
            y_fit = y_fit + gamma_eps  # strictly positive

        model = HistGradientBoostingRegressor(**model_params).fit(X_tr, y_fit)
        models.append(model)

        # 예측 & 역변환
        yhat_va = model.predict(X_va)
        if use_gamma:
            yhat_va = yhat_va - gamma_eps
            yhat_va = np.maximum(0.0, yhat_va)  # 음수 방지
        if use_log:
            yhat_va = np.expm1(yhat_va)

        oof_pred.loc[X_va.index] = yhat_va
        rep = _metrics(y_va, yhat_va)
        rep["fold"] = i
        rep["train_range"] = (tr_start, tr_end)
        rep["val_range"] = (va_start, va_end)
        reports.append(rep)
        if verbose:
            print(f"[Fold {i}] {rep}")

    # OOF 점수는 예측이 채워진 인덱스만
    valid = oof_pred.notna()
    if valid.sum() == 0:
        raise ValueError("OOF 예측이 비어 있어. folds 범위나 train_days/val_days를 확인해줘.")
    whole = _metrics(df_feat.loc[valid, y].values, oof_pred.loc[valid].values)

    if verbose:
        cov = 100 * valid.mean()
        print(f"\n[OOF] {whole}  | coverage={cov:.1f}%")

    return {
        "feature_cols": feat_cols,
        "fold_reports": pd.DataFrame(reports),
        "oof_metrics": whole,
        "oof_pred": oof_pred,
        "oof_mask": valid,
        "models": models,
    }


In [47]:
def run_cv_training(df_feat, cfg=CONFIG, model_params=None, verbose=True):
    if df_feat is None:
        raise ValueError("run_cv_training: df_feat가 None이야. build_features() 마지막에 return이 있는지 확인해줘.")

    ts, y = cfg["ts_col"], cfg["y_col"]
    folds = make_rolling_folds(df_feat, cfg)
    feat_cols = _select_feature_columns(df_feat, cfg)

    if model_params is None:
        model_params = dict(
            max_depth=None, learning_rate=0.06, max_bins=255,
            min_samples_leaf=20, l2_regularization=0.0, random_state=42
        )

    oof_pred = pd.Series(index=df_feat.index, dtype=float)
    reports, models = [], []
    use_log = cfg.get("log1p_target", False)

    for i, ((tr_start, tr_end), (va_start, va_end)) in enumerate(folds, 1):
        tr_mask = (df_feat[ts] >= tr_start) & (df_feat[ts] <= tr_end)
        va_mask = (df_feat[ts] >= va_start) & (df_feat[ts] <= va_end)

        X_tr = df_feat.loc[tr_mask, feat_cols]
        y_tr = df_feat.loc[tr_mask, y].values
        X_va = df_feat.loc[va_mask, feat_cols]
        y_va = df_feat.loc[va_mask, y].values

        if len(X_va) == 0 or len(X_tr) == 0:
            if verbose:
                print(f"[Fold {i}] 건너뜀 (train {len(X_tr)}, val {len(X_va)})")
            continue

        y_fit = np.log1p(y_tr) if use_log else y_tr
        model = HistGradientBoostingRegressor(**model_params).fit(X_tr, y_fit)
        models.append(model)

        yhat_va = model.predict(X_va)
        if use_log:
            yhat_va = np.expm1(yhat_va)

        oof_pred.loc[X_va.index] = yhat_va
        rep = _metrics(y_va, yhat_va)
        rep["fold"] = i
        rep["train_range"] = (tr_start, tr_end)
        rep["val_range"] = (va_start, va_end)
        reports.append(rep)
        if verbose:
            print(f"[Fold {i}] {rep}")

    # ✅ 여기부터 핵심 패치
    valid = oof_pred.notna()
    if valid.sum() == 0:
        raise ValueError("OOF 예측이 비어 있어. folds 범위나 데이터 기간(train_days/val_days)을 확인해줘.")
    whole = _metrics(df_feat.loc[valid, y].values, oof_pred.loc[valid].values)

    if verbose:
        cov = 100 * valid.mean()
        print(f"\n[OOF] {whole}  | coverage={cov:.1f}% (검증으로 예측 채워진 비율)")

    return {
        "feature_cols": feat_cols,
        "fold_reports": pd.DataFrame(reports),
        "oof_metrics": whole,
        "oof_pred": oof_pred,
        "oof_mask": valid,      # <- 덤으로 반환
        "models": models,
    }


In [49]:
# ===== 재실행 원스톱 셀 =====
import numpy as np, pandas as pd

# 0) CONFIG 보정(없으면 생성)
if 'CONFIG' not in globals():
    CONFIG = {
        "ts_col": "timestamp",
        "group_col": "건물번호",
        "y_col": "전력소비량(kWh)",
        "weather": {
            "temp": "기온(°C)",
            "rh":   "습도(%)",
            "ws":   "풍속(m/s)",
            "radiation": "일사(MJ/m2)",
            "sunshine":  "일조(hr)",
            "precip":    "강수량(mm)",
        },
        "log1p_target": False,
        "train_days": 60,
        "val_days": 14,
        "n_folds": 4,
    }

# 1) 필수 객체/함수 체크
required_funcs = ["make_timestamp", "build_features", "run_cv_training"]
missing = [f for f in required_funcs if f not in globals()]
if missing:
    raise RuntimeError(f"다음 함수가 아직 정의 안됨: {missing}\n위에서 해당 셀을 먼저 실행해줘.")

# 2) 필수 컬럼 존재 확인
needed_cols = ["날짜","시간","건물번호","전력소비량(kWh)"]
missing_cols = [c for c in needed_cols if c not in df.columns]
if missing_cols:
    raise RuntimeError(f"df에 필요한 컬럼이 없음: {missing_cols}")

# 3) timestamp 만들기 → 피처 생성 → 학습/검증
df = make_timestamp(df, "날짜", "시간", out_col=CONFIG["ts_col"])
df_feat = build_features(df, CONFIG)
result = run_cv_training(df_feat, CONFIG)

# 4) 커버리지/점수 출력 (oof_mask가 없으면 oof_pred.notna()로 대체)
valid_mask = result.get("oof_mask", result["oof_pred"].notna())
cov = 100 * valid_mask.mean()
print(f"OOF coverage = {cov:.1f}%")
print("OOF metrics  =", result["oof_metrics"])

# 5) 폴드별 결과 요약 보기
cols = ["fold","MAE","RMSE","MAPE(%)","R2","train_range","val_range"]
display(result["fold_reports"][cols])


[Fold 1] {'MAE': 119.71523123658346, 'RMSE': np.float64(230.0002537910461), 'MAPE(%)': np.float64(7.7694092790840426), 'R2': 0.9960026421026723, 'fold': 1, 'train_range': (Timestamp('2024-06-01 00:00:00'), Timestamp('2024-06-29 23:00:00')), 'val_range': (Timestamp('2024-06-30 00:00:00'), Timestamp('2024-07-13 23:00:00'))}
[Fold 2] {'MAE': 126.68804903025146, 'RMSE': np.float64(255.60232602219125), 'MAPE(%)': np.float64(7.833308160946406), 'R2': 0.9953670044829893, 'fold': 2, 'train_range': (Timestamp('2024-06-01 00:00:00'), Timestamp('2024-07-13 23:00:00')), 'val_range': (Timestamp('2024-07-14 00:00:00'), Timestamp('2024-07-27 23:00:00'))}
[Fold 3] {'MAE': 128.0302325658229, 'RMSE': np.float64(244.48733091327867), 'MAPE(%)': np.float64(7.650047585087803), 'R2': 0.9960632453782693, 'fold': 3, 'train_range': (Timestamp('2024-06-01 00:00:00'), Timestamp('2024-07-27 23:00:00')), 'val_range': (Timestamp('2024-07-28 00:00:00'), Timestamp('2024-08-10 23:00:00'))}
[Fold 4] {'MAE': 133.08162720

Unnamed: 0,fold,MAE,RMSE,MAPE(%),R2,train_range,val_range
0,1,119.715231,230.000254,7.769409,0.996003,"(2024-06-01 00:00:00, 2024-06-29 23:00:00)","(2024-06-30 00:00:00, 2024-07-13 23:00:00)"
1,2,126.688049,255.602326,7.833308,0.995367,"(2024-06-01 00:00:00, 2024-07-13 23:00:00)","(2024-07-14 00:00:00, 2024-07-27 23:00:00)"
2,3,128.030233,244.487331,7.650048,0.996063,"(2024-06-01 00:00:00, 2024-07-27 23:00:00)","(2024-07-28 00:00:00, 2024-08-10 23:00:00)"
3,4,133.081627,259.439654,8.472527,0.995527,"(2024-06-12 00:00:00, 2024-08-10 23:00:00)","(2024-08-11 00:00:00, 2024-08-24 23:00:00)"


In [51]:
# ===== A) 추가 파생 피처: hour_of_week 원-핫, 이슬점, 비선형 교호 =====
def add_extra_features_v2(df, cfg=CONFIG):
    df = df.copy()
    ts = cfg["ts_col"]
    temp = cfg["weather"].get("temp")
    rh   = cfg["weather"].get("rh")
    rad  = cfg["weather"].get("radiation")
    sun  = cfg["weather"].get("sunshine")
    prcp = cfg["weather"].get("precip")

    # 1) hour_of_week (0~167) + 원-핫
    if {"hour","dow"}.issubset(df.columns):
        df["hour_of_week"] = df["dow"]*24 + df["hour"]
        how_dum = pd.get_dummies(df["hour_of_week"], prefix="how", dummy_na=False)
        df = pd.concat([df, how_dum], axis=1)

        # 근무/야간/피크 히ュー리스틱
        df["is_workhour"] = (((df["dow"] < 5) & (df["hour"].between(9, 18))).astype(int))
        df["is_night"]    = (((df["hour"] <= 6) | (df["hour"] >= 22)).astype(int))

    # 2) 이슬점(°C) ~ Magnus 공식 근사 (냉방/제습 부하 관련성 ↑)
    # Td ≈ (b * γ(T,RH)) / (a - γ(T,RH)), γ = aT/(b+T)+ln(RH/100), a=17.27, b=237.7
    if temp in df and rh in df:
        a, b = 17.27, 237.7
        T = pd.to_numeric(df[temp], errors="coerce")
        RH = pd.to_numeric(df[rh], errors="coerce").clip(1, 100)  # 0 회피
        gamma = (a*T)/(b+T) + np.log(RH/100.0)
        df["dewpoint"] = (b*gamma) / (a - gamma)

        # 3) 비선형/교호
        df["temp2"]   = T**2
        df["temp_rh"] = T * RH
        if "is_workhour" in df:
            df["T_x_work"] = T * df["is_workhour"]

    # 4) 태양 관련: 일사/일조가 있으면 완만한 창(24h) 추가(이미 없으면 skip)
    if rad and rad in df and f"{rad}_roll24" not in df:
        df[f"{rad}_roll24"] = df[rad].rolling(24, min_periods=2).mean()
    if sun and sun in df and f"{sun}_roll24" not in df:
        df[f"{sun}_roll24"] = df[sun].rolling(24, min_periods=2).mean()
    if prcp and prcp in df and f"{prcp}_roll24" not in df:
        df[f"{prcp}_roll24"] = df[prcp].rolling(24, min_periods=2).sum()
        df["is_rain"] = (df[prcp] > 0).astype(int)

    return df


In [54]:
# ===== C) 실험 유틸: 다양한 모델/옵션을 자동 비교 =====
def try_configs(df_feat, base_cfg=CONFIG, exp_list=None, verbose=True):
    if exp_list is None:
        exp_list = [
            ("baseline",               {"loss": "squared_error"}),
            ("mae_loss",               {"loss": "absolute_error"}),
            ("poisson",                {"loss": "poisson"}),  # 양수 타깃일 때 종종 유리
            ("mae_regul",              {"loss": "absolute_error", "min_samples_leaf": 50, "l2_regularization": 1.0}),
            ("mae_slow_lr",            {"loss": "absolute_error", "learning_rate": 0.04, "min_samples_leaf": 30}),
        ]
    results = []
    for name, mp in exp_list:
        params = dict(
            max_depth=None, learning_rate=0.06, max_bins=255,
            min_samples_leaf=20, l2_regularization=0.0, random_state=42,
        )
        params.update(mp)
        if verbose:
            print(f"\n=== [{name}] params: {params}")
        out = run_cv_training(df_feat, base_cfg, model_params=params, verbose=False)
        metrics = out["oof_metrics"].copy()
        metrics["name"] = name
        results.append((metrics, out))
        print(f"OOF -> MAE {metrics['MAE']:.3f} | RMSE {float(metrics['RMSE']):.3f} | "
              f"MAPE {float(metrics['MAPE(%)']):.3f}% | R2 {metrics['R2']:.6f}")
    # 정렬 출력
    ranked = sorted(results, key=lambda x: (x[0]["MAE"], x[0]["MAPE(%)"]))
    print("\n=== Top by MAE/MAPE ===")
    for m,_ in ranked[:3]:
        print(f"{m['name']}: MAE {m['MAE']:.3f}, MAPE {float(m['MAPE(%)']):.3f}%, R2 {m['R2']:.6f}")
    return ranked


In [55]:
# ===== D) 원클릭 실행 =====
# (1) timestamp
df = make_timestamp(df, "날짜", "시간", out_col=CONFIG["ts_col"])

# (2) 피처 생성 (새 파생 포함)
df_feat = build_features(df, CONFIG)

# (3) 실험 실행
ranked = try_configs(df_feat, CONFIG)

# (4) 최상 모델/파라미터로 다시 돌리고 폴드 점수/coverage 보고 싶으면:
best_metrics, best_out = ranked[0]
print("\nBest config:", best_metrics["name"])
print("Best OOF:", {k: (float(v) if not isinstance(v, (float,int)) else v)
                   for k,v in best_out["oof_metrics"].items()})
cov = 100 * best_out.get("oof_mask", best_out["oof_pred"].notna()).mean()
print(f"Coverage: {cov:.1f}%")
display(best_out["fold_reports"][["fold","MAE","RMSE","MAPE(%)","R2","train_range","val_range"]])



=== [baseline] params: {'max_depth': None, 'learning_rate': 0.06, 'max_bins': 255, 'min_samples_leaf': 20, 'l2_regularization': 0.0, 'random_state': 42, 'loss': 'squared_error'}
OOF -> MAE 126.023 | RMSE 242.045 | MAPE 7.967% | R2 0.995935

=== [mae_loss] params: {'max_depth': None, 'learning_rate': 0.06, 'max_bins': 255, 'min_samples_leaf': 20, 'l2_regularization': 0.0, 'random_state': 42, 'loss': 'absolute_error'}
OOF -> MAE 164.708 | RMSE 476.692 | MAPE 8.562% | R2 0.984234

=== [poisson] params: {'max_depth': None, 'learning_rate': 0.06, 'max_bins': 255, 'min_samples_leaf': 20, 'l2_regularization': 0.0, 'random_state': 42, 'loss': 'poisson'}
OOF -> MAE 124.333 | RMSE 252.706 | MAPE 5.546% | R2 0.995569

=== [mae_regul] params: {'max_depth': None, 'learning_rate': 0.06, 'max_bins': 255, 'min_samples_leaf': 50, 'l2_regularization': 1.0, 'random_state': 42, 'loss': 'absolute_error'}
OOF -> MAE 164.993 | RMSE 479.680 | MAPE 8.508% | R2 0.984036

=== [mae_slow_lr] params: {'max_depth':

Unnamed: 0,fold,MAE,RMSE,MAPE(%),R2,train_range,val_range
0,1,116.2697,229.347117,5.690913,0.996025,"(2024-06-01 00:00:00, 2024-06-29 23:00:00)","(2024-06-30 00:00:00, 2024-07-13 23:00:00)"
1,2,121.234556,249.746441,5.466722,0.995577,"(2024-06-01 00:00:00, 2024-07-13 23:00:00)","(2024-07-14 00:00:00, 2024-07-27 23:00:00)"
2,3,127.515273,258.055846,5.302344,0.995614,"(2024-06-01 00:00:00, 2024-07-27 23:00:00)","(2024-07-28 00:00:00, 2024-08-10 23:00:00)"
3,4,132.31249,271.798404,5.725435,0.995091,"(2024-06-12 00:00:00, 2024-08-10 23:00:00)","(2024-08-11 00:00:00, 2024-08-24 23:00:00)"


In [56]:
BEST_PARAMS = dict(
    loss="poisson",        # 양수 타깃 + 이분산 대응
    learning_rate=0.06,
    max_bins=255,
    min_samples_leaf=30,   # 살짝 키워서 과적합 완화
    l2_regularization=0.0,
    max_depth=None,
    random_state=42,
)

df_feat = build_features(df, CONFIG)
best_out = run_cv_training(df_feat, CONFIG, model_params=BEST_PARAMS, verbose=True)

print("OOF(best):", best_out["oof_metrics"])
display(best_out["fold_reports"][["fold","MAE","RMSE","MAPE(%)","R2","train_range","val_range"]])


[Fold 1] {'MAE': 116.12410215013544, 'RMSE': np.float64(226.39794810555256), 'MAPE(%)': np.float64(5.688782952579652), 'R2': 0.996126876223712, 'fold': 1, 'train_range': (Timestamp('2024-06-01 00:00:00'), Timestamp('2024-06-29 23:00:00')), 'val_range': (Timestamp('2024-06-30 00:00:00'), Timestamp('2024-07-13 23:00:00'))}
[Fold 2] {'MAE': 122.35154514461317, 'RMSE': np.float64(254.7588745458333), 'MAPE(%)': np.float64(5.469144501862485), 'R2': 0.9953975304921209, 'fold': 2, 'train_range': (Timestamp('2024-06-01 00:00:00'), Timestamp('2024-07-13 23:00:00')), 'val_range': (Timestamp('2024-07-14 00:00:00'), Timestamp('2024-07-27 23:00:00'))}
[Fold 3] {'MAE': 126.31387889129608, 'RMSE': np.float64(253.29933100509527), 'MAPE(%)': np.float64(5.310348139849337), 'R2': 0.9957743481494808, 'fold': 3, 'train_range': (Timestamp('2024-06-01 00:00:00'), Timestamp('2024-07-27 23:00:00')), 'val_range': (Timestamp('2024-07-28 00:00:00'), Timestamp('2024-08-10 23:00:00'))}
[Fold 4] {'MAE': 132.696675729

Unnamed: 0,fold,MAE,RMSE,MAPE(%),R2,train_range,val_range
0,1,116.124102,226.397948,5.688783,0.996127,"(2024-06-01 00:00:00, 2024-06-29 23:00:00)","(2024-06-30 00:00:00, 2024-07-13 23:00:00)"
1,2,122.351545,254.758875,5.469145,0.995398,"(2024-06-01 00:00:00, 2024-07-13 23:00:00)","(2024-07-14 00:00:00, 2024-07-27 23:00:00)"
2,3,126.313879,253.299331,5.310348,0.995774,"(2024-06-01 00:00:00, 2024-07-27 23:00:00)","(2024-07-28 00:00:00, 2024-08-10 23:00:00)"
3,4,132.696676,273.421022,5.715559,0.995032,"(2024-06-12 00:00:00, 2024-08-10 23:00:00)","(2024-08-11 00:00:00, 2024-08-24 23:00:00)"


In [57]:
# 기존 함수를 덮어써서 CONFIG의 cdd_base/hdd_base를 읽게 함
def add_weather_features(df, weather, ts_col, cdd_base=None, hdd_base=None):
    df = df.copy()
    temp = weather.get("temp")
    rh   = weather.get("rh")
    ws   = weather.get("ws")
    rad  = weather.get("radiation")
    sun  = weather.get("sunshine")
    prcp = weather.get("precip")

    # 기본값
    if cdd_base is None: cdd_base = weather.get("cdd_base", 23)
    if hdd_base is None: hdd_base = weather.get("hdd_base", 18)

    if temp and temp in df:
        T = pd.to_numeric(df[temp], errors="coerce")
        df['CDD'] = (T - cdd_base).clip(lower=0)
        df['HDD'] = (hdd_base - T).clip(lower=0)
        for lag in [1, 24]:
            df[f'{temp}_lag{lag}'] = T.shift(lag)
        for w in [3, 24]:
            df[f'{temp}_roll{w}'] = T.rolling(w, min_periods=2).mean()
    if rh and rh in df:
        df[f'{rh}_roll24'] = pd.to_numeric(df[rh], errors="coerce").rolling(24, min_periods=2).mean()
    if ws and ws in df:
        df[f'{ws}_roll24'] = pd.to_numeric(df[ws], errors="coerce").rolling(24, min_periods=2).mean()
    if rad and rad in df and f"{rad}_roll24" not in df:
        df[f"{rad}_roll24"] = pd.to_numeric(df[rad], errors="coerce").rolling(24, min_periods=2).mean()
    if sun and sun in df and f"{sun}_roll24" not in df:
        df[f"{sun}_roll24"] = pd.to_numeric(df[sun], errors="coerce").rolling(24, min_periods=2).mean()
    if prcp and prcp in df and f"{prcp}_roll24" not in df:
        p = pd.to_numeric(df[prcp], errors="coerce")
        df[f"{prcp}_roll24"] = p.rolling(24, min_periods=2).sum()
        df["is_rain"] = (p > 0).astype(int)

    # 시간×온도 상호작용
    if temp and ('hour_sin' in df):
        df['T_x_hour'] = pd.to_numeric(df[temp], errors="coerce") * df['hour_sin']
    return df

# build_features에서 호출 부분 한 줄만 이렇게 바꿔도 됨:
# df_ = add_weather_features(df_, cfg["weather"], ts,
#                            cdd_base=cfg["weather"].get("cdd_base", 23),
#                            hdd_base=cfg["weather"].get("hdd_base", 18))


In [58]:
def build_features_with_bases(df, cfg, cdd_base, hdd_base):
    cfg2 = {**cfg, "weather": {**cfg["weather"], "cdd_base": cdd_base, "hdd_base": hdd_base}}
    df_ = df.copy()
    if cfg2["ts_col"] not in df_:
        df_ = make_timestamp(df_, "날짜", "시간", out_col=cfg2["ts_col"])
    df_ = add_time_features(df_, cfg2["ts_col"])
    df_ = impute_and_cap_target(df_, cfg2["y_col"], cfg2["group_col"], cfg2["ts_col"])
    df_ = add_target_lags_rolls(df_, cfg2["y_col"], cfg2["group_col"], cfg2["ts_col"])
    df_ = add_weather_features(df_, cfg2["weather"], cfg2["ts_col"],
                               cdd_base=cdd_base, hdd_base=hdd_base)
    df_ = add_extra_features_v2(df_, cfg2)
    if "건물유형" in df_:
        df_ = pd.concat([df_, pd.get_dummies(df_["건물유형"], prefix="건물유형", dummy_na=True)], axis=1)
    df_ = df_.dropna(subset=[cfg2["y_col"]]).reset_index(drop=True)
    return df_, cfg2

grid = [(22,18),(23,18),(24,18),(25,18),(23,17),(23,19)]
records = []
for cb, hb in grid:
    df_g, cfg_g = build_features_with_bases(df, CONFIG, cb, hb)
    out = run_cv_training(df_g, cfg_g, model_params=BEST_PARAMS, verbose=False)
    m = out["oof_metrics"]
    records.append({"CDD": cb, "HDD": hb, "MAE": m["MAE"], "MAPE": float(m["MAPE(%)"]), "R2": m["R2"]})
    print(f"CDD/HDD=({cb},{hb}) -> MAE {m['MAE']:.3f}, MAPE {float(m['MAPE(%)']):.3f}%, R2 {m['R2']:.6f}")

import pandas as pd
tune_df = pd.DataFrame(records).sort_values(["MAE","MAPE"])
display(tune_df.head(10))


CDD/HDD=(22,18) -> MAE 124.372, MAPE 5.546%, R2 0.995576
CDD/HDD=(23,18) -> MAE 124.372, MAPE 5.546%, R2 0.995576
CDD/HDD=(24,18) -> MAE 124.372, MAPE 5.546%, R2 0.995576
CDD/HDD=(25,18) -> MAE 124.372, MAPE 5.546%, R2 0.995576
CDD/HDD=(23,17) -> MAE 124.372, MAPE 5.546%, R2 0.995576
CDD/HDD=(23,19) -> MAE 124.372, MAPE 5.546%, R2 0.995576


Unnamed: 0,CDD,HDD,MAE,MAPE,R2
0,22,18,124.37155,5.545959,0.995576
1,23,18,124.37155,5.545959,0.995576
2,24,18,124.37155,5.545959,0.995576
3,25,18,124.37155,5.545959,0.995576
4,23,17,124.37155,5.545959,0.995576
5,23,19,124.37155,5.545959,0.995576


In [59]:
def make_rolling_folds(df, cfg=CONFIG, expanding=True):
    ts = cfg["ts_col"]
    n_folds = cfg["n_folds"]
    train_days = cfg["train_days"]
    val_days = cfg["val_days"]

    d = df.copy()
    d[ts] = pd.to_datetime(d[ts])
    uniq = d[ts].sort_values().drop_duplicates().tolist()
    need = (train_days + val_days) * 24
    if len(uniq) < need:
        raise ValueError("타임스텝 부족: train_days/val_days 조정 필요")

    folds = []
    for k in range(n_folds):
        val_end = len(uniq) - (k * val_days * 24)
        val_start = val_end - val_days * 24
        if expanding:
            train_start = 0
        else:
            train_start = max(0, val_start - train_days * 24)
        train_end = val_start
        tr = (uniq[train_start], uniq[train_end-1])
        va = (uniq[val_start],   uniq[val_end-1])
        folds.append((tr, va))
    return folds[::-1]


In [60]:
def group_report(df_feat, result, cfg=CONFIG, by="건물유형"):
    ts, y = cfg["ts_col"], cfg["y_col"]
    oof = result["oof_pred"]
    valid = result.get("oof_mask", oof.notna())

    tmp = df_feat.loc[valid, [by, y]].copy()
    tmp["pred"] = oof.loc[valid].values
    agg = tmp.groupby(by).apply(
        lambda g: pd.Series({
            "MAE": np.mean(np.abs(g[y]-g["pred"])),
            "MAPE(%)": np.mean(np.abs((g[y]-g["pred"]) / np.maximum(1e-6, np.abs(g[y]))))*100,
            "count": len(g)
        })
    ).sort_values("MAE")
    return agg

seg = group_report(df_feat, best_out, CONFIG, by="건물유형")
display(seg.head(10))


  agg = tmp.groupby(by).apply(


Unnamed: 0_level_0,MAE,MAPE(%),count
건물유형,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6,61.064179,12.819854,12096.0
5,72.624048,3.484319,13440.0
2,97.23451,6.689329,10752.0
8,112.386521,3.950431,13440.0
7,113.872221,5.255698,12096.0
1,114.351916,6.047608,13440.0
3,139.274116,6.277652,21504.0
0,152.729271,1.615193,12096.0
9,181.30698,5.732977,13440.0
4,186.134158,3.474306,12096.0


In [64]:
# === 유형별 CDD/HDD 생성 (세그먼트별 base_map 사용) ===
def add_weather_features_segmental(df, cfg, type_col="건물유형",
                                   cdd_base_map=None, hdd_base=18):
    df = df.copy()
    ts_col = cfg["ts_col"]; W = cfg["weather"]
    temp = W.get("temp"); rh = W.get("rh"); ws = W.get("ws")
    rad  = W.get("radiation"); sun = W.get("sunshine"); prcp = W.get("precip")

    if temp and temp in df:
        T = pd.to_numeric(df[temp], errors="coerce")
        if cdd_base_map is None:
            cdd_base = W.get("cdd_base", 23)
            base = np.full(len(df), cdd_base, dtype=float)
        else:
            # 행마다 건물유형에 맞는 base 지정
            base = df[type_col].map(cdd_base_map).fillna(W.get("cdd_base", 23)).astype(float).values

        df["CDD"] = (T - base).clip(lower=0)
        df["HDD"] = (hdd_base - T).clip(lower=0)

        for lag in [1, 24]:
            df[f'{temp}_lag{lag}'] = T.shift(lag)
        for w in [3, 24]:
            df[f'{temp}_roll{w}'] = T.rolling(w, min_periods=2).mean()

    if rh and rh in df:
        RH = pd.to_numeric(df[rh], errors="coerce")
        df[f'{rh}_roll24'] = RH.rolling(24, min_periods=2).mean()

    if ws and ws in df:
        WS = pd.to_numeric(df[ws], errors="coerce")
        df[f'{ws}_roll24'] = WS.rolling(24, min_periods=2).mean()

    if rad and rad in df and f"{rad}_roll24" not in df:
        R = pd.to_numeric(df[rad], errors="coerce")
        df[f"{rad}_roll24"] = R.rolling(24, min_periods=2).mean()
    if sun and sun in df and f"{sun}_roll24" not in df:
        S = pd.to_numeric(df[sun], errors="coerce")
        df[f"{sun}_roll24"] = S.rolling(24, min_periods=2).mean()
    if prcp and prcp in df and f"{prcp}_roll24" not in df:
        P = pd.to_numeric(df[prcp], errors="coerce")
        df[f"{prcp}_roll24"] = P.rolling(24, min_periods=2).sum()
        df["is_rain"] = (P > 0).astype(int)

    # 시간×온도
    if temp and ('hour_sin' in df):
        df['T_x_hour'] = pd.to_numeric(df[temp], errors="coerce") * df['hour_sin']
    return df


In [65]:
def build_features_with_segment_bases(df, cfg, base_map, type_col="건물유형"):
    ts, grp, y = cfg["ts_col"], cfg["group_col"], cfg["y_col"]
    d = df.copy()
    if ts not in d:
        d = make_timestamp(d, "날짜", "시간", out_col=ts)

    # 숫자형 캐스팅
    for c in [y, cfg["weather"].get("temp"), cfg["weather"].get("rh"),
              cfg["weather"].get("ws"), cfg["weather"].get("radiation"),
              cfg["weather"].get("sunshine"), cfg["weather"].get("precip"),
              "연면적(m2)", "냉방면적(m2)", "태양광용량(kW)",
              "ESS저장용량(kWh)", "PCS용량(kW)"]:
        if c and c in d.columns:
            d[c] = pd.to_numeric(d[c], errors="coerce")

    # 시간/휴일
    d = add_time_features(d, ts)
    d = add_korean_holidays_2024(d, ts)

    # 타깃 보간/이상치
    d = impute_and_cap_target(d, y, grp, ts)

    # 타깃 라그/롤링
    d = add_target_lags_rolls(d, y, grp, ts)

    # 유형별 CDD/HDD
    d = add_weather_features_segmental(d, cfg, type_col=type_col, cdd_base_map=base_map, hdd_base=18)

    # 추가 고급 파생(이슬점, hour_of_week, 비선형 등)
    d = add_extra_features_v2(d, cfg)

    # 범주 원-핫
    if "건물유형" in d:
        d = pd.concat([d, pd.get_dummies(d["건물유형"], prefix="건물유형", dummy_na=True)], axis=1)

    d = d.dropna(subset=[y]).reset_index(drop=True)
    return d

def make_rolling_folds(df, cfg=CONFIG, expanding=True):
    ts = cfg["ts_col"]
    n_folds = cfg["n_folds"]
    train_days = cfg["train_days"]
    val_days = cfg["val_days"]

    d = df.copy()
    d[ts] = pd.to_datetime(d[ts])
    uniq = d[ts].sort_values().drop_duplicates().tolist()
    need = (train_days + val_days) * 24
    if len(uniq) < need:
        raise ValueError("타임스텝 부족: train_days/val_days를 조정해줘.")

    folds = []
    for k in range(n_folds):
        val_end = len(uniq) - (k * val_days * 24)
        val_start = val_end - val_days * 24
        train_end = val_start
        train_start = 0 if expanding else max(0, train_end - train_days * 24)
        tr = (uniq[train_start], uniq[train_end-1])
        va = (uniq[val_start],   uniq[val_end-1])
        folds.append((tr, va))
    return folds[::-1]

def tune_cdd_base_per_type(df, cfg, type_col="건물유형",
                           candidates=(22,23,24,25),
                           fast_cfg_override=None,
                           model_params=None):
    """
    유형마다 CDD 기준온도를 candidates 중에서 고름.
    - fast_cfg_override: {'train_days':45,'val_days':7,'n_folds':4} 같이 가볍게
    """
    if model_params is None:
        model_params = dict(
            loss="poisson", learning_rate=0.06, max_bins=255,
            min_samples_leaf=30, l2_regularization=0.0, max_depth=None, random_state=42
        )
    if fast_cfg_override is None:
        fast_cfg_override = {"train_days":45, "val_days":7, "n_folds":4}

    base_map = {}  # 초기엔 비움 → 없는 유형은 23 사용
    types = sorted(df[type_col].dropna().unique().tolist())
    best_log = []

    # 빠른 튜닝 루프(유형별로 독립적으로 베이스 선택)
    for t in types:
        best_mae, best_cdd = np.inf, None
        for cdd in candidates:
            bm_try = base_map.copy()
            bm_try[t] = cdd
            # 빠른 설정으로 피처 만들기
            cfg_fast = {**cfg, **fast_cfg_override}
            df_feat_try = build_features_with_segment_bases(df, cfg_fast, bm_try, type_col)
            out = run_cv_training(df_feat_try, cfg_fast, model_params=model_params, verbose=False)
            mae = out["oof_metrics"]["MAE"]
            if mae < best_mae:
                best_mae, best_cdd = mae, cdd
        base_map[t] = best_cdd if best_cdd is not None else 23
        best_log.append((t, base_map[t], best_mae))
        print(f"[튜닝] 유형 {t}: best CDD base = {base_map[t]} (MAE {best_mae:.3f})")

    return base_map, pd.DataFrame(best_log, columns=["건물유형","best_cdd","MAE"])


In [66]:
# 1) 유형별 CDD 베이스 튜닝
base_map, tune_table = tune_cdd_base_per_type(df, CONFIG)
display(tune_table.sort_values("MAE"))

# 2) 베이스 매핑 적용해서 풀 설정(원래 CONFIG)으로 재학습
df_feat_seg = build_features_with_segment_bases(df, CONFIG, base_map)
best_out_seg = run_cv_training(df_feat_seg, CONFIG, model_params=BEST_PARAMS, verbose=True)
print("OOF(seg-base):", best_out_seg["oof_metrics"])

# 3) 유형별 에러 다시 보기
seg_after = group_report(df_feat_seg, best_out_seg, CONFIG, by="건물유형")
display(seg_after.join(seg, how="left", lsuffix="_after", rsuffix="_before").head(20))


[튜닝] 유형 0: best CDD base = 22 (MAE 129.490)
[튜닝] 유형 1: best CDD base = 25 (MAE 129.446)
[튜닝] 유형 2: best CDD base = 24 (MAE 129.424)
[튜닝] 유형 3: best CDD base = 23 (MAE 129.424)
[튜닝] 유형 4: best CDD base = 23 (MAE 129.424)
[튜닝] 유형 5: best CDD base = 23 (MAE 129.424)
[튜닝] 유형 6: best CDD base = 23 (MAE 129.424)
[튜닝] 유형 7: best CDD base = 23 (MAE 129.424)
[튜닝] 유형 8: best CDD base = 23 (MAE 129.424)
[튜닝] 유형 9: best CDD base = 23 (MAE 129.424)


Unnamed: 0,건물유형,best_cdd,MAE
3,3,23,129.423826
2,2,24,129.423826
5,5,23,129.423826
4,4,23,129.423826
6,6,23,129.423826
7,7,23,129.423826
9,9,23,129.423826
8,8,23,129.423826
1,1,25,129.445572
0,0,22,129.490109


[Fold 1] {'MAE': 115.80010842118517, 'RMSE': np.float64(225.64070320768377), 'MAPE(%)': np.float64(5.674398670837685), 'R2': 0.9961527421675207, 'fold': 1, 'train_range': (Timestamp('2024-06-01 00:00:00'), Timestamp('2024-06-29 23:00:00')), 'val_range': (Timestamp('2024-06-30 00:00:00'), Timestamp('2024-07-13 23:00:00'))}
[Fold 2] {'MAE': 122.51082823306544, 'RMSE': np.float64(255.26887020503332), 'MAPE(%)': np.float64(5.478045162026482), 'R2': 0.9953790849018277, 'fold': 2, 'train_range': (Timestamp('2024-06-01 00:00:00'), Timestamp('2024-07-13 23:00:00')), 'val_range': (Timestamp('2024-07-14 00:00:00'), Timestamp('2024-07-27 23:00:00'))}
[Fold 3] {'MAE': 125.81381930217327, 'RMSE': np.float64(251.56910381935353), 'MAPE(%)': np.float64(5.293809195731827), 'R2': 0.9958318798190685, 'fold': 3, 'train_range': (Timestamp('2024-06-01 00:00:00'), Timestamp('2024-07-27 23:00:00')), 'val_range': (Timestamp('2024-07-28 00:00:00'), Timestamp('2024-08-10 23:00:00'))}
[Fold 4] {'MAE': 135.0676905

  agg = tmp.groupby(by).apply(


Unnamed: 0_level_0,MAE_after,MAPE(%)_after,count_after,MAE_before,MAPE(%)_before,count_before
건물유형,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6,61.436665,12.884783,12096.0,61.064179,12.819854,12096.0
5,73.145892,3.501981,13440.0,72.624048,3.484319,13440.0
2,98.92727,6.795171,10752.0,97.23451,6.689329,10752.0
8,113.211163,3.978622,13440.0,112.386521,3.950431,13440.0
7,114.432742,5.301838,12096.0,113.872221,5.255698,12096.0
1,114.887949,6.055037,13440.0,114.351916,6.047608,13440.0
3,140.472239,6.296333,21504.0,139.274116,6.277652,21504.0
0,153.849749,1.594759,12096.0,152.729271,1.615193,12096.0
9,178.267414,5.684087,13440.0,181.30698,5.732977,13440.0
4,186.471177,3.482163,12096.0,186.134158,3.474306,12096.0


In [67]:
def add_type_interactions(df):
    df = df.copy()
    # 건물유형 원-핫이 이미 있을 때만
    type_dums = [c for c in df.columns if c.startswith("건물유형_")]
    for td in type_dums:
        for base_col in ["CDD", "HDD", "기온(°C)", "dewpoint"]:
            if base_col in df:
                df[f"{base_col}__x__{td}"] = df[base_col] * df[td]
    return df

# build_features_with_segment_bases(...) 마지막 부분에서 한 줄 더:
# d = add_type_interactions(d)

In [78]:
def add_same_how_history(df, y_col, group_col, ts_col):
    """
    hour_of_week = dow*24 + hour 기준의 '같은 건물 × 같은 시간대'에서
    과거 TARGET 평균(3/8) 피처를 만든다. (누수 방지: shift(1) 사용)
    - group_keys=False 로 멀티인덱스 방지 → 바로 대입 가능
    """
    df = df.copy()
    if not {"hour","dow"}.issubset(df.columns):
        raise ValueError("add_time_features를 먼저 호출해서 hour/dow를 만들어줘.")

    # 시간대 키
    df["hour_of_week"] = df["dow"]*24 + df["hour"]

    # 시간 순서 보장
    df = df.sort_values([group_col, "hour_of_week", ts_col])

    # 같은 (건물, 시간대) 그룹 내 과거 평균
    g = df.groupby([group_col, "hour_of_week"], group_keys=False)[y_col]
    df[f"{y_col}_how_mean3"] = g.apply(lambda s: s.shift(1).rolling(3, min_periods=1).mean())
    df[f"{y_col}_how_mean8"] = g.apply(lambda s: s.shift(1).rolling(8, min_periods=2).mean())

    # 잔차형(전부 과거값으로 구성)
    if f"{y_col}_lag1" in df.columns:
        df[f"{y_col}_lag1_minus_how3"]  = df[f"{y_col}_lag1"]  - df[f"{y_col}_how_mean3"]
    if f"{y_col}_lag24" in df.columns:
        df[f"{y_col}_lag24_minus_how8"] = df[f"{y_col}_lag24"] - df[f"{y_col}_how_mean8"]

    # 원래 인덱스 순서로 복원
    return df.sort_index()


In [69]:
# === B) 과거끼리 차분/비율 ===
def add_safe_deltas(df, y_col):
    df = df.copy()
    eps = 1e-6
    if f"{y_col}_lag1" in df and f"{y_col}_lag24" in df:
        df[f"{y_col}_chg1_from24"] = df[f"{y_col}_lag1"] - df[f"{y_col}_lag24"]
        df[f"{y_col}_ratio1_24"]   = df[f"{y_col}_lag1"] / (df[f"{y_col}_lag24"].abs() + eps)
    if f"{y_col}_lag24" in df and f"{y_col}_lag168" in df:
        df[f"{y_col}_chg24_from168"] = df[f"{y_col}_lag24"] - df[f"{y_col}_lag168"]
        df[f"{y_col}_ratio24_168"]   = df[f"{y_col}_lag24"] / (df[f"{y_col}_lag168"].abs() + eps)
    return df


In [70]:
# === C) hour_of_week 원-핫 제거 버전 ===
def add_extra_features_v2(df, cfg=CONFIG):
    df = df.copy()
    ts = cfg["ts_col"]
    temp = cfg["weather"].get("temp")
    rh   = cfg["weather"].get("rh")
    rad  = cfg["weather"].get("radiation")
    sun  = cfg["weather"].get("sunshine")
    prcp = cfg["weather"].get("precip")

    # hour_of_week 수치형만(원-핫 X)
    if {"hour","dow"}.issubset(df.columns):
        df["hour_of_week"] = df["dow"]*24 + df["hour"]
        df["is_workhour"] = (((df["dow"] < 5) & (df["hour"].between(9, 18))).astype(int))
        df["is_night"]    = (((df["hour"] <= 6) | (df["hour"] >= 22)).astype(int))

    # 이슬점 + 비선형/교호
    if temp in df and rh in df:
        a, b = 17.27, 237.7
        T = pd.to_numeric(df[temp], errors="coerce")
        RH = pd.to_numeric(df[rh], errors="coerce").clip(1, 100)
        gamma = (a*T)/(b+T) + np.log(RH/100.0)
        df["dewpoint"] = (b*gamma) / (a - gamma)
        df["temp2"]   = T**2
        df["temp_rh"] = T * RH
        if "is_workhour" in df:
            df["T_x_work"] = T * df["is_workhour"]

    # 태양/강수 24h 완만 창(있을 때만)
    if rad and rad in df and f"{rad}_roll24" not in df:
        df[f"{rad}_roll24"] = pd.to_numeric(df[rad], errors="coerce").rolling(24, min_periods=2).mean()
    if sun and sun in df and f"{sun}_roll24" not in df:
        df[f"{sun}_roll24"] = pd.to_numeric(df[sun], errors="coerce").rolling(24, min_periods=2).mean()
    if prcp and prcp in df and f"{prcp}_roll24" not in df:
        p = pd.to_numeric(df[prcp], errors="coerce")
        df[f"{prcp}_roll24"] = p.rolling(24, min_periods=2).sum()
        df["is_rain"] = (p > 0).astype(int)

    return df


In [73]:
def try_configs(df_feat, base_cfg=CONFIG, exp_list=None, verbose=True):
    if exp_list is None:
        exp_list = [
            ("poisson",  {"loss": "poisson", "min_samples_leaf": 30}),
            ("gamma",    {"loss": "gamma",   "min_samples_leaf": 30}),  # ← 추가
            ("poisson_slow", {"loss":"poisson","learning_rate":0.04,"min_samples_leaf":50}),
            ("baseline", {"loss": "squared_error"})
        ]
    results = []
    for name, mp in exp_list:
        params = dict(
            max_depth=None, learning_rate=0.06, max_bins=255,
            min_samples_leaf=20, l2_regularization=0.0, random_state=42,
        )
        params.update(mp)
        if verbose:
            print(f"\n=== [{name}] params: {params}")
        out = run_cv_training(df_feat, base_cfg, model_params=params, verbose=False)
        m = out["oof_metrics"]
        print(f"OOF -> MAE {m['MAE']:.3f} | RMSE {float(m['RMSE']):.3f} | "
              f"MAPE {float(m['MAPE(%)']):.3f}% | R2 {m['R2']:.6f}")
        results.append((m.copy() | {"name": name}, out))
    ranked = sorted(results, key=lambda x: (x[0]["MAE"], x[0]["MAPE(%)"]))
    print("\n=== Top by MAE/MAPE ===")
    for m,_ in ranked[:3]:
        print(f"{m['name']}: MAE {m['MAE']:.3f}, MAPE {float(m['MAPE(%)']):.3f}%, R2 {m['R2']:.6f}")
    return ranked


In [81]:
df_feat = build_features(df, CONFIG)
drop_how = [c for c in df_feat.columns if c.startswith("how_")]  # 남아있으면 삭제
df_feat = df_feat.drop(columns=drop_how, errors="ignore")

ranked = try_configs(df_feat, CONFIG)
best_m, best_out = ranked[0]
print("Best:", best_m)



=== [poisson] params: {'max_depth': None, 'learning_rate': 0.06, 'max_bins': 255, 'min_samples_leaf': 30, 'l2_regularization': 0.0, 'random_state': 42, 'loss': 'poisson'}
OOF -> MAE 121.168 | RMSE 247.402 | MAPE 5.459% | R2 0.995753

=== [gamma] params: {'max_depth': None, 'learning_rate': 0.06, 'max_bins': 255, 'min_samples_leaf': 30, 'l2_regularization': 0.0, 'random_state': 42, 'loss': 'gamma'}
OOF -> MAE 1444.981 | RMSE 2984.802 | MAPE 39.256% | R2 0.381886

=== [poisson_slow] params: {'max_depth': None, 'learning_rate': 0.04, 'max_bins': 255, 'min_samples_leaf': 50, 'l2_regularization': 0.0, 'random_state': 42, 'loss': 'poisson'}
OOF -> MAE 140.397 | RMSE 273.176 | MAPE 10.567% | R2 0.994822

=== [baseline] params: {'max_depth': None, 'learning_rate': 0.06, 'max_bins': 255, 'min_samples_leaf': 20, 'l2_regularization': 0.0, 'random_state': 42, 'loss': 'squared_error'}
OOF -> MAE 122.236 | RMSE 240.846 | MAPE 7.704% | R2 0.995975

=== Top by MAE/MAPE ===
poisson: MAE 121.168, MAPE 

In [82]:
# 1) 건물 장기 히스토리: 168h/720h 롤링, 장기 EWM (전부 shift(1)로 누수 방지)
def add_building_history_features(df, y_col, group_col, ts_col):
    df = df.sort_values([group_col, ts_col]).copy()
    g = df.groupby(group_col)[y_col]

    def _roll(series, w, minp):
        return (series.shift(1).rolling(w, min_periods=minp).mean()
                .reset_index(level=0, drop=True))

    df[f"{y_col}_roll168_grp_mean"] = _roll(g, 168, 12)   # 일주일
    df[f"{y_col}_roll720_grp_mean"] = _roll(g, 720, 24)   # 약 한 달
    # 장기 EWM(느리게 반응)
    df[f"{y_col}_ewm_long"] = g.transform(lambda s: s.shift(1).ewm(alpha=0.01, adjust=False).mean())
    return df

# 2) CDD/HDD 누적(24h/168h) — 이미 CDD/HDD가 있으면 누적만 추가
def add_cdd_hdd_aggregates(df):
    df = df.copy()
    if "CDD" in df:
        df["CDD_roll24_sum"]  = df["CDD"].rolling(24,  min_periods=2).sum()
        df["CDD_roll168_sum"] = df["CDD"].rolling(168, min_periods=12).sum()
    if "HDD" in df:
        df["HDD_roll24_sum"]  = df["HDD"].rolling(24,  min_periods=2).sum()
        df["HDD_roll168_sum"] = df["HDD"].rolling(168, min_periods=12).sum()
    return df


In [84]:
# CONFIG 추천값 (원래 값 덮어써도 됨)
CONFIG.update({
    "train_days": 60,   # 확장형이라 여기 숫자는 최소 확보 구간 느낌
    "val_days": 7,      # 7일 검증
    "n_folds": 10,      # 폴드 수 ↑ → coverage ↑
    "expanding": True,  # 확장형
})

# make_rolling_folds가 cfg의 expanding을 읽도록 살짝 패치
def make_rolling_folds(df, cfg=CONFIG):
    ts = cfg["ts_col"]
    n_folds = cfg["n_folds"]
    train_days = cfg["train_days"]
    val_days = cfg["val_days"]
    expanding = bool(cfg.get("expanding", True))

    d = df.copy()
    d[ts] = pd.to_datetime(d[ts])
    uniq = d[ts].sort_values().drop_duplicates().tolist()
    need = (train_days + val_days) * 24
    if len(uniq) < need:
        raise ValueError("타임스텝 부족: train_days/val_days를 조정해줘.")

    folds = []
    for k in range(n_folds):
        val_end = len(uniq) - (k * val_days * 24)
        val_start = val_end - val_days * 24
        train_end = val_start
        train_start = 0 if expanding else max(0, train_end - train_days * 24)
        tr = (uniq[train_start], uniq[train_end-1])
        va = (uniq[val_start],   uniq[val_end-1])
        folds.append((tr, va))
    return folds[::-1]


In [85]:
BEST_PARAMS = dict(
    loss="poisson",
    learning_rate=0.06,
    max_bins=255,
    min_samples_leaf=30,
    l2_regularization=0.0,
    max_depth=None,
    early_stopping=False,   # ← 랜덤 홀드아웃 누수 방지 (중요)
    random_state=42,
)


In [86]:
# 1) 피처 재생성
df_feat = build_features(df, CONFIG)

# 2) (예전 hour_of_week 원-핫이 남아있다면 제거)
drop_how = [c for c in df_feat.columns if c.startswith("how_")]
df_feat = df_feat.drop(columns=drop_how, errors="ignore")

# 3) 학습/검증 — Poisson 고정
best_out = run_cv_training(df_feat, CONFIG, model_params=BEST_PARAMS, verbose=True)
print("OOF:", best_out["oof_metrics"])
cov = 100 * best_out.get("oof_mask", best_out["oof_pred"].notna()).mean()
print(f"Coverage: {cov:.1f}%")
display(best_out["fold_reports"][["fold","MAE","RMSE","MAPE(%)","R2","train_range","val_range"]])


[Fold 1] {'MAE': 109.58115075614064, 'RMSE': np.float64(208.02522131770448), 'MAPE(%)': np.float64(5.73282979878186), 'R2': 0.996497413102963, 'fold': 1, 'train_range': (Timestamp('2024-06-01 00:00:00'), Timestamp('2024-06-15 23:00:00')), 'val_range': (Timestamp('2024-06-16 00:00:00'), Timestamp('2024-06-22 23:00:00'))}
[Fold 2] {'MAE': 111.30988731815663, 'RMSE': np.float64(203.1579246080794), 'MAPE(%)': np.float64(6.005093257340142), 'R2': 0.9966539641217759, 'fold': 2, 'train_range': (Timestamp('2024-06-01 00:00:00'), Timestamp('2024-06-22 23:00:00')), 'val_range': (Timestamp('2024-06-23 00:00:00'), Timestamp('2024-06-29 23:00:00'))}
[Fold 3] {'MAE': 111.17174903584457, 'RMSE': np.float64(208.36254631355078), 'MAPE(%)': np.float64(5.603691084591148), 'R2': 0.9966476131895545, 'fold': 3, 'train_range': (Timestamp('2024-06-01 00:00:00'), Timestamp('2024-06-29 23:00:00')), 'val_range': (Timestamp('2024-06-30 00:00:00'), Timestamp('2024-07-06 23:00:00'))}
[Fold 4] {'MAE': 110.6304991874

Unnamed: 0,fold,MAE,RMSE,MAPE(%),R2,train_range,val_range
0,1,109.581151,208.025221,5.73283,0.996497,"(2024-06-01 00:00:00, 2024-06-15 23:00:00)","(2024-06-16 00:00:00, 2024-06-22 23:00:00)"
1,2,111.309887,203.157925,6.005093,0.996654,"(2024-06-01 00:00:00, 2024-06-22 23:00:00)","(2024-06-23 00:00:00, 2024-06-29 23:00:00)"
2,3,111.171749,208.362546,5.603691,0.996648,"(2024-06-01 00:00:00, 2024-06-29 23:00:00)","(2024-06-30 00:00:00, 2024-07-06 23:00:00)"
3,4,110.630499,219.357735,5.473074,0.99644,"(2024-06-01 00:00:00, 2024-07-06 23:00:00)","(2024-07-07 00:00:00, 2024-07-13 23:00:00)"
4,5,113.248695,223.939659,5.508203,0.996378,"(2024-06-01 00:00:00, 2024-07-13 23:00:00)","(2024-07-14 00:00:00, 2024-07-20 23:00:00)"
5,6,119.936356,250.670582,5.104634,0.995621,"(2024-06-01 00:00:00, 2024-07-20 23:00:00)","(2024-07-21 00:00:00, 2024-07-27 23:00:00)"
6,7,121.202096,241.672783,5.044931,0.99613,"(2024-06-01 00:00:00, 2024-07-27 23:00:00)","(2024-07-28 00:00:00, 2024-08-03 23:00:00)"
7,8,123.171499,235.633077,5.210209,0.996365,"(2024-06-01 00:00:00, 2024-08-03 23:00:00)","(2024-08-04 00:00:00, 2024-08-10 23:00:00)"
8,9,135.963396,275.052992,6.054354,0.995009,"(2024-06-01 00:00:00, 2024-08-10 23:00:00)","(2024-08-11 00:00:00, 2024-08-17 23:00:00)"
9,10,126.571887,271.21519,5.49527,0.995076,"(2024-06-01 00:00:00, 2024-08-17 23:00:00)","(2024-08-18 00:00:00, 2024-08-24 23:00:00)"


In [87]:
# === 1) 포아송+MSE 블렌딩 ===
def cv_oof_with_params(df_feat, cfg, params, verbose=False):
    out = run_cv_training(df_feat, cfg, model_params=params, verbose=verbose)
    return out

def blend_two_oofs(oof1, oof2, y_true, metric="MAE"):
    # 두 OOF가 동시에 존재하는 인덱스만 사용
    mask = oof1.notna() & oof2.notna() & y_true.notna()
    y = y_true.loc[mask].values
    p1 = oof1.loc[mask].values
    p2 = oof2.loc[mask].values

    # 0~1 사이 그리드에서 최적 alpha 탐색 (필요시 더 촘촘히)
    grid = np.linspace(0, 1, 21)
    best = (np.inf, None, None)
    for a in grid:
        pred = a*p1 + (1-a)*p2
        if metric.upper() == "MAPE":
            eps = 1e-6
            val = np.mean(np.abs((y - pred)/np.maximum(eps, np.abs(y)))) * 100
        else:
            val = np.mean(np.abs(y - pred))
        if val < best[0]:
            best = (val, a, pred.copy())
    return {"best_score": best[0], "alpha": best[1], "pred": pd.Series(best[2], index=y_true.loc[mask].index), "mask": mask}

# 실행 예시
POISSON = dict(loss="poisson", learning_rate=0.06, max_bins=255, min_samples_leaf=30,
               l2_regularization=0.0, max_depth=None, early_stopping=False, random_state=42)
MSE     = dict(loss="squared_error", learning_rate=0.06, max_bins=255, min_samples_leaf=20,
               l2_regularization=0.0, max_depth=None, early_stopping=False, random_state=42)

po_out = cv_oof_with_params(df_feat, CONFIG, POISSON, verbose=False)
mse_out = cv_oof_with_params(df_feat, CONFIG, MSE, verbose=False)

y_true = df_feat[CONFIG["y_col"]]
blend = blend_two_oofs(po_out["oof_pred"], mse_out["oof_pred"], y_true, metric="MAE")
print(f"[Blend] best alpha (Poisson weight) = {blend['alpha']:.2f}, MAE={blend['best_score']:.3f}")

# 블렌딩 OOF 지표 확인
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
oof_mask = blend["mask"]
pred_blend = blend["pred"]
y_blend = y_true.loc[oof_mask].values
mae = mean_absolute_error(y_blend, pred_blend)
rmse = np.sqrt(mean_squared_error(y_blend, pred_blend))
mape = np.mean(np.abs((y_blend - pred_blend) / np.maximum(1e-6, np.abs(y_blend))))*100
r2 = r2_score(y_blend, pred_blend)
print({"MAE": mae, "RMSE": rmse, "MAPE(%)": mape, "R2": r2})


[Blend] best alpha (Poisson weight) = 0.50, MAE=116.341
{'MAE': 116.34105119764814, 'RMSE': np.float64(224.78486005743252), 'MAPE(%)': np.float64(6.543922552194788), 'R2': 0.9963962125265362}


In [88]:
# === 2) 건물별 선형 보정 ===
def fit_group_calibration(df_feat, oof_pred, cfg):
    grp, y = cfg["group_col"], cfg["y_col"]
    mask = oof_pred.notna()
    d = df_feat.loc[mask, [grp, y]].copy()
    d["pred"] = oof_pred.loc[mask].values

    coefs = {}
    for g, sub in d.groupby(grp):
        # 최소 50샘플 이상일 때만 안정적으로 적합
        if len(sub) < 50:
            coefs[g] = (1.0, 0.0)
            continue
        X = np.c_[sub["pred"].values, np.ones(len(sub))]
        yv = sub[y].values
        # OLS 해 (X^T X)^-1 X^T y
        try:
            a, b = np.linalg.lstsq(X, yv, rcond=None)[0]
        except:
            a, b = 1.0, 0.0
        coefs[g] = (float(a), float(b))
    return coefs

def apply_group_calibration(df_feat, pred, coefs, cfg):
    grp = cfg["group_col"]
    a = df_feat[grp].map({k:v[0] for k,v in coefs.items()}).fillna(1.0).values
    b = df_feat[grp].map({k:v[1] for k,v in coefs.items()}).fillna(0.0).values
    return pd.Series(a * pred.values + b, index=pred.index)

# 사용: 포아송 OOF 기준으로 보정해보기 (블렌딩 쓰면 blend['pred']로 교체 가능)
cal = fit_group_calibration(df_feat, po_out["oof_pred"], CONFIG)
adj_oof = apply_group_calibration(df_feat.loc[po_out["oof_mask"]], po_out["oof_pred"].loc[po_out["oof_mask"]], cal, CONFIG)

# 지표 확인
y_adj = df_feat.loc[po_out["oof_mask"], CONFIG["y_col"]].values
mae = mean_absolute_error(y_adj, adj_oof)
rmse = np.sqrt(mean_squared_error(y_adj, adj_oof))
mape = np.mean(np.abs((y_adj - adj_oof) / np.maximum(1e-6, np.abs(y_adj))))*100
r2 = r2_score(y_adj, adj_oof)
print({"MAE_calibrated": mae, "RMSE_calibrated": rmse, "MAPE_calibrated(%)": mape, "R2_calibrated": r2})


{'MAE_calibrated': 114.61035293697779, 'RMSE_calibrated': np.float64(225.94091919145603), 'MAPE_calibrated(%)': np.float64(4.892804995033536), 'R2_calibrated': 0.9963590489497824}


In [89]:
# === 3) 단조 제약 Poisson (지원 안 되면 자동 우회) ===
def train_poisson_with_monotone(df_feat, cfg):
    feat_cols = _select_feature_columns(df_feat, cfg)
    # '상승 단조'를 기대하는 피처 prefix
    pos_prefix = ["CDD", "CDD_roll24_sum", "CDD_roll168_sum", "기온(°C)", "temp2", "dewpoint", "temp_rh"]
    cons = [1 if any(c.startswith(pfx) for pfx in pos_prefix) else 0 for c in feat_cols]

    params = dict(loss="poisson", learning_rate=0.06, max_bins=255, min_samples_leaf=30,
                  l2_regularization=0.0, max_depth=None, early_stopping=False, random_state=42)

    try:
        params["monotonic_cst"] = cons
        out = run_cv_training(df_feat, cfg, model_params=params, verbose=False)
        print("[Monotone] 적용 성공")
    except TypeError:
        print("[Monotone] scikit-learn이 monotonic_cst를 지원하지 않아 일반 Poisson으로 진행")
        out = run_cv_training(df_feat, cfg, model_params={k:v for k,v in params.items() if k!="monotonic_cst"}, verbose=False)

    print("OOF(monotone?):", out["oof_metrics"])
    return out

# 실행
mono_out = train_poisson_with_monotone(df_feat, CONFIG)


[Monotone] 적용 성공
OOF(monotone?): {'MAE': 117.6830108846369, 'RMSE': np.float64(234.17766550092944), 'MAPE(%)': np.float64(5.49908746401526), 'R2': 0.996088746163183}


In [91]:
# --- 준비: CV 실행 함수/블렌드 유틸 ---
def cv_oof_with_params(df_feat, cfg, params, verbose=False):
    return run_cv_training(df_feat, cfg, model_params=params, verbose=verbose)

def blend_two_oofs(oof1, oof2, y_true, metric="MAE"):
    mask = oof1.notna() & oof2.notna() & y_true.notna()
    y = y_true.loc[mask].values
    p1 = oof1.loc[mask].values
    p2 = oof2.loc[mask].values

    grid = np.linspace(0, 1, 21)  # 0.0~1.0 by 0.05
    best = (np.inf, None, None)
    for a in grid:
        pred = a*p1 + (1-a)*p2
        if metric.upper() == "MAPE":
            eps = 1e-6
            val = np.mean(np.abs((y - pred)/np.maximum(eps, np.abs(y)))) * 100
        else:
            val = np.mean(np.abs(y - pred))
        if val < best[0]:
            best = (val, a, pred.copy())
    return {"best_score": best[0], "alpha": best[1],
            "pred": pd.Series(best[2], index=y_true.loc[mask].index),
            "mask": mask}

# --- 현재 피처셋 기준 두 모델 OOF 구하기 ---
POISSON_MONO = dict(loss="poisson", learning_rate=0.06, max_bins=255,
                    min_samples_leaf=30, l2_regularization=0.0, max_depth=None,
                    early_stopping=False, random_state=42,  # 중요: 누수 방지
                    # monotonic_cst는 아래에서 계산해 끼워넣음
                   )
MSE          = dict(loss="squared_error", learning_rate=0.06, max_bins=255,
                    min_samples_leaf=20, l2_regularization=0.0, max_depth=None,
                    early_stopping=False, random_state=42)

# 단조 제약 벡터 만들기 (CDD/기온/이슬점 등은 +1, 나머지 0)
feat_cols = _select_feature_columns(df_feat, CONFIG)
pos_prefix = ["CDD", "CDD_roll24_sum", "CDD_roll168_sum", "기온(°C)", "temp2", "dewpoint", "temp_rh"]
mono_cst = [1 if any(c.startswith(pfx) for pfx in pos_prefix) else 0 for c in feat_cols]
POISSON_MONO2 = {**POISSON_MONO, "monotonic_cst": mono_cst}

po_out = cv_oof_with_params(df_feat, CONFIG, POISSON_MONO2, verbose=False)
mse_out = cv_oof_with_params(df_feat, CONFIG, MSE, verbose=False)

y_true = df_feat[CONFIG["y_col"]]
blend = blend_two_oofs(po_out["oof_pred"], mse_out["oof_pred"], y_true, metric="MAE")
print(f"[Blend] best alpha (Poisson weight) = {blend['alpha']:.2f}, MAE={blend['best_score']:.3f}")

# 블렌딩 OOF 지표 확인
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mask = blend["mask"]; pred = blend["pred"]; yb = y_true.loc[mask].values
mae  = mean_absolute_error(yb, pred)
rmse = np.sqrt(mean_squared_error(yb, pred))
mape = np.mean(np.abs((yb - pred) / np.maximum(1e-6, np.abs(yb))))*100
r2   = r2_score(yb, pred)
print({"MAE_blend": mae, "RMSE_blend": rmse, "MAPE_blend(%)": mape, "R2_blend": r2})


[Blend] best alpha (Poisson weight) = 0.55, MAE=116.094
{'MAE_blend': 116.09381503716916, 'RMSE_blend': np.float64(225.14541005855568), 'MAPE_blend(%)': np.float64(6.419016930370882), 'R2_blend': 0.9963846424622692}


In [93]:
def fit_group_calibration(df_feat, oof_pred, cfg):
    """
    df_feat와 oof_pred의 인덱스 교집합에서만 건물별 보정 y ≈ a*pred + b를 학습.
    """
    grp, y = cfg["group_col"], cfg["y_col"]
    # 1) 인덱스 정렬
    idx = df_feat.index.intersection(oof_pred.index)
    if len(idx) == 0:
        raise ValueError("fit_group_calibration: df_feat와 oof_pred의 교집합 인덱스가 없습니다.")

    d = df_feat.loc[idx, [grp, y]].copy()
    d["pred"] = oof_pred.loc[idx].values

    coefs = {}
    for g, sub in d.groupby(grp):
        if len(sub) < 50:
            coefs[g] = (1.0, 0.0)  # 표본 적으면 보정 생략
            continue
        X = np.c_[sub["pred"].values, np.ones(len(sub))]
        yv = sub[y].values
        try:
            a, b = np.linalg.lstsq(X, yv, rcond=None)[0]
        except:
            a, b = 1.0, 0.0
        coefs[g] = (float(a), float(b))
    return coefs

def apply_group_calibration(df_feat, pred_series, coefs, cfg):
    """
    pred_series.index에 맞춰 df_feat을 정렬한 뒤 a*pred+b 적용.
    """
    grp = cfg["group_col"]
    # 1) 인덱스 정렬
    idx = df_feat.index.intersection(pred_series.index)
    if len(idx) != len(pred_series):
        # pred 기준으로 정렬된 df_part 만들기
        df_part = df_feat.reindex(pred_series.index)
    else:
        df_part = df_feat.loc[pred_series.index]

    a_map = {k:v[0] for k,v in coefs.items()}
    b_map = {k:v[1] for k,v in coefs.items()}
    a = df_part[grp].map(a_map).fillna(1.0).values
    b = df_part[grp].map(b_map).fillna(0.0).values
    return pd.Series(a * pred_series.values + b, index=pred_series.index)


In [95]:
# 최종 학습
def train_final_ensemble(df_feat, cfg, alpha, cal_coefs, use_monotone=True, max_iter=150):
    feat_cols = _select_feature_columns(df_feat, cfg)
    X = df_feat[feat_cols]
    y = df_feat[cfg["y_col"]].values

    # 파라미터
    po_params = dict(loss="poisson", learning_rate=0.06, max_bins=255,
                     min_samples_leaf=30, l2_regularization=0.0, max_depth=None,
                     early_stopping=False, random_state=42, max_iter=max_iter)
    if use_monotone:
        pos_prefix = ["CDD", "CDD_roll24_sum", "CDD_roll168_sum", "기온(°C)", "temp2", "dewpoint", "temp_rh"]
        mono_cst = [1 if any(c.startswith(pfx) for pfx in pos_prefix) else 0 for c in feat_cols]
        po_params["monotonic_cst"] = mono_cst

    mse_params = dict(loss="squared_error", learning_rate=0.06, max_bins=255,
                      min_samples_leaf=20, l2_regularization=0.0, max_depth=None,
                      early_stopping=False, random_state=42, max_iter=max_iter)

    m_po  = HistGradientBoostingRegressor(**po_params).fit(X, y)
    m_mse = HistGradientBoostingRegressor(**mse_params).fit(X, y)

    return {"feat_cols": feat_cols, "poisson": m_po, "mse": m_mse,
            "alpha": float(alpha), "cal_coefs": cal_coefs, "cfg": cfg}

# 예측
def predict_final_ensemble(pack, df_feat_new):
    fc = pack["feat_cols"]; cfg = pack["cfg"]
    Xn = df_feat_new[fc]
    p_po  = pack["poisson"].predict(Xn)
    p_mse = pack["mse"].predict(Xn)
    pred  = pack["alpha"]*p_po + (1-pack["alpha"])*p_mse
    pred  = np.maximum(0.0, pred)  # 안전장치
    pred  = pd.Series(pred, index=df_feat_new.index)

    # 건물별 보정 적용
    pred_adj = apply_group_calibration(df_feat_new, pred, pack["cal_coefs"], cfg)
    return pred_adj

# 사용 예시:
# pack = train_final_ensemble(df_feat, CONFIG, alpha=blend["alpha"], cal_coefs=cal, use_monotone=True, max_iter=200)
# df_feat_test = build_features(df_test, CONFIG)  # 동일 파이프라인
# yhat_test = predict_final_ensemble(pack, df_feat_test)



In [96]:
# 1) 블렌딩 가중치(alpha)와 블렌딩 OOF는 이미 계산된 상태라고 가정
#    (없으면 직전에 했던 blend 코드로 다시 만들어줘)
y_true = df_feat[CONFIG["y_col"]]
mask = blend["mask"]
yb = y_true.loc[mask].values

# 2) 보정 계수 학습 (블렌딩 OOF로)
cal = fit_group_calibration(df_feat, blend["pred"], CONFIG)

# 3) 보정 적용 + 지표
pred_cal = apply_group_calibration(df_feat, blend["pred"], cal, CONFIG)

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mae_c  = mean_absolute_error(y_true.loc[pred_cal.index].values, pred_cal.values)
rmse_c = np.sqrt(mean_squared_error(y_true.loc[pred_cal.index].values, pred_cal.values))
mape_c = np.mean(np.abs((y_true.loc[pred_cal.index].values - pred_cal.values)
                        / np.maximum(1e-6, np.abs(y_true.loc[pred_cal.index].values))))*100
r2_c   = r2_score(y_true.loc[pred_cal.index].values, pred_cal.values)
print({"MAE_calibrated": mae_c, "RMSE_calibrated": rmse_c,
       "MAPE_calibrated(%)": mape_c, "R2_calibrated": r2_c})


{'MAE_calibrated': 112.57649968042156, 'RMSE_calibrated': np.float64(217.64765631687996), 'MAPE_calibrated(%)': np.float64(5.115736392287933), 'R2_calibrated': 0.9966214290186955}


In [97]:
# ===== [FINAL 1/3] 최종 모델 팩 훈련 (전체 데이터로 재학습) =====
# (필수 전제) build_features()로 만든 df_feat가 있음
# (필수 전제) blend['alpha'] (블렌딩 가중치)와 cal (건물별 보정계수)가 있음

def train_final_ensemble(df_feat, cfg, alpha, cal_coefs, use_monotone=True, max_iter=200):
    feat_cols = _select_feature_columns(df_feat, cfg)
    X = df_feat[feat_cols]
    y = df_feat[cfg["y_col"]].values

    po_params = dict(
        loss="poisson", learning_rate=0.06, max_bins=255,
        min_samples_leaf=30, l2_regularization=0.0, max_depth=None,
        early_stopping=False, random_state=42, max_iter=max_iter
    )
    if use_monotone:
        pos_prefix = ["CDD", "CDD_roll24_sum", "CDD_roll168_sum", "기온(°C)", "temp2", "dewpoint", "temp_rh"]
        mono_cst = [1 if any(c.startswith(pfx) for pfx in pos_prefix) else 0 for c in feat_cols]
        po_params["monotonic_cst"] = mono_cst

    mse_params = dict(
        loss="squared_error", learning_rate=0.06, max_bins=255,
        min_samples_leaf=20, l2_regularization=0.0, max_depth=None,
        early_stopping=False, random_state=42, max_iter=max_iter
    )

    m_po  = HistGradientBoostingRegressor(**po_params).fit(X, y)
    m_mse = HistGradientBoostingRegressor(**mse_params).fit(X, y)

    pack = {
        "feat_cols": feat_cols,
        "poisson": m_po,
        "mse": m_mse,
        "alpha": float(alpha),
        "cal_coefs": cal_coefs,
        "cfg": cfg
    }
    return pack

# 블렌딩 가중치/보정계수 확보 (세션에 없으면 적당히 기본값)
alpha_final = float(blend["alpha"]) if "blend" in globals() else 0.6
cal_coefs   = cal if "cal" in globals() else {}

final_pack = train_final_ensemble(df_feat, CONFIG, alpha=alpha_final, cal_coefs=cal_coefs, use_monotone=True, max_iter=200)
print("최종 팩 구성 완료. alpha =", final_pack["alpha"], "| feat_cols =", len(final_pack["feat_cols"]))


최종 팩 구성 완료. alpha = 0.55 | feat_cols = 77


In [98]:
# ===== [FINAL 2/3] 새 데이터 예측 =====
def align_features(df_feat_new, feat_cols):
    # 누락 피처는 0으로 채우고, 여분 피처는 드랍
    add_cols = [c for c in feat_cols if c not in df_feat_new.columns]
    if add_cols:
        df_feat_new = df_feat_new.copy()
        for c in add_cols:
            df_feat_new[c] = 0.0
    df_feat_new = df_feat_new[feat_cols]
    return df_feat_new

def predict_final_ensemble(pack, df_feat_new):
    fc = pack["feat_cols"]; cfg = pack["cfg"]
    Xn = align_features(df_feat_new, fc)

    p_po  = pack["poisson"].predict(Xn)
    p_mse = pack["mse"].predict(Xn)
    pred  = pack["alpha"]*p_po + (1-pack["alpha"])*p_mse
    pred  = np.maximum(0.0, pred)  # 안전장치
    pred  = pd.Series(pred, index=df_feat_new.index)

    # 건물별 보정 적용 (미등록 건물은 a=1,b=0)
    pred_adj = apply_group_calibration(df_feat_new, pred, pack["cal_coefs"], cfg)
    return pred_adj


In [99]:
# ===== [FINAL 3/3] 테스트셋 피처 생성 → 예측 → 제출 파일 생성 =====
def build_features_for_any(df_raw, cfg):
    d = df_raw.copy()
    ts, grp, y = cfg["ts_col"], cfg["group_col"], cfg["y_col"]

    if ts not in d:
        d = make_timestamp(d, "날짜", "시간", out_col=ts)

    # 숫자형 캐스팅(있을 때만)
    numeric_candidates = [
        y, "연면적(m2)", "냉방면적(m2)", "태양광용량(kW)",
        "ESS저장용량(kWh)", "PCS용량(kW)",
        cfg["weather"].get("temp"), cfg["weather"].get("rh"),
        cfg["weather"].get("ws"), cfg["weather"].get("radiation"),
        cfg["weather"].get("sunshine"), cfg["weather"].get("precip"),
    ]
    for c in set([c for c in numeric_candidates if c and c in d.columns]):
        d[c] = pd.to_numeric(d[c], errors="coerce")

    d = add_time_features(d, ts)
    d = add_korean_holidays_2024(d, ts)

    # 테스트셋이면 y가 없거나 NaN일 수 있음 → 보간/윈저라이즈는 스킵 가능
    if y in d.columns and d[y].notna().any():
        d = impute_and_cap_target(d, y, grp, ts)

    d = add_target_lags_rolls(d, y, grp, ts)
    d = add_same_how_history(d, y, grp, ts)
    d = add_safe_deltas(d, y)
    d = add_building_history_features(d, y, grp, ts)
    d = add_weather_features(d, cfg["weather"], ts)
    d = add_cdd_hdd_aggregates(d)
    d = add_extra_features_v2(d, cfg)

    if "건물유형" in d:
        d = pd.concat([d, pd.get_dummies(d["건물유형"], prefix="건물유형", dummy_na=True)], axis=1)

    return d

# 사용 예시:
# df_test_raw = ...  # 테스트 원본 로드
# df_test_feat = build_features_for_any(df_test_raw, CONFIG)
# yhat_test = predict_final_ensemble(final_pack, df_test_feat)

# 제출 스키마에 맞춰 DataFrame 구성 (필요 컬럼명에 맞게 바꿔)
def make_submission(df_feat_new, yhat_series, id_cols=("건물번호", "timestamp"), target_name="전력소비량(kWh)"):
    sub = df_feat_new.loc[yhat_series.index, list(id_cols)].copy()
    sub[target_name] = yhat_series.values
    return sub

# 예시:
# submission = make_submission(df_test_feat, yhat_test, id_cols=("건물번호","timestamp"), target_name="전력소비량(kWh)")
# submission.to_csv("submission.csv", index=False)
# print("submission.csv 저장 완료")


In [100]:
import joblib, json, os
os.makedirs("artifacts", exist_ok=True)

# 1) 최종 팩 저장
joblib.dump(final_pack, "artifacts/final_pack.joblib")

# 2) 블렌딩/보정 파라미터 저장
with open("artifacts/blend_and_cal.json", "w", encoding="utf-8") as f:
    json.dump({
        "alpha": float(final_pack["alpha"]),
        "n_feat": len(final_pack["feat_cols"]),
        "cfg": {k: (v if not isinstance(v, (np.generic,)) else v.item()) for k,v in CONFIG.items()},
        "n_calibrated_groups": len(final_pack["cal_coefs"]),
    }, f, ensure_ascii=False, indent=2)

print("✅ 저장 완료:", os.listdir("artifacts"))


✅ 저장 완료: ['blend_and_cal.json', 'final_pack.joblib']


In [103]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd

def safe_metrics(y_true_s, y_pred_s):
    # 1) 공통 인덱스만
    idx = y_true_s.index.intersection(y_pred_s.index)
    yt = y_true_s.loc[idx]
    yp = y_pred_s.loc[idx]
    # 2) NaN 제거
    m = yt.notna() & yp.notna()
    yt = yt.loc[m]; yp = yp.loc[m]
    # 3) 지표 계산
    mae = mean_absolute_error(yt, yp)
    rmse = np.sqrt(mean_squared_error(yt, yp))
    mape = float(np.mean(np.abs((yt - yp) / np.maximum(1e-6, np.abs(yt)))) * 100)
    r2 = r2_score(yt, yp)
    return {"MAE": mae, "RMSE": rmse, "MAPE(%)": mape, "R2": r2, "n": int(m.sum())}


In [104]:
y_true = df_feat[CONFIG["y_col"]]

pairs = [
    ("poisson_mono", po_out["oof_pred"]),
    ("blend",        blend["pred"]),
    ("blend+cal",    cal_pred),
]

for name, pred in pairs:
    print(name, safe_metrics(y_true, pred))


poisson_mono {'MAE': 117.6830108846369, 'RMSE': np.float64(234.17766550092944), 'MAPE(%)': 5.49908746401526, 'R2': 0.996088746163183, 'n': 168000}
blend {'MAE': 116.09381503716916, 'RMSE': np.float64(225.14541005855568), 'MAPE(%)': 6.419016930370882, 'R2': 0.9963846424622692, 'n': 168000}
blend+cal {'MAE': 112.57649968042156, 'RMSE': np.float64(217.64765631687996), 'MAPE(%)': 5.115736392287933, 'R2': 0.9966214290186955, 'n': 168000}


In [107]:
def compare_by(df_feat, y_true, preds:dict, by_col="건물유형"):
    rows = []
    for name, pred in preds.items():
        idx = y_true.index.intersection(pred.index)
        yt = y_true.loc[idx]; yp = pred.loc[idx]
        m = yt.notna() & yp.notna()
        dd = pd.DataFrame({
            "y": yt.loc[m].values,
            "pred": yp.loc[m].values,
            by_col: df_feat.loc[yt.loc[m].index, by_col].values
        })
        dd["AE"]  = (dd["y"] - dd["pred"]).abs()
        dd["APE"] = dd["AE"] / np.maximum(1e-6, dd["y"].abs()) * 100
        g = (dd.groupby(by_col)
               .agg(MAE=("AE","mean"), MAPE=("APE","mean"), n=("AE","size"))
               .assign(model=name)
               .reset_index())
        rows.append(g)
    return pd.concat(rows, ignore_index=True)


In [111]:
def add_type_interactions_selective(df, weak_types=(4,9,0,3),
                                    base_cols=("CDD","dewpoint","is_workhour")):
    df = df.copy()
    if "건물유형" not in df: return df
    for t in weak_types:
        mask = (df["건물유형"] == t).astype(int)
        df[f"type_{t}"] = mask
        for bc in base_cols:
            if bc in df:
                df[f"{bc}__x__type{t}"] = df[bc] * mask
    return df


In [114]:
# === 멀티시드 OOF ===
def cv_oof_multi_seed(df_feat, cfg, base_params, seeds=(42,77,123), pos_prefix=None):
    feat_cols = _select_feature_columns(df_feat, cfg)
    if pos_prefix is None:
        pos_prefix = ["CDD","CDD_roll24_sum","CDD_roll168_sum","기온(°C)","temp2","dewpoint","temp_rh"]
    mono_cst = [1 if any(c.startswith(p) for p in pos_prefix) else 0 for c in feat_cols]

    oofs = []
    for s in seeds:
        params = {**base_params, "random_state": s, "early_stopping": False, "monotonic_cst": mono_cst}
        out = run_cv_training(df_feat, cfg, model_params=params, verbose=False)
        oofs.append(out["oof_pred"])
    # 공통 인덱스에서 평균
    idx = oofs[0].index
    for o in oofs[1:]:
        idx = idx.intersection(o.index)
    oofs = [o.loc[idx] for o in oofs]
    oof_mean = sum(oofs) / len(oofs)
    return oof_mean

POISSON_BASE = dict(loss="poisson", learning_rate=0.06, max_bins=255,
                    min_samples_leaf=30, l2_regularization=0.0, max_depth=None)
MSE_BASE = dict(loss="squared_error", learning_rate=0.06, max_bins=255,
                min_samples_leaf=20, l2_regularization=0.0, max_depth=None, early_stopping=False)

# 1) 멀티시드 포아송 OOF
oof_po_multi = cv_oof_multi_seed(df_feat, CONFIG, POISSON_BASE, seeds=(42,77,123))

# 2) MSE OOF
mse_out = run_cv_training(df_feat, CONFIG, model_params=MSE_BASE, verbose=False)
oof_mse = mse_out["oof_pred"]

# 3) 멀티시드 포아송 vs MSE 블렌딩(기존 blend 함수 재사용)
y_true = df_feat[CONFIG["y_col"]]
blend2 = blend_two_oofs(oof_po_multi, oof_mse, y_true, metric="MAE")
print(f"[Blend-2] alpha(Poisson_multi)={blend2['alpha']:.2f}, MAE={blend2['best_score']:.3f}")

# 4) (선택) 건물별 보정 재학습
cal2 = fit_group_calibration(df_feat, blend2["pred"], CONFIG)
pred_cal2 = apply_group_calibration(df_feat, blend2["pred"], cal2, CONFIG)
print("blend+cal(2):", safe_metrics(y_true, pred_cal2))


[Blend-2] alpha(Poisson_multi)=0.55, MAE=116.094
blend+cal(2): {'MAE': 112.57649968042156, 'RMSE': np.float64(217.64765631687996), 'MAPE(%)': 5.115736392287933, 'R2': 0.9966214290186955, 'n': 168000}


In [115]:
def train_final_ensemble_multi(df_feat, cfg, alpha, cal_coefs,
                               seeds=(42,77,123), max_iter=200, use_monotone=True):
    feat_cols = _select_feature_columns(df_feat, cfg)
    X = df_feat[feat_cols]; y = df_feat[cfg["y_col"]].values

    # Poisson 멀티시드
    pos_prefix = ["CDD","CDD_roll24_sum","CDD_roll168_sum","기온(°C)","temp2","dewpoint","temp_rh"]
    mono_cst = [1 if any(c.startswith(p) for p in pos_prefix) else 0 for c in feat_cols] if use_monotone else None

    po_models = []
    for s in seeds:
        p = dict(loss="poisson", learning_rate=0.06, max_bins=255,
                 min_samples_leaf=30, l2_regularization=0.0, max_depth=None,
                 early_stopping=False, random_state=s, max_iter=max_iter)
        if use_monotone:
            p["monotonic_cst"] = mono_cst
        m = HistGradientBoostingRegressor(**p).fit(X, y)
        po_models.append(m)

    # 하나는 MSE
    m_mse = HistGradientBoostingRegressor(
        loss="squared_error", learning_rate=0.06, max_bins=255,
        min_samples_leaf=20, l2_regularization=0.0, max_depth=None,
        early_stopping=False, random_state=42, max_iter=max_iter
    ).fit(X, y)

    return {"feat_cols": feat_cols, "po_models": po_models, "mse": m_mse,
            "alpha": float(alpha), "cal_coefs": cal_coefs, "cfg": cfg, "seeds": list(seeds)}

def predict_final_ensemble_multi(pack, df_feat_new):
    fc = pack["feat_cols"]; cfg = pack["cfg"]
    Xn = align_features(df_feat_new, fc)
    # 멀티시드 평균
    p_po = np.mean([m.predict(Xn) for m in pack["po_models"]], axis=0)
    p_mse = pack["mse"].predict(Xn)
    pred = pack["alpha"]*p_po + (1-pack["alpha"])*p_mse
    pred = np.maximum(0.0, pred)
    pred = pd.Series(pred, index=df_feat_new.index)
    # 보정
    return apply_group_calibration(df_feat_new, pred, pack["cal_coefs"], cfg)


In [116]:
# 멀티시드 블렌딩/보정 결과가 좋으면 고정
alpha_final2 = float(blend2["alpha"])
cal_coefs2   = cal2
final_pack2  = train_final_ensemble_multi(df_feat, CONFIG, alpha_final2, cal_coefs2, seeds=(42,77,123), max_iter=200)
# 예측은 predict_final_ensemble_multi(final_pack2, df_test_feat)


In [117]:
# === 온도 스플라인 + 주간/야간 기울기 분리 ===
def add_piecewise_temp_features(df, temp_col="기온(°C)"):
    df = df.copy()
    if temp_col not in df: return df
    T = pd.to_numeric(df[temp_col], errors="coerce")
    # 절편형 CDD: 23, 27 두 단계
    df["CDD23"] = (T - 23).clip(lower=0)
    df["CDD27"] = (T - 27).clip(lower=0)
    # 주간/야간 기울기 분리 (is_workhour는 add_extra_features_v2에서 생성)
    if "is_workhour" in df:
        df["CDD23_x_work"] = df["CDD23"] * df["is_workhour"]
        df["CDD27_x_work"] = df["CDD27"] * df["is_workhour"]
    return df


In [120]:
from collections import deque, defaultdict

def predict_autoregressive(pack, df_test_raw, cfg):
    """
    - df_test_raw: 테스트 원본(날짜/시간/건물번호/날씨/메타 포함, y는 없어도 됨)
    - pack: final_pack 또는 final_pack2 (멀티시드)
    - 반환: pred_adj (보정 적용 최종 예측 Series, index=df_feat_test.index)
    """
    # 1) 먼저 exogenous 피처 전부 생성 (y 파생 제외)
    d = build_features_for_any(df_test_raw, cfg)  # 우리가 만든 동일 파이프라인
    ts, grp, y = cfg["ts_col"], cfg["group_col"], cfg["y_col"]

    # y 파생들 초기화 (모델이 요구하는 컬럼만 최소 채움)
    needed = [
        f"{y}_lag1", f"{y}_lag24", f"{y}_lag168",
        f"{y}_roll3", f"{y}_roll6", f"{y}_roll24", f"{y}_roll168", f"{y}_ewm_alpha005",
        f"{y}_how_mean3", f"{y}_how_mean8",
        f"{y}_lag1_minus_how3", f"{y}_lag24_minus_how8",
        f"{y}_chg1_from24", f"{y}_ratio1_24",
        f"{y}_chg24_from168", f"{y}_ratio24_168",
    ]
    for c in needed:
        if c not in d: d[c] = np.nan

    # 2) 그룹별 상태 버퍼(라그/롤링/SHR용)
    #    - lag24: 길이 24 deque
    #    - roll/ewm: 길이 168 deque + 누적평균
    #    - how별 버퍼: (건물, hour_of_week) → deque(maxlen=8)
    states = {}

    def init_state():
        return {
            "last": None,
            "lag24": deque(maxlen=24),
            "lag168": deque(maxlen=168),
            "ewm": None,
            "how_buf": defaultdict(lambda: deque(maxlen=8))  # key=hour_of_week
        }

    # 3) 순차 예측
    d = d.sort_values([grp, ts]).copy()
    preds = []

    # 어떤 팩인지에 따라 예측 함수 통일
    def _predict_pack(pack, df_feat_part):
        if "po_models" in pack:
            # 멀티시드
            fc = pack["feat_cols"]
            Xn = align_features(df_feat_part, fc)
            p_po = np.mean([m.predict(Xn) for m in pack["po_models"]], axis=0)
            p_mse = pack["mse"].predict(Xn)
            p = pack["alpha"]*p_po + (1-pack["alpha"])*p_mse
            p = np.maximum(0.0, p)
            return pd.Series(p, index=df_feat_part.index)
        else:
            # 단일 팩
            return predict_final_ensemble(pack, df_feat_part)

    for b, sub in d.groupby(grp):
        st = states.setdefault(b, init_state())
        sub_idx = sub.index
        for i in sub_idx:
            hour = int(d.at[i, "hour"]) if "hour" in d else None
            dow  = int(d.at[i, "dow"]) if "dow"  in d else None
            how  = (dow * 24 + hour) if (hour is not None and dow is not None) else None

            # 라그 채우기
            d.at[i, f"{y}_lag1"]   = st["last"] if st["last"] is not None else np.nan
            d.at[i, f"{y}_lag24"]  = (st["lag24"][0] if len(st["lag24"])==24 else np.nan)
            d.at[i, f"{y}_lag168"] = (st["lag168"][0] if len(st["lag168"])==168 else np.nan)

            # 롤링/ewm (과거로만)
            if len(st["lag3"] if "lag3" in st else [])==0:
                # lazy init
                st["lag3"] = deque(maxlen=3)
                st["lag6"] = deque(maxlen=6)
            d.at[i, f"{y}_roll3"]   = (np.mean(st["lag3"]) if len(st["lag3"])>=2 else np.nan)
            d.at[i, f"{y}_roll6"]   = (np.mean(st["lag6"]) if len(st["lag6"])>=3 else np.nan)
            d.at[i, f"{y}_roll24"]  = (np.mean(st["lag24"]) if len(st["lag24"])>=6 else np.nan)
            d.at[i, f"{y}_roll168"] = (np.mean(st["lag168"]) if len(st["lag168"])>=24 else np.nan)

            # EWM(alpha=0.05) 과거값만
            d.at[i, f"{y}_ewm_alpha005"] = st["ewm"]

            # 같은 시간대 과거 평균 (how_mean3/8)
            if how is not None:
                buf = st["how_buf"][how]
                # shift(1) 후 rolling 이므로 현재는 buf 평균만 쓴다
                d.at[i, f"{y}_how_mean3"] = (np.mean(list(buf)[-3:]) if len(buf)>=1 else np.nan)
                d.at[i, f"{y}_how_mean8"] = (np.mean(list(buf)[-8:]) if len(buf)>=2 else np.nan)

            # 델타/비율 (전부 과거끼리)
            lag1  = d.at[i, f"{y}_lag1"]
            lag24 = d.at[i, f"{y}_lag24"]
            lag168= d.at[i, f"{y}_lag168"]
            eps = 1e-6
            d.at[i, f"{y}_chg1_from24"]   = (lag1 - lag24) if (pd.notna(lag1) and pd.notna(lag24)) else np.nan
            d.at[i, f"{y}_ratio1_24"]     = (lag1 / (abs(lag24)+eps)) if (pd.notna(lag1) and pd.notna(lag24) and abs(lag24)>0) else np.nan
            d.at[i, f"{y}_chg24_from168"] = (lag24 - lag168) if (pd.notna(lag24) and pd.notna(lag168)) else np.nan
            d.at[i, f"{y}_ratio24_168"]   = (lag24 / (abs(lag168)+eps)) if (pd.notna(lag24) and pd.notna(lag168) and abs(lag168)>0) else np.nan

            # 현재 한 행만 예측해서 값 얻기
            row_feat = d.loc[[i]]
            yhat_i = _predict_pack(pack, row_feat).iloc[0]
            preds.append((i, yhat_i))

            # 상태 업데이트 (맨 마지막에)
            st["last"] = yhat_i
            # 왼쪽이 t-24가 되도록: maxlen 유지하며 append
            st["lag3"].append(yhat_i); st["lag6"].append(yhat_i)
            st["lag24"].append(yhat_i); st["lag168"].append(yhat_i)
            st["ewm"] = yhat_i if st["ewm"] is None else (0.05*yhat_i + 0.95*st["ewm"])
            if how is not None:
                st["how_buf"][how].append(yhat_i)

    pred_series = pd.Series({i:v for i,v in preds}).sort_index()
    # 보정 적용
    pred_adj = apply_group_calibration(d, pred_series, pack.get("cal_coefs", {}), cfg)
    return pred_adj


In [125]:
# ===== 0) 유틸: 컬럼 자동탐지 =====
import re
import pandas as pd
import numpy as np

def _find_col(df, candidates, partial=True):
    """
    df.columns에서 후보명(candidates) 중 하나를 찾아서 반환.
    - 대소문자/공백/언더스코어 무시
    - partial=True면 부분일치도 허용(예: '일시(UTC)' → '일시')
    """
    norm = lambda s: re.sub(r"[\s_]+", "", str(s)).lower()
    cols_norm = {norm(c): c for c in df.columns}
    # 1) 정확히 일치
    for name in candidates:
        key = norm(name)
        if key in cols_norm:
            return cols_norm[key]
    # 2) 부분 일치
    if partial:
        for name in candidates:
            key = norm(name)
            for k, orig in cols_norm.items():
                if key in k:
                    return orig
    return None

# ===== 1) timestamp 자동 생성 (일시 or 날짜+시간 or timestamp 다 지원) =====
def make_timestamp_auto(df, out_col="timestamp"):
    d = df.copy()
    if out_col in d.columns:
        d[out_col] = pd.to_datetime(d[out_col], errors="coerce")
        return d

    # (A) '일시' / 'datetime' / 'timestamp' 한 칼럼에 합쳐진 경우 먼저 탐색
    single_candidates = ["일시", "datetime", "date_time", "timestamp", "ts"]
    single_col = _find_col(d, single_candidates)
    if single_col:
        d[out_col] = pd.to_datetime(d[single_col], errors="coerce")
        return d

    # (B) 날짜 + 시간 칼럼 조합 탐색
    date_candidates = ["날짜", "일자", "date", "dt", "ymd"]
    time_candidates = ["시간", "시각", "hour", "hr", "time", "hh"]

    date_col = _find_col(d, date_candidates)
    time_col = _find_col(d, time_candidates)

    if date_col is None and time_col is None:
        # 마지막 시도: '년','월','일','시'가 따로 있을 수도 있음
        y_col = _find_col(d, ["연", "year", "yyyy"])
        m_col = _find_col(d, ["월", "month", "mm"])
        day_col = _find_col(d, ["일", "day", "dd"])
        h_col = _find_col(d, ["시", "hour", "hh"])
        if all(c is not None for c in [y_col, m_col, day_col, h_col]):
            y = d[y_col].astype(int).astype(str).str.zfill(4)
            m = d[m_col].astype(int).astype(str).str.zfill(2)
            day = d[day_col].astype(int).astype(str).str.zfill(2)
            hh = d[h_col].astype(int).astype(str).str.zfill(2)
            d[out_col] = pd.to_datetime(y + "-" + m + "-" + day + " " + hh + ":00", errors="coerce")
            return d
        # 정말 없으면 에러 메시지와 함께 컬럼 리스트 보여주기
        raise KeyError(f"timestamp를 만들 소스 컬럼을 못 찾았어. 사용 가능한 컬럼: {list(d.columns)}")

    # 날짜만 있고 시간 없으면 00:00으로
    if date_col and not time_col:
        d[out_col] = pd.to_datetime(d[date_col].astype(str), errors="coerce")
        return d

    # 날짜+시간 둘 다 있으면 조합
    # 시간은 '0~23' 정수 or 'HH' or 'HH:MM' 모두 처리
    hh_raw = d[time_col].astype(str).str.strip()
    # 정수만 들어있으면 HH:00으로 변환
    if pd.to_numeric(hh_raw, errors="coerce").notna().all():
        hh = pd.to_numeric(hh_raw, errors="coerce").fillna(0).astype(int).clip(0, 23)
        tstr = hh.astype(str).str.zfill(2) + ":00"
    else:
        # 이미 "HH" or "HH:MM" 형식일 수 있음 → HH만 남기거나 그대로 사용
        # HH:MM:SS → HH:MM으로 자르기
        tstr = hh_raw.str.extract(r"^(\d{1,2}(?::\d{2})?)")[0].fillna(hh_raw)
        # HH만 있으면 ':00' 붙이기
        tstr = tstr.where(tstr.str.contains(":"), tstr + ":00")

    d[out_col] = pd.to_datetime(d[date_col].astype(str) + " " + tstr, errors="coerce")
    return d

# ===== 2) 테스트 전처리 본체(위 자동 timestamp 사용) =====
def build_features_for_test_safe(df_raw, cfg):
    d = df_raw.copy()
    ts = cfg["ts_col"]; grp = cfg["group_col"]; y = cfg["y_col"]
    weather = cfg.get("weather", {})

    # timestamp 생성(자동)
    d = make_timestamp_auto(d, out_col=ts)

    # 시간/휴일
    d = add_time_features(d, ts)
    d = add_korean_holidays_extended(d, ts)

    # 기상 + CDD/HDD + 누적 + 스플라인
    temp_col = weather.get("temp") or "기온(°C)"
    d = add_weather_features(d, weather, ts)
    d = add_cdd_hdd(d, temp_col=temp_col, base_cdd=23, base_hdd=18)
    d = add_cdd_hdd_aggregates(d)
    d = add_extra_features_v2(d, cfg)
    d = add_piecewise_temp_features(d, temp_col=temp_col)

    # 건물유형 원-핫(있을 때만)
    if "건물유형" in d:
        d = pd.concat([d, pd.get_dummies(d["건물유형"], prefix="건물유형", dummy_na=True)], axis=1)

    d = d.sort_values([grp, ts]).reset_index(drop=True)
    return d


In [126]:
# 1) 테스트 로드(네가 쓴 경로 그대로)
PATH = r"C:\Users\user\Downloads\open (1)\test.csv"
try:
    df_test_raw = pd.read_csv(PATH)
except UnicodeDecodeError:
    df_test_raw = pd.read_csv(PATH, encoding="cp949")

# 2) 전처리 재실행
df_test_feat = build_features_for_test_safe(df_test_raw, CONFIG)
print("✅ preprocessed:", df_test_feat.shape)
display(df_test_feat.head(3))


✅ preprocessed: (16800, 36)


Unnamed: 0,num_date_time,건물번호,일시,기온(°C),강수량(mm),풍속(m/s),습도(%),timestamp,year,month,...,CDD_roll24_sum,CDD_roll168_sum,HDD_roll24_sum,HDD_roll168_sum,is_workhour,is_night,CDD23,CDD27,CDD23_x_work,CDD27_x_work
0,1_20240825 00,1,20240825 00,26.5,0.0,0.7,80.0,2024-08-25 00:00:00,2024,8,...,,,,,0,1,3.5,0.0,0.0,0.0
1,1_20240825 01,1,20240825 01,26.1,0.0,0.0,80.0,2024-08-25 01:00:00,2024,8,...,6.6,,0.0,,0,1,3.1,0.0,0.0,0.0
2,1_20240825 02,1,20240825 02,25.9,0.0,0.3,83.0,2024-08-25 02:00:00,2024,8,...,9.5,,0.0,,0,1,2.9,0.0,0.0,0.0


In [127]:
# ===========================================
# 0) 공통: 컬럼 표준화(스마트 매핑 + 자동 파싱)
#    - 건물번호/일시/날짜/시간/날씨/메타 컬럼 동명이인 처리
#    - 'num_date_time'에서 건물번호/일시 파싱
#    - '일시'나 'timestamp'만 있는 경우도 처리
# ===========================================
import re
import pandas as pd
import numpy as np

# 후보명 사전(표준명 -> 가능한 별칭들)
COLMAP_ANY = {
    "건물번호":         ["building_id","번호","bld_no","bldg","num","건물ID","건물 코드"],
    "일시":             ["datetime","date_time","측정일시","일시(UTC)","일시(KST)"],
    "날짜":             ["일자","date","dt","ymd","yymmdd","YYYYMMDD"],
    "시간":             ["시각","hour","hr","time","hh"],
    "기온(°C)":         ["기온","기온(℃)","temperature","temp","T"],
    "강수량(mm)":       ["강수량","precip","rain","prcp","강수"],
    "풍속(m/s)":        ["풍속","wind_speed","ws"],
    "습도(%)":          ["습도","rh","relative_humidity"],
    "일조(hr)":         ["일조","sunshine","sunhr"],
    "일사(MJ/m2)":      ["일사","radiation","rad","solar"],
    "전력소비량(kWh)":  ["전력사용량","전력소비량","target","y"],
    "건물유형":         ["건물 종류","type","bld_type"],
    "연면적(m2)":       ["연면적","total_area"],
    "냉방면적(m2)":     ["냉방면적","cooling_area"],
    "태양광용량(kW)":   ["태양광용량","pv_kw","pv"],
    "ESS저장용량(kWh)":["ESS저장용량","ess_kwh","ess"],
    "PCS용량(kW)":      ["PCS용량","pcs_kw","pcs"],
}

def _norm(s):  # 공백/언더스코어 제거, 소문자
    return re.sub(r"[\s_]+", "", str(s)).lower()

def _find_first(df_cols, candidates):
    cols_norm = {_norm(c): c for c in df_cols}
    # 정확히
    for c in candidates:
        k = _norm(c)
        if k in cols_norm:
            return cols_norm[k]
    # 부분 일치
    for c in candidates:
        k = _norm(c)
        for kn, orig in cols_norm.items():
            if k in kn:
                return orig
    return None

def standardize_columns(df):
    d = df.copy()
    rename = {}
    # 1) 쉬운 매핑(존재하는 별칭을 표준명으로)
    for std, aliases in COLMAP_ANY.items():
        cand = [std] + aliases
        got = _find_first(d.columns, cand)
        if got and got != std:
            rename[got] = std
    if rename:
        d = d.rename(columns=rename)

    # 2) num_date_time 같은 합본에서 분해 (예: "1_20240825 00")
    ndt = _find_first(d.columns, ["num_date_time","numdatetime","번호일시","bld_dt"])
    if ndt:
        s = d[ndt].astype(str).str.strip()
        # 앞 숫자(건물), 뒤 yyyymmdd와 시간 추출
        m = s.str.extract(r"^\s*(\d+)[_\- ]?(\d{8})\s*([0-2]?\d)?")
        if "건물번호" not in d and m[0].notna().any():
            d["건물번호"] = pd.to_numeric(m[0], errors="coerce")
        if "일시" not in d and m[1].notna().any():
            # "20240825 00" 형태로 합쳐서 보관
            hh = m[2].fillna("00").astype(str).str.zfill(2)
            d["일시"] = m[1].astype(str) + " " + hh

    # 3) timestamp 만들기 (일시 or 날짜+시간)
    if "timestamp" not in d:
        if "일시" in d:
            # "YYYYMMDD HH" / "YYYY-MM-DD HH:MM" 모두 허용
            raw = d["일시"].astype(str).str.strip()
            # "YYYYMMDD HH"인 경우, YYYY-MM-DD HH:MM으로 보정
            raw = raw.str.replace(r"^(\d{4})(\d{2})(\d{2})\s+(\d{2})$",
                                  r"\1-\2-\3 \4:00", regex=True)
            d["timestamp"] = pd.to_datetime(raw, errors="coerce")
        else:
            date_col = _find_first(d.columns, ["날짜","일자","date","dt","ymd"])
            time_col = _find_first(d.columns, ["시간","시각","hour","hr","time","hh"])
            if date_col:
                if time_col:
                    hh = d[time_col].astype(str).str.strip()
                    # 정수만 → HH:00, 아니면 HH:MM 유지
                    if pd.to_numeric(hh, errors="coerce").notna().all():
                        hh = pd.to_numeric(hh, errors="coerce").fillna(0).astype(int).clip(0,23)
                        tstr = hh.astype(str).str.zfill(2) + ":00"
                    else:
                        tstr = hh.str.extract(r"^(\d{1,2}(?::\d{2})?)")[0].fillna(hh)
                        tstr = tstr.where(tstr.str.contains(":"), tstr + ":00")
                    d["timestamp"] = pd.to_datetime(d[date_col].astype(str) + " " + tstr, errors="coerce")
                else:
                    d["timestamp"] = pd.to_datetime(d[date_col].astype(str), errors="coerce")

    # 4) 타입 정리(숫자형으로 변환)
    numeric_cols = [
        "건물번호","기온(°C)","강수량(mm)","풍속(m/s)","습도(%)","일조(hr)","일사(MJ/m2)",
        "전력소비량(kWh)","연면적(m2)","냉방면적(m2)","태양광용량(kW)","ESS저장용량(kWh)","PCS용량(kW)"
    ]
    for c in numeric_cols:
        if c in d:
            d[c] = pd.to_numeric(d[c], errors="coerce")

    return d

# ===========================================
# 1) train/test를 표준화하고(컬럼 통일), 메타 결합(테스트에 메타 없을 때)
# ===========================================
def align_train_test_columns(df_train, df_test):
    tr = standardize_columns(df_train)
    te = standardize_columns(df_test)

    # 테스트에 메타 없으면 train에서 per-건물번호 메타 복사
    meta_cols = ["건물유형","연면적(m2)","냉방면적(m2)","태양광용량(kW)","ESS저장용량(kWh)","PCS용량(kW)"]
    if "건물번호" in te:
        need = [c for c in meta_cols if c not in te.columns and c in tr.columns]
        if need:
            meta = (tr.groupby("건물번호")[need]
                      .first()
                      .reset_index())
            te = te.merge(meta, on="건물번호", how="left")
    return tr, te

# ===========================================
# 2) 사용법 (네 환경)
#    - df: 학습 데이터프레임(이미 로드됨)
#    - df_test_raw: 테스트 원본 (방금 경로에서 로드한 것)
# ===========================================
# tr_std, te_std = align_train_test_columns(df, df_test_raw)

# 확인
# print("train cols:", list(tr_std.columns))
# print("test  cols:", list(te_std.columns))


In [128]:
# 1) 학습/테스트 컬럼 표준화
tr_std, te_std = align_train_test_columns(df, df_test_raw)

# 2) 우리가 만든 테스트 전처리(동일 컨벤션) 적용
df_test_feat = build_features_for_test_safe(te_std, CONFIG)

# 3) 예측 (final_pack2가 있으면 그걸, 없으면 final_pack)
pack = final_pack2 if 'final_pack2' in globals() else final_pack
yhat_auto = predict_autoregressive(pack, te_std, CONFIG)

# 4) 제출 생성
submission = make_submission(build_features_for_any(te_std, CONFIG), yhat_auto,
                             id_cols=("건물번호","timestamp"),
                             target_name="전력소비량(kWh)")
submission.to_csv("submission_autoreg.csv", index=False)
print("💾 submission_autoreg.csv 저장 완료:", submission.shape)


KeyError: 'Column not found: 전력소비량(kWh)'

In [122]:
df

Unnamed: 0,건물번호,기온(°C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh),건물유형,연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW),날짜,시간,timestamp
0,1,18.3,0.0,2.6,82.0,0.0,0.00,5794.80,9,82912.71,77586.0,0.0,0.0,0.0,2024-06-01,0,2024-06-01 00:00:00
1,1,18.3,0.0,2.7,82.0,0.0,0.00,5591.85,9,82912.71,77586.0,0.0,0.0,0.0,2024-06-01,1,2024-06-01 01:00:00
2,1,18.1,0.0,2.6,80.0,0.0,0.00,5338.17,9,82912.71,77586.0,0.0,0.0,0.0,2024-06-01,2,2024-06-01 02:00:00
3,1,18.0,0.0,2.6,81.0,0.0,0.00,4554.42,9,82912.71,77586.0,0.0,0.0,0.0,2024-06-01,3,2024-06-01 03:00:00
4,1,17.8,0.0,1.3,81.0,0.0,0.00,3602.25,9,82912.71,77586.0,0.0,0.0,0.0,2024-06-01,4,2024-06-01 04:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100,29.1,0.0,4.4,76.0,0.4,0.18,3276.00,9,162070.24,152943.0,0.0,0.0,0.0,2024-08-24,19,2024-08-24 19:00:00
203996,100,28.6,0.0,3.7,74.0,0.0,0.00,3197.52,9,162070.24,152943.0,0.0,0.0,0.0,2024-08-24,20,2024-08-24 20:00:00
203997,100,28.3,0.0,2.9,74.0,0.0,0.00,3006.60,9,162070.24,152943.0,0.0,0.0,0.0,2024-08-24,21,2024-08-24 21:00:00
203998,100,28.0,0.0,1.7,76.0,0.0,0.00,2649.72,9,162070.24,152943.0,0.0,0.0,0.0,2024-08-24,22,2024-08-24 22:00:00


In [129]:
# ✅ 기존 함수 덮어쓰기 (차이: y 없으면 라그/롤링/잔차/빌딩히스토리 스킵)
def build_features_for_any(df_raw, cfg):
    d = df_raw.copy()
    ts, grp, y = cfg["ts_col"], cfg["group_col"], cfg["y_col"]

    if ts not in d:
        d = make_timestamp_auto(d, out_col=ts)

    # 시간/휴일
    d = add_time_features(d, ts)
    d = add_korean_holidays_extended(d, ts)

    # 기상 파생
    d = add_weather_features(d, cfg["weather"], ts)
    d = add_cdd_hdd_aggregates(add_cdd_hdd(d, cfg["weather"].get("temp") or "기온(°C)"))

    # 스플라인/추가 파생
    d = add_extra_features_v2(d, cfg)
    d = add_piecewise_temp_features(d, cfg["weather"].get("temp") or "기온(°C)")

    # ▶ y가 있을 때만 y-기반 파생 생성
    if y in d.columns:
        d = impute_and_cap_target(d, y, grp, ts)
        d = add_target_lags_rolls(d, y, grp, ts)
        d = add_same_how_history(d, y, grp, ts)
        d = add_safe_deltas(d, y)
        d = add_building_history_features(d, y, grp, ts)

    # 건물유형 원핫
    if "건물유형" in d:
        d = pd.concat([d, pd.get_dummies(d["건물유형"], prefix="건물유형", dummy_na=True)], axis=1)

    return d

In [130]:
# ✅ 기존 함수 첫 줄만 교체
def predict_autoregressive(pack, df_test_raw, cfg):
    # d = build_features_for_any(df_test_raw, cfg)   # (구) 테스트에 y 없어서 터짐
    d = build_features_for_test_safe(df_test_raw, cfg)  # (신) y 없이도 안전 전처리
    ts, grp, y = cfg["ts_col"], cfg["group_col"], cfg["y_col"]
    ...
    # (아래 로직은 그대로; needed y-피처를 NaN으로 만들고 오토리그레시브로 채움)

In [132]:
import numpy as np
import pandas as pd
from collections import deque, defaultdict

def predict_autoregressive_v2(pack, df_test_raw, cfg):
    """
    테스트셋에 y가 없어도 순차적으로 y-기반 피처를 예측값으로 채우며 진행.
    반환: pd.Series (index=df_feat_test.index), 보정(calibration)까지 적용된 예측
    """
    # 1) y 없이도 안전한 전처리
    d = build_features_for_test_safe(df_test_raw, cfg)  # 반드시 이 함수 사용!
    ts, grp, y = cfg["ts_col"], cfg["group_col"], cfg["y_col"]

    # 2) 모델이 실제로 요구하는 y-기반 피처만 골라 생성(없으면 건너뜀)
    fc = pack["feat_cols"]
    req = [
        f"{y}_lag1", f"{y}_lag24", f"{y}_lag168",
        f"{y}_roll3", f"{y}_roll6", f"{y}_roll24", f"{y}_roll168",
        f"{y}_ewm_alpha005",
        f"{y}_how_mean3", f"{y}_how_mean8",
        f"{y}_lag1_minus_how3", f"{y}_lag24_minus_how8",
        f"{y}_chg1_from24", f"{y}_ratio1_24",
        f"{y}_chg24_from168", f"{y}_ratio24_168",
    ]
    needed = [c for c in req if c in fc]
    for c in needed:
        if c not in d:
            d[c] = np.nan

    # 3) 순차 예측 상태(건물별)
    def init_state():
        return {
            "last": None,
            "lag3": deque(maxlen=3),
            "lag6": deque(maxlen=6),
            "lag24": deque(maxlen=24),
            "lag168": deque(maxlen=168),
            "ewm": None,
            "how_buf": defaultdict(lambda: deque(maxlen=8))  # key=hour_of_week
        }
    states = {}

    # 내부 예측 (멀티시드/단일 둘 다 지원)
    def _predict_pack_one(pack, row_df):
        fc = pack["feat_cols"]
        Xn = align_features(row_df, fc)
        if "po_models" in pack:  # 멀티시드 팩
            p_po = np.mean([m.predict(Xn) for m in pack["po_models"]], axis=0)
            p_mse = pack["mse"].predict(Xn)
            pred = pack["alpha"]*p_po + (1-pack["alpha"])*p_mse
        else:  # 단일 팩
            p_po  = pack.get("poisson").predict(Xn) if pack.get("poisson") else 0.0
            p_mse = pack.get("mse").predict(Xn) if pack.get("mse") else 0.0
            a = float(pack.get("alpha", 1.0))
            pred = a*p_po + (1-a)*p_mse
        pred = np.maximum(0.0, pred)
        return float(pred[0])

    # 4) 정렬 후 순차 진행
    d = d.sort_values([grp, ts]).copy()
    preds = {}

    for b, sub in d.groupby(grp, sort=False):
        st = states.setdefault(b, init_state())
        for i in sub.index:
            # hour_of_week
            hour = int(d.at[i, "hour"]) if "hour" in d else None
            dow  = int(d.at[i, "dow"])  if "dow"  in d else None
            how  = (dow * 24 + hour) if (hour is not None and dow is not None) else None

            # 필요한 y-피처만 채우기
            if f"{y}_lag1" in needed:   d.at[i, f"{y}_lag1"]   = st["last"]
            if f"{y}_lag24" in needed:  d.at[i, f"{y}_lag24"]  = (st["lag24"][0]  if len(st["lag24"])==24   else np.nan)
            if f"{y}_lag168" in needed: d.at[i, f"{y}_lag168"] = (st["lag168"][0] if len(st["lag168"])==168 else np.nan)

            if f"{y}_roll3"   in needed: d.at[i, f"{y}_roll3"]   = (np.mean(st["lag3"])   if len(st["lag3"])>=2   else np.nan)
            if f"{y}_roll6"   in needed: d.at[i, f"{y}_roll6"]   = (np.mean(st["lag6"])   if len(st["lag6"])>=3   else np.nan)
            if f"{y}_roll24"  in needed: d.at[i, f"{y}_roll24"]  = (np.mean(st["lag24"])  if len(st["lag24"])>=6  else np.nan)
            if f"{y}_roll168" in needed: d.at[i, f"{y}_roll168"] = (np.mean(st["lag168"]) if len(st["lag168"])>=24 else np.nan)
            if f"{y}_ewm_alpha005" in needed: d.at[i, f"{y}_ewm_alpha005"] = st["ewm"]

            if how is not None:
                if f"{y}_how_mean3" in needed:
                    d.at[i, f"{y}_how_mean3"] = (np.mean(list(st["how_buf"][how])[-3:]) if len(st["how_buf"][how])>=1 else np.nan)
                if f"{y}_how_mean8" in needed:
                    d.at[i, f"{y}_how_mean8"] = (np.mean(list(st["how_buf"][how])[-8:]) if len(st["how_buf"][how])>=2 else np.nan)

            # 델타/비율
            lag1   = d.at[i, f"{y}_lag1"]   if f"{y}_lag1"   in d else np.nan
            lag24  = d.at[i, f"{y}_lag24"]  if f"{y}_lag24"  in d else np.nan
            lag168 = d.at[i, f"{y}_lag168"] if f"{y}_lag168" in d else np.nan
            eps = 1e-6
            if f"{y}_chg1_from24" in needed:
                d.at[i, f"{y}_chg1_from24"] = (lag1 - lag24) if pd.notna(lag1) and pd.notna(lag24) else np.nan
            if f"{y}_ratio1_24" in needed:
                d.at[i, f"{y}_ratio1_24"]   = (lag1 / (abs(lag24)+eps)) if pd.notna(lag1) and pd.notna(lag24) and abs(lag24)>0 else np.nan
            if f"{y}_chg24_from168" in needed:
                d.at[i, f"{y}_chg24_from168"] = (lag24 - lag168) if pd.notna(lag24) and pd.notna(lag168) else np.nan
            if f"{y}_ratio24_168" in needed:
                d.at[i, f"{y}_ratio24_168"]   = (lag24 / (abs(lag168)+eps)) if pd.notna(lag24) and pd.notna(lag168) and abs(lag168)>0 else np.nan

            # 잔차형(필요할 때만)
            if f"{y}_lag1_minus_how3" in needed:
                val = np.nan
                if f"{y}_lag1" in d and f"{y}_how_mean3" in d:
                    v1 = d.at[i, f"{y}_lag1"]; v2 = d.at[i, f"{y}_how_mean3"]
                    val = (v1 - v2) if pd.notna(v1) and pd.notna(v2) else np.nan
                d.at[i, f"{y}_lag1_minus_how3"] = val
            if f"{y}_lag24_minus_how8" in needed:
                val = np.nan
                if f"{y}_lag24" in d and f"{y}_how_mean8" in d:
                    v1 = d.at[i, f"{y}_lag24"]; v2 = d.at[i, f"{y}_how_mean8"]
                    val = (v1 - v2) if pd.notna(v1) and pd.notna(v2) else np.nan
                d.at[i, f"{y}_lag24_minus_how8"] = val

            # 한 행 예측
            yhat = _predict_pack_one(pack, d.loc[[i]])
            preds[i] = yhat

            # 상태 업데이트
            st["last"] = yhat
            st["lag3"].append(yhat); st["lag6"].append(yhat)
            st["lag24"].append(yhat); st["lag168"].append(yhat)
            st["ewm"] = yhat if st["ewm"] is None else (0.05*yhat + 0.95*st["ewm"])
            if how is not None:
                st["how_buf"][how].append(yhat)

    # 5) 시리즈 구성(원 인덱스 정렬 유지)
    pred_series = pd.Series(preds, name=y, dtype="float64").sort_index()
    # 보정 적용
    pred_adj = apply_group_calibration(d, pred_series, pack.get("cal_coefs", {}), cfg)
    # 최종 안전 장치
    pred_adj = pred_adj.astype("float64")
    return pred_adj


In [133]:
def make_submission_safe(df_feat_new, yhat, id_cols=("건물번호","timestamp"), target_name="전력소비량(kWh)"):
    # yhat: Series/ndarray/list 모두 허용
    if isinstance(yhat, pd.Series):
        y_vals = yhat.values
    else:
        y_vals = np.asarray(yhat)
    if len(y_vals) != len(df_feat_new):
        # 길이가 다르면 공통 인덱스 기준으로 맞춤 시도
        common = df_feat_new.index.intersection(getattr(yhat, "index", df_feat_new.index))
        sub = df_feat_new.loc[common, list(id_cols)].copy().reset_index(drop=True)
        y_vals = (yhat.loc[common].values if isinstance(yhat, pd.Series) else y_vals[:len(common)])
    else:
        sub = df_feat_new.loc[:, list(id_cols)].copy().reset_index(drop=True)
    sub[target_name] = y_vals
    return sub


In [134]:
# 1) train/test 표준화
tr_std, te_std = align_train_test_columns(df, df_test_raw)

# 2) 테스트 전처리
df_test_feat = build_features_for_test_safe(te_std, CONFIG)

# 3) 예측 (멀티시드 있으면 그걸 사용)
pack = final_pack2 if 'final_pack2' in globals() else final_pack
yhat_auto = predict_autoregressive_v2(pack, te_std, CONFIG)   # ← v2 사용!

# 4) 제출
submission = make_submission_safe(df_test_feat, yhat_auto,
                                  id_cols=("건물번호","timestamp"),
                                  target_name="전력소비량(kWh)")
submission.to_csv("submission_autoreg.csv", index=False)
print("💾 submission_autoreg.csv 저장 완료:", submission.shape)
display(submission.head())


💾 submission_autoreg.csv 저장 완료: (16800, 3)


Unnamed: 0,건물번호,timestamp,전력소비량(kWh)
0,1,2024-08-25 00:00:00,1524.168508
1,1,2024-08-25 01:00:00,941.39049
2,1,2024-08-25 02:00:00,434.923016
3,1,2024-08-25 03:00:00,156.938367
4,1,2024-08-25 04:00:00,15.934646


In [None]:
# 테스트 원본 로드 후
# df_test_raw = pd.read_csv("test.csv")  # 경로는 너의 환경에 맞춰
yhat_auto = predict_autoregressive(final_pack2 if 'final_pack2' in globals() else final_pack,
                                   df_test_raw, CONFIG)
submission = make_submission(build_features_for_any(df_test_raw, CONFIG), yhat_auto,
                             id_cols=("건물번호","timestamp"), target_name="전력소비량(kWh)")
submission.to_csv("submission_autoreg.csv", index=False)
print("💾 submission_autoreg.csv 저장 완료:", submission.shape)
