In [2]:
import pandas as pd

# 파일 경로
train_path = r"C:\Users\user\Downloads\open (1)\train.csv"
building_info_path = r"C:\Users\user\Downloads\open (1)\building_info.csv"

# CSV 불러오기
train_df = pd.read_csv(train_path)
building_info_df = pd.read_csv(building_info_path)

# 병합 (건물번호 기준)
merged_df = pd.merge(train_df, building_info_df, on='건물번호', how='left')

# 결과 저장
merged_df.to_csv(r"C:\Users\user\Downloads\open (1)\merged_train.csv", index=False, encoding = 'cp949')

print("병합 완료! merged_train.csv로 저장됨")


병합 완료! merged_train.csv로 저장됨


In [3]:
def read_csv_smart(path):
    import pandas as pd
    for enc in ['cp949', 'utf-8-sig', 'utf-8', 'euc-kr']:
        try:
            return pd.read_csv(path, encoding=enc)
        except UnicodeDecodeError:
            continue
    # 최후의 보루: 깨지는 글자는 � 로 대체
    return pd.read_csv(path, encoding='utf-8', errors='replace')

In [4]:
# CSV 불러오기 (예시)
df = read_csv_smart("C:\\Users\\user\\Downloads\\open (1)\\merged_train.csv")


# '일시'를 문자열로 변환 후 날짜와 시간 분리
df['일시'] = df['일시'].astype(str)

# 날짜(YYYYMMDD)와 시간(HH) 분리
df['날짜'] = df['일시'].str.slice(0, 8)     # 앞 8자리 → 날짜
df['시간'] = df['일시'].str.slice(9, 11)    # 9~10번째 자리 → 시간

# 날짜를 datetime 형식으로 변환
df['날짜'] = pd.to_datetime(df['날짜'], format='%Y%m%d')
df['시간'] = df['시간'].astype(int)

# 확인
print(df[['일시', '날짜', '시간']].head())



            일시         날짜  시간
0  20240601 00 2024-06-01   0
1  20240601 01 2024-06-01   1
2  20240601 02 2024-06-01   2
3  20240601 03 2024-06-01   3
4  20240601 04 2024-06-01   4


In [5]:
df

Unnamed: 0,num_date_time,건물번호,일시,기온(°C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh),건물유형,연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW),날짜,시간
0,1_20240601 00,1,20240601 00,18.3,0.0,2.6,82.0,0.0,0.00,5794.80,호텔,82912.71,77586.0,-,-,-,2024-06-01,0
1,1_20240601 01,1,20240601 01,18.3,0.0,2.7,82.0,0.0,0.00,5591.85,호텔,82912.71,77586.0,-,-,-,2024-06-01,1
2,1_20240601 02,1,20240601 02,18.1,0.0,2.6,80.0,0.0,0.00,5338.17,호텔,82912.71,77586.0,-,-,-,2024-06-01,2
3,1_20240601 03,1,20240601 03,18.0,0.0,2.6,81.0,0.0,0.00,4554.42,호텔,82912.71,77586.0,-,-,-,2024-06-01,3
4,1_20240601 04,1,20240601 04,17.8,0.0,1.3,81.0,0.0,0.00,3602.25,호텔,82912.71,77586.0,-,-,-,2024-06-01,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100_20240824 19,100,20240824 19,29.1,0.0,4.4,76.0,0.4,0.18,3276.00,호텔,162070.24,152943.0,-,-,-,2024-08-24,19
203996,100_20240824 20,100,20240824 20,28.6,0.0,3.7,74.0,0.0,0.00,3197.52,호텔,162070.24,152943.0,-,-,-,2024-08-24,20
203997,100_20240824 21,100,20240824 21,28.3,0.0,2.9,74.0,0.0,0.00,3006.60,호텔,162070.24,152943.0,-,-,-,2024-08-24,21
203998,100_20240824 22,100,20240824 22,28.0,0.0,1.7,76.0,0.0,0.00,2649.72,호텔,162070.24,152943.0,-,-,-,2024-08-24,22


In [6]:
# 결측치 대체할 컬럼 목록
cols = ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']

# '-'를 0으로 바꾸고 숫자형으로 변환
for col in cols:
    df[col] = df[col].replace('-', 0).astype(float)

In [7]:
df = df.drop(columns=['num_date_time', '일시'])

In [8]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['건물유형'] = le.fit_transform(df['건물유형'])
df['날짜'] = pd.to_datetime(df['날짜'])

In [9]:
import pandas as pd
import numpy as np

def make_features(df: pd.DataFrame) -> pd.DataFrame:
    # 0) 기본 정렬 & datetime 만들기
    #    (이미 df['날짜']와 df['시간']이 있다면 그대로 쓰되, 한 줄짜리 datetime을 만들어두면 편함)
    df = df.copy()
    df['날짜'] = pd.to_datetime(df['날짜'])
    df['dt'] = df['날짜'] + pd.to_timedelta(df['시간'], unit='h')
    df = df.sort_values(['건물번호', 'dt']).reset_index(drop=True)

    # -------------------------------------------------------
    # 1) 최근 24시간 평균, 최근 7일(같은 시각) 평균  → 전부 "과거만" 보도록 shift 사용
    # -------------------------------------------------------
    grp = df.groupby('건물번호', group_keys=False)

    # (a) 최근 24시간 평균 (전력소비량 기준)
    #  - window=24, past-only를 위해 shift(1) 후 rolling
    df['cons_lag1'] = grp['전력소비량(kWh)'].shift(1)
    df['cons_mean_24h'] = grp['cons_lag1'].rolling(window=24, min_periods=1).mean()

    # (b) 최근 7일 같은 시각 평균 (24시간 간격으로 7개)
    #  - 1일 전 같은 시각부터 7일 전 같은 시각까지 평균
    same_hour_lag = grp['전력소비량(kWh)'].shift(24)
    df['cons_samehour_mean_7d'] = same_hour_lag.rolling(window=7, min_periods=1).mean()

    # 참고로 모델에 바로 쓰진 않아도 되는 추가 라그들(원하면 활성화)
    df['cons_lag_24h'] = grp['전력소비량(kWh)'].shift(24)
    df['cons_lag_48h'] = grp['전력소비량(kWh)'].shift(48)
    df['cons_lag_72h'] = grp['전력소비량(kWh)'].shift(72)
    df['cons_lag_168h'] = grp['전력소비량(kWh)'].shift(168)  # 7일 전 같은 시각

    # -------------------------------------------------------
    # 2) 기온·일사 기반 냉방 수요 지표 (CDD류)
    # -------------------------------------------------------
    # 한국 여름 기준 base temp 24°C 가합리(필요시 23~26으로 튜닝)
    base_temp = 24.0
    # ‘냉방도수’(Cooling Degree) 시간 단위
    df['CDD'] = (df['기온(°C)'] - base_temp).clip(lower=0)

    # 일사량(MJ/m2)과의 상호작용: 햇볕이 강할수록 체감 부하↑
    # 일사량이 0~상위 99퍼센타일 사이로 정규화(robust)
    q99 = df['일사(MJ/m2)'].quantile(0.99)
    rad_norm = (df['일사(MJ/m2)'].clip(upper=q99) / (q99 + 1e-6))
    df['CDD_x_rad'] = df['CDD'] * rad_norm

    # 습도(%)와의 상호작용: 습도가 높으면 동일 온도에서도 냉방 부하↑
    # 간단히 (1 + 습도/100*알파) 가중. 알파=0.3 정도로 시작(튜닝 가능)
    alpha = 0.3
    df['CDD_humid_adj'] = df['CDD'] * (1 + alpha * (df['습도(%)'] / 100.0))

    # -------------------------------------------------------
    # 3) 주말/평일, 공휴일
    # -------------------------------------------------------
    df['weekday'] = df['dt'].dt.weekday  # 월=0 ... 일=6
    df['is_weekend'] = (df['weekday'] >= 5).astype(int)

    # 2024-06~08 사이 한국 공휴일: 현충일(6/6), 광복절(8/15)
    kr_holidays = {
        pd.Timestamp(2024, 6, 6),  # 현충일
        pd.Timestamp(2024, 8, 15), # 광복절
    }
    df['is_holiday'] = df['날짜'].isin(kr_holidays).astype(int)

    # -------------------------------------------------------
    # 4) 태양광·ESS·PCS 용량 대비 “동작 가능성” 지표
    #    (실제 제어 로그가 없으니 ‘가능성/잠재력’을 피처로 넣는다)
    # -------------------------------------------------------
    # 설비 유무 이진
    df['has_pv'] = (df['태양광용량(kW)'] > 0).astype(int)
    df['has_ess'] = (df['ESS저장용량(kWh)'] > 0).astype(int)
    df['has_pcs'] = (df['PCS용량(kW)'] > 0).astype(int)

    # 낮/밤 플래그 (대략 일사량>0이면 주간으로 간주)
    df['is_daylight'] = (df['일사(MJ/m2)'] > 0).astype(int)

    # 태양광 ‘동작 가능성’ (설비 있고 + 주간/일사>0)
    df['pv_active_potential'] = ((df['has_pv'] == 1) & (df['is_daylight'] == 1)).astype(int)

    # 피크/오프피크 (현실 요금제와 다를 수 있지만 합리적 초기값)
    df['is_offpeak'] = df['시간'].isin([0,1,2,3,4,5,6,23]).astype(int)
    df['is_peak']    = df['시간'].isin([13,14,15,16,17]).astype(int)

    # ESS 충방전 ‘가능성’ 피처
    df['ess_charge_potential']   = ((df['has_ess']==1) & (df['is_offpeak']==1)).astype(int)
    df['ess_discharge_potential'] = ((df['has_ess']==1) & (df['is_peak']==1)).astype(int)

    # 용량 스케일 자체도 피처로 사용(로그 스케일로 완만화; 0은 0으로 유지)
    for c in ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']:
        df[f'log1p_{c}'] = np.log1p(df[c])

    # 누설 방지: 타깃 기반 비율은 과거 라그로만 계산
    # ESS 대비 부하 비율(전일 같은 시각 소비량 사용)
    df['ess_to_load_lag_ratio'] = np.where(
        df['cons_lag_24h'].notna(),
        df['ESS저장용량(kWh)'] / (df['cons_lag_24h'] + 1e-6),
        np.nan
    )

    # -------------------------------------------------------
    # 5) 기타 유틸리티 파생
    # -------------------------------------------------------
    df['month'] = df['dt'].dt.month
    df['hour']  = df['시간']  # 가독성용 복사
    df['dayofyear'] = df['dt'].dt.dayofyear

    # 모델 입력 전에 의미 없는 원본(또는 중복) 컬럼 정리 원하면 아래 사용
    # drop_cols = ['dt']  # 학습 시 굳이 안 써도 되면 제거
    # df = df.drop(columns=drop_cols)

    return df

# 사용 예시:
# df_feat = make_features(df)
# df_feat.head()


In [10]:
# =========================
# 0. 라이브러리 & 경로 설정
# =========================
import os
import numpy as np
import pandas as pd

# pip install lightgbm 먼저 (처음 1번만)
# pip install lightgbm
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error

DATA_DIR = r"C:\Users\user\Downloads\open (1)"
TRAIN_MERGED_PATH = os.path.join(DATA_DIR, "merged_train.csv")
TEST_PATH  = os.path.join(DATA_DIR, "test.csv")
BUILD_PATH = os.path.join(DATA_DIR, "building_info.csv")
SAMPLE_SUB = os.path.join(DATA_DIR, "sample_submission.csv")
OUT_SUB    = os.path.join(DATA_DIR, "baseline_lgbm_submission.csv")


In [11]:
# ===== IMPORTS (필요시 중복 있어도 무방) =====
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
from pandas.api.types import is_numeric_dtype, is_categorical_dtype

# ===== 피처 선택 헬퍼: 숫자/카테고리만 남기기 + 불필요 컬럼 드롭 =====
def get_feature_cols(df: pd.DataFrame) -> list:
    base_drop = ['전력소비량(kWh)', 'dt', '날짜', '시간', '일시', 'num_date_time']
    cols = [c for c in df.columns if c not in base_drop]
    cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]
    return cols

# =========================
# 1. 유틸 함수들
# =========================
def ensure_datetime_cols(df: pd.DataFrame) -> pd.DataFrame:
    """일시 → 날짜/시간 분리(또는 이미 분리돼 있으면 그대로) + dt 생성"""
    df = df.copy()
    if '날짜' not in df.columns or '시간' not in df.columns:
        if '일시' in df.columns:
            s = df['일시'].astype(str)
            df['날짜'] = pd.to_datetime(s.str.slice(0, 8), format='%Y%m%d')
            df['시간'] = s.str.slice(9, 11).astype(int)
        else:
            raise ValueError("날짜/시간 정보가 없습니다. ('일시' 또는 '날짜','시간' 필요)")
    else:
        df['날짜'] = pd.to_datetime(df['날짜'])
        df['시간'] = df['시간'].astype(int)
    df['dt'] = df['날짜'] + pd.to_timedelta(df['시간'], unit='h')
    return df

def clean_capacity_fields(df: pd.DataFrame) -> pd.DataFrame:
    """설비 용량에 '-'가 있으면 0으로 치환 후 float 변환"""
    df = df.copy()
    cols = ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']
    for c in cols:
        if c in df.columns:
            df[c] = df[c].replace('-', 0).astype(float)
    return df

def rmse(y_true, y_pred):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.sqrt(mean_squared_error(y_true, y_pred))

# =========================
# 2. 특징 엔지니어링(수정 버전)
# =========================
def make_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    - 미래정보 누설 방지: 전부 shift/rolling(=transform)로 과거만 사용
    - groupby().rolling() 대신 groupby().transform(...) 사용
    """
    df = df.copy()

    # 일사/일조 안전 처리(테스트에 없을 수 있음)
    if '일사(MJ/m2)' not in df.columns:
        df['일사(MJ/m2)'] = 0.0
    if '일조(hr)' not in df.columns:
        df['일조(hr)'] = 0.0

    # 정렬
    df = df.sort_values(['건물번호', 'dt']).reset_index(drop=True)

    # 그룹 객체
    grp = df.groupby('건물번호', sort=False)

    # ---------- 타깃 라그 ----------
    if '전력소비량(kWh)' in df.columns:
        df['cons_lag1']     = grp['전력소비량(kWh)'].shift(1)
        df['cons_lag_24h']  = grp['전력소비량(kWh)'].shift(24)
        df['cons_lag_48h']  = grp['전력소비량(kWh)'].shift(48)
        df['cons_lag_72h']  = grp['전력소비량(kWh)'].shift(72)
        df['cons_lag_168h'] = grp['전력소비량(kWh)'].shift(168)  # 7일 전
        # 최근 24시간 평균 (과거만 참고)
        df['cons_mean_24h'] = grp['전력소비량(kWh)'] \
            .transform(lambda s: s.shift(1).rolling(window=24, min_periods=1).mean())
        # 최근 7일 같은 시각 평균
        df['cons_samehour_mean_7d'] = grp['전력소비량(kWh)'] \
            .transform(lambda s: s.shift(24).rolling(window=7, min_periods=1).mean())
    else:
        for c in ['cons_lag1','cons_lag_24h','cons_lag_48h','cons_lag_72h','cons_lag_168h',
                  'cons_mean_24h','cons_samehour_mean_7d']:
            df[c] = np.nan

    # ---------- 냉방 수요 지표 ----------
    base_temp = 24.0
    df['CDD'] = (df['기온(°C)'] - base_temp).clip(lower=0)

    q99 = df['일사(MJ/m2)'].quantile(0.99) + 1e-6
    rad_norm = (df['일사(MJ/m2)'].clip(upper=q99) / q99)
    df['CDD_x_rad'] = df['CDD'] * rad_norm

    alpha = 0.3
    df['CDD_humid_adj'] = df['CDD'] * (1 + alpha * (df['습도(%)'] / 100.0))

    # ---------- 달력 피처 ----------
    df['weekday']    = df['dt'].dt.weekday
    df['is_weekend'] = (df['weekday'] >= 5).astype(int)
    kr_holidays = {pd.Timestamp(2024,6,6), pd.Timestamp(2024,8,15)}
    df['is_holiday'] = df['날짜'].isin(kr_holidays).astype(int)

    # ---------- 설비 가능성 피처 ----------
    df['has_pv']  = (df['태양광용량(kW)']  > 0).astype(int)
    df['has_ess'] = (df['ESS저장용량(kWh)'] > 0).astype(int)
    df['has_pcs'] = (df['PCS용량(kW)']    > 0).astype(int)
    df['is_daylight'] = (df['일사(MJ/m2)'] > 0).astype(int)

    df['is_offpeak'] = df['시간'].isin([0,1,2,3,4,5,6,23]).astype(int)
    df['is_peak']    = df['시간'].isin([13,14,15,16,17]).astype(int)

    df['ess_charge_potential']    = ((df['has_ess']==1) & (df['is_offpeak']==1)).astype(int)
    df['ess_discharge_potential'] = ((df['has_ess']==1) & (df['is_peak']==1)).astype(int)

    for c in ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']:
        df[f'log1p_{c}'] = np.log1p(df[c])

    df['ess_to_load_lag_ratio'] = np.where(
        df.get('cons_lag_24h', pd.Series(index=df.index)).notna(),
        df['ESS저장용량(kWh)'] / (df['cons_lag_24h'] + 1e-6),
        np.nan
    )

    # ---------- 기타 ----------
    df['month'] = df['dt'].dt.month
    df['hour']  = df['시간']
    df['dayofyear'] = df['dt'].dt.dayofyear

    return df

# =========================
# 3. 학습 데이터 로드 & 피처 생성
# =========================
train_df = read_csv_smart(TRAIN_MERGED_PATH)
train_df = clean_capacity_fields(train_df)
train_df = ensure_datetime_cols(train_df)
train_feat = make_features(train_df)

# =========================
# 4. 시계열 검증 분할 (2024-08-17 ~ 2024-08-24)
# =========================
VAL_START = pd.Timestamp(2024,8,17,0)
VAL_END   = pd.Timestamp(2024,8,24,23)

is_val = (train_feat['dt'] >= VAL_START) & (train_feat['dt'] <= VAL_END)
train_part = train_feat[~is_val].copy()
valid_part = train_feat[ is_val].copy()

# === 피처 선택 (object/문자열 배제) ===
features = get_feature_cols(train_feat)
target_col = '전력소비량(kWh)'

# 카테고리 지정(피처에 포함된 컬럼만)
cat_cols = [c for c in ['건물번호','건물유형'] if c in features]
for c in cat_cols:
    train_part[c] = train_part[c].astype('category')
    valid_part[c] = valid_part[c].astype('category')

X_tr, y_tr = train_part[features], train_part[target_col]
X_va, y_va = valid_part[features], valid_part[target_col]

# =========================
# 5. LightGBM 학습 & 검증 (콜백 방식 조기종료)
# =========================
lgb_train = lgb.Dataset(X_tr, label=y_tr, categorical_feature=cat_cols or None)
lgb_valid = lgb.Dataset(X_va, label=y_va, categorical_feature=cat_cols or None)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 64,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 1,
    'min_data_in_leaf': 50,
    'lambda_l2': 1.0,
    'seed': 42,
    'verbosity': -1
}

callbacks = [
    lgb.early_stopping(stopping_rounds=200),
    lgb.log_evaluation(period=200)
]

model = lgb.train(
    params,
    lgb_train,
    num_boost_round=5000,
    valid_sets=[lgb_train, lgb_valid],
    valid_names=['train', 'valid'],
    callbacks=callbacks
)

pred_va = model.predict(X_va, num_iteration=model.best_iteration)
print("VALID RMSE:", rmse(y_va, pred_va))
print("VALID MAE :", mean_absolute_error(y_va, pred_va))



  cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]


Training until validation scores don't improve for 200 rounds
[200]	train's rmse: 173.366	valid's rmse: 230.791
[400]	train's rmse: 154.914	valid's rmse: 221.188
[600]	train's rmse: 143.383	valid's rmse: 216.656
[800]	train's rmse: 134.649	valid's rmse: 214.773
[1000]	train's rmse: 127.614	valid's rmse: 213.857
[1200]	train's rmse: 121.745	valid's rmse: 213.28
[1400]	train's rmse: 116.665	valid's rmse: 213.123
[1600]	train's rmse: 112.139	valid's rmse: 212.795
[1800]	train's rmse: 108.081	valid's rmse: 212.451
Early stopping, best iteration is:
[1739]	train's rmse: 109.199	valid's rmse: 212.38
VALID RMSE: 212.38019335781
VALID MAE : 101.67560328341894


In [12]:
def backfill_solar_by_time(all_df: pd.DataFrame) -> pd.DataFrame:
    # dt/시간 보장
    if 'dt' not in all_df.columns:
        raise ValueError("dt 없으면 ensure_datetime_cols 먼저 호출")
    all_df = all_df.copy()
    all_df['month'] = all_df['dt'].dt.month
    all_df['hour']  = all_df['시간']

    # train/test 구분: 타깃 존재 여부로 판별
    is_train = all_df['전력소비량(kWh)'].notna() if '전력소비량(kWh)' in all_df.columns else pd.Series(False, index=all_df.index)

    # 기준 통계 (train에서만)
    rad_ref = all_df.loc[is_train].groupby(['month','hour'])['일사(MJ/m2)'].median()
    sun_ref = all_df.loc[is_train].groupby(['month','hour'])['일조(hr)'].median()

    # index 매칭해서 test 행만 채움
    idx = pd.MultiIndex.from_frame(all_df[['month','hour']])
    fill_rad = rad_ref.reindex(idx).values
    fill_sun = sun_ref.reindex(idx).values

    need_fill_rad = (~is_train) & (all_df['일사(MJ/m2)'].isna() | (all_df['일사(MJ/m2)'] == 0))
    need_fill_sun = (~is_train) & (all_df['일조(hr)'].isna()     | (all_df['일조(hr)'] == 0))

    all_df.loc[need_fill_rad, '일사(MJ/m2)'] = fill_rad[need_fill_rad.values]
    all_df.loc[need_fill_sun, '일조(hr)']    = fill_sun[need_fill_sun.values]

    # 혹시라도 남은 결측은 0으로
    all_df['일사(MJ/m2)'] = all_df['일사(MJ/m2)'].fillna(0)
    all_df['일조(hr)']    = all_df['일조(hr)'].fillna(0)
    return all_df


In [13]:
# train + test concat
all_df = pd.concat([train_df, test_df], ignore_index=True)
all_df = clean_capacity_fields(all_df)
all_df = ensure_datetime_cols(all_df)

# ★ 추가
all_df = backfill_solar_by_time(all_df)

# 그 다음 피처 생성
all_feat = make_features(all_df)


NameError: name 'test_df' is not defined

In [14]:
# make_features() 끝부분에 추가
df['hour_sin'] = np.sin(2*np.pi*df['시간']/24)
df['hour_cos'] = np.cos(2*np.pi*df['시간']/24)


In [15]:
def make_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    - 미래정보 누설 방지: shift/rolling 모두 과거만 사용
    - groupby().transform(...) 으로 인덱스 정렬 유지
    """
    df = df.copy()

    # 일사/일조 안전 처리(테스트에 없을 수 있음)
    if '일사(MJ/m2)' not in df.columns:
        df['일사(MJ/m2)'] = 0.0
    if '일조(hr)' not in df.columns:
        df['일조(hr)'] = 0.0

    # 정렬 후 그룹 객체 생성
    df = df.sort_values(['건물번호', 'dt']).reset_index(drop=True)
    grp = df.groupby('건물번호', sort=False)  # ← 여기서 grp 정의

    # ---------- 타깃 라그 & 롤링 ----------
    if '전력소비량(kWh)' in df.columns:
        df['cons_lag1']     = grp['전력소비량(kWh)'].shift(1)
        df['cons_lag_24h']  = grp['전력소비량(kWh)'].shift(24)
        df['cons_lag_48h']  = grp['전력소비량(kWh)'].shift(48)
        df['cons_lag_72h']  = grp['전력소비량(kWh)'].shift(72)
        df['cons_lag_168h'] = grp['전력소비량(kWh)'].shift(168)  # 7일 전

        # 최근 24시간 평균(과거만; shift(1) 후 rolling)
        df['cons_mean_24h'] = grp['전력소비량(kWh)'] \
            .transform(lambda s: s.shift(1).rolling(window=24, min_periods=1).mean())

        # 최근 7일 같은 시각 평균(24시간 간격 7개)
        df['cons_samehour_mean_7d'] = grp['전력소비량(kWh)'] \
            .transform(lambda s: s.shift(24).rolling(window=7, min_periods=1).mean())

        # ✅ 최근 24시간 표준편차(변동성) — cons_lag1(과거값) 기반
        df['cons_std_24h'] = grp['cons_lag1'] \
            .transform(lambda s: s.rolling(window=24, min_periods=6).std())
    else:
        for c in ['cons_lag1','cons_lag_24h','cons_lag_48h','cons_lag_72h','cons_lag_168h',
                  'cons_mean_24h','cons_samehour_mean_7d','cons_std_24h']:
            df[c] = np.nan

    # ---------- 냉방 수요 지표 ----------
    base_temp = 24.0
    df['CDD'] = (df['기온(°C)'] - base_temp).clip(lower=0)

    q99 = df['일사(MJ/m2)'].quantile(0.99) + 1e-6
    rad_norm = (df['일사(MJ/m2)'].clip(upper=q99) / q99)
    df['CDD_x_rad'] = df['CDD'] * rad_norm

    alpha = 0.3
    df['CDD_humid_adj'] = df['CDD'] * (1 + alpha * (df['습도(%)'] / 100.0))

    # ---------- 달력 피처 ----------
    df['weekday']    = df['dt'].dt.weekday
    df['is_weekend'] = (df['weekday'] >= 5).astype(int)
    kr_holidays = {pd.Timestamp(2024,6,6), pd.Timestamp(2024,8,15)}
    df['is_holiday'] = df['날짜'].isin(kr_holidays).astype(int)

    # ---------- 설비 가능성 피처 ----------
    df['has_pv']  = (df['태양광용량(kW)']  > 0).astype(int)
    df['has_ess'] = (df['ESS저장용량(kWh)'] > 0).astype(int)
    df['has_pcs'] = (df['PCS용량(kW)']    > 0).astype(int)
    df['is_daylight'] = (df['일사(MJ/m2)'] > 0).astype(int)
    df['is_offpeak'] = df['시간'].isin([0,1,2,3,4,5,6,23]).astype(int)
    df['is_peak']    = df['시간'].isin([13,14,15,16,17]).astype(int)
    df['ess_charge_potential']    = ((df['has_ess']==1) & (df['is_offpeak']==1)).astype(int)
    df['ess_discharge_potential'] = ((df['has_ess']==1) & (df['is_peak']==1)).astype(int)

    for c in ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']:
        df[f'log1p_{c}'] = np.log1p(df[c])

    df['ess_to_load_lag_ratio'] = np.where(
        df.get('cons_lag_24h', pd.Series(index=df.index)).notna(),
        df['ESS저장용량(kWh)'] / (df['cons_lag_24h'] + 1e-6),
        np.nan
    )

    # ---------- 기타 ----------
    df['month'] = df['dt'].dt.month
    df['hour']  = df['시간']
    df['dayofyear'] = df['dt'].dt.dayofyear

    # (옵션) 시간 사이클릭 인코딩 원하면 활성화
    # df['hour_sin'] = np.sin(2*np.pi*df['hour']/24)
    # df['hour_cos'] = np.cos(2*np.pi*df['hour']/24)

    return df




In [16]:
params = {
    'objective': 'tweedie',
    'tweedie_variance_power': 1.4,  # 1.2~1.6 사이 튜닝
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 64,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 1,
    'min_data_in_leaf': 50,
    'lambda_l2': 1.0,
    'seed': 42,
    'verbosity': -1
}


In [17]:
MAX_ABS = 1e12  # 과도한 값 하드 클립 임계

def sanitize_matrix(X: pd.DataFrame) -> pd.DataFrame:
    """Inf 제거 + 과대값 클립 (LightGBM은 NaN은 허용, Inf는 불가)"""
    X = X.copy()
    num_cols = X.select_dtypes(include=[np.number]).columns
    X[num_cols] = X[num_cols].replace([np.inf, -np.inf], np.nan)
    X[num_cols] = X[num_cols].clip(lower=-MAX_ABS, upper=MAX_ABS)
    return X

def safe_log1p_vec(a):
    """음수/비정상값 방지 후 log1p"""
    a = np.asarray(a, dtype=float)
    # 비정상(y에 NaN/inf) → 0으로 대체 (혹은 np.nan 유지하고 마스킹하려면 전략 바꿔도 됨)
    a = np.where(np.isfinite(a), a, 0.0)
    a = np.clip(a, 0, None)  # -0 방지
    return np.log1p(a)



In [None]:
# =========================
# 5. LightGBM 학습 & 검증 (로그변환 + 입력 정화)
# =========================
# X 정화(Inf 제거/클립)
X_tr = sanitize_matrix(X_tr)
X_va = sanitize_matrix(X_va)

# y 안전 로그변환
y_tr_log = safe_log1p_vec(y_tr)
y_va_log = safe_log1p_vec(y_va)

# 안전 체크(디버그 용 — 문제 있으면 어떤 값인지 바로 알 수 있음)
assert np.isfinite(y_tr_log).all(), "y_tr_log에 비정상값이 있습니다."
assert np.isfinite(y_va_log).all(), "y_va_log에 비정상값이 있습니다."

lgb_train = lgb.Dataset(X_tr, label=y_tr_log, categorical_feature=cat_cols or None)
lgb_valid = lgb.Dataset(X_va, label=y_va_log, categorical_feature=cat_cols or None)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 64,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 1,
    'min_data_in_leaf': 50,
    'lambda_l2': 1.0,
    'seed': 42,
    'verbosity': -1
}
callbacks = [lgb.early_stopping(stopping_rounds=200), lgb.log_evaluation(period=200)]

model = lgb.train(
    params,
    lgb_train,
    num_boost_round=5000,
    valid_sets=[lgb_train, lgb_valid],
    valid_names=['train', 'valid'],
    callbacks=callbacks
)

# 예측(로그→원복)
pred_va_log = model.predict(X_va, num_iteration=model.best_iteration)
pred_va = np.expm1(pred_va_log)

print("VALID RMSE:", rmse(y_va, pred_va))
print("VALID MAE :", mean_absolute_error(y_va, pred_va))


In [18]:
grid = [
    {'num_leaves': 48, 'min_data_in_leaf': 80, 'lambda_l2': 0.5},
    {'num_leaves': 64, 'min_data_in_leaf': 60, 'lambda_l2': 1.0},
    {'num_leaves': 96, 'min_data_in_leaf': 40, 'lambda_l2': 2.0},
]


In [19]:
va_err = valid_part[['건물번호','건물유형']].copy()
va_err['se'] = (y_va.values - pred_va)**2
print("Top-10 by 건물번호:\n", va_err.groupby('건물번호')['se'].mean().pow(0.5).sort_values(ascending=False).head(10))
print("By 건물유형:\n", va_err.groupby('건물유형')['se'].mean().pow(0.5).sort_values(ascending=False))


Top-10 by 건물번호:
 건물번호
10    975.309755
3     919.754590
79    525.064795
45    501.634111
1     425.058776
23    393.314688
54    366.046030
43    329.434740
34    325.694886
64    316.445453
Name: se, dtype: float64
By 건물유형:
 건물유형
호텔          364.313067
병원          322.632263
백화점         240.597665
IDC(전화국)    193.622124
연구소         171.481702
건물기타        144.326968
학교          137.332301
공공          128.527986
상용          114.194073
아파트          80.796247
Name: se, dtype: float64


  print("Top-10 by 건물번호:\n", va_err.groupby('건물번호')['se'].mean().pow(0.5).sort_values(ascending=False).head(10))


In [20]:
def build_baseline(df):
    base = df['cons_samehour_mean_7d']
    base = base.fillna(df['cons_mean_24h'])
    base = base.fillna(df['cons_lag1'])
    base = base.fillna(df['전력소비량(kWh)'].median() if '전력소비량(kWh)' in df.columns else 0)
    return base

In [99]:
# =========================
# 5. LightGBM 학습 & 검증 (잔차 학습)
# =========================
# 베이스라인 생성 (누설 없음: 전부 shift/rolling 기반 피처)
baseline_tr = build_baseline(train_part)
baseline_va = build_baseline(valid_part)

# 잔차 타깃
y_tr_resid = y_tr - baseline_tr.values
y_va_resid = y_va - baseline_va.values

# (안전) X에서 ±inf 제거
X_tr = X_tr.replace([np.inf, -np.inf], np.nan)
X_va = X_va.replace([np.inf, -np.inf], np.nan)

lgb_train = lgb.Dataset(X_tr, label=y_tr_resid, categorical_feature=cat_cols or None)
lgb_valid = lgb.Dataset(X_va, label=y_va_resid, categorical_feature=cat_cols or None)

params = {
    'objective': 'regression',   # 나중에 tweedie/huber 시도 가능
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 64,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 1,
    'min_data_in_leaf': 50,
    'lambda_l2': 1.0,
    'seed': 42,
    'verbosity': -1
}
callbacks = [lgb.early_stopping(stopping_rounds=200), lgb.log_evaluation(period=200)]

model = lgb.train(
    params,
    lgb_train,
    num_boost_round=5000,
    valid_sets=[lgb_train, lgb_valid],
    valid_names=['train', 'valid'],
    callbacks=callbacks
)

# 잔차 예측 + 베이스라인 복원
pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)
pred_va = baseline_va.values + pred_va_resid

print("VALID RMSE:", rmse(y_va, pred_va))
print("VALID MAE :", mean_absolute_error(y_va, pred_va))


Training until validation scores don't improve for 200 rounds
[200]	train's rmse: 208.044	valid's rmse: 299.621
[400]	train's rmse: 173.802	valid's rmse: 278.029
[600]	train's rmse: 156.796	valid's rmse: 270.213
[800]	train's rmse: 145.22	valid's rmse: 266.311
[1000]	train's rmse: 136.718	valid's rmse: 263.252
[1200]	train's rmse: 129.734	valid's rmse: 262.155
[1400]	train's rmse: 123.945	valid's rmse: 260.827
[1600]	train's rmse: 118.758	valid's rmse: 260.395
[1800]	train's rmse: 114.191	valid's rmse: 259.365
[2000]	train's rmse: 110.226	valid's rmse: 259.047
[2200]	train's rmse: 106.599	valid's rmse: 258.916
Early stopping, best iteration is:
[2076]	train's rmse: 108.802	valid's rmse: 258.512
VALID RMSE: 258.5116251016002
VALID MAE : 136.39708264373976


In [100]:
# =========================
# 6. 제출 파일 생성 (num_date_time 미사용 버전)
# =========================
sub = read_csv_smart(SAMPLE_SUB)  # 샘플 제출 양식 그대로 사용

# 예측값 길이가 샘플과 달라도 안전하게 할당(인덱스 기준 정렬)
# - 길이가 짧으면 남는 행은 NaN
# - 길이가 길면 초과 분량은 자동으로 버려짐
sub['answer'] = pd.Series(test_pred, index=sub.index)

# 저장 (엑셀 호환 위해 utf-8-sig 권장)
sub.to_csv(OUT_SUB, index=False, encoding='utf-8-sig')
print(f"저장 완료 → {OUT_SUB}")



저장 완료 → C:\Users\user\Downloads\open (1)\baseline_lgbm_submission.csv


In [21]:
# 문제 빌딩 리스트 (네 출력 기준)
hard_blds = {3, 10, 79, 45, 1, 23, 67, 57, 34, 64}

# 5단계 학습 직전에 가중치 계산
w_tr = np.ones(len(train_part), dtype=float)
w_tr[train_part['건물번호'].isin(hard_blds).values] = 2.0  # 2~3배 시도

lgb_train = lgb.Dataset(X_tr, label=y_tr_resid, weight=w_tr, categorical_feature=cat_cols or None)


NameError: name 'y_tr_resid' is not defined

In [22]:
def build_baseline(df: pd.DataFrame) -> pd.Series:
    """
    잔차 학습용 베이스라인:
      1) cons_samehour_mean_7d
      2) cons_mean_24h
      3) cons_lag1
      4) (최후) 학습 구간 타깃 중앙값
    """
    base = df.get('cons_samehour_mean_7d')
    if base is None: base = pd.Series(index=df.index, dtype=float)
    base = base.fillna(df.get('cons_mean_24h'))
    base = base.fillna(df.get('cons_lag1'))
    if '전력소비량(kWh)' in df.columns:
        base = base.fillna(df['전력소비량(kWh)'].median())
    else:
        base = base.fillna(0.0)
    return base


In [None]:
# =========================
# 5R. LightGBM 학습 & 검증 (잔차 학습)
# =========================

# 피처 재선택(혹시 위에서 수정되었을 수 있으니 보강)
features = get_feature_cols(train_feat)
cat_cols = [c for c in ['건물번호','건물유형'] if c in features]
for c in cat_cols:
    train_part[c] = train_part[c].astype('category')
    valid_part[c] = valid_part[c].astype('category')

X_tr, y_tr = train_part[features], train_part['전력소비량(kWh)']
X_va, y_va = valid_part[features], valid_part['전력소비량(kWh)']

# 베이스라인 생성(인덱스 정렬 유지)
baseline_tr = build_baseline(train_part)
baseline_va = build_baseline(valid_part)

# 잔차 타깃
y_tr_resid = (y_tr - baseline_tr).astype(float)
y_va_resid = (y_va - baseline_va).astype(float)

# (선택) 문제 유형 가중치 — 처음엔 False로 두고 결과 본 뒤 켜줘
USE_TYPE_WEIGHTS = False
TYPE_WEIGHT = {4: 2.0, 9: 2.0}  # 필요시 가중치 조정
if USE_TYPE_WEIGHTS:
    w_tr = np.ones(len(train_part), dtype=float)
    mask = train_part['건물유형'].map(TYPE_WEIGHT).fillna(1.0).values
    w_tr = w_tr * mask
else:
    w_tr = None

# 안전 처리(±inf → NaN)
X_tr = X_tr.replace([np.inf, -np.inf], np.nan)
X_va = X_va.replace([np.inf, -np.inf], np.nan)

lgb_train = lgb.Dataset(X_tr, label=y_tr_resid, weight=w_tr, categorical_feature=cat_cols or None)
lgb_valid = lgb.Dataset(X_va, label=y_va_resid, categorical_feature=cat_cols or None)

params = {
    'objective': 'regression',   # 잔차는 음수/양수 모두 가능 → 회귀(평균제곱오차)로
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 64,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 1,
    'min_data_in_leaf': 50,
    'lambda_l2': 1.0,
    'seed': 42,
    'verbosity': -1
}
callbacks = [lgb.early_stopping(stopping_rounds=200), lgb.log_evaluation(period=200)]

model = lgb.train(
    params,
    lgb_train,
    num_boost_round=5000,
    valid_sets=[lgb_train, lgb_valid],
    valid_names=['train', 'valid'],
    callbacks=callbacks
)

# 잔차 예측 + 베이스라인 복원
pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)
pred_va = baseline_va.values + pred_va_resid

print("VALID RMSE (residual):", rmse(y_va, pred_va))
print("VALID MAE  (residual):", mean_absolute_error(y_va, pred_va))

# --- 에러 분석: 건물/유형별 RMSE ---
va_err = valid_part[['건물번호','건물유형']].copy()
va_err['se'] = (y_va.values - pred_va)**2
print("Top-10 by 건물번호:\n",
      va_err.groupby('건물번호')['se'].mean().pow(0.5).sort_values(ascending=False).head(10))
print("By 건물유형:\n",
      va_err.groupby('건물유형')['se'].mean().pow(0.5).sort_values(ascending=False))


In [23]:
import os, random, numpy as np
os.environ["PYTHONHASHSEED"] = "0"
random.seed(42); np.random.seed(42)
# LightGBM 쪽은 params['seed']=42, params['num_threads']=4 정도 권장


In [24]:
def downcast(df):
    df = df.copy()
    for c in df.select_dtypes(include=['float64']).columns:
        df[c] = pd.to_numeric(df[c], downcast='float')
    for c in df.select_dtypes(include=['int64']).columns:
        df[c] = pd.to_numeric(df[c], downcast='integer')
    return df

# train_df, test_df 읽은 직후 한 번씩:
# train_df = downcast(train_df)
# test_df  = downcast(test_df)


In [25]:
def backfill_solar_by_time(all_df: pd.DataFrame) -> pd.DataFrame:
    all_df = all_df.copy()
    if 'dt' not in all_df.columns:
        raise ValueError("ensure_datetime_cols 먼저 호출해줘.")
    all_df['month'] = all_df['dt'].dt.month
    all_df['hour']  = all_df['시간']
    is_train = all_df['전력소비량(kWh)'].notna() if '전력소비량(kWh)' in all_df.columns else False

    rad_ref = all_df[is_train].groupby(['month','hour'])['일사(MJ/m2)'].median()
    sun_ref = all_df[is_train].groupby(['month','hour'])['일조(hr)'].median()

    idx = pd.MultiIndex.from_frame(all_df[['month','hour']])
    fill_rad = rad_ref.reindex(idx).values
    fill_sun = sun_ref.reindex(idx).values

    need_rad = (~is_train) & (all_df['일사(MJ/m2)'].isna() | (all_df['일사(MJ/m2)'] == 0))
    need_sun = (~is_train) & (all_df['일조(hr)'].isna()     | (all_df['일조(hr)'] == 0))
    all_df.loc[need_rad, '일사(MJ/m2)'] = fill_rad[need_rad.values]
    all_df.loc[need_sun, '일조(hr)']    = fill_sun[need_sun.values]

    all_df['일사(MJ/m2)'] = all_df['일사(MJ/m2)'].fillna(0)
    all_df['일조(hr)']    = all_df['일조(hr)'].fillna(0)
    return all_df


In [26]:
def build_baseline(df: pd.DataFrame) -> pd.Series:
    base = df.get('cons_samehour_mean_7d')
    if base is None:
        base = pd.Series(index=df.index, dtype=float)
    base = base.fillna(df.get('cons_mean_24h'))
    base = base.fillna(df.get('cons_lag1'))
    if '전력소비량(kWh)' in df.columns:
        base = base.fillna(df['전력소비량(kWh)'].median())
    else:
        base = base.fillna(0.0)
    return base


In [27]:
from pandas.api.types import is_numeric_dtype, is_categorical_dtype

def get_feature_cols(df):
    drop = ['전력소비량(kWh)','dt','날짜','시간','일시','num_date_time']
    cols = [c for c in df.columns if c not in drop]
    cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]
    return cols

# 검증 파트
features = get_feature_cols(train_part)
cat_cols = [c for c in ['건물번호','건물유형'] if c in features]
for c in cat_cols:
    train_part[c] = train_part[c].astype('category')
    valid_part[c] = valid_part[c].astype('category')

X_tr = train_part.reindex(columns=features)
X_va = valid_part.reindex(columns=features)
y_tr = train_part['전력소비량(kWh)']
y_va = valid_part['전력소비량(kWh)']


  cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]


In [28]:
# 베이스라인 & 잔차
baseline_tr = build_baseline(train_part)
baseline_va = build_baseline(valid_part)
y_tr_resid = (y_tr - baseline_tr).astype(float)
y_va_resid = (y_va - baseline_va).astype(float)

# (옵션) 유형 가중치
USE_TYPE_WEIGHTS = False
TYPE_WEIGHT = {4:2.0, 9:2.0}
w_tr = None
if USE_TYPE_WEIGHTS and '건물유형' in train_part.columns:
    w_tr = train_part['건물유형'].map(TYPE_WEIGHT).fillna(1.0).astype(float).values

# LGBM 학습
params = {
    'objective':'regression','metric':'rmse','learning_rate':0.05,
    'num_leaves':64,'feature_fraction':0.85,'bagging_fraction':0.85,'bagging_freq':1,
    'min_data_in_leaf':50,'lambda_l2':1.0,'seed':42,'verbosity':-1,'num_threads':4
}
lgb_train = lgb.Dataset(X_tr.replace([np.inf,-np.inf], np.nan), label=y_tr_resid, weight=w_tr,
                        categorical_feature=cat_cols or None)
lgb_valid = lgb.Dataset(X_va.replace([np.inf,-np.inf], np.nan), label=y_va_resid,
                        categorical_feature=cat_cols or None)
callbacks=[lgb.early_stopping(200), lgb.log_evaluation(400)]
model = lgb.train(params, lgb_train, num_boost_round=5000,
                  valid_sets=[lgb_train,lgb_valid], valid_names=['train','valid'],
                  callbacks=callbacks)

# 복원해서 검증 점수
pred_va = baseline_va.values + model.predict(X_va, num_iteration=model.best_iteration)
print("VALID RMSE:", rmse(y_va, pred_va))
print("VALID MAE :", mean_absolute_error(y_va, pred_va))

# 에러 테이블 저장(재현성 위해 파일로 남겨)
va_err = valid_part[['건물번호','건물유형']].copy()
va_err['se'] = (y_va.values - pred_va)**2
va_err_per_bld = va_err.groupby('건물번호')['se'].mean().pow(0.5).sort_values(ascending=False)
va_err_per_type = va_err.groupby('건물유형')['se'].mean().pow(0.5).sort_values(ascending=False)
va_err_per_bld.head(10).to_csv("val_top10_buildings.csv", encoding='utf-8-sig')
va_err_per_type.to_csv("val_by_type.csv", encoding='utf-8-sig')


Training until validation scores don't improve for 200 rounds
[400]	train's rmse: 173.802	valid's rmse: 278.029
[800]	train's rmse: 145.22	valid's rmse: 266.311
[1200]	train's rmse: 129.734	valid's rmse: 262.155
[1600]	train's rmse: 118.758	valid's rmse: 260.395
[2000]	train's rmse: 110.226	valid's rmse: 259.047
Early stopping, best iteration is:
[2076]	train's rmse: 108.802	valid's rmse: 258.512
VALID RMSE: 258.5116251016002
VALID MAE : 136.39708264373976


  va_err_per_bld = va_err.groupby('건물번호')['se'].mean().pow(0.5).sort_values(ascending=False)


In [29]:
# all_df → ensure_datetime_cols → (옵션) backfill_solar_by_time → make_features 까지 완료됐다고 가정
all_feat_train = all_feat.iloc[:len(train_df)].copy()
all_feat_test  = all_feat.iloc[len(train_df):].copy()

features_full = get_feature_cols(all_feat_train)
cat_cols_full = [c for c in ['건물번호','건물유형'] if c in features_full]
for c in cat_cols_full:
    all_feat_train[c] = all_feat_train[c].astype('category')
    all_feat_test[c]  = all_feat_test[c].astype('category')

X_full = all_feat_train.reindex(columns=features_full).replace([np.inf,-np.inf], np.nan)
y_full = all_feat_train['전력소비량(kWh)']
baseline_full = build_baseline(all_feat_train)
baseline_te   = build_baseline(all_feat_test)

y_full_resid = (y_full - baseline_full).astype(float)
lgb_full = lgb.Dataset(X_full, label=y_full_resid, categorical_feature=cat_cols_full or None)
final_model = lgb.train(params, lgb_full, num_boost_round=(model.best_iteration or 2000))

X_te = all_feat_test.reindex(columns=features_full).replace([np.inf,-np.inf], np.nan)
test_pred = baseline_te.values + final_model.predict(X_te, num_iteration=final_model.best_iteration)

sub = read_csv_smart(SAMPLE_SUB)
sub['answer'] = pd.Series(test_pred, index=sub.index)
sub.to_csv(OUT_SUB, index=False, encoding='utf-8-sig')
print(f"저장 완료 → {OUT_SUB}")


NameError: name 'all_feat' is not defined

In [None]:
# =========================
# 5R. LightGBM 학습 & 검증 (잔차 학습 + α 블렌딩)
# =========================

# 피처 재선택(안전하게 reindex)
features = get_feature_cols(train_part)
cat_cols = [c for c in ['건물번호','건물유형'] if c in features]

for c in cat_cols:
    train_part[c] = train_part[c].astype('category')
    valid_part[c] = valid_part[c].astype('category')

X_tr = train_part.reindex(columns=features).replace([np.inf, -np.inf], np.nan)
X_va = valid_part.reindex(columns=features).replace([np.inf, -np.inf], np.nan)
y_tr = train_part['전력소비량(kWh)']
y_va = valid_part['전력소비량(kWh)']

# 베이스라인(누설 없음: shift/rolling 기반) → 잔차 타깃
baseline_tr = build_baseline(train_part)
baseline_va = build_baseline(valid_part)
y_tr_resid = (y_tr - baseline_tr).astype(float)
y_va_resid = (y_va - baseline_va).astype(float)

# (옵션) 유형 가중치 — 처음엔 꺼두고 결과 보고 켜자
USE_TYPE_WEIGHTS = False
TYPE_WEIGHT = {4: 2.0, 9: 2.0}
w_tr = None
if USE_TYPE_WEIGHTS and ('건물유형' in train_part.columns):
    w_tr = train_part['건물유형'].map(TYPE_WEIGHT).fillna(1.0).astype(float).values

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 64,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 1,
    'min_data_in_leaf': 50,
    'lambda_l2': 1.0,
    'seed': 42,
    'verbosity': -1,
    'num_threads': 4,
}
callbacks = [lgb.early_stopping(stopping_rounds=200), lgb.log_evaluation(period=400)]

lgb_train = lgb.Dataset(X_tr, label=y_tr_resid, weight=w_tr, categorical_feature=cat_cols or None)
lgb_valid = lgb.Dataset(X_va, label=y_va_resid, categorical_feature=cat_cols or None)

model = lgb.train(
    params,
    lgb_train,
    num_boost_round=5000,
    valid_sets=[lgb_train, lgb_valid],
    valid_names=['train','valid'],
    callbacks=callbacks
)

# ----- 블렌딩 α 계산 (최소제곱 닫힌형식) -----
pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)

base_rmse = rmse(y_va, baseline_va)
base_mae  = mean_absolute_error(y_va, baseline_va)
true_resid_va = (y_va - baseline_va).values

den = float(np.sum(pred_va_resid**2) + 1e-9)
alpha = float(np.sum(true_resid_va * pred_va_resid) / den)   # 최적 α
alpha = max(0.0, min(alpha, 1.5))  # 안정화 클립

pred_va = baseline_va.values + alpha * pred_va_resid

print(f"[Blending] alpha={alpha:.3f}")
print("Baseline-only  RMSE:", base_rmse, "MAE:", base_mae)
print("Residual-only RMSE:", rmse(true_resid_va, pred_va_resid))
print("BLENDED       RMSE:", rmse(y_va, pred_va), "MAE:", mean_absolute_error(y_va, pred_va))

# 나중에 6R에서 재사용
ALPHA_BEST = alpha

# --- 에러 분석 (빌딩/유형별) ---
va_err = valid_part[['건물번호','건물유형']].copy()
va_err['se'] = (y_va.values - pred_va)**2
print("Top-10 by 건물번호:\n",
      va_err.groupby('건물번호')['se'].mean().pow(0.5).sort_values(ascending=False).head(10))
print("By 건물유형:\n",
      va_err.groupby('건물유형')['se'].mean().pow(0.5).sort_values(ascending=False))


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# 잔차 예측
pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)

# 베이스라인 점수
base_rmse = np.sqrt(mean_squared_error(y_va, baseline_va))
base_mae  = mean_absolute_error(y_va, baseline_va)
base_r2   = r2_score(y_va, baseline_va)

# 최적 α (닫힌형식) + 블렌딩 점수
true_resid = (y_va - baseline_va).values
den = float(np.sum(pred_va_resid**2) + 1e-9)
alpha = float(np.sum(true_resid * pred_va_resid) / den)
alpha = max(0.0, min(alpha, 1.5))
pred_va = baseline_va.values + alpha * pred_va_resid

blend_rmse = np.sqrt(mean_squared_error(y_va, pred_va))
blend_mae  = mean_absolute_error(y_va, pred_va)
blend_r2   = r2_score(y_va, pred_va)

print(f"alpha={alpha:.3f}")
print(f"[Baseline] RMSE={base_rmse:.3f} MAE={base_mae:.3f} R2={base_r2:.4f}")
print(f"[Blended ] RMSE={blend_rmse:.3f} MAE={blend_mae:.3f} R2={blend_r2:.4f}")



In [None]:
# 예측값 기준 10분위로 나눠서 실제 평균과 비교
df_cal = pd.DataFrame({"y": y_va.values, "yhat": pred_va})
df_cal["bin"] = pd.qcut(df_cal["yhat"], q=10, duplicates="drop")
cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))
cal["bias"] = cal["yhat_mean"] - cal["y_mean"]
print(cal)
print("평균 절대 바이어스:", cal["bias"].abs().mean())

In [30]:
res = valid_part[["건물번호","건물유형","hour","기온(°C)"]].copy()
res["y"] = y_va.values
res["yhat"] = pred_va
res["resid"] = res["y"] - res["yhat"]

# 시간대별
by_hour = res.groupby("hour")["resid"].agg(["mean","std","count"])
print("시간대별 잔차통계:\n", by_hour)

# 온도 구간별 (5°C 단위)
res["temp_bin"] = (res["기온(°C)"]//5)*5
by_temp = res.groupby("temp_bin")["resid"].agg(["mean","std","count"])
print("온도구간별 잔차통계:\n", by_temp)

# 건물유형별 RMSE (베이스라인 vs 블렌딩 비교)
tmp = valid_part[["건물유형"]].copy()
tmp["base_se"]   = (y_va.values - baseline_va.values)**2
tmp["blend_se"]  = (y_va.values - pred_va)**2
cmp_type = tmp.groupby("건물유형")[["base_se","blend_se"]].mean().pow(0.5).sort_values("blend_se", ascending=False)
cmp_type["improve(%)"] = (1 - cmp_type["blend_se"]/cmp_type["base_se"])*100
print("유형별 RMSE 비교(베이스라인→블렌딩):\n", cmp_type)

# 상위 문제 빌딩 Top-10
tmp2 = valid_part[["건물번호"]].copy()
tmp2["se"] = (y_va.values - pred_va)**2
top_bld = tmp2.groupby("건물번호")["se"].mean().pow(0.5).sort_values(ascending=False).head(10)
print("Top-10 by 건물번호 (블렌딩):\n", top_bld)

시간대별 잔차통계:
             mean         std  count
hour                               
0       3.266520  159.376064    800
1       4.701862  147.683415    800
2      11.871789  144.551729    800
3       0.923770  150.067881    800
4      -7.801033  179.145872    800
5       2.412163  154.708147    800
6       9.073931  174.606392    800
7       3.414813  198.042050    800
8     -19.215013  234.254688    800
9     -30.409998  265.857782    800
10    -47.461959  263.023307    800
11   -110.690803  295.701160    800
12   -127.299821  275.864882    800
13   -141.238065  298.483692    800
14   -138.442811  298.695544    800
15   -144.973970  326.251247    800
16   -127.934494  385.695565    800
17   -110.095268  310.676355    800
18   -101.066599  294.251119    800
19    -82.120827  289.549671    800
20    -32.730890  245.111192    800
21    -29.842760  237.086742    800
22    -32.564909  199.411701    800
23    -15.306709  182.713965    800
온도구간별 잔차통계:
                 mean         std  count

  top_bld = tmp2.groupby("건물번호")["se"].mean().pow(0.5).sort_values(ascending=False).head(10)


In [31]:
# === 0) 스키러너블 모델로 재학습 (설명력용, 제출에 사용 안 함)
import lightgbm as lgb
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error

# 1) 우선 그대로 카테고리 dtype을 사용해서 학습 시도
params_sklearn = dict(
    objective='regression',
    learning_rate=0.05,
    num_leaves=64,
    feature_fraction=0.85,
    bagging_fraction=0.85,
    bagging_freq=1,
    min_child_samples=50,   # = min_data_in_leaf
    reg_lambda=1.0,         # = lambda_l2
    random_state=42,
    n_estimators=5000,
    n_jobs=4
)

use_cat = True
try:
    model_perm = lgb.LGBMRegressor(**params_sklearn)
    model_perm.fit(
        X_tr, y_tr_resid,
        eval_set=[(X_va, y_va_resid)],
        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(400)],
        categorical_feature=cat_cols or None  # pandas 'category' dtype이면 자동 인식
    )
except Exception as e:
    # 2) 버전/환경에 따라 categorical_feature가 막힐 수 있어서 안전한 fallback: 카테고리 → 코드화
    print("카테고리 직접 전달 실패 → 코드화하여 재시도:", e)
    use_cat = False
    X_tr_enc = X_tr.copy()
    X_va_enc = X_va.copy()
    for c in (cat_cols or []):
        X_tr_enc[c] = X_tr_enc[c].cat.codes
        X_va_enc[c] = X_va_enc[c].cat.codes
    model_perm = lgb.LGBMRegressor(**params_sklearn)
    model_perm.fit(
        X_tr_enc, y_tr_resid,
        eval_set=[(X_va_enc, y_va_resid)],
        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(400)]
    )

# === 1) Permutation Importance (검증셋 기준)
X_pi = X_va if use_cat else X_va_enc
pi = permutation_importance(
    model_perm, X_pi, y_va_resid,
    n_repeats=5, random_state=42, n_jobs=4,
    scoring='neg_mean_squared_error'
)
import pandas as pd, numpy as np
pi_tbl = pd.DataFrame({
    "feature": X_pi.columns,
    "imp": pi.importances_mean,
    "std": pi.importances_std
}).sort_values("imp", ascending=False)
print("Permutation Importance Top-20:\n", pi_tbl.head(20))

# === 2) LightGBM 내장 중요도 (gain/split)도 함께 보기
gain = pd.Series(model_perm.booster_.feature_importance(importance_type='gain'),
                 index=X_pi.columns).sort_values(ascending=False)
split = pd.Series(model_perm.booster_.feature_importance(importance_type='split'),
                  index=X_pi.columns).sort_values(ascending=False)
imp_tbl = pd.DataFrame({"gain": gain, "split": split}).fillna(0).sort_values("gain", ascending=False)
print("LightGBM 중요도(Gain) Top-20:\n", imp_tbl.head(20))

# === 3) SHAP 대체 (shap 미설치/버전 이슈 대비 → pred_contrib)
try:
    import shap
    explainer = shap.TreeExplainer(model_perm)
    shap_vals = explainer.shap_values(X_pi)
    shap_abs = np.abs(shap_vals).mean(axis=0)
    shap_tbl = pd.Series(shap_abs, index=X_pi.columns).sort_values(ascending=False).head(20)
    print("SHAP |value| Top-20:\n", shap_tbl)
except Exception as e:
    print("shap 미사용 → pred_contrib로 대체:", e)
    contrib = np.asarray(model_perm.predict(X_pi, pred_contrib=True))
    # 마지막 열은 base value, 제외
    shap_abs = np.abs(contrib[:, :-1]).mean(axis=0)
    shap_tbl = pd.Series(shap_abs, index=X_pi.columns).sort_values(ascending=False).head(20)
    print("Approx SHAP |value| Top-20:\n", shap_tbl)

# === 4) 캘리브레이션/블렌딩 비교도 같이 수치로 확인 (설명력 품질 점검, 예측 저장 X)
pred_va_resid_hat = model_perm.predict(X_pi, num_iteration=model_perm.best_iteration_)
base_rmse = np.sqrt(mean_squared_error(y_va, baseline_va))
true_resid = (y_va - baseline_va).values
den = float(np.sum(pred_va_resid_hat**2) + 1e-9)
alpha = float(np.sum(true_resid * pred_va_resid_hat) / den)
alpha = max(0.0, min(alpha, 1.5))
pred_va_blend = baseline_va.values + alpha * pred_va_resid_hat
blend_rmse = np.sqrt(mean_squared_error(y_va, pred_va_blend))
print(f"[설명력 체크] Baseline RMSE={base_rmse:.3f}  Blended RMSE={blend_rmse:.3f} (alpha={alpha:.3f})")



Training until validation scores don't improve for 200 rounds
[400]	valid_0's l2: 77300.4
[800]	valid_0's l2: 70921.3
[1200]	valid_0's l2: 68725.4
[1600]	valid_0's l2: 67805.4
[2000]	valid_0's l2: 67105.4
Early stopping, best iteration is:
[2076]	valid_0's l2: 66828.3
Permutation Importance Top-20:
                     feature           imp           std
18    cons_samehour_mean_7d  4.281631e+06  41895.011875
12                cons_lag1  3.329261e+06  20791.980802
0                      건물번호  4.462907e+05   8122.529034
38                     hour  2.833475e+05   9824.810526
13             cons_lag_24h  7.304677e+04    467.204158
22                  weekday  5.557012e+04   4869.273276
16            cons_lag_168h  4.519914e+04    601.363053
6                 일사(MJ/m2)  3.699363e+04   1250.087403
17            cons_mean_24h  2.257438e+04    454.303971
14             cons_lag_48h  1.478527e+04    591.954276
15             cons_lag_72h  8.798136e+03    311.344347
7                   연면적(m2)

In [32]:
# 5R 블록에서 옵션만 켜기
USE_TYPE_WEIGHTS = True
TYPE_WEIGHT = {4: 2.0, 9: 2.0}


In [33]:
def make_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    # 일사/일조 보정
    if '일사(MJ/m2)' not in df.columns: df['일사(MJ/m2)'] = 0.0
    if '일조(hr)'   not in df.columns: df['일조(hr)']   = 0.0

    # 정렬 & 그룹
    df = df.sort_values(['건물번호','dt']).reset_index(drop=True)
    grp = df.groupby('건물번호', sort=False)

    # ---------- 타깃 라그/롤링 ----------
    if '전력소비량(kWh)' in df.columns:
        df['cons_lag1']     = grp['전력소비량(kWh)'].shift(1)
        df['cons_lag_24h']  = grp['전력소비량(kWh)'].shift(24)
        df['cons_lag_48h']  = grp['전력소비량(kWh)'].shift(48)
        df['cons_lag_72h']  = grp['전력소비량(kWh)'].shift(72)
        df['cons_lag_168h'] = grp['전력소비량(kWh)'].shift(168)
        df['cons_mean_24h'] = grp['전력소비량(kWh)'] \
            .transform(lambda s: s.shift(1).rolling(24, min_periods=1).mean())
        df['cons_samehour_mean_7d'] = grp['전력소비량(kWh)'] \
            .transform(lambda s: s.shift(24).rolling(7, min_periods=1).mean())
        df['cons_std_24h'] = grp['cons_lag1'] \
            .transform(lambda s: s.rolling(24, min_periods=6).std())
    else:
        for c in ['cons_lag1','cons_lag_24h','cons_lag_48h','cons_lag_72h',
                  'cons_lag_168h','cons_mean_24h','cons_samehour_mean_7d','cons_std_24h']:
            df[c] = np.nan
    # --- 일중·주기 변화량(라그 이후 계산) ---
    df['delta_1h'] = df['cons_lag1']    - df['cons_lag_24h']
    df['delta_7d'] = df['cons_lag_24h'] - df['cons_lag_168h']
    # ---------- 냉방 수요 지표 ----------
    base_temp = 24.0
    df['CDD'] = (df['기온(°C)'] - base_temp).clip(lower=0)
    q99 = df['일사(MJ/m2)'].quantile(0.99) + 1e-6
    rad_norm = (df['일사(MJ/m2)'].clip(upper=q99) / q99)
    df['CDD_x_rad'] = df['CDD'] * rad_norm
    df['CDD_humid_adj'] = df['CDD'] * (1 + 0.3 * (df['습도(%)'] / 100.0))

    # ---------- 달력 & 설비 ----------
    df['weekday']    = df['dt'].dt.weekday
    df['is_weekend'] = (df['weekday'] >= 5).astype(int)
    kr_holidays = {pd.Timestamp(2024,6,6), pd.Timestamp(2024,8,15)}
    df['is_holiday'] = df['날짜'].isin(kr_holidays).astype(int)

    df['has_pv']  = (df['태양광용량(kW)']  > 0).astype(int)
    df['has_ess'] = (df['ESS저장용량(kWh)'] > 0).astype(int)
    df['has_pcs'] = (df['PCS용량(kW)']    > 0).astype(int)
    df['is_daylight'] = (df['일사(MJ/m2)'] > 0).astype(int)
    df['is_offpeak'] = df['시간'].isin([0,1,2,3,4,5,6,23]).astype(int)
    df['is_peak']    = df['시간'].isin([13,14,15,16,17]).astype(int)
    df['ess_charge_potential']    = ((df['has_ess']==1) & (df['is_offpeak']==1)).astype(int)
    df['ess_discharge_potential'] = ((df['has_ess']==1) & (df['is_peak']==1)).astype(int)
    for c in ['태양광용량(kW)','ESS저장용량(kWh)','PCS용량(kW)']:
        df[f'log1p_{c}'] = np.log1p(df[c])

    df['ess_to_load_lag_ratio'] = np.where(
        df.get('cons_lag_24h', pd.Series(index=df.index)).notna(),
        df['ESS저장용량(kWh)'] / (df['cons_lag_24h'] + 1e-6),
        np.nan
    )

    # ---------- 기타 ----------
    df['month']     = df['dt'].dt.month
    df['hour']      = df['시간']
    df['dayofyear'] = df['dt'].dt.dayofyear
    df['hour_sin']  = np.sin(2*np.pi*df['hour']/24)
    df['hour_cos']  = np.cos(2*np.pi*df['hour']/24)

    # ---------- 면적 정규화 (라그 생성 '뒤'에 계산) ----------
    eps = 1e-6
    if '연면적(m2)' in df.columns:
        area = pd.to_numeric(df['연면적(m2)'], errors='coerce')
        df['cons_lag1_per_m2']   = df['cons_lag1']    / (area + eps)
        df['cons_mean24_per_m2'] = df['cons_mean_24h'] / (area + eps)
        df['CDD_x_rad_area']     = df['CDD_x_rad'] * (area.fillna(0) / 1000.0)

    return df


In [34]:
df['hour_sin'] = np.sin(2*np.pi*df['시간']/24)
df['hour_cos'] = np.cos(2*np.pi*df['시간']/24)

In [35]:
params['min_data_in_leaf'] = 120   # 50 → 120
params['lambda_l2'] = 2.0          # 1.0 → 2.0
# (유지) learning_rate=0.05, num_leaves=64


In [36]:
# 5R 바로 앞에서 한 번만 실험(점수만 확인)
features_abl = [c for c in features if c != '건물번호']
X_tr_abl = train_part.reindex(columns=features_abl)
X_va_abl = valid_part.reindex(columns=features_abl)
# 같은 파라미터로 간단하게 1500부스트 정도만 재학습해서 RMSE 비교


In [37]:
train_feat = make_features(train_df)
is_val = (train_feat['dt'] >= pd.Timestamp(2024,8,17)) & (train_feat['dt'] <= pd.Timestamp(2024,8,24,23))
train_part = train_feat[~is_val].copy()
valid_part = train_feat[ is_val].copy()



In [38]:
print({
  'cons_lag1': 'cons_lag1' in train_feat.columns,
  'cons_mean_24h': 'cons_mean_24h' in train_feat.columns,
  'cons_lag1_per_m2': 'cons_lag1_per_m2' in train_feat.columns,
  'cons_mean24_per_m2': 'cons_mean24_per_m2' in train_feat.columns
})


{'cons_lag1': True, 'cons_mean_24h': True, 'cons_lag1_per_m2': True, 'cons_mean24_per_m2': True}


In [39]:
# 0) 피처 다시 생성 & 검증 분할
train_feat = make_features(train_df)

VAL_START = pd.Timestamp(2024,8,17,0)
VAL_END   = pd.Timestamp(2024,8,24,23)
is_val = (train_feat['dt'] >= VAL_START) & (train_feat['dt'] <= VAL_END)

train_part = train_feat[~is_val].copy()
valid_part = train_feat[ is_val].copy()

# 1) 피처 선택(안전하게 reindex)
features = get_feature_cols(train_part)
cat_cols = [c for c in ['건물번호','건물유형'] if c in features]

for c in cat_cols:
    train_part[c] = train_part[c].astype('category')
    valid_part[c] = valid_part[c].astype('category')

X_tr = train_part.reindex(columns=features).replace([np.inf,-np.inf], np.nan)
X_va = valid_part.reindex(columns=features).replace([np.inf,-np.inf], np.nan)
y_tr = train_part['전력소비량(kWh)']
y_va = valid_part['전력소비량(kWh)']

# 2) 베이스라인 생성 → 잔차 타깃
baseline_tr = build_baseline(train_part)
baseline_va = build_baseline(valid_part)
y_tr_resid = (y_tr - baseline_tr).astype(float)
y_va_resid = (y_va - baseline_va).astype(float)

# 3) 잔차 모델 학습(보수적 파라미터)
import lightgbm as lgb
params = {
    'objective':'regression','metric':'rmse','learning_rate':0.05,
    'num_leaves':64,'feature_fraction':0.85,'bagging_fraction':0.85,'bagging_freq':1,
    'min_data_in_leaf':120,'lambda_l2':2.0,'seed':42,'verbosity':-1,'num_threads':4
}
lgb_train = lgb.Dataset(X_tr, label=y_tr_resid, categorical_feature=cat_cols or None)
lgb_valid = lgb.Dataset(X_va, label=y_va_resid, categorical_feature=cat_cols or None)
model = lgb.train(
    params, lgb_train, num_boost_round=5000,
    valid_sets=[lgb_train,lgb_valid], valid_names=['train','valid'],
    callbacks=[lgb.early_stopping(200), lgb.log_evaluation(400)]
)

# 4) 설명력/보정력 평가 (예측 저장 X)
pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)

# 베이스라인/블렌딩 지표
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
base_rmse = rmse(y_va, baseline_va)
base_mae  = mean_absolute_error(y_va, baseline_va)
base_r2   = r2_score(y_va, baseline_va)

true_resid = (y_va - baseline_va).values
den = float((pred_va_resid**2).sum() + 1e-9)
alpha = float((true_resid * pred_va_resid).sum() / den)
alpha = max(0.0, min(alpha, 1.5))  # 안정화
pred_va_blend = baseline_va.values + alpha * pred_va_resid

blend_rmse = rmse(y_va, pred_va_blend)
blend_mae  = mean_absolute_error(y_va, pred_va_blend)
blend_r2   = r2_score(y_va, pred_va_blend)

print(f"alpha={alpha:.3f}")
print(f"[Baseline] RMSE={base_rmse:.3f}  MAE={base_mae:.3f}  R2={base_r2:.4f}")
print(f"[Blended ] RMSE={blend_rmse:.3f}  MAE={blend_mae:.3f}  R2={blend_r2:.4f}")

# 5) 캘리브레이션(10분위) & 유형/빌딩별 진단
import pandas as pd, numpy as np
df_cal = pd.DataFrame({"y": y_va.values, "yhat": pred_va_blend})
df_cal["bin"] = pd.qcut(df_cal["yhat"], q=10, duplicates="drop")
cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))
cal["bias"] = cal["yhat_mean"] - cal["y_mean"]
print("\n[Calibration by decile]\n", cal)
print("평균 절대 바이어스:", cal["bias"].abs().mean())

va_err = valid_part[['건물번호','건물유형']].copy()
va_err['se_base']  = (y_va.values - baseline_va.values)**2
va_err['se_blend'] = (y_va.values - pred_va_blend)**2
print("\n[유형별 RMSE 비교]\n",
      va_err.groupby('건물유형')[['se_base','se_blend']].mean().pow(0.5).assign(
          improve_pct=lambda d: (1 - d['se_blend']/d['se_base'])*100
      ).sort_values('se_blend', ascending=False))

print("\n[Top-10 by 건물번호]\n",
      va_err.groupby('건물번호')['se_blend'].mean().pow(0.5).sort_values(ascending=False).head(10))


  cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]


Training until validation scores don't improve for 200 rounds
[400]	train's rmse: 177.313	valid's rmse: 252.158
[800]	train's rmse: 148.709	valid's rmse: 240.359
[1200]	train's rmse: 132.742	valid's rmse: 237.079
[1600]	train's rmse: 121.342	valid's rmse: 236.088
[2000]	train's rmse: 112.554	valid's rmse: 234.887
Early stopping, best iteration is:
[2063]	train's rmse: 111.328	valid's rmse: 234.708
alpha=1.014
[Baseline] RMSE=1008.062  MAE=549.663  R2=0.9318
[Blended ] RMSE=234.343  MAE=110.065  R2=0.9963

[Calibration by decile]
                                      y_mean     yhat_mean     n       bias
bin                                                                       
(-4.353000000000001, 778.966]    479.693812    471.795176  1920  -7.898637
(778.966, 1160.635]              987.892026    983.979869  1920  -3.912157
(1160.635, 1523.262]            1331.977807   1323.919429  1920  -8.058378
(1523.262, 1818.994]            1691.171203   1681.548043  1920  -9.623160
(1818.994, 217

  cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))
  va_err.groupby('건물번호')['se_blend'].mean().pow(0.5).sort_values(ascending=False).head(10))


In [40]:
# ----- a, b 동시 추정(최소제곱) -----
pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)
true_resid_va = (y_va - baseline_va).values

A = np.column_stack([pred_va_resid, np.ones_like(pred_va_resid)])
a, b = np.linalg.lstsq(A, true_resid_va, rcond=None)[0]
a = float(np.clip(a, 0.0, 1.5))  # 안정화
b = float(b)

pred_va = baseline_va.values + a * pred_va_resid + b

print(f"[Blending] a={a:.3f}, b={b:.3f}")
print("Baseline-only  RMSE:", rmse(y_va, baseline_va), "MAE:", mean_absolute_error(y_va, baseline_va))
print("BLENDED       RMSE:", rmse(y_va, pred_va), "MAE:", mean_absolute_error(y_va, pred_va))


[Blending] a=1.013, b=-11.078
Baseline-only  RMSE: 1008.0623627048025 MAE: 549.6628074404762
BLENDED       RMSE: 234.0812015762596 MAE: 110.94799522778526


In [42]:
# 잔차 예측
test_pred_resid = final_model.predict(X_te, num_iteration=final_model.best_iteration)

# 검증에서 구한 a,b로 복원
a_use = a if 'a' in globals() else 1.0
b_use = b if 'b' in globals() else 0.0
test_pred = baseline_te.values + a_use * test_pred_resid + b_use


NameError: name 'final_model' is not defined

In [44]:
HARD_BLD = {10,79,3,45,23,1,12,69,64,34}
w_tr = np.ones(len(train_part), dtype=float)
w_tr[train_part['건물번호'].isin(HARD_BLD).values] = 1.8  # 1.5~2.0 사이 시도

lgb_train = lgb.Dataset(X_tr, label=y_tr_resid, weight=w_tr, categorical_feature=cat_cols or None)


In [45]:
# 1) make_features() 셀(패치 포함) 실행

# 2) 피처 다시 생성
train_feat = make_features(train_df)

# 3) 확인
print({
  'cons_lag1': 'cons_lag1' in train_feat.columns,
  'cons_lag_24h': 'cons_lag_24h' in train_feat.columns,
  'cons_lag_168h': 'cons_lag_168h' in train_feat.columns,
  'delta_1h': 'delta_1h' in train_feat.columns,
  'delta_7d': 'delta_7d' in train_feat.columns
})


{'cons_lag1': True, 'cons_lag_24h': True, 'cons_lag_168h': True, 'delta_1h': True, 'delta_7d': True}


In [46]:
params['min_data_in_leaf'] = 150   # 120 → 150
params['lambda_l2'] = 3.0          # 2.0 → 3.0

In [47]:
# =========================
# 5R. 잔차 학습 + (a,b) 블렌딩 + 하드 빌딩 가중치
# =========================
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def rmse(y, yhat):
    y, yhat = np.asarray(y), np.asarray(yhat)
    return float(np.sqrt(mean_squared_error(y, yhat)))

# --- 검증 분할(재확인) ---
VAL_START = pd.Timestamp(2024,8,17,0)
VAL_END   = pd.Timestamp(2024,8,24,23)
is_val = (train_feat['dt'] >= VAL_START) & (train_feat['dt'] <= VAL_END)
train_part = train_feat[~is_val].copy()
valid_part = train_feat[ is_val].copy()

# --- 피처 선택 (안전하게 reindex) ---
features = get_feature_cols(train_part)  # 기존 함수 사용
cat_cols = [c for c in ['건물번호','건물유형'] if c in features]
for c in cat_cols:
    train_part[c] = train_part[c].astype('category')
    valid_part[c] = valid_part[c].astype('category')

X_tr = train_part.reindex(columns=features).replace([np.inf,-np.inf], np.nan)
X_va = valid_part.reindex(columns=features).replace([np.inf,-np.inf], np.nan)
y_tr = train_part['전력소비량(kWh)']
y_va = valid_part['전력소비량(kWh)']

# --- 베이스라인 → 잔차 타깃 ---
baseline_tr = build_baseline(train_part)
baseline_va = build_baseline(valid_part)
y_tr_resid = (y_tr - baseline_tr).astype(float)
y_va_resid = (y_va - baseline_va).astype(float)

# --- 하드 빌딩 가중치 (Top-10 위주) ---
HARD_BLD = {10,79,3,45,23,1,12,69,64,34}  # 필요시 수정
w_tr = np.ones(len(train_part), dtype=float)
w_tr[train_part['건물번호'].isin(HARD_BLD).values] = 1.8  # 1.5~2.0 사이 시도

# --- LightGBM 학습 (보수적 규제) ---
params = {
    'objective':'regression','metric':'rmse','learning_rate':0.05,
    'num_leaves':64,'feature_fraction':0.85,'bagging_fraction':0.85,'bagging_freq':1,
    'min_data_in_leaf':120,'lambda_l2':2.0,'seed':42,'verbosity':-1,'num_threads':4
}
lgb_train = lgb.Dataset(X_tr, label=y_tr_resid, weight=w_tr, categorical_feature=cat_cols or None)
lgb_valid = lgb.Dataset(X_va, label=y_va_resid, categorical_feature=cat_cols or None)

model = lgb.train(
    params, lgb_train, num_boost_round=5000,
    valid_sets=[lgb_train,lgb_valid], valid_names=['train','valid'],
    callbacks=[lgb.early_stopping(200), lgb.log_evaluation(400)]
)

# --- (a,b) 블렌딩: y ≈ base + a*resid_pred + b ---
pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)
true_resid_va = (y_va - baseline_va).values

A = np.column_stack([pred_va_resid, np.ones_like(pred_va_resid)])
a, b = np.linalg.lstsq(A, true_resid_va, rcond=None)[0]
a = float(np.clip(a, 0.0, 1.5))   # 안정화
b = float(b)

pred_va = baseline_va.values + a * pred_va_resid + b

print(f"[Blending] a={a:.3f}, b={b:.3f}")
print(f"[Baseline] RMSE={rmse(y_va, baseline_va):.3f}  MAE={mean_absolute_error(y_va, baseline_va):.3f}")
print(f"[Blended ] RMSE={rmse(y_va, pred_va):.3f}  MAE={mean_absolute_error(y_va, pred_va):.3f}  R2={r2_score(y_va, pred_va):.4f}")

# --- 디사일 캘리브레이션 ---
df_cal = pd.DataFrame({"y": y_va.values, "yhat": pred_va})
df_cal["bin"] = pd.qcut(df_cal["yhat"], q=10, duplicates="drop")
cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))
cal["bias"] = cal["yhat_mean"] - cal["y_mean"]
print("\n[Calibration by decile]\n", cal)
print("평균 절대 바이어스:", cal["bias"].abs().mean())

# --- Top-10 by 건물번호 (블렌딩 기준) ---
va_err = valid_part[['건물번호']].copy()
va_err['se_blend'] = (y_va.values - pred_va)**2
print("\n[Top-10 by 건물번호]\n",
      va_err.groupby('건물번호')['se_blend'].mean().pow(0.5).sort_values(ascending=False).head(10))

# 다음 단계에서 6R에서 재사용할 수 있게 보관
AB_BLEND = (a, b)
BEST_NUM_BOOST = int(model.best_iteration or 2000)


  cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]


Training until validation scores don't improve for 200 rounds
[400]	train's rmse: 191.932	valid's rmse: 250.611
[800]	train's rmse: 158.815	valid's rmse: 239.59
[1200]	train's rmse: 140.375	valid's rmse: 234.776
[1600]	train's rmse: 127.489	valid's rmse: 232.896
[2000]	train's rmse: 117.524	valid's rmse: 232.333
Early stopping, best iteration is:
[2090]	train's rmse: 115.572	valid's rmse: 231.904
[Blending] a=1.015, b=-12.297
[Baseline] RMSE=1008.062  MAE=549.663
[Blended ] RMSE=231.127  MAE=111.847  R2=0.9964

[Calibration by decile]
                              y_mean     yhat_mean     n       bias
bin                                                               
(-13.572, 767.288]       479.966969    460.802462  1920 -19.164507
(767.288, 1143.73]       988.190281    972.225393  1920 -15.964888
(1143.73, 1509.149]     1331.591286   1312.078026  1920 -19.513260
(1509.149, 1809.44]     1693.329979   1672.495501  1920 -20.834479
(1809.44, 2173.781]     1980.010083   1969.816649  1920 

  cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))
  va_err.groupby('건물번호')['se_blend'].mean().pow(0.5).sort_values(ascending=False).head(10))


In [48]:
# =========================
# 6R. 최종 재학습 (a,b 블렌딩 적용)  *제출은 옵션*
# =========================
DO_SUBMIT = False  # ← 기본은 예측/저장 비활성화. 필요할 때만 True로.

# all_feat이 이미 만들어져 있다면 재사용, 아니면 생성
try:
    _ = all_feat  # 존재 체크
except NameError:
    # test_orig/build_df/read_csv_smart/clean_capacity/ensure_datetime_cols 등이 준비되어 있어야 함
    test_orig = read_csv_smart(TEST_PATH)
    build_df  = read_csv_smart(BUILD_PATH)
    test_df = pd.merge(test_orig, build_df, on='건물번호', how='left')
    test_df = clean_capacity_fields(test_df)
    test_df = ensure_datetime_cols(test_df)
    all_df  = pd.concat([train_df, test_df], ignore_index=True)
    # (옵션) 일사/일조 백필을 썼다면 여기서 호출
    # all_df = backfill_solar_by_time(all_df)
    all_feat = make_features(all_df)

all_feat_train = all_feat.iloc[:len(train_df)].copy()
all_feat_test  = all_feat.iloc[len(train_df):].copy()

features_full = get_feature_cols(all_feat_train)
cat_cols_full = [c for c in ['건물번호','건물유형'] if c in features_full]
for c in cat_cols_full:
    all_feat_train[c] = all_feat_train[c].astype('category')
    all_feat_test[c]  = all_feat_test[c].astype('category')

X_full = all_feat_train.reindex(columns=features_full).replace([np.inf,-np.inf], np.nan)
y_full = all_feat_train['전력소비량(kWh)']

baseline_full = build_baseline(all_feat_train)
baseline_te   = build_baseline(all_feat_test)

y_full_resid = (y_full - baseline_full).astype(float)
lgb_full = lgb.Dataset(X_full, label=y_full_resid, categorical_feature=cat_cols_full or None)

# 검증에서 쓴 하이퍼파라미터/부스트 수 재사용
params_full = params.copy()
final_model = lgb.train(params_full, lgb_full, num_boost_round=BEST_NUM_BOOST)

# 테스트 잔차 예측 + (a,b) 블렌딩 복원
if DO_SUBMIT:
    X_te = all_feat_test.reindex(columns=features_full).replace([np.inf,-np.inf], np.nan)
    test_pred_resid = final_model.predict(X_te, num_iteration=final_model.best_iteration)
    a_use, b_use = AB_BLEND if 'AB_BLEND' in globals() else (1.0, 0.0)
    test_pred = baseline_te.values + a_use * test_pred_resid + b_use

    sub = read_csv_smart(SAMPLE_SUB)
    sub['answer'] = pd.Series(test_pred, index=sub.index)
    sub.to_csv(OUT_SUB, index=False, encoding='utf-8-sig')
    print(f"저장 완료 → {OUT_SUB}")
else:
    print("제출 생략: DO_SUBMIT=False (모델만 재학습 완료)")


  cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]


제출 생략: DO_SUBMIT=False (모델만 재학습 완료)


In [49]:
# =========================
# 5R. Residual + Hard-Building Weights + Calibration Selection
# =========================
import numpy as np, pandas as pd, lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.isotonic import IsotonicRegression

def RMSE(y, yhat): return float(np.sqrt(mean_squared_error(y, yhat)))

# 0) 피처 새로고침(위 셀에서 train_feat 이미 있으면 재사용)
try:
    _ = train_feat
except NameError:
    train_feat = make_features(train_df)

# 1) 검증 분할
VAL_START = pd.Timestamp(2024,8,17,0)
VAL_END   = pd.Timestamp(2024,8,24,23)
is_val = (train_feat['dt'] >= VAL_START) & (train_feat['dt'] <= VAL_END)
train_part = train_feat[~is_val].copy()
valid_part = train_feat[ is_val].copy()

# 2) 피처 선택 + 카테고리
features = get_feature_cols(train_part)
cat_cols = [c for c in ['건물번호','건물유형'] if c in features]
for c in cat_cols:
    train_part[c] = train_part[c].astype('category')
    valid_part[c] = valid_part[c].astype('category')

X_tr = train_part.reindex(columns=features).replace([np.inf,-np.inf], np.nan)
X_va = valid_part.reindex(columns=features).replace([np.inf,-np.inf], np.nan)
y_tr = train_part['전력소비량(kWh)']
y_va = valid_part['전력소비량(kWh)']

# 3) 베이스라인 → 잔차 타깃
baseline_tr = build_baseline(train_part)
baseline_va = build_baseline(valid_part)
y_tr_resid = (y_tr - baseline_tr).astype(float)
y_va_resid = (y_va - baseline_va).astype(float)

# 4) 하드 빌딩 가중치 (필요시 수정)
HARD_BLD = {10,79,3,45,23,1,12,69,64,34}
w_tr = np.ones(len(train_part), dtype=float)
w_tr[train_part['건물번호'].isin(HARD_BLD).values] = 1.8  # 1.5~2.0 사이 튠

# 5) LGB 학습(보수적 규제)
params = {
    'objective':'regression','metric':'rmse','learning_rate':0.05,
    'num_leaves':64,'feature_fraction':0.85,'bagging_fraction':0.85,'bagging_freq':1,
    'min_data_in_leaf':120,'lambda_l2':2.0,'seed':42,'verbosity':-1,'num_threads':4
}
lgb_train = lgb.Dataset(X_tr, label=y_tr_resid, weight=w_tr, categorical_feature=cat_cols or None)
lgb_valid = lgb.Dataset(X_va, label=y_va_resid, categorical_feature=cat_cols or None)
model = lgb.train(
    params, lgb_train, num_boost_round=5000,
    valid_sets=[lgb_train,lgb_valid], valid_names=['train','valid'],
    callbacks=[lgb.early_stopping(200), lgb.log_evaluation(400)]
)

# 6) 보정 후보 계산
pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)
true_resid_va = (y_va - baseline_va).values

# (A) 선형(a,b): y ≈ base + a*pred + b
A = np.column_stack([pred_va_resid, np.ones_like(pred_va_resid)])
a, b = np.linalg.lstsq(A, true_resid_va, rcond=None)[0]
a = float(np.clip(a, 0.0, 1.5)); b = float(b)
pred_ab = baseline_va.values + a * pred_va_resid + b
sc_ab = (RMSE(y_va, pred_ab), mean_absolute_error(y_va, pred_ab), r2_score(y_va, pred_ab))

# (B) Isotonic: y ≈ base + g(pred)
iso = IsotonicRegression(out_of_bounds='clip')
iso.fit(pred_va_resid, true_resid_va)
pred_iso = baseline_va.values + iso.transform(pred_va_resid)
sc_iso = (RMSE(y_va, pred_iso), mean_absolute_error(y_va, pred_iso), r2_score(y_va, pred_iso))

# (B+) Isotonic + 빌딩 평균 잔차 보정(γ)
# (B+) Isotonic + 빌딩 평균 잔차 보정(γ)
gamma = 0.5  # 0.3~0.7 시도 가능

train_resid = (y_tr - baseline_tr).values

# 빌딩별 train 잔차 평균 (observed=False로 경고 제거)
bld_bias_map = (
    pd.Series(train_resid, index=train_part.index)
      .groupby(train_part['건물번호'], observed=False)
      .mean()
)

# 인덱스 타입 통일(int)
if isinstance(getattr(bld_bias_map.index, 'dtype', None), pd.CategoricalDtype):
    bld_bias_map.index = bld_bias_map.index.astype('int64')

# valid 건물번호도 int로 변환 후 dict 매핑 → float → NaN을 0.0으로
bld_bias_dict = bld_bias_map.to_dict()
bld_codes_va = valid_part['건물번호'].astype('int64')
bias_va = bld_codes_va.map(bld_bias_dict).astype('float64').fillna(0.0).to_numpy()

pred_iso_bias = pred_iso + gamma * bias_va
sc_iso_bias = (RMSE(y_va, pred_iso_bias), mean_absolute_error(y_va, pred_iso_bias), r2_score(y_va, pred_iso_bias))


# 7) 베스트 보정 선택
cands = {
    "AB":        (pred_ab,       sc_ab,       {"a":a,"b":b}),
    "ISO":       (pred_iso,      sc_iso,      {"iso":iso}),
    "ISO+BLD":   (pred_iso_bias, sc_iso_bias, {"iso":iso,"gamma":gamma,"bld_bias_map":bld_bias_map}),
}
best_name, (pred_best, (rmse_best, mae_best, r2_best), params_best) = min(
    cands.items(), key=lambda kv: kv[1][1][0]
)

print(f"[Calibration] best={best_name}  RMSE={rmse_best:.3f}  MAE={mae_best:.3f}  R2={r2_best:.4f}")
print(f"  - AB scores : RMSE={sc_ab[0]:.3f}, ISO : {sc_iso[0]:.3f}, ISO+BLD : {sc_iso_bias[0]:.3f}")
if best_name == "AB":
    print(f"  - a={params_best['a']:.3f}, b={params_best['b']:.3f}")
elif best_name.startswith("ISO"):
    print(f"  - gamma={params_best.get('gamma','-')} (ISO 객체 저장됨)")

# 8) 디사일 캘리브레이션 & Top-10
df_cal = pd.DataFrame({"y": y_va.values, "yhat": pred_best})
df_cal["bin"] = pd.qcut(df_cal["yhat"], q=10, duplicates="drop")
cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))
cal["bias"] = cal["yhat_mean"] - cal["y_mean"]
print("\n[Calibration by decile]\n", cal)
print("평균 절대 바이어스:", cal["bias"].abs().mean())

va_err = valid_part[['건물번호']].copy()
va_err['se_blend'] = (y_va.values - pred_best)**2
print("\n[Top-10 by 건물번호]\n",
      va_err.groupby('건물번호')['se_blend'].mean().pow(0.5).sort_values(ascending=False).head(10))

# 9) 6단계에서 재사용할 아티팩트 저장(전역)
CALIB_MODE   = best_name
CALIB_PARAMS = params_best
BEST_NUM_BOOST = int(model.best_iteration or 2000)
FEATURE_LIST = features
CAT_COLS     = cat_cols
LGBM_PARAMS  = params
print("\n[Saved] CALIB_MODE, CALIB_PARAMS, BEST_NUM_BOOST, FEATURE_LIST, CAT_COLS, LGBM_PARAMS")


  cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]


Training until validation scores don't improve for 200 rounds
[400]	train's rmse: 191.932	valid's rmse: 250.611
[800]	train's rmse: 158.815	valid's rmse: 239.59
[1200]	train's rmse: 140.375	valid's rmse: 234.776
[1600]	train's rmse: 127.489	valid's rmse: 232.896
[2000]	train's rmse: 117.524	valid's rmse: 232.333
Early stopping, best iteration is:
[2090]	train's rmse: 115.572	valid's rmse: 231.904
[Calibration] best=ISO  RMSE=222.850  MAE=109.444  R2=0.9967
  - AB scores : RMSE=231.127, ISO : 222.850, ISO+BLD : 224.497
  - gamma=- (ISO 객체 저장됨)

[Calibration by decile]
                              y_mean     yhat_mean     n       bias
bin                                                               
(-0.19, 773.573]         480.713370    468.545060  1920 -12.168310
(773.573, 1150.179]      987.890062    977.938907  1920  -9.951155
(1150.179, 1514.157]    1330.305422   1317.805263  1920 -12.500158
(1514.157, 1813.01]     1695.009984   1676.481321  1920 -18.528663
(1813.01, 2166.778]    

  cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))
  va_err.groupby('건물번호')['se_blend'].mean().pow(0.5).sort_values(ascending=False).head(10))


In [50]:
# =========================
# 5R. Residual + Hard-Building Weights + Calibration Selection
# =========================
import numpy as np, pandas as pd, lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.isotonic import IsotonicRegression

def RMSE(y, yhat): return float(np.sqrt(mean_squared_error(y, yhat)))

# 0) 피처 새로고침(위 셀에서 train_feat 이미 있으면 재사용)
try:
    _ = train_feat
except NameError:
    train_feat = make_features(train_df)

# 1) 검증 분할
VAL_START = pd.Timestamp(2024,8,17,0)
VAL_END   = pd.Timestamp(2024,8,24,23)
is_val = (train_feat['dt'] >= VAL_START) & (train_feat['dt'] <= VAL_END)
train_part = train_feat[~is_val].copy()
valid_part = train_feat[ is_val].copy()

# 2) 피처 선택 + 카테고리
features = get_feature_cols(train_part)
cat_cols = [c for c in ['건물번호','건물유형'] if c in features]
for c in cat_cols:
    train_part[c] = train_part[c].astype('category')
    valid_part[c] = valid_part[c].astype('category')

X_tr = train_part.reindex(columns=features).replace([np.inf,-np.inf], np.nan)
X_va = valid_part.reindex(columns=features).replace([np.inf,-np.inf], np.nan)
y_tr = train_part['전력소비량(kWh)']
y_va = valid_part['전력소비량(kWh)']

# 3) 베이스라인 → 잔차 타깃
baseline_tr = build_baseline(train_part)
baseline_va = build_baseline(valid_part)
y_tr_resid = (y_tr - baseline_tr).astype(float)
y_va_resid = (y_va - baseline_va).astype(float)

# 4) 하드 빌딩 가중치 (필요시 수정)
HARD_BLD = {10,79,3,45,23,1,12,69,64,34}
w_tr = np.ones(len(train_part), dtype=float)
w_tr[train_part['건물번호'].isin(HARD_BLD).values] = 1.8  # 1.5~2.0 사이 튠

# 5) LGB 학습(보수적 규제)
params = {
    'objective':'regression','metric':'rmse','learning_rate':0.05,
    'num_leaves':48,'feature_fraction':0.85,'bagging_fraction':0.85,'bagging_freq':1,
    'min_data_in_leaf':150,'lambda_l2':3.0,'seed':42,'verbosity':-1,'num_threads':4
}
lgb_train = lgb.Dataset(X_tr, label=y_tr_resid, weight=w_tr, categorical_feature=cat_cols or None)
lgb_valid = lgb.Dataset(X_va, label=y_va_resid, categorical_feature=cat_cols or None)
model = lgb.train(
    params, lgb_train, num_boost_round=5000,
    valid_sets=[lgb_train,lgb_valid], valid_names=['train','valid'],
    callbacks=[lgb.early_stopping(200), lgb.log_evaluation(400)]
)

# 6) 보정 후보 계산
pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)
true_resid_va = (y_va - baseline_va).values

# (A) 선형(a,b): y ≈ base + a*pred + b
A = np.column_stack([pred_va_resid, np.ones_like(pred_va_resid)])
a, b = np.linalg.lstsq(A, true_resid_va, rcond=None)[0]
a = float(np.clip(a, 0.0, 1.5)); b = float(b)
pred_ab = baseline_va.values + a * pred_va_resid + b
sc_ab = (RMSE(y_va, pred_ab), mean_absolute_error(y_va, pred_ab), r2_score(y_va, pred_ab))

# (B) Isotonic: y ≈ base + g(pred)
iso = IsotonicRegression(out_of_bounds='clip')
iso.fit(pred_va_resid, true_resid_va)
pred_iso = baseline_va.values + iso.transform(pred_va_resid)
sc_iso = (RMSE(y_va, pred_iso), mean_absolute_error(y_va, pred_iso), r2_score(y_va, pred_iso))

# (B+) Isotonic + 빌딩 평균 잔차 보정(γ)
# (B+) Isotonic + 빌딩 평균 잔차 보정(γ)
gamma = 0.5  # 0.3~0.7 시도 가능

train_resid = (y_tr - baseline_tr).values

# 빌딩별 train 잔차 평균 (observed=False로 경고 제거)
bld_bias_map = (
    pd.Series(train_resid, index=train_part.index)
      .groupby(train_part['건물번호'], observed=False)
      .mean()
)

# 인덱스 타입 통일(int)
if isinstance(getattr(bld_bias_map.index, 'dtype', None), pd.CategoricalDtype):
    bld_bias_map.index = bld_bias_map.index.astype('int64')

# valid 건물번호도 int로 변환 후 dict 매핑 → float → NaN을 0.0으로
bld_bias_dict = bld_bias_map.to_dict()
bld_codes_va = valid_part['건물번호'].astype('int64')
bias_va = bld_codes_va.map(bld_bias_dict).astype('float64').fillna(0.0).to_numpy()

pred_iso_bias = pred_iso + gamma * bias_va
sc_iso_bias = (RMSE(y_va, pred_iso_bias), mean_absolute_error(y_va, pred_iso_bias), r2_score(y_va, pred_iso_bias))


# 7) 베스트 보정 선택
cands = {
    "AB":        (pred_ab,       sc_ab,       {"a":a,"b":b}),
    "ISO":       (pred_iso,      sc_iso,      {"iso":iso}),
    "ISO+BLD":   (pred_iso_bias, sc_iso_bias, {"iso":iso,"gamma":gamma,"bld_bias_map":bld_bias_map}),
}
best_name, (pred_best, (rmse_best, mae_best, r2_best), params_best) = min(
    cands.items(), key=lambda kv: kv[1][1][0]
)

print(f"[Calibration] best={best_name}  RMSE={rmse_best:.3f}  MAE={mae_best:.3f}  R2={r2_best:.4f}")
print(f"  - AB scores : RMSE={sc_ab[0]:.3f}, ISO : {sc_iso[0]:.3f}, ISO+BLD : {sc_iso_bias[0]:.3f}")
if best_name == "AB":
    print(f"  - a={params_best['a']:.3f}, b={params_best['b']:.3f}")
elif best_name.startswith("ISO"):
    print(f"  - gamma={params_best.get('gamma','-')} (ISO 객체 저장됨)")

# 8) 디사일 캘리브레이션 & Top-10
df_cal = pd.DataFrame({"y": y_va.values, "yhat": pred_best})
df_cal["bin"] = pd.qcut(df_cal["yhat"], q=10, duplicates="drop")
cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))
cal["bias"] = cal["yhat_mean"] - cal["y_mean"]
print("\n[Calibration by decile]\n", cal)
print("평균 절대 바이어스:", cal["bias"].abs().mean())

va_err = valid_part[['건물번호']].copy()
va_err['se_blend'] = (y_va.values - pred_best)**2
print("\n[Top-10 by 건물번호]\n",
      va_err.groupby('건물번호')['se_blend'].mean().pow(0.5).sort_values(ascending=False).head(10))

# 9) 6단계에서 재사용할 아티팩트 저장(전역)
CALIB_MODE   = best_name
CALIB_PARAMS = params_best
BEST_NUM_BOOST = int(model.best_iteration or 2000)
FEATURE_LIST = features
CAT_COLS     = cat_cols
LGBM_PARAMS  = params
print("\n[Saved] CALIB_MODE, CALIB_PARAMS, BEST_NUM_BOOST, FEATURE_LIST, CAT_COLS, LGBM_PARAMS")


  cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]


Training until validation scores don't improve for 200 rounds
[400]	train's rmse: 205.222	valid's rmse: 252.457
[800]	train's rmse: 172.089	valid's rmse: 238.118
[1200]	train's rmse: 153.767	valid's rmse: 233.093
[1600]	train's rmse: 141.129	valid's rmse: 230.623
[2000]	train's rmse: 131.572	valid's rmse: 229.91
[2400]	train's rmse: 123.665	valid's rmse: 229.166
[2800]	train's rmse: 117.054	valid's rmse: 228.757
Early stopping, best iteration is:
[2774]	train's rmse: 117.47	valid's rmse: 228.635
[Calibration] best=ISO  RMSE=220.547  MAE=107.776  R2=0.9967
  - AB scores : RMSE=228.022, ISO : 220.547, ISO+BLD : 222.327
  - gamma=- (ISO 객체 저장됨)

[Calibration by decile]
                              y_mean     yhat_mean     n       bias
bin                                                               
(-4.404, 773.014]        480.723703    470.111125  1920 -10.612578
(773.014, 1153.43]       987.781547    979.375712  1920  -8.405835
(1153.43, 1516.8]       1330.462797   1320.860218  1920 

  cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))
  va_err.groupby('건물번호')['se_blend'].mean().pow(0.5).sort_values(ascending=False).head(10))


In [51]:
# =========================
# 5R. Residual + Hard-Building Weights + Calibration Selection
# =========================
import numpy as np, pandas as pd, lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.isotonic import IsotonicRegression

def RMSE(y, yhat): return float(np.sqrt(mean_squared_error(y, yhat)))

# 0) 피처 새로고침(위 셀에서 train_feat 이미 있으면 재사용)
try:
    _ = train_feat
except NameError:
    train_feat = make_features(train_df)

# 1) 검증 분할
VAL_START = pd.Timestamp(2024,8,17,0)
VAL_END   = pd.Timestamp(2024,8,24,23)
is_val = (train_feat['dt'] >= VAL_START) & (train_feat['dt'] <= VAL_END)
train_part = train_feat[~is_val].copy()
valid_part = train_feat[ is_val].copy()

# 2) 피처 선택 + 카테고리
features = get_feature_cols(train_part)
cat_cols = [c for c in ['건물번호','건물유형'] if c in features]
for c in cat_cols:
    train_part[c] = train_part[c].astype('category')
    valid_part[c] = valid_part[c].astype('category')

X_tr = train_part.reindex(columns=features).replace([np.inf,-np.inf], np.nan)
X_va = valid_part.reindex(columns=features).replace([np.inf,-np.inf], np.nan)
y_tr = train_part['전력소비량(kWh)']
y_va = valid_part['전력소비량(kWh)']

# 3) 베이스라인 → 잔차 타깃
baseline_tr = build_baseline(train_part)
baseline_va = build_baseline(valid_part)
y_tr_resid = (y_tr - baseline_tr).astype(float)
y_va_resid = (y_va - baseline_va).astype(float)

# 4) 하드 빌딩 가중치 (필요시 수정)
HARD_BLD = {10,79,3,45,23,1,12,69,64,34}
w_tr = np.ones(len(train_part), dtype=float)
w_tr[train_part['건물번호'].isin(HARD_BLD).values] = 1.8  # 1.5~2.0 사이 튠

# 5) LGB 학습(보수적 규제)
params = {
    'objective':'regression','metric':'rmse','learning_rate':0.05,
    'num_leaves':48,'feature_fraction':0.85,'bagging_fraction':0.85,'bagging_freq':1,
    'min_data_in_leaf':150,'lambda_l2':3.0,'seed':42,'verbosity':-1,'num_threads':4
}
lgb_train = lgb.Dataset(X_tr, label=y_tr_resid, weight=w_tr, categorical_feature=cat_cols or None)
lgb_valid = lgb.Dataset(X_va, label=y_va_resid, categorical_feature=cat_cols or None)
model = lgb.train(
    params, lgb_train, num_boost_round=5000,
    valid_sets=[lgb_train,lgb_valid], valid_names=['train','valid'],
    callbacks=[lgb.early_stopping(200), lgb.log_evaluation(400)]
)

# =========================
# 6)~9) 보정/선택/리포트 (롤백/고정 버전)
# =========================
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def RMSE(y, yhat): return float(np.sqrt(mean_squared_error(y, yhat)))

# --- (옵션) 하드 빌딩 가중치 '고정'
HARD_BLD = {10,79,3,45,23,1,12,69,64,34}  # 이전에 잘 나왔던 셋으로 고정
# w_tr은 위 학습 셀에서 이미 이 셋 기준으로 적용된 상태면 그대로 두면 됨

pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)
true_resid_va = (y_va - baseline_va).values

# (A) 선형 보정: y ≈ base + a*pred + b  (참고용)
A = np.column_stack([pred_va_resid, np.ones_like(pred_va_resid)])
a, b = np.linalg.lstsq(A, true_resid_va, rcond=None)[0]
a = float(np.clip(a, 0.0, 1.5)); b = float(b)
pred_ab = baseline_va.values + a * pred_va_resid + b
sc_ab = (RMSE(y_va, pred_ab), mean_absolute_error(y_va, pred_ab), r2_score(y_va, pred_ab))

# (B) ISO(플레인): 잔차 그대로로 적합(클리핑/바이어스 보정 없음)
iso = IsotonicRegression(out_of_bounds='clip')
iso.fit(pred_va_resid, true_resid_va)
pred_iso = baseline_va.values + iso.transform(pred_va_resid)
sc_iso = (RMSE(y_va, pred_iso), mean_absolute_error(y_va, pred_iso), r2_score(y_va, pred_iso))

# --- 강제 선택: ISO(플레인)
best_name = "ISO"
pred_best = pred_iso
rmse_best, mae_best, r2_best = sc_iso
params_best = {"iso": iso}

print(f"[Calibration] forced={best_name}  RMSE={rmse_best:.3f}  MAE={mae_best:.3f}  R2={r2_best:.4f}")
print(f"  - AB RMSE={sc_ab[0]:.3f} | ISO RMSE={sc_iso[0]:.3f}")

# --- 디사일 캘리브레이션 & Top-10
df_cal = pd.DataFrame({"y": y_va.values, "yhat": pred_best})
df_cal["bin"] = pd.qcut(df_cal["yhat"], q=10, duplicates="drop")
cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))
cal["bias"] = cal["yhat_mean"] - cal["y_mean"]
print("\n[Calibration by decile]\n", cal)
print("평균 절대 바이어스:", cal["bias"].abs().mean())

va_err = valid_part[['건물번호']].copy()
va_err['se_blend'] = (y_va.values - pred_best)**2
top10 = va_err.groupby('건물번호')['se_blend'].mean().pow(0.5).sort_values(ascending=False).head(10)
print("\n[Top-10 by 건물번호]\n", top10)

# --- 다음 런용 HARD_BLD '자동 갱신 끄기' (그대로 유지)
# (아무 것도 갱신하지 않음)

# --- 6R용 아티팩트 저장
CALIB_MODE     = best_name
CALIB_PARAMS   = params_best
BEST_NUM_BOOST = int(model.best_iteration or 2000)
FEATURE_LIST   = features
CAT_COLS       = cat_cols
LGBM_PARAMS    = params
print("\n[Saved] CALIB_MODE=ISO (plain), BEST_NUM_BOOST, FEATURE_LIST, CAT_COLS, LGBM_PARAMS")


  cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]


Training until validation scores don't improve for 200 rounds
[400]	train's rmse: 205.222	valid's rmse: 252.457
[800]	train's rmse: 172.089	valid's rmse: 238.118
[1200]	train's rmse: 153.767	valid's rmse: 233.093
[1600]	train's rmse: 141.129	valid's rmse: 230.623
[2000]	train's rmse: 131.572	valid's rmse: 229.91
[2400]	train's rmse: 123.665	valid's rmse: 229.166
[2800]	train's rmse: 117.054	valid's rmse: 228.757
Early stopping, best iteration is:
[2774]	train's rmse: 117.47	valid's rmse: 228.635
[Calibration] forced=ISO  RMSE=220.547  MAE=107.776  R2=0.9967
  - AB RMSE=228.022 | ISO RMSE=220.547

[Calibration by decile]
                              y_mean     yhat_mean     n       bias
bin                                                               
(-4.404, 773.014]        480.723703    470.111125  1920 -10.612578
(773.014, 1153.43]       987.781547    979.375712  1920  -8.405835
(1153.43, 1516.8]       1330.462797   1320.860218  1920  -9.602579
(1516.8, 1815.118]      1693.607271 

  cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))
  top10 = va_err.groupby('건물번호')['se_blend'].mean().pow(0.5).sort_values(ascending=False).head(10)


In [52]:
# =========================
# 6R. Full Retrain & Submit (feature alignment fix)
# =========================
DO_SUBMIT = True  # 필요시 False

# 0) all_feat 준비(있으면 재사용)
try:
    _ = all_feat
except NameError:
    test_orig = read_csv_smart(TEST_PATH)
    build_df  = read_csv_smart(BUILD_PATH)
    test_df = pd.merge(test_orig, build_df, on='건물번호', how='left')
    test_df = clean_capacity_fields(test_df)
    test_df = ensure_datetime_cols(test_df)
    all_df  = pd.concat([train_df, test_df], ignore_index=True)
    all_feat = make_features(all_df)

all_feat_train = all_feat.iloc[:len(train_df)].copy()
all_feat_test  = all_feat.iloc[len(train_df):].copy()

# === 핵심: 5R에서 저장한 FEATURE_LIST/CAT_COLS를 반드시 사용 ===
# (혹시 변수 없으면 5R의 model에서 가져오고 교집합만 사용)
if 'FEATURE_LIST' not in globals():
    FEATURE_LIST = list(model.feature_name())
if 'CAT_COLS' not in globals():
    CAT_COLS = [c for c in ['건물번호','건물유형'] if c in FEATURE_LIST]

# 안전: train/test에 없는 컬럼 제거 (교집합 유지)
common_feats = [c for c in FEATURE_LIST if c in all_feat_train.columns and c in all_feat_test.columns]
if len(common_feats) != len(FEATURE_LIST):
    missing = set(FEATURE_LIST) - set(common_feats)
    print("⚠️ 다음 피처가 test/train에 없어 제외됨:", sorted(list(missing)))
FEATURE_LIST = common_feats
CAT_COLS = [c for c in CAT_COLS if c in FEATURE_LIST]

# 정렬/결측 처리 유틸
def align_for_lgb(df_train, df_test, feature_list, cat_cols):
    X_tr = df_train.reindex(columns=feature_list)
    X_te = df_test.reindex(columns=feature_list)

    # 카테고리 먼저 맞춰주기
    for c in (cat_cols or []):
        X_tr[c] = X_tr[c].astype('category')
        X_te[c] = X_te[c].astype('category')
        cats = X_tr[c].cat.categories
        X_te[c] = X_te[c].cat.set_categories(cats)

    # 수치 컬럼 결측/inf 처리
    num_cols = [c for c in feature_list if c not in (cat_cols or [])]
    X_tr[num_cols] = (X_tr[num_cols]
                      .replace([np.inf, -np.inf], np.nan)
                      .astype(float)
                      .fillna(0.0))
    X_te[num_cols] = (X_te[num_cols]
                      .replace([np.inf, -np.inf], np.nan)
                      .astype(float)
                      .fillna(0.0))
    return X_tr, X_te

# 1) 정렬된 학습/예측 행렬 만들기
features_full = FEATURE_LIST
cat_cols_full = CAT_COLS
X_full, X_te = align_for_lgb(all_feat_train, all_feat_test, features_full, cat_cols_full)

# 안전 체크
assert list(X_full.columns) == list(X_te.columns), "train/test 피처 순서 불일치"
print(f"n_features (train/test): {X_full.shape[1]} / {X_te.shape[1]}")

# 2) 전체 재학습(잔차 타깃)
y_full = all_feat_train['전력소비량(kWh)']
baseline_full = build_baseline(all_feat_train)
baseline_te   = build_baseline(all_feat_test)

w_full = np.ones(len(all_feat_train), dtype=float)
try:
    # 5R에서 썼던 HARD_BLD 그대로 있으면 재적용
    w_full[all_feat_train['건물번호'].isin(HARD_BLD).values] = 1.8
except NameError:
    pass

lgb_full = lgb.Dataset(
    X_full,
    label=(y_full - baseline_full).astype(float),
    weight=w_full,
    categorical_feature=cat_cols_full or None
)

final_model = lgb.train(LGBM_PARAMS, lgb_full, num_boost_round=BEST_NUM_BOOST)

# 3) 테스트 잔차 예측 + 5R에서 선택된 보정 적용
resid_te = final_model.predict(X_te, num_iteration=final_model.best_iteration)

if CALIB_MODE == "ISO":
    iso = CALIB_PARAMS["iso"]
    test_pred = baseline_te.values + iso.transform(resid_te)
elif CALIB_MODE == "AB":
    a, b = CALIB_PARAMS["a"], CALIB_PARAMS["b"]
    test_pred = baseline_te.values + a * resid_te + b
elif CALIB_MODE == "ISO+BLD":
    iso   = CALIB_PARAMS["iso"]
    gamma = CALIB_PARAMS["gamma"]
    bmap  = CALIB_PARAMS["bld_bias_map"]
    if isinstance(getattr(bmap.index, 'dtype', None), pd.CategoricalDtype):
        bmap.index = bmap.index.astype('int64')
    bias_te = all_feat_test['건물번호'].astype('int64').map(bmap.to_dict()).astype('float64').fillna(0.0).to_numpy()
    test_pred = baseline_te.values + iso.transform(resid_te) + gamma * bias_te
else:
    test_pred = baseline_te.values + resid_te  # fallback

# 안전 클램프
test_pred = np.clip(test_pred, 0, None)

if DO_SUBMIT:
    sub = read_csv_smart(SAMPLE_SUB)
    sub['answer'] = pd.Series(test_pred, index=sub.index)
    sub.to_csv(OUT_SUB, index=False, encoding='utf-8-sig')
    print(f"저장 완료 → {OUT_SUB}  (rows={len(sub)})")
else:
    print("제출 생략(DO_SUBMIT=False): 예측만 계산 완료")



n_features (train/test): 48 / 48
저장 완료 → C:\Users\user\Downloads\open (1)\baseline_lgbm_submission.csv  (rows=16800)


In [53]:
# =========================
# test 전처리: train과 '완전히 동일' 스키마/타입으로 맞추기
#  - building_info 병합
#  - 설비 용량 '-' → 0 (float)
#  - '일시' → '날짜','시간','dt' 생성
#  - 일사/일조 없으면 0.0 컬럼 추가
#  - train(merged_train.csv)를 기준으로 컬럼/순서/카테고리 일치
#  - 저장: merged_test.csv
# =========================
import pandas as pd
import numpy as np

# 경로
TRAIN_MERGED_PATH = r"C:\Users\user\Downloads\open (1)\merged_train.csv"  # 앞에서 저장한 train 전처리 결과
TEST_PATH         = r"C:\Users\user\Downloads\open (1)\test.csv"
BUILD_PATH        = r"C:\Users\user\Downloads\open (1)\building_info.csv"
OUT_TEST_PATH     = r"C:\Users\user\Downloads\open (1)\merged_test.csv"

# ----- 유틸 -----
def read_csv_smart(path, **kwargs):
    try:
        return pd.read_csv(path, encoding=kwargs.get('encoding', 'utf-8-sig'))
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding='cp949')

def clean_capacity_fields(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for c in ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c].replace('-', 0), errors='coerce').fillna(0.0).astype(float)
    return df

def ensure_datetime_cols(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if '날짜' not in df.columns or '시간' not in df.columns:
        if '일시' in df.columns:
            s = df['일시'].astype(str)
            df['날짜'] = pd.to_datetime(s.str.slice(0, 8), format='%Y%m%d')
            df['시간'] = s.str.slice(9, 11).astype(int)
        else:
            raise ValueError("날짜/시간 정보가 없습니다. ('일시' 또는 '날짜','시간' 필요)")
    else:
        df['날짜'] = pd.to_datetime(df['날짜'])
        df['시간'] = df['시간'].astype(int)
    df['dt'] = df['날짜'] + pd.to_timedelta(df['시간'], unit='h')
    return df

# ----- 1) 기준(train 전처리 결과) 스키마 로드 -----
train_ref = read_csv_smart(TRAIN_MERGED_PATH)
# 카테고리 정보가 있으면 유지 (건물번호/건물유형은 우리가 카테고리로 저장했을 가능성 큼)
for c in ['건물번호','건물유형']:
    if c in train_ref.columns:
        try:
            train_ref[c] = train_ref[c].astype('category')
        except Exception:
            pass

# ----- 2) test 로드 & train과 동일 전처리 -----
test = read_csv_smart(TEST_PATH)
build = read_csv_smart(BUILD_PATH)

test_merged = pd.merge(test, build, on='건물번호', how='left')
test_merged = clean_capacity_fields(test_merged)
test_merged = ensure_datetime_cols(test_merged)

# 일사/일조 없으면 0으로 생성(학습 일관성)
if '일사(MJ/m2)' not in test_merged.columns:
    test_merged['일사(MJ/m2)'] = 0.0
if '일조(hr)' not in test_merged.columns:
    test_merged['일조(hr)'] = 0.0

# ----- 3) 컬럼/순서/타입을 train에 '맞춰서' 정렬 -----
# target은 test에 없으므로 ref에서 제거
ref_cols = [c for c in train_ref.columns if c != '전력소비량(kWh)']

# ref에 있는데 test에 없는 컬럼은 dtype에 따라 기본값으로 생성
for c in ref_cols:
    if c not in test_merged.columns:
        ref_dtype = train_ref[c].dtype
        if np.issubdtype(ref_dtype, np.number):
            test_merged[c] = 0.0
        elif np.issubdtype(ref_dtype, np.datetime64):
            test_merged[c] = pd.NaT
        else:
            test_merged[c] = pd.Series([pd.NA]*len(test_merged), dtype="object")

# 반대로, test에만 있는 컬럼은 그대로 두되 저장/모델링 때는 ref_cols 순서만 사용
# 카테고리 일치(라벨 매칭)
for c in ['건물번호','건물유형']:
    if c in ref_cols and c in test_merged.columns:
        # train_ref 쪽 카테고리 있으면 세팅
        if str(train_ref[c].dtype) == 'category':
            cats = train_ref[c].astype('category').cat.categories
            test_merged[c] = test_merged[c].astype('category').cat.set_categories(cats)
        else:
            # 아닌 경우에도 최소한 dtype 통일
            test_merged[c] = test_merged[c].astype(train_ref[c].dtype)

# 최종 컬럼 순서 train 기준으로 정렬
test_aligned = test_merged.reindex(columns=ref_cols)

# ----- 4) 저장 -----
test_aligned.to_csv(OUT_TEST_PATH, index=False, encoding='utf-8-sig')
print(f"[OK] 저장: {OUT_TEST_PATH}  rows={len(test_aligned)}  cols={len(test_aligned.columns)}")
# (선택) sanity check
missing_after = [c for c in ref_cols if c not in test_aligned.columns]
if missing_after:
    print("⚠️ 아직 없는 컬럼:", missing_after)


[OK] 저장: C:\Users\user\Downloads\open (1)\merged_test.csv  rows=16800  cols=15


In [54]:
# =========================================
# Test 예측 (train과 동일 파이프라인, num_date_time 제외, factor 처리)
# =========================================
import numpy as np, pandas as pd, lightgbm as lgb

# --- 경로 (필요시 수정) ---
TRAIN_MERGED_PATH = r"C:\Users\user\Downloads\open (1)\merged_train.csv"
TEST_PATH         = r"C:\Users\user\Downloads\open (1)\merged_test.csv"
BUILD_PATH        = r"C:\Users\user\Downloads\open (1)\building_info.csv"
SAMPLE_SUB        = r"C:\Users\user\Downloads\open (1)\sample_submission.csv"
OUT_SUB           = r"C:\Users\user\Downloads\open (1)\submission.csv"

# --- 유틸 ---
def read_csv_smart(path, **kwargs):
    try:
        return pd.read_csv(path, encoding=kwargs.get('encoding', 'utf-8-sig'))
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding='cp949')

def align_for_lgb(df_tr, df_te, feature_list, cat_cols):
    """train/test를 같은 컬럼·순서·dtype으로 정렬"""
    X_tr = df_tr.reindex(columns=feature_list).copy()
    X_te = df_te.reindex(columns=feature_list).copy()
    # 카테고리 맞춤
    for c in (cat_cols or []):
        X_tr[c] = X_tr[c].astype('category')
        X_te[c] = X_te[c].astype('category').cat.set_categories(X_tr[c].cat.categories)
    # 수치 결측/inf 처리
    num_cols = [c for c in feature_list if c not in (cat_cols or [])]
    X_tr[num_cols] = (X_tr[num_cols].replace([np.inf, -np.inf], np.nan).astype(float).fillna(0.0))
    X_te[num_cols] = (X_te[num_cols].replace([np.inf, -np.inf], np.nan).astype(float).fillna(0.0))
    return X_tr, X_te

# --- 0) 데이터 로드 (train 전처리 결과 + test 전처리본) ---
train_df = read_csv_smart(TRAIN_MERGED_PATH)
test_raw = read_csv_smart(TEST_PATH)   # merged_test.csv (이미 병합된 파일)

# ✅ 둘 다 날짜/시간/dt를 '확실히' datetime으로 보정
train_df = ensure_datetime_cols(train_df)
test_df  = test_raw.copy()
test_df  = clean_capacity_fields(test_df)
test_df  = ensure_datetime_cols(test_df)

# (강화 가드: dt가 여전히 object면 한 번 더 강제 변환)
for df_ in (train_df, test_df):
    if 'dt' in df_.columns and not np.issubdtype(df_['dt'].dtype, np.datetime64):
        df_['날짜'] = pd.to_datetime(df_['날짜'])
        df_['시간'] = df_['시간'].astype(int)
        df_['dt']  = df_['날짜'] + pd.to_timedelta(df_['시간'], unit='h')

# --- 2) train+test 합쳐 동일 피처 생성 ---
all_df   = pd.concat([train_df, test_df], ignore_index=True)
all_feat = make_features(all_df)  # 이제 .dt 접근 OK

# --- 3) 피처 목록: 5R에서 저장한 FEATURE_LIST 우선, 없으면 생성 ---
EXCLUDE = {'전력소비량(kWh)','dt','날짜','시간','일시','num_date_time'}
if 'FEATURE_LIST' not in globals():
    from pandas.api.types import is_numeric_dtype, is_categorical_dtype
    FEATURE_LIST = [c for c in all_feat_train.columns if c not in EXCLUDE
                    and (is_numeric_dtype(all_feat_train[c]) or str(all_feat_train[c].dtype)=='category')]
else:
    # num_date_time 등 제외 보장
    FEATURE_LIST = [c for c in FEATURE_LIST if c not in EXCLUDE]

# 카테고리 지정(요구: 건물번호/건물유형 factor)
if 'CAT_COLS' not in globals():
    CAT_COLS = [c for c in ['건물번호','건물유형'] if c in FEATURE_LIST]
else:
    CAT_COLS = [c for c in CAT_COLS if c in FEATURE_LIST]

# 공통 피처만 사용 (학습/예측 모두 존재)
common_feats = [c for c in FEATURE_LIST if c in all_feat_train.columns and c in all_feat_test.columns]
if len(common_feats) != len(FEATURE_LIST):
    removed = sorted(set(FEATURE_LIST) - set(common_feats))
    print("⚠️ 제외된 피처:", removed)
FEATURE_LIST = common_feats
CAT_COLS     = [c for c in CAT_COLS if c in FEATURE_LIST]

# --- 4) 행렬 정렬 & 카테고리(factor) 일치 ---
X_full, X_te = align_for_lgb(all_feat_train, all_feat_test, FEATURE_LIST, CAT_COLS)
y_full = all_feat_train['전력소비량(kWh)']

# --- 5) 베이스라인 계산 (train/test)
baseline_full = build_baseline(all_feat_train)
baseline_te   = build_baseline(all_feat_test)

# --- 6) 전체 재학습(잔차 타깃) ---
# 5R에서 저장한 파라미터/반복수 우선 사용, 없으면 안전 기본값
if 'LGBM_PARAMS' not in globals():
    LGBM_PARAMS = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 48,          # ← 64 → 48
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 1,
    'min_data_in_leaf': 150,   # ← 120 → 150
    'lambda_l2': 3.0,          # ← 2.0 → 3.0
    'seed': 42,
    'verbosity': -1,
    'num_threads': 4
}
    # 5R에서 사용할 params도 이걸 그대로 참조하게
params = LGBM_PARAMS.copy()

print("[LGBM PARAMS in use]")
for k, v in LGBM_PARAMS.items():
    print(f"  {k}: {v}")
    
if 'BEST_NUM_BOOST' not in globals():
    BEST_NUM_BOOST = 2000

# 하드 빌딩 가중치(있으면 적용)
w_full = np.ones(len(all_feat_train), dtype=float)
if 'HARD_BLD' in globals():
    w_full[all_feat_train['건물번호'].isin(HARD_BLD).values] = 1.8

lgb_full = lgb.Dataset(
    X_full,
    label=(y_full - baseline_full).astype(float),
    weight=w_full,
    categorical_feature=CAT_COLS or None
)
final_model = lgb.train(LGBM_PARAMS, lgb_full, num_boost_round=BEST_NUM_BOOST)

# --- 7) 테스트 잔차 예측 + 5R 보정 적용 (ISO / AB / ISO+BLD) ---
resid_te = final_model.predict(X_te, num_iteration=final_model.best_iteration)

if 'CALIB_MODE' in globals() and CALIB_MODE == "ISO":
    iso = CALIB_PARAMS["iso"]
    test_pred = baseline_te.values + iso.transform(resid_te)
elif 'CALIB_MODE' in globals() and CALIB_MODE == "AB":
    a, b = CALIB_PARAMS["a"], CALIB_PARAMS["b"]
    test_pred = baseline_te.values + a * resid_te + b
elif 'CALIB_MODE' in globals() and CALIB_MODE == "ISO+BLD":
    iso   = CALIB_PARAMS["iso"]
    gamma = CALIB_PARAMS["gamma"]
    bmap  = CALIB_PARAMS["bld_bias_map"]
    # 빌딩 바이어스 매핑
    try:
        from pandas.api.types import CategoricalDtype
        if isinstance(getattr(bmap.index, 'dtype', None), CategoricalDtype):
            bmap.index = bmap.index.astype('int64')
    except Exception:
        pass
    bias_te = all_feat_test['건물번호'].astype('int64').map(bmap.to_dict()).astype('float64').fillna(0.0).to_numpy()
    test_pred = baseline_te.values + iso.transform(resid_te) + gamma * bias_te
else:
    # 보정 정보 없으면 기본 복원
    test_pred = baseline_te.values + resid_te

# 음수 방지
test_pred = np.clip(test_pred, 0, None)

# --- 8) 저장 (num_date_time은 제출을 위한 ID로만 사용; 피처에는 미사용) ---
sub = read_csv_smart(SAMPLE_SUB)
sub['answer'] = pd.Series(test_pred, index=sub.index)
sub.to_csv(OUT_SUB, index=False, encoding='utf-8-sig')

print(f"[DONE] 예측 저장 → {OUT_SUB}")
print("n_features(train/test):", X_full.shape[1], "/", X_te.shape[1])
print("neg_rate:", float((test_pred < 0).mean()), " nan_rate:", float(np.isnan(test_pred).mean()))


TypeError: '>' not supported between instances of 'str' and 'int'

In [55]:
import numpy as np, pandas as pd
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import TimeSeriesSplit

SEED = 42
np.random.seed(SEED)

CFG = dict(
    id_col="건물번호",
    time_col="timestamp",
    y_col="전력소비량(kWh)",
    lags=[1,2,3,24,48,168],
    roll_mean=[3,6,12,24,168],
    roll_std=[24,168],
)

def add_calendar_feats(df, time_col):
    t = pd.to_datetime(df[time_col])
    df["hour"] = t.dt.hour
    df["dow"] = t.dt.dayofweek
    df["is_weekend"] = (df["dow"] >= 5).astype(int)
    df["month"] = t.dt.month
    return df

def build_features_train(df, cfg=CFG):
    df = df.sort_values([cfg["id_col"], cfg["time_col"]]).copy()
    df = add_calendar_feats(df, cfg["time_col"])
    # 그룹별 시계열 피처
    def _per_group(g):
        y = g[cfg["y_col"]]
        # lag
        for L in cfg["lags"]:
            g[f"lag_{L}"] = y.shift(L)
        # rolling (shift 후)
        y_prev = y.shift(1)
        for w in cfg["roll_mean"]:
            g[f"rmean_{w}"] = y_prev.rolling(w, min_periods=1).mean()
        for w in cfg["roll_std"]:
            g[f"rstd_{w}"] = y_prev.rolling(w, min_periods=1).std()
        # diff
        g["diff_1"] = y.diff(1)
        g["diff_24"] = y.diff(24)
        return g
    df = df.groupby(cfg["id_col"], group_keys=False).apply(_per_group)
    # 학습 행 필터: lag/rolling로 인해 NaN 생긴 초반부 제거
    feat_cols = [c for c in df.columns if c not in [cfg["y_col"]]]
    X = df.dropna(subset=[f"lag_{max(cfg['lags'])}"])[feat_cols]
    y = df.loc[X.index, cfg["y_col"]]
    return X, y, feat_cols

def make_cv_splits(df, n_splits=5):
    """
    df는 전체 학습 df (정렬된 상태 가정).
    시간 순서만 고려한 단순 TSSplit. (건물별 누수는 위에서 피처 생성시 방지됨)
    """
    tss = TimeSeriesSplit(n_splits=n_splits)
    idx = np.arange(len(df))
    return list(tss.split(idx))

def train_hgb(X, y):
    model = HistGradientBoostingRegressor(
        loss="squared_error",  # 타깃이 항상 양수면 "poisson"도 실험해봐
        learning_rate=0.06,
        max_depth=None,
        max_leaf_nodes=31,
        min_samples_leaf=25,
        max_bins=255,
        l2_regularization=0.0,
        early_stopping=True,
        random_state=SEED,
    )
    model.fit(X, y)
    return model

def predict_autoreg(test_df, model, hist_df, cfg=CFG):
    """
    test_df: 예측할 구간 (id, timestamp만 있어도 됨 + 외생변수)
    hist_df: test 시작 직전까지의 '학습 구간 원본'(target 포함). 여기서 버퍼 초기화
    """
    feat_cols = None
    out = []
    for bid, te in test_df.sort_values([cfg["id_col"], cfg["time_col"]]).groupby(cfg["id_col"]):
        hist = hist_df[hist_df[cfg["id_col"]] == bid].sort_values(cfg["time_col"]).copy()
        te = te.sort_values(cfg["time_col"]).copy()
        # 버퍼: hist + te(예측 채워넣기용)
        buf = pd.concat([hist, te], ignore_index=True)
        buf = add_calendar_feats(buf, cfg["time_col"])
        for i, row in te.iterrows():
            # 현재 시점 인덱스
            t_idx = buf.index[ buf.index >= i ].min()  # i는 원래 te의 인덱스일 수 있어, 안전하게 재계산을 권장
            # 안전하게 현재 시점의 실제 loc 찾기
        # 간단/안전 버전: 한 번 더 정렬/리셋로 robust하게
        buf = buf.sort_values(cfg["time_col"]).reset_index(drop=True)
        te = te.sort_values(cfg["time_col"]).reset_index(drop=True)

        for t in te.index:
            # 최신 버퍼에서 시계열 피처 갱신
            g = buf.copy()
            y = g[cfg["y_col"]] if cfg["y_col"] in g.columns else pd.Series(index=g.index, dtype=float)
            # groupby 없이 단일 건물 버퍼라 직접 생성
            for L in cfg["lags"]:
                g[f"lag_{L}"] = y.shift(L)
            y_prev = y.shift(1)
            for w in cfg["roll_mean"]:
                g[f"rmean_{w}"] = y_prev.rolling(w, min_periods=1).mean()
            for w in cfg["roll_std"]:
                g[f"rstd_{w}"] = y_prev.rolling(w, min_periods=1).std()
            g["diff_1"] = y.diff(1)
            g["diff_24"] = y.diff(24)

            feat_cols = [c for c in g.columns if c not in [cfg["y_col"]]]
            x_now = g.iloc[len(hist)+t][feat_cols]  # 현재 시점 피처
            yhat = model.predict(pd.DataFrame([x_now]))[0]

            # 예측값을 버퍼의 현재 시점에 기록 → 다음 시점 피처에 사용됨
            if cfg["y_col"] not in buf.columns:
                buf[cfg["y_col"]] = np.nan
            buf.loc[len(hist)+t, cfg["y_col"]] = yhat

            out.append({cfg["id_col"]: bid, cfg["time_col"]: te.loc[t, cfg["time_col"]], "yhat": yhat})
    pred = pd.DataFrame(out)
    return pred

def make_submission(df_feat_new, yhat_df, id_cols=("건물번호","timestamp"), target_name="전력소비량(kWh)"):
    # id로 merge하여 순서 안정
    sub = df_feat_new[id_cols].merge(
        yhat_df.rename(columns={"yhat": target_name}),
        on=list(id_cols),
        how="left",
    )
    # 혹시 누락되면 정렬 보정
    sub = sub.sort_values(list(id_cols)).reset_index(drop=True)
    assert sub[target_name].notna().all(), "예측 누락이 있어요. id 매칭/루프 확인 필요"
    return sub




In [56]:
import numpy as np, pandas as pd
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, make_scorer
from sklearn.inspection import permutation_importance, partial_dependence
from scipy.stats import spearmanr
import warnings; warnings.filterwarnings("ignore")

SEED = 42
rng = np.random.default_rng(SEED)

# 스코어(낮을수록 좋음) — refit은 MAE 기준
def rmse(y_true, y_pred): 
    return mean_squared_error(y_true, y_pred, squared=False)

SCORERS = {
    "MAE": make_scorer(mean_absolute_error, greater_is_better=False),
    "RMSE": make_scorer(rmse, greater_is_better=False),
}

# 시간 순서 CV (누수 방지). 필요시 n_splits, test_size 조정
tss = TimeSeriesSplit(n_splits=5)


In [57]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor

MODEL_SPECS = {
    "lgbm": {
        "est": LGBMRegressor(
            objective="regression",
            random_state=SEED,
            n_estimators=2000,
            subsample=1.0, colsample_bytree=1.0,  # 재현성↑
            deterministic=True, n_jobs=1
        ),
        "param_dist": {
            "learning_rate":   np.linspace(0.02, 0.15, 30),
            "num_leaves":      np.arange(15, 64),
            "min_data_in_leaf":np.arange(10, 60),
            "max_depth":       np.append([-1], np.arange(4, 13)),
            "reg_lambda":      np.linspace(0.0, 2.0, 21),
            "max_bin":         [63,127,255,511],
        },
    },
    "xgb": {
        "est": XGBRegressor(
            objective="reg:squarederror",
            n_estimators=2000,
            subsample=1.0, colsample_bytree=1.0,
            tree_method="hist",
            enable_categorical=True,   # ★ 이 줄 추가
            random_state=SEED, n_jobs=1
        ),
        "param_dist": {
            "learning_rate":   np.linspace(0.02, 0.15, 30),
            "max_depth":       np.arange(3, 13),
            "min_child_weight":np.linspace(1.0, 12.0, 23),
            "gamma":           np.linspace(0.0, 2.0, 21),
            "reg_lambda":      np.linspace(0.0, 2.0, 21),
        },
    },
    "cat": {
        "est": CatBoostRegressor(
            loss_function="RMSE",
            iterations=2000,
            random_seed=SEED,
            depth=6,
            learning_rate=0.06,
            verbose=False,
            allow_writing_files=False
        ),
        "param_dist": {
            "depth":           np.arange(4, 10),
            "learning_rate":   np.linspace(0.02, 0.15, 30),
            "l2_leaf_reg":     np.linspace(1.0, 8.0, 29),
            "random_strength": np.linspace(0.0, 1.0, 21),
        },
    },
    "hgb": {
        "est": HistGradientBoostingRegressor(
            loss="squared_error", random_state=SEED, max_iter=500
        ),
        "param_dist": {
            "learning_rate":   np.linspace(0.02, 0.15, 30),
            "max_leaf_nodes":  np.arange(15, 63),
            "min_samples_leaf":np.arange(10, 60),
            "max_bins":        [63,127,255,511],
            "l2_regularization":np.linspace(0.0, 2.0, 21),
        },
    },
    "rf": {
        "est": RandomForestRegressor(
            n_estimators=800, random_state=SEED, n_jobs=1
        ),
        "param_dist": {
            "max_depth":       np.append([None], np.arange(5, 21)),
            "min_samples_leaf":np.arange(1, 21),
            "max_features":    ["sqrt", "log2", None, 0.3, 0.5, 0.7],
        },
    },
}


In [58]:
def tune_one(name, est, param_dist, X, y, n_iter=40):  # n_iter는 시간에 맞춰 조절
    search = RandomizedSearchCV(
        estimator=est,
        param_distributions=param_dist,
        n_iter=n_iter,
        scoring=SCORERS,
        refit="MAE",
        cv=tss,
        random_state=SEED,
        verbose=0
    )
    search.fit(X, y)
    # 결과 요약
    best = search.best_estimator_
    res = pd.DataFrame(search.cv_results_)
    best_row = res.loc[res['rank_test_MAE']==1].iloc[0]
    out = {
        "model": name,
        "best_params": search.best_params_,
        "cv_MAE(mean)": -best_row["mean_test_MAE"],
        "cv_RMSE(mean)": -best_row["mean_test_RMSE"],
    }
    return best, out, search


In [59]:
CAT_COLS = X_tr.select_dtypes(include=["category","object"]).columns.tolist()

def cat_to_codes(df):
    df2 = df.copy()
    for c in CAT_COLS:
        df2[c] = df2[c].astype("category").cat.codes.astype("int32")
    return df2

def X_by_model(name, X):
    # 사이킷런 HGB/RF는 코드화, 나머지는 원본 유지
    return cat_to_codes(X) if name in ["hgb","rf"] else X


In [60]:
def tune_one(name, est, param_dist, X, y, n_iter=40):
    X_in = X_by_model(name, X)     # ★ 모델별 입력 변환
    fit_params = {}
    if name == "cat":
        cat_idx = [X_in.columns.get_loc(c) for c in CAT_COLS]
        fit_params["cat_features"] = cat_idx  # ★ CatBoost에 카테고리 인덱스 전달

    search = RandomizedSearchCV(
        estimator=est,
        param_distributions=param_dist,
        n_iter=n_iter,
        scoring=SCORERS,
        refit="MAE",
        cv=tss,
        random_state=SEED,
        verbose=0
    )
    search.fit(X_in, y, **fit_params)  # ★ fit_params 반영
    best = search.best_estimator_
    res = pd.DataFrame(search.cv_results_)
    best_row = res.loc[res['rank_test_MAE']==1].iloc[0]
    out = {
        "model": name,
        "best_params": search.best_params_,
        "cv_MAE(mean)": -best_row["mean_test_MAE"],
        "cv_RMSE(mean)": -best_row["mean_test_RMSE"],
    }
    return best, out, search
    


In [61]:
best_models = {}
rows = []
search_store = {}

for name, spec in MODEL_SPECS.items():
    best, summary, search = tune_one(name, spec["est"], spec["param_dist"], X_tr, y_tr, n_iter=40)
    best_models[name] = best
    rows.append(summary)
    search_store[name] = search

cv_summary = pd.DataFrame(rows).sort_values("cv_MAE(mean)")
cv_summary


  File "C:\Users\user\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "C:\Users\user\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 556, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 1038, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                   

KeyboardInterrupt: 

In [63]:
def tune_one(name, est, param_dist, X, y, n_iter=40):
    X_in = X_by_model(name, X)   # 모델별 입력 변환 (HGB/RF는 cat_to_codes 등)
    fit_params = {}
    if name == "cat":
        # CatBoost는 카테고리 컬럼 인덱스를 넘겨야 함
        cat_idx = [X_in.columns.get_loc(c) for c in CAT_COLS if c in X_in.columns]
        fit_params["cat_features"] = cat_idx

    search = RandomizedSearchCV(
        estimator=est,
        param_distributions=param_dist,
        n_iter=n_iter,
        scoring=SCORERS,
        refit="MAE",
        cv=tss,
        random_state=SEED,
        verbose=2,        # 진행 로그 보이게
        n_jobs=1,         # ★ 병렬 끔 → wmic 문제 회피
        error_score="raise"  # 실패 파라미터 조합은 즉시 에러로 알려줌
    )
    search.fit(X_in, y, **fit_params)

    best = search.best_estimator_
    res = pd.DataFrame(search.cv_results_)
    best_row = res.loc[res['rank_test_MAE']==1].iloc[0]
    summary = {
        "model": name,
        "best_params": search.best_params_,
        "cv_MAE(mean)": -best_row["mean_test_MAE"],
        "cv_RMSE(mean)": -best_row["mean_test_RMSE"],
    }
    return best, summary, search


In [65]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, make_scorer

def rmse(y_true, y_pred):
    # squared=False 미지원 환경 대비: 직접 제곱근
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

SCORERS = {
    "MAE": make_scorer(mean_absolute_error, greater_is_better=False),
    "RMSE": make_scorer(rmse, greater_is_better=False),
}


In [66]:
best_models = {}
rows = []
search_store = {}

for name, spec in MODEL_SPECS.items():
    print(f"\n=== [{name}] RandomizedSearch 시작 ===")
    best, summary, search = tune_one(name, spec["est"], spec["param_dist"], X_tr, y_tr, n_iter=40)
    print(f"=== [{name}] 끝: MAE {summary['cv_MAE(mean)']:.5f}, RMSE {summary['cv_RMSE(mean)']:.5f} ===")
    best_models[name] = best
    rows.append(summary)
    search_store[name] = search

cv_summary = pd.DataFrame(rows).sort_values("cv_MAE(mean)")
cv_summary



=== [lgbm] RandomizedSearch 시작 ===
Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] END learning_rate=0.14103448275862068, max_bin=255, max_depth=6, min_data_in_leaf=15, num_leaves=40, reg_lambda=1.6; total time=  11.0s
[CV] END learning_rate=0.14103448275862068, max_bin=255, max_depth=6, min_data_in_leaf=15, num_leaves=40, reg_lambda=1.6; total time=  14.9s
[CV] END learning_rate=0.14103448275862068, max_bin=255, max_depth=6, min_data_in_leaf=15, num_leaves=40, reg_lambda=1.6; total time=  19.2s
[CV] END learning_rate=0.14103448275862068, max_bin=255, max_depth=6, min_data_in_leaf=15, num_leaves=40, reg_lambda=1.6; total time=  24.0s
[CV] END learning_rate=0.14103448275862068, max_bin=255, max_depth=6, min_data_in_leaf=15, num_leaves=40, reg_lambda=1.6; total time=  26.4s
[CV] END learning_rate=0.1275862068965517, max_bin=63, max_depth=7, min_data_in_leaf=59, num_leaves=47, reg_lambda=1.7000000000000002; total time=  11.1s
[CV] END learning_rate=0.1275862068965517, 

InvalidParameterError: The 'max_bins' parameter of HistGradientBoostingRegressor must be an int in the range [2, 255]. Got 511 instead.

In [67]:
from scipy.stats import randint, uniform
from xgboost import XGBRegressor

MODEL_SPECS["xgb"]["est"] = XGBRegressor(
    objective="reg:squarederror",
    tree_method="hist",
    enable_categorical=True,
    random_state=SEED, n_jobs=1
)

MODEL_SPECS["xgb"]["param_dist"] = {
    # 너무 큰 lr 제거하고, 적당한 범위로
    "learning_rate": uniform(0.03, 0.08),        # 0.03 ~ 0.11
    "max_depth": randint(3, 9),                  # 3 ~ 8
    "min_child_weight": uniform(2.0, 8.0),       # 2 ~ 10
    "gamma": uniform(0.0, 1.5),
    # 과적합 억제용 샘플링 추가
    "subsample": uniform(0.6, 0.4),              # 0.6 ~ 1.0
    "colsample_bytree": uniform(0.6, 0.4),       # 0.6 ~ 1.0
    # 규제 튜닝
    "reg_lambda": uniform(0.0, 2.0),
    # 트리 개수도 탐색 (너무 크지 않게)
    "n_estimators": randint(400, 1201)           # 400 ~ 1200
}


In [68]:
from catboost import CatBoostRegressor

MODEL_SPECS["cat"]["est"] = CatBoostRegressor(
    loss_function="RMSE",
    random_seed=SEED,
    allow_writing_files=False,
    verbose=False,
    thread_count=1
)

MODEL_SPECS["cat"]["param_dist"] = {
    "depth": randint(4, 9),                      # 4 ~ 8
    "learning_rate": uniform(0.03, 0.09),        # 0.03 ~ 0.12 중 좁힘
    "l2_leaf_reg": uniform(1.0, 7.0),
    "random_strength": uniform(0.0, 1.0),
    # 규제/샘플링 추가
    "bagging_temperature": uniform(0.0, 1.0),
    "rsm": uniform(0.6, 0.4),                    # 0.6 ~ 1.0 (column sampling)
    # 반복 수도 탐색
    "iterations": randint(500, 1501)             # 500 ~ 1500
}


In [69]:
N_ITER_MAP = {"lgbm": 40, "xgb": 20, "cat": 20, "hgb": 30, "rf": 15}

for name, spec in MODEL_SPECS.items():
    n_iter = N_ITER_MAP.get(name, 20)
    print(f"\n=== [{name}] RandomizedSearch 시작 (n_iter={n_iter}) ===")
    best, summary, search = tune_one(name, spec["est"], spec["param_dist"], X_tr, y_tr, n_iter=n_iter)
    ...



=== [lgbm] RandomizedSearch 시작 (n_iter=40) ===
Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] END learning_rate=0.14103448275862068, max_bin=255, max_depth=6, min_data_in_leaf=15, num_leaves=40, reg_lambda=1.6; total time=  11.3s
[CV] END learning_rate=0.14103448275862068, max_bin=255, max_depth=6, min_data_in_leaf=15, num_leaves=40, reg_lambda=1.6; total time=  22.6s
[CV] END learning_rate=0.14103448275862068, max_bin=255, max_depth=6, min_data_in_leaf=15, num_leaves=40, reg_lambda=1.6; total time=  20.2s
[CV] END learning_rate=0.14103448275862068, max_bin=255, max_depth=6, min_data_in_leaf=15, num_leaves=40, reg_lambda=1.6; total time=  23.7s
[CV] END learning_rate=0.14103448275862068, max_bin=255, max_depth=6, min_data_in_leaf=15, num_leaves=40, reg_lambda=1.6; total time=  26.2s
[CV] END learning_rate=0.1275862068965517, max_bin=63, max_depth=7, min_data_in_leaf=59, num_leaves=47, reg_lambda=1.7000000000000002; total time=  11.0s
[CV] END learning_rate=0.127586

InvalidParameterError: The 'max_bins' parameter of HistGradientBoostingRegressor must be an int in the range [2, 255]. Got 511 instead.

In [71]:
# 0) 후보 검색: 메모리 내에서 best_estimator_/best_score_ 가진 객체 자동 수집
def _is_fitted_search(obj):
    return hasattr(obj, "best_score_") and hasattr(obj, "best_estimator_")

searches = {name: obj for name, obj in globals().items() if _is_fitted_search(obj)}

# 흔한 별칭도 추가로 시도 (있으면 딕트에 합침)
for alias in ["lgbm_rs","rs_lgbm","xgb_rs","rs_xgb","cat_rs","rs_cat","gbm_rs","hgb_rs","rf_rs"]:
    if alias in globals() and _is_fitted_search(globals()[alias]):
        searches[alias] = globals()[alias]

if not searches:
    raise RuntimeError("학습 완료된 검색 객체를 못 찾았습니다. (커널 리셋/변수명 확인)")

# 1) 우승자 고르기 (같은 스코어링 가정)
winner_key = max(searches, key=lambda k: searches[k].best_score_)
winner_rs  = searches[winner_key]
winner_model = winner_rs.best_estimator_  # refit=True라면 이미 전체 데이터로 재학습된 모델

print(f"[WIN] {winner_key}  score={winner_rs.best_score_:.6f}")
print("best_params_", winner_rs.best_params_)

# 2) refit=False로 돌렸다면 직접 재학습
if winner_model is None:
    Est = type(winner_rs.estimator)
    winner_model = Est(**winner_rs.best_params_).fit(X, y)

# 3) 저장
from joblib import dump
dump(winner_model, "final_winner.joblib")



[WIN] search  score=-276.281829
best_params_ {'bagging_temperature': np.float64(0.3567533266935893), 'depth': 7, 'iterations': 687, 'l2_leaf_reg': np.float64(4.798872582107739), 'learning_rate': np.float64(0.04268318024772864), 'random_strength': np.float64(0.8021969807540397), 'rsm': np.float64(0.6298202574719083)}


['final_winner.joblib']

In [76]:
from joblib import load
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 0) 모델 로드
obj = load("final_winner.joblib")
est = obj if hasattr(obj, "predict") else getattr(obj, "best_estimator_", obj)
print("Loaded:", type(obj).__name__, "| Using estimator:", type(est).__name__)

# 1) 데이터 준비 (df 기준, 타깃 명시)
assert 'df' in globals(), "df 변수가 필요합니다."
TARGET = "전력소비량(kWh)" if "전력소비량(kWh)" in df.columns else df.columns[-1]
y = df[TARGET]
X = df.drop(columns=[TARGET]).copy()
print(f"[data] TARGET='{TARGET}', X.shape={X.shape}")

# 2) 모델이 기억한 피처 순서에 맞춰 재정렬
if hasattr(est, "feature_names_"):
    fn = list(est.feature_names_)
    missing = [c for c in fn if c not in X.columns]
    extras  = [c for c in X.columns if c not in fn]
    if missing or extras:
        print("[warn] feature 불일치",
              "\n - model missing:", missing,
              "\n - data extras:", extras)
        # 불일치가 있으면 예측 실패 가능성↑ → 새 학습으로 폴백될 수 있음
    else:
        X = X[fn]  # 순서 맞추기
        print("[info] 컬럼 순서 모델에 맞춰 정렬 완료.")

# 3) 홀드아웃 분할
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

# 4) 예측 (실패 시 동일 파라미터 새 모델 만들어 학습 후 예측)
def predict_or_refit(model, X_te, X_tr, y_tr):
    try:
        return model.predict(X_te), model
    except Exception as e:
        print("(pretrained predict 실패 → 동일 파라미터 새 모델로 재학습)\n ", e)
        new_model = type(model)(**model.get_params())
        new_model.fit(X_tr, y_tr)
        return new_model.predict(X_te), new_model

y_pred, est = predict_or_refit(est, X_te, X_tr, y_tr)

# 5) 회귀 지표 (버전 호환: RMSE = sqrt(MSE))
rmse = mean_squared_error(y_te, y_pred) ** 0.5
mae  = mean_absolute_error(y_te, y_pred)
r2   = r2_score(y_te, y_pred)
print(f"[REG] RMSE={rmse:.4f}  MAE={mae:.4f}  R2={r2:.4f}")

# 6) 중요도(가능하면 상위 20개)
try:
    if hasattr(est, "get_feature_importance"):
        fi = pd.Series(est.get_feature_importance(), index=X.columns).sort_values(ascending=False).head(20)
        print("\nTop feature importances:")
        print(fi)
except Exception:
    pass



Loaded: CatBoostRegressor | Using estimator: CatBoostRegressor
[data] TARGET='전력소비량(kWh)', X.shape=(204000, 17)
[warn] feature 불일치 
 - model missing: ['cons_lag1', 'cons_lag_24h', 'cons_lag_48h', 'cons_lag_72h', 'cons_lag_168h', 'cons_mean_24h', 'cons_samehour_mean_7d', 'cons_std_24h', 'delta_1h', 'delta_7d', 'CDD', 'CDD_x_rad', 'CDD_humid_adj', 'weekday', 'is_weekend', 'is_holiday', 'has_pv', 'has_ess', 'has_pcs', 'is_daylight', 'is_offpeak', 'is_peak', 'ess_charge_potential', 'ess_discharge_potential', 'log1p_태양광용량(kW)', 'log1p_ESS저장용량(kWh)', 'log1p_PCS용량(kW)', 'ess_to_load_lag_ratio', 'month', 'hour', 'dayofyear', 'cons_lag1_per_m2', 'cons_mean24_per_m2', 'CDD_x_rad_area'] 
 - data extras: ['건물유형', '날짜', '시간']
(pretrained predict 실패 → 동일 파라미터 새 모델로 재학습)
  catboost/libs/data/model_dataset_compatibility.cpp:81: At position 7 should be feature with name 연면적(m2) (found 건물유형).
[REG] RMSE=444.0766  MAE=273.8381  R2=0.9854

Top feature importances:
건물유형            37.700752
건물번호           

In [78]:
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from catboost import CatBoostRegressor
from joblib import dump

BASE_TEMP = 26.0  # CDD 기준온도

def build_features(raw):
    df2 = raw.copy()

    # 0) 시간키 만들기
    if not np.issubdtype(df2["날짜"].dtype, np.datetime64):
        df2["날짜"] = pd.to_datetime(df2["날짜"])
    if "시간" in df2.columns:
        df2["dt"] = df2["날짜"] + pd.to_timedelta(df2["시간"], unit="h")
    else:
        # '날짜'가 이미 시간까지 포함이면 그대로 사용
        df2["dt"] = pd.to_datetime(df2["날짜"])
        df2["시간"] = df2["dt"].dt.hour

    # 1) 기본 시간 파생
    df2["month"]     = df2["dt"].dt.month
    df2["hour"]      = df2["dt"].dt.hour
    df2["dayofyear"] = df2["dt"].dt.dayofyear
    df2["weekday"]   = df2["dt"].dt.weekday
    df2["is_weekend"] = (df2["weekday"] >= 5).astype(int)
    # 휴일 정보가 없으면 0으로 둠 (있으면 병합해 1로 표시)
    df2["is_holiday"] = 0

    # 2) CDD류
    if "기온(°C)" in df2.columns:
        cdd = (df2["기온(°C)"] - BASE_TEMP).clip(lower=0)
        df2["CDD"] = cdd
        if "일사(MJ/m2)" in df2.columns:
            df2["CDD_x_rad"] = cdd * df2["일사(MJ/m2)"]
        else:
            df2["CDD_x_rad"] = 0.0
        if "습도(%)" in df2.columns:
            # 간단 가중 (습도 영향 0.3 배수)
            df2["CDD_humid_adj"] = cdd * (1 + 0.3 * (df2["습도(%)"]/100.0))
        else:
            df2["CDD_humid_adj"] = cdd
    else:
        df2[["CDD","CDD_x_rad","CDD_humid_adj"]] = 0.0

    # 3) 설비 유무/로그 변환
    for col, logcol, flag in [
        ("태양광용량(kW)", "log1p_태양광용량(kW)", "has_pv"),
        ("ESS저장용량(kWh)", "log1p_ESS저장용량(kWh)", "has_ess"),
        ("PCS용량(kW)", "log1p_PCS용량(kW)", "has_pcs"),
    ]:
        if col in df2.columns:
            df2[logcol] = np.log1p(df2[col].fillna(0))
            df2[flag]   = (df2[col].fillna(0) > 0).astype(int)
        else:
            df2[logcol] = 0.0
            df2[flag]   = 0

    # 4) 주간/야간/일사 기반 플래그
    if "일사(MJ/m2)" in df2.columns:
        df2["is_daylight"] = (df2["일사(MJ/m2)"] > 0).astype(int)
    else:
        df2["is_daylight"] = df2["hour"].between(8, 17).astype(int)

    # TOU(요금제) 정보 없으니 기본값 0
    df2["is_offpeak"] = 0
    df2["is_peak"]    = 0

    # 5) 소비량 랙/롤링 (건물별 시계열)
    tgt = "전력소비량(kWh)"
    df2 = df2.sort_values(["건물번호","dt"]).copy()

    def _group_feats(g):
        g["cons_lag1"]      = g[tgt].shift(1)
        g["cons_lag_24h"]   = g[tgt].shift(24)
        g["cons_lag_48h"]   = g[tgt].shift(48)
        g["cons_lag_72h"]   = g[tgt].shift(72)
        g["cons_lag_168h"]  = g[tgt].shift(168)

        g["cons_mean_24h"]  = g[tgt].shift(1).rolling(24).mean()
        g["cons_std_24h"]   = g[tgt].shift(1).rolling(24).std()

        # 지난 7일 같은 시각 평균: 24,48,...,168h 랙의 평균
        samehour_lags = [24,48,72,96,120,144,168]
        g["cons_samehour_mean_7d"] = np.nanmean([g[tgt].shift(k) for k in samehour_lags], axis=0)

        g["delta_1h"] = g[tgt] - g["cons_lag1"]
        g["delta_7d"] = g[tgt] - g["cons_lag_168h"]

        if "연면적(m2)" in g.columns:
            g["cons_lag1_per_m2"]   = g["cons_lag1"] / g["연면적(m2)"].replace(0,np.nan)
            g["cons_mean24_per_m2"] = g["cons_mean_24h"] / g["연면적(m2)"].replace(0,np.nan)
        else:
            g["cons_lag1_per_m2"] = g["cons_mean24_per_m2"] = np.nan

        # ESS 관련 간단 파생
        if "ESS저장용량(kWh)" in g.columns:
            g["ess_to_load_lag_ratio"] = g["ESS저장용량(kWh)"] / (g["cons_lag1"].abs() + 1e-3)
        else:
            g["ess_to_load_lag_ratio"] = 0.0

        return g

    df2 = df2.groupby("건물번호", group_keys=False).apply(_group_feats)

    # 면적 보정된 CDD*일사
    if {"CDD_x_rad","연면적(m2)"} <= set(df2.columns):
        df2["CDD_x_rad_area"] = df2["CDD_x_rad"] * df2["연면적(m2)"]
    else:
        df2["CDD_x_rad_area"] = 0.0

    # 6) 초기 NA 제거 (시계열 랙/롤링으로 생긴 앞부분)
    df2 = df2.dropna(subset=[
        "cons_lag1","cons_lag_24h","cons_lag_168h","cons_mean_24h","cons_std_24h",
        "cons_samehour_mean_7d"
    ])

    return df2

# === 피처 생성 → 학습/평가 ===
df_feat = build_features(df)

TARGET = "전력소비량(kWh)"
y = df_feat[TARGET]
# 모델 missing에 있었던 피처들을 최대한 포함 + 원본 주요 수치도 포함
feature_cols = [c for c in df_feat.columns if c not in [TARGET, "dt", "날짜"]]

X = df_feat[feature_cols]

# 시간 정렬 후 홀드아웃 분할(80/20)
df_feat = df_feat.sort_values(["건물번호","dt"])
split_idx = int(len(df_feat)*0.8)
X_tr, X_te = X.iloc[:split_idx], X.iloc[split_idx:]
y_tr, y_te = y.iloc[:split_idx], y.iloc[split_idx:]

# CatBoost 하이퍼파라미터: 랜덤서치 우승 세트
params = dict(
    bagging_temperature=0.3567533266935893,
    depth=7,
    iterations=687,
    l2_leaf_reg=4.798872582107739,
    learning_rate=0.04268318024772864,
    random_strength=0.8021969807540397,
    rsm=0.6298202574719083,
    loss_function="RMSE",
    eval_metric="RMSE",
    verbose=100,
    random_state=42,
)

# 범주형 지정(문자/카테고리형)
cat_cols = X_tr.select_dtypes(include=["object","category"]).columns.tolist()

est2 = CatBoostRegressor(**params)
est2.fit(X_tr, y_tr, cat_features=cat_cols, eval_set=(X_te, y_te))

y_hat = est2.predict(X_te)
rmse = mean_squared_error(y_te, y_hat) ** 0.5
mae  = mean_absolute_error(y_te, y_hat)
r2   = r2_score(y_te, y_hat)
print(f"[WITH engineered feats] RMSE={rmse:.4f}  MAE={mae:.4f}  R2={r2:.4f}")

# 저장(피처 스키마 함께)
dump({"model": est2, "features": feature_cols, "cat_cols": cat_cols, "target": TARGET},
     "cat_w_engineered_feats.joblib")
print("saved -> cat_w_engineered_feats.joblib")


0:	learn: 3828.2641116	test: 2397.2452595	best: 2397.2452595 (0)	total: 14.8ms	remaining: 10.1s
100:	learn: 217.9493224	test: 149.7434810	best: 149.7434810 (100)	total: 1.39s	remaining: 8.06s
200:	learn: 137.7667311	test: 93.5256631	best: 93.5256631 (200)	total: 2.7s	remaining: 6.53s
300:	learn: 108.2132538	test: 75.3806850	best: 75.3806850 (300)	total: 4.05s	remaining: 5.2s
400:	learn: 91.1485818	test: 63.9402526	best: 63.9389206 (399)	total: 5.39s	remaining: 3.85s
500:	learn: 80.0228457	test: 58.3351909	best: 58.2957086 (499)	total: 6.78s	remaining: 2.52s
600:	learn: 72.3833507	test: 54.7263474	best: 54.7263474 (600)	total: 8.09s	remaining: 1.16s
686:	learn: 67.2343403	test: 52.4991448	best: 52.4826659 (678)	total: 9.27s	remaining: 0us

bestTest = 52.48266587
bestIteration = 678

Shrink model to first 679 iterations.
[WITH engineered feats] RMSE=52.4827  MAE=35.6911  R2=0.9990
saved -> cat_w_engineered_feats.joblib


In [79]:
from joblib import load

# 학습 때 쓴 build_features()는 그대로 재사용해야 합니다.
pack = load("cat_w_engineered_feats.joblib")
model   = pack["model"]
FEATS   = pack["features"]
CATCOLS = pack["cat_cols"]
TARGET  = pack["target"]  # '전력소비량(kWh)'

def predict_from_raw(raw_df):
    df_feat = build_features(raw_df)     # ★ 학습과 동일 함수
    X = df_feat[FEATS].copy()
    for c in CATCOLS:
        if c in X:
            X[c] = X[c].astype("category")
    y_hat = model.predict(X)
    return df_feat[["건물번호","dt"]].assign(pred=y_hat)


In [80]:
import pandas as pd

def forecast_next_24h(hist_df, future_exog_df):
    """
    hist_df : 과거 실측 포함(raw 스키마, 건물번호/날짜/시간/기상/설비/소비량)
    future_exog_df : 같은 스키마에서 미래구간의 외생변수(기온·일사·습도·날짜·시간 등)만 채운 데이터
                      TARGET(전력소비량)은 비워둠
    """
    # 시간축/열 맞춰서 이어붙이기
    df_all = pd.concat([hist_df, future_exog_df], ignore_index=True, sort=False)

    preds = []
    # 미래 시각을 오름차순으로 한 스텝씩 예측→TARGET 채워넣기(랙 업데이트용)
    for t in sorted(future_exog_df["날짜"].astype("datetime64[ns]") + 
                    pd.to_timedelta(future_exog_df["시간"], unit="h").unique()):
        df_all["dt"] = pd.to_datetime(df_all["날짜"]) + pd.to_timedelta(df_all["시간"], unit="h")
        df_feat = build_features(df_all)

        this_mask = df_feat["dt"].eq(t)
        X_t = df_feat.loc[this_mask, FEATS].copy()
        for c in CATCOLS:
            if c in X_t:
                X_t[c] = X_t[c].astype("category")

        y_t = model.predict(X_t)
        # 원본 df_all의 해당 시각 TARGET을 예측값으로 채워 다음 스텝 랙에 반영
        idx_all = df_all["dt"].eq(t)
        df_all.loc[idx_all, TARGET] = y_t
        preds.append(df_feat.loc[this_mask, ["건물번호","dt"]].assign(pred=y_t))

    return pd.concat(preds, ignore_index=True)


In [81]:
# 스키마/카테고리 목록 함께 백업
import json
json.dump({"features": FEATS, "cat_cols": CATCOLS, "target": TARGET},
          open("model_schema.json","w"), ensure_ascii=False, indent=2)

# CatBoost 원포맷 저장(파이썬 아닌 환경에서 쓸 때)
model.save_model("cat_model.cbm", format="cbm")


In [82]:
import json, math, numpy as np, pandas as pd
import matplotlib.pyplot as plt
from joblib import load
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# 0-1) 모델 패키지 로드
pack   = load("cat_w_engineered_feats.joblib")
model  = pack["model"]
FEATS  = pack["features"]
CATCOLS= pack["cat_cols"]
TARGET = pack["target"]     # '전력소비량(kWh)'

print("Loaded:", type(model).__name__)
print("n_features:", len(FEATS), "| target:", TARGET)

# 0-2) 유틸
def to_dt(df):
    if "dt" in df: return df.copy()
    out = df.copy()
    out["dt"] = pd.to_datetime(out["날짜"]) + pd.to_timedelta(out["시간"], unit="h")
    return out

def ensure_cat(df, cat_cols):
    for c in cat_cols:
        if c in df:
            df[c] = df[c].astype("category")
    return df

def rmse(y_true, y_pred):
    # sklearn 버전 이슈( squared 파라미터 ) 회피
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

def metrics_frame(y_true, y_pred):
    return pd.DataFrame({
        "RMSE":[rmse(y_true, y_pred)],
        "MAE":[mean_absolute_error(y_true, y_pred)],
        "R2":[r2_score(y_true, y_pred)]
    })

def scatter_diag(ax, x, y, alpha=0.3):
    ax.scatter(x, y, s=8, alpha=alpha)
    lo = np.nanmin([x.min(), y.min()])
    hi = np.nanmax([x.max(), y.max()])
    ax.plot([lo,hi],[lo,hi], linestyle="--")
    ax.set_xlim(lo, hi); ax.set_ylim(lo, hi)


Loaded: CatBoostRegressor
n_features: 48 | target: 전력소비량(kWh)


In [83]:
def holdout_compare(raw_df, test_ratio=0.2, buildings_to_plot=None):
    df = to_dt(raw_df).sort_values("dt").reset_index(drop=True)

    # 시계열 80/20 분할
    cutoff = df["dt"].quantile(1 - test_ratio)
    df_tr  = df[df["dt"] <= cutoff].copy()
    df_te  = df[df["dt"] >  cutoff].copy()

    # --- 재귀 예측(테스트 구간) ---
    df_all = pd.concat([df_tr, df_te], ignore_index=True)
    preds  = []
    # 테스트 시간 오름차순(전 건물 동시 예측)
    future_times = np.sort(df_te["dt"].unique())
    for t in future_times:
        # 매 스텝마다 피처 재계산(과거 실측 + 직전 시점 예측 포함)
        feat_all = build_features(df_all)   # ★ 학습과 동일 함수 사용
        feat_all = ensure_cat(feat_all, CATCOLS)
        # 이번 시점 t의 행만 뽑아 예측
        m = feat_all["dt"].eq(t)
        X_t = feat_all.loc[m, FEATS]
        yhat_t = model.predict(X_t)
        # 예측을 원본 df_all의 TARGET에 채워넣어 다음 시점 랙이 예측값을 참조하도록
        idx = df_all["dt"].eq(t)
        df_all.loc[idx, TARGET] = yhat_t
        preds.append(
            feat_all.loc[m, ["건물번호","dt"]].assign(pred=yhat_t)
        )

    pred_df = pd.concat(preds, ignore_index=True)
    # 실측 결합
    truth = df_te[["건물번호","dt", TARGET]].rename(columns={TARGET:"y"})
    out   = pred_df.merge(truth, on=["건물번호","dt"], how="left")

    # --- 요약 지표 ---
    overall = metrics_frame(out["y"], out["pred"])
    by_bld  = out.groupby("건물번호").apply(lambda g: metrics_frame(g["y"], g["pred"])).reset_index().drop(columns=0)
    by_bld.rename(columns={"RMSE":"RMSE","MAE":"MAE","R2":"R2"}, inplace=True)

    print("=== Overall (holdout) ===")
    display(overall)
    print("\n=== By Building ===")
    display(by_bld.sort_values("RMSE"))

    # --- 플롯: 산점/잔차/시간대 ---
    fig1, ax1 = plt.subplots(figsize=(5,5))
    scatter_diag(ax1, out["y"].values, out["pred"].values, alpha=0.2)
    ax1.set_xlabel("Actual"); ax1.set_ylabel("Predicted"); ax1.set_title("Holdout: Pred vs Actual")
    plt.show()

    # Residual dist
    out["resid"] = out["pred"] - out["y"]
    fig2, ax2 = plt.subplots(figsize=(6,3))
    ax2.hist(out["resid"].dropna(), bins=60)
    ax2.set_title("Residual distribution (pred - actual)")
    plt.show()

    # Hour-of-day bias
    tmp = out.copy()
    tmp["hour"] = tmp["dt"].dt.hour
    hstat = tmp.groupby("hour")["resid"].agg(["median","mean","std"]).reset_index()
    print("\n=== Residual by hour ===")
    display(hstat)
    fig3, ax3 = plt.subplots(figsize=(6,3))
    ax3.plot(hstat["hour"], hstat["mean"], marker="o")
    ax3.set_xlabel("Hour"); ax3.set_ylabel("Residual mean"); ax3.set_title("Residual mean by hour")
    plt.show()

    # 타임시리즈 예시(건물 3개)
    if buildings_to_plot is None:
        buildings_to_plot = list(out["건물번호"].dropna().unique())[:3]

    for b in buildings_to_plot:
        g = out[out["건물번호"]==b].sort_values("dt")
        fig, ax = plt.subplots(figsize=(10,3))
        ax.plot(g["dt"], g["y"],  label="Actual")
        ax.plot(g["dt"], g["pred"], label="Pred", alpha=0.8)
        ax.legend(); ax.set_title(f"Building {b} - Holdout")
        plt.show()

    return out, overall, by_bld

# 실행 예시
# raw_df = ...  # 원본 전체 데이터프레임(학습에 쓴 것과 동일 스키마)
# out, overall, by_bld = holdout_compare(raw_df, test_ratio=0.2)


In [84]:
def rolling_backtest(raw_df, n_splits=4):
    df = to_dt(raw_df).sort_values("dt").reset_index(drop=True)
    ts = df["dt"].sort_values().unique()
    cut_idx = np.linspace(0, len(ts), n_splits+1, dtype=int)

    report = []
    pred_all = []

    for k in range(n_splits):
        t0, t1 = ts[cut_idx[k]], ts[cut_idx[k+1]-1]
        df_tr = df[df["dt"] <= t0].copy()
        df_te = df[(df["dt"] > t0) & (df["dt"] <= t1)].copy()
        if df_te.empty: continue

        # 재귀 예측
        df_all = pd.concat([df_tr, df_te], ignore_index=True)
        preds=[]
        for t in np.sort(df_te["dt"].unique()):
            feat_all = build_features(df_all)
            feat_all = ensure_cat(feat_all, CATCOLS)
            m = feat_all["dt"].eq(t)
            X_t = feat_all.loc[m, FEATS]
            yhat_t = model.predict(X_t)
            idx = df_all["dt"].eq(t)
            df_all.loc[idx, TARGET] = yhat_t
            preds.append(feat_all.loc[m, ["건물번호","dt"]].assign(pred=yhat_t))

        fold_pred = pd.concat(preds, ignore_index=True)
        fold_truth= df_te[["건물번호","dt", TARGET]].rename(columns={TARGET:"y"})
        fold_out  = fold_pred.merge(fold_truth, on=["건물번호","dt"], how="left")
        fold_out["fold"] = k
        pred_all.append(fold_out)

        m = metrics_frame(fold_out["y"], fold_out["pred"])
        m["fold"] = k; m["from"]=t0; m["to"]=t1
        report.append(m)

        print(f"[Fold {k}] {t0} → {t1} | RMSE={m['RMSE'].values[0]:.3f}  MAE={m['MAE'].values[0]:.3f}  R2={m['R2'].values[0]:.4f}")

    rep = pd.concat(report, ignore_index=True) if report else pd.DataFrame()
    pred_all = pd.concat(pred_all, ignore_index=True) if pred_all else pd.DataFrame()

    print("\n=== Rolling backtest summary ===")
    display(rep)

    # 플롯: 폴드별 RMSE
    if not rep.empty:
        fig, ax = plt.subplots(figsize=(6,3))
        ax.plot(rep["fold"], rep["RMSE"], marker="o")
        ax.set_xlabel("Fold"); ax.set_ylabel("RMSE"); ax.set_title("Rolling backtest RMSE")
        plt.show()

    return pred_all, rep

# 실행 예시
# pred_all, rep = rolling_backtest(raw_df, n_splits=6)


[OK] Loaded model from dict: cat_w_engineered_feats.joblib -> CatBoostRegressor
[model] n_features_expected: 48


TypeError: '>' not supported between instances of 'str' and 'int'

In [87]:
# 1) 모델 로드 (dict 대응 + feat_names 복구)
model = None
feat_names_saved = None

for p in MODEL_PATHS:
    if os.path.exists(p):
        obj = joblib.load(p)
        if isinstance(obj, dict):
            model = obj.get("model", obj)
            feat_names_saved = obj.get("feat_names")  # dict에 저장해둔 피처명 복구
            print(f"[OK] Loaded model from dict: {p} -> {type(model).__name__}")
        else:
            model = obj
            print(f"[OK] Loaded model: {p} -> {type(model).__name__}")
        break
if model is None:
    raise FileNotFoundError("모델 파일을 찾지 못했습니다.")

# 피처 이름 확보 (dict 저장분 > 모델 내부 > 없음)
feat_names = feat_names_saved or list(getattr(model, "feature_names_", [])) or None
print("[model] n_features_expected:", len(feat_names) if feat_names else "(unknown)")


[OK] Loaded model from dict: cat_w_engineered_feats.joblib -> CatBoostRegressor
[model] n_features_expected: 48


In [88]:
def add_simple_extra_feats(df, expect_cols):
    out = df.copy()
    if not expect_cols:
        return out

    # ---- 수치형 강제 변환 (문자 -> 숫자) ----
    num_cols = [
        "태양광용량(kW)", "ESS저장용량(kWh)", "PCS용량(kW)",
        "기온(°C)", "일사(MJ/m2)", "습도(%)", "연면적(m2)"
    ]
    for c in num_cols:
        if c in out.columns:
            out[c] = pd.to_numeric(out[c], errors="coerce")

    # ---- has_* 플래그 ----
    if "has_pv" in expect_cols and "태양광용량(kW)" in out.columns:
        pv = out["태양광용량(kW)"].fillna(0.0)
        out["has_pv"] = (pv > 0).astype(int)

    if "has_ess" in expect_cols and "ESS저장용량(kWh)" in out.columns:
        ess = out["ESS저장용량(kWh)"].fillna(0.0)
        out["has_ess"] = (ess > 0).astype(int)

    if "has_pcs" in expect_cols and "PCS용량(kW)" in out.columns:
        pcs = out["PCS용량(kW)"].fillna(0.0)
        out["has_pcs"] = (pcs > 0).astype(int)

    # ---- log1p_* ----
    if "log1p_태양광용량(kW)" in expect_cols and "태양광용량(kW)" in out.columns:
        out["log1p_태양광용량(kW)"] = np.log1p(out["태양광용량(kW)"].fillna(0.0))
    if "log1p_ESS저장용량(kWh)" in expect_cols and "ESS저장용량(kWh)" in out.columns:
        out["log1p_ESS저장용량(kWh)"] = np.log1p(out["ESS저장용량(kWh)"].fillna(0.0))
    if "log1p_PCS용량(kW)" in expect_cols and "PCS용량(kW)" in out.columns:
        out["log1p_PCS용량(kW)"] = np.log1p(out["PCS용량(kW)"].fillna(0.0))

    # ---- CDD 및 파생 ----
    if "기온(°C)" in out.columns:
        t = out["기온(°C)"].astype(float)
        cdd = np.clip(t - CDDH_BASE_TEMP, 0, None)
        if "CDD" in expect_cols:
            out["CDD"] = cdd

        if "일사(MJ/m2)" in out.columns and "CDD_x_rad" in expect_cols:
            out["CDD_x_rad"] = cdd * out["일사(MJ/m2)"].astype(float)

        if "습도(%)" in out.columns and "CDD_humid_adj" in expect_cols:
            out["CDD_humid_adj"] = cdd * (1 + out["습도(%)"].astype(float) / 100.0)

        if (
            "연면적(m2)" in out.columns and
            "일사(MJ/m2)" in out.columns and
            "CDD_x_rad_area" in expect_cols
        ):
            out["CDD_x_rad_area"] = (
                cdd * out["일사(MJ/m2)"].astype(float) * out["연면적(m2)"].astype(float)
            )

    return out


In [89]:
if feat_names:
    for c in feat_names:
        if c not in test_df.columns:
            test_df[c] = np.nan
    X_test = test_df.reindex(columns=feat_names)
else:
    drop_cols = ID_COLS + ([Y_COL] if Y_COL in test_df.columns else [])
    base_cols = [c for c in test_df.columns if c not in drop_cols]
    X_test = test_df[base_cols]
    print("[warn] feature_names_ 없음 -> 공통 컬럼만 사용")


In [92]:
cat_cols = ["건물용도"]   # 학습 때 범주형으로 넣었던 컬럼들


In [94]:
print(X_test.columns.tolist())


['건물번호', '기온(°C)', '강수량(mm)', '풍속(m/s)', '습도(%)', '일조(hr)', '일사(MJ/m2)', '건물유형', '연면적(m2)', '냉방면적(m2)', '태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)', '시간', 'hour_sin', 'hour_cos', 'month', 'hour', 'dayofyear', 'weekday', 'is_weekend', 'is_holiday', 'CDD', 'CDD_x_rad', 'CDD_humid_adj', 'log1p_태양광용량(kW)', 'has_pv', 'log1p_ESS저장용량(kWh)', 'has_ess', 'log1p_PCS용량(kW)', 'has_pcs', 'is_daylight', 'is_offpeak', 'is_peak', 'cons_lag1', 'cons_lag_24h', 'cons_lag_48h', 'cons_lag_72h', 'cons_lag_168h', 'cons_mean_24h', 'cons_std_24h', 'cons_samehour_mean_7d', 'delta_1h', 'delta_7d', 'cons_lag1_per_m2', 'cons_mean24_per_m2', 'ess_to_load_lag_ratio', 'CDD_x_rad_area']


In [97]:
import pandas as pd
import numpy as np
from catboost import Pool

# 0) 안전장치: 필요한 전역이 있는지 확인
assert 'X_test' in globals(), "X_test가 없습니다."
assert 'test_df' in globals(), "test_df가 없습니다."
assert 'model' in globals(), "model이 없습니다."

# 1) '건물유형'을 모델 학습과 동일하게 숫자로 맞추기
if '건물유형' in X_test.columns and not pd.api.types.is_numeric_dtype(X_test['건물유형']):
    mapped = None

    # (우선권) train_df에 '건물번호'별 숫자 코드가 있으면 그걸 그대로 매핑 (가장 안전)
    if 'train_df' in globals() and train_df is not None \
       and '건물번호' in train_df.columns and '건물유형' in train_df.columns \
       and pd.api.types.is_numeric_dtype(train_df['건물유형']):
        b2code = (train_df
                  .dropna(subset=['건물번호','건물유형'])
                  .groupby('건물번호')['건물유형']
                  .agg(lambda s: s.mode().iloc[0]))
        mapped = test_df['건물번호'].map(b2code)

    # (대안1) 아직도 문자열이면: train_df의 문자열 카테고리 기준으로 고정 인코딩
    if mapped is None or mapped.isna().all():
        if 'train_df' in globals() and train_df is not None and '건물유형' in train_df.columns:
            cats = pd.Categorical(
                pd.concat([train_df['건물유형'].astype(str),
                           X_test['건물유형'].astype(str)], ignore_index=True)
            ).categories
            cat2id = {c:i for i,c in enumerate(cats)}
            mapped = X_test['건물유형'].astype(str).map(cat2id)

    # (대안2) 최후의 수단: 테스트 안에서만 factorize (정확도는 다소 손해)
    if mapped is None or mapped.isna().all():
        mapped = pd.Series(pd.factorize(X_test['건물유형'].astype(str))[0], index=X_test.index)

    X_test['건물유형'] = pd.to_numeric(mapped, errors='coerce').fillna(-1).astype(float)

# 2) 혹시 남은 object 컬럼이 있으면 숫자로 강제 변환 (불필요 컬럼은 사전에 제외했을 가능성 큼)
for c in X_test.columns:
    if X_test[c].dtype == 'object':
        X_test[c] = pd.to_numeric(X_test[c], errors='coerce')
X_test = X_test.fillna(0)

# 3) 예측
X_pool = Pool(X_test)  # cat_features 안 넘김 (모델이 숫자 피처로 학습됨)
pred = model.predict(X_pool)

# 4) 제출 파일 저장
id_cols = ["건물번호","날짜","시간"] if all(col in test_df.columns for col in ["건물번호","날짜","시간"]) \
          else [c for c in ["건물번호"] if c in test_df.columns]
submit = test_df[id_cols].copy()
submit["전력소비량(kWh)"] = pred
submit.to_csv("submission.csv", index=False, encoding="utf-8-sig")
print("✅ saved -> submission.csv")
print(submit.head())


✅ saved -> submission.csv
   건물번호         날짜  시간  전력소비량(kWh)
0     1 2024-08-25   0  122.559533
1     1 2024-08-25   1  122.494814
2     1 2024-08-25   2  122.623458
3     1 2024-08-25   3  122.078199
4     1 2024-08-25   4  122.078199


In [30]:
# 결과 저장
df.to_csv(r"C:\Users\user\Downloads\open (1)\merged_train.csv", index=False, encoding = 'cp949')