In [79]:
import pandas as pd

# 파일 경로
train_path = r"C:\Users\user\Downloads\open (1)\train.csv"
building_info_path = r"C:\Users\user\Downloads\open (1)\building_info.csv"

# CSV 불러오기
train_df = pd.read_csv(train_path)
building_info_df = pd.read_csv(building_info_path)

# 병합 (건물번호 기준)
merged_df = pd.merge(train_df, building_info_df, on='건물번호', how='left')

# 결과 저장
merged_df.to_csv(r"C:\Users\user\Downloads\open (1)\merged_train.csv", index=False, encoding = 'cp949')

print("병합 완료! merged_train.csv로 저장됨")


병합 완료! merged_train.csv로 저장됨


In [80]:
def read_csv_smart(path):
    import pandas as pd
    for enc in ['cp949', 'utf-8-sig', 'utf-8', 'euc-kr']:
        try:
            return pd.read_csv(path, encoding=enc)
        except UnicodeDecodeError:
            continue
    # 최후의 보루: 깨지는 글자는 � 로 대체
    return pd.read_csv(path, encoding='utf-8', errors='replace')

In [81]:
# CSV 불러오기 (예시)
df = read_csv_smart("C:\\Users\\user\\Downloads\\open (1)\\merged_train.csv")


# '일시'를 문자열로 변환 후 날짜와 시간 분리
df['일시'] = df['일시'].astype(str)

# 날짜(YYYYMMDD)와 시간(HH) 분리
df['날짜'] = df['일시'].str.slice(0, 8)     # 앞 8자리 → 날짜
df['시간'] = df['일시'].str.slice(9, 11)    # 9~10번째 자리 → 시간

# 날짜를 datetime 형식으로 변환
df['날짜'] = pd.to_datetime(df['날짜'], format='%Y%m%d')
df['시간'] = df['시간'].astype(int)

# 확인
print(df[['일시', '날짜', '시간']].head())



            일시         날짜  시간
0  20240601 00 2024-06-01   0
1  20240601 01 2024-06-01   1
2  20240601 02 2024-06-01   2
3  20240601 03 2024-06-01   3
4  20240601 04 2024-06-01   4


In [82]:
df

Unnamed: 0,num_date_time,건물번호,일시,기온(°C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh),건물유형,연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW),날짜,시간
0,1_20240601 00,1,20240601 00,18.3,0.0,2.6,82.0,0.0,0.00,5794.80,호텔,82912.71,77586.0,-,-,-,2024-06-01,0
1,1_20240601 01,1,20240601 01,18.3,0.0,2.7,82.0,0.0,0.00,5591.85,호텔,82912.71,77586.0,-,-,-,2024-06-01,1
2,1_20240601 02,1,20240601 02,18.1,0.0,2.6,80.0,0.0,0.00,5338.17,호텔,82912.71,77586.0,-,-,-,2024-06-01,2
3,1_20240601 03,1,20240601 03,18.0,0.0,2.6,81.0,0.0,0.00,4554.42,호텔,82912.71,77586.0,-,-,-,2024-06-01,3
4,1_20240601 04,1,20240601 04,17.8,0.0,1.3,81.0,0.0,0.00,3602.25,호텔,82912.71,77586.0,-,-,-,2024-06-01,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100_20240824 19,100,20240824 19,29.1,0.0,4.4,76.0,0.4,0.18,3276.00,호텔,162070.24,152943.0,-,-,-,2024-08-24,19
203996,100_20240824 20,100,20240824 20,28.6,0.0,3.7,74.0,0.0,0.00,3197.52,호텔,162070.24,152943.0,-,-,-,2024-08-24,20
203997,100_20240824 21,100,20240824 21,28.3,0.0,2.9,74.0,0.0,0.00,3006.60,호텔,162070.24,152943.0,-,-,-,2024-08-24,21
203998,100_20240824 22,100,20240824 22,28.0,0.0,1.7,76.0,0.0,0.00,2649.72,호텔,162070.24,152943.0,-,-,-,2024-08-24,22


In [83]:
# 결측치 대체할 컬럼 목록
cols = ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']

# '-'를 0으로 바꾸고 숫자형으로 변환
for col in cols:
    df[col] = df[col].replace('-', 0).astype(float)

In [84]:
df = df.drop(columns=['num_date_time', '일시'])

In [85]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['건물유형'] = le.fit_transform(df['건물유형'])
df['날짜'] = pd.to_datetime(df['날짜'])

In [86]:
import pandas as pd
import numpy as np

def make_features(df: pd.DataFrame) -> pd.DataFrame:
    # 0) 기본 정렬 & datetime 만들기
    #    (이미 df['날짜']와 df['시간']이 있다면 그대로 쓰되, 한 줄짜리 datetime을 만들어두면 편함)
    df = df.copy()
    df['날짜'] = pd.to_datetime(df['날짜'])
    df['dt'] = df['날짜'] + pd.to_timedelta(df['시간'], unit='h')
    df = df.sort_values(['건물번호', 'dt']).reset_index(drop=True)

    # -------------------------------------------------------
    # 1) 최근 24시간 평균, 최근 7일(같은 시각) 평균  → 전부 "과거만" 보도록 shift 사용
    # -------------------------------------------------------
    grp = df.groupby('건물번호', group_keys=False)

    # (a) 최근 24시간 평균 (전력소비량 기준)
    #  - window=24, past-only를 위해 shift(1) 후 rolling
    df['cons_lag1'] = grp['전력소비량(kWh)'].shift(1)
    df['cons_mean_24h'] = grp['cons_lag1'].rolling(window=24, min_periods=1).mean()

    # (b) 최근 7일 같은 시각 평균 (24시간 간격으로 7개)
    #  - 1일 전 같은 시각부터 7일 전 같은 시각까지 평균
    same_hour_lag = grp['전력소비량(kWh)'].shift(24)
    df['cons_samehour_mean_7d'] = same_hour_lag.rolling(window=7, min_periods=1).mean()

    # 참고로 모델에 바로 쓰진 않아도 되는 추가 라그들(원하면 활성화)
    df['cons_lag_24h'] = grp['전력소비량(kWh)'].shift(24)
    df['cons_lag_48h'] = grp['전력소비량(kWh)'].shift(48)
    df['cons_lag_72h'] = grp['전력소비량(kWh)'].shift(72)
    df['cons_lag_168h'] = grp['전력소비량(kWh)'].shift(168)  # 7일 전 같은 시각

    # -------------------------------------------------------
    # 2) 기온·일사 기반 냉방 수요 지표 (CDD류)
    # -------------------------------------------------------
    # 한국 여름 기준 base temp 24°C 가합리(필요시 23~26으로 튜닝)
    base_temp = 24.0
    # ‘냉방도수’(Cooling Degree) 시간 단위
    df['CDD'] = (df['기온(°C)'] - base_temp).clip(lower=0)

    # 일사량(MJ/m2)과의 상호작용: 햇볕이 강할수록 체감 부하↑
    # 일사량이 0~상위 99퍼센타일 사이로 정규화(robust)
    q99 = df['일사(MJ/m2)'].quantile(0.99)
    rad_norm = (df['일사(MJ/m2)'].clip(upper=q99) / (q99 + 1e-6))
    df['CDD_x_rad'] = df['CDD'] * rad_norm

    # 습도(%)와의 상호작용: 습도가 높으면 동일 온도에서도 냉방 부하↑
    # 간단히 (1 + 습도/100*알파) 가중. 알파=0.3 정도로 시작(튜닝 가능)
    alpha = 0.3
    df['CDD_humid_adj'] = df['CDD'] * (1 + alpha * (df['습도(%)'] / 100.0))

    # -------------------------------------------------------
    # 3) 주말/평일, 공휴일
    # -------------------------------------------------------
    df['weekday'] = df['dt'].dt.weekday  # 월=0 ... 일=6
    df['is_weekend'] = (df['weekday'] >= 5).astype(int)

    # 2024-06~08 사이 한국 공휴일: 현충일(6/6), 광복절(8/15)
    kr_holidays = {
        pd.Timestamp(2024, 6, 6),  # 현충일
        pd.Timestamp(2024, 8, 15), # 광복절
    }
    df['is_holiday'] = df['날짜'].isin(kr_holidays).astype(int)

    # -------------------------------------------------------
    # 4) 태양광·ESS·PCS 용량 대비 “동작 가능성” 지표
    #    (실제 제어 로그가 없으니 ‘가능성/잠재력’을 피처로 넣는다)
    # -------------------------------------------------------
    # 설비 유무 이진
    df['has_pv'] = (df['태양광용량(kW)'] > 0).astype(int)
    df['has_ess'] = (df['ESS저장용량(kWh)'] > 0).astype(int)
    df['has_pcs'] = (df['PCS용량(kW)'] > 0).astype(int)

    # 낮/밤 플래그 (대략 일사량>0이면 주간으로 간주)
    df['is_daylight'] = (df['일사(MJ/m2)'] > 0).astype(int)

    # 태양광 ‘동작 가능성’ (설비 있고 + 주간/일사>0)
    df['pv_active_potential'] = ((df['has_pv'] == 1) & (df['is_daylight'] == 1)).astype(int)

    # 피크/오프피크 (현실 요금제와 다를 수 있지만 합리적 초기값)
    df['is_offpeak'] = df['시간'].isin([0,1,2,3,4,5,6,23]).astype(int)
    df['is_peak']    = df['시간'].isin([13,14,15,16,17]).astype(int)

    # ESS 충방전 ‘가능성’ 피처
    df['ess_charge_potential']   = ((df['has_ess']==1) & (df['is_offpeak']==1)).astype(int)
    df['ess_discharge_potential'] = ((df['has_ess']==1) & (df['is_peak']==1)).astype(int)

    # 용량 스케일 자체도 피처로 사용(로그 스케일로 완만화; 0은 0으로 유지)
    for c in ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']:
        df[f'log1p_{c}'] = np.log1p(df[c])

    # 누설 방지: 타깃 기반 비율은 과거 라그로만 계산
    # ESS 대비 부하 비율(전일 같은 시각 소비량 사용)
    df['ess_to_load_lag_ratio'] = np.where(
        df['cons_lag_24h'].notna(),
        df['ESS저장용량(kWh)'] / (df['cons_lag_24h'] + 1e-6),
        np.nan
    )

    # -------------------------------------------------------
    # 5) 기타 유틸리티 파생
    # -------------------------------------------------------
    df['month'] = df['dt'].dt.month
    df['hour']  = df['시간']  # 가독성용 복사
    df['dayofyear'] = df['dt'].dt.dayofyear

    # 모델 입력 전에 의미 없는 원본(또는 중복) 컬럼 정리 원하면 아래 사용
    # drop_cols = ['dt']  # 학습 시 굳이 안 써도 되면 제거
    # df = df.drop(columns=drop_cols)

    return df

# 사용 예시:
# df_feat = make_features(df)
# df_feat.head()


In [87]:
# =========================
# 0. 라이브러리 & 경로 설정
# =========================
import os
import numpy as np
import pandas as pd

# pip install lightgbm 먼저 (처음 1번만)
# pip install lightgbm
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error

DATA_DIR = r"C:\Users\user\Downloads\open (1)"
TRAIN_MERGED_PATH = os.path.join(DATA_DIR, "merged_train.csv")
TEST_PATH  = os.path.join(DATA_DIR, "test.csv")
BUILD_PATH = os.path.join(DATA_DIR, "building_info.csv")
SAMPLE_SUB = os.path.join(DATA_DIR, "sample_submission.csv")
OUT_SUB    = os.path.join(DATA_DIR, "baseline_lgbm_submission.csv")


In [88]:
# ===== IMPORTS (필요시 중복 있어도 무방) =====
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
from pandas.api.types import is_numeric_dtype, is_categorical_dtype

# ===== 피처 선택 헬퍼: 숫자/카테고리만 남기기 + 불필요 컬럼 드롭 =====
def get_feature_cols(df: pd.DataFrame) -> list:
    base_drop = ['전력소비량(kWh)', 'dt', '날짜', '시간', '일시', 'num_date_time']
    cols = [c for c in df.columns if c not in base_drop]
    cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]
    return cols

# =========================
# 1. 유틸 함수들
# =========================
def ensure_datetime_cols(df: pd.DataFrame) -> pd.DataFrame:
    """일시 → 날짜/시간 분리(또는 이미 분리돼 있으면 그대로) + dt 생성"""
    df = df.copy()
    if '날짜' not in df.columns or '시간' not in df.columns:
        if '일시' in df.columns:
            s = df['일시'].astype(str)
            df['날짜'] = pd.to_datetime(s.str.slice(0, 8), format='%Y%m%d')
            df['시간'] = s.str.slice(9, 11).astype(int)
        else:
            raise ValueError("날짜/시간 정보가 없습니다. ('일시' 또는 '날짜','시간' 필요)")
    else:
        df['날짜'] = pd.to_datetime(df['날짜'])
        df['시간'] = df['시간'].astype(int)
    df['dt'] = df['날짜'] + pd.to_timedelta(df['시간'], unit='h')
    return df

def clean_capacity_fields(df: pd.DataFrame) -> pd.DataFrame:
    """설비 용량에 '-'가 있으면 0으로 치환 후 float 변환"""
    df = df.copy()
    cols = ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']
    for c in cols:
        if c in df.columns:
            df[c] = df[c].replace('-', 0).astype(float)
    return df

def rmse(y_true, y_pred):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.sqrt(mean_squared_error(y_true, y_pred))

# =========================
# 2. 특징 엔지니어링(수정 버전)
# =========================
def make_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    - 미래정보 누설 방지: 전부 shift/rolling(=transform)로 과거만 사용
    - groupby().rolling() 대신 groupby().transform(...) 사용
    """
    df = df.copy()

    # 일사/일조 안전 처리(테스트에 없을 수 있음)
    if '일사(MJ/m2)' not in df.columns:
        df['일사(MJ/m2)'] = 0.0
    if '일조(hr)' not in df.columns:
        df['일조(hr)'] = 0.0

    # 정렬
    df = df.sort_values(['건물번호', 'dt']).reset_index(drop=True)

    # 그룹 객체
    grp = df.groupby('건물번호', sort=False)

    # ---------- 타깃 라그 ----------
    if '전력소비량(kWh)' in df.columns:
        df['cons_lag1']     = grp['전력소비량(kWh)'].shift(1)
        df['cons_lag_24h']  = grp['전력소비량(kWh)'].shift(24)
        df['cons_lag_48h']  = grp['전력소비량(kWh)'].shift(48)
        df['cons_lag_72h']  = grp['전력소비량(kWh)'].shift(72)
        df['cons_lag_168h'] = grp['전력소비량(kWh)'].shift(168)  # 7일 전
        # 최근 24시간 평균 (과거만 참고)
        df['cons_mean_24h'] = grp['전력소비량(kWh)'] \
            .transform(lambda s: s.shift(1).rolling(window=24, min_periods=1).mean())
        # 최근 7일 같은 시각 평균
        df['cons_samehour_mean_7d'] = grp['전력소비량(kWh)'] \
            .transform(lambda s: s.shift(24).rolling(window=7, min_periods=1).mean())
    else:
        for c in ['cons_lag1','cons_lag_24h','cons_lag_48h','cons_lag_72h','cons_lag_168h',
                  'cons_mean_24h','cons_samehour_mean_7d']:
            df[c] = np.nan

    # ---------- 냉방 수요 지표 ----------
    base_temp = 24.0
    df['CDD'] = (df['기온(°C)'] - base_temp).clip(lower=0)

    q99 = df['일사(MJ/m2)'].quantile(0.99) + 1e-6
    rad_norm = (df['일사(MJ/m2)'].clip(upper=q99) / q99)
    df['CDD_x_rad'] = df['CDD'] * rad_norm

    alpha = 0.3
    df['CDD_humid_adj'] = df['CDD'] * (1 + alpha * (df['습도(%)'] / 100.0))

    # ---------- 달력 피처 ----------
    df['weekday']    = df['dt'].dt.weekday
    df['is_weekend'] = (df['weekday'] >= 5).astype(int)
    kr_holidays = {pd.Timestamp(2024,6,6), pd.Timestamp(2024,8,15)}
    df['is_holiday'] = df['날짜'].isin(kr_holidays).astype(int)

    # ---------- 설비 가능성 피처 ----------
    df['has_pv']  = (df['태양광용량(kW)']  > 0).astype(int)
    df['has_ess'] = (df['ESS저장용량(kWh)'] > 0).astype(int)
    df['has_pcs'] = (df['PCS용량(kW)']    > 0).astype(int)
    df['is_daylight'] = (df['일사(MJ/m2)'] > 0).astype(int)

    df['is_offpeak'] = df['시간'].isin([0,1,2,3,4,5,6,23]).astype(int)
    df['is_peak']    = df['시간'].isin([13,14,15,16,17]).astype(int)

    df['ess_charge_potential']    = ((df['has_ess']==1) & (df['is_offpeak']==1)).astype(int)
    df['ess_discharge_potential'] = ((df['has_ess']==1) & (df['is_peak']==1)).astype(int)

    for c in ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']:
        df[f'log1p_{c}'] = np.log1p(df[c])

    df['ess_to_load_lag_ratio'] = np.where(
        df.get('cons_lag_24h', pd.Series(index=df.index)).notna(),
        df['ESS저장용량(kWh)'] / (df['cons_lag_24h'] + 1e-6),
        np.nan
    )

    # ---------- 기타 ----------
    df['month'] = df['dt'].dt.month
    df['hour']  = df['시간']
    df['dayofyear'] = df['dt'].dt.dayofyear

    return df

# =========================
# 3. 학습 데이터 로드 & 피처 생성
# =========================
train_df = read_csv_smart(TRAIN_MERGED_PATH)
train_df = clean_capacity_fields(train_df)
train_df = ensure_datetime_cols(train_df)
train_feat = make_features(train_df)

# =========================
# 4. 시계열 검증 분할 (2024-08-17 ~ 2024-08-24)
# =========================
VAL_START = pd.Timestamp(2024,8,17,0)
VAL_END   = pd.Timestamp(2024,8,24,23)

is_val = (train_feat['dt'] >= VAL_START) & (train_feat['dt'] <= VAL_END)
train_part = train_feat[~is_val].copy()
valid_part = train_feat[ is_val].copy()

# === 피처 선택 (object/문자열 배제) ===
features = get_feature_cols(train_feat)
target_col = '전력소비량(kWh)'

# 카테고리 지정(피처에 포함된 컬럼만)
cat_cols = [c for c in ['건물번호','건물유형'] if c in features]
for c in cat_cols:
    train_part[c] = train_part[c].astype('category')
    valid_part[c] = valid_part[c].astype('category')

X_tr, y_tr = train_part[features], train_part[target_col]
X_va, y_va = valid_part[features], valid_part[target_col]

# =========================
# 5. LightGBM 학습 & 검증 (콜백 방식 조기종료)
# =========================
lgb_train = lgb.Dataset(X_tr, label=y_tr, categorical_feature=cat_cols or None)
lgb_valid = lgb.Dataset(X_va, label=y_va, categorical_feature=cat_cols or None)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 64,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 1,
    'min_data_in_leaf': 50,
    'lambda_l2': 1.0,
    'seed': 42,
    'verbosity': -1
}

callbacks = [
    lgb.early_stopping(stopping_rounds=200),
    lgb.log_evaluation(period=200)
]

model = lgb.train(
    params,
    lgb_train,
    num_boost_round=5000,
    valid_sets=[lgb_train, lgb_valid],
    valid_names=['train', 'valid'],
    callbacks=callbacks
)

pred_va = model.predict(X_va, num_iteration=model.best_iteration)
print("VALID RMSE:", rmse(y_va, pred_va))
print("VALID MAE :", mean_absolute_error(y_va, pred_va))



  cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]


Training until validation scores don't improve for 200 rounds
[200]	train's rmse: 173.366	valid's rmse: 230.791
[400]	train's rmse: 154.914	valid's rmse: 221.188
[600]	train's rmse: 143.383	valid's rmse: 216.656
[800]	train's rmse: 134.649	valid's rmse: 214.773
[1000]	train's rmse: 127.614	valid's rmse: 213.857
[1200]	train's rmse: 121.745	valid's rmse: 213.28
[1400]	train's rmse: 116.665	valid's rmse: 213.123
[1600]	train's rmse: 112.139	valid's rmse: 212.795
[1800]	train's rmse: 108.081	valid's rmse: 212.451
Early stopping, best iteration is:
[1739]	train's rmse: 109.199	valid's rmse: 212.38
VALID RMSE: 212.38019335781
VALID MAE : 101.67560328341894


In [89]:
def backfill_solar_by_time(all_df: pd.DataFrame) -> pd.DataFrame:
    # dt/시간 보장
    if 'dt' not in all_df.columns:
        raise ValueError("dt 없으면 ensure_datetime_cols 먼저 호출")
    all_df = all_df.copy()
    all_df['month'] = all_df['dt'].dt.month
    all_df['hour']  = all_df['시간']

    # train/test 구분: 타깃 존재 여부로 판별
    is_train = all_df['전력소비량(kWh)'].notna() if '전력소비량(kWh)' in all_df.columns else pd.Series(False, index=all_df.index)

    # 기준 통계 (train에서만)
    rad_ref = all_df.loc[is_train].groupby(['month','hour'])['일사(MJ/m2)'].median()
    sun_ref = all_df.loc[is_train].groupby(['month','hour'])['일조(hr)'].median()

    # index 매칭해서 test 행만 채움
    idx = pd.MultiIndex.from_frame(all_df[['month','hour']])
    fill_rad = rad_ref.reindex(idx).values
    fill_sun = sun_ref.reindex(idx).values

    need_fill_rad = (~is_train) & (all_df['일사(MJ/m2)'].isna() | (all_df['일사(MJ/m2)'] == 0))
    need_fill_sun = (~is_train) & (all_df['일조(hr)'].isna()     | (all_df['일조(hr)'] == 0))

    all_df.loc[need_fill_rad, '일사(MJ/m2)'] = fill_rad[need_fill_rad.values]
    all_df.loc[need_fill_sun, '일조(hr)']    = fill_sun[need_fill_sun.values]

    # 혹시라도 남은 결측은 0으로
    all_df['일사(MJ/m2)'] = all_df['일사(MJ/m2)'].fillna(0)
    all_df['일조(hr)']    = all_df['일조(hr)'].fillna(0)
    return all_df


In [90]:
# train + test concat
all_df = pd.concat([train_df, test_df], ignore_index=True)
all_df = clean_capacity_fields(all_df)
all_df = ensure_datetime_cols(all_df)

# ★ 추가
all_df = backfill_solar_by_time(all_df)

# 그 다음 피처 생성
all_feat = make_features(all_df)


In [91]:
# make_features() 끝부분에 추가
df['hour_sin'] = np.sin(2*np.pi*df['시간']/24)
df['hour_cos'] = np.cos(2*np.pi*df['시간']/24)


In [92]:
def make_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    - 미래정보 누설 방지: shift/rolling 모두 과거만 사용
    - groupby().transform(...) 으로 인덱스 정렬 유지
    """
    df = df.copy()

    # 일사/일조 안전 처리(테스트에 없을 수 있음)
    if '일사(MJ/m2)' not in df.columns:
        df['일사(MJ/m2)'] = 0.0
    if '일조(hr)' not in df.columns:
        df['일조(hr)'] = 0.0

    # 정렬 후 그룹 객체 생성
    df = df.sort_values(['건물번호', 'dt']).reset_index(drop=True)
    grp = df.groupby('건물번호', sort=False)  # ← 여기서 grp 정의

    # ---------- 타깃 라그 & 롤링 ----------
    if '전력소비량(kWh)' in df.columns:
        df['cons_lag1']     = grp['전력소비량(kWh)'].shift(1)
        df['cons_lag_24h']  = grp['전력소비량(kWh)'].shift(24)
        df['cons_lag_48h']  = grp['전력소비량(kWh)'].shift(48)
        df['cons_lag_72h']  = grp['전력소비량(kWh)'].shift(72)
        df['cons_lag_168h'] = grp['전력소비량(kWh)'].shift(168)  # 7일 전

        # 최근 24시간 평균(과거만; shift(1) 후 rolling)
        df['cons_mean_24h'] = grp['전력소비량(kWh)'] \
            .transform(lambda s: s.shift(1).rolling(window=24, min_periods=1).mean())

        # 최근 7일 같은 시각 평균(24시간 간격 7개)
        df['cons_samehour_mean_7d'] = grp['전력소비량(kWh)'] \
            .transform(lambda s: s.shift(24).rolling(window=7, min_periods=1).mean())

        # ✅ 최근 24시간 표준편차(변동성) — cons_lag1(과거값) 기반
        df['cons_std_24h'] = grp['cons_lag1'] \
            .transform(lambda s: s.rolling(window=24, min_periods=6).std())
    else:
        for c in ['cons_lag1','cons_lag_24h','cons_lag_48h','cons_lag_72h','cons_lag_168h',
                  'cons_mean_24h','cons_samehour_mean_7d','cons_std_24h']:
            df[c] = np.nan

    # ---------- 냉방 수요 지표 ----------
    base_temp = 24.0
    df['CDD'] = (df['기온(°C)'] - base_temp).clip(lower=0)

    q99 = df['일사(MJ/m2)'].quantile(0.99) + 1e-6
    rad_norm = (df['일사(MJ/m2)'].clip(upper=q99) / q99)
    df['CDD_x_rad'] = df['CDD'] * rad_norm

    alpha = 0.3
    df['CDD_humid_adj'] = df['CDD'] * (1 + alpha * (df['습도(%)'] / 100.0))

    # ---------- 달력 피처 ----------
    df['weekday']    = df['dt'].dt.weekday
    df['is_weekend'] = (df['weekday'] >= 5).astype(int)
    kr_holidays = {pd.Timestamp(2024,6,6), pd.Timestamp(2024,8,15)}
    df['is_holiday'] = df['날짜'].isin(kr_holidays).astype(int)

    # ---------- 설비 가능성 피처 ----------
    df['has_pv']  = (df['태양광용량(kW)']  > 0).astype(int)
    df['has_ess'] = (df['ESS저장용량(kWh)'] > 0).astype(int)
    df['has_pcs'] = (df['PCS용량(kW)']    > 0).astype(int)
    df['is_daylight'] = (df['일사(MJ/m2)'] > 0).astype(int)
    df['is_offpeak'] = df['시간'].isin([0,1,2,3,4,5,6,23]).astype(int)
    df['is_peak']    = df['시간'].isin([13,14,15,16,17]).astype(int)
    df['ess_charge_potential']    = ((df['has_ess']==1) & (df['is_offpeak']==1)).astype(int)
    df['ess_discharge_potential'] = ((df['has_ess']==1) & (df['is_peak']==1)).astype(int)

    for c in ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']:
        df[f'log1p_{c}'] = np.log1p(df[c])

    df['ess_to_load_lag_ratio'] = np.where(
        df.get('cons_lag_24h', pd.Series(index=df.index)).notna(),
        df['ESS저장용량(kWh)'] / (df['cons_lag_24h'] + 1e-6),
        np.nan
    )

    # ---------- 기타 ----------
    df['month'] = df['dt'].dt.month
    df['hour']  = df['시간']
    df['dayofyear'] = df['dt'].dt.dayofyear

    # (옵션) 시간 사이클릭 인코딩 원하면 활성화
    # df['hour_sin'] = np.sin(2*np.pi*df['hour']/24)
    # df['hour_cos'] = np.cos(2*np.pi*df['hour']/24)

    return df




In [93]:
params = {
    'objective': 'tweedie',
    'tweedie_variance_power': 1.4,  # 1.2~1.6 사이 튜닝
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 64,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 1,
    'min_data_in_leaf': 50,
    'lambda_l2': 1.0,
    'seed': 42,
    'verbosity': -1
}


In [94]:
MAX_ABS = 1e12  # 과도한 값 하드 클립 임계

def sanitize_matrix(X: pd.DataFrame) -> pd.DataFrame:
    """Inf 제거 + 과대값 클립 (LightGBM은 NaN은 허용, Inf는 불가)"""
    X = X.copy()
    num_cols = X.select_dtypes(include=[np.number]).columns
    X[num_cols] = X[num_cols].replace([np.inf, -np.inf], np.nan)
    X[num_cols] = X[num_cols].clip(lower=-MAX_ABS, upper=MAX_ABS)
    return X

def safe_log1p_vec(a):
    """음수/비정상값 방지 후 log1p"""
    a = np.asarray(a, dtype=float)
    # 비정상(y에 NaN/inf) → 0으로 대체 (혹은 np.nan 유지하고 마스킹하려면 전략 바꿔도 됨)
    a = np.where(np.isfinite(a), a, 0.0)
    a = np.clip(a, 0, None)  # -0 방지
    return np.log1p(a)



In [95]:
# =========================
# 5. LightGBM 학습 & 검증 (로그변환 + 입력 정화)
# =========================
# X 정화(Inf 제거/클립)
X_tr = sanitize_matrix(X_tr)
X_va = sanitize_matrix(X_va)

# y 안전 로그변환
y_tr_log = safe_log1p_vec(y_tr)
y_va_log = safe_log1p_vec(y_va)

# 안전 체크(디버그 용 — 문제 있으면 어떤 값인지 바로 알 수 있음)
assert np.isfinite(y_tr_log).all(), "y_tr_log에 비정상값이 있습니다."
assert np.isfinite(y_va_log).all(), "y_va_log에 비정상값이 있습니다."

lgb_train = lgb.Dataset(X_tr, label=y_tr_log, categorical_feature=cat_cols or None)
lgb_valid = lgb.Dataset(X_va, label=y_va_log, categorical_feature=cat_cols or None)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 64,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 1,
    'min_data_in_leaf': 50,
    'lambda_l2': 1.0,
    'seed': 42,
    'verbosity': -1
}
callbacks = [lgb.early_stopping(stopping_rounds=200), lgb.log_evaluation(period=200)]

model = lgb.train(
    params,
    lgb_train,
    num_boost_round=5000,
    valid_sets=[lgb_train, lgb_valid],
    valid_names=['train', 'valid'],
    callbacks=callbacks
)

# 예측(로그→원복)
pred_va_log = model.predict(X_va, num_iteration=model.best_iteration)
pred_va = np.expm1(pred_va_log)

print("VALID RMSE:", rmse(y_va, pred_va))
print("VALID MAE :", mean_absolute_error(y_va, pred_va))


Training until validation scores don't improve for 200 rounds
[200]	train's rmse: 0.0987863	valid's rmse: 0.0916123
[400]	train's rmse: 0.0898912	valid's rmse: 0.0886367
[600]	train's rmse: 0.083911	valid's rmse: 0.0875522
[800]	train's rmse: 0.0790916	valid's rmse: 0.0868965
[1000]	train's rmse: 0.0750522	valid's rmse: 0.0865516
[1200]	train's rmse: 0.0716982	valid's rmse: 0.0864109
[1400]	train's rmse: 0.0687608	valid's rmse: 0.0861572
[1600]	train's rmse: 0.0660789	valid's rmse: 0.0860012
[1800]	train's rmse: 0.0635716	valid's rmse: 0.0858775
[2000]	train's rmse: 0.061464	valid's rmse: 0.0858378
Early stopping, best iteration is:
[1969]	train's rmse: 0.0617973	valid's rmse: 0.0857921
VALID RMSE: 232.33998456152779
VALID MAE : 108.60940992565521


In [96]:
grid = [
    {'num_leaves': 48, 'min_data_in_leaf': 80, 'lambda_l2': 0.5},
    {'num_leaves': 64, 'min_data_in_leaf': 60, 'lambda_l2': 1.0},
    {'num_leaves': 96, 'min_data_in_leaf': 40, 'lambda_l2': 2.0},
]


In [97]:
va_err = valid_part[['건물번호','건물유형']].copy()
va_err['se'] = (y_va.values - pred_va)**2
print("Top-10 by 건물번호:\n", va_err.groupby('건물번호')['se'].mean().pow(0.5).sort_values(ascending=False).head(10))
print("By 건물유형:\n", va_err.groupby('건물유형')['se'].mean().pow(0.5).sort_values(ascending=False))


Top-10 by 건물번호:
 건물번호
3     1259.190766
10     912.468557
79     473.692181
67     444.580994
1      408.781326
23     387.225291
45     386.683106
34     378.439170
57     368.142016
64     352.141419
Name: se, dtype: float64
By 건물유형:
 건물유형
병원          433.331827
호텔          346.299498
IDC(전화국)    270.536145
백화점         224.606737
연구소         171.321101
학교          152.798324
건물기타        149.179154
공공          131.158694
상용          113.874962
아파트          78.608938
Name: se, dtype: float64


  print("Top-10 by 건물번호:\n", va_err.groupby('건물번호')['se'].mean().pow(0.5).sort_values(ascending=False).head(10))


In [98]:
def build_baseline(df):
    base = df['cons_samehour_mean_7d']
    base = base.fillna(df['cons_mean_24h'])
    base = base.fillna(df['cons_lag1'])
    base = base.fillna(df['전력소비량(kWh)'].median() if '전력소비량(kWh)' in df.columns else 0)
    return base

In [99]:
# =========================
# 5. LightGBM 학습 & 검증 (잔차 학습)
# =========================
# 베이스라인 생성 (누설 없음: 전부 shift/rolling 기반 피처)
baseline_tr = build_baseline(train_part)
baseline_va = build_baseline(valid_part)

# 잔차 타깃
y_tr_resid = y_tr - baseline_tr.values
y_va_resid = y_va - baseline_va.values

# (안전) X에서 ±inf 제거
X_tr = X_tr.replace([np.inf, -np.inf], np.nan)
X_va = X_va.replace([np.inf, -np.inf], np.nan)

lgb_train = lgb.Dataset(X_tr, label=y_tr_resid, categorical_feature=cat_cols or None)
lgb_valid = lgb.Dataset(X_va, label=y_va_resid, categorical_feature=cat_cols or None)

params = {
    'objective': 'regression',   # 나중에 tweedie/huber 시도 가능
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 64,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 1,
    'min_data_in_leaf': 50,
    'lambda_l2': 1.0,
    'seed': 42,
    'verbosity': -1
}
callbacks = [lgb.early_stopping(stopping_rounds=200), lgb.log_evaluation(period=200)]

model = lgb.train(
    params,
    lgb_train,
    num_boost_round=5000,
    valid_sets=[lgb_train, lgb_valid],
    valid_names=['train', 'valid'],
    callbacks=callbacks
)

# 잔차 예측 + 베이스라인 복원
pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)
pred_va = baseline_va.values + pred_va_resid

print("VALID RMSE:", rmse(y_va, pred_va))
print("VALID MAE :", mean_absolute_error(y_va, pred_va))


Training until validation scores don't improve for 200 rounds
[200]	train's rmse: 208.044	valid's rmse: 299.621
[400]	train's rmse: 173.802	valid's rmse: 278.029
[600]	train's rmse: 156.796	valid's rmse: 270.213
[800]	train's rmse: 145.22	valid's rmse: 266.311
[1000]	train's rmse: 136.718	valid's rmse: 263.252
[1200]	train's rmse: 129.734	valid's rmse: 262.155
[1400]	train's rmse: 123.945	valid's rmse: 260.827
[1600]	train's rmse: 118.758	valid's rmse: 260.395
[1800]	train's rmse: 114.191	valid's rmse: 259.365
[2000]	train's rmse: 110.226	valid's rmse: 259.047
[2200]	train's rmse: 106.599	valid's rmse: 258.916
Early stopping, best iteration is:
[2076]	train's rmse: 108.802	valid's rmse: 258.512
VALID RMSE: 258.5116251016002
VALID MAE : 136.39708264373976


In [100]:
# =========================
# 6. 제출 파일 생성 (num_date_time 미사용 버전)
# =========================
sub = read_csv_smart(SAMPLE_SUB)  # 샘플 제출 양식 그대로 사용

# 예측값 길이가 샘플과 달라도 안전하게 할당(인덱스 기준 정렬)
# - 길이가 짧으면 남는 행은 NaN
# - 길이가 길면 초과 분량은 자동으로 버려짐
sub['answer'] = pd.Series(test_pred, index=sub.index)

# 저장 (엑셀 호환 위해 utf-8-sig 권장)
sub.to_csv(OUT_SUB, index=False, encoding='utf-8-sig')
print(f"저장 완료 → {OUT_SUB}")



저장 완료 → C:\Users\user\Downloads\open (1)\baseline_lgbm_submission.csv


In [101]:
# 문제 빌딩 리스트 (네 출력 기준)
hard_blds = {3, 10, 79, 45, 1, 23, 67, 57, 34, 64}

# 5단계 학습 직전에 가중치 계산
w_tr = np.ones(len(train_part), dtype=float)
w_tr[train_part['건물번호'].isin(hard_blds).values] = 2.0  # 2~3배 시도

lgb_train = lgb.Dataset(X_tr, label=y_tr_resid, weight=w_tr, categorical_feature=cat_cols or None)


In [102]:
def build_baseline(df: pd.DataFrame) -> pd.Series:
    """
    잔차 학습용 베이스라인:
      1) cons_samehour_mean_7d
      2) cons_mean_24h
      3) cons_lag1
      4) (최후) 학습 구간 타깃 중앙값
    """
    base = df.get('cons_samehour_mean_7d')
    if base is None: base = pd.Series(index=df.index, dtype=float)
    base = base.fillna(df.get('cons_mean_24h'))
    base = base.fillna(df.get('cons_lag1'))
    if '전력소비량(kWh)' in df.columns:
        base = base.fillna(df['전력소비량(kWh)'].median())
    else:
        base = base.fillna(0.0)
    return base


In [103]:
# =========================
# 5R. LightGBM 학습 & 검증 (잔차 학습)
# =========================

# 피처 재선택(혹시 위에서 수정되었을 수 있으니 보강)
features = get_feature_cols(train_feat)
cat_cols = [c for c in ['건물번호','건물유형'] if c in features]
for c in cat_cols:
    train_part[c] = train_part[c].astype('category')
    valid_part[c] = valid_part[c].astype('category')

X_tr, y_tr = train_part[features], train_part['전력소비량(kWh)']
X_va, y_va = valid_part[features], valid_part['전력소비량(kWh)']

# 베이스라인 생성(인덱스 정렬 유지)
baseline_tr = build_baseline(train_part)
baseline_va = build_baseline(valid_part)

# 잔차 타깃
y_tr_resid = (y_tr - baseline_tr).astype(float)
y_va_resid = (y_va - baseline_va).astype(float)

# (선택) 문제 유형 가중치 — 처음엔 False로 두고 결과 본 뒤 켜줘
USE_TYPE_WEIGHTS = False
TYPE_WEIGHT = {4: 2.0, 9: 2.0}  # 필요시 가중치 조정
if USE_TYPE_WEIGHTS:
    w_tr = np.ones(len(train_part), dtype=float)
    mask = train_part['건물유형'].map(TYPE_WEIGHT).fillna(1.0).values
    w_tr = w_tr * mask
else:
    w_tr = None

# 안전 처리(±inf → NaN)
X_tr = X_tr.replace([np.inf, -np.inf], np.nan)
X_va = X_va.replace([np.inf, -np.inf], np.nan)

lgb_train = lgb.Dataset(X_tr, label=y_tr_resid, weight=w_tr, categorical_feature=cat_cols or None)
lgb_valid = lgb.Dataset(X_va, label=y_va_resid, categorical_feature=cat_cols or None)

params = {
    'objective': 'regression',   # 잔차는 음수/양수 모두 가능 → 회귀(평균제곱오차)로
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 64,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 1,
    'min_data_in_leaf': 50,
    'lambda_l2': 1.0,
    'seed': 42,
    'verbosity': -1
}
callbacks = [lgb.early_stopping(stopping_rounds=200), lgb.log_evaluation(period=200)]

model = lgb.train(
    params,
    lgb_train,
    num_boost_round=5000,
    valid_sets=[lgb_train, lgb_valid],
    valid_names=['train', 'valid'],
    callbacks=callbacks
)

# 잔차 예측 + 베이스라인 복원
pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)
pred_va = baseline_va.values + pred_va_resid

print("VALID RMSE (residual):", rmse(y_va, pred_va))
print("VALID MAE  (residual):", mean_absolute_error(y_va, pred_va))

# --- 에러 분석: 건물/유형별 RMSE ---
va_err = valid_part[['건물번호','건물유형']].copy()
va_err['se'] = (y_va.values - pred_va)**2
print("Top-10 by 건물번호:\n",
      va_err.groupby('건물번호')['se'].mean().pow(0.5).sort_values(ascending=False).head(10))
print("By 건물유형:\n",
      va_err.groupby('건물유형')['se'].mean().pow(0.5).sort_values(ascending=False))


  cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]


Training until validation scores don't improve for 200 rounds
[200]	train's rmse: 208.044	valid's rmse: 299.621
[400]	train's rmse: 173.802	valid's rmse: 278.029
[600]	train's rmse: 156.796	valid's rmse: 270.213
[800]	train's rmse: 145.22	valid's rmse: 266.311
[1000]	train's rmse: 136.718	valid's rmse: 263.252
[1200]	train's rmse: 129.734	valid's rmse: 262.155
[1400]	train's rmse: 123.945	valid's rmse: 260.827
[1600]	train's rmse: 118.758	valid's rmse: 260.395
[1800]	train's rmse: 114.191	valid's rmse: 259.365
[2000]	train's rmse: 110.226	valid's rmse: 259.047
[2200]	train's rmse: 106.599	valid's rmse: 258.916
Early stopping, best iteration is:
[2076]	train's rmse: 108.802	valid's rmse: 258.512
VALID RMSE (residual): 258.5116251016002
VALID MAE  (residual): 136.39708264373976
Top-10 by 건물번호:
 건물번호
3     980.592600
10    960.665798
79    828.491256
45    589.883378
12    588.206425
69    489.947304
23    448.471126
1     437.299306
6     397.305285
34    390.835509
Name: se, dtype: floa

  va_err.groupby('건물번호')['se'].mean().pow(0.5).sort_values(ascending=False).head(10))


In [104]:
import os, random, numpy as np
os.environ["PYTHONHASHSEED"] = "0"
random.seed(42); np.random.seed(42)
# LightGBM 쪽은 params['seed']=42, params['num_threads']=4 정도 권장


In [105]:
def downcast(df):
    df = df.copy()
    for c in df.select_dtypes(include=['float64']).columns:
        df[c] = pd.to_numeric(df[c], downcast='float')
    for c in df.select_dtypes(include=['int64']).columns:
        df[c] = pd.to_numeric(df[c], downcast='integer')
    return df

# train_df, test_df 읽은 직후 한 번씩:
# train_df = downcast(train_df)
# test_df  = downcast(test_df)


In [106]:
def backfill_solar_by_time(all_df: pd.DataFrame) -> pd.DataFrame:
    all_df = all_df.copy()
    if 'dt' not in all_df.columns:
        raise ValueError("ensure_datetime_cols 먼저 호출해줘.")
    all_df['month'] = all_df['dt'].dt.month
    all_df['hour']  = all_df['시간']
    is_train = all_df['전력소비량(kWh)'].notna() if '전력소비량(kWh)' in all_df.columns else False

    rad_ref = all_df[is_train].groupby(['month','hour'])['일사(MJ/m2)'].median()
    sun_ref = all_df[is_train].groupby(['month','hour'])['일조(hr)'].median()

    idx = pd.MultiIndex.from_frame(all_df[['month','hour']])
    fill_rad = rad_ref.reindex(idx).values
    fill_sun = sun_ref.reindex(idx).values

    need_rad = (~is_train) & (all_df['일사(MJ/m2)'].isna() | (all_df['일사(MJ/m2)'] == 0))
    need_sun = (~is_train) & (all_df['일조(hr)'].isna()     | (all_df['일조(hr)'] == 0))
    all_df.loc[need_rad, '일사(MJ/m2)'] = fill_rad[need_rad.values]
    all_df.loc[need_sun, '일조(hr)']    = fill_sun[need_sun.values]

    all_df['일사(MJ/m2)'] = all_df['일사(MJ/m2)'].fillna(0)
    all_df['일조(hr)']    = all_df['일조(hr)'].fillna(0)
    return all_df


In [107]:
def build_baseline(df: pd.DataFrame) -> pd.Series:
    base = df.get('cons_samehour_mean_7d')
    if base is None:
        base = pd.Series(index=df.index, dtype=float)
    base = base.fillna(df.get('cons_mean_24h'))
    base = base.fillna(df.get('cons_lag1'))
    if '전력소비량(kWh)' in df.columns:
        base = base.fillna(df['전력소비량(kWh)'].median())
    else:
        base = base.fillna(0.0)
    return base


In [108]:
from pandas.api.types import is_numeric_dtype, is_categorical_dtype

def get_feature_cols(df):
    drop = ['전력소비량(kWh)','dt','날짜','시간','일시','num_date_time']
    cols = [c for c in df.columns if c not in drop]
    cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]
    return cols

# 검증 파트
features = get_feature_cols(train_part)
cat_cols = [c for c in ['건물번호','건물유형'] if c in features]
for c in cat_cols:
    train_part[c] = train_part[c].astype('category')
    valid_part[c] = valid_part[c].astype('category')

X_tr = train_part.reindex(columns=features)
X_va = valid_part.reindex(columns=features)
y_tr = train_part['전력소비량(kWh)']
y_va = valid_part['전력소비량(kWh)']


  cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]


In [109]:
# 베이스라인 & 잔차
baseline_tr = build_baseline(train_part)
baseline_va = build_baseline(valid_part)
y_tr_resid = (y_tr - baseline_tr).astype(float)
y_va_resid = (y_va - baseline_va).astype(float)

# (옵션) 유형 가중치
USE_TYPE_WEIGHTS = False
TYPE_WEIGHT = {4:2.0, 9:2.0}
w_tr = None
if USE_TYPE_WEIGHTS and '건물유형' in train_part.columns:
    w_tr = train_part['건물유형'].map(TYPE_WEIGHT).fillna(1.0).astype(float).values

# LGBM 학습
params = {
    'objective':'regression','metric':'rmse','learning_rate':0.05,
    'num_leaves':64,'feature_fraction':0.85,'bagging_fraction':0.85,'bagging_freq':1,
    'min_data_in_leaf':50,'lambda_l2':1.0,'seed':42,'verbosity':-1,'num_threads':4
}
lgb_train = lgb.Dataset(X_tr.replace([np.inf,-np.inf], np.nan), label=y_tr_resid, weight=w_tr,
                        categorical_feature=cat_cols or None)
lgb_valid = lgb.Dataset(X_va.replace([np.inf,-np.inf], np.nan), label=y_va_resid,
                        categorical_feature=cat_cols or None)
callbacks=[lgb.early_stopping(200), lgb.log_evaluation(400)]
model = lgb.train(params, lgb_train, num_boost_round=5000,
                  valid_sets=[lgb_train,lgb_valid], valid_names=['train','valid'],
                  callbacks=callbacks)

# 복원해서 검증 점수
pred_va = baseline_va.values + model.predict(X_va, num_iteration=model.best_iteration)
print("VALID RMSE:", rmse(y_va, pred_va))
print("VALID MAE :", mean_absolute_error(y_va, pred_va))

# 에러 테이블 저장(재현성 위해 파일로 남겨)
va_err = valid_part[['건물번호','건물유형']].copy()
va_err['se'] = (y_va.values - pred_va)**2
va_err_per_bld = va_err.groupby('건물번호')['se'].mean().pow(0.5).sort_values(ascending=False)
va_err_per_type = va_err.groupby('건물유형')['se'].mean().pow(0.5).sort_values(ascending=False)
va_err_per_bld.head(10).to_csv("val_top10_buildings.csv", encoding='utf-8-sig')
va_err_per_type.to_csv("val_by_type.csv", encoding='utf-8-sig')


Training until validation scores don't improve for 200 rounds
[400]	train's rmse: 173.802	valid's rmse: 278.029
[800]	train's rmse: 145.22	valid's rmse: 266.311
[1200]	train's rmse: 129.734	valid's rmse: 262.155
[1600]	train's rmse: 118.758	valid's rmse: 260.395
[2000]	train's rmse: 110.226	valid's rmse: 259.047
Early stopping, best iteration is:
[2076]	train's rmse: 108.802	valid's rmse: 258.512
VALID RMSE: 258.5116251016002
VALID MAE : 136.39708264373976


  va_err_per_bld = va_err.groupby('건물번호')['se'].mean().pow(0.5).sort_values(ascending=False)


In [110]:
# all_df → ensure_datetime_cols → (옵션) backfill_solar_by_time → make_features 까지 완료됐다고 가정
all_feat_train = all_feat.iloc[:len(train_df)].copy()
all_feat_test  = all_feat.iloc[len(train_df):].copy()

features_full = get_feature_cols(all_feat_train)
cat_cols_full = [c for c in ['건물번호','건물유형'] if c in features_full]
for c in cat_cols_full:
    all_feat_train[c] = all_feat_train[c].astype('category')
    all_feat_test[c]  = all_feat_test[c].astype('category')

X_full = all_feat_train.reindex(columns=features_full).replace([np.inf,-np.inf], np.nan)
y_full = all_feat_train['전력소비량(kWh)']
baseline_full = build_baseline(all_feat_train)
baseline_te   = build_baseline(all_feat_test)

y_full_resid = (y_full - baseline_full).astype(float)
lgb_full = lgb.Dataset(X_full, label=y_full_resid, categorical_feature=cat_cols_full or None)
final_model = lgb.train(params, lgb_full, num_boost_round=(model.best_iteration or 2000))

X_te = all_feat_test.reindex(columns=features_full).replace([np.inf,-np.inf], np.nan)
test_pred = baseline_te.values + final_model.predict(X_te, num_iteration=final_model.best_iteration)

sub = read_csv_smart(SAMPLE_SUB)
sub['answer'] = pd.Series(test_pred, index=sub.index)
sub.to_csv(OUT_SUB, index=False, encoding='utf-8-sig')
print(f"저장 완료 → {OUT_SUB}")


  cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]


저장 완료 → C:\Users\user\Downloads\open (1)\baseline_lgbm_submission.csv


[Blending] alpha=1.009
Baseline-only  RMSE: 1008.0623627048025 MAE: 549.6628074404762
Residual-only RMSE: 258.5116251016002
BLENDED       RMSE: 258.3701911359071 MAE: 136.64807493862273


In [112]:
# =========================
# 5R. LightGBM 학습 & 검증 (잔차 학습 + α 블렌딩)
# =========================

# 피처 재선택(안전하게 reindex)
features = get_feature_cols(train_part)
cat_cols = [c for c in ['건물번호','건물유형'] if c in features]

for c in cat_cols:
    train_part[c] = train_part[c].astype('category')
    valid_part[c] = valid_part[c].astype('category')

X_tr = train_part.reindex(columns=features).replace([np.inf, -np.inf], np.nan)
X_va = valid_part.reindex(columns=features).replace([np.inf, -np.inf], np.nan)
y_tr = train_part['전력소비량(kWh)']
y_va = valid_part['전력소비량(kWh)']

# 베이스라인(누설 없음: shift/rolling 기반) → 잔차 타깃
baseline_tr = build_baseline(train_part)
baseline_va = build_baseline(valid_part)
y_tr_resid = (y_tr - baseline_tr).astype(float)
y_va_resid = (y_va - baseline_va).astype(float)

# (옵션) 유형 가중치 — 처음엔 꺼두고 결과 보고 켜자
USE_TYPE_WEIGHTS = False
TYPE_WEIGHT = {4: 2.0, 9: 2.0}
w_tr = None
if USE_TYPE_WEIGHTS and ('건물유형' in train_part.columns):
    w_tr = train_part['건물유형'].map(TYPE_WEIGHT).fillna(1.0).astype(float).values

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 64,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 1,
    'min_data_in_leaf': 50,
    'lambda_l2': 1.0,
    'seed': 42,
    'verbosity': -1,
    'num_threads': 4,
}
callbacks = [lgb.early_stopping(stopping_rounds=200), lgb.log_evaluation(period=400)]

lgb_train = lgb.Dataset(X_tr, label=y_tr_resid, weight=w_tr, categorical_feature=cat_cols or None)
lgb_valid = lgb.Dataset(X_va, label=y_va_resid, categorical_feature=cat_cols or None)

model = lgb.train(
    params,
    lgb_train,
    num_boost_round=5000,
    valid_sets=[lgb_train, lgb_valid],
    valid_names=['train','valid'],
    callbacks=callbacks
)

# ----- 블렌딩 α 계산 (최소제곱 닫힌형식) -----
pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)

base_rmse = rmse(y_va, baseline_va)
base_mae  = mean_absolute_error(y_va, baseline_va)
true_resid_va = (y_va - baseline_va).values

den = float(np.sum(pred_va_resid**2) + 1e-9)
alpha = float(np.sum(true_resid_va * pred_va_resid) / den)   # 최적 α
alpha = max(0.0, min(alpha, 1.5))  # 안정화 클립

pred_va = baseline_va.values + alpha * pred_va_resid

print(f"[Blending] alpha={alpha:.3f}")
print("Baseline-only  RMSE:", base_rmse, "MAE:", base_mae)
print("Residual-only RMSE:", rmse(true_resid_va, pred_va_resid))
print("BLENDED       RMSE:", rmse(y_va, pred_va), "MAE:", mean_absolute_error(y_va, pred_va))

# 나중에 6R에서 재사용
ALPHA_BEST = alpha

# --- 에러 분석 (빌딩/유형별) ---
va_err = valid_part[['건물번호','건물유형']].copy()
va_err['se'] = (y_va.values - pred_va)**2
print("Top-10 by 건물번호:\n",
      va_err.groupby('건물번호')['se'].mean().pow(0.5).sort_values(ascending=False).head(10))
print("By 건물유형:\n",
      va_err.groupby('건물유형')['se'].mean().pow(0.5).sort_values(ascending=False))


  cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]


Training until validation scores don't improve for 200 rounds
[400]	train's rmse: 173.802	valid's rmse: 278.029
[800]	train's rmse: 145.22	valid's rmse: 266.311
[1200]	train's rmse: 129.734	valid's rmse: 262.155
[1600]	train's rmse: 118.758	valid's rmse: 260.395
[2000]	train's rmse: 110.226	valid's rmse: 259.047
Early stopping, best iteration is:
[2076]	train's rmse: 108.802	valid's rmse: 258.512
[Blending] alpha=1.009
Baseline-only  RMSE: 1008.0623627048025 MAE: 549.6628074404762
Residual-only RMSE: 258.5116251016002
BLENDED       RMSE: 258.3701911359071 MAE: 136.64807493862273
Top-10 by 건물번호:
 건물번호
3     978.285763
10    957.739418
79    822.031091
12    589.845720
45    584.686116
69    492.270693
23    452.463520
1     434.511621
34    397.622658
6     396.046106
Name: se, dtype: float64
By 건물유형:
 건물유형
병원          366.291553
호텔          361.696699
백화점         302.305715
학교          296.919253
IDC(전화국)    223.386442
연구소         221.307427
건물기타        207.941469
공공          163.56668

  va_err.groupby('건물번호')['se'].mean().pow(0.5).sort_values(ascending=False).head(10))


In [114]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# 잔차 예측
pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)

# 베이스라인 점수
base_rmse = np.sqrt(mean_squared_error(y_va, baseline_va))
base_mae  = mean_absolute_error(y_va, baseline_va)
base_r2   = r2_score(y_va, baseline_va)

# 최적 α (닫힌형식) + 블렌딩 점수
true_resid = (y_va - baseline_va).values
den = float(np.sum(pred_va_resid**2) + 1e-9)
alpha = float(np.sum(true_resid * pred_va_resid) / den)
alpha = max(0.0, min(alpha, 1.5))
pred_va = baseline_va.values + alpha * pred_va_resid

blend_rmse = np.sqrt(mean_squared_error(y_va, pred_va))
blend_mae  = mean_absolute_error(y_va, pred_va)
blend_r2   = r2_score(y_va, pred_va)

print(f"alpha={alpha:.3f}")
print(f"[Baseline] RMSE={base_rmse:.3f} MAE={base_mae:.3f} R2={base_r2:.4f}")
print(f"[Blended ] RMSE={blend_rmse:.3f} MAE={blend_mae:.3f} R2={blend_r2:.4f}")



alpha=1.009
[Baseline] RMSE=1008.062 MAE=549.663 R2=0.9318
[Blended ] RMSE=258.370 MAE=136.648 R2=0.9955


In [115]:
# 예측값 기준 10분위로 나눠서 실제 평균과 비교
df_cal = pd.DataFrame({"y": y_va.values, "yhat": pred_va})
df_cal["bin"] = pd.qcut(df_cal["yhat"], q=10, duplicates="drop")
cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))
cal["bias"] = cal["yhat_mean"] - cal["y_mean"]
print(cal)
print("평균 절대 바이어스:", cal["bias"].abs().mean())

                            y_mean     yhat_mean     n        bias
bin                                                               
(-5.878, 788.75]        480.861052    479.852394  1920   -1.008658
(788.75, 1166.572]      988.435635    999.528969  1920   11.093333
(1166.572, 1533.806]   1332.196734   1339.457136  1920    7.260401
(1533.806, 1847.956]   1693.815510   1702.089636  1920    8.274125
(1847.956, 2213.722]   1981.714578   2011.167779  1920   29.453200
(2213.722, 2769.5]     2447.469797   2489.828361  1920   42.358564
(2769.5, 3546.97]      3081.754672   3157.553138  1920   75.798466
(3546.97, 5100.008]    4180.157505   4283.268101  1920  103.110595
(5100.008, 9581.44]    6666.269812   6795.288892  1920  129.019079
(9581.44, 27048.039]  13259.197734  13381.738040  1920  122.540305
평균 절대 바이어스: 52.991672915209804


  cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))


In [116]:
res = valid_part[["건물번호","건물유형","hour","기온(°C)"]].copy()
res["y"] = y_va.values
res["yhat"] = pred_va
res["resid"] = res["y"] - res["yhat"]

# 시간대별
by_hour = res.groupby("hour")["resid"].agg(["mean","std","count"])
print("시간대별 잔차통계:\n", by_hour)

# 온도 구간별 (5°C 단위)
res["temp_bin"] = (res["기온(°C)"]//5)*5
by_temp = res.groupby("temp_bin")["resid"].agg(["mean","std","count"])
print("온도구간별 잔차통계:\n", by_temp)

# 건물유형별 RMSE (베이스라인 vs 블렌딩 비교)
tmp = valid_part[["건물유형"]].copy()
tmp["base_se"]   = (y_va.values - baseline_va.values)**2
tmp["blend_se"]  = (y_va.values - pred_va)**2
cmp_type = tmp.groupby("건물유형")[["base_se","blend_se"]].mean().pow(0.5).sort_values("blend_se", ascending=False)
cmp_type["improve(%)"] = (1 - cmp_type["blend_se"]/cmp_type["base_se"])*100
print("유형별 RMSE 비교(베이스라인→블렌딩):\n", cmp_type)

# 상위 문제 빌딩 Top-10
tmp2 = valid_part[["건물번호"]].copy()
tmp2["se"] = (y_va.values - pred_va)**2
top_bld = tmp2.groupby("건물번호")["se"].mean().pow(0.5).sort_values(ascending=False).head(10)
print("Top-10 by 건물번호 (블렌딩):\n", top_bld)

시간대별 잔차통계:
             mean         std  count
hour                               
0       7.274977  158.112045    800
1       8.074063  146.801622    800
2      14.540863  144.251765    800
3       2.865920  149.352253    800
4      -6.347148  178.830496    800
5       3.286088  154.706927    800
6       8.802574  174.355283    800
7       1.325113  198.014652    800
8     -22.996409  234.257183    800
9     -36.270276  265.965160    800
10    -54.723798  262.330299    800
11   -117.387224  297.002805    800
12   -132.509075  276.386679    800
13   -145.195950  298.105950    800
14   -141.116714  297.738312    800
15   -146.673020  325.480169    800
16   -128.737805  384.677912    800
17   -110.144715  309.593039    800
18    -99.713367  292.513767    800
19    -79.789968  286.652216    800
20    -28.921888  244.311530    800
21    -24.611776  236.075649    800
22    -27.278372  198.976516    800
23    -10.710685  182.298121    800
온도구간별 잔차통계:
                 mean         std  count

  top_bld = tmp2.groupby("건물번호")["se"].mean().pow(0.5).sort_values(ascending=False).head(10)


In [120]:
# === 0) 스키러너블 모델로 재학습 (설명력용, 제출에 사용 안 함)
import lightgbm as lgb
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error

# 1) 우선 그대로 카테고리 dtype을 사용해서 학습 시도
params_sklearn = dict(
    objective='regression',
    learning_rate=0.05,
    num_leaves=64,
    feature_fraction=0.85,
    bagging_fraction=0.85,
    bagging_freq=1,
    min_child_samples=50,   # = min_data_in_leaf
    reg_lambda=1.0,         # = lambda_l2
    random_state=42,
    n_estimators=5000,
    n_jobs=4
)

use_cat = True
try:
    model_perm = lgb.LGBMRegressor(**params_sklearn)
    model_perm.fit(
        X_tr, y_tr_resid,
        eval_set=[(X_va, y_va_resid)],
        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(400)],
        categorical_feature=cat_cols or None  # pandas 'category' dtype이면 자동 인식
    )
except Exception as e:
    # 2) 버전/환경에 따라 categorical_feature가 막힐 수 있어서 안전한 fallback: 카테고리 → 코드화
    print("카테고리 직접 전달 실패 → 코드화하여 재시도:", e)
    use_cat = False
    X_tr_enc = X_tr.copy()
    X_va_enc = X_va.copy()
    for c in (cat_cols or []):
        X_tr_enc[c] = X_tr_enc[c].cat.codes
        X_va_enc[c] = X_va_enc[c].cat.codes
    model_perm = lgb.LGBMRegressor(**params_sklearn)
    model_perm.fit(
        X_tr_enc, y_tr_resid,
        eval_set=[(X_va_enc, y_va_resid)],
        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(400)]
    )

# === 1) Permutation Importance (검증셋 기준)
X_pi = X_va if use_cat else X_va_enc
pi = permutation_importance(
    model_perm, X_pi, y_va_resid,
    n_repeats=5, random_state=42, n_jobs=4,
    scoring='neg_mean_squared_error'
)
import pandas as pd, numpy as np
pi_tbl = pd.DataFrame({
    "feature": X_pi.columns,
    "imp": pi.importances_mean,
    "std": pi.importances_std
}).sort_values("imp", ascending=False)
print("Permutation Importance Top-20:\n", pi_tbl.head(20))

# === 2) LightGBM 내장 중요도 (gain/split)도 함께 보기
gain = pd.Series(model_perm.booster_.feature_importance(importance_type='gain'),
                 index=X_pi.columns).sort_values(ascending=False)
split = pd.Series(model_perm.booster_.feature_importance(importance_type='split'),
                  index=X_pi.columns).sort_values(ascending=False)
imp_tbl = pd.DataFrame({"gain": gain, "split": split}).fillna(0).sort_values("gain", ascending=False)
print("LightGBM 중요도(Gain) Top-20:\n", imp_tbl.head(20))

# === 3) SHAP 대체 (shap 미설치/버전 이슈 대비 → pred_contrib)
try:
    import shap
    explainer = shap.TreeExplainer(model_perm)
    shap_vals = explainer.shap_values(X_pi)
    shap_abs = np.abs(shap_vals).mean(axis=0)
    shap_tbl = pd.Series(shap_abs, index=X_pi.columns).sort_values(ascending=False).head(20)
    print("SHAP |value| Top-20:\n", shap_tbl)
except Exception as e:
    print("shap 미사용 → pred_contrib로 대체:", e)
    contrib = np.asarray(model_perm.predict(X_pi, pred_contrib=True))
    # 마지막 열은 base value, 제외
    shap_abs = np.abs(contrib[:, :-1]).mean(axis=0)
    shap_tbl = pd.Series(shap_abs, index=X_pi.columns).sort_values(ascending=False).head(20)
    print("Approx SHAP |value| Top-20:\n", shap_tbl)

# === 4) 캘리브레이션/블렌딩 비교도 같이 수치로 확인 (설명력 품질 점검, 예측 저장 X)
pred_va_resid_hat = model_perm.predict(X_pi, num_iteration=model_perm.best_iteration_)
base_rmse = np.sqrt(mean_squared_error(y_va, baseline_va))
true_resid = (y_va - baseline_va).values
den = float(np.sum(pred_va_resid_hat**2) + 1e-9)
alpha = float(np.sum(true_resid * pred_va_resid_hat) / den)
alpha = max(0.0, min(alpha, 1.5))
pred_va_blend = baseline_va.values + alpha * pred_va_resid_hat
blend_rmse = np.sqrt(mean_squared_error(y_va, pred_va_blend))
print(f"[설명력 체크] Baseline RMSE={base_rmse:.3f}  Blended RMSE={blend_rmse:.3f} (alpha={alpha:.3f})")



Training until validation scores don't improve for 200 rounds
[400]	valid_0's l2: 77300.4
[800]	valid_0's l2: 70921.3
[1200]	valid_0's l2: 68725.4
[1600]	valid_0's l2: 67805.4
[2000]	valid_0's l2: 67105.4
Early stopping, best iteration is:
[2076]	valid_0's l2: 66828.3
Permutation Importance Top-20:
                     feature           imp           std
18    cons_samehour_mean_7d  4.281631e+06  41895.011875
12                cons_lag1  3.329261e+06  20791.980802
0                      건물번호  4.462907e+05   8122.529034
38                     hour  2.833475e+05   9824.810526
13             cons_lag_24h  7.304677e+04    467.204158
22                  weekday  5.557012e+04   4869.273276
16            cons_lag_168h  4.519914e+04    601.363053
6                 일사(MJ/m2)  3.699363e+04   1250.087403
17            cons_mean_24h  2.257438e+04    454.303971
14             cons_lag_48h  1.478527e+04    591.954276
15             cons_lag_72h  8.798136e+03    311.344347
7                   연면적(m2)

In [121]:
# 5R 블록에서 옵션만 켜기
USE_TYPE_WEIGHTS = True
TYPE_WEIGHT = {4: 2.0, 9: 2.0}


In [164]:
def make_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    # 일사/일조 보정
    if '일사(MJ/m2)' not in df.columns: df['일사(MJ/m2)'] = 0.0
    if '일조(hr)'   not in df.columns: df['일조(hr)']   = 0.0

    # 정렬 & 그룹
    df = df.sort_values(['건물번호','dt']).reset_index(drop=True)
    grp = df.groupby('건물번호', sort=False)

    # ---------- 타깃 라그/롤링 ----------
    if '전력소비량(kWh)' in df.columns:
        df['cons_lag1']     = grp['전력소비량(kWh)'].shift(1)
        df['cons_lag_24h']  = grp['전력소비량(kWh)'].shift(24)
        df['cons_lag_48h']  = grp['전력소비량(kWh)'].shift(48)
        df['cons_lag_72h']  = grp['전력소비량(kWh)'].shift(72)
        df['cons_lag_168h'] = grp['전력소비량(kWh)'].shift(168)
        df['cons_mean_24h'] = grp['전력소비량(kWh)'] \
            .transform(lambda s: s.shift(1).rolling(24, min_periods=1).mean())
        df['cons_samehour_mean_7d'] = grp['전력소비량(kWh)'] \
            .transform(lambda s: s.shift(24).rolling(7, min_periods=1).mean())
        df['cons_std_24h'] = grp['cons_lag1'] \
            .transform(lambda s: s.rolling(24, min_periods=6).std())
    else:
        for c in ['cons_lag1','cons_lag_24h','cons_lag_48h','cons_lag_72h',
                  'cons_lag_168h','cons_mean_24h','cons_samehour_mean_7d','cons_std_24h']:
            df[c] = np.nan
    # --- 일중·주기 변화량(라그 이후 계산) ---
    df['delta_1h'] = df['cons_lag1']    - df['cons_lag_24h']
    df['delta_7d'] = df['cons_lag_24h'] - df['cons_lag_168h']
    # ---------- 냉방 수요 지표 ----------
    base_temp = 24.0
    df['CDD'] = (df['기온(°C)'] - base_temp).clip(lower=0)
    q99 = df['일사(MJ/m2)'].quantile(0.99) + 1e-6
    rad_norm = (df['일사(MJ/m2)'].clip(upper=q99) / q99)
    df['CDD_x_rad'] = df['CDD'] * rad_norm
    df['CDD_humid_adj'] = df['CDD'] * (1 + 0.3 * (df['습도(%)'] / 100.0))

    # ---------- 달력 & 설비 ----------
    df['weekday']    = df['dt'].dt.weekday
    df['is_weekend'] = (df['weekday'] >= 5).astype(int)
    kr_holidays = {pd.Timestamp(2024,6,6), pd.Timestamp(2024,8,15)}
    df['is_holiday'] = df['날짜'].isin(kr_holidays).astype(int)

    df['has_pv']  = (df['태양광용량(kW)']  > 0).astype(int)
    df['has_ess'] = (df['ESS저장용량(kWh)'] > 0).astype(int)
    df['has_pcs'] = (df['PCS용량(kW)']    > 0).astype(int)
    df['is_daylight'] = (df['일사(MJ/m2)'] > 0).astype(int)
    df['is_offpeak'] = df['시간'].isin([0,1,2,3,4,5,6,23]).astype(int)
    df['is_peak']    = df['시간'].isin([13,14,15,16,17]).astype(int)
    df['ess_charge_potential']    = ((df['has_ess']==1) & (df['is_offpeak']==1)).astype(int)
    df['ess_discharge_potential'] = ((df['has_ess']==1) & (df['is_peak']==1)).astype(int)
    for c in ['태양광용량(kW)','ESS저장용량(kWh)','PCS용량(kW)']:
        df[f'log1p_{c}'] = np.log1p(df[c])

    df['ess_to_load_lag_ratio'] = np.where(
        df.get('cons_lag_24h', pd.Series(index=df.index)).notna(),
        df['ESS저장용량(kWh)'] / (df['cons_lag_24h'] + 1e-6),
        np.nan
    )

    # ---------- 기타 ----------
    df['month']     = df['dt'].dt.month
    df['hour']      = df['시간']
    df['dayofyear'] = df['dt'].dt.dayofyear
    df['hour_sin']  = np.sin(2*np.pi*df['hour']/24)
    df['hour_cos']  = np.cos(2*np.pi*df['hour']/24)

    # ---------- 면적 정규화 (라그 생성 '뒤'에 계산) ----------
    eps = 1e-6
    if '연면적(m2)' in df.columns:
        area = pd.to_numeric(df['연면적(m2)'], errors='coerce')
        df['cons_lag1_per_m2']   = df['cons_lag1']    / (area + eps)
        df['cons_mean24_per_m2'] = df['cons_mean_24h'] / (area + eps)
        df['CDD_x_rad_area']     = df['CDD_x_rad'] * (area.fillna(0) / 1000.0)

    return df


In [146]:
df['hour_sin'] = np.sin(2*np.pi*df['시간']/24)
df['hour_cos'] = np.cos(2*np.pi*df['시간']/24)

In [147]:
params['min_data_in_leaf'] = 120   # 50 → 120
params['lambda_l2'] = 2.0          # 1.0 → 2.0
# (유지) learning_rate=0.05, num_leaves=64


In [148]:
# 5R 바로 앞에서 한 번만 실험(점수만 확인)
features_abl = [c for c in features if c != '건물번호']
X_tr_abl = train_part.reindex(columns=features_abl)
X_va_abl = valid_part.reindex(columns=features_abl)
# 같은 파라미터로 간단하게 1500부스트 정도만 재학습해서 RMSE 비교


In [152]:
train_feat = make_features(train_df)
is_val = (train_feat['dt'] >= pd.Timestamp(2024,8,17)) & (train_feat['dt'] <= pd.Timestamp(2024,8,24,23))
train_part = train_feat[~is_val].copy()
valid_part = train_feat[ is_val].copy()



In [163]:
print({
  'cons_lag1': 'cons_lag1' in train_feat.columns,
  'cons_mean_24h': 'cons_mean_24h' in train_feat.columns,
  'cons_lag1_per_m2': 'cons_lag1_per_m2' in train_feat.columns,
  'cons_mean24_per_m2': 'cons_mean24_per_m2' in train_feat.columns
})


{'cons_lag1': True, 'cons_mean_24h': True, 'cons_lag1_per_m2': True, 'cons_mean24_per_m2': True}


In [158]:
# 0) 피처 다시 생성 & 검증 분할
train_feat = make_features(train_df)

VAL_START = pd.Timestamp(2024,8,17,0)
VAL_END   = pd.Timestamp(2024,8,24,23)
is_val = (train_feat['dt'] >= VAL_START) & (train_feat['dt'] <= VAL_END)

train_part = train_feat[~is_val].copy()
valid_part = train_feat[ is_val].copy()

# 1) 피처 선택(안전하게 reindex)
features = get_feature_cols(train_part)
cat_cols = [c for c in ['건물번호','건물유형'] if c in features]

for c in cat_cols:
    train_part[c] = train_part[c].astype('category')
    valid_part[c] = valid_part[c].astype('category')

X_tr = train_part.reindex(columns=features).replace([np.inf,-np.inf], np.nan)
X_va = valid_part.reindex(columns=features).replace([np.inf,-np.inf], np.nan)
y_tr = train_part['전력소비량(kWh)']
y_va = valid_part['전력소비량(kWh)']

# 2) 베이스라인 생성 → 잔차 타깃
baseline_tr = build_baseline(train_part)
baseline_va = build_baseline(valid_part)
y_tr_resid = (y_tr - baseline_tr).astype(float)
y_va_resid = (y_va - baseline_va).astype(float)

# 3) 잔차 모델 학습(보수적 파라미터)
import lightgbm as lgb
params = {
    'objective':'regression','metric':'rmse','learning_rate':0.05,
    'num_leaves':64,'feature_fraction':0.85,'bagging_fraction':0.85,'bagging_freq':1,
    'min_data_in_leaf':120,'lambda_l2':2.0,'seed':42,'verbosity':-1,'num_threads':4
}
lgb_train = lgb.Dataset(X_tr, label=y_tr_resid, categorical_feature=cat_cols or None)
lgb_valid = lgb.Dataset(X_va, label=y_va_resid, categorical_feature=cat_cols or None)
model = lgb.train(
    params, lgb_train, num_boost_round=5000,
    valid_sets=[lgb_train,lgb_valid], valid_names=['train','valid'],
    callbacks=[lgb.early_stopping(200), lgb.log_evaluation(400)]
)

# 4) 설명력/보정력 평가 (예측 저장 X)
pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)

# 베이스라인/블렌딩 지표
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
base_rmse = rmse(y_va, baseline_va)
base_mae  = mean_absolute_error(y_va, baseline_va)
base_r2   = r2_score(y_va, baseline_va)

true_resid = (y_va - baseline_va).values
den = float((pred_va_resid**2).sum() + 1e-9)
alpha = float((true_resid * pred_va_resid).sum() / den)
alpha = max(0.0, min(alpha, 1.5))  # 안정화
pred_va_blend = baseline_va.values + alpha * pred_va_resid

blend_rmse = rmse(y_va, pred_va_blend)
blend_mae  = mean_absolute_error(y_va, pred_va_blend)
blend_r2   = r2_score(y_va, pred_va_blend)

print(f"alpha={alpha:.3f}")
print(f"[Baseline] RMSE={base_rmse:.3f}  MAE={base_mae:.3f}  R2={base_r2:.4f}")
print(f"[Blended ] RMSE={blend_rmse:.3f}  MAE={blend_mae:.3f}  R2={blend_r2:.4f}")

# 5) 캘리브레이션(10분위) & 유형/빌딩별 진단
import pandas as pd, numpy as np
df_cal = pd.DataFrame({"y": y_va.values, "yhat": pred_va_blend})
df_cal["bin"] = pd.qcut(df_cal["yhat"], q=10, duplicates="drop")
cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))
cal["bias"] = cal["yhat_mean"] - cal["y_mean"]
print("\n[Calibration by decile]\n", cal)
print("평균 절대 바이어스:", cal["bias"].abs().mean())

va_err = valid_part[['건물번호','건물유형']].copy()
va_err['se_base']  = (y_va.values - baseline_va.values)**2
va_err['se_blend'] = (y_va.values - pred_va_blend)**2
print("\n[유형별 RMSE 비교]\n",
      va_err.groupby('건물유형')[['se_base','se_blend']].mean().pow(0.5).assign(
          improve_pct=lambda d: (1 - d['se_blend']/d['se_base'])*100
      ).sort_values('se_blend', ascending=False))

print("\n[Top-10 by 건물번호]\n",
      va_err.groupby('건물번호')['se_blend'].mean().pow(0.5).sort_values(ascending=False).head(10))


  cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]


Training until validation scores don't improve for 200 rounds
[400]	train's rmse: 177.35	valid's rmse: 269.019
[800]	train's rmse: 151.009	valid's rmse: 254.416
[1200]	train's rmse: 136.318	valid's rmse: 249.614
[1600]	train's rmse: 125.842	valid's rmse: 247.987
[2000]	train's rmse: 117.571	valid's rmse: 247.125
Early stopping, best iteration is:
[2072]	train's rmse: 116.217	valid's rmse: 246.974
alpha=1.010
[Baseline] RMSE=1008.062  MAE=549.663  R2=0.9318
[Blended ] RMSE=246.782  MAE=130.388  R2=0.9959

[Calibration by decile]
                              y_mean     yhat_mean     n        bias
bin                                                                
(-2.851, 791.788]        480.509068    477.892376  1920   -2.616692
(791.788, 1168.991]      989.058995    999.900741  1920   10.841746
(1168.991, 1532.819]    1331.070406   1344.051939  1920   12.981532
(1532.819, 1855.575]    1693.808521   1704.821467  1920   11.012946
(1855.575, 2202.672]    1982.798474   2014.547871  1920  

  cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))
  va_err.groupby('건물번호')['se_blend'].mean().pow(0.5).sort_values(ascending=False).head(10))


In [159]:
# ----- a, b 동시 추정(최소제곱) -----
pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)
true_resid_va = (y_va - baseline_va).values

A = np.column_stack([pred_va_resid, np.ones_like(pred_va_resid)])
a, b = np.linalg.lstsq(A, true_resid_va, rcond=None)[0]
a = float(np.clip(a, 0.0, 1.5))  # 안정화
b = float(b)

pred_va = baseline_va.values + a * pred_va_resid + b

print(f"[Blending] a={a:.3f}, b={b:.3f}")
print("Baseline-only  RMSE:", rmse(y_va, baseline_va), "MAE:", mean_absolute_error(y_va, baseline_va))
print("BLENDED       RMSE:", rmse(y_va, pred_va), "MAE:", mean_absolute_error(y_va, pred_va))


[Blending] a=1.011, b=-51.429
Baseline-only  RMSE: 1008.0623627048025 MAE: 549.6628074404762
BLENDED       RMSE: 241.36530189161562 MAE: 132.1879213077672


In [160]:
# 잔차 예측
test_pred_resid = final_model.predict(X_te, num_iteration=final_model.best_iteration)

# 검증에서 구한 a,b로 복원
a_use = a if 'a' in globals() else 1.0
b_use = b if 'b' in globals() else 0.0
test_pred = baseline_te.values + a_use * test_pred_resid + b_use


In [161]:
HARD_BLD = {10,79,3,45,23,1,12,69,64,34}
w_tr = np.ones(len(train_part), dtype=float)
w_tr[train_part['건물번호'].isin(HARD_BLD).values] = 1.8  # 1.5~2.0 사이 시도

lgb_train = lgb.Dataset(X_tr, label=y_tr_resid, weight=w_tr, categorical_feature=cat_cols or None)


In [165]:
# 1) make_features() 셀(패치 포함) 실행

# 2) 피처 다시 생성
train_feat = make_features(train_df)

# 3) 확인
print({
  'cons_lag1': 'cons_lag1' in train_feat.columns,
  'cons_lag_24h': 'cons_lag_24h' in train_feat.columns,
  'cons_lag_168h': 'cons_lag_168h' in train_feat.columns,
  'delta_1h': 'delta_1h' in train_feat.columns,
  'delta_7d': 'delta_7d' in train_feat.columns
})


{'cons_lag1': True, 'cons_lag_24h': True, 'cons_lag_168h': True, 'delta_1h': True, 'delta_7d': True}


In [166]:
params['min_data_in_leaf'] = 150   # 120 → 150
params['lambda_l2'] = 3.0          # 2.0 → 3.0

In [168]:
# =========================
# 5R. 잔차 학습 + (a,b) 블렌딩 + 하드 빌딩 가중치
# =========================
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def rmse(y, yhat):
    y, yhat = np.asarray(y), np.asarray(yhat)
    return float(np.sqrt(mean_squared_error(y, yhat)))

# --- 검증 분할(재확인) ---
VAL_START = pd.Timestamp(2024,8,17,0)
VAL_END   = pd.Timestamp(2024,8,24,23)
is_val = (train_feat['dt'] >= VAL_START) & (train_feat['dt'] <= VAL_END)
train_part = train_feat[~is_val].copy()
valid_part = train_feat[ is_val].copy()

# --- 피처 선택 (안전하게 reindex) ---
features = get_feature_cols(train_part)  # 기존 함수 사용
cat_cols = [c for c in ['건물번호','건물유형'] if c in features]
for c in cat_cols:
    train_part[c] = train_part[c].astype('category')
    valid_part[c] = valid_part[c].astype('category')

X_tr = train_part.reindex(columns=features).replace([np.inf,-np.inf], np.nan)
X_va = valid_part.reindex(columns=features).replace([np.inf,-np.inf], np.nan)
y_tr = train_part['전력소비량(kWh)']
y_va = valid_part['전력소비량(kWh)']

# --- 베이스라인 → 잔차 타깃 ---
baseline_tr = build_baseline(train_part)
baseline_va = build_baseline(valid_part)
y_tr_resid = (y_tr - baseline_tr).astype(float)
y_va_resid = (y_va - baseline_va).astype(float)

# --- 하드 빌딩 가중치 (Top-10 위주) ---
HARD_BLD = {10,79,3,45,23,1,12,69,64,34}  # 필요시 수정
w_tr = np.ones(len(train_part), dtype=float)
w_tr[train_part['건물번호'].isin(HARD_BLD).values] = 1.8  # 1.5~2.0 사이 시도

# --- LightGBM 학습 (보수적 규제) ---
params = {
    'objective':'regression','metric':'rmse','learning_rate':0.05,
    'num_leaves':64,'feature_fraction':0.85,'bagging_fraction':0.85,'bagging_freq':1,
    'min_data_in_leaf':120,'lambda_l2':2.0,'seed':42,'verbosity':-1,'num_threads':4
}
lgb_train = lgb.Dataset(X_tr, label=y_tr_resid, weight=w_tr, categorical_feature=cat_cols or None)
lgb_valid = lgb.Dataset(X_va, label=y_va_resid, categorical_feature=cat_cols or None)

model = lgb.train(
    params, lgb_train, num_boost_round=5000,
    valid_sets=[lgb_train,lgb_valid], valid_names=['train','valid'],
    callbacks=[lgb.early_stopping(200), lgb.log_evaluation(400)]
)

# --- (a,b) 블렌딩: y ≈ base + a*resid_pred + b ---
pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)
true_resid_va = (y_va - baseline_va).values

A = np.column_stack([pred_va_resid, np.ones_like(pred_va_resid)])
a, b = np.linalg.lstsq(A, true_resid_va, rcond=None)[0]
a = float(np.clip(a, 0.0, 1.5))   # 안정화
b = float(b)

pred_va = baseline_va.values + a * pred_va_resid + b

print(f"[Blending] a={a:.3f}, b={b:.3f}")
print(f"[Baseline] RMSE={rmse(y_va, baseline_va):.3f}  MAE={mean_absolute_error(y_va, baseline_va):.3f}")
print(f"[Blended ] RMSE={rmse(y_va, pred_va):.3f}  MAE={mean_absolute_error(y_va, pred_va):.3f}  R2={r2_score(y_va, pred_va):.4f}")

# --- 디사일 캘리브레이션 ---
df_cal = pd.DataFrame({"y": y_va.values, "yhat": pred_va})
df_cal["bin"] = pd.qcut(df_cal["yhat"], q=10, duplicates="drop")
cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))
cal["bias"] = cal["yhat_mean"] - cal["y_mean"]
print("\n[Calibration by decile]\n", cal)
print("평균 절대 바이어스:", cal["bias"].abs().mean())

# --- Top-10 by 건물번호 (블렌딩 기준) ---
va_err = valid_part[['건물번호']].copy()
va_err['se_blend'] = (y_va.values - pred_va)**2
print("\n[Top-10 by 건물번호]\n",
      va_err.groupby('건물번호')['se_blend'].mean().pow(0.5).sort_values(ascending=False).head(10))

# 다음 단계에서 6R에서 재사용할 수 있게 보관
AB_BLEND = (a, b)
BEST_NUM_BOOST = int(model.best_iteration or 2000)


  cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]


Training until validation scores don't improve for 200 rounds
[400]	train's rmse: 191.932	valid's rmse: 250.611
[800]	train's rmse: 158.815	valid's rmse: 239.59
[1200]	train's rmse: 140.375	valid's rmse: 234.776
[1600]	train's rmse: 127.489	valid's rmse: 232.896
[2000]	train's rmse: 117.524	valid's rmse: 232.333
Early stopping, best iteration is:
[2090]	train's rmse: 115.572	valid's rmse: 231.904
[Blending] a=1.015, b=-12.297
[Baseline] RMSE=1008.062  MAE=549.663
[Blended ] RMSE=231.127  MAE=111.847  R2=0.9964

[Calibration by decile]
                              y_mean     yhat_mean     n       bias
bin                                                               
(-13.572, 767.288]       479.966969    460.802462  1920 -19.164507
(767.288, 1143.73]       988.190281    972.225393  1920 -15.964888
(1143.73, 1509.149]     1331.591286   1312.078026  1920 -19.513260
(1509.149, 1809.44]     1693.329979   1672.495501  1920 -20.834479
(1809.44, 2173.781]     1980.010083   1969.816649  1920 

  cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))
  va_err.groupby('건물번호')['se_blend'].mean().pow(0.5).sort_values(ascending=False).head(10))


In [169]:
# =========================
# 6R. 최종 재학습 (a,b 블렌딩 적용)  *제출은 옵션*
# =========================
DO_SUBMIT = False  # ← 기본은 예측/저장 비활성화. 필요할 때만 True로.

# all_feat이 이미 만들어져 있다면 재사용, 아니면 생성
try:
    _ = all_feat  # 존재 체크
except NameError:
    # test_orig/build_df/read_csv_smart/clean_capacity/ensure_datetime_cols 등이 준비되어 있어야 함
    test_orig = read_csv_smart(TEST_PATH)
    build_df  = read_csv_smart(BUILD_PATH)
    test_df = pd.merge(test_orig, build_df, on='건물번호', how='left')
    test_df = clean_capacity_fields(test_df)
    test_df = ensure_datetime_cols(test_df)
    all_df  = pd.concat([train_df, test_df], ignore_index=True)
    # (옵션) 일사/일조 백필을 썼다면 여기서 호출
    # all_df = backfill_solar_by_time(all_df)
    all_feat = make_features(all_df)

all_feat_train = all_feat.iloc[:len(train_df)].copy()
all_feat_test  = all_feat.iloc[len(train_df):].copy()

features_full = get_feature_cols(all_feat_train)
cat_cols_full = [c for c in ['건물번호','건물유형'] if c in features_full]
for c in cat_cols_full:
    all_feat_train[c] = all_feat_train[c].astype('category')
    all_feat_test[c]  = all_feat_test[c].astype('category')

X_full = all_feat_train.reindex(columns=features_full).replace([np.inf,-np.inf], np.nan)
y_full = all_feat_train['전력소비량(kWh)']

baseline_full = build_baseline(all_feat_train)
baseline_te   = build_baseline(all_feat_test)

y_full_resid = (y_full - baseline_full).astype(float)
lgb_full = lgb.Dataset(X_full, label=y_full_resid, categorical_feature=cat_cols_full or None)

# 검증에서 쓴 하이퍼파라미터/부스트 수 재사용
params_full = params.copy()
final_model = lgb.train(params_full, lgb_full, num_boost_round=BEST_NUM_BOOST)

# 테스트 잔차 예측 + (a,b) 블렌딩 복원
if DO_SUBMIT:
    X_te = all_feat_test.reindex(columns=features_full).replace([np.inf,-np.inf], np.nan)
    test_pred_resid = final_model.predict(X_te, num_iteration=final_model.best_iteration)
    a_use, b_use = AB_BLEND if 'AB_BLEND' in globals() else (1.0, 0.0)
    test_pred = baseline_te.values + a_use * test_pred_resid + b_use

    sub = read_csv_smart(SAMPLE_SUB)
    sub['answer'] = pd.Series(test_pred, index=sub.index)
    sub.to_csv(OUT_SUB, index=False, encoding='utf-8-sig')
    print(f"저장 완료 → {OUT_SUB}")
else:
    print("제출 생략: DO_SUBMIT=False (모델만 재학습 완료)")


  cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]


제출 생략: DO_SUBMIT=False (모델만 재학습 완료)


In [171]:
# =========================
# 5R. Residual + Hard-Building Weights + Calibration Selection
# =========================
import numpy as np, pandas as pd, lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.isotonic import IsotonicRegression

def RMSE(y, yhat): return float(np.sqrt(mean_squared_error(y, yhat)))

# 0) 피처 새로고침(위 셀에서 train_feat 이미 있으면 재사용)
try:
    _ = train_feat
except NameError:
    train_feat = make_features(train_df)

# 1) 검증 분할
VAL_START = pd.Timestamp(2024,8,17,0)
VAL_END   = pd.Timestamp(2024,8,24,23)
is_val = (train_feat['dt'] >= VAL_START) & (train_feat['dt'] <= VAL_END)
train_part = train_feat[~is_val].copy()
valid_part = train_feat[ is_val].copy()

# 2) 피처 선택 + 카테고리
features = get_feature_cols(train_part)
cat_cols = [c for c in ['건물번호','건물유형'] if c in features]
for c in cat_cols:
    train_part[c] = train_part[c].astype('category')
    valid_part[c] = valid_part[c].astype('category')

X_tr = train_part.reindex(columns=features).replace([np.inf,-np.inf], np.nan)
X_va = valid_part.reindex(columns=features).replace([np.inf,-np.inf], np.nan)
y_tr = train_part['전력소비량(kWh)']
y_va = valid_part['전력소비량(kWh)']

# 3) 베이스라인 → 잔차 타깃
baseline_tr = build_baseline(train_part)
baseline_va = build_baseline(valid_part)
y_tr_resid = (y_tr - baseline_tr).astype(float)
y_va_resid = (y_va - baseline_va).astype(float)

# 4) 하드 빌딩 가중치 (필요시 수정)
HARD_BLD = {10,79,3,45,23,1,12,69,64,34}
w_tr = np.ones(len(train_part), dtype=float)
w_tr[train_part['건물번호'].isin(HARD_BLD).values] = 1.8  # 1.5~2.0 사이 튠

# 5) LGB 학습(보수적 규제)
params = {
    'objective':'regression','metric':'rmse','learning_rate':0.05,
    'num_leaves':64,'feature_fraction':0.85,'bagging_fraction':0.85,'bagging_freq':1,
    'min_data_in_leaf':120,'lambda_l2':2.0,'seed':42,'verbosity':-1,'num_threads':4
}
lgb_train = lgb.Dataset(X_tr, label=y_tr_resid, weight=w_tr, categorical_feature=cat_cols or None)
lgb_valid = lgb.Dataset(X_va, label=y_va_resid, categorical_feature=cat_cols or None)
model = lgb.train(
    params, lgb_train, num_boost_round=5000,
    valid_sets=[lgb_train,lgb_valid], valid_names=['train','valid'],
    callbacks=[lgb.early_stopping(200), lgb.log_evaluation(400)]
)

# 6) 보정 후보 계산
pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)
true_resid_va = (y_va - baseline_va).values

# (A) 선형(a,b): y ≈ base + a*pred + b
A = np.column_stack([pred_va_resid, np.ones_like(pred_va_resid)])
a, b = np.linalg.lstsq(A, true_resid_va, rcond=None)[0]
a = float(np.clip(a, 0.0, 1.5)); b = float(b)
pred_ab = baseline_va.values + a * pred_va_resid + b
sc_ab = (RMSE(y_va, pred_ab), mean_absolute_error(y_va, pred_ab), r2_score(y_va, pred_ab))

# (B) Isotonic: y ≈ base + g(pred)
iso = IsotonicRegression(out_of_bounds='clip')
iso.fit(pred_va_resid, true_resid_va)
pred_iso = baseline_va.values + iso.transform(pred_va_resid)
sc_iso = (RMSE(y_va, pred_iso), mean_absolute_error(y_va, pred_iso), r2_score(y_va, pred_iso))

# (B+) Isotonic + 빌딩 평균 잔차 보정(γ)
# (B+) Isotonic + 빌딩 평균 잔차 보정(γ)
gamma = 0.5  # 0.3~0.7 시도 가능

train_resid = (y_tr - baseline_tr).values

# 빌딩별 train 잔차 평균 (observed=False로 경고 제거)
bld_bias_map = (
    pd.Series(train_resid, index=train_part.index)
      .groupby(train_part['건물번호'], observed=False)
      .mean()
)

# 인덱스 타입 통일(int)
if isinstance(getattr(bld_bias_map.index, 'dtype', None), pd.CategoricalDtype):
    bld_bias_map.index = bld_bias_map.index.astype('int64')

# valid 건물번호도 int로 변환 후 dict 매핑 → float → NaN을 0.0으로
bld_bias_dict = bld_bias_map.to_dict()
bld_codes_va = valid_part['건물번호'].astype('int64')
bias_va = bld_codes_va.map(bld_bias_dict).astype('float64').fillna(0.0).to_numpy()

pred_iso_bias = pred_iso + gamma * bias_va
sc_iso_bias = (RMSE(y_va, pred_iso_bias), mean_absolute_error(y_va, pred_iso_bias), r2_score(y_va, pred_iso_bias))


# 7) 베스트 보정 선택
cands = {
    "AB":        (pred_ab,       sc_ab,       {"a":a,"b":b}),
    "ISO":       (pred_iso,      sc_iso,      {"iso":iso}),
    "ISO+BLD":   (pred_iso_bias, sc_iso_bias, {"iso":iso,"gamma":gamma,"bld_bias_map":bld_bias_map}),
}
best_name, (pred_best, (rmse_best, mae_best, r2_best), params_best) = min(
    cands.items(), key=lambda kv: kv[1][1][0]
)

print(f"[Calibration] best={best_name}  RMSE={rmse_best:.3f}  MAE={mae_best:.3f}  R2={r2_best:.4f}")
print(f"  - AB scores : RMSE={sc_ab[0]:.3f}, ISO : {sc_iso[0]:.3f}, ISO+BLD : {sc_iso_bias[0]:.3f}")
if best_name == "AB":
    print(f"  - a={params_best['a']:.3f}, b={params_best['b']:.3f}")
elif best_name.startswith("ISO"):
    print(f"  - gamma={params_best.get('gamma','-')} (ISO 객체 저장됨)")

# 8) 디사일 캘리브레이션 & Top-10
df_cal = pd.DataFrame({"y": y_va.values, "yhat": pred_best})
df_cal["bin"] = pd.qcut(df_cal["yhat"], q=10, duplicates="drop")
cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))
cal["bias"] = cal["yhat_mean"] - cal["y_mean"]
print("\n[Calibration by decile]\n", cal)
print("평균 절대 바이어스:", cal["bias"].abs().mean())

va_err = valid_part[['건물번호']].copy()
va_err['se_blend'] = (y_va.values - pred_best)**2
print("\n[Top-10 by 건물번호]\n",
      va_err.groupby('건물번호')['se_blend'].mean().pow(0.5).sort_values(ascending=False).head(10))

# 9) 6단계에서 재사용할 아티팩트 저장(전역)
CALIB_MODE   = best_name
CALIB_PARAMS = params_best
BEST_NUM_BOOST = int(model.best_iteration or 2000)
FEATURE_LIST = features
CAT_COLS     = cat_cols
LGBM_PARAMS  = params
print("\n[Saved] CALIB_MODE, CALIB_PARAMS, BEST_NUM_BOOST, FEATURE_LIST, CAT_COLS, LGBM_PARAMS")


  cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]


Training until validation scores don't improve for 200 rounds
[400]	train's rmse: 191.932	valid's rmse: 250.611
[800]	train's rmse: 158.815	valid's rmse: 239.59
[1200]	train's rmse: 140.375	valid's rmse: 234.776
[1600]	train's rmse: 127.489	valid's rmse: 232.896
[2000]	train's rmse: 117.524	valid's rmse: 232.333
Early stopping, best iteration is:
[2090]	train's rmse: 115.572	valid's rmse: 231.904
[Calibration] best=ISO  RMSE=222.850  MAE=109.444  R2=0.9967
  - AB scores : RMSE=231.127, ISO : 222.850, ISO+BLD : 224.497
  - gamma=- (ISO 객체 저장됨)

[Calibration by decile]
                              y_mean     yhat_mean     n       bias
bin                                                               
(-0.19, 773.573]         480.713370    468.545060  1920 -12.168310
(773.573, 1150.179]      987.890062    977.938907  1920  -9.951155
(1150.179, 1514.157]    1330.305422   1317.805263  1920 -12.500158
(1514.157, 1813.01]     1695.009984   1676.481321  1920 -18.528663
(1813.01, 2166.778]    

  cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))
  va_err.groupby('건물번호')['se_blend'].mean().pow(0.5).sort_values(ascending=False).head(10))


In [172]:
# =========================
# 5R. Residual + Hard-Building Weights + Calibration Selection
# =========================
import numpy as np, pandas as pd, lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.isotonic import IsotonicRegression

def RMSE(y, yhat): return float(np.sqrt(mean_squared_error(y, yhat)))

# 0) 피처 새로고침(위 셀에서 train_feat 이미 있으면 재사용)
try:
    _ = train_feat
except NameError:
    train_feat = make_features(train_df)

# 1) 검증 분할
VAL_START = pd.Timestamp(2024,8,17,0)
VAL_END   = pd.Timestamp(2024,8,24,23)
is_val = (train_feat['dt'] >= VAL_START) & (train_feat['dt'] <= VAL_END)
train_part = train_feat[~is_val].copy()
valid_part = train_feat[ is_val].copy()

# 2) 피처 선택 + 카테고리
features = get_feature_cols(train_part)
cat_cols = [c for c in ['건물번호','건물유형'] if c in features]
for c in cat_cols:
    train_part[c] = train_part[c].astype('category')
    valid_part[c] = valid_part[c].astype('category')

X_tr = train_part.reindex(columns=features).replace([np.inf,-np.inf], np.nan)
X_va = valid_part.reindex(columns=features).replace([np.inf,-np.inf], np.nan)
y_tr = train_part['전력소비량(kWh)']
y_va = valid_part['전력소비량(kWh)']

# 3) 베이스라인 → 잔차 타깃
baseline_tr = build_baseline(train_part)
baseline_va = build_baseline(valid_part)
y_tr_resid = (y_tr - baseline_tr).astype(float)
y_va_resid = (y_va - baseline_va).astype(float)

# 4) 하드 빌딩 가중치 (필요시 수정)
HARD_BLD = {10,79,3,45,23,1,12,69,64,34}
w_tr = np.ones(len(train_part), dtype=float)
w_tr[train_part['건물번호'].isin(HARD_BLD).values] = 1.8  # 1.5~2.0 사이 튠

# 5) LGB 학습(보수적 규제)
params = {
    'objective':'regression','metric':'rmse','learning_rate':0.05,
    'num_leaves':48,'feature_fraction':0.85,'bagging_fraction':0.85,'bagging_freq':1,
    'min_data_in_leaf':150,'lambda_l2':3.0,'seed':42,'verbosity':-1,'num_threads':4
}
lgb_train = lgb.Dataset(X_tr, label=y_tr_resid, weight=w_tr, categorical_feature=cat_cols or None)
lgb_valid = lgb.Dataset(X_va, label=y_va_resid, categorical_feature=cat_cols or None)
model = lgb.train(
    params, lgb_train, num_boost_round=5000,
    valid_sets=[lgb_train,lgb_valid], valid_names=['train','valid'],
    callbacks=[lgb.early_stopping(200), lgb.log_evaluation(400)]
)

# 6) 보정 후보 계산
pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)
true_resid_va = (y_va - baseline_va).values

# (A) 선형(a,b): y ≈ base + a*pred + b
A = np.column_stack([pred_va_resid, np.ones_like(pred_va_resid)])
a, b = np.linalg.lstsq(A, true_resid_va, rcond=None)[0]
a = float(np.clip(a, 0.0, 1.5)); b = float(b)
pred_ab = baseline_va.values + a * pred_va_resid + b
sc_ab = (RMSE(y_va, pred_ab), mean_absolute_error(y_va, pred_ab), r2_score(y_va, pred_ab))

# (B) Isotonic: y ≈ base + g(pred)
iso = IsotonicRegression(out_of_bounds='clip')
iso.fit(pred_va_resid, true_resid_va)
pred_iso = baseline_va.values + iso.transform(pred_va_resid)
sc_iso = (RMSE(y_va, pred_iso), mean_absolute_error(y_va, pred_iso), r2_score(y_va, pred_iso))

# (B+) Isotonic + 빌딩 평균 잔차 보정(γ)
# (B+) Isotonic + 빌딩 평균 잔차 보정(γ)
gamma = 0.5  # 0.3~0.7 시도 가능

train_resid = (y_tr - baseline_tr).values

# 빌딩별 train 잔차 평균 (observed=False로 경고 제거)
bld_bias_map = (
    pd.Series(train_resid, index=train_part.index)
      .groupby(train_part['건물번호'], observed=False)
      .mean()
)

# 인덱스 타입 통일(int)
if isinstance(getattr(bld_bias_map.index, 'dtype', None), pd.CategoricalDtype):
    bld_bias_map.index = bld_bias_map.index.astype('int64')

# valid 건물번호도 int로 변환 후 dict 매핑 → float → NaN을 0.0으로
bld_bias_dict = bld_bias_map.to_dict()
bld_codes_va = valid_part['건물번호'].astype('int64')
bias_va = bld_codes_va.map(bld_bias_dict).astype('float64').fillna(0.0).to_numpy()

pred_iso_bias = pred_iso + gamma * bias_va
sc_iso_bias = (RMSE(y_va, pred_iso_bias), mean_absolute_error(y_va, pred_iso_bias), r2_score(y_va, pred_iso_bias))


# 7) 베스트 보정 선택
cands = {
    "AB":        (pred_ab,       sc_ab,       {"a":a,"b":b}),
    "ISO":       (pred_iso,      sc_iso,      {"iso":iso}),
    "ISO+BLD":   (pred_iso_bias, sc_iso_bias, {"iso":iso,"gamma":gamma,"bld_bias_map":bld_bias_map}),
}
best_name, (pred_best, (rmse_best, mae_best, r2_best), params_best) = min(
    cands.items(), key=lambda kv: kv[1][1][0]
)

print(f"[Calibration] best={best_name}  RMSE={rmse_best:.3f}  MAE={mae_best:.3f}  R2={r2_best:.4f}")
print(f"  - AB scores : RMSE={sc_ab[0]:.3f}, ISO : {sc_iso[0]:.3f}, ISO+BLD : {sc_iso_bias[0]:.3f}")
if best_name == "AB":
    print(f"  - a={params_best['a']:.3f}, b={params_best['b']:.3f}")
elif best_name.startswith("ISO"):
    print(f"  - gamma={params_best.get('gamma','-')} (ISO 객체 저장됨)")

# 8) 디사일 캘리브레이션 & Top-10
df_cal = pd.DataFrame({"y": y_va.values, "yhat": pred_best})
df_cal["bin"] = pd.qcut(df_cal["yhat"], q=10, duplicates="drop")
cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))
cal["bias"] = cal["yhat_mean"] - cal["y_mean"]
print("\n[Calibration by decile]\n", cal)
print("평균 절대 바이어스:", cal["bias"].abs().mean())

va_err = valid_part[['건물번호']].copy()
va_err['se_blend'] = (y_va.values - pred_best)**2
print("\n[Top-10 by 건물번호]\n",
      va_err.groupby('건물번호')['se_blend'].mean().pow(0.5).sort_values(ascending=False).head(10))

# 9) 6단계에서 재사용할 아티팩트 저장(전역)
CALIB_MODE   = best_name
CALIB_PARAMS = params_best
BEST_NUM_BOOST = int(model.best_iteration or 2000)
FEATURE_LIST = features
CAT_COLS     = cat_cols
LGBM_PARAMS  = params
print("\n[Saved] CALIB_MODE, CALIB_PARAMS, BEST_NUM_BOOST, FEATURE_LIST, CAT_COLS, LGBM_PARAMS")


  cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]


Training until validation scores don't improve for 200 rounds
[400]	train's rmse: 205.222	valid's rmse: 252.457
[800]	train's rmse: 172.089	valid's rmse: 238.118
[1200]	train's rmse: 153.767	valid's rmse: 233.093
[1600]	train's rmse: 141.129	valid's rmse: 230.623
[2000]	train's rmse: 131.572	valid's rmse: 229.91
[2400]	train's rmse: 123.665	valid's rmse: 229.166
[2800]	train's rmse: 117.054	valid's rmse: 228.757
Early stopping, best iteration is:
[2774]	train's rmse: 117.47	valid's rmse: 228.635
[Calibration] best=ISO  RMSE=220.547  MAE=107.776  R2=0.9967
  - AB scores : RMSE=228.022, ISO : 220.547, ISO+BLD : 222.327
  - gamma=- (ISO 객체 저장됨)

[Calibration by decile]
                              y_mean     yhat_mean     n       bias
bin                                                               
(-4.404, 773.014]        480.723703    470.111125  1920 -10.612578
(773.014, 1153.43]       987.781547    979.375712  1920  -8.405835
(1153.43, 1516.8]       1330.462797   1320.860218  1920 

  cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))
  va_err.groupby('건물번호')['se_blend'].mean().pow(0.5).sort_values(ascending=False).head(10))


In [175]:
# =========================
# 5R. Residual + Hard-Building Weights + Calibration Selection
# =========================
import numpy as np, pandas as pd, lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.isotonic import IsotonicRegression

def RMSE(y, yhat): return float(np.sqrt(mean_squared_error(y, yhat)))

# 0) 피처 새로고침(위 셀에서 train_feat 이미 있으면 재사용)
try:
    _ = train_feat
except NameError:
    train_feat = make_features(train_df)

# 1) 검증 분할
VAL_START = pd.Timestamp(2024,8,17,0)
VAL_END   = pd.Timestamp(2024,8,24,23)
is_val = (train_feat['dt'] >= VAL_START) & (train_feat['dt'] <= VAL_END)
train_part = train_feat[~is_val].copy()
valid_part = train_feat[ is_val].copy()

# 2) 피처 선택 + 카테고리
features = get_feature_cols(train_part)
cat_cols = [c for c in ['건물번호','건물유형'] if c in features]
for c in cat_cols:
    train_part[c] = train_part[c].astype('category')
    valid_part[c] = valid_part[c].astype('category')

X_tr = train_part.reindex(columns=features).replace([np.inf,-np.inf], np.nan)
X_va = valid_part.reindex(columns=features).replace([np.inf,-np.inf], np.nan)
y_tr = train_part['전력소비량(kWh)']
y_va = valid_part['전력소비량(kWh)']

# 3) 베이스라인 → 잔차 타깃
baseline_tr = build_baseline(train_part)
baseline_va = build_baseline(valid_part)
y_tr_resid = (y_tr - baseline_tr).astype(float)
y_va_resid = (y_va - baseline_va).astype(float)

# 4) 하드 빌딩 가중치 (필요시 수정)
HARD_BLD = {10,79,3,45,23,1,12,69,64,34}
w_tr = np.ones(len(train_part), dtype=float)
w_tr[train_part['건물번호'].isin(HARD_BLD).values] = 1.8  # 1.5~2.0 사이 튠

# 5) LGB 학습(보수적 규제)
params = {
    'objective':'regression','metric':'rmse','learning_rate':0.05,
    'num_leaves':48,'feature_fraction':0.85,'bagging_fraction':0.85,'bagging_freq':1,
    'min_data_in_leaf':150,'lambda_l2':3.0,'seed':42,'verbosity':-1,'num_threads':4
}
lgb_train = lgb.Dataset(X_tr, label=y_tr_resid, weight=w_tr, categorical_feature=cat_cols or None)
lgb_valid = lgb.Dataset(X_va, label=y_va_resid, categorical_feature=cat_cols or None)
model = lgb.train(
    params, lgb_train, num_boost_round=5000,
    valid_sets=[lgb_train,lgb_valid], valid_names=['train','valid'],
    callbacks=[lgb.early_stopping(200), lgb.log_evaluation(400)]
)

# =========================
# 6)~9) 보정/선택/리포트 (롤백/고정 버전)
# =========================
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def RMSE(y, yhat): return float(np.sqrt(mean_squared_error(y, yhat)))

# --- (옵션) 하드 빌딩 가중치 '고정'
HARD_BLD = {10,79,3,45,23,1,12,69,64,34}  # 이전에 잘 나왔던 셋으로 고정
# w_tr은 위 학습 셀에서 이미 이 셋 기준으로 적용된 상태면 그대로 두면 됨

pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)
true_resid_va = (y_va - baseline_va).values

# (A) 선형 보정: y ≈ base + a*pred + b  (참고용)
A = np.column_stack([pred_va_resid, np.ones_like(pred_va_resid)])
a, b = np.linalg.lstsq(A, true_resid_va, rcond=None)[0]
a = float(np.clip(a, 0.0, 1.5)); b = float(b)
pred_ab = baseline_va.values + a * pred_va_resid + b
sc_ab = (RMSE(y_va, pred_ab), mean_absolute_error(y_va, pred_ab), r2_score(y_va, pred_ab))

# (B) ISO(플레인): 잔차 그대로로 적합(클리핑/바이어스 보정 없음)
iso = IsotonicRegression(out_of_bounds='clip')
iso.fit(pred_va_resid, true_resid_va)
pred_iso = baseline_va.values + iso.transform(pred_va_resid)
sc_iso = (RMSE(y_va, pred_iso), mean_absolute_error(y_va, pred_iso), r2_score(y_va, pred_iso))

# --- 강제 선택: ISO(플레인)
best_name = "ISO"
pred_best = pred_iso
rmse_best, mae_best, r2_best = sc_iso
params_best = {"iso": iso}

print(f"[Calibration] forced={best_name}  RMSE={rmse_best:.3f}  MAE={mae_best:.3f}  R2={r2_best:.4f}")
print(f"  - AB RMSE={sc_ab[0]:.3f} | ISO RMSE={sc_iso[0]:.3f}")

# --- 디사일 캘리브레이션 & Top-10
df_cal = pd.DataFrame({"y": y_va.values, "yhat": pred_best})
df_cal["bin"] = pd.qcut(df_cal["yhat"], q=10, duplicates="drop")
cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))
cal["bias"] = cal["yhat_mean"] - cal["y_mean"]
print("\n[Calibration by decile]\n", cal)
print("평균 절대 바이어스:", cal["bias"].abs().mean())

va_err = valid_part[['건물번호']].copy()
va_err['se_blend'] = (y_va.values - pred_best)**2
top10 = va_err.groupby('건물번호')['se_blend'].mean().pow(0.5).sort_values(ascending=False).head(10)
print("\n[Top-10 by 건물번호]\n", top10)

# --- 다음 런용 HARD_BLD '자동 갱신 끄기' (그대로 유지)
# (아무 것도 갱신하지 않음)

# --- 6R용 아티팩트 저장
CALIB_MODE     = best_name
CALIB_PARAMS   = params_best
BEST_NUM_BOOST = int(model.best_iteration or 2000)
FEATURE_LIST   = features
CAT_COLS       = cat_cols
LGBM_PARAMS    = params
print("\n[Saved] CALIB_MODE=ISO (plain), BEST_NUM_BOOST, FEATURE_LIST, CAT_COLS, LGBM_PARAMS")


  cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]


Training until validation scores don't improve for 200 rounds
[400]	train's rmse: 205.222	valid's rmse: 252.457
[800]	train's rmse: 172.089	valid's rmse: 238.118
[1200]	train's rmse: 153.767	valid's rmse: 233.093
[1600]	train's rmse: 141.129	valid's rmse: 230.623
[2000]	train's rmse: 131.572	valid's rmse: 229.91
[2400]	train's rmse: 123.665	valid's rmse: 229.166
[2800]	train's rmse: 117.054	valid's rmse: 228.757
Early stopping, best iteration is:
[2774]	train's rmse: 117.47	valid's rmse: 228.635
[Calibration] forced=ISO  RMSE=220.547  MAE=107.776  R2=0.9967
  - AB RMSE=228.022 | ISO RMSE=220.547

[Calibration by decile]
                              y_mean     yhat_mean     n       bias
bin                                                               
(-4.404, 773.014]        480.723703    470.111125  1920 -10.612578
(773.014, 1153.43]       987.781547    979.375712  1920  -8.405835
(1153.43, 1516.8]       1330.462797   1320.860218  1920  -9.602579
(1516.8, 1815.118]      1693.607271 

  cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))
  top10 = va_err.groupby('건물번호')['se_blend'].mean().pow(0.5).sort_values(ascending=False).head(10)


In [178]:
# =========================
# 6R. Full Retrain & Submit (feature alignment fix)
# =========================
DO_SUBMIT = True  # 필요시 False

# 0) all_feat 준비(있으면 재사용)
try:
    _ = all_feat
except NameError:
    test_orig = read_csv_smart(TEST_PATH)
    build_df  = read_csv_smart(BUILD_PATH)
    test_df = pd.merge(test_orig, build_df, on='건물번호', how='left')
    test_df = clean_capacity_fields(test_df)
    test_df = ensure_datetime_cols(test_df)
    all_df  = pd.concat([train_df, test_df], ignore_index=True)
    all_feat = make_features(all_df)

all_feat_train = all_feat.iloc[:len(train_df)].copy()
all_feat_test  = all_feat.iloc[len(train_df):].copy()

# === 핵심: 5R에서 저장한 FEATURE_LIST/CAT_COLS를 반드시 사용 ===
# (혹시 변수 없으면 5R의 model에서 가져오고 교집합만 사용)
if 'FEATURE_LIST' not in globals():
    FEATURE_LIST = list(model.feature_name())
if 'CAT_COLS' not in globals():
    CAT_COLS = [c for c in ['건물번호','건물유형'] if c in FEATURE_LIST]

# 안전: train/test에 없는 컬럼 제거 (교집합 유지)
common_feats = [c for c in FEATURE_LIST if c in all_feat_train.columns and c in all_feat_test.columns]
if len(common_feats) != len(FEATURE_LIST):
    missing = set(FEATURE_LIST) - set(common_feats)
    print("⚠️ 다음 피처가 test/train에 없어 제외됨:", sorted(list(missing)))
FEATURE_LIST = common_feats
CAT_COLS = [c for c in CAT_COLS if c in FEATURE_LIST]

# 정렬/결측 처리 유틸
def align_for_lgb(df_train, df_test, feature_list, cat_cols):
    X_tr = df_train.reindex(columns=feature_list)
    X_te = df_test.reindex(columns=feature_list)

    # 카테고리 먼저 맞춰주기
    for c in (cat_cols or []):
        X_tr[c] = X_tr[c].astype('category')
        X_te[c] = X_te[c].astype('category')
        cats = X_tr[c].cat.categories
        X_te[c] = X_te[c].cat.set_categories(cats)

    # 수치 컬럼 결측/inf 처리
    num_cols = [c for c in feature_list if c not in (cat_cols or [])]
    X_tr[num_cols] = (X_tr[num_cols]
                      .replace([np.inf, -np.inf], np.nan)
                      .astype(float)
                      .fillna(0.0))
    X_te[num_cols] = (X_te[num_cols]
                      .replace([np.inf, -np.inf], np.nan)
                      .astype(float)
                      .fillna(0.0))
    return X_tr, X_te

# 1) 정렬된 학습/예측 행렬 만들기
features_full = FEATURE_LIST
cat_cols_full = CAT_COLS
X_full, X_te = align_for_lgb(all_feat_train, all_feat_test, features_full, cat_cols_full)

# 안전 체크
assert list(X_full.columns) == list(X_te.columns), "train/test 피처 순서 불일치"
print(f"n_features (train/test): {X_full.shape[1]} / {X_te.shape[1]}")

# 2) 전체 재학습(잔차 타깃)
y_full = all_feat_train['전력소비량(kWh)']
baseline_full = build_baseline(all_feat_train)
baseline_te   = build_baseline(all_feat_test)

w_full = np.ones(len(all_feat_train), dtype=float)
try:
    # 5R에서 썼던 HARD_BLD 그대로 있으면 재적용
    w_full[all_feat_train['건물번호'].isin(HARD_BLD).values] = 1.8
except NameError:
    pass

lgb_full = lgb.Dataset(
    X_full,
    label=(y_full - baseline_full).astype(float),
    weight=w_full,
    categorical_feature=cat_cols_full or None
)

final_model = lgb.train(LGBM_PARAMS, lgb_full, num_boost_round=BEST_NUM_BOOST)

# 3) 테스트 잔차 예측 + 5R에서 선택된 보정 적용
resid_te = final_model.predict(X_te, num_iteration=final_model.best_iteration)

if CALIB_MODE == "ISO":
    iso = CALIB_PARAMS["iso"]
    test_pred = baseline_te.values + iso.transform(resid_te)
elif CALIB_MODE == "AB":
    a, b = CALIB_PARAMS["a"], CALIB_PARAMS["b"]
    test_pred = baseline_te.values + a * resid_te + b
elif CALIB_MODE == "ISO+BLD":
    iso   = CALIB_PARAMS["iso"]
    gamma = CALIB_PARAMS["gamma"]
    bmap  = CALIB_PARAMS["bld_bias_map"]
    if isinstance(getattr(bmap.index, 'dtype', None), pd.CategoricalDtype):
        bmap.index = bmap.index.astype('int64')
    bias_te = all_feat_test['건물번호'].astype('int64').map(bmap.to_dict()).astype('float64').fillna(0.0).to_numpy()
    test_pred = baseline_te.values + iso.transform(resid_te) + gamma * bias_te
else:
    test_pred = baseline_te.values + resid_te  # fallback

# 안전 클램프
test_pred = np.clip(test_pred, 0, None)

if DO_SUBMIT:
    sub = read_csv_smart(SAMPLE_SUB)
    sub['answer'] = pd.Series(test_pred, index=sub.index)
    sub.to_csv(OUT_SUB, index=False, encoding='utf-8-sig')
    print(f"저장 완료 → {OUT_SUB}  (rows={len(sub)})")
else:
    print("제출 생략(DO_SUBMIT=False): 예측만 계산 완료")



n_features (train/test): 40 / 40
저장 완료 → C:\Users\user\Downloads\open (1)\baseline_lgbm_submission.csv  (rows=16800)


In [180]:
# =========================
# test 전처리: train과 '완전히 동일' 스키마/타입으로 맞추기
#  - building_info 병합
#  - 설비 용량 '-' → 0 (float)
#  - '일시' → '날짜','시간','dt' 생성
#  - 일사/일조 없으면 0.0 컬럼 추가
#  - train(merged_train.csv)를 기준으로 컬럼/순서/카테고리 일치
#  - 저장: merged_test.csv
# =========================
import pandas as pd
import numpy as np

# 경로
TRAIN_MERGED_PATH = r"C:\Users\user\Downloads\open (1)\merged_train.csv"  # 앞에서 저장한 train 전처리 결과
TEST_PATH         = r"C:\Users\user\Downloads\open (1)\test.csv"
BUILD_PATH        = r"C:\Users\user\Downloads\open (1)\building_info.csv"
OUT_TEST_PATH     = r"C:\Users\user\Downloads\open (1)\merged_test.csv"

# ----- 유틸 -----
def read_csv_smart(path, **kwargs):
    try:
        return pd.read_csv(path, encoding=kwargs.get('encoding', 'utf-8-sig'))
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding='cp949')

def clean_capacity_fields(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for c in ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c].replace('-', 0), errors='coerce').fillna(0.0).astype(float)
    return df

def ensure_datetime_cols(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if '날짜' not in df.columns or '시간' not in df.columns:
        if '일시' in df.columns:
            s = df['일시'].astype(str)
            df['날짜'] = pd.to_datetime(s.str.slice(0, 8), format='%Y%m%d')
            df['시간'] = s.str.slice(9, 11).astype(int)
        else:
            raise ValueError("날짜/시간 정보가 없습니다. ('일시' 또는 '날짜','시간' 필요)")
    else:
        df['날짜'] = pd.to_datetime(df['날짜'])
        df['시간'] = df['시간'].astype(int)
    df['dt'] = df['날짜'] + pd.to_timedelta(df['시간'], unit='h')
    return df

# ----- 1) 기준(train 전처리 결과) 스키마 로드 -----
train_ref = read_csv_smart(TRAIN_MERGED_PATH)
# 카테고리 정보가 있으면 유지 (건물번호/건물유형은 우리가 카테고리로 저장했을 가능성 큼)
for c in ['건물번호','건물유형']:
    if c in train_ref.columns:
        try:
            train_ref[c] = train_ref[c].astype('category')
        except Exception:
            pass

# ----- 2) test 로드 & train과 동일 전처리 -----
test = read_csv_smart(TEST_PATH)
build = read_csv_smart(BUILD_PATH)

test_merged = pd.merge(test, build, on='건물번호', how='left')
test_merged = clean_capacity_fields(test_merged)
test_merged = ensure_datetime_cols(test_merged)

# 일사/일조 없으면 0으로 생성(학습 일관성)
if '일사(MJ/m2)' not in test_merged.columns:
    test_merged['일사(MJ/m2)'] = 0.0
if '일조(hr)' not in test_merged.columns:
    test_merged['일조(hr)'] = 0.0

# ----- 3) 컬럼/순서/타입을 train에 '맞춰서' 정렬 -----
# target은 test에 없으므로 ref에서 제거
ref_cols = [c for c in train_ref.columns if c != '전력소비량(kWh)']

# ref에 있는데 test에 없는 컬럼은 dtype에 따라 기본값으로 생성
for c in ref_cols:
    if c not in test_merged.columns:
        ref_dtype = train_ref[c].dtype
        if np.issubdtype(ref_dtype, np.number):
            test_merged[c] = 0.0
        elif np.issubdtype(ref_dtype, np.datetime64):
            test_merged[c] = pd.NaT
        else:
            test_merged[c] = pd.Series([pd.NA]*len(test_merged), dtype="object")

# 반대로, test에만 있는 컬럼은 그대로 두되 저장/모델링 때는 ref_cols 순서만 사용
# 카테고리 일치(라벨 매칭)
for c in ['건물번호','건물유형']:
    if c in ref_cols and c in test_merged.columns:
        # train_ref 쪽 카테고리 있으면 세팅
        if str(train_ref[c].dtype) == 'category':
            cats = train_ref[c].astype('category').cat.categories
            test_merged[c] = test_merged[c].astype('category').cat.set_categories(cats)
        else:
            # 아닌 경우에도 최소한 dtype 통일
            test_merged[c] = test_merged[c].astype(train_ref[c].dtype)

# 최종 컬럼 순서 train 기준으로 정렬
test_aligned = test_merged.reindex(columns=ref_cols)

# ----- 4) 저장 -----
test_aligned.to_csv(OUT_TEST_PATH, index=False, encoding='utf-8-sig')
print(f"[OK] 저장: {OUT_TEST_PATH}  rows={len(test_aligned)}  cols={len(test_aligned.columns)}")
# (선택) sanity check
missing_after = [c for c in ref_cols if c not in test_aligned.columns]
if missing_after:
    print("⚠️ 아직 없는 컬럼:", missing_after)


[OK] 저장: C:\Users\user\Downloads\open (1)\merged_test.csv  rows=16800  cols=18


In [184]:
# =========================================
# Test 예측 (train과 동일 파이프라인, num_date_time 제외, factor 처리)
# =========================================
import numpy as np, pandas as pd, lightgbm as lgb

# --- 경로 (필요시 수정) ---
TRAIN_MERGED_PATH = r"C:\Users\user\Downloads\open (1)\merged_train.csv"
TEST_PATH         = r"C:\Users\user\Downloads\open (1)\merged_test.csv"
BUILD_PATH        = r"C:\Users\user\Downloads\open (1)\building_info.csv"
SAMPLE_SUB        = r"C:\Users\user\Downloads\open (1)\sample_submission.csv"
OUT_SUB           = r"C:\Users\user\Downloads\open (1)\submission.csv"

# --- 유틸 ---
def read_csv_smart(path, **kwargs):
    try:
        return pd.read_csv(path, encoding=kwargs.get('encoding', 'utf-8-sig'))
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding='cp949')

def align_for_lgb(df_tr, df_te, feature_list, cat_cols):
    """train/test를 같은 컬럼·순서·dtype으로 정렬"""
    X_tr = df_tr.reindex(columns=feature_list).copy()
    X_te = df_te.reindex(columns=feature_list).copy()
    # 카테고리 맞춤
    for c in (cat_cols or []):
        X_tr[c] = X_tr[c].astype('category')
        X_te[c] = X_te[c].astype('category').cat.set_categories(X_tr[c].cat.categories)
    # 수치 결측/inf 처리
    num_cols = [c for c in feature_list if c not in (cat_cols or [])]
    X_tr[num_cols] = (X_tr[num_cols].replace([np.inf, -np.inf], np.nan).astype(float).fillna(0.0))
    X_te[num_cols] = (X_te[num_cols].replace([np.inf, -np.inf], np.nan).astype(float).fillna(0.0))
    return X_tr, X_te

# --- 0) 데이터 로드 (train 전처리 결과 + test 전처리본) ---
train_df = read_csv_smart(TRAIN_MERGED_PATH)
test_raw = read_csv_smart(TEST_PATH)   # merged_test.csv (이미 병합된 파일)

# ✅ 둘 다 날짜/시간/dt를 '확실히' datetime으로 보정
train_df = ensure_datetime_cols(train_df)
test_df  = test_raw.copy()
test_df  = clean_capacity_fields(test_df)
test_df  = ensure_datetime_cols(test_df)

# (강화 가드: dt가 여전히 object면 한 번 더 강제 변환)
for df_ in (train_df, test_df):
    if 'dt' in df_.columns and not np.issubdtype(df_['dt'].dtype, np.datetime64):
        df_['날짜'] = pd.to_datetime(df_['날짜'])
        df_['시간'] = df_['시간'].astype(int)
        df_['dt']  = df_['날짜'] + pd.to_timedelta(df_['시간'], unit='h')

# --- 2) train+test 합쳐 동일 피처 생성 ---
all_df   = pd.concat([train_df, test_df], ignore_index=True)
all_feat = make_features(all_df)  # 이제 .dt 접근 OK

# --- 3) 피처 목록: 5R에서 저장한 FEATURE_LIST 우선, 없으면 생성 ---
EXCLUDE = {'전력소비량(kWh)','dt','날짜','시간','일시','num_date_time'}
if 'FEATURE_LIST' not in globals():
    from pandas.api.types import is_numeric_dtype, is_categorical_dtype
    FEATURE_LIST = [c for c in all_feat_train.columns if c not in EXCLUDE
                    and (is_numeric_dtype(all_feat_train[c]) or str(all_feat_train[c].dtype)=='category')]
else:
    # num_date_time 등 제외 보장
    FEATURE_LIST = [c for c in FEATURE_LIST if c not in EXCLUDE]

# 카테고리 지정(요구: 건물번호/건물유형 factor)
if 'CAT_COLS' not in globals():
    CAT_COLS = [c for c in ['건물번호','건물유형'] if c in FEATURE_LIST]
else:
    CAT_COLS = [c for c in CAT_COLS if c in FEATURE_LIST]

# 공통 피처만 사용 (학습/예측 모두 존재)
common_feats = [c for c in FEATURE_LIST if c in all_feat_train.columns and c in all_feat_test.columns]
if len(common_feats) != len(FEATURE_LIST):
    removed = sorted(set(FEATURE_LIST) - set(common_feats))
    print("⚠️ 제외된 피처:", removed)
FEATURE_LIST = common_feats
CAT_COLS     = [c for c in CAT_COLS if c in FEATURE_LIST]

# --- 4) 행렬 정렬 & 카테고리(factor) 일치 ---
X_full, X_te = align_for_lgb(all_feat_train, all_feat_test, FEATURE_LIST, CAT_COLS)
y_full = all_feat_train['전력소비량(kWh)']

# --- 5) 베이스라인 계산 (train/test)
baseline_full = build_baseline(all_feat_train)
baseline_te   = build_baseline(all_feat_test)

# --- 6) 전체 재학습(잔차 타깃) ---
# 5R에서 저장한 파라미터/반복수 우선 사용, 없으면 안전 기본값
if 'LGBM_PARAMS' not in globals():
    LGBM_PARAMS = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 48,          # ← 64 → 48
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 1,
    'min_data_in_leaf': 150,   # ← 120 → 150
    'lambda_l2': 3.0,          # ← 2.0 → 3.0
    'seed': 42,
    'verbosity': -1,
    'num_threads': 4
}
    # 5R에서 사용할 params도 이걸 그대로 참조하게
params = LGBM_PARAMS.copy()

print("[LGBM PARAMS in use]")
for k, v in LGBM_PARAMS.items():
    print(f"  {k}: {v}")
    
if 'BEST_NUM_BOOST' not in globals():
    BEST_NUM_BOOST = 2000

# 하드 빌딩 가중치(있으면 적용)
w_full = np.ones(len(all_feat_train), dtype=float)
if 'HARD_BLD' in globals():
    w_full[all_feat_train['건물번호'].isin(HARD_BLD).values] = 1.8

lgb_full = lgb.Dataset(
    X_full,
    label=(y_full - baseline_full).astype(float),
    weight=w_full,
    categorical_feature=CAT_COLS or None
)
final_model = lgb.train(LGBM_PARAMS, lgb_full, num_boost_round=BEST_NUM_BOOST)

# --- 7) 테스트 잔차 예측 + 5R 보정 적용 (ISO / AB / ISO+BLD) ---
resid_te = final_model.predict(X_te, num_iteration=final_model.best_iteration)

if 'CALIB_MODE' in globals() and CALIB_MODE == "ISO":
    iso = CALIB_PARAMS["iso"]
    test_pred = baseline_te.values + iso.transform(resid_te)
elif 'CALIB_MODE' in globals() and CALIB_MODE == "AB":
    a, b = CALIB_PARAMS["a"], CALIB_PARAMS["b"]
    test_pred = baseline_te.values + a * resid_te + b
elif 'CALIB_MODE' in globals() and CALIB_MODE == "ISO+BLD":
    iso   = CALIB_PARAMS["iso"]
    gamma = CALIB_PARAMS["gamma"]
    bmap  = CALIB_PARAMS["bld_bias_map"]
    # 빌딩 바이어스 매핑
    try:
        from pandas.api.types import CategoricalDtype
        if isinstance(getattr(bmap.index, 'dtype', None), CategoricalDtype):
            bmap.index = bmap.index.astype('int64')
    except Exception:
        pass
    bias_te = all_feat_test['건물번호'].astype('int64').map(bmap.to_dict()).astype('float64').fillna(0.0).to_numpy()
    test_pred = baseline_te.values + iso.transform(resid_te) + gamma * bias_te
else:
    # 보정 정보 없으면 기본 복원
    test_pred = baseline_te.values + resid_te

# 음수 방지
test_pred = np.clip(test_pred, 0, None)

# --- 8) 저장 (num_date_time은 제출을 위한 ID로만 사용; 피처에는 미사용) ---
sub = read_csv_smart(SAMPLE_SUB)
sub['answer'] = pd.Series(test_pred, index=sub.index)
sub.to_csv(OUT_SUB, index=False, encoding='utf-8-sig')

print(f"[DONE] 예측 저장 → {OUT_SUB}")
print("n_features(train/test):", X_full.shape[1], "/", X_te.shape[1])
print("neg_rate:", float((test_pred < 0).mean()), " nan_rate:", float(np.isnan(test_pred).mean()))


[LGBM PARAMS in use]
  objective: regression
  metric: rmse
  learning_rate: 0.05
  num_leaves: 48
  feature_fraction: 0.85
  bagging_fraction: 0.85
  bagging_freq: 1
  min_data_in_leaf: 150
  lambda_l2: 3.0
  seed: 42
  verbosity: -1
  num_threads: 4
[DONE] 예측 저장 → C:\Users\user\Downloads\open (1)\submission.csv
n_features(train/test): 40 / 40
neg_rate: 0.0  nan_rate: 0.0


In [185]:
# =========================================
# Step 5 — 검증 학습(Residual) + 보정 선택(SMAPE 기준)
#  - 전제: read_csv_smart, clean_capacity_fields, ensure_datetime_cols, make_features 가 이미 정의됨
#  - 경로: 필요 시 수정
# =========================================
import numpy as np, pandas as pd, lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.isotonic import IsotonicRegression

# ----- 경로 -----
TRAIN_MERGED_PATH = r"C:\Users\user\Downloads\open (1)\merged_train.csv"

# ----- 공통 유틸 -----
def RMSE(y, yhat): return float(np.sqrt(mean_squared_error(y, yhat)))
def SMAPE(y, yhat, eps=1e-6):
    y = np.asarray(y, dtype=float); yhat = np.asarray(yhat, dtype=float)
    denom = (np.abs(y) + np.abs(yhat)) + eps
    return float(200.0 * np.mean(np.abs(y - yhat) / denom))

def pick_feature_list(df, exclude=None):
    from pandas.api.types import is_numeric_dtype
    exclude = set(exclude or [])
    feats = []
    for c in df.columns:
        if c in exclude: continue
        if str(df[c].dtype) == 'category' or is_numeric_dtype(df[c]):
            feats.append(c)
    return feats

def align_for_lgb(df_tr, df_va, feature_list, cat_cols):
    X_tr = df_tr.reindex(columns=feature_list).copy()
    X_va = df_va.reindex(columns=feature_list).copy()
    # 카테고리 라벨 정렬
    for c in (cat_cols or []):
        X_tr[c] = X_tr[c].astype('category')
        X_va[c] = X_va[c].astype('category').cat.set_categories(X_tr[c].cat.categories)
    # 수치형 정리
    num_cols = [c for c in feature_list if c not in (cat_cols or [])]
    X_tr[num_cols] = X_tr[num_cols].replace([np.inf,-np.inf], np.nan).astype(float).fillna(0.0)
    X_va[num_cols] = X_va[num_cols].replace([np.inf,-np.inf], np.nan).astype(float).fillna(0.0)
    return X_tr, X_va

def build_baseline_local(df):
    # make_features에서 만든 롤링 평균 기반(없는 경우 유연 대체)
    s7  = df.get('cons_samehour_mean_7d', pd.Series(0.0, index=df.index))
    s24 = df.get('cons_mean_24h',        pd.Series(0.0, index=df.index))
    base = 0.7 * s7.fillna(0) + 0.3 * s24.fillna(0)
    # 최후 보루: 건물별 과거 평균
    if base.isna().any():
        gb = df.groupby('건물번호')['전력소비량(kWh)'].transform('mean') if '전력소비량(kWh)' in df.columns else 0
        base = base.fillna(gb).fillna(0.0)
    return base

# ----- 데이터 로드 & 보정 -----
train_df = read_csv_smart(TRAIN_MERGED_PATH)
train_df = clean_capacity_fields(train_df)
train_df = ensure_datetime_cols(train_df)

# 피처 생성
train_feat = make_features(train_df)

# 검증 구간(08-17 ~ 08-24)
VAL_START = pd.Timestamp(2024,8,17,0)
VAL_END   = pd.Timestamp(2024,8,24,23)
is_val = (train_feat['dt'] >= VAL_START) & (train_feat['dt'] <= VAL_END)
tr_part = train_feat[~is_val].copy()
va_part = train_feat[ is_val].copy()

# 피처/타깃
EXCLUDE = {'전력소비량(kWh)','dt','날짜','시간','일시','num_date_time'}
FEATURE_LIST = pick_feature_list(tr_part, exclude=EXCLUDE)
CAT_COLS = [c for c in ['건물번호','건물유형'] if c in FEATURE_LIST]

X_tr, X_va = align_for_lgb(tr_part, va_part, FEATURE_LIST, CAT_COLS)
y_tr = tr_part['전력소비량(kWh)'].astype(float)
y_va = va_part['전력소비량(kWh)'].astype(float)

# 베이스라인 & 잔차 타깃
baseline_tr = build_baseline_local(tr_part)
baseline_va = build_baseline_local(va_part)
y_tr_resid = (y_tr - baseline_tr).astype(float)
y_va_resid = (y_va - baseline_va).astype(float)

# 하드 빌딩 가중치(선택)
HARD_BLD = {10,79,3,45,23,1,12,69,64,34}
w_tr = np.ones(len(tr_part), dtype=float)
if '건물번호' in tr_part.columns:
    w_tr[tr_part['건물번호'].isin(HARD_BLD).values] = 1.8

# LGB 파라미터(최적)
LGBM_PARAMS = {
    'objective':'regression','metric':'rmse','learning_rate':0.05,
    'num_leaves':48,'feature_fraction':0.85,'bagging_fraction':0.85,'bagging_freq':1,
    'min_data_in_leaf':150,'lambda_l2':3.0,'seed':42,'verbosity':-1,'num_threads':4
}

# 학습
lgb_tr = lgb.Dataset(X_tr, label=y_tr_resid, weight=w_tr, categorical_feature=CAT_COLS or None)
lgb_va = lgb.Dataset(X_va, label=y_va_resid, categorical_feature=CAT_COLS or None)
model = lgb.train(
    LGBM_PARAMS, lgb_tr, num_boost_round=5000,
    valid_sets=[lgb_tr, lgb_va], valid_names=['train','valid'],
    callbacks=[lgb.early_stopping(stopping_rounds=200), lgb.log_evaluation(period=400)]
)

# 검증 잔차 예측
pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)

# --- 보정 후보 (SMAPE 기준 선택) ---
# A) AB (선형)
A = np.column_stack([pred_va_resid, np.ones_like(pred_va_resid)])
a, b = np.linalg.lstsq(A, y_va_resid.values, rcond=None)[0]
a = float(np.clip(a, 0.0, 1.5)); b = float(b)
pred_ab  = baseline_va.values + a * pred_va_resid + b

# AB 미세 튠(SMAPE 기준 소그리드)
a_grid = np.linspace(max(0.0, a-0.1), min(1.5, a+0.1), 9)
b_grid = np.linspace(b-20, b+20, 9)
best_s, best_a, best_b = SMAPE(y_va, pred_ab), a, b
best_pred_ab = pred_ab
for aa in a_grid:
    for bb in b_grid:
        yh = baseline_va.values + aa * pred_va_resid + bb
        s = SMAPE(y_va, yh)
        if s < best_s:
            best_s, best_a, best_b, best_pred_ab = s, aa, bb, yh
a, b, pred_ab = float(best_a), float(best_b), best_pred_ab

# B) ISO(plain)
iso = IsotonicRegression(out_of_bounds='clip')
iso.fit(pred_va_resid, y_va_resid.values)
pred_iso = baseline_va.values + iso.transform(pred_va_resid)

# 점수 집계
cands = {"AB": pred_ab, "ISO": pred_iso}
scores = {k: {"SMAPE": SMAPE(y_va, v),
              "RMSE":  RMSE(y_va, v),
              "MAE":   float(mean_absolute_error(y_va, v))}
          for k, v in cands.items()}

best_name = sorted(scores.items(), key=lambda kv: (kv[1]["SMAPE"], kv[1]["RMSE"]))[0][0]
pred_best = cands[best_name]

print("[Calibration by SMAPE] best =", best_name)
for k, v in scores.items():
    print(f"  {k}: SMAPE={v['SMAPE']:.4f}, RMSE={v['RMSE']:.3f}, MAE={v['MAE']:.3f}")

# 리포트
df_cal = pd.DataFrame({"y": y_va.values, "yhat": pred_best})
df_cal["bin"] = pd.qcut(df_cal["yhat"], q=10, duplicates="drop")
cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))
cal["bias"] = cal["yhat_mean"] - cal["y_mean"]
print("\n[Calibration by decile]\n", cal)
print("평균 절대 바이어스:", cal["bias"].abs().mean())

va_err = va_part[['건물번호']].copy()
va_err['se'] = (y_va.values - pred_best)**2
print("\n[Top-10 by 건물번호]\n",
      va_err.groupby('건물번호')['se'].mean().pow(0.5).sort_values(ascending=False).head(10))

# 아티팩트 저장(6단계/제출에서 재사용)
CALIB_MODE     = best_name
CALIB_PARAMS   = {"a":a, "b":b} if best_name=="AB" else {"iso":iso}
BEST_NUM_BOOST = int(model.best_iteration or 2000)
FEATURE_LIST   = FEATURE_LIST
CAT_COLS       = CAT_COLS
print("\n[Saved] CALIB_MODE, CALIB_PARAMS, BEST_NUM_BOOST, FEATURE_LIST, CAT_COLS")


Training until validation scores don't improve for 200 rounds
[400]	train's rmse: 195.037	valid's rmse: 233.342
[800]	train's rmse: 164.818	valid's rmse: 220.22
[1200]	train's rmse: 148.592	valid's rmse: 215.879
[1600]	train's rmse: 137.138	valid's rmse: 213.496
[2000]	train's rmse: 128.38	valid's rmse: 212.477
[2400]	train's rmse: 121.045	valid's rmse: 211.972
Early stopping, best iteration is:
[2531]	train's rmse: 118.887	valid's rmse: 211.713
[Calibration by SMAPE] best = AB
  AB: SMAPE=4.8102, RMSE=211.411, MAE=105.527
  ISO: SMAPE=4.9326, RMSE=205.342, MAE=104.091

[Calibration by decile]
                                       y_mean     yhat_mean     n       bias
bin                                                                        
(-22.668000000000003, 789.083]    480.411891    474.440015  1920  -5.971876
(789.083, 1158.568]               986.544745    987.099117  1920   0.554373
(1158.568, 1533.654]             1329.427161   1326.981954  1920  -2.445208
(1533.654, 1830.00

  cal = df_cal.groupby("bin").agg(y_mean=("y","mean"), yhat_mean=("yhat","mean"), n=("y","size"))


In [187]:

# =========================================
# Step 5R — Hyperparameter Search (SMAPE-opt)
#  - 잔차학습(LGB) + 보정(AB/ISO) 중 SMAPE 최소 조합 선택
#  - 결과: LGBM_PARAMS_BEST, BEST_NUM_BOOST, CALIB_MODE, CALIB_PARAMS 저장
# =========================================
import numpy as np, pandas as pd, lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.isotonic import IsotonicRegression

np.random.seed(42)

# ---- 경로 ----
TRAIN_MERGED_PATH = r"C:\Users\user\Downloads\open (1)\merged_train.csv"

# ---- 유틸 ----
def RMSE(y, yhat): 
    return float(np.sqrt(mean_squared_error(y, yhat)))

def SMAPE(y, yhat, eps=1e-6):
    y = np.asarray(y, dtype=float); yhat = np.asarray(yhat, dtype=float)
    denom = (np.abs(y) + np.abs(yhat)) + eps
    return float(200.0 * np.mean(np.abs(y - yhat) / denom))

def pick_feature_list(df, exclude=None):
    from pandas.api.types import is_numeric_dtype
    exclude = set(exclude or [])
    feats = []
    for c in df.columns:
        if c in exclude: 
            continue
        if str(df[c].dtype) == 'category' or is_numeric_dtype(df[c]):
            feats.append(c)
    return feats

def align_for_lgb(df_tr, df_va, feature_list, cat_cols):
    X_tr = df_tr.reindex(columns=feature_list).copy()
    X_va = df_va.reindex(columns=feature_list).copy()
    for c in (cat_cols or []):
        X_tr[c] = X_tr[c].astype('category')
        X_va[c] = X_va[c].astype('category').cat.set_categories(X_tr[c].cat.categories)
    num_cols = [c for c in feature_list if c not in (cat_cols or [])]
    X_tr[num_cols] = (X_tr[num_cols].replace([np.inf,-np.inf], np.nan).astype(float).fillna(0.0))
    X_va[num_cols] = (X_va[num_cols].replace([np.inf,-np.inf], np.nan).astype(float).fillna(0.0))
    return X_tr, X_va

def build_baseline_local(df):
    s7  = df.get('cons_samehour_mean_7d', pd.Series(0.0, index=df.index))
    s24 = df.get('cons_mean_24h',        pd.Series(0.0, index=df.index))
    base = 0.7 * s7.fillna(0) + 0.3 * s24.fillna(0)
    if base.isna().any():
        gb = df.groupby('건물번호')['전력소비량(kWh)'].transform('mean') if '전력소비량(kWh)' in df.columns else 0
        base = base.fillna(gb).fillna(0.0)
    return base

def tune_ab_by_smape(baseline_va, pred_va_resid, y_va_resid, 
                     a_init, b_init, 
                     a_span=0.1, b_span=20, a_cap=(0.0, 1.5), n_grid=9):
    # OLS 해 주변 소그리드 탐색으로 SMAPE 최소화
    a_low  = max(a_cap[0], a_init - a_span); a_high = min(a_cap[1], a_init + a_span)
    b_low  = b_init - b_span;                b_high = b_init + b_span
    a_grid = np.linspace(a_low, a_high, n_grid)
    b_grid = np.linspace(b_low, b_high, n_grid)
    best_s, best_a, best_b, best_pred = np.inf, a_init, b_init, None
    for aa in a_grid:
        for bb in b_grid:
            yh = baseline_va.values + aa * pred_va_resid + bb
            s  = SMAPE(baseline_va.values + y_va_resid, yh)  # 원스케일 y vs yhat
            if s < best_s:
                best_s, best_a, best_b, best_pred = s, float(aa), float(bb), yh
    return best_a, best_b, best_pred, best_s

# ---- 데이터/피처 ----
train_df = read_csv_smart(TRAIN_MERGED_PATH)
train_df = clean_capacity_fields(train_df)
train_df = ensure_datetime_cols(train_df)
train_feat = make_features(train_df)

VAL_START = pd.Timestamp(2024,8,17,0)
VAL_END   = pd.Timestamp(2024,8,24,23)
is_val = (train_feat['dt'] >= VAL_START) & (train_feat['dt'] <= VAL_END)
tr_part = train_feat[~is_val].copy()
va_part = train_feat[ is_val].copy()

EXCLUDE = {'전력소비량(kWh)','dt','날짜','시간','일시','num_date_time'}
FEATURE_LIST = pick_feature_list(tr_part, exclude=EXCLUDE)
CAT_COLS = [c for c in ['건물번호','건물유형'] if c in FEATURE_LIST]

X_tr, X_va = align_for_lgb(tr_part, va_part, FEATURE_LIST, CAT_COLS)
y_tr = tr_part['전력소비량(kWh)'].astype(float)
y_va = va_part['전력소비량(kWh)'].astype(float)

baseline_tr = build_baseline_local(tr_part)
baseline_va = build_baseline_local(va_part)
y_tr_resid  = (y_tr - baseline_tr).astype(float)
y_va_resid  = (y_va - baseline_va).astype(float)

# 하드 빌딩 가중치(탐색 대상에 포함할 수도 있지만 기본은 고정)
HARD_BLD = {10,79,3,45,23,1,12,69,64,34}
w_tr = np.ones(len(tr_part), dtype=float)
if '건물번호' in tr_part.columns:
    w_tr[tr_part['건물번호'].isin(HARD_BLD).values] = 1.8

# ---- 탐색 공간 ----
NUM_TRIALS = 40  # 필요시 60~100으로 늘려도 OK
space = {
    "learning_rate":  [0.03, 0.04, 0.05, 0.06, 0.07],
    "num_leaves":     list(range(24, 96, 8)),          # 24~88
    "min_data_in_leaf": [80, 100, 120, 150, 180, 220],
    "feature_fraction": [0.70, 0.75, 0.80, 0.85, 0.90],
    "bagging_fraction": [0.70, 0.75, 0.80, 0.85, 0.90],
    "bagging_freq":   [1, 2],
    "lambda_l2":      [0.0, 1.0, 2.0, 3.0, 5.0],
}

def sample_params():
    return {
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate':   np.random.choice(space["learning_rate"]),
        'num_leaves':      int(np.random.choice(space["num_leaves"])),
        'feature_fraction':float(np.random.choice(space["feature_fraction"])),
        'bagging_fraction':float(np.random.choice(space["bagging_fraction"])),
        'bagging_freq':    int(np.random.choice(space["bagging_freq"])),
        'min_data_in_leaf':int(np.random.choice(space["min_data_in_leaf"])),
        'lambda_l2':       float(np.random.choice(space["lambda_l2"])),
        'seed': 42,
        'verbosity': -1,
        'num_threads': 4
    }

best = {
    "smape": np.inf,
    "rmse":  np.inf,
    "mae":   np.inf,
    "params": None,
    "num_boost": None,
    "calib": ("ISO", None)  # ("mode", params_dict)
}

for t in range(1, NUM_TRIALS+1):
    params = sample_params()

    # 학습 (early stopping)
    dtr = lgb.Dataset(X_tr, label=y_tr_resid, weight=w_tr, categorical_feature=CAT_COLS or None)
    dva = lgb.Dataset(X_va, label=y_va_resid, categorical_feature=CAT_COLS or None)
    model = lgb.train(
        params, dtr, num_boost_round=5000,
        valid_sets=[dtr, dva], valid_names=['train','valid'],
        callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=False)]
    )
    pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)

    # --- 보정 후보: AB/ISO 중 SMAPE 최소 ---
    # AB 초기값(OLS)
    A = np.column_stack([pred_va_resid, np.ones_like(pred_va_resid)])
    a0, b0 = np.linalg.lstsq(A, y_va_resid.values, rcond=None)[0]
    a0 = float(np.clip(a0, 0.0, 1.5)); b0 = float(b0)
    a_opt, b_opt, pred_ab, smape_ab = tune_ab_by_smape(baseline_va, pred_va_resid, y_va_resid, a0, b0)

    # ISO(plain)
    iso = IsotonicRegression(out_of_bounds='clip')
    iso.fit(pred_va_resid, y_va_resid.values)
    pred_iso = baseline_va.values + iso.transform(pred_va_resid)
    smape_iso = SMAPE(y_va, pred_iso)

    # 선택
    if smape_ab <= smape_iso:
        smape_best = smape_ab
        rmse_best  = RMSE(y_va, pred_ab)
        mae_best   = float(mean_absolute_error(y_va, pred_ab))
        calib_mode = ("AB", {"a":a_opt, "b":b_opt})
    else:
        smape_best = smape_iso
        rmse_best  = RMSE(y_va, pred_iso)
        mae_best   = float(mean_absolute_error(y_va, pred_iso))
        calib_mode = ("ISO", {"iso": iso})

    if smape_best < best["smape"]:
        best.update({
            "smape": smape_best,
            "rmse": rmse_best,
            "mae": mae_best,
            "params": params,
            "num_boost": int(model.best_iteration or 2000),
            "calib": calib_mode
        })
    if t % 10 == 0 or t == 1:
        print(f"[{t}/{NUM_TRIALS}] best_SMAPE={best['smape']:.4f}  (current {smape_best:.4f})  mode={calib_mode[0]}  iters={model.best_iteration}")

# ---- 결과 고정(전역) ----
LGBM_PARAMS_BEST = best["params"]
BEST_NUM_BOOST   = best["num_boost"]
CALIB_MODE       = best["calib"][0]
CALIB_PARAMS     = best["calib"][1]
FEATURE_LIST     = FEATURE_LIST
CAT_COLS         = CAT_COLS

print("\n[SEARCH DONE]")
print(" SMAPE:", f"{best['smape']:.4f}", "| RMSE:", f"{best['rmse']:.3f}", "| MAE:", f"{best['mae']:.3f}")
print(" MODE :", CALIB_MODE, CALIB_PARAMS)
print(" PARAMS:", LGBM_PARAMS_BEST)
print(" NUM_BOOST:", BEST_NUM_BOOST)


[1/40] best_SMAPE=4.4927  (current 4.4927)  mode=AB  iters=1434
[10/40] best_SMAPE=4.3632  (current 4.8248)  mode=AB  iters=1880
[20/40] best_SMAPE=4.3632  (current 5.2165)  mode=AB  iters=2072
[30/40] best_SMAPE=4.3632  (current 4.7298)  mode=AB  iters=1478
[40/40] best_SMAPE=4.3632  (current 4.7520)  mode=ISO  iters=4259

[SEARCH DONE]
 SMAPE: 4.3632 | RMSE: 214.405 | MAE: 105.516
 MODE : AB {'a': 1.006912374168081, 'b': 3.948418827927597}
 PARAMS: {'objective': 'regression', 'metric': 'rmse', 'learning_rate': np.float64(0.05), 'num_leaves': 40, 'feature_fraction': 0.85, 'bagging_fraction': 0.85, 'bagging_freq': 2, 'min_data_in_leaf': 80, 'lambda_l2': 2.0, 'seed': 42, 'verbosity': -1, 'num_threads': 4}
 NUM_BOOST: 2993


In [188]:
# ---- 데이터/피처 ----
train_df = read_csv_smart(TRAIN_MERGED_PATH)
train_df = clean_capacity_fields(train_df)
train_df = ensure_datetime_cols(train_df)
train_feat = make_features(train_df)

VAL_START = pd.Timestamp(2024,8,17,0)
VAL_END   = pd.Timestamp(2024,8,24,23)
is_val = (train_feat['dt'] >= VAL_START) & (train_feat['dt'] <= VAL_END)
tr_part = train_feat[~is_val].copy()
va_part = train_feat[ is_val].copy()

EXCLUDE = {'전력소비량(kWh)','dt','날짜','시간','일시','num_date_time'}
FEATURE_LIST = pick_feature_list(tr_part, exclude=EXCLUDE)
CAT_COLS = [c for c in ['건물번호','건물유형'] if c in FEATURE_LIST]

X_tr, X_va = align_for_lgb(tr_part, va_part, FEATURE_LIST, CAT_COLS)
y_tr = tr_part['전력소비량(kWh)'].astype(float)
y_va = va_part['전력소비량(kWh)'].astype(float)

baseline_tr = build_baseline_local(tr_part)
baseline_va = build_baseline_local(va_part)
y_tr_resid  = (y_tr - baseline_tr).astype(float)
y_va_resid  = (y_va - baseline_va).astype(float)

# 하드 빌딩 가중치(탐색 대상에 포함할 수도 있지만 기본은 고정)
HARD_BLD = {10,79,3,45,23,1,12,69,64,34}
w_tr = np.ones(len(tr_part), dtype=float)
if '건물번호' in tr_part.columns:
    w_tr[tr_part['건물번호'].isin(HARD_BLD).values] = 1.8

# ---- 탐색 공간 ----
space = {
    "learning_rate":  [0.05],                # lr는 고정(현재 안정)
    "num_leaves":     list(range(32, 64, 8)),# 32,40,48,56 (근처만 좁게)
    "min_data_in_leaf": [60, 80, 100, 120],  # ← 하한 쪽 확대
    "feature_fraction": [0.80, 0.85, 0.90],
    "bagging_fraction": [0.80, 0.85, 0.90],
    "bagging_freq":   [1, 2, 3],             # ← 상한 쪽 확대
    "lambda_l2":      [1.0, 2.0, 3.0, 5.0],
}

NUM_TRIALS = 40  # 필요시 60~100으로 늘려도 OK
NUM_TRIALS = 20         # 20~30 권장
MAX_BOOST  = 5000       # ← 2993에 닿았으니 여유
EARLY_STOP_ROUNDS = 200

def sample_params():
    return {
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate':   np.random.choice(space["learning_rate"]),
        'num_leaves':      int(np.random.choice(space["num_leaves"])),
        'feature_fraction':float(np.random.choice(space["feature_fraction"])),
        'bagging_fraction':float(np.random.choice(space["bagging_fraction"])),
        'bagging_freq':    int(np.random.choice(space["bagging_freq"])),
        'min_data_in_leaf':int(np.random.choice(space["min_data_in_leaf"])),
        'lambda_l2':       float(np.random.choice(space["lambda_l2"])),
        'seed': 42,
        'verbosity': -1,
        'num_threads': 4
    }

best = {
    "smape": np.inf,
    "rmse":  np.inf,
    "mae":   np.inf,
    "params": None,
    "num_boost": None,
    "calib": ("ISO", None)  # ("mode", params_dict)
}

for t in range(1, NUM_TRIALS+1):
    params = sample_params()

    # 학습 (early stopping)
    dtr = lgb.Dataset(X_tr, label=y_tr_resid, weight=w_tr, categorical_feature=CAT_COLS or None)
    dva = lgb.Dataset(X_va, label=y_va_resid, categorical_feature=CAT_COLS or None)
    model = lgb.train(
        params, dtr, num_boost_round=5000,
        valid_sets=[dtr, dva], valid_names=['train','valid'],
        callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=False)]
    )
    pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)

    # --- 보정 후보: AB/ISO 중 SMAPE 최소 ---
    # AB 초기값(OLS)
    A = np.column_stack([pred_va_resid, np.ones_like(pred_va_resid)])
    a0, b0 = np.linalg.lstsq(A, y_va_resid.values, rcond=None)[0]
    a0 = float(np.clip(a0, 0.0, 1.5)); b0 = float(b0)
    a_opt, b_opt, pred_ab, smape_ab = tune_ab_by_smape(baseline_va, pred_va_resid, y_va_resid, a0, b0)

    # ISO(plain)
    iso = IsotonicRegression(out_of_bounds='clip')
    iso.fit(pred_va_resid, y_va_resid.values)
    pred_iso = baseline_va.values + iso.transform(pred_va_resid)
    smape_iso = SMAPE(y_va, pred_iso)

    # 선택
    if smape_ab <= smape_iso:
        smape_best = smape_ab
        rmse_best  = RMSE(y_va, pred_ab)
        mae_best   = float(mean_absolute_error(y_va, pred_ab))
        calib_mode = ("AB", {"a":a_opt, "b":b_opt})
    else:
        smape_best = smape_iso
        rmse_best  = RMSE(y_va, pred_iso)
        mae_best   = float(mean_absolute_error(y_va, pred_iso))
        calib_mode = ("ISO", {"iso": iso})

    if smape_best < best["smape"]:
        best.update({
            "smape": smape_best,
            "rmse": rmse_best,
            "mae": mae_best,
            "params": params,
            "num_boost": int(model.best_iteration or 2000),
            "calib": calib_mode
        })
    if t % 10 == 0 or t == 1:
        print(f"[{t}/{NUM_TRIALS}] best_SMAPE={best['smape']:.4f}  (current {smape_best:.4f})  mode={calib_mode[0]}  iters={model.best_iteration}")

# ---- 결과 고정(전역) ----
LGBM_PARAMS_BEST = best["params"]
BEST_NUM_BOOST   = best["num_boost"]
CALIB_MODE       = best["calib"][0]
CALIB_PARAMS     = best["calib"][1]
FEATURE_LIST     = FEATURE_LIST
CAT_COLS         = CAT_COLS

print("\n[SEARCH DONE]")
print(" SMAPE:", f"{best['smape']:.4f}", "| RMSE:", f"{best['rmse']:.3f}", "| MAE:", f"{best['mae']:.3f}")
print(" MODE :", CALIB_MODE, CALIB_PARAMS)
print(" PARAMS:", LGBM_PARAMS_BEST)
print(" NUM_BOOST:", BEST_NUM_BOOST)

[1/20] best_SMAPE=4.7252  (current 4.7252)  mode=AB  iters=2049
[10/20] best_SMAPE=4.4189  (current 4.6116)  mode=AB  iters=2913
[20/20] best_SMAPE=4.4189  (current 4.6001)  mode=AB  iters=2353

[SEARCH DONE]
 SMAPE: 4.4189 | RMSE: 215.649 | MAE: 105.480
 MODE : AB {'a': 1.006477444507705, 'b': 3.0962278261572784}
 PARAMS: {'objective': 'regression', 'metric': 'rmse', 'learning_rate': np.float64(0.05), 'num_leaves': 56, 'feature_fraction': 0.9, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'min_data_in_leaf': 120, 'lambda_l2': 1.0, 'seed': 42, 'verbosity': -1, 'num_threads': 4}
 NUM_BOOST: 2525


In [189]:
# =========================
# 6R — (TEST_MERGED_PATH 존재 가정) 재학습 & Test 예측 & 제출 저장
#  - test는 이미 building_info 병합/전처리된 merged_test.csv를 그대로 사용
#  - num_date_time은 피처에서 제외, 제출 ID로만 사용
# =========================
import numpy as np, pandas as pd, lightgbm as lgb

# ---- 경로 ----
TRAIN_MERGED_PATH = r"C:\Users\user\Downloads\open (1)\merged_train.csv"
TEST_MERGED_PATH  = r"C:\Users\user\Downloads\open (1)\merged_test.csv"   # ← 존재 가정
SAMPLE_SUB        = r"C:\Users\user\Downloads\open (1)\sample_submission.csv"
OUT_SUB           = r"C:\Users\user\Downloads\open (1)\submission.csv"

# ---- 유틸 ----
def read_csv_smart(path, **kwargs):
    try:
        return pd.read_csv(path, encoding=kwargs.get('encoding', 'utf-8-sig'))
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding='cp949')

def align_for_lgb(df_tr, df_te, feature_list, cat_cols):
    X_tr = df_tr.reindex(columns=feature_list).copy()
    X_te = df_te.reindex(columns=feature_list).copy()
    for c in (cat_cols or []):
        X_tr[c] = X_tr[c].astype('category')
        # test의 카테고리를 train의 카테고리와 동일하게 정렬
        X_te[c] = X_te[c].astype('category').cat.set_categories(X_tr[c].cat.categories)
    # 수치형 처리
    num_cols = [c for c in feature_list if c not in (cat_cols or [])]
    X_tr[num_cols] = (X_tr[num_cols].replace([np.inf, -np.inf], np.nan).astype(float).fillna(0.0))
    X_te[num_cols] = (X_te[num_cols].replace([np.inf, -np.inf], np.nan).astype(float).fillna(0.0))
    return X_tr, X_te

def build_baseline_local(df):
    """롤링 기반 베이스라인 (누설 없음: 모두 shift로 생성된 피처 사용 가정)"""
    s7  = df.get('cons_samehour_mean_7d', pd.Series(0.0, index=df.index))
    s24 = df.get('cons_mean_24h',        pd.Series(0.0, index=df.index))
    base = 0.7 * s7.fillna(0) + 0.3 * s24.fillna(0)
    if base.isna().any():
        # 최후 보루(학습 구간에서만 의미 있음)
        if '전력소비량(kWh)' in df.columns:
            gb = df.groupby('건물번호')['전력소비량(kWh)'].transform('mean')
            base = base.fillna(gb)
    return base.fillna(0.0)

# ---- 1) 1차 베스트 세팅 고정 (네가 찾은 결과)
LGBM_PARAMS_BEST = {
    'objective':'regression','metric':'rmse','learning_rate':0.05,
    'num_leaves':40,'feature_fraction':0.85,'bagging_fraction':0.85,'bagging_freq':2,
    'min_data_in_leaf':80,'lambda_l2':2.0,'seed':42,'verbosity':-1,'num_threads':4
}
BEST_NUM_BOOST = 2993
CALIB_MODE   = "AB"
CALIB_PARAMS = {'a': float(1.006912374168081), 'b': float(3.948418827927597)}

# ---- 2) 데이터 로드 & 전처리(두 파일 모두 이미 병합/전처리된 상태 가정)
train_df = read_csv_smart(TRAIN_MERGED_PATH)
test_df  = read_csv_smart(TEST_MERGED_PATH)

# 설비 용량 '-' → 0, dtype 정리 / 날짜·시간·dt 생성 (함수는 앞서 정의된 걸 사용)
train_df = clean_capacity_fields(train_df)
test_df  = clean_capacity_fields(test_df)
train_df = ensure_datetime_cols(train_df)
test_df  = ensure_datetime_cols(test_df)

# 카테고리(factor) 보장
for c in ['건물번호','건물유형']:
    if c in train_df.columns: train_df[c] = train_df[c].astype('category')
    if c in test_df.columns:  test_df[c]  = test_df[c].astype('category')

# 일사/일조 컬럼 방어(없으면 0.0)
for c, default in [('일사(MJ/m2)',0.0), ('일조(hr)',0.0)]:
    if c not in train_df.columns: train_df[c] = default
    if c not in test_df.columns:  test_df[c]  = default

# ---- 3) 피처 생성(누설 없음: make_features 내부에서 shift/rolling 과거만 사용)
all_df   = pd.concat([train_df, test_df], ignore_index=True)
all_feat = make_features(all_df)
feat_tr  = all_feat.iloc[:len(train_df)].copy()
feat_te  = all_feat.iloc[len(train_df):].copy()

# ---- 4) 피처 목록 구성 & 행렬 정렬
from pandas.api.types import is_numeric_dtype
EXCLUDE = {'전력소비량(kWh)','dt','날짜','시간','일시','num_date_time'}  # 타깃/시간/ID 제거
FEATURE_LIST = [c for c in feat_tr.columns
                if c not in EXCLUDE and (str(feat_tr[c].dtype)=='category' or is_numeric_dtype(feat_tr[c]))]
# 공통 컬럼만 사용
FEATURE_LIST = [c for c in FEATURE_LIST if c in feat_te.columns]
CAT_COLS = [c for c in ['건물번호','건물유형'] if c in FEATURE_LIST]

X_full, X_te = align_for_lgb(feat_tr, feat_te, FEATURE_LIST, CAT_COLS)
y_full = feat_tr['전력소비량(kWh)'].astype(float)

# ---- 5) 베이스라인 (train/test 각각)
baseline_full = build_baseline_local(feat_tr)
baseline_te   = build_baseline_local(feat_te)

# (선택) 하드 빌딩 가중치
w_full = np.ones(len(feat_tr), dtype=float)
try:
    HARD_BLD = {64, 1, 34, 3, 6, 10, 45, 79, 54, 23}  # 최근 run에서 추린 하드 빌딩 셋
    w_full[feat_tr['건물번호'].isin(HARD_BLD).values] = 1.8
except Exception:
    pass

# ---- 6) 전체 재학습(잔차 타깃)
lgb_full = lgb.Dataset(
    X_full,
    label=(y_full - baseline_full).astype(float),
    weight=w_full,
    categorical_feature=CAT_COLS or None
)
final_model = lgb.train(LGBM_PARAMS_BEST, lgb_full, num_boost_round=BEST_NUM_BOOST)

# ---- 7) Test 예측(AB 보정 적용) + 안전 클램프
resid_te  = final_model.predict(X_te, num_iteration=final_model.best_iteration)
a, b      = CALIB_PARAMS["a"], CALIB_PARAMS["b"]
test_pred = baseline_te.values + a * resid_te + b
test_pred = np.clip(test_pred, 0, None)

# ---- 8) 제출 저장(UTF-8, ID 순서 유지: sample_submission 순서 기준)
sub = read_csv_smart(SAMPLE_SUB)
sub['answer'] = pd.Series(test_pred, index=sub.index)
sub.to_csv(OUT_SUB, index=False, encoding='utf-8-sig')

print(f"[DONE] 저장 → {OUT_SUB}")
print("n_features(train/test):", X_full.shape[1], "/", X_te.shape[1])
print("neg_rate:", float((test_pred < 0).mean()), " nan_rate:", float(np.isnan(test_pred).mean()))


  grp = df.groupby('건물번호', sort=False)


[DONE] 저장 → C:\Users\user\Downloads\open (1)\submission.csv
n_features(train/test): 49 / 49
neg_rate: 0.0  nan_rate: 0.0


In [30]:
# 결과 저장
df.to_csv(r"C:\Users\user\Downloads\open (1)\merged_train.csv", index=False, encoding = 'cp949')