In [4]:
import pandas as pd

# 파일 경로
train_path = r"C:\Users\user\Downloads\open (1)\train.csv"
building_info_path = r"C:\Users\user\Downloads\open (1)\building_info.csv"

# CSV 불러오기
train_df = pd.read_csv(train_path)
building_info_df = pd.read_csv(building_info_path)

# 병합 (건물번호 기준)
merged_df = pd.merge(train_df, building_info_df, on='건물번호', how='left')

# 결과 저장
merged_df.to_csv(r"C:\Users\user\Downloads\open (1)\merged_train.csv", index=False, encoding = 'cp949')

print("병합 완료! merged_train.csv로 저장됨")


병합 완료! merged_train.csv로 저장됨


In [5]:
def read_csv_smart(path):
    import pandas as pd
    for enc in ['cp949', 'utf-8-sig', 'utf-8', 'euc-kr']:
        try:
            return pd.read_csv(path, encoding=enc)
        except UnicodeDecodeError:
            continue
    # 최후의 보루: 깨지는 글자는 � 로 대체
    return pd.read_csv(path, encoding='utf-8', errors='replace')

In [6]:
# CSV 불러오기 (예시)
df = read_csv_smart("C:\\Users\\user\\Downloads\\open (1)\\merged_train.csv")


# '일시'를 문자열로 변환 후 날짜와 시간 분리
df['일시'] = df['일시'].astype(str)

# 날짜(YYYYMMDD)와 시간(HH) 분리
df['날짜'] = df['일시'].str.slice(0, 8)     # 앞 8자리 → 날짜
df['시간'] = df['일시'].str.slice(9, 11)    # 9~10번째 자리 → 시간

# 날짜를 datetime 형식으로 변환
df['날짜'] = pd.to_datetime(df['날짜'], format='%Y%m%d')
df['시간'] = df['시간'].astype(int)

# 확인
print(df[['일시', '날짜', '시간']].head())



            일시         날짜  시간
0  20240601 00 2024-06-01   0
1  20240601 01 2024-06-01   1
2  20240601 02 2024-06-01   2
3  20240601 03 2024-06-01   3
4  20240601 04 2024-06-01   4


In [7]:
df

Unnamed: 0,num_date_time,건물번호,일시,기온(°C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh),건물유형,연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW),날짜,시간
0,1_20240601 00,1,20240601 00,18.3,0.0,2.6,82.0,0.0,0.00,5794.80,호텔,82912.71,77586.0,-,-,-,2024-06-01,0
1,1_20240601 01,1,20240601 01,18.3,0.0,2.7,82.0,0.0,0.00,5591.85,호텔,82912.71,77586.0,-,-,-,2024-06-01,1
2,1_20240601 02,1,20240601 02,18.1,0.0,2.6,80.0,0.0,0.00,5338.17,호텔,82912.71,77586.0,-,-,-,2024-06-01,2
3,1_20240601 03,1,20240601 03,18.0,0.0,2.6,81.0,0.0,0.00,4554.42,호텔,82912.71,77586.0,-,-,-,2024-06-01,3
4,1_20240601 04,1,20240601 04,17.8,0.0,1.3,81.0,0.0,0.00,3602.25,호텔,82912.71,77586.0,-,-,-,2024-06-01,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100_20240824 19,100,20240824 19,29.1,0.0,4.4,76.0,0.4,0.18,3276.00,호텔,162070.24,152943.0,-,-,-,2024-08-24,19
203996,100_20240824 20,100,20240824 20,28.6,0.0,3.7,74.0,0.0,0.00,3197.52,호텔,162070.24,152943.0,-,-,-,2024-08-24,20
203997,100_20240824 21,100,20240824 21,28.3,0.0,2.9,74.0,0.0,0.00,3006.60,호텔,162070.24,152943.0,-,-,-,2024-08-24,21
203998,100_20240824 22,100,20240824 22,28.0,0.0,1.7,76.0,0.0,0.00,2649.72,호텔,162070.24,152943.0,-,-,-,2024-08-24,22


In [8]:
# 결측치 대체할 컬럼 목록
cols = ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']

# '-'를 0으로 바꾸고 숫자형으로 변환
for col in cols:
    df[col] = df[col].replace('-', 0).astype(float)

In [9]:
df = df.drop(columns=['num_date_time', '일시'])

In [10]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['건물유형'] = le.fit_transform(df['건물유형'])
df['날짜'] = pd.to_datetime(df['날짜'])

In [11]:
import pandas as pd
import numpy as np

def make_features(df: pd.DataFrame) -> pd.DataFrame:
    # 0) 기본 정렬 & datetime 만들기
    #    (이미 df['날짜']와 df['시간']이 있다면 그대로 쓰되, 한 줄짜리 datetime을 만들어두면 편함)
    df = df.copy()
    df['날짜'] = pd.to_datetime(df['날짜'])
    df['dt'] = df['날짜'] + pd.to_timedelta(df['시간'], unit='h')
    df = df.sort_values(['건물번호', 'dt']).reset_index(drop=True)

    # -------------------------------------------------------
    # 1) 최근 24시간 평균, 최근 7일(같은 시각) 평균  → 전부 "과거만" 보도록 shift 사용
    # -------------------------------------------------------
    grp = df.groupby('건물번호', group_keys=False)

    # (a) 최근 24시간 평균 (전력소비량 기준)
    #  - window=24, past-only를 위해 shift(1) 후 rolling
    df['cons_lag1'] = grp['전력소비량(kWh)'].shift(1)
    df['cons_mean_24h'] = grp['cons_lag1'].rolling(window=24, min_periods=1).mean()

    # (b) 최근 7일 같은 시각 평균 (24시간 간격으로 7개)
    #  - 1일 전 같은 시각부터 7일 전 같은 시각까지 평균
    same_hour_lag = grp['전력소비량(kWh)'].shift(24)
    df['cons_samehour_mean_7d'] = same_hour_lag.rolling(window=7, min_periods=1).mean()

    # 참고로 모델에 바로 쓰진 않아도 되는 추가 라그들(원하면 활성화)
    df['cons_lag_24h'] = grp['전력소비량(kWh)'].shift(24)
    df['cons_lag_48h'] = grp['전력소비량(kWh)'].shift(48)
    df['cons_lag_72h'] = grp['전력소비량(kWh)'].shift(72)
    df['cons_lag_168h'] = grp['전력소비량(kWh)'].shift(168)  # 7일 전 같은 시각

    # -------------------------------------------------------
    # 2) 기온·일사 기반 냉방 수요 지표 (CDD류)
    # -------------------------------------------------------
    # 한국 여름 기준 base temp 24°C 가합리(필요시 23~26으로 튜닝)
    base_temp = 24.0
    # ‘냉방도수’(Cooling Degree) 시간 단위
    df['CDD'] = (df['기온(°C)'] - base_temp).clip(lower=0)

    # 일사량(MJ/m2)과의 상호작용: 햇볕이 강할수록 체감 부하↑
    # 일사량이 0~상위 99퍼센타일 사이로 정규화(robust)
    q99 = df['일사(MJ/m2)'].quantile(0.99)
    rad_norm = (df['일사(MJ/m2)'].clip(upper=q99) / (q99 + 1e-6))
    df['CDD_x_rad'] = df['CDD'] * rad_norm

    # 습도(%)와의 상호작용: 습도가 높으면 동일 온도에서도 냉방 부하↑
    # 간단히 (1 + 습도/100*알파) 가중. 알파=0.3 정도로 시작(튜닝 가능)
    alpha = 0.3
    df['CDD_humid_adj'] = df['CDD'] * (1 + alpha * (df['습도(%)'] / 100.0))

    # -------------------------------------------------------
    # 3) 주말/평일, 공휴일
    # -------------------------------------------------------
    df['weekday'] = df['dt'].dt.weekday  # 월=0 ... 일=6
    df['is_weekend'] = (df['weekday'] >= 5).astype(int)

    # 2024-06~08 사이 한국 공휴일: 현충일(6/6), 광복절(8/15)
    kr_holidays = {
        pd.Timestamp(2024, 6, 6),  # 현충일
        pd.Timestamp(2024, 8, 15), # 광복절
    }
    df['is_holiday'] = df['날짜'].isin(kr_holidays).astype(int)

    # -------------------------------------------------------
    # 4) 태양광·ESS·PCS 용량 대비 “동작 가능성” 지표
    #    (실제 제어 로그가 없으니 ‘가능성/잠재력’을 피처로 넣는다)
    # -------------------------------------------------------
    # 설비 유무 이진
    df['has_pv'] = (df['태양광용량(kW)'] > 0).astype(int)
    df['has_ess'] = (df['ESS저장용량(kWh)'] > 0).astype(int)
    df['has_pcs'] = (df['PCS용량(kW)'] > 0).astype(int)

    # 낮/밤 플래그 (대략 일사량>0이면 주간으로 간주)
    df['is_daylight'] = (df['일사(MJ/m2)'] > 0).astype(int)

    # 태양광 ‘동작 가능성’ (설비 있고 + 주간/일사>0)
    df['pv_active_potential'] = ((df['has_pv'] == 1) & (df['is_daylight'] == 1)).astype(int)

    # 피크/오프피크 (현실 요금제와 다를 수 있지만 합리적 초기값)
    df['is_offpeak'] = df['시간'].isin([0,1,2,3,4,5,6,23]).astype(int)
    df['is_peak']    = df['시간'].isin([13,14,15,16,17]).astype(int)

    # ESS 충방전 ‘가능성’ 피처
    df['ess_charge_potential']   = ((df['has_ess']==1) & (df['is_offpeak']==1)).astype(int)
    df['ess_discharge_potential'] = ((df['has_ess']==1) & (df['is_peak']==1)).astype(int)

    # 용량 스케일 자체도 피처로 사용(로그 스케일로 완만화; 0은 0으로 유지)
    for c in ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']:
        df[f'log1p_{c}'] = np.log1p(df[c])

    # 누설 방지: 타깃 기반 비율은 과거 라그로만 계산
    # ESS 대비 부하 비율(전일 같은 시각 소비량 사용)
    df['ess_to_load_lag_ratio'] = np.where(
        df['cons_lag_24h'].notna(),
        df['ESS저장용량(kWh)'] / (df['cons_lag_24h'] + 1e-6),
        np.nan
    )

    # -------------------------------------------------------
    # 5) 기타 유틸리티 파생
    # -------------------------------------------------------
    df['month'] = df['dt'].dt.month
    df['hour']  = df['시간']  # 가독성용 복사
    df['dayofyear'] = df['dt'].dt.dayofyear

    # 모델 입력 전에 의미 없는 원본(또는 중복) 컬럼 정리 원하면 아래 사용
    # drop_cols = ['dt']  # 학습 시 굳이 안 써도 되면 제거
    # df = df.drop(columns=drop_cols)

    return df

# 사용 예시:
# df_feat = make_features(df)
# df_feat.head()


In [12]:
# =========================
# 0. 라이브러리 & 경로 설정
# =========================
import os
import numpy as np
import pandas as pd

# pip install lightgbm 먼저 (처음 1번만)
# pip install lightgbm
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error

DATA_DIR = r"C:\Users\user\Downloads\open (1)"
TRAIN_MERGED_PATH = os.path.join(DATA_DIR, "merged_train.csv")
TEST_PATH  = os.path.join(DATA_DIR, "test.csv")
BUILD_PATH = os.path.join(DATA_DIR, "building_info.csv")
SAMPLE_SUB = os.path.join(DATA_DIR, "sample_submission.csv")
OUT_SUB    = os.path.join(DATA_DIR, "baseline_lgbm_submission.csv")


In [13]:
# ===== IMPORTS (필요시 중복 있어도 무방) =====
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
from pandas.api.types import is_numeric_dtype, is_categorical_dtype

# ===== 피처 선택 헬퍼: 숫자/카테고리만 남기기 + 불필요 컬럼 드롭 =====
def get_feature_cols(df: pd.DataFrame) -> list:
    base_drop = ['전력소비량(kWh)', 'dt', '날짜', '시간', '일시', 'num_date_time']
    cols = [c for c in df.columns if c not in base_drop]
    cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]
    return cols

# =========================
# 1. 유틸 함수들
# =========================
def ensure_datetime_cols(df: pd.DataFrame) -> pd.DataFrame:
    """일시 → 날짜/시간 분리(또는 이미 분리돼 있으면 그대로) + dt 생성"""
    df = df.copy()
    if '날짜' not in df.columns or '시간' not in df.columns:
        if '일시' in df.columns:
            s = df['일시'].astype(str)
            df['날짜'] = pd.to_datetime(s.str.slice(0, 8), format='%Y%m%d')
            df['시간'] = s.str.slice(9, 11).astype(int)
        else:
            raise ValueError("날짜/시간 정보가 없습니다. ('일시' 또는 '날짜','시간' 필요)")
    else:
        df['날짜'] = pd.to_datetime(df['날짜'])
        df['시간'] = df['시간'].astype(int)
    df['dt'] = df['날짜'] + pd.to_timedelta(df['시간'], unit='h')
    return df

def clean_capacity_fields(df: pd.DataFrame) -> pd.DataFrame:
    """설비 용량에 '-'가 있으면 0으로 치환 후 float 변환"""
    df = df.copy()
    cols = ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']
    for c in cols:
        if c in df.columns:
            df[c] = df[c].replace('-', 0).astype(float)
    return df

def rmse(y_true, y_pred):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.sqrt(mean_squared_error(y_true, y_pred))

# =========================
# 2. 특징 엔지니어링(수정 버전)
# =========================
def make_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    - 미래정보 누설 방지: 전부 shift/rolling(=transform)로 과거만 사용
    - groupby().rolling() 대신 groupby().transform(...) 사용
    """
    df = df.copy()

    # 일사/일조 안전 처리(테스트에 없을 수 있음)
    if '일사(MJ/m2)' not in df.columns:
        df['일사(MJ/m2)'] = 0.0
    if '일조(hr)' not in df.columns:
        df['일조(hr)'] = 0.0

    # 정렬
    df = df.sort_values(['건물번호', 'dt']).reset_index(drop=True)

    # 그룹 객체
    grp = df.groupby('건물번호', sort=False)

    # ---------- 타깃 라그 ----------
    if '전력소비량(kWh)' in df.columns:
        df['cons_lag1']     = grp['전력소비량(kWh)'].shift(1)
        df['cons_lag_24h']  = grp['전력소비량(kWh)'].shift(24)
        df['cons_lag_48h']  = grp['전력소비량(kWh)'].shift(48)
        df['cons_lag_72h']  = grp['전력소비량(kWh)'].shift(72)
        df['cons_lag_168h'] = grp['전력소비량(kWh)'].shift(168)  # 7일 전
        # 최근 24시간 평균 (과거만 참고)
        df['cons_mean_24h'] = grp['전력소비량(kWh)'] \
            .transform(lambda s: s.shift(1).rolling(window=24, min_periods=1).mean())
        # 최근 7일 같은 시각 평균
        df['cons_samehour_mean_7d'] = grp['전력소비량(kWh)'] \
            .transform(lambda s: s.shift(24).rolling(window=7, min_periods=1).mean())
    else:
        for c in ['cons_lag1','cons_lag_24h','cons_lag_48h','cons_lag_72h','cons_lag_168h',
                  'cons_mean_24h','cons_samehour_mean_7d']:
            df[c] = np.nan

    # ---------- 냉방 수요 지표 ----------
    base_temp = 24.0
    df['CDD'] = (df['기온(°C)'] - base_temp).clip(lower=0)

    q99 = df['일사(MJ/m2)'].quantile(0.99) + 1e-6
    rad_norm = (df['일사(MJ/m2)'].clip(upper=q99) / q99)
    df['CDD_x_rad'] = df['CDD'] * rad_norm

    alpha = 0.3
    df['CDD_humid_adj'] = df['CDD'] * (1 + alpha * (df['습도(%)'] / 100.0))

    # ---------- 달력 피처 ----------
    df['weekday']    = df['dt'].dt.weekday
    df['is_weekend'] = (df['weekday'] >= 5).astype(int)
    kr_holidays = {pd.Timestamp(2024,6,6), pd.Timestamp(2024,8,15)}
    df['is_holiday'] = df['날짜'].isin(kr_holidays).astype(int)

    # ---------- 설비 가능성 피처 ----------
    df['has_pv']  = (df['태양광용량(kW)']  > 0).astype(int)
    df['has_ess'] = (df['ESS저장용량(kWh)'] > 0).astype(int)
    df['has_pcs'] = (df['PCS용량(kW)']    > 0).astype(int)
    df['is_daylight'] = (df['일사(MJ/m2)'] > 0).astype(int)

    df['is_offpeak'] = df['시간'].isin([0,1,2,3,4,5,6,23]).astype(int)
    df['is_peak']    = df['시간'].isin([13,14,15,16,17]).astype(int)

    df['ess_charge_potential']    = ((df['has_ess']==1) & (df['is_offpeak']==1)).astype(int)
    df['ess_discharge_potential'] = ((df['has_ess']==1) & (df['is_peak']==1)).astype(int)

    for c in ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']:
        df[f'log1p_{c}'] = np.log1p(df[c])

    df['ess_to_load_lag_ratio'] = np.where(
        df.get('cons_lag_24h', pd.Series(index=df.index)).notna(),
        df['ESS저장용량(kWh)'] / (df['cons_lag_24h'] + 1e-6),
        np.nan
    )

    # ---------- 기타 ----------
    df['month'] = df['dt'].dt.month
    df['hour']  = df['시간']
    df['dayofyear'] = df['dt'].dt.dayofyear

    return df

# =========================
# 3. 학습 데이터 로드 & 피처 생성
# =========================
train_df = read_csv_smart(TRAIN_MERGED_PATH)
train_df = clean_capacity_fields(train_df)
train_df = ensure_datetime_cols(train_df)
train_feat = make_features(train_df)

# =========================
# 4. 시계열 검증 분할 (2024-08-17 ~ 2024-08-24)
# =========================
VAL_START = pd.Timestamp(2024,8,17,0)
VAL_END   = pd.Timestamp(2024,8,24,23)

is_val = (train_feat['dt'] >= VAL_START) & (train_feat['dt'] <= VAL_END)
train_part = train_feat[~is_val].copy()
valid_part = train_feat[ is_val].copy()

# === 피처 선택 (object/문자열 배제) ===
features = get_feature_cols(train_feat)
target_col = '전력소비량(kWh)'

# 카테고리 지정(피처에 포함된 컬럼만)
cat_cols = [c for c in ['건물번호','건물유형'] if c in features]
for c in cat_cols:
    train_part[c] = train_part[c].astype('category')
    valid_part[c] = valid_part[c].astype('category')

X_tr, y_tr = train_part[features], train_part[target_col]
X_va, y_va = valid_part[features], valid_part[target_col]

# =========================
# 5. LightGBM 학습 & 검증 (콜백 방식 조기종료)
# =========================
lgb_train = lgb.Dataset(X_tr, label=y_tr, categorical_feature=cat_cols or None)
lgb_valid = lgb.Dataset(X_va, label=y_va, categorical_feature=cat_cols or None)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 64,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 1,
    'min_data_in_leaf': 50,
    'lambda_l2': 1.0,
    'seed': 42,
    'verbosity': -1
}

callbacks = [
    lgb.early_stopping(stopping_rounds=200),
    lgb.log_evaluation(period=200)
]

model = lgb.train(
    params,
    lgb_train,
    num_boost_round=5000,
    valid_sets=[lgb_train, lgb_valid],
    valid_names=['train', 'valid'],
    callbacks=callbacks
)

pred_va = model.predict(X_va, num_iteration=model.best_iteration)
print("VALID RMSE:", rmse(y_va, pred_va))
print("VALID MAE :", mean_absolute_error(y_va, pred_va))



  cols = [c for c in cols if is_numeric_dtype(df[c]) or is_categorical_dtype(df[c])]


Training until validation scores don't improve for 200 rounds
[200]	train's rmse: 173.366	valid's rmse: 230.791
[400]	train's rmse: 154.914	valid's rmse: 221.188
[600]	train's rmse: 143.383	valid's rmse: 216.656
[800]	train's rmse: 134.649	valid's rmse: 214.773
[1000]	train's rmse: 127.614	valid's rmse: 213.857
[1200]	train's rmse: 121.745	valid's rmse: 213.28
[1400]	train's rmse: 116.665	valid's rmse: 213.123
[1600]	train's rmse: 112.139	valid's rmse: 212.795
[1800]	train's rmse: 108.081	valid's rmse: 212.451
Early stopping, best iteration is:
[1739]	train's rmse: 109.199	valid's rmse: 212.38
VALID RMSE: 212.38019335781
VALID MAE : 101.67560328341894


In [14]:
def backfill_solar_by_time(all_df: pd.DataFrame) -> pd.DataFrame:
    # dt/시간 보장
    if 'dt' not in all_df.columns:
        raise ValueError("dt 없으면 ensure_datetime_cols 먼저 호출")
    all_df = all_df.copy()
    all_df['month'] = all_df['dt'].dt.month
    all_df['hour']  = all_df['시간']

    # train/test 구분: 타깃 존재 여부로 판별
    is_train = all_df['전력소비량(kWh)'].notna() if '전력소비량(kWh)' in all_df.columns else pd.Series(False, index=all_df.index)

    # 기준 통계 (train에서만)
    rad_ref = all_df.loc[is_train].groupby(['month','hour'])['일사(MJ/m2)'].median()
    sun_ref = all_df.loc[is_train].groupby(['month','hour'])['일조(hr)'].median()

    # index 매칭해서 test 행만 채움
    idx = pd.MultiIndex.from_frame(all_df[['month','hour']])
    fill_rad = rad_ref.reindex(idx).values
    fill_sun = sun_ref.reindex(idx).values

    need_fill_rad = (~is_train) & (all_df['일사(MJ/m2)'].isna() | (all_df['일사(MJ/m2)'] == 0))
    need_fill_sun = (~is_train) & (all_df['일조(hr)'].isna()     | (all_df['일조(hr)'] == 0))

    all_df.loc[need_fill_rad, '일사(MJ/m2)'] = fill_rad[need_fill_rad.values]
    all_df.loc[need_fill_sun, '일조(hr)']    = fill_sun[need_fill_sun.values]

    # 혹시라도 남은 결측은 0으로
    all_df['일사(MJ/m2)'] = all_df['일사(MJ/m2)'].fillna(0)
    all_df['일조(hr)']    = all_df['일조(hr)'].fillna(0)
    return all_df


In [15]:
# train + test concat
all_df = pd.concat([train_df, test_df], ignore_index=True)
all_df = clean_capacity_fields(all_df)
all_df = ensure_datetime_cols(all_df)

# ★ 추가
all_df = backfill_solar_by_time(all_df)

# 그 다음 피처 생성
all_feat = make_features(all_df)


NameError: name 'test_df' is not defined

In [None]:
# make_features() 끝부분에 추가
df['hour_sin'] = np.sin(2*np.pi*df['시간']/24)
df['hour_cos'] = np.cos(2*np.pi*df['시간']/24)


In [None]:
def make_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    - 미래정보 누설 방지: shift/rolling 모두 과거만 사용
    - groupby().transform(...) 으로 인덱스 정렬 유지
    """
    df = df.copy()

    # 일사/일조 안전 처리(테스트에 없을 수 있음)
    if '일사(MJ/m2)' not in df.columns:
        df['일사(MJ/m2)'] = 0.0
    if '일조(hr)' not in df.columns:
        df['일조(hr)'] = 0.0

    # 정렬 후 그룹 객체 생성
    df = df.sort_values(['건물번호', 'dt']).reset_index(drop=True)
    grp = df.groupby('건물번호', sort=False)  # ← 여기서 grp 정의

    # ---------- 타깃 라그 & 롤링 ----------
    if '전력소비량(kWh)' in df.columns:
        df['cons_lag1']     = grp['전력소비량(kWh)'].shift(1)
        df['cons_lag_24h']  = grp['전력소비량(kWh)'].shift(24)
        df['cons_lag_48h']  = grp['전력소비량(kWh)'].shift(48)
        df['cons_lag_72h']  = grp['전력소비량(kWh)'].shift(72)
        df['cons_lag_168h'] = grp['전력소비량(kWh)'].shift(168)  # 7일 전

        # 최근 24시간 평균(과거만; shift(1) 후 rolling)
        df['cons_mean_24h'] = grp['전력소비량(kWh)'] \
            .transform(lambda s: s.shift(1).rolling(window=24, min_periods=1).mean())

        # 최근 7일 같은 시각 평균(24시간 간격 7개)
        df['cons_samehour_mean_7d'] = grp['전력소비량(kWh)'] \
            .transform(lambda s: s.shift(24).rolling(window=7, min_periods=1).mean())

        # ✅ 최근 24시간 표준편차(변동성) — cons_lag1(과거값) 기반
        df['cons_std_24h'] = grp['cons_lag1'] \
            .transform(lambda s: s.rolling(window=24, min_periods=6).std())
    else:
        for c in ['cons_lag1','cons_lag_24h','cons_lag_48h','cons_lag_72h','cons_lag_168h',
                  'cons_mean_24h','cons_samehour_mean_7d','cons_std_24h']:
            df[c] = np.nan

    # ---------- 냉방 수요 지표 ----------
    base_temp = 24.0
    df['CDD'] = (df['기온(°C)'] - base_temp).clip(lower=0)

    q99 = df['일사(MJ/m2)'].quantile(0.99) + 1e-6
    rad_norm = (df['일사(MJ/m2)'].clip(upper=q99) / q99)
    df['CDD_x_rad'] = df['CDD'] * rad_norm

    alpha = 0.3
    df['CDD_humid_adj'] = df['CDD'] * (1 + alpha * (df['습도(%)'] / 100.0))

    # ---------- 달력 피처 ----------
    df['weekday']    = df['dt'].dt.weekday
    df['is_weekend'] = (df['weekday'] >= 5).astype(int)
    kr_holidays = {pd.Timestamp(2024,6,6), pd.Timestamp(2024,8,15)}
    df['is_holiday'] = df['날짜'].isin(kr_holidays).astype(int)

    # ---------- 설비 가능성 피처 ----------
    df['has_pv']  = (df['태양광용량(kW)']  > 0).astype(int)
    df['has_ess'] = (df['ESS저장용량(kWh)'] > 0).astype(int)
    df['has_pcs'] = (df['PCS용량(kW)']    > 0).astype(int)
    df['is_daylight'] = (df['일사(MJ/m2)'] > 0).astype(int)
    df['is_offpeak'] = df['시간'].isin([0,1,2,3,4,5,6,23]).astype(int)
    df['is_peak']    = df['시간'].isin([13,14,15,16,17]).astype(int)
    df['ess_charge_potential']    = ((df['has_ess']==1) & (df['is_offpeak']==1)).astype(int)
    df['ess_discharge_potential'] = ((df['has_ess']==1) & (df['is_peak']==1)).astype(int)

    for c in ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']:
        df[f'log1p_{c}'] = np.log1p(df[c])

    df['ess_to_load_lag_ratio'] = np.where(
        df.get('cons_lag_24h', pd.Series(index=df.index)).notna(),
        df['ESS저장용량(kWh)'] / (df['cons_lag_24h'] + 1e-6),
        np.nan
    )

    # ---------- 기타 ----------
    df['month'] = df['dt'].dt.month
    df['hour']  = df['시간']
    df['dayofyear'] = df['dt'].dt.dayofyear

    # (옵션) 시간 사이클릭 인코딩 원하면 활성화
    # df['hour_sin'] = np.sin(2*np.pi*df['hour']/24)
    # df['hour_cos'] = np.cos(2*np.pi*df['hour']/24)

    return df




In [None]:
params = {
    'objective': 'tweedie',
    'tweedie_variance_power': 1.4,  # 1.2~1.6 사이 튜닝
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 64,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 1,
    'min_data_in_leaf': 50,
    'lambda_l2': 1.0,
    'seed': 42,
    'verbosity': -1
}


In [None]:
MAX_ABS = 1e12  # 과도한 값 하드 클립 임계

def sanitize_matrix(X: pd.DataFrame) -> pd.DataFrame:
    """Inf 제거 + 과대값 클립 (LightGBM은 NaN은 허용, Inf는 불가)"""
    X = X.copy()
    num_cols = X.select_dtypes(include=[np.number]).columns
    X[num_cols] = X[num_cols].replace([np.inf, -np.inf], np.nan)
    X[num_cols] = X[num_cols].clip(lower=-MAX_ABS, upper=MAX_ABS)
    return X

def safe_log1p_vec(a):
    """음수/비정상값 방지 후 log1p"""
    a = np.asarray(a, dtype=float)
    # 비정상(y에 NaN/inf) → 0으로 대체 (혹은 np.nan 유지하고 마스킹하려면 전략 바꿔도 됨)
    a = np.where(np.isfinite(a), a, 0.0)
    a = np.clip(a, 0, None)  # -0 방지
    return np.log1p(a)



In [None]:
# ----- a, b 동시 추정(최소제곱) -----
pred_va_resid = model.predict(X_va, num_iteration=model.best_iteration)
true_resid_va = (y_va - baseline_va).values

A = np.column_stack([pred_va_resid, np.ones_like(pred_va_resid)])
a, b = np.linalg.lstsq(A, true_resid_va, rcond=None)[0]
a = float(np.clip(a, 0.0, 1.5))  # 안정화
b = float(b)

pred_va = baseline_va.values + a * pred_va_resid + b

print(f"[Blending] a={a:.3f}, b={b:.3f}")
print("Baseline-only  RMSE:", rmse(y_va, baseline_va), "MAE:", mean_absolute_error(y_va, baseline_va))
print("BLENDED       RMSE:", rmse(y_va, pred_va), "MAE:", mean_absolute_error(y_va, pred_va))


In [None]:
# 잔차 예측
test_pred_resid = final_model.predict(X_te, num_iteration=final_model.best_iteration)

# 검증에서 구한 a,b로 복원
a_use = a if 'a' in globals() else 1.0
b_use = b if 'b' in globals() else 0.0
test_pred = baseline_te.values + a_use * test_pred_resid + b_use


In [None]:
HARD_BLD = {10,79,3,45,23,1,12,69,64,34}
w_tr = np.ones(len(train_part), dtype=float)
w_tr[train_part['건물번호'].isin(HARD_BLD).values] = 1.8  # 1.5~2.0 사이 시도

lgb_train = lgb.Dataset(X_tr, label=y_tr_resid, weight=w_tr, categorical_feature=cat_cols or None)


In [None]:
# 1) make_features() 셀(패치 포함) 실행

# 2) 피처 다시 생성
train_feat = make_features(train_df)

# 3) 확인
print({
  'cons_lag1': 'cons_lag1' in train_feat.columns,
  'cons_lag_24h': 'cons_lag_24h' in train_feat.columns,
  'cons_lag_168h': 'cons_lag_168h' in train_feat.columns,
  'delta_1h': 'delta_1h' in train_feat.columns,
  'delta_7d': 'delta_7d' in train_feat.columns
})


In [16]:
# =========================
# Config — 변수/세팅 한 곳에서 관리
# =========================
import os, json, numpy as np, pandas as pd

# --- 경로 (상대 경로 권장) ---
DATA_DIR       = "./data"
ARTIFACTS_DIR  = "./artifacts"
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

TRAIN_MERGED_PATH = f"{DATA_DIR}/merged_train.csv"   # train+building 병합/전처리본
TEST_MERGED_PATH  = f"{DATA_DIR}/merged_test.csv"    # test+building 병합/전처리본
SAMPLE_SUB        = f"{DATA_DIR}/sample_submission.csv"
BEST_JSON_PATH    = f"{ARTIFACTS_DIR}/best.json"     # 있으면 불러서 오버라이드
OUT_SUB_PATH      = f"{ARTIFACTS_DIR}/submission.csv"

# --- 검증 윈도우(시계열 홀드아웃) ---
VAL_START = pd.Timestamp(2024, 8, 17, 0)
VAL_END   = pd.Timestamp(2024, 8, 24, 23)

# --- 공휴일(한국, 대회 구간) ---
KR_HOLIDAYS = {pd.Timestamp(2024,6,6), pd.Timestamp(2024,8,15)}

# --- 피처 토글(원하면 바꿔) ---
USE_STD24          = True   # 최근 24h 표준편차
USE_CYCLIC         = False  # hour/dow/dayofyear sin/cos
USE_SLOT_PROFILE   = False  # (건물×요일×시간) 과거 평균
SLOT_PROFILE_ROLLS = 4      # 위 옵션 True일 때, 과거 4회 평균(≈4주)
USE_AREA_NORM      = False  # m² 정규화 파생
WINSOR_P99         = None   # 예: 0.995 로 설정하면 윈저라이즈

# --- 베이스라인 가중(혼합 평균) ---
BASE_W7  = 0.70   # same-hour 7d mean
BASE_W24 = 0.30   # last 24h mean

# --- 제외 컬럼 / 카테고리 지정 ---
EXCLUDE_COLS = {'전력소비량(kWh)', 'dt', '날짜', '시간', '일시', 'num_date_time'}
CAT_COLS_DEFAULT = ['건물번호', '건물유형']  # factor 처리 대상

# --- 하드 빌딩 가중치(난이도 높은 빌딩) ---
HARD_BLD = {64, 1, 34, 3, 6, 10, 45, 79, 54, 23}
HARD_BLD_WEIGHT = 1.8

# --- Day1 베스트 하이퍼파라미터 & 보정(없으면 기본값) ---
LGBM_PARAMS = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 40,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 2,
    'min_data_in_leaf': 80,
    'lambda_l2': 2.0,
    'seed': 42,
    'verbosity': -1,
    'num_threads': 4,
}
BEST_NUM_BOOST = 2993

CALIB_MODE   = "AB"  # "AB" 또는 "ISO" (또는 "ISO+BLD")
CALIB_PARAMS = {'a': 1.006912374168081, 'b': 3.948418827927597}

# --- 랜덤서치 공간(다시 돌릴 때 사용) ---
HP_SEARCH_SPACE = {
    "learning_rate":   [0.03, 0.04, 0.05, 0.06, 0.07],
    "num_leaves":      list(range(24, 96, 8)),        # 24,32,...,88
    "min_data_in_leaf":[80, 100, 120, 150, 180, 220],
    "feature_fraction":[0.70, 0.75, 0.80, 0.85, 0.90],
    "bagging_fraction":[0.70, 0.75, 0.80, 0.85, 0.90],
    "bagging_freq":    [1, 2],
    "lambda_l2":       [0.0, 1.0, 2.0, 3.0, 5.0],
}
HP_TRIALS = 40  # 탐색 시 시도 횟수

# --- best.json 있으면 우선 적용(재현성 확보) ---
def _override_from_best_json(path: str):
    global LGBM_PARAMS, BEST_NUM_BOOST, CALIB_MODE, CALIB_PARAMS
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            best = json.load(f)
        if 'params' in best:       LGBM_PARAMS = {k: (float(v) if isinstance(v, (int, float)) else v)
                                                  for k, v in best['params'].items()}
        if 'num_boost' in best:    BEST_NUM_BOOST = int(best['num_boost'])
        if 'calib_mode' in best:   CALIB_MODE = best['calib_mode']
        if 'calib_params' in best: CALIB_PARAMS = {k: float(v) for k, v in best['calib_params'].items()}
        print(f"[best.json 적용] mode={CALIB_MODE}, num_boost={BEST_NUM_BOOST}")
    else:
        print("[best.json 없음] 기본 세팅 사용")

_override_from_best_json(BEST_JSON_PATH)

# --- 환경 출력(확인용) ---
print("▶ PATHS")
print("  train :", TRAIN_MERGED_PATH)
print("  test  :", TEST_MERGED_PATH)
print("  sample:", SAMPLE_SUB)
print("  out   :", OUT_SUB_PATH)
print("▶ VAL :", VAL_START, "~", VAL_END)
print("▶ LGB :", LGBM_PARAMS, " | num_boost:", BEST_NUM_BOOST)
print("▶ CAL :", CALIB_MODE, CALIB_PARAMS)
print("▶ TOGGLES | std24:", USE_STD24, "| cyclic:", USE_CYCLIC,
      "| slot_profile:", USE_SLOT_PROFILE, f"(rolls={SLOT_PROFILE_ROLLS})",
      "| area_norm:", USE_AREA_NORM, "| winsor_p:", WINSOR_P99)


[best.json 없음] 기본 세팅 사용
▶ PATHS
  train : ./data/merged_train.csv
  test  : ./data/merged_test.csv
  sample: ./data/sample_submission.csv
  out   : ./artifacts/submission.csv
▶ VAL : 2024-08-17 00:00:00 ~ 2024-08-24 23:00:00
▶ LGB : {'objective': 'regression', 'metric': 'rmse', 'learning_rate': 0.05, 'num_leaves': 40, 'feature_fraction': 0.85, 'bagging_fraction': 0.85, 'bagging_freq': 2, 'min_data_in_leaf': 80, 'lambda_l2': 2.0, 'seed': 42, 'verbosity': -1, 'num_threads': 4}  | num_boost: 2993
▶ CAL : AB {'a': 1.006912374168081, 'b': 3.948418827927597}
▶ TOGGLES | std24: True | cyclic: False | slot_profile: False (rolls=4) | area_norm: False | winsor_p: None


In [17]:
import numpy as np
import pandas as pd

def add_seasonality_features(df: pd.DataFrame,
                             use_cyclic: bool = True,
                             use_slot_profile: bool = False,
                             slot_rolls: int = 4) -> pd.DataFrame:
    """
    시계열 주기성(일중/주간/월간)을 명시적으로 피처화.
    - use_cyclic=True: 시간/요일/월/연중 사이클릭(sin/cos)
    - use_slot_profile=True: (건물×요일×시간) 과거 프로파일 평균(shift로 과거만 사용)
    - slot_rolls: 프로파일 평균의 롤링 개수(주 단위 4면 최근 4주 정도)
    """
    df = df.copy()

    # --- 기본 시간 파생 (있으면 재사용) ---
    if 'dt' not in df.columns:
        # '날짜','시간'에서 만들거나 '일시' 파싱 후 ensure_datetime_cols()로 채워둬야 함
        raise ValueError("dt 컬럼이 필요합니다. ensure_datetime_cols() 이후 호출하세요.")
    if 'hour' not in df.columns:
        df['hour'] = df['dt'].dt.hour
    if 'weekday' not in df.columns:
        df['weekday'] = df['dt'].dt.weekday  # 0=월, 6=일
    if 'month' not in df.columns:
        df['month'] = df['dt'].dt.month
    if 'dayofyear' not in df.columns:
        df['dayofyear'] = df['dt'].dt.dayofyear

    # --- 달력/플래그 ---
    df['is_weekend']    = (df['weekday'] >= 5).astype(int)
    df['is_month_start'] = df['dt'].dt.is_month_start.astype(int)
    df['is_month_end']   = df['dt'].dt.is_month_end.astype(int)
    df['week_of_month']  = ((df['dt'].dt.day - 1) // 7 + 1).astype(int)  # 1~5

    # --- 사이클릭 인코딩(주기 신호를 부드럽게) ---
    if use_cyclic:
        # ① 일중(24시간)
        df['hour_sin'] = np.sin(2*np.pi*df['hour'] / 24.0)
        df['hour_cos'] = np.cos(2*np.pi*df['hour'] / 24.0)

        # ② 주간(요일) — 요일만
        df['dow_sin'] = np.sin(2*np.pi*df['weekday'] / 7.0)
        df['dow_cos'] = np.cos(2*np.pi*df['weekday'] / 7.0)

        # ③ 주간(시간까지): 0~167 = weekday*24 + hour
        how = df['weekday'] * 24 + df['hour']
        df['how_sin'] = np.sin(2*np.pi*how / 168.0)
        df['how_cos'] = np.cos(2*np.pi*how / 168.0)

        # ④ 월간(12개월)
        df['mon_sin'] = np.sin(2*np.pi*(df['month']-1) / 12.0)
        df['mon_cos'] = np.cos(2*np.pi*(df['month']-1) / 12.0)

        # ⑤ 연중(365일) — 여름 데이터라도 부드러운 계절성 힌트
        df['doy_sin'] = np.sin(2*np.pi*df['dayofyear'] / 366.0)
        df['doy_cos'] = np.cos(2*np.pi*df['dayofyear'] / 366.0)

    # --- (선택) 슬롯 프로파일: 건물×요일×시간 과거 평균 ---
    if use_slot_profile:
        # 정렬 + 그룹 정의
        sort_cols = ['건물번호', 'dt'] if '건물번호' in df.columns else ['dt']
        df = df.sort_values(sort_cols).reset_index(drop=True)

        if '전력소비량(kWh)' in df.columns:
            grp = df.groupby(['건물번호', 'weekday', 'hour'])['전력소비량(kWh)'] \
                    if '건물번호' in df.columns else \
                    df.groupby(['weekday', 'hour'])['전력소비량(kWh)']

            # 과거만 보도록 shift(1) 후 최근 slot_rolls회 평균
            df['prof_wd_h_mean'] = grp.transform(
                lambda s: s.shift(1).rolling(slot_rolls, min_periods=1).mean()
            )
        else:
            # test 구간 등 타깃이 없으면 결측 생길 수 있으니 0으로
            df['prof_wd_h_mean'] = 0.0

        # 안전 채움
        df['prof_wd_h_mean'] = df['prof_wd_h_mean'].fillna(0.0)

    return df


In [18]:
# =========================================
# 0) 설정 (경로만 네 환경에 맞게 바꿔)
# =========================================
import os, numpy as np, pandas as pd, lightgbm as lgb
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

# ▶ 필요 시 네 경로로 교체
TRAIN_MERGED_PATH = r"C:\Users\user\Downloads\open (1)\merged_train.csv"
TEST_MERGED_PATH  = r"C:\Users\user\Downloads\open (1)\merged_test.csv"      # test+building 병합본
SAMPLE_SUB        = r"C:\Users\user\Downloads\open (1)\sample_submission.csv"
OUT_SUB           = r"C:\Users\user\Downloads\open (1)\submission_rmse_residual.csv"
os.makedirs(os.path.dirname(OUT_SUB), exist_ok=True)

VAL_START = pd.Timestamp(2024,8,17,0)   # 시계열 홀드아웃
VAL_END   = pd.Timestamp(2024,8,24,23)

# (어제 쓰던 Day1 안전 파라미터)
LGBM_PARAMS = {
    'objective': 'regression', 'metric': 'rmse', 'learning_rate': 0.05,
    'num_leaves': 48, 'feature_fraction': 0.85, 'bagging_fraction': 0.85,
    'bagging_freq': 1, 'min_data_in_leaf': 150, 'lambda_l2': 3.0,
    'seed': 42, 'verbosity': -1, 'num_threads': 4
}

# 하드 빌딩 가중(있으면 조금 더 학습)
HARD_BLD = {64, 1, 34, 3, 6, 10, 45, 79, 54, 23}
HARD_BLD_WEIGHT = 1.8

BASE_W7, BASE_W24 = 0.70, 0.30  # baseline 혼합 비율

In [19]:
# =========================================
# 1) 유틸
# =========================================
def read_csv_smart(path):
    try:
        return pd.read_csv(path, encoding='utf-8-sig')
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding='cp949')

def ensure_datetime_cols(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if '날짜' not in df.columns or '시간' not in df.columns:
        if '일시' not in df.columns:
            raise ValueError("일시 또는 (날짜,시간) 필요")
        s = df['일시'].astype(str)
        df['날짜'] = pd.to_datetime(s.str.slice(0,8), format='%Y%m%d')
        df['시간'] = s.str.slice(9,11).astype(int)
    else:
        df['날짜'] = pd.to_datetime(df['날짜'])
        df['시간'] = df['시간'].astype(int)
    df['dt'] = df['날짜'] + pd.to_timedelta(df['시간'], unit='h')
    return df

def clean_capacity_fields(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for c in ['태양광용량(kW)','ESS저장용량(kWh)','PCS용량(kW)']:
        if c in df.columns:
            df[c] = df[c].replace('-', 0).astype(float)
    return df

def add_safe_weather_cols(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if '일사(MJ/m2)' not in df.columns: df['일사(MJ/m2)'] = 0.0
    if '일조(hr)'   not in df.columns: df['일조(hr)']   = 0.0
    return df

def make_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df = add_safe_weather_cols(df)
    df = df.sort_values(['건물번호','dt']).reset_index(drop=True)
    grp = df.groupby('건물번호', sort=False)

    # 기본 라그/롤링 (과거만)
    if '전력소비량(kWh)' in df.columns:
        df['cons_lag1']     = grp['전력소비량(kWh)'].shift(1)
        df['cons_lag_24h']  = grp['전력소비량(kWh)'].shift(24)
        df['cons_lag_48h']  = grp['전력소비량(kWh)'].shift(48)
        df['cons_lag_72h']  = grp['전력소비량(kWh)'].shift(72)
        df['cons_lag_168h'] = grp['전력소비량(kWh)'].shift(168)
        df['cons_mean_24h'] = grp['전력소비량(kWh)'].transform(lambda s: s.shift(1).rolling(24, min_periods=1).mean())
        df['cons_samehour_mean_7d'] = grp['전력소비량(kWh)'].transform(lambda s: s.shift(24).rolling(7, min_periods=1).mean())
        df['cons_std_24h'] = grp['cons_lag1'].transform(lambda s: s.rolling(24, min_periods=6).std())
    else:
        for c in ['cons_lag1','cons_lag_24h','cons_lag_48h','cons_lag_72h','cons_lag_168h',
                  'cons_mean_24h','cons_samehour_mean_7d','cons_std_24h']:
            df[c] = np.nan

    # 냉방 수요 지표
    base_temp = 24.0
    df['CDD'] = (df['기온(°C)'] - base_temp).clip(lower=0)
    q99 = df['일사(MJ/m2)'].quantile(0.99) + 1e-6
    rad_norm = (df['일사(MJ/m2)'].clip(upper=q99) / q99)
    df['CDD_x_rad'] = df['CDD'] * rad_norm
    df['CDD_humid_adj'] = df['CDD'] * (1 + 0.3 * (df['습도(%)'] / 100.0))

    # 달력/시간
    df['weekday']    = df['dt'].dt.weekday
    df['is_weekend'] = (df['weekday'] >= 5).astype(int)
    kr_holidays = {pd.Timestamp(2024,6,6), pd.Timestamp(2024,8,15)}
    df['is_holiday'] = df['날짜'].isin(kr_holidays).astype(int)
    df['month']      = df['dt'].dt.month
    df['hour']       = df['시간']
    df['dayofyear']  = df['dt'].dt.dayofyear

    # 설비·운영 힌트
    for c in ['태양광용량(kW)','ESS저장용량(kWh)','PCS용량(kW)']:
        if c in df.columns: df[f'log1p_{c}'] = np.log1p(df[c])
    df['has_pv']  = (df.get('태양광용량(kW)',0)  > 0).astype(int)
    df['has_ess'] = (df.get('ESS저장용량(kWh)',0) > 0).astype(int)
    df['has_pcs'] = (df.get('PCS용량(kW)',0)    > 0).astype(int)
    df['is_offpeak'] = df['hour'].isin([0,1,2,3,4,5,6,23]).astype(int)
    df['is_peak']    = df['hour'].isin([13,14,15,16,17]).astype(int)
    df['ess_charge_potential']    = ((df['has_ess']==1) & (df['is_offpeak']==1)).astype(int)
    df['ess_discharge_potential'] = ((df['has_ess']==1) & (df['is_peak']==1)).astype(int)
    df['ess_to_load_lag_ratio'] = np.where(df['cons_lag_24h'].notna(),
                                           df.get('ESS저장용량(kWh)',0.0) / (df['cons_lag_24h'] + 1e-6),
                                           np.nan)
    return df

def build_baseline(df: pd.DataFrame) -> pd.Series:
    """same-hour 7d mean과 24h mean 혼합. 결측은 건물 평균으로 백업"""
    s7  = df['cons_samehour_mean_7d']
    s24 = df['cons_mean_24h']
    base = BASE_W7 * s7 + BASE_W24 * s24
    # 백업: 건물별 과거 평균
    bld_mean = df.groupby('건물번호')['전력소비량(kWh)'].transform('mean') if '전력소비량(kWh)' in df.columns else 0.0
    return base.fillna(bld_mean).fillna(0.0)

def align_for_lgb(df_tr, df_va_or_te, feature_list, cat_cols):
    X1 = df_tr.reindex(columns=feature_list).copy()
    X2 = df_va_or_te.reindex(columns=feature_list).copy()
    for c in (cat_cols or []):
        X1[c] = X1[c].astype('category')
        X2[c] = X2[c].astype('category').cat.set_categories(X1[c].cat.categories)
    num_cols = [c for c in feature_list if c not in (cat_cols or [])]
    X1[num_cols] = X1[num_cols].replace([np.inf,-np.inf], np.nan).astype(float).fillna(0.0)
    X2[num_cols] = X2[num_cols].replace([np.inf,-np.inf], np.nan).astype(float).fillna(0.0)
    return X1, X2

def RMSE(y, yhat): return float(np.sqrt(mean_squared_error(y, yhat)))

In [20]:
# =========================================
# 2) 데이터 로드 & 전처리 & 피처
# =========================================
train_df = read_csv_smart(TRAIN_MERGED_PATH)
test_df  = read_csv_smart(TEST_MERGED_PATH)

train_df = clean_capacity_fields(train_df)
test_df  = clean_capacity_fields(test_df)

train_df = ensure_datetime_cols(train_df)
test_df  = ensure_datetime_cols(test_df)

train_feat = make_features(train_df)

# 시계열 검증 분할
is_val = (train_feat['dt'] >= VAL_START) & (train_feat['dt'] <= VAL_END)
train_part = train_feat[~is_val].copy()
valid_part = train_feat[ is_val].copy()

# 피처 목록/카테고리
EXCLUDE = {'전력소비량(kWh)','dt','날짜','시간','일시','num_date_time'}
feature_list = [c for c in train_feat.columns if c not in EXCLUDE]
cat_cols = [c for c in ['건물번호','건물유형'] if c in feature_list]

# 행렬 정렬
X_tr, X_va = align_for_lgb(train_part, valid_part, feature_list, cat_cols)
y_tr, y_va = train_part['전력소비량(kWh)'], valid_part['전력소비량(kWh)']

# 베이스라인
base_tr = build_baseline(train_part)
base_va = build_baseline(valid_part)

# 잔차 타깃
ytr_resid = (y_tr - base_tr).astype(float)
yva_resid = (y_va - base_va).astype(float)

In [21]:
# =========================================
# 3) 잔차 LightGBM (RMSE 기준, 조기종료)
# =========================================
w_tr = np.ones(len(train_part), float)
w_tr[train_part['건물번호'].isin(HARD_BLD).values] = HARD_BLD_WEIGHT

lgb_train = lgb.Dataset(X_tr, label=ytr_resid, weight=w_tr, categorical_feature=cat_cols or None)
lgb_valid = lgb.Dataset(X_va, label=yva_resid, categorical_feature=cat_cols or None)

callbacks = [lgb.early_stopping(200), lgb.log_evaluation(200)]
model = lgb.train(LGBM_PARAMS, lgb_train, valid_sets=[lgb_valid],
                  num_boost_round=5000, callbacks=callbacks)

pred_resid_va = model.predict(X_va, num_iteration=model.best_iteration)

Training until validation scores don't improve for 200 rounds
[200]	valid_0's rmse: 297.494
[400]	valid_0's rmse: 264.726
[600]	valid_0's rmse: 252.223
[800]	valid_0's rmse: 246.731
[1000]	valid_0's rmse: 242.843
[1200]	valid_0's rmse: 240.791
[1400]	valid_0's rmse: 239.183
[1600]	valid_0's rmse: 238.353
[1800]	valid_0's rmse: 237.522
[2000]	valid_0's rmse: 236.712
[2200]	valid_0's rmse: 236.503
[2400]	valid_0's rmse: 236.116
[2600]	valid_0's rmse: 236.146
[2800]	valid_0's rmse: 235.968
Early stopping, best iteration is:
[2668]	valid_0's rmse: 235.868


In [25]:
# =========================================
# 3.5 여러 모델 랜덤서치 + 설명력 비교 + 베스트 모델 선택
#  - 공통 타깃: residual = y - baseline
#  - 비교 모델: LightGBM, XGBoost, CatBoost, HistGB, ExtraTrees, Ridge
#  - 평가 지표: RMSE(정렬 키), MAE, SMAPE
#  - 설명력: permutation importance (최종 y 기준)
# =========================================
import numpy as np, pandas as pd, warnings, time, random
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.inspection import permutation_importance
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import ExtraTreesRegressor, HistGradientBoostingRegressor

# ---------- 준비물 체크 ----------
needed = ['X_tr','X_va','y_tr','y_va','base_tr','base_va','cat_cols']
missing = [n for n in needed if n not in globals()]
assert not missing, f"필요 변수 부족: {missing} 셀이 먼저 실행돼야 해."

# ---------- 유틸 ----------
def SMAPE(y, yhat, eps=1e-6):
    y = np.asarray(y, float); yhat = np.asarray(yhat, float)
    return 100.0 * np.mean(2.0 * np.abs(yhat - y) / (np.abs(y) + np.abs(yhat) + eps))

def RMSE(y, yhat): return float(np.sqrt(mean_squared_error(y, yhat)))

def to_sklearn_matrix(X_tr: pd.DataFrame, X_va: pd.DataFrame, cat_cols):
    """sklearn/XGB용: 카테고리→codes, 수치 NaN/inf 방어"""
    Xtr = X_tr.copy(); Xva = X_va.copy()
    use_cats = [c for c in (cat_cols or []) if c in Xtr.columns]
    for c in use_cats:
        Xtr[c] = Xtr[c].astype('category')
        Xva[c] = Xva[c].astype('category').cat.set_categories(Xtr[c].cat.categories)
        Xtr[c] = Xtr[c].cat.codes.replace(-1, np.nan)
        Xva[c] = Xva[c].cat.codes.replace(-1, np.nan)
    Xtr = Xtr.replace([np.inf,-np.inf], np.nan).astype(float).fillna(0.0)
    Xva = Xva.replace([np.inf,-np.inf], np.nan).astype(float).fillna(0.0)
    return Xtr, Xva

class ResidEstimatorWrapper:
    """잔차 예측 모델을 최종 y 예측 모델처럼 보이게 랩핑 (permutation_importance용)"""
    def __init__(self, fitted, base_vector, model_kind="skl", lgb_best_it=None):
        self.fitted = fitted
        self.base = np.asarray(base_vector, float)
        self.kind = model_kind
        self.best_iter = lgb_best_it
    def fit(self, X, y): return self
    def predict(self, X):
        if self.kind == "lgb":
            r = self.fitted.predict(X, num_iteration=self.best_iter)
        else:
            r = self.fitted.predict(X)
        yhat = self.base[:len(r)] + r
        return np.clip(yhat, 0, None)

# ---------- 잔차 타깃 ----------
ytr_resid = (y_tr - base_tr).astype(float)
yva_resid = (y_va - base_va).astype(float)

# ---------- 행렬 준비 ----------
X_tr_skl, X_va_skl = to_sklearn_matrix(X_tr, X_va, cat_cols)  # sklearn/XGB용
X_tr_cb  = X_tr.copy(); X_va_cb = X_va.copy()                 # CatBoost용 (카테고리 유지)
cat_idx_cb = [X_tr_cb.columns.get_loc(c) for c in cat_cols if c in X_tr_cb.columns]

# ---------- (선택) 하드 빌딩 가중치 ----------
w_tr = np.ones(len(X_tr), float)
if 'HARD_BLD' in globals():
    w_tr[X_tr['건물번호'].astype(str).isin(set(map(str, HARD_BLD))) ] = globals().get('HARD_BLD_WEIGHT', 1.8)

# ---------- 1) LightGBM 랜덤서치 ----------
import lightgbm as lgb
lgb_space = {
    "learning_rate":  [0.03, 0.04, 0.05, 0.06, 0.07],
    "num_leaves":     list(range(32, 96, 8)),
    "min_data_in_leaf":[60, 80, 100, 120, 150, 180],
    "feature_fraction":[0.75, 0.80, 0.85, 0.90],
    "bagging_fraction":[0.75, 0.80, 0.85, 0.90],
    "bagging_freq":   [1, 2],
    "lambda_l2":      [0.0, 1.0, 2.0, 3.0, 5.0],
}
LGB_TRIALS = 25  # 필요시 늘려
lgb_best = {"score": 1e18}
for t in range(1, LGB_TRIALS+1):
    params = {
        'objective':'regression','metric':'rmse','seed':42,'verbosity':-1,'num_threads':4,
        **{k: random.choice(v) for k,v in lgb_space.items()}
    }
    lgb_train = lgb.Dataset(X_tr, label=ytr_resid, weight=w_tr, categorical_feature=cat_cols or None)
    lgb_valid = lgb.Dataset(X_va, label=yva_resid, categorical_feature=cat_cols or None)
    mdl = lgb.train(params, lgb_train, valid_sets=[lgb_valid],
                    num_boost_round=4000, callbacks=[lgb.early_stopping(200), lgb.log_evaluation(0)])
    pred_resid = mdl.predict(X_va, num_iteration=mdl.best_iteration)
    yhat = np.clip(base_va.values + pred_resid, 0, None)
    sc = RMSE(y_va, yhat)
    if sc < lgb_best["score"]:
        lgb_best.update(dict(score=sc, model=mdl, params=params, best_it=int(mdl.best_iteration)))
    print(f"[LGB {t:02d}/{LGB_TRIALS}] RMSE={sc:.3f} best={lgb_best['score']:.3f}")
print("LGB Best:", lgb_best["score"], lgb_best["params"], "it", lgb_best["best_it"])

# ---------- 2) XGBoost 랜덤서치 ----------
xgb_best = None
try:
    from xgboost import XGBRegressor
    xgb_space = {
        "learning_rate":  [0.03, 0.04, 0.05, 0.06],
        "max_depth":      [6, 7, 8, 9, 10],
        "min_child_weight":[50, 80, 120, 180, 240],
        "subsample":      [0.75, 0.8, 0.85, 0.9],
        "colsample_bytree":[0.75, 0.8, 0.85, 0.9],
        "reg_lambda":     [1.0, 2.0, 3.0, 5.0],
        "n_estimators":   [2500, 3000, 3500, 4000]
    }
    XGB_TRIALS = 20
    xgb_best = {"score": 1e18}
    for t in range(1, XGB_TRIALS+1):
        hp = {k: random.choice(v) for k,v in xgb_space.items()}
        xgb = XGBRegressor(tree_method="hist", random_state=42, **hp)
        xgb.fit(X_tr_skl, ytr_resid, eval_set=[(X_va_skl, yva_resid.values)],
                early_stopping_rounds=200, verbose=False)
        pred_resid = xgb.predict(X_va_skl, iteration_range=(0, xgb.best_iteration))
        yhat = np.clip(base_va.values + pred_resid, 0, None)
        sc = RMSE(y_va, yhat)
        if sc < xgb_best["score"]:
            xgb_best.update(dict(score=sc, model=xgb, params=hp, best_it=int(xgb.best_iteration)))
        print(f"[XGB {t:02d}/{XGB_TRIALS}] RMSE={sc:.3f} best={xgb_best['score']:.3f}")
    print("XGB Best:", xgb_best["score"], xgb_best["params"], "it", xgb_best["best_it"])
except Exception as e:
    warnings.warn(f"XGBoost 사용 불가: {e}")

# ---------- 3) CatBoost 랜덤서치 ----------
cb_best = None
try:
    from catboost import CatBoostRegressor, Pool
    cb_space = {
        "depth":         [6, 7, 8, 9, 10],
        "learning_rate": [0.03, 0.04, 0.05, 0.06],
        "l2_leaf_reg":   [1.0, 2.0, 3.0, 5.0],
        "bagging_temperature": [0.0, 0.25, 0.5, 0.75, 1.0],
        "iterations":    [3000, 4000, 5000]
    }
    CB_TRIALS = 20
    train_pool = Pool(X_tr_cb, label=ytr_resid, cat_features=cat_idx_cb)
    valid_pool = Pool(X_va_cb, label=yva_resid, cat_features=cat_idx_cb)
    cb_best = {"score": 1e18}
    for t in range(1, CB_TRIALS+1):
        hp = {k: random.choice(v) for k,v in cb_space.items()}
        cb = CatBoostRegressor(loss_function="RMSE", random_seed=42, **hp, verbose=False)
        cb.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=200, verbose=False)
        pred_resid = cb.predict(valid_pool)
        yhat = np.clip(base_va.values + pred_resid, 0, None)
        sc = RMSE(y_va, yhat)
        if sc < cb_best["score"]:
            cb_best.update(dict(score=sc, model=cb, params=hp))
        print(f"[CAT {t:02d}/{CB_TRIALS}] RMSE={sc:.3f} best={cb_best['score']:.3f}")
    print("CAT Best:", cb_best["score"], cb_best["params"])
except Exception as e:
    warnings.warn(f"CatBoost 사용 불가: {e}")

# ---------- 4) HistGradientBoosting 랜덤서치(소형) ----------
hgb_space = {
    "learning_rate":    [0.03, 0.04, 0.05, 0.06],
    "max_leaf_nodes":   [31, 47, 63, 79],
    "l2_regularization":[0.0, 0.05, 0.1, 0.2],
    "max_iter":         [1500, 2500, 3500]
}
HGB_TRIALS = 15
hgb_best = {"score": 1e18}
for t in range(1, HGB_TRIALS+1):
    hp = {k: random.choice(v) for k,v in hgb_space.items()}
    hgb = HistGradientBoostingRegressor(
        early_stopping=True, validation_fraction=0.1, n_iter_no_change=100, random_state=42, **hp
    )
    hgb.fit(X_tr_skl, ytr_resid)
    pred_resid = hgb.predict(X_va_skl)
    yhat = np.clip(base_va.values + pred_resid, 0, None)
    sc = RMSE(y_va, yhat)
    if sc < hgb_best["score"]:
        hgb_best.update(dict(score=sc, model=hgb, params=hp))
    print(f"[HGB {t:02d}/{HGB_TRIALS}] RMSE={sc:.3f} best={hgb_best['score']:.3f}")
print("HGB Best:", hgb_best["score"], hgb_best["params"])

# ---------- 5) ExtraTrees 랜덤서치(소형) + Ridge 고정 그리드 ----------
etr_space = {
    "n_estimators":     [600, 800, 1000],
    "min_samples_leaf": [3, 5, 8],
    "max_depth":        [None, 18, 24]
}
ETR_TRIALS = 12
etr_best = {"score": 1e18}
for t in range(1, ETR_TRIALS+1):
    hp = {k: random.choice(v) for k,v in etr_space.items()}
    etr = ExtraTreesRegressor(n_jobs=-1, random_state=42, **hp)
    etr.fit(X_tr_skl, ytr_resid)
    pred_resid = etr.predict(X_va_skl)
    yhat = np.clip(base_va.values + pred_resid, 0, None)
    sc = RMSE(y_va, yhat)
    if sc < etr_best["score"]:
        etr_best.update(dict(score=sc, model=etr, params=hp))
    print(f"[ETR {t:02d}/{ETR_TRIALS}] RMSE={sc:.3f} best={etr_best['score']:.3f}")
print("ETR Best:", etr_best["score"], etr_best["params"])

ridge = make_pipeline(
    StandardScaler(with_mean=False),
    RidgeCV(alphas=[0.1, 0.3, 1.0, 3.0, 10.0], cv=5)
)
ridge.fit(X_tr_skl, ytr_resid)
pred_resid = ridge.predict(X_va_skl)
yhat = np.clip(base_va.values + pred_resid, 0, None)
ridge_best = {"score": RMSE(y_va, yhat), "model": ridge, "params": {"alphas":[0.1,0.3,1,3,10]}}
print("RIDGE Best:", ridge_best["score"])



Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1732]	valid_0's rmse: 242.881
[LGB 01/25] RMSE=242.881 best=242.881
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[3961]	valid_0's rmse: 238.816
[LGB 02/25] RMSE=238.815 best=238.815
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2032]	valid_0's rmse: 238.345
[LGB 03/25] RMSE=238.345 best=238.345
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1697]	valid_0's rmse: 237.625
[LGB 04/25] RMSE=237.625 best=237.625
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1969]	valid_0's rmse: 235.165
[LGB 05/25] RMSE=235.165 best=235.165
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2166]	valid_0's rmse: 237.192
[LGB 06/25] RMSE=237.192 best

[WinError 2] 지정된 파일을 찾을 수 없습니다
  File "C:\Users\user\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "C:\Users\user\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 556, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 1038, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, pro

[HGB 01/15] RMSE=240.167 best=240.167
[HGB 02/15] RMSE=223.137 best=223.137
[HGB 03/15] RMSE=238.452 best=223.137
[HGB 04/15] RMSE=220.102 best=220.102
[HGB 05/15] RMSE=234.524 best=220.102
[HGB 06/15] RMSE=242.358 best=220.102
[HGB 07/15] RMSE=234.619 best=220.102
[HGB 08/15] RMSE=237.798 best=220.102
[HGB 09/15] RMSE=235.658 best=220.102
[HGB 10/15] RMSE=229.643 best=220.102
[HGB 11/15] RMSE=235.959 best=220.102
[HGB 12/15] RMSE=226.135 best=220.102
[HGB 13/15] RMSE=225.868 best=220.102
[HGB 14/15] RMSE=239.589 best=220.102
[HGB 15/15] RMSE=226.751 best=220.102
HGB Best: 220.10155058057993 {'learning_rate': 0.06, 'max_leaf_nodes': 31, 'l2_regularization': 0.0, 'max_iter': 3500}
[ETR 01/12] RMSE=316.596 best=316.596
[ETR 02/12] RMSE=287.967 best=287.967
[ETR 03/12] RMSE=303.665 best=287.967
[ETR 04/12] RMSE=303.461 best=287.967
[ETR 05/12] RMSE=323.333 best=287.967
[ETR 06/12] RMSE=303.361 best=287.967
[ETR 07/12] RMSE=316.033 best=287.967
[ETR 08/12] RMSE=331.147 best=287.967
[ETR 09

KeyError: 'model'

In [28]:
# ---------- 종합 비교 ----------
candidates = []
def add_cand(name, pack, kind, best_it=None):
    if pack is None: return
    mdl = pack["model"]; sc = pack["score"]; params = pack.get("params", {})
    candidates.append((name, mdl, kind, sc, params, best_it))

add_cand("LGBM", lgb_best, "lgb", lgb_best.get("best_it"))
add_cand("XGB",  xgb_best, "skl", xgb_best.get("best_it") if xgb_best else None)
add_cand("CAT",  cb_best,  "skl", None)
add_cand("HGB",  hgb_best, "skl", None)
add_cand("ETR",  etr_best, "skl", None)
add_cand("RIDGE",ridge_best,"skl", None)

rows = []
for name, mdl, kind, sc, params, best_it in candidates:
    # 다시 예측(안전)
    if kind == "lgb":
        pred_resid = mdl.predict(X_va, num_iteration=best_it)
    else:
        pred_resid = mdl.predict(X_va_skl if name!="CAT" else X_va_skl)  # CAT은 위에서 평가됨, 아래에서 재평가
        if name=="CAT" and cb_best is not None:
            from catboost import Pool
            pred_resid = cb_best["model"].predict(Pool(X_va_cb, cat_features=cat_idx_cb))
    yhat = np.clip(base_va.values + pred_resid, 0, None)
    rows.append({
        "model": name,
        "RMSE": RMSE(y_va, yhat),
        "MAE": mean_absolute_error(y_va, yhat),
        "SMAPE": SMAPE(y_va, yhat),
        "params": params,
        "best_iter": best_it
    })

results_df = pd.DataFrame(rows).sort_values("RMSE").reset_index(drop=True)
print("\n[VAL 성능 비교] ↓RMSE")
display(results_df)

# ---------- 베스트 모델 설명력(Permutation Importance) ----------
best_row = results_df.iloc[0]
BEST_MODEL_NAME = best_row["model"]
print(f"\n[베스트 모델] {BEST_MODEL_NAME}")

# 어떤 입력행렬/예측 함수를 쓸지 선택
if BEST_MODEL_NAME == "LGBM":
    best_obj = lgb_best["model"]; best_kind = "lgb"; best_it = lgb_best["best_it"]; Xv = X_va
elif BEST_MODEL_NAME == "XGB":
    best_obj = xgb_best["model"]; best_kind = "skl"; best_it = xgb_best["best_it"]; Xv = X_va_skl
elif BEST_MODEL_NAME == "CAT":
    best_obj = cb_best["model"];  best_kind = "cat"; best_it = None; Xv = X_va_cb
elif BEST_MODEL_NAME == "HGB":
    best_obj = hgb_best["model"]; best_kind = "skl"; best_it = None; Xv = X_va_skl
elif BEST_MODEL_NAME == "ETR":
    best_obj = etr_best["model"]; best_kind = "skl"; best_it = None; Xv = X_va_skl
else:
    best_obj = ridge_best["model"]; best_kind = "skl"; best_it = None; Xv = X_va_skl

# perm importance는 래퍼로 최종 y 기준 계산
if BEST_MODEL_NAME == "CAT":
    # CatBoost는 predict가 Pool 필요 → 커스텀 predict로 감쌈
    from catboost import Pool
    class CatResidWrapper(ResidEstimatorWrapper):
        def predict(self, X):
            pool = Pool(X_va_cb, cat_features=cat_idx_cb)  # 검사시 X 무시하고 고정 VAL 사용
            r = cb_best["model"].predict(pool)
            return np.clip(base_va.values + r, 0, None)
    wrapper = CatResidWrapper(best_obj, base_va.reset_index(drop=True), "skl")
    X_for_pi = X_va_cb  # 특성 이름 유지
else:
    wrapper = ResidEstimatorWrapper(best_obj, base_va.reset_index(drop=True),
                                    "lgb" if BEST_MODEL_NAME=="LGBM" else "skl",
                                    lgb_best.get("best_it") if BEST_MODEL_NAME=="LGBM" else None)
    X_for_pi = Xv

pi = permutation_importance(
    wrapper, X_for_pi, y_va.values, n_repeats=5, random_state=42,
    scoring="neg_mean_squared_error"
)
imp_df = pd.DataFrame({
    "feature": getattr(X_for_pi, "columns", pd.Index(range(X_for_pi.shape[1]))),
    "imp": pi.importances_mean,
    "std": pi.importances_std
}).sort_values("imp", ascending=False)

print("\n[베스트 모델 설명력 Top-20]")
display(imp_df.head(20))

# 다음 단계에서 재학습/보정/제출에 쓰라고 전역 저장
BEST_MODEL_OBJ  = best_obj
BEST_MODEL_KIND = best_kind
BEST_NUM_BOOST  = (lgb_best["best_it"] if BEST_MODEL_NAME=="LGBM" else None)
BEST_PARAMS     = [r for r in rows if r["model"]==BEST_MODEL_NAME][0]["params"]
print(f"\n[SAVED] BEST_MODEL_NAME={BEST_MODEL_NAME}, KIND={BEST_MODEL_KIND}, BEST_NUM_BOOST={BEST_NUM_BOOST}")


KeyError: 'model'

In [26]:
# =========================
# 5R) 베스트(HGB) 보정 선택 (AB vs ISO)
# =========================
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np, pandas as pd

def RMSE(y, yhat): 
    return float(np.sqrt(mean_squared_error(y, yhat)))

assert 'hgb_best' in globals() and hgb_best.get("model") is not None, "hgb_best가 없습니다. 위 랜덤서치 셀을 먼저 실행하세요."

# VALID에서 잔차 예측
pred_resid_va = hgb_best["model"].predict(X_va_skl)
base_vec      = base_va.values
yhat_raw      = np.clip(base_vec + pred_resid_va, 0, None)

# AB(선형) 보정
A = np.vstack([pred_resid_va, np.ones_like(pred_resid_va)]).T
a, b = np.linalg.lstsq(A, (y_va - base_va).values, rcond=None)[0]
yhat_ab  = np.clip(base_vec + (a * pred_resid_va + b), 0, None)

# Isotonic 보정
iso = IsotonicRegression(out_of_bounds='clip')
iso.fit(pred_resid_va, (y_va - base_va).values)
yhat_iso = np.clip(base_vec + iso.transform(pred_resid_va), 0, None)

rmse_raw = RMSE(y_va, yhat_raw)
rmse_ab  = RMSE(y_va, yhat_ab)
rmse_iso = RMSE(y_va, yhat_iso)

if rmse_iso < rmse_ab and rmse_iso < rmse_raw:
    CALIB_MODE = "ISO"; CALIB_PARAMS = {"iso": iso}
    print(f"[Calibration] best=ISO  RMSE={rmse_iso:.3f}")
elif rmse_ab < rmse_raw:
    CALIB_MODE = "AB";  CALIB_PARAMS = {"a": float(a), "b": float(b)}
    print(f"[Calibration] best=AB   RMSE={rmse_ab:.3f}  (a={a:.3f}, b={b:.3f})")
else:
    CALIB_MODE = "NONE"; CALIB_PARAMS = {}
    print(f"[Calibration] best=NONE RMSE={rmse_raw:.3f}")

# 추후 재사용 위해 저장
BEST_MODEL_NAME = "HGB"
BEST_MODEL_OBJ  = hgb_best["model"]
BEST_MODEL_KIND = "skl"
BEST_PARAMS     = hgb_best.get("params", {})

[Calibration] best=ISO  RMSE=211.611


In [32]:
# =========================
# 6R) 드롭 없이 결측치 0 채움 버전 (HGB 최종)
#  - 선행: all_tr, all_te, FEATS, cat_cols_use, X_full_skl, X_te_skl 가 만들어져 있어야 함
#  - to_sklearn_matrix 가 수치 NaN/inf → 0 처리하므로, y/베이스라인도 0으로 강제
# =========================
import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error

def RMSE(y, yhat):
    return float(np.sqrt(mean_squared_error(y, yhat)))

# 0) 안전: FEATS 안의 수치컬럼들과 타깃의 결측/inf 0 채움
for _df in (all_tr, all_te):
    num_cols = [c for c in FEATS if c in _df.columns and pd.api.types.is_numeric_dtype(_df[c])]
    # 타깃 컬럼도 함께 처리(학습쪽만 존재)
    if '전력소비량(kWh)' in _df.columns:
        num_cols = list(dict.fromkeys(num_cols + ['전력소비량(kWh)']))
    _df[num_cols] = (_df[num_cols].replace([np.inf, -np.inf], np.nan).fillna(0.0))

# 1) 베이스라인 테이블 만들기 (train 기준)
g1 = all_tr.groupby(['건물번호','hour','weekday'])['전력소비량(kWh)'].mean()
g2 = all_tr.groupby(['건물번호','hour'])['전력소비량(kWh)'].mean()
g3 = all_tr.groupby(['건물번호'])['전력소비량(kWh)'].mean()
g4 = float(all_tr['전력소비량(kWh)'].mean())  # 전체 평균

# 2) 베이스라인 매핑 + 최종적으로 0 채움
def map_baseline(df: pd.DataFrame):
    s = pd.Series(
        g1.reindex(df.set_index(['건물번호','hour','weekday']).index).to_numpy(),
        index=df.index
    )
    s = s.fillna(pd.Series(df.set_index(['건물번호','hour']).index.map(g2), index=df.index))
    s = s.fillna(df['건물번호'].map(g3))
    s = s.fillna(g4)
    return s.fillna(0.0).astype(float)

base_full = map_baseline(all_tr)
base_te   = map_baseline(all_te)

# 3) 잔차 타깃 구성 (무조건 0으로 채운 상태)
y_full_resid = (all_tr['전력소비량(kWh)'].fillna(0.0) - base_full.fillna(0.0)).astype(float)

# 4) HGB 파라미터 (랜덤서치 결과가 있으면 반영)
best_hp = {'learning_rate':0.06, 'max_leaf_nodes':31, 'l2_regularization':0.0, 'max_iter':3500}
if 'hgb_best' in globals() and isinstance(hgb_best.get('params', {}), dict):
    best_hp.update(hgb_best['params'])

hgb_final = HistGradientBoostingRegressor(
    early_stopping=False, random_state=42, **best_hp
)

# 5) 학습 (to_sklearn_matrix가 X_full_skl/X_te_skl 내 NaN/inf → 0 처리함)
hgb_final.fit(X_full_skl, y_full_resid)

# (옵션) 검증 체크
if 'is_val' in globals():
    va_idx = np.where(is_val.values)[0]
    X_va_chk    = X_full_skl.iloc[va_idx]
    base_va_chk = base_full.iloc[va_idx]
    y_va_chk    = all_tr['전력소비량(kWh)'].iloc[va_idx]
    # 혹시나 있을 결측 0 채움
    y_va_chk = y_va_chk.fillna(0.0)
    base_va_chk = base_va_chk.fillna(0.0)
    pred_resid  = hgb_final.predict(X_va_chk)
    yhat_va     = np.clip(base_va_chk.values + pred_resid, 0, None)
    print(f"[Check] HGB VAL RMSE={RMSE(y_va_chk, yhat_va):.3f}")

# 6) 보정 적용 (있으면), 없으면 생략
CALIB_MODE   = globals().get('CALIB_MODE', 'NONE')
CALIB_PARAMS = globals().get('CALIB_PARAMS', {})

resid_te = hgb_final.predict(X_te_skl)
if CALIB_MODE == "ISO":
    test_pred = base_te.values + CALIB_PARAMS["iso"].transform(resid_te)
elif CALIB_MODE == "AB":
    test_pred = base_te.values + CALIB_PARAMS["a"] * resid_te + CALIB_PARAMS["b"]
else:
    test_pred = base_te.values + resid_te

# 7) 음수 방지 + 저장
test_pred = np.clip(test_pred, 0, None)
sub = read_csv_smart(SAMPLE_SUB)
sub['answer'] = pd.Series(test_pred, index=sub.index)
sub.to_csv(OUT_SUB, index=False, encoding='utf-8-sig')
print(f"[DONE] 결측 0 채움 버전 저장: {OUT_SUB}")


[Check] HGB VAL RMSE=99.666
[DONE] 결측 0 채움 버전 저장: C:\Users\user\Downloads\open (1)\submission_rmse_residual.csv


In [30]:
# 진단: 어디에 NaN이 있는지 확인
print("NaN in target (train):", int(all_tr['전력소비량(kWh)'].isna().sum()))
try:
    print("NaN in base_full (before fix):", int(base_full.isna().sum()))
except NameError:
    print("base_full 아직 안만듦")


NaN in target (train): 15456
NaN in base_full (before fix): 0


In [29]:
# =========================================
# (Fix) 6R 전에 all_tr / all_te 만들고 이어서 HGB 최종 추론
#  - 기존 전처리/피처 함수(make_features 등)가 이미 선언돼 있다고 가정
#  - TEST_MERGED_PATH 있으면 그대로 쓰고, 없으면 TEST_PATH+BUILD_PATH merge
# =========================================
import numpy as np, pandas as pd
from pathlib import Path

# ---- 경로 기본값 (이미 정의돼 있으면 그대로 사용) ----
TRAIN_MERGED_PATH = globals().get("TRAIN_MERGED_PATH", r"C:\Users\user\Downloads\open (1)\merged_train.csv")
TEST_MERGED_PATH  = globals().get("TEST_MERGED_PATH",  r"C:\Users\user\Downloads\open (1)\merged_test.csv")  # 있다고 했던 경로
TEST_PATH         = globals().get("TEST_PATH",         r"C:\Users\user\Downloads\open (1)\test.csv")
BUILD_PATH        = globals().get("BUILD_PATH",        r"C:\Users\user\Downloads\open (1)\building_info.csv")
SAMPLE_SUB        = globals().get("SAMPLE_SUB",        r"C:\Users\user\Downloads\open (1)\sample_submission.csv")
OUT_SUB           = globals().get("OUT_SUB",           r"C:\Users\user\Downloads\open (1)\submission.csv")

# ---- 유틸 (존재하면 재정의 안 함) ----
def _read_csv_smart(path, **kwargs):
    try:
        return pd.read_csv(path, encoding=kwargs.get('encoding', 'utf-8-sig'))
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding='cp949')

if 'read_csv_smart' not in globals():
    read_csv_smart = _read_csv_smart

if 'to_sklearn_matrix' not in globals():
    # sklearn/HGB용 행렬 변환
    def to_sklearn_matrix(X_tr: pd.DataFrame, X_te: pd.DataFrame, cat_cols):
        Xtr = X_tr.copy(); Xte = X_te.copy()
        use_cats = [c for c in (cat_cols or []) if c in Xtr.columns]
        for c in use_cats:
            Xtr[c] = Xtr[c].astype('category')
            Xte[c] = Xte[c].astype('category').cat.set_categories(Xtr[c].cat.categories)
            Xtr[c] = Xtr[c].cat.codes.replace(-1, np.nan)
            Xte[c] = Xte[c].cat.codes.replace(-1, np.nan)
        Xtr = Xtr.replace([np.inf,-np.inf], np.nan).astype(float).fillna(0.0)
        Xte = Xte.replace([np.inf,-np.inf], np.nan).astype(float).fillna(0.0)
        return Xtr, Xte

# ---- 필요한 함수들이 있는지 체크 ----
needed_funcs = ['clean_capacity_fields','ensure_datetime_cols','make_features']
missing = [f for f in needed_funcs if f not in globals()]
assert not missing, f"다음 전처리/피처 함수가 필요해: {missing} (앞 셀 실행 먼저)"

# ---- 1) 데이터 로드 ----
train_df = read_csv_smart(TRAIN_MERGED_PATH)

# test는 merged 우선, 없으면 test+building merge
if Path(TEST_MERGED_PATH).exists():
    test_df = read_csv_smart(TEST_MERGED_PATH)
else:
    test_raw = read_csv_smart(TEST_PATH)
    build_df = read_csv_smart(BUILD_PATH)
    test_df = pd.merge(test_raw, build_df, on='건물번호', how='left')

# ---- 2) 동일 전처리 ----
train_df = clean_capacity_fields(train_df)
train_df = ensure_datetime_cols(train_df)

test_df  = clean_capacity_fields(test_df)
test_df  = ensure_datetime_cols(test_df)

# 일사/일조 방어
if '일사(MJ/m2)' not in train_df.columns: train_df['일사(MJ/m2)'] = 0.0
if '일조(hr)'   not in train_df.columns: train_df['일조(hr)']   = 0.0
if '일사(MJ/m2)' not in test_df.columns:  test_df['일사(MJ/m2)']  = 0.0
if '일조(hr)'   not in test_df.columns:   test_df['일조(hr)']    = 0.0

# ---- 3) train+test concat → 동일 피처 생성 ----
all_df   = pd.concat([train_df, test_df], ignore_index=True)
all_feat = make_features(all_df)

# split back
all_tr = all_feat.iloc[:len(train_df)].copy()
all_te = all_feat.iloc[len(train_df):].copy()

# ---- 4) 피처 목록 동기화 ----
#   - 가능하면 기존 학습에 썼던 X_tr.columns를 그대로 사용(일관성 유지)
#   - 없으면 FEATURE_LIST 또는 규칙 기반으로 재구성
EXCLUDE = {'전력소비량(kWh)','dt','날짜','시간','일시','num_date_time'}
if 'X_tr' in globals():
    FEATS = list(X_tr.columns)
else:
    if 'FEATURE_LIST' in globals():
        FEATS = [c for c in FEATURE_LIST if c not in EXCLUDE]
    else:
        from pandas.api.types import is_numeric_dtype
        FEATS = [c for c in all_tr.columns if c not in EXCLUDE and (is_numeric_dtype(all_tr[c]) or str(all_tr[c].dtype)=='category')]

# 카테고리(factor) 지정
if 'cat_cols' in globals():
    cat_cols_use = [c for c in cat_cols if c in FEATS]
else:
    cat_cols_use = [c for c in ['건물번호','건물유형'] if c in FEATS]

# 공통 교집합만 사용(혹시라도 누락 방지)
FEATS = [c for c in FEATS if c in all_tr.columns and c in all_te.columns]

# ---- 5) 스키런 행렬 변환(HGB용) ----
X_full_skl, X_te_skl = to_sklearn_matrix(all_tr[FEATS], all_te[FEATS], cat_cols_use)

# ---- 6) 베이스라인 함수 준비 ----
if 'build_baseline' not in globals():
    # 간단한 베이스라인: 건물번호 × hour × weekday 평균
    def build_baseline(df: pd.DataFrame) -> pd.Series:
        key = ['건물번호','hour','weekday'] if all(k in df.columns for k in ['hour','weekday']) else ['건물번호','시간']
        if '전력소비량(kWh)' in df.columns:
            grp_mean = df.groupby(key)['전력소비량(kWh)'].transform('mean')
            return grp_mean
        else:
            # test용: train 평균을 test 키에 매핑하려면 외부 map이 필요하지만
            # 여기서는 concat 기반이라 transform로도 채워짐(앞단 all_df concat 덕분)
            return df.groupby(key)[key[0]].transform('count')*0.0  # dummy → 아래에서 다시 채움

# concat 기반이라 all_tr에서 평균을 만들고 같은 키가 all_te에도 존재 → transform 결과가 all_df 전 구간에 퍼져 있음
base_full = all_tr.groupby(['건물번호','hour','weekday'])['전력소비량(kWh)'].transform('mean')
# test는 훈련 평균을 map
mean_tbl  = all_tr.groupby(['건물번호','hour','weekday'])['전력소비량(kWh)'].mean()
base_te   = all_te.set_index(['건물번호','hour','weekday']).index.map(mean_tbl).astype(float)
base_te   = pd.Series(base_te, index=all_te.index).fillna(all_tr['전력소비량(kWh)'].mean())

# ---- 7) HGB 최종 재학습 ----
from sklearn.ensemble import HistGradientBoostingRegressor
def RMSE(y, yhat):
    from sklearn.metrics import mean_squared_error
    return float(np.sqrt(mean_squared_error(y, yhat)))

# 랜덤서치에서 얻은 베스트 파라미터를 쓰되, 없으면 기본값
best_hp = {'learning_rate':0.06,'max_leaf_nodes':31,'l2_regularization':0.0,'max_iter':3500}
best_hp.update(globals().get('hgb_best', {}).get('params', {}))

hgb_final = HistGradientBoostingRegressor(
    early_stopping=False, random_state=42, **best_hp
)

y_full_resid = (all_tr['전력소비량(kWh)'] - base_full).astype(float)
hgb_final.fit(X_full_skl, y_full_resid)

# ---- 8) VALID 성능(선택) 확인: VAL 구간 변수 있으면 점검
if 'is_val' in globals():
    val_idx = np.where(is_val.values)[0]
    tr_idx  = np.where(~is_val.values)[0]
    # 같은 FEATS 기준으로 스키런 행렬 다시 슬라이스
    X_va_chk = X_full_skl.iloc[val_idx]
    base_va_chk = base_full.iloc[val_idx]
    y_va_chk = all_tr['전력소비량(kWh)'].iloc[val_idx]
    pred_resid_va = hgb_final.predict(X_va_chk)
    yhat_va = np.clip(base_va_chk.values + pred_resid_va, 0, None)
    print(f"[Check] HGB VAL RMSE={RMSE(y_va_chk, yhat_va):.3f}")

# ---- 9) 보정 적용(CALIB_MODE 있으면), 없으면 생략
CALIB_MODE   = globals().get('CALIB_MODE', 'NONE')
CALIB_PARAMS = globals().get('CALIB_PARAMS', {})

resid_te = hgb_final.predict(X_te_skl)
if CALIB_MODE == "ISO":
    test_pred = base_te.values + CALIB_PARAMS["iso"].transform(resid_te)
elif CALIB_MODE == "AB":
    test_pred = base_te.values + CALIB_PARAMS["a"] * resid_te + CALIB_PARAMS["b"]
else:
    test_pred = base_te.values + resid_te

test_pred = np.clip(test_pred, 0, None)

# ---- 10) 저장
sub = read_csv_smart(SAMPLE_SUB)
sub['answer'] = pd.Series(test_pred, index=sub.index)
sub.to_csv(OUT_SUB, index=False, encoding='utf-8-sig')

print(f"[DONE] all_tr/all_te 재구성 → HGB 최종 예측 저장: {OUT_SUB}")
print("n_features:", len(FEATS), "  neg_rate:", float((test_pred<0).mean()))


ValueError: Input y contains NaN.

In [33]:
# =========================================
# HGB 최종 재학습 (정확도 회복 버전)
# - 백오프 베이스라인(g1→g2→g3→전체평균)
# - 결측 y/베이스라인은 y_resid=0 + sample_weight=0 (드롭 없음)
# - 기존 FEATS / X_full_skl / X_te_skl 없으면 즉석 생성
# =========================================
import numpy as np, pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error

def RMSE(y, yhat): return float(np.sqrt(mean_squared_error(y, yhat)))

# ---- 안전: 스키런 입력행렬이 없으면 즉석 생성
if 'to_sklearn_matrix' not in globals():
    def to_sklearn_matrix(X_tr: pd.DataFrame, X_te: pd.DataFrame, cat_cols):
        Xtr = X_tr.copy(); Xte = X_te.copy()
        use_cats = [c for c in (cat_cols or []) if c in Xtr.columns]
        for c in use_cats:
            Xtr[c] = Xtr[c].astype('category')
            Xte[c] = Xte[c].astype('category').cat.set_categories(Xtr[c].cat.categories)
            Xtr[c] = Xtr[c].cat.codes.replace(-1, np.nan)
            Xte[c] = Xte[c].cat.codes.replace(-1, np.nan)
        Xtr = Xtr.replace([np.inf,-np.inf], np.nan).astype(float).fillna(0.0)
        Xte = Xte.replace([np.inf,-np.inf], np.nan).astype(float).fillna(0.0)
        return Xtr, Xte

EXCLUDE = {'전력소비량(kWh)','dt','날짜','시간','일시','num_date_time'}
if 'FEATS' not in globals():
    from pandas.api.types import is_numeric_dtype
    FEATS = [c for c in all_tr.columns if c not in EXCLUDE and (is_numeric_dtype(all_tr[c]) or str(all_tr[c].dtype)=='category')]
if 'cat_cols_use' not in globals():
    cat_cols_use = [c for c in ['건물번호','건물유형'] if c in FEATS]
if 'X_full_skl' not in globals() or 'X_te_skl' not in globals():
    X_full_skl, X_te_skl = to_sklearn_matrix(all_tr[FEATS], all_te[FEATS], cat_cols_use)

# ---- 1) 백오프 베이스라인
g1 = all_tr.groupby(['건물번호','hour','weekday'])['전력소비량(kWh)'].mean()
g2 = all_tr.groupby(['건물번호','hour'])['전력소비량(kWh)'].mean()
g3 = all_tr.groupby(['건물번호'])['전력소비량(kWh)'].mean()
g4 = float(all_tr['전력소비량(kWh)'].mean())

def map_baseline(df: pd.DataFrame):
    idx1 = df.set_index(['건물번호','hour','weekday']).index
    s = pd.Series(g1.reindex(idx1).to_numpy(), index=df.index)
    s = s.fillna(pd.Series(df.set_index(['건물번호','hour']).index.map(g2), index=df.index))
    s = s.fillna(df['건물번호'].map(g3))
    s = s.fillna(g4)
    return s.astype(float)

base_full = map_baseline(all_tr)
base_te   = map_baseline(all_te)

# ---- 2) 잔차 타깃 + 샘플가중치(결측은 가중치 0으로 무시)
y_true = all_tr['전력소비량(kWh)']
mask_ok = y_true.notna() & base_full.notna()
y_resid = (y_true - base_full).where(mask_ok, 0.0).astype(float)  # 결측이면 0으로 채우되,
w_full  = np.where(mask_ok, 1.0, 0.0)                             # 학습 영향은 0(weight)

print(f"[INFO] 결측으로 무시되는 학습행: {(w_full==0).sum()} / {len(w_full)}")

# ---- 3) HGB 하이퍼파라미터 (랜덤서치 베스트 있으면 반영)
best_hp = {'learning_rate':0.06,'max_leaf_nodes':31,'l2_regularization':0.0,'max_iter':3500}
if 'hgb_best' in globals() and isinstance(hgb_best.get('params', {}), dict):
    best_hp.update(hgb_best['params'])

hgb_final = HistGradientBoostingRegressor(
    early_stopping=False, random_state=42, **best_hp
)
hgb_final.fit(X_full_skl, y_resid, sample_weight=w_full)

# ---- 4) (옵션) VAL 체크
if 'is_val' in globals():
    va_idx = np.where(is_val.values)[0]
    y_va_chk    = y_true.iloc[va_idx].fillna(base_full.iloc[va_idx])  # 결측이면 베이스라인으로 대체
    base_va_chk = base_full.iloc[va_idx]
    pred_resid  = hgb_final.predict(X_full_skl.iloc[va_idx])
    yhat_va     = np.clip(base_va_chk.values + pred_resid, 0, None)
    print(f"[VAL] HGB RMSE={RMSE(y_va_chk, yhat_va):.3f}")

# ---- 5) 보정 적용 (있으면)
CALIB_MODE   = globals().get('CALIB_MODE', 'NONE')
CALIB_PARAMS = globals().get('CALIB_PARAMS', {})

resid_te = hgb_final.predict(X_te_skl)
if CALIB_MODE == "ISO":
    test_pred = base_te.values + CALIB_PARAMS["iso"].transform(resid_te)
elif CALIB_MODE == "AB":
    test_pred = base_te.values + CALIB_PARAMS["a"] * resid_te + CALIB_PARAMS["b"]
else:
    test_pred = base_te.values + resid_te

test_pred = np.clip(test_pred, 0, None)

# ---- 6) 저장
sub = read_csv_smart(SAMPLE_SUB)
sub['answer'] = pd.Series(test_pred, index=sub.index)
sub.to_csv(OUT_SUB, index=False, encoding='utf-8-sig')
print(f"[DONE] 정확도 회복 버전 저장 → {OUT_SUB}")



[INFO] 결측으로 무시되는 학습행: 0 / 204000
[VAL] HGB RMSE=99.666
[DONE] 정확도 회복 버전 저장 → C:\Users\user\Downloads\open (1)\submission_rmse_residual.csv


In [30]:
# 결과 저장
df.to_csv(r"C:\Users\user\Downloads\open (1)\merged_train.csv", index=False, encoding = 'cp949')