### 학습률 0.05

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import r2_score, mean_absolute_error

# 1. SMAPE 평가 지표 함수 정의 (기존과 동일)
def smape(gt, preds):
    """
    Symmetric Mean Absolute Percentage Error (SMAPE)를 계산합니다.
    """
    gt = np.array(gt)
    preds = np.array(preds)
    v = 2 * abs(preds - gt) / (abs(preds) + abs(gt) + 1e-8)
    score = np.mean(v) * 100
    return score

# 2. 데이터 로드 (파일 이름만 수정)
# '건물유형_번호' 컬럼이 포함된 최종 데이터 파일을 사용합니다.
file_path = '/content/drive/MyDrive/DSCover_Guide/modified_dataset_최최종.csv'
try:
    df = pd.read_csv(file_path)
    df['일시'] = pd.to_datetime(df['일시'])
except FileNotFoundError:
    print(f"오류: '{file_path}' 파일을 찾을 수 없습니다.")
    exit()

# 3. 건물 유형 번호 목록 추출 (수정된 부분)
building_type_numbers = sorted(df['건물유형_번호'].unique())

# 4. 학습된 모델과 평가 결과를 저장할 딕셔너리 준비 (기존과 동일)
trained_models = {}
evaluation_results = {}

# 5. for 반복문으로 건물 유형별 모델 학습
print("--- 건물 유형별 모델 학습 시작 ---")
for b_type_num in building_type_numbers:
    # NaN 값이 있을 경우 건너뜁니다.
    if pd.isna(b_type_num):
        continue

    # 건물 유형 번호를 정수로 변환하여 사용합니다.
    b_type_num = int(b_type_num)
    print(f"\n--- [건물유형 {b_type_num}] 모델 학습 ---")

    # (1) 데이터 필터링 (수정된 부분)
    # '건물유형_번호'를 기준으로 데이터를 필터링합니다.
    subset_df = df[df['건물유형_번호'] == b_type_num].copy()

    # (2) 피처(X)와 타겟(y) 설정 (수정된 부분)
    # 이제 '건물유형_번호' 컬럼도 학습 피처에서 제외해야 합니다.
    # 원핫인코딩 컬럼이 있다면 그것도 함께 제거합니다. (안전장치)
    building_type_cols = [col for col in df.columns if '건물유형_' in col]
    drop_cols = ['전력소비량(kWh)', '일시', '날짜', '건물유형','건물유형_번호'] + building_type_cols

    X = subset_df.drop(columns=drop_cols)
    y = subset_df['전력소비량(kWh)']

    # Lag/Rolling 피처 생성으로 인한 결측치가 있는 행 제거
    valid_indices = y.dropna().index
    X = X.loc[valid_indices]
    y = y.loc[valid_indices]
    subset_df = subset_df.loc[valid_indices]

    # (3) 훈련/테스트 데이터 분할 (시간 기반 - 기존과 동일)
    if X.empty:
        print(f"[건물유형 {b_type_num}] 유효한 학습 데이터가 없습니다.")
        continue

    X_train = X[subset_df['일시'] < '2024-08-18']
    X_test = X[subset_df['일시'] >= '2024-08-18']
    y_train = y[subset_df['일시'] < '2024-08-18']
    y_test = y[subset_df['일시'] >= '2024-08-18']

    if X_test.empty:
        print(f"[건물유형 {b_type_num}] 테스트 데이터가 없어 평가를 건너뜁니다.")
        continue

    # (4) XGBoost 모델 학습 (기존과 동일)
    model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, learning_rate=0.05,
                             max_depth=5, random_state=42, n_jobs=-1,
                             early_stopping_rounds=50)

    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

    # (5) 모델과 평가 결과 저장 (딕셔너리 키를 번호로 저장)
    trained_models[b_type_num] = model

    predictions = model.predict(X_test)
    r2 = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    smape_score = smape(y_test, predictions)

    evaluation_results[b_type_num] = {'R2': r2, 'MAE': mae, 'SMAPE': smape_score}

    print(f"[건물유형 {b_type_num}] 모델 R²: {r2:.4f}, MAE: {mae:.4f}, SMAPE: {smape_score:.4f}%")

print("\n--- 모든 모델 학습 완료 ---")
print("저장된 모델 목록 (건물유형 번호):", list(trained_models.keys()))

# 예시: 건물유형 2번 모델의 전체 평가 결과 확인
# print("\n건물유형 2번 모델 평가 결과:")
# print(evaluation_results[2])

--- 건물 유형별 모델 학습 시작 ---

--- [건물유형 1] 모델 학습 ---
[건물유형 1] 모델 R²: 0.9888, MAE: 170.0410, SMAPE: 5.8696%

--- [건물유형 2] 모델 학습 ---
[건물유형 2] 모델 R²: 0.9969, MAE: 34.8238, SMAPE: 4.5853%

--- [건물유형 3] 모델 학습 ---
[건물유형 3] 모델 R²: 0.9965, MAE: 47.3084, SMAPE: 2.2138%

--- [건물유형 4] 모델 학습 ---
[건물유형 4] 모델 R²: 0.9809, MAE: 79.1427, SMAPE: 4.4838%

--- [건물유형 5] 모델 학습 ---
[건물유형 5] 모델 R²: 0.9984, MAE: 69.7652, SMAPE: 2.3919%

--- [건물유형 6] 모델 학습 ---
[건물유형 6] 모델 R²: 0.9970, MAE: 94.3043, SMAPE: 4.1338%

--- [건물유형 7] 모델 학습 ---
[건물유형 7] 모델 R²: 0.9974, MAE: 112.6787, SMAPE: 1.9484%

--- [건물유형 8] 모델 학습 ---
[건물유형 8] 모델 R²: 0.9977, MAE: 79.6611, SMAPE: 4.0519%

--- [건물유형 9] 모델 학습 ---
[건물유형 9] 모델 R²: 0.9794, MAE: 80.1657, SMAPE: 3.5292%

--- [건물유형 10] 모델 학습 ---
[건물유형 10] 모델 R²: 0.9993, MAE: 63.1178, SMAPE: 0.7575%

--- 모든 모델 학습 완료 ---
저장된 모델 목록 (건물유형 번호): [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


### 학습률 0.1

In [6]:
for b_type_num in building_type_numbers:
    # NaN 값이 있을 경우 건너뜁니다.
    if pd.isna(b_type_num):
        continue

    # 건물 유형 번호를 정수로 변환하여 사용합니다.
    b_type_num = int(b_type_num)
    print(f"\n--- [건물유형 {b_type_num}] 모델 학습 ---")

    # (1) 데이터 필터링 (수정된 부분)
    # '건물유형_번호'를 기준으로 데이터를 필터링합니다.
    subset_df = df[df['건물유형_번호'] == b_type_num].copy()

    # (2) 피처(X)와 타겟(y) 설정 (수정된 부분)
    # 이제 '건물유형_번호' 컬럼도 학습 피처에서 제외해야 합니다.
    # 원핫인코딩 컬럼이 있다면 그것도 함께 제거합니다. (안전장치)
    building_type_cols = [col for col in df.columns if '건물유형_' in col]
    drop_cols = ['전력소비량(kWh)', '일시', '날짜', '건물유형','건물유형_번호'] + building_type_cols

    X = subset_df.drop(columns=drop_cols)
    y = subset_df['전력소비량(kWh)']

    # Lag/Rolling 피처 생성으로 인한 결측치가 있는 행 제거
    valid_indices = y.dropna().index
    X = X.loc[valid_indices]
    y = y.loc[valid_indices]
    subset_df = subset_df.loc[valid_indices]

    # (3) 훈련/테스트 데이터 분할 (시간 기반 - 기존과 동일)
    if X.empty:
        print(f"[건물유형 {b_type_num}] 유효한 학습 데이터가 없습니다.")
        continue

    X_train = X[subset_df['일시'] < '2024-08-18']
    X_test = X[subset_df['일시'] >= '2024-08-18']
    y_train = y[subset_df['일시'] < '2024-08-18']
    y_test = y[subset_df['일시'] >= '2024-08-18']

    if X_test.empty:
        print(f"[건물유형 {b_type_num}] 테스트 데이터가 없어 평가를 건너뜁니다.")
        continue

    # (4) XGBoost 모델 학습 (기존과 동일)
    model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, learning_rate=0.1,
                             max_depth=5, random_state=42, n_jobs=-1,
                             early_stopping_rounds=50)

    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

    # (5) 모델과 평가 결과 저장 (딕셔너리 키를 번호로 저장)
    trained_models[b_type_num] = model

    predictions = model.predict(X_test)
    r2 = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    smape_score = smape(y_test, predictions)

    evaluation_results[b_type_num] = {'R2': r2, 'MAE': mae, 'SMAPE': smape_score}

    print(f"[건물유형 {b_type_num}] 모델 R²: {r2:.4f}, MAE: {mae:.4f}, SMAPE: {smape_score:.4f}%")

print("\n--- 모든 모델 학습 완료 ---")
print("저장된 모델 목록 (건물유형 번호):", list(trained_models.keys()))

# 예시: 건물유형 2번 모델의 전체 평가 결과 확인
# print("\n건물유형 2번 모델 평가 결과:")
# print(evaluation_results[2])


--- [건물유형 1] 모델 학습 ---
[건물유형 1] 모델 R²: 0.9876, MAE: 176.4902, SMAPE: 6.1284%

--- [건물유형 2] 모델 학습 ---
[건물유형 2] 모델 R²: 0.9969, MAE: 35.5907, SMAPE: 4.7575%

--- [건물유형 3] 모델 학습 ---
[건물유형 3] 모델 R²: 0.9965, MAE: 46.1583, SMAPE: 2.1293%

--- [건물유형 4] 모델 학습 ---
[건물유형 4] 모델 R²: 0.9778, MAE: 81.3726, SMAPE: 4.5480%

--- [건물유형 5] 모델 학습 ---
[건물유형 5] 모델 R²: 0.9985, MAE: 68.7397, SMAPE: 2.3403%

--- [건물유형 6] 모델 학습 ---
[건물유형 6] 모델 R²: 0.9967, MAE: 93.6638, SMAPE: 3.9750%

--- [건물유형 7] 모델 학습 ---
[건물유형 7] 모델 R²: 0.9975, MAE: 112.3276, SMAPE: 1.9501%

--- [건물유형 8] 모델 학습 ---
[건물유형 8] 모델 R²: 0.9977, MAE: 78.6878, SMAPE: 3.9790%

--- [건물유형 9] 모델 학습 ---
[건물유형 9] 모델 R²: 0.9810, MAE: 75.2801, SMAPE: 3.2906%

--- [건물유형 10] 모델 학습 ---
[건물유형 10] 모델 R²: 0.9994, MAE: 61.5929, SMAPE: 0.7255%

--- 모든 모델 학습 완료 ---
저장된 모델 목록 (건물유형 번호): [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [27]:
# train_eval_submit_final.py
# End-to-end: build baseline + XGBoost + LightGBM, evaluate on Aug 1–24, create submissions.

import pandas as pd
import numpy as np
from pathlib import Path

# =========================
# Config
# =========================
PATH_TRAIN = "/content/drive/MyDrive/DSCover_Guide/modified_dataset_3.csv"
PATH_TEST  = "/content/drive/MyDrive/DSCover_Guide/test.csv"
PATH_BINFO = "/content/drive/MyDrive/DSCover_Guide/building_info.csv"
PATH_SUB   = "/content/drive/MyDrive/DSCover_Guide/sample_submission.csv"

VAL_START = pd.Timestamp("2024-08-01 00:00:00")
VAL_END   = pd.Timestamp("2024-08-24 23:00:00")
RANDOM_SEED = 42

# =========================
# Utils
# =========================
def parse_dt(s: pd.Series) -> pd.Series:
    try:
        return pd.to_datetime(s, format="%Y%m%d %H")
    except Exception:
        return pd.to_datetime(s, errors="coerce")

def make_unique_cols(cols):
    """Ensure feature_names are unique for XGBoost."""
    seen = {}
    out = []
    for c in cols:
        if c not in seen:
            seen[c] = 0
            out.append(c)
        else:
            seen[c] += 1
            out.append(f"{c}__dup{seen[c]}")
    return out

def rmse(y, p): return float(np.sqrt(np.mean((y - p) ** 2)))
def mae(y, p):  return float(np.mean(np.abs(y - p)))
def wape(y, p): return float(np.sum(np.abs(y - p)) / (np.sum(np.abs(y)) + 1e-9))

def same_hour_last_week_predict(df_target, df_all):
    """Baseline: predict y(t) = y(t-7d) per building, exact dt alignment."""
    ref = df_all[["건물번호","dt","전력소비량(kWh)"]].copy()
    ref = ref.rename(columns={"전력소비량(kWh)":"y_ref"})
    ref["dt_plus_7d"] = ref["dt"] + pd.Timedelta(days=7)
    merged = df_target.merge(
        ref[["건물번호","dt_plus_7d","y_ref"]],
        left_on=["건물번호","dt"],
        right_on=["건물번호","dt_plus_7d"],
        how="left",
    )
    return merged["y_ref"].values

# =========================
# Load
# =========================
train = pd.read_csv(PATH_TRAIN)
test  = pd.read_csv(PATH_TEST)
binfo = pd.read_csv(PATH_BINFO)
sub   = pd.read_csv(PATH_SUB)

# =========================
# Datetime & sort
# =========================
train["dt"] = parse_dt(train["일시"])
test["dt"]  = parse_dt(test["일시"])
train.sort_values(["건물번호","dt"], inplace=True)
test.sort_values(["건물번호","dt"], inplace=True)

# =========================
# Building info cleaning & merge
# =========================
b = binfo.copy()
for c in ["연면적(m2)","냉방면적(m2)","태양광용량(kW)","ESS저장용량(kWh)","PCS용량(kW)"]:
    if c in b.columns:
        b[c] = pd.to_numeric(b[c].replace("-", np.nan), errors="coerce")
for c in ["태양광용량(kW)","ESS저장용량(kWh)","PCS용량(kW)"]:
    if c in b.columns:
        b[c] = b[c].fillna(0.0)

b["cooling_area_ratio"] = (
    b.get("냉방면적(m2)", np.nan) / b.get("연면적(m2)", np.nan)
).replace([np.inf, -np.inf], np.nan)

b["has_solar"] = (b.get("태양광용량(kW)", 0).fillna(0) > 0).astype("int8")
b["has_ess"]   = (b.get("ESS저장용량(kWh)",0).fillna(0) > 0).astype("int8")
b["has_pcs"]   = (b.get("PCS용량(kW)", 0).fillna(0) > 0).astype("int8")

merge_cols = [c for c in [
    "건물번호","건물유형","연면적(m2)","냉방면적(m2)","태양광용량(kW)","ESS저장용량(kWh)","PCS용량(kW)",
    "cooling_area_ratio","has_solar","has_ess","has_pcs"
] if c in b.columns]

train = train.merge(b[merge_cols], on="건물번호", how="left")
test  = test.merge(b[merge_cols],  on="건물번호", how="left")

# =========================
# Drop train-only / leakage cols if present
# =========================
for c in ["일조(hr)","일사(MJ/m2)","전력소비량_to_log"]:
    if c in train.columns:
        train.drop(columns=c, inplace=True, errors="ignore")

# =========================
# Minimal feature engineering (shared)
# =========================
for df in (train, test):
    df["hour"] = df["dt"].dt.hour.astype("int16")
    df["dow"] = df["dt"].dt.dayofweek.astype("int16")
    df["is_weekend"] = (df["dow"] >= 5).astype("int8")
    df["is_daytime"] = ((df["hour"] >= 6) & (df["hour"] <= 18)).astype("int8")
    df["sin_hour"] = np.sin(2*np.pi*df["hour"]/24)
    df["cos_hour"] = np.cos(2*np.pi*df["hour"]/24)
    df["sin_dow"] = np.sin(2*np.pi*df["dow"]/7)
    df["cos_dow"] = np.cos(2*np.pi*df["dow"]/7)
    # weather transforms
    T = df["기온(°C)"]
    df["cdd18"] = (T - 18.0).clip(lower=0)
    df["hdd18"] = (18.0 - T).clip(lower=0)
    df["rain_flag"] = (df["강수량(mm)"] > 0).astype("int8")
    df["temp_x_hum"] = df["기온(°C)"] * df["습도(%)"]

# One-hot 건물유형 then drop original
if "건물유형" in train.columns:
    d_tr = pd.get_dummies(train["건물유형"], prefix="건물유형")
    d_te = pd.get_dummies(test["건물유형"], prefix="건물유형")
    train = pd.concat([train.drop(columns=["건물유형"]), d_tr], axis=1)
    test  = pd.concat([test.drop(columns=["건물유형"]),  d_te], axis=1)
    # align dummies
    for col in set(train.columns) - set(test.columns):
        if col.startswith("건물유형_"): test[col] = 0
    for col in set(test.columns) - set(train.columns):
        if col.startswith("건물유형_"): train[col] = 0

# =========================
# Validation split
# =========================
train_fit = train[train["dt"] < VAL_START].copy()
val       = train[(train["dt"] >= VAL_START) & (train["dt"] <= VAL_END)].copy()

# =========================
# Baseline
# =========================
val_pred_baseline = same_hour_last_week_predict(val, train)
results = [{
    "model": "Baseline_LastWeek",
    "rmse": rmse(val["전력소비량(kWh)"].values, val_pred_baseline),
    "mae":  mae(val["전력소비량(kWh)"].values, val_pred_baseline),
    "wape": wape(val["전력소비량(kWh)"].values, val_pred_baseline),
}]

# =========================
# Feature matrix (numeric intersection, unique names)
# =========================
exclude = {"전력소비량(kWh)","num_date_time","일시","dt"}
num_train = set(train_fit.select_dtypes(include=[np.number]).columns)
num_test  = set(test.select_dtypes(include=[np.number]).columns)
X_cols_raw = sorted(list((num_train & num_test) - exclude))
X_cols = make_unique_cols(X_cols_raw)

# rename to unique names across all splits
rename_map = dict(zip(X_cols_raw, X_cols))
train_fit = train_fit.rename(columns=rename_map)
val       = val.rename(columns=rename_map)
train     = train.rename(columns=rename_map)
test      = test.rename(columns=rename_map)

X_fit = train_fit[X_cols].astype(float)
y_fit = train_fit["전력소비량(kWh)"].astype(float).values
X_val = val[X_cols].astype(float)
y_val = val["전력소비량(kWh)"].astype(float).values
X_full = train[X_cols].astype(float)
y_full = train["전력소비량(kWh)"].astype(float).values
X_test = test[X_cols].astype(float)

# =========================
# XGBoost
# =========================
xgb_ok = False
try:
    from xgboost import XGBRegressor
    xgb = XGBRegressor(
        n_estimators=1200, learning_rate=0.05, max_depth=8,
        subsample=0.9, colsample_bytree=0.9, reg_lambda=1.0,
        n_jobs=4, tree_method="hist", random_state=RANDOM_SEED
    )
    xgb.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=False)
    pred_val_xgb = np.clip(xgb.predict(X_val), 0, None)
    results.append({
        "model":"XGBoost",
        "rmse": rmse(y_val, pred_val_xgb),
        "mae":  mae(y_val, pred_val_xgb),
        "wape": wape(y_val, pred_val_xgb),
    })
    xgb_ok = True
except Exception as e:
    print("XGBoost failed:", e)

# =========================
# LightGBM (sklearn API, robust)
# =========================
lgb_ok = False
best_iters = None
try:
    from lightgbm import LGBMRegressor
    lgbm = LGBMRegressor(
        objective="regression",
        learning_rate=0.05,
        num_leaves=127,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        random_state=RANDOM_SEED,
        n_estimators=5000,         # cap; ES will stop
    )
    lgbm.fit(
        X_fit, y_fit,
        eval_set=[(X_val, y_val)],
        eval_metric="rmse",
        early_stopping_rounds=200,
        verbose=False,
    )
    pred_val_lgb = np.clip(lgbm.predict(X_val), 0, None)
    results.append({
        "model":"LightGBM",
        "rmse": rmse(y_val, pred_val_lgb),
        "mae":  mae(y_val, pred_val_lgb),
        "wape": wape(y_val, pred_val_lgb),
    })
    best_iters = getattr(lgbm, "best_iteration_", None)
    lgb_ok = True
except Exception as e:
    print("LightGBM failed:", e)

# =========================
# Select best & train full
# =========================
res_df = pd.DataFrame(results).sort_values("rmse").reset_index(drop=True)
best = res_df.iloc[0]["model"]
print("\nValidation results:\n", res_df)

# baseline for test
test_pred_baseline = np.clip(same_hour_last_week_predict(test, train), 0, None)

if best == "XGBoost" and xgb_ok:
    xgb_full = XGBRegressor(
        n_estimators=1200, learning_rate=0.05, max_depth=8,
        subsample=0.9, colsample_bytree=0.9, reg_lambda=1.0,
        n_jobs=4, tree_method="hist", random_state=RANDOM_SEED
    )
    xgb_full.fit(X_full, y_full, verbose=False)
    test_pred_model = np.clip(xgb_full.predict(X_test), 0, None)
elif best == "LightGBM" and lgb_ok:
    final_estimators = int(best_iters) if (best_iters is not None and best_iters > 0) else 1200
    lgbm_full = LGBMRegressor(
        objective="regression",
        learning_rate=0.05,
        num_leaves=127,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        random_state=RANDOM_SEED,
        n_estimators=final_estimators,
    )
    lgbm_full.fit(X_full, y_full, verbose=False)
    test_pred_model = np.clip(lgbm_full.predict(X_test), 0, None)
else:
    best = "Baseline_LastWeek"
    test_pred_model = test_pred_baseline.copy()

# Non-negative clamp (already clipped) — keep anyway for safety
test_pred_model = np.clip(test_pred_model, 0, None)

# =========================
# Save outputs
# =========================
out_dir = Path(".")
res_df.to_csv(out_dir / "val_metrics.csv", index=False)

sub_base = sub.copy()
sub_base["answer"] = test_pred_baseline.astype(float)
sub_base.to_csv(out_dir / "submission_baseline.csv", index=False)

sub_model = sub.copy()
sub_model["answer"] = test_pred_model.astype(float)
sub_model.to_csv(out_dir / "submission_model.csv", index=False)

print("\nSaved:")
print(" - val_metrics.csv")
print(" - submission_baseline.csv")
print(" - submission_model.csv")
print(f"\nBest model used for submission_model.csv: {best}")
print(f"Features used ({len(X_cols)}): {X_cols[:12]} ...")


LightGBM failed: LGBMRegressor.fit() got an unexpected keyword argument 'early_stopping_rounds'

Validation results:
                model        rmse         mae      wape
0            XGBoost  451.321055  242.726658  0.066609
1  Baseline_LastWeek  486.620855  207.137190  0.056843

Saved:
 - val_metrics.csv
 - submission_baseline.csv
 - submission_model.csv

Best model used for submission_model.csv: XGBoost
Features used (21): ['cdd18', 'cooling_area_ratio', 'cos_dow', 'cos_hour', 'dow', 'has_ess', 'has_pcs', 'has_solar', 'hdd18', 'hour', 'is_daytime', 'is_weekend'] ...


In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
