In [3]:
import numpy as np
import pandas as pd
import plotly.express as px
import holidays
from scipy.fftpack import fft#푸리에 변환을 위한 코드입니다.
from scipy.stats import boxcox#박스콕스 변환을 위한 코드임
from sklearn.preprocessing import MinMaxScaler
# ===== LightGBM 머신러닝 파이프라인 =====
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import plotly.graph_objects as go
import plotly.express as px
import optuna
from optuna.samplers import TPESampler

#기타
import warnings
warnings.filterwarnings('ignore')

In [4]:
df_train = pd.read_csv("train_heat.csv")
df_test = pd.read_csv("test_heat.csv")
#열이름빼기
df_train.columns = df_train.columns.str.replace('train_heat.', '', regex=False)
#Unnamed:0제거
df_train = df_train.drop(columns=["Unnamed: 0"])
#test데이터 열이름 바꾸기
df_test.columns = [
    "tm", "branch_id", "ta", "wd", "ws",
    "rn_day", "rn_hr1", "hm", "si", "ta_chi","heat_demand"]



def calculate_summer_apparent_temp(ta, hm):
    """여름철 체감온도 계산"""
    try:
        tw = ta * np.arctan(0.151977 * np.sqrt(hm + 8.313659)) \
             + np.arctan(ta + hm) \
             - np.arctan(hm - 1.676331) \
             + 0.00391838 * hm**1.5 * np.arctan(0.023101 * hm) \
             - 4.686035
        return -0.2442 + 0.55399 * tw + 0.45535 * ta - 0.0022 * tw**2 + 0.00278 * tw * ta + 3.0
    except:
        return np.nan

def calculate_winter_apparent_temp(ta, ws):
    """겨울철 체감온도 계산"""
    try:
        v = ws * 3.6  # m/s → km/h
        return 13.12 + 0.6215 * ta - 11.37 * v**0.16 + 0.3965 * ta * v**0.16
    except:
        return np.nan

def add_apparent_temp_features(df):
    df['month'] = df['tm'].dt.month
    df['apparent_temp'] = df.apply(lambda row:
        calculate_summer_apparent_temp(row['ta'], row['hm']) if 5 <= row['month'] <= 9
        else calculate_winter_apparent_temp(row['ta'], row['ws']),
        axis=1
    )
    return df

print("📊 데이터 로드 중...")
df = pd.read_csv("train_heat.csv")



def preprocess_weather_data(df):
    # 날짜 변환
    df['tm'] = pd.to_datetime(df['tm'], format='%Y%m%d%H')
    # 1. si: 08~18시가 아닐 때 -99는 0으로
    mask_outside_8_to_18 = (~df['tm'].dt.hour.between(8, 18)) & (df['si'] == -99)
    df.loc[mask_outside_8_to_18, 'si'] = 0

    # 2. wd에서 9.9는 NaN으로
    df['wd'] = df['wd'].replace(9.9, np.nan)

    # 3. -99 처리
    df.replace(-99, np.nan, inplace=True)

    # 4. 브랜치별 선형보간 #관련해서 연속된거는 일단 앞쪽꺼로 채움
    df = df.sort_values(['branch_id', 'tm'])
    df = df.groupby('branch_id').apply(lambda g: g.interpolate(method='linear', limit_direction='both')).reset_index(drop=True)
    df = df.fillna(method='ffill').fillna(method='ffill')
    # 📌 파생 변수 생성
    df['year'] = df['tm'].dt.year
    df['month'] = df['tm'].dt.month
    df['hour'] = df['tm'].dt.hour
    df['date'] = df['tm'].dt.date
    df['weekday'] = df['tm'].dt.weekday
    df['is_weekend'] = df['weekday'].isin([5,6]).astype(int)

    # 🇰🇷 한국 공휴일
    kr_holidays = holidays.KR()
    df['is_holiday'] = df['tm'].dt.date.apply(lambda x: int(x in kr_holidays))

    # 🕒 시간 지연
    for lag in [1, 2, 3]:
        df[f'ta_lag_{lag}'] = df.groupby('branch_id')['ta'].shift(lag)
        df[f'ta_lag_{lag}'] = df.groupby('branch_id')[f'ta_lag_{lag}'].transform(
        lambda x: x.fillna(method='bfill'))
    # 🔥 HDD / CDD
    df['HDD18'] = np.maximum(0, 18 - df['ta'])
    df['CDD18'] = np.maximum(0, df['ta'] - 18)
    df['HDD20'] = np.maximum(0, 20 - df['ta'])
    df['CDD20'] = np.maximum(0, df['ta'] - 20)

    #직접만든 체감온도
    df = add_apparent_temp_features(df)


    # 지점별 온도 편차
    branch_mean = df.groupby('branch_id')['ta'].transform('mean')
    df['branch_temp_abs_deviation'] = np.abs(df['ta'] - branch_mean)



    # 이동 평균 (3시간 단위 최대 24시간 = 8개)
    for n in [3, 6, 9, 12, 15, 18, 21, 24]:
        df[f'ta_3h_avg_{n}'] = df.groupby('branch_id')['ta'].transform(lambda x: x.rolling(n, min_periods=1).mean())

    # 불쾌지수
    df['DCI'] = 0.81 * df['ta'] + 0.01 * df['hm'] * (0.99 * df['ta'] - 14.3) + 46.3

    # 풍속 냉지수 (wchi)
    ws_kmh = df['ws'] * 3.6  # m/s -> km/h 변환
    df['wchi'] = 13.12 + 0.6215 * df['ta'] - 11.37 * ws_kmh**0.16 + 0.3965 * df['ta'] * ws_kmh**0.16


    # 실효온도
    df['e'] = (df['hm'] / 100) * 6.105 * np.exp((17.27 * df['ta']) / (237.7 + df['ta']))
    df['atemphi'] = df['ta'] + 0.33 * df['e'] - 0.70 * df['ws'] - 4.00

    # 주기성 인코딩
    df['dayofyear'] = df['tm'].dt.dayofyear
    df['dayofmonth'] = df['tm'].dt.day
    df['weekofyear'] = df['tm'].dt.isocalendar().week.astype(int)

    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['dayofyear_sin'] = np.sin(2 * np.pi * df['dayofyear'] / 365)
    df['dayofyear_cos'] = np.cos(2 * np.pi * df['dayofyear'] / 365)
    df['weekday_sin'] = np.sin(2 * np.pi * df['weekday'] / 7)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)

    # 하루 5구간
    def time_slot(h): return int(h // 5)
    df['hour_slot_5'] = df['hour'].apply(time_slot)

    def compute_fft_feature(series, n=10):
        fft_vals = np.abs(fft(series.fillna(0)))
        # 인덱스 이름을 명확히 지정
        s = pd.Series(fft_vals[:n], index=[f'fft_{i}' for i in range(n)])
        return s

    def compute_fft_feature(series, n=10):
        fft_vals = np.abs(fft(series.fillna(0)))
        s = pd.Series(fft_vals[:n], index=pd.Index([f'fft_{i}' for i in range(n)], name='fft_idx'))
        return s

    fft_cols = ['ta', 'hm', 'ws', 'ta_chi', 'apparent_temp']
    fft_features = []
    branch_ids = df['branch_id'].unique()
    fft_feature_dict = {bid: {} for bid in branch_ids}
    for col in fft_cols:
        if col not in df.columns:
            continue
        for branch_id in branch_ids:
            arr = df.loc[df['branch_id'] == branch_id, col].fillna(0).values
            fft_vals = np.abs(fft(arr))[:10]
            for i, val in enumerate(fft_vals):
                fft_feature_dict[branch_id][f'Nph_{col}_{i}'] = val
                
    # DataFrame으로 변환
    fft_features_df = pd.DataFrame.from_dict(fft_feature_dict, orient='index')
    # 원본 df와 merge
    df = df.merge(fft_features_df, left_on='branch_id', right_index=True, how='left')
    # 기온 차분
    df['ta_diff_6h'] = df.groupby('branch_id')['ta'].diff(6).bfill()
    df['ta_diff_12h'] = df.groupby('branch_id')['ta'].diff(12).bfill()
    df['ta_diff_24h'] = df.groupby('branch_id')['ta'].diff(24).bfill()

    # 일교차
    df['day_ta_max'] = df.groupby(['branch_id', df['tm'].dt.date])['ta'].transform('max')
    df['day_ta_min'] = df.groupby(['branch_id', df['tm'].dt.date])['ta'].transform('min')
    df['daily_range'] = df['day_ta_max'] - df['day_ta_min']

    # 일교차 변화량
    df['daily_range_shift'] = df.groupby('branch_id')['daily_range'].shift(1).bfill()

    # 피크타임1
    df['peak_time1'] = 0
    df.loc[(df['hour'] >= 0) & (df['hour'] <= 6), 'peak_time1'] = 1
    df.loc[(df['hour'] > 6) & (df['hour'] <= 11), 'peak_time1'] = 2
    df.loc[(df['hour'] > 11) & (df['hour'] <= 18), 'peak_time1'] = 3
    df.loc[(df['hour'] > 18) & (df['hour'] <= 23), 'peak_time1'] = 4

    # 피크타임2
    df['peak_time2'] = 0
    df.loc[(df['hour'] >= 2) & (df['hour'] <= 10), 'peak_time2'] = 1


    # heating season
    df['heating_season'] = df['month'].isin([10,11,12,1, 2, 3,4]).astype(int)

    # 온도 범주화
    df['temp_category20'] = pd.cut(df['ta'], bins=[-np.inf, 20, np.inf], labels=['low', 'high'])
    df['temp_category18'] = pd.cut(df['ta'], bins=[-np.inf, 18, np.inf], labels=['low', 'high'])
    df['temp_category16'] = pd.cut(df['ta'], bins=[-np.inf, 16, np.inf], labels=['low', 'high'])

    # 오전/오후
    df['afternoon'] = (df['hour'] >= 12).astype(int)

    # 계절
    def get_season(month):
        return {
            12: 'winter', 1: 'winter', 2: 'winter',
            3: 'spring', 4: 'spring', 5: 'spring',
            6: 'summer', 7: 'summer', 8: 'summer',
            9: 'fall', 10: 'fall', 11: 'fall'
        }.get(month, 'unknown')
    df['season'] = df['month'].apply(get_season)

    # 한파 주의보/경보
    df['cold_watch'] = (df['ta'] <= -12).astype(int)  # 주의보
    df['cold_warning'] = (df['ta'] <= -15).astype(int)  # 경보

    # 풍속 고려 체감온도 (wind chill)
    df['wind_chill'] = 13.12 + 0.6215 * df['ta'] - 11.37 * df['ws']**0.16 + 0.3965 * df['ta'] * df['ws']**0.16

    # 변환 대상 변수
    col = 'ta'

    df['ta_boxcox'] = np.nan
    df['ta_boxcox_lambda'] = np.nan
    df['ta_boxcox_shift'] = np.nan  # shift 값도 저장

    for branch, group in df.groupby('branch_id'):
        col = 'ta'
        min_val = group[col].min()
        if min_val <= 0:
            shift = abs(min_val) + 1e-4
        else:
            shift = 0
        shifted = group[col] + shift
        shifted = shifted.dropna()
        if shifted.nunique() > 1 and len(shifted) >= 2:
            transformed, fitted_lambda = boxcox(shifted)
            df.loc[shifted.index, 'ta_boxcox'] = transformed
            df.loc[shifted.index, 'ta_boxcox_lambda'] = fitted_lambda
            df.loc[shifted.index, 'ta_boxcox_shift'] = shift
        else:
            df.loc[group.index, 'ta_boxcox'] = np.nan
            df.loc[group.index, 'ta_boxcox_lambda'] = np.nan
            df.loc[group.index, 'ta_boxcox_shift'] = shift
    df = df.drop(columns=['month','hour','date'])



    return df
#상호작용 처리못함
#군집화된 전처리 못함


#정규화 일단 min max +원핫인코딩
def scale_encode(df):
    cat_cols = [
         'peak_time1', 'peak_time2', 'heating_season',
        'temp_category16', 'temp_category18', 'temp_category20',
        'afternoon', 'season'
    ]

    # 범주형 변수 category화
    for col in cat_cols:
        if col in df.columns:
            df[col] = df[col].astype('category')

    # 원-핫 인코딩
    df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

    # 연속형 변수만 추출 (타겟, 날짜 등 제외)
    exclude_cols = ['heat_demand', 'peak_time1', 'peak_time2', 'heating_season',
        'temp_category16', 'temp_category18', 'temp_category20','afternoon', 'season']
    num_cols = [col for col in df.columns
                if (df[col].dtype in [np.float64, np.int64]) and (col not in exclude_cols)]

    # MinMaxScaler 적용
    scaler = MinMaxScaler()
    df[num_cols] = scaler.fit_transform(df[num_cols])


    return df



df_train = preprocess_weather_data(df_train)
df_test = preprocess_weather_data(df_test)
df_train = scale_encode(df_train)
df_test = scale_encode(df_test)

📊 데이터 로드 중...


In [5]:
df=df_train.copy()
df_train = df[df['year'] <= 2022]
df_test = df[df['year'] >= 2023]
df_train = df_train.drop(columns=['year'])
df_test = df_test.drop(columns=['year'])
df_train = df_train.set_index('tm')
df_test = df_test.set_index('tm')
df_train = df_train.sort_index()
df_test = df_test.sort_index()



In [6]:
def run_model_pipeline(df_train, df_test, target_col='heat_demand'):
    # ===== 1. 데이터 준비 =====
    features = [col for col in df_train.columns if col != target_col]
    X_trainval = df_train[features]
    y_trainval = df_train[target_col]
    X_test = df_test[features]
    y_test = df_test[target_col]

    # ===== 2. 데이터 분할 =====
    print("🔄 데이터 분할 중...")
    X_train, X_val, y_train, y_val = train_test_split(
        X_trainval, y_trainval, test_size=0.2, shuffle=False
    )
    print(f"✅ 데이터 분할 완료: Train({len(X_train)}) | Val({len(X_val)}) | Test({len(X_test)})")

    # ===== 3. 기본 모델 학습 =====
    print("\n🚀 기본 LightGBM 모델 학습 중...")
    baseline_params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'learning_rate': 0.05,
        'num_leaves': 31,
        'n_estimators': 1000,
        'random_state': 42,
        'n_jobs': -1,
        'colsample_bytree': None,
        'subsample': None,
        'subsample_freq': None,
        'min_child_samples': None
    }

    baseline_model = lgb.LGBMRegressor(**baseline_params)
    baseline_model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
    )

    baseline_val_pred = baseline_model.predict(X_val)
    baseline_test_pred = baseline_model.predict(X_test)
    baseline_val_rmse = np.sqrt(mean_squared_error(y_val, baseline_val_pred))
    baseline_test_rmse = np.sqrt(mean_squared_error(y_test, baseline_test_pred))

    print(f"📈 기본 모델 성능:")
    print(f"  - Validation RMSE: {baseline_val_rmse:.4f}")
    print(f"  - Test RMSE: {baseline_test_rmse:.4f}")

    # ===== 4. 베이지안 최적화 =====
    print("\n🔍 베이지안 최적화로 하이퍼파라미터 튜닝 시작...")

    def objective(trial):
        params = {
            'objective': 'regression',
            'metric': 'rmse',
            'boosting_type': 'gbdt',
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 10, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 200),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
            'n_estimators': 1000,
            'random_state': 42,
            'n_jobs': -1,
            'colsample_bytree': None,
            'subsample': None,
            'subsample_freq': None,
            'min_child_samples': None
        }
        model = lgb.LGBMRegressor(**params)
        model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)])
        val_pred = model.predict(X_val)
        return np.sqrt(mean_squared_error(y_val, val_pred))

    study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42))
    study.optimize(objective, n_trials=50, show_progress_bar=True)

    print(f"✅ 최적화 완료!")
    print(f"🏆 최적 RMSE: {study.best_value:.4f}")
    print("📊 최적 파라미터:")
    for key, value in study.best_params.items():
        print(f"  {key}: {value}")

    # ===== 5. 최적 모델 학습 =====
    print("\n🚀 최적 파라미터로 최종 모델 학습 중...")
    best_params = study.best_params.copy()
    best_params.update({
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'n_estimators': 1000,
        'random_state': 42,
        'n_jobs': -1,
        'colsample_bytree': None,
        'subsample': None,
        'subsample_freq': None,
        'min_child_samples': None
    })
    optimized_model = lgb.LGBMRegressor(**best_params)
    optimized_model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
    )

    val_pred = optimized_model.predict(X_val)
    test_pred = optimized_model.predict(X_test)
    val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))

    print(f"\n📈 최적화된 모델 성능:")
    print(f"  - Validation RMSE: {val_rmse:.4f}")
    print(f"  - Test RMSE: {test_rmse:.4f}")

    print(f"\n📊 성능 개선:")
    print(f"  - Validation: {baseline_val_rmse:.4f} → {val_rmse:.4f} (개선: {baseline_val_rmse - val_rmse:.4f})")
    print(f"  - Test: {baseline_test_rmse:.4f} → {test_rmse:.4f} (개선: {baseline_test_rmse - test_rmse:.4f})")

    return {
        'val_rmse': val_rmse,
        'test_rmse': test_rmse
    }

In [None]:
branch_rmse_results = {}

branch_ids = df_train['branch_id'].unique()

for branch in branch_ids:
    train_branch = df_train[df_train['branch_id'] == branch].copy()
    test_branch = df_test[df_test['branch_id'] == branch].copy()
    
    # branch_id는 모델에 불필요하면 제거
    train_branch = train_branch.drop(columns=['branch_id'])
    test_branch = test_branch.drop(columns=['branch_id'])
    
    target_col = 'heat_demand'
    
    results = run_model_pipeline(train_branch, test_branch, target_col)
    
    branch_rmse_results[branch] = {
        'val_rmse': results['val_rmse'],
        'test_rmse': results['test_rmse']
    }

# 지점별 성능 요약 출력
print("\n📊 지점별 모델 성능 요약 (RMSE):")
val_rmse_list = []
test_rmse_list = []
for branch, scores in branch_rmse_results.items():
    print(f"📍 {branch} | Val RMSE: {scores['val_rmse']:.4f} | Test RMSE: {scores['test_rmse']:.4f}")
    val_rmse_list.append(scores['val_rmse'])
    test_rmse_list.append(scores['test_rmse'])

# 전체 평균 RMSE 출력
mean_val_rmse = sum(val_rmse_list) / len(val_rmse_list)
mean_test_rmse = sum(test_rmse_list) / len(test_rmse_list)
print("\n📈 전체 지점 평균 RMSE")
print(f"  - Validation 평균 RMSE: {mean_val_rmse:.4f}")
print(f"  - Test 평균 RMSE: {mean_test_rmse:.4f}")

🔄 데이터 분할 중...
✅ 데이터 분할 완료: Train(14015) | Val(3504) | Test(8760)

🚀 기본 LightGBM 모델 학습 중...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006631 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9581
[LightGBM] [Info] Number of data points in the train set: 14015, number of used features: 65
[LightGBM] [Info] Start training from score 111.733036
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 17.174
[200]	valid_0's rmse: 16.9639
Early stopping, best iteration is:
[195]	valid_0's rmse: 16.9593


[I 2025-06-17 17:46:46,301] A new study created in memory with name: no-name-0f4e0d4e-6c3d-4ecf-8051-8601a85e0478


📈 기본 모델 성능:
  - Validation RMSE: 16.9593
  - Test RMSE: 17.6619

🔍 베이지안 최적화로 하이퍼파라미터 튜닝 시작...


  0%|          | 0/50 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003687 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9577
[LightGBM] [Info] Number of data points in the train set: 14015, number of used features: 63
[LightGBM] [Info] Start training from score 111.733036
Training until validation scores don't improve for 50 rounds


Best trial: 0. Best value: 16.3258:   2%|▏         | 1/50 [00:00<00:45,  1.08it/s]

Early stopping, best iteration is:
[224]	valid_0's rmse: 16.3258
[I 2025-06-17 17:46:47,223] Trial 0 finished with value: 16.32576742831643 and parameters: {'learning_rate': 0.03574712922600244, 'num_leaves': 286, 'max_depth': 12, 'min_data_in_leaf': 124, 'feature_fraction': 0.4936111842654619, 'bagging_fraction': 0.49359671220172163, 'bagging_freq': 1, 'reg_alpha': 8.661761457749352, 'reg_lambda': 6.011150117432088}. Best is trial 0 with value: 16.32576742831643.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004346 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9577
[LightGBM] [Info] Number of data points in the train set: 14015, number of used features: 63
[LightGBM] [Info] Start training from score 111.733036
Training until validation scores don't improve for 50 rounds


Best trial: 0. Best value: 16.3258:   4%|▍         | 2/50 [00:01<00:30,  1.59it/s]

Early stopping, best iteration is:
[146]	valid_0's rmse: 17.0677
[I 2025-06-17 17:46:47,647] Trial 1 finished with value: 17.06772287548988 and parameters: {'learning_rate': 0.11114989443094977, 'num_leaves': 15, 'max_depth': 15, 'min_data_in_leaf': 168, 'feature_fraction': 0.5274034664069657, 'bagging_fraction': 0.5090949803242604, 'bagging_freq': 2, 'reg_alpha': 3.0424224295953772, 'reg_lambda': 5.247564316322379}. Best is trial 0 with value: 16.32576742831643.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004563 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9581
[LightGBM] [Info] Number of data points in the train set: 14015, number of used features: 65
[LightGBM] [Info] Start training from score 111.733036
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[117]	valid_0's rmse: 16.799


Best trial: 0. Best value: 16.3258:   6%|▌         | 3/50 [00:02<00:38,  1.22it/s]

[I 2025-06-17 17:46:48,693] Trial 2 finished with value: 16.798953453037402 and parameters: {'learning_rate': 0.04345454109729477, 'num_leaves': 94, 'max_depth': 10, 'min_data_in_leaf': 36, 'feature_fraction': 0.5752867891211308, 'bagging_fraction': 0.619817105976215, 'bagging_freq': 4, 'reg_alpha': 7.851759613930136, 'reg_lambda': 1.9967378215835974}. Best is trial 0 with value: 16.32576742831643.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003344 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9577
[LightGBM] [Info] Number of data points in the train set: 14015, number of used features: 63
[LightGBM] [Info] Start training from score 111.733036
Training until validation scores don't improve for 50 rounds


Best trial: 3. Best value: 16.3155:   8%|▊         | 4/50 [00:02<00:29,  1.56it/s]

Early stopping, best iteration is:
[210]	valid_0's rmse: 16.3155
[I 2025-06-17 17:46:49,063] Trial 3 finished with value: 16.31546142982132 and parameters: {'learning_rate': 0.05748924681991978, 'num_leaves': 182, 'max_depth': 3, 'min_data_in_leaf': 126, 'feature_fraction': 0.502314474212375, 'bagging_fraction': 0.43903095579116774, 'bagging_freq': 7, 'reg_alpha': 9.656320330745594, 'reg_lambda': 8.08397348116461}. Best is trial 3 with value: 16.31546142982132.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003604 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9579
[LightGBM] [Info] Number of data points in the train set: 14015, number of used features: 64
[LightGBM] [Info] Start training from score 111.733036
Training until validation scores don't improve for 50 rounds


Best trial: 3. Best value: 16.3155:  10%|█         | 5/50 [00:03<00:33,  1.33it/s]

Early stopping, best iteration is:
[236]	valid_0's rmse: 16.6706
[I 2025-06-17 17:46:50,004] Trial 4 finished with value: 16.670573713392482 and parameters: {'learning_rate': 0.028180680291847244, 'num_leaves': 38, 'max_depth': 11, 'min_data_in_leaf': 94, 'feature_fraction': 0.47322294090686734, 'bagging_fraction': 0.6971061460667621, 'bagging_freq': 1, 'reg_alpha': 9.093204020787821, 'reg_lambda': 2.587799816000169}. Best is trial 3 with value: 16.31546142982132.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003856 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9577
[LightGBM] [Info] Number of data points in the train set: 14015, number of used features: 63
[LightGBM] [Info] Start training from score 111.733036
Training until validation scores don't improve for 50 rounds


Best trial: 3. Best value: 16.3155:  12%|█▏        | 6/50 [00:04<00:29,  1.49it/s]

Early stopping, best iteration is:
[90]	valid_0's rmse: 17.1519
[I 2025-06-17 17:46:50,517] Trial 5 finished with value: 17.151916701669677 and parameters: {'learning_rate': 0.09519754482692679, 'num_leaves': 100, 'max_depth': 9, 'min_data_in_leaf': 114, 'feature_fraction': 0.5109126733153162, 'bagging_fraction': 0.9817507766587351, 'bagging_freq': 6, 'reg_alpha': 9.394989415641891, 'reg_lambda': 8.948273504276488}. Best is trial 3 with value: 16.31546142982132.


Best trial: 3. Best value: 16.3155:  14%|█▍        | 7/50 [00:04<00:23,  1.83it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002625 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9579
[LightGBM] [Info] Number of data points in the train set: 14015, number of used features: 64
[LightGBM] [Info] Start training from score 111.733036
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[94]	valid_0's rmse: 16.3713
[I 2025-06-17 17:46:50,815] Trial 6 finished with value: 16.371311590509436 and parameters: {'learning_rate': 0.0764136186923332, 'num_leaves': 278, 'max_depth': 4, 'min_data_in_leaf': 47, 'feature_fraction': 0.4271363733463229, 'bagging_fraction': 0.5951981984579586, 'bagging_freq': 3, 'reg_alpha': 2.713490317738959, 'reg_lambda': 8.287375091519294}. Best is trial 3 with value: 16.31546142982132.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003999 seconds.
You can set `force_col_wis

Best trial: 3. Best value: 16.3155:  16%|█▌        | 8/50 [00:05<00:25,  1.62it/s]

[I 2025-06-17 17:46:51,584] Trial 7 finished with value: 17.354979231896866 and parameters: {'learning_rate': 0.03364867144187954, 'num_leaves': 91, 'max_depth': 10, 'min_data_in_leaf': 36, 'feature_fraction': 0.8813181884524238, 'bagging_fraction': 0.44473038620786254, 'bagging_freq': 7, 'reg_alpha': 7.722447692966574, 'reg_lambda': 1.987156815341724}. Best is trial 3 with value: 16.31546142982132.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004766 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9577
[LightGBM] [Info] Number of data points in the train set: 14015, number of used features: 63
[LightGBM] [Info] Start training from score 111.733036
Training until validation scores don't improve for 50 rounds


Best trial: 3. Best value: 16.3155:  18%|█▊        | 9/50 [00:06<00:38,  1.06it/s]

Early stopping, best iteration is:
[687]	valid_0's rmse: 16.6389
[I 2025-06-17 17:46:53,255] Trial 8 finished with value: 16.63893151256421 and parameters: {'learning_rate': 0.010189592979395137, 'num_leaves': 247, 'max_depth': 12, 'min_data_in_leaf': 149, 'feature_fraction': 0.8627622080115674, 'bagging_fraction': 0.44442679104045424, 'bagging_freq': 3, 'reg_alpha': 1.1586905952512971, 'reg_lambda': 8.631034258755935}. Best is trial 3 with value: 16.31546142982132.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004693 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9579
[LightGBM] [Info] Number of data points in the train set: 14015, number of used features: 64
[LightGBM] [Info] Start training from score 111.733036
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[149]	valid_0's rmse: 16.5173


Best trial: 3. Best value: 16.3155:  20%|██        | 10/50 [00:07<00:30,  1.31it/s]

[I 2025-06-17 17:46:53,615] Trial 9 finished with value: 16.517277983655553 and parameters: {'learning_rate': 0.08330803890301997, 'num_leaves': 106, 'max_depth': 3, 'min_data_in_leaf': 69, 'feature_fraction': 0.5951099932160482, 'bagging_fraction': 0.8377637070028385, 'bagging_freq': 5, 'reg_alpha': 8.872127425763265, 'reg_lambda': 4.722149251619493}. Best is trial 3 with value: 16.31546142982132.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002734 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9577
[LightGBM] [Info] Number of data points in the train set: 14015, number of used features: 63
[LightGBM] [Info] Start training from score 111.733036
Training until validation scores don't improve for 50 rounds


Best trial: 3. Best value: 16.3155:  22%|██▏       | 11/50 [00:07<00:24,  1.58it/s]

Early stopping, best iteration is:
[35]	valid_0's rmse: 16.801
[I 2025-06-17 17:46:53,940] Trial 10 finished with value: 16.800960059233926 and parameters: {'learning_rate': 0.24893231508461813, 'num_leaves': 199, 'max_depth': 6, 'min_data_in_leaf': 188, 'feature_fraction': 0.7114864019556889, 'bagging_fraction': 0.8141482441414871, 'bagging_freq': 7, 'reg_alpha': 5.9991563681257585, 'reg_lambda': 6.579621849710365}. Best is trial 3 with value: 16.31546142982132.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002315 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9577
[LightGBM] [Info] Number of data points in the train set: 14015, number of used features: 63
[LightGBM] [Info] Start training from score 111.733036
Training until validation scores don't improve for 50 rounds


Best trial: 3. Best value: 16.3155:  24%|██▍       | 12/50 [00:09<00:32,  1.16it/s]

Early stopping, best iteration is:
[437]	valid_0's rmse: 16.4084
[I 2025-06-17 17:46:55,331] Trial 11 finished with value: 16.408393302985893 and parameters: {'learning_rate': 0.01841029200937388, 'num_leaves': 192, 'max_depth': 14, 'min_data_in_leaf': 123, 'feature_fraction': 0.7057321027355615, 'bagging_fraction': 0.4120319393864633, 'bagging_freq': 1, 'reg_alpha': 6.071842740332256, 'reg_lambda': 7.075836575292849}. Best is trial 3 with value: 16.31546142982132.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003364 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9577
[LightGBM] [Info] Number of data points in the train set: 14015, number of used features: 63
[LightGBM] [Info] Start training from score 111.733036
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[340]	valid_0's rmse: 16.5585


Best trial: 3. Best value: 16.3155:  26%|██▌       | 13/50 [00:10<00:36,  1.02it/s]

[I 2025-06-17 17:46:56,587] Trial 12 finished with value: 16.558543221961333 and parameters: {'learning_rate': 0.02052410089329706, 'num_leaves': 288, 'max_depth': 7, 'min_data_in_leaf': 141, 'feature_fraction': 0.6191581427204631, 'bagging_fraction': 0.5422487379944317, 'bagging_freq': 5, 'reg_alpha': 9.943867078340297, 'reg_lambda': 4.6081191772580015}. Best is trial 3 with value: 16.31546142982132.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003995 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9579
[LightGBM] [Info] Number of data points in the train set: 14015, number of used features: 64
[LightGBM] [Info] Start training from score 111.733036
Training until validation scores don't improve for 50 rounds


Best trial: 3. Best value: 16.3155:  28%|██▊       | 14/50 [00:10<00:30,  1.19it/s]

Early stopping, best iteration is:
[36]	valid_0's rmse: 17.0432
[I 2025-06-17 17:46:57,097] Trial 13 finished with value: 17.043249533716693 and parameters: {'learning_rate': 0.17795141500669145, 'num_leaves': 224, 'max_depth': 13, 'min_data_in_leaf': 87, 'feature_fraction': 0.4029524280129683, 'bagging_fraction': 0.5253478365535988, 'bagging_freq': 3, 'reg_alpha': 7.108795291829056, 'reg_lambda': 9.978478348299165}. Best is trial 3 with value: 16.31546142982132.


In [None]:
# 지점별 성능 요약 출력
print("\n📊 지점별 모델 성능 요약 (RMSE):")
val_rmse_list = []
test_rmse_list = []
for branch, scores in branch_rmse_results.items():
    print(f"📍 {branch} | Val RMSE: {scores['val_rmse']:.4f} | Test RMSE: {scores['test_rmse']:.4f}")
    val_rmse_list.append(scores['val_rmse'])
    test_rmse_list.append(scores['test_rmse'])

# 전체 평균 RMSE 출력
mean_val_rmse = sum(val_rmse_list) / len(val_rmse_list)
mean_test_rmse = sum(test_rmse_list) / len(test_rmse_list)
print("\n📈 전체 지점 평균 RMSE")
print(f"  - Validation 평균 RMSE: {mean_val_rmse:.4f}")
print(f"  - Test 평균 RMSE: {mean_test_rmse:.4f}")