In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from autogluon.tabular import TabularPredictor
from sklearn.metrics import  mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit

import warnings
warnings.filterwarnings('ignore')

In [15]:
#!pip install autogluon

In [2]:
df_train = pd.read_csv("train_heat.csv")
df_test = pd.read_csv("test_heat.csv")
#열이름빼기
df_train.columns = df_train.columns.str.replace('train_heat.', '', regex=False)
#Unnamed:0제거
df_train = df_train.drop(columns=["Unnamed: 0"])
df_test.columns = [
    "tm", "branch_id", "ta", "wd", "ws",
    "rn_day", "rn_hr1", "hm", "si", "ta_chi","heat_demand"
]

In [3]:
#체감온도 직접만든 변수
def calc_wind_chill(Ta, RH, WS):
    # 겨울철 조건: 기온 10℃ 이하, 풍속 1.3m/s 이상
    cond = (Ta <= 10) & (WS >= 1.3)
    Tw = np.full_like(Ta, np.nan, dtype=np.float64)
    # 공식 적용
    Ta_ = Ta[cond]
    RH_ = RH[cond]
    WS_ = WS[cond]
    # 공식 (수정된 식)
    Tw_calc = (
        Ta_ * np.arctan(0.151977 * np.sqrt(RH_ + 8.313659))
        + np.arctan(Ta_ + RH_)
        - np.arctan(RH_ - 1.67633)
        + 0.00391838 * RH_ ** 1.5 * np.arctan(0.023101 * RH_)
        - 4.686035
    )
    # 겨울형 기간 동안 산출된 체감온도가 기온보다 높으면 기온과 같게
    Tw_calc = np.where(Tw_calc > Ta_, Ta_, Tw_calc)
    Tw[cond] = Tw_calc
    return Tw

In [14]:
def preprocess_df(df):
    # 접두사 제거: 'train_heat.' 컬럼명 변경
    df = df.rename(columns=lambda x: x.replace('train_heat.', '') if 'train_heat.' in x else x)

    # 결측치 처리 (-99, -9.9)
    df = df.replace(-99, np.nan)
    if 'wd' in df.columns:
        df['wd'] = df['wd'].replace(-9.9, np.nan)

    # 시간 전처리
    df['month'] = df['tm'].astype(str).str[4:6].astype(int)
    df['hour'] = df['tm'].astype(str).str[8:10].astype(int)
    df['date'] = pd.to_datetime(df['tm'].astype(str).str[:8])
    df['weekday'] = df['date'].dt.weekday
    df['is_weekend'] = (df['weekday'] >= 5).astype(int)
    df['tm'] = pd.to_datetime(df['tm'].astype(str), format='%Y%m%d%H')
    df = df.sort_values('tm').set_index('tm')
    if 'si' in df.columns:
        df.loc[~df['hour'].between(8, 18), 'si'] = 0
    # 계절 구분
    df['heating_season'] = df['month'].apply(lambda x: 1 if x in [10,11,12,1,2,3,4] else 0)
    df['temp_category'] = df['ta'].apply(lambda x: 1 if x >= 20 else 0)

    # 피크타임 함수
    def peak_time_category(hour):
        if 0 <= hour <= 6:
            return 0  # 심야
        elif 6 < hour <= 11:
            return 1  # 오전
        elif 11 < hour <= 18:
            return 2  # 오후
        else:
            return 3  # 저녁

    df['peak_time'] = df['hour'].apply(peak_time_category)

    # lag feature (1,2,3시간)
    for lag in [1,2,3]:
        df[f'ta_lag_{lag}'] = df['ta'].shift(lag).interpolate(method='linear')

    # HDD, CDD
    base_temp = 18.0
    df['HDD'] = (base_temp - df['ta']).clip(lower=0)
    df['CDD'] = (df['ta'] - base_temp).clip(lower=0)

    # 편차
    df['branch_temp_abs_deviation'] = (
        df['ta'] - df.groupby('branch_id')['ta'].transform('mean')
    ).abs()

    # 변수별 diff 및 24시간 이동평균
    vars_to_process = ['ta', 'si', 'ta_chi', 'ws', 'wd', 'rn_day', 'rn_hr1']
    for var in vars_to_process:
        if var in df.columns:
            df[f'{var}_diff'] = df[var].diff().fillna(0)
            df[f'{var}_ma24'] = df[var].rolling(window=24, min_periods=1).mean()

    # 누적 강수량(최근 3/6/12/24시간)
    for h in [3,6,12,24]:
        if 'rn_hr1' in df.columns:
            df[f'rn_hr1_sum{h}'] = df['rn_hr1'].rolling(h, min_periods=1).sum()

    # 최고/최저 기온(최근 6/12/24시간)
    for h in [6,12,24]:
        if 'ta' in df.columns:
            df[f'ta_max{h}'] = df['ta'].rolling(h, min_periods=1).max()
            df[f'ta_min{h}'] = df['ta'].rolling(h, min_periods=1).min()

    # 네가 만든 체감온도 공식으로 계산한 변수 추가
    df['ta_chi_formula'] = calc_wind_chill(df['ta'].values, df['hm'].values, df['ws'].values)

    # 체감온도 - 실제온도 차이 변수
    if 'ta_chi' in df.columns and 'ta' in df.columns:
        df['diff_ta_chi'] = df['ta_chi'] - df['ta']

    # 풍속 급변(2-step 차분)
    if 'ws' in df.columns:
        df['ws_diff2'] = df['ws'].diff(2).fillna(0)

    # 당일 누적 강수량
    if 'rn_day' in df.columns:
        df['rn_day_cumsum'] = df['rn_day'].cumsum()

    # 전일 평균/최고/최저 기온
    if 'ta' in df.columns:
        df['ta_yesterday_avg'] = df['ta'].shift(24).rolling(24).mean()
        df['ta_yesterday_max'] = df['ta'].shift(24).rolling(24).max()
        df['ta_yesterday_min'] = df['ta'].shift(24).rolling(24).min()

    # 임계치 이탈 플래그
    if 'ta' in df.columns:
        df['cold_flag'] = (df['ta'] < 5).astype(int)
        df['hot_flag'] = (df['ta'] > 25).astype(int)

    # 최근 6시간 기온 표준편차
    df['ta_std6'] = df['ta'].rolling(6, min_periods=1).std().fillna(0)

    # 최근 6시간 풍속 표준편차
    df['ws_std6'] = df['ws'].rolling(6, min_periods=1).std().fillna(0)

    # 최근 24시간 내 강수 발생 여부 (이진 플래그)
    df['rain_flag_24h'] = (df['rn_hr1'].rolling(24, min_periods=1).sum() > 0).astype(int)

    # 월별 평균 기온과의 차이
    df['monthly_avg_ta'] = df.groupby('month')['ta'].transform('mean')
    df['ta_monthly_dev'] = df['ta'] - df['monthly_avg_ta']

    df['ta_ma3'] = df['ta'].rolling(3, min_periods=1).mean()
    df['ta_max3'] = df['ta'].rolling(3, min_periods=1).max()
    df['ta_min3'] = df['ta'].rolling(3, min_periods=1).min()
    df['ta_lastweek'] = df['ta'].shift(24*7)
    df['hourly_avg_ta'] = df.groupby('hour')['ta'].transform('mean')
    df['ta_hourly_dev'] = df['ta'] - df['hourly_avg_ta']
    df['ta_yesterday'] = df['ta'].shift(24)
    df['ta_trend12'] = df['ta'] - df['ta'].shift(12)

    # 결측치 보간 및 0으로 채우기
    for col in df.columns[df.isnull().any()]:
        df[col] = df[col].interpolate(method='linear')
    for col in df.columns[df.isnull().any()]:
        df[col] = df[col].fillna(0)

    return df

In [15]:
df_trainval = preprocess_df(df_train)
df_test = preprocess_df(df_test)

# 피처 정의
categorical_cols = [
    'branch_id', 'month', 'weekday', 'is_weekend', 'heating_season',
    'temp_category', 'peak_time', 'cold_flag', 'hot_flag'
]
target = 'heat_demand'

numerical_cols = [
    'hour', 'ta', 'si', 'ta_chi', 'ws', 'wd', 'rn_day', 'rn_hr1', 'hm',
    'ta_lag_1', 'ta_lag_2', 'ta_lag_3', 'HDD', 'CDD', 'branch_temp_abs_deviation',
    'ta_diff', 'si_diff', 'ta_chi_diff', 'ws_diff', 'wd_diff', 'rn_day_diff', 'rn_hr1_diff',
    'ta_ma24', 'si_ma24', 'ta_chi_ma24', 'ws_ma24', 'wd_ma24', 'rn_day_ma24', 'rn_hr1_ma24',
    'rn_hr1_sum3', 'rn_hr1_sum6', 'rn_hr1_sum12', 'rn_hr1_sum24',
    'ta_max6', 'ta_min6', 'ta_max12', 'ta_min12', 'ta_max24', 'ta_min24',
    'ta_chi_formula', 'diff_ta_chi', 'ws_diff2', 'rn_day_cumsum',
    'ta_yesterday_avg', 'ta_yesterday_max', 'ta_yesterday_min',
    'ta_std6', 'ws_std6', 'rain_flag_24h', 'monthly_avg_ta', 'ta_monthly_dev',
    'ta_ma3', 'ta_max3', 'ta_min3', 'ta_lastweek',
    'hourly_avg_ta', 'ta_hourly_dev', 'ta_yesterday', 'ta_trend12'
]

features = categorical_cols + numerical_cols


KeyError: 'tm'

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from autogluon.tabular import TabularPredictor
from sklearn.model_selection import TimeSeriesSplit
import matplotlib.pyplot as plt
import pandas as pd

n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)
results = []

for fold, (train_idx, val_idx) in enumerate(tscv.split(df_trainval)):
    print(f"\n===== Fold {fold+1} =====")

    df_train = df_trainval.iloc[train_idx].copy()
    df_val = df_trainval.iloc[val_idx].copy()

    # 실제 존재하는 범주형 컬럼만 사용
    actual_categorical_cols = [col for col in categorical_cols if col in df_train.columns]

    # prefix 없이 get_dummies 사용 (가장 안전)
    if actual_categorical_cols:
        df_train_cat = pd.get_dummies(df_train[actual_categorical_cols])
        df_val_cat = pd.get_dummies(df_val[actual_categorical_cols])
        df_test_cat = pd.get_dummies(df_test[actual_categorical_cols])

        # df_train 기준으로 컬럼 정렬 및 누락 컬럼 0 채우기
        df_val_cat = df_val_cat.reindex(columns=df_train_cat.columns, fill_value=0)
        df_test_cat = df_test_cat.reindex(columns=df_train_cat.columns, fill_value=0)
    else:
        df_train_cat = pd.DataFrame(index=df_train.index)
        df_val_cat = pd.DataFrame(index=df_val.index)
        df_test_cat = pd.DataFrame(index=df_test.index)

    # 연속형 변수 스케일링
    scaler = MinMaxScaler()
    df_train_num_scaled = pd.DataFrame(scaler.fit_transform(df_train[numerical_cols]), columns=numerical_cols)
    df_val_num_scaled = pd.DataFrame(scaler.transform(df_val[numerical_cols]), columns=numerical_cols)
    df_test_num_scaled = pd.DataFrame(scaler.transform(df_test[numerical_cols]), columns=numerical_cols)

    # 최종 feature set 결합
    df_train_final = pd.concat([df_train_num_scaled.reset_index(drop=True), df_train_cat.reset_index(drop=True)], axis=1)
    df_val_final = pd.concat([df_val_num_scaled.reset_index(drop=True), df_val_cat.reset_index(drop=True)], axis=1)
    df_test_final = pd.concat([df_test_num_scaled.reset_index(drop=True), df_test_cat.reset_index(drop=True)], axis=1)

    # target 추가
    df_train_final[target] = df_train[target].reset_index(drop=True)
    df_val_final[target] = df_val[target].reset_index(drop=True)

    features_final = df_train_final.columns.drop(target)

    # 모델 학습
    predictor = TabularPredictor(
        label=target,
        problem_type='regression',
        path=f"AutogluonModels_fold{fold+1}"
    ).fit(
        train_data=df_train_final,
        tuning_data=df_val_final,
        presets='best_quality',
        use_bag_holdout=True,
        ag_args_fit={'num_gpus': 0}
    )

    # 검증 RMSE
    y_val_true = df_val_final[target]
    y_val_pred = predictor.predict(df_val_final[features_final])
    val_rmse = mean_squared_error(y_val_true, y_val_pred, squared=False)

    # 테스트 RMSE (target 컬럼이 있다면)
    if target in df_test.columns and df_test[target].notnull().all():
        y_test_true = df_test[target]
        y_test_pred = predictor.predict(df_test_final[features_final])
        test_rmse = mean_squared_error(y_test_true, y_test_pred, squared=False)
    else:
        test_rmse = None

    print(f"Fold {fold+1} | Val RMSE: {val_rmse:.4f} | Test RMSE: {test_rmse if test_rmse is not None else 'N/A'}")

    # 마지막 fold 예측 저장
    if fold == n_splits - 1:
        df_test[target] = predictor.predict(df_test_final[features_final])
        last_val_true = y_val_true
        last_val_pred = y_val_pred
        last_val_index = df_val.index  # 시계열 index 기준

    results.append({
        'fold': fold+1,
        'val_rmse': val_rmse,
        'test_rmse': test_rmse
    })

# 전체 Fold 결과 출력
results_df = pd.DataFrame(results)
print("\n===== 전체 Fold 결과 =====")
print(results_df)

# 예측 결과 저장
df_test.to_csv('df_test_with_predicted_heat_demand.csv', index=False)
print("df_test_with_predicted_heat_demand.csv 파일이 저장되었습니다.")

# 마지막 Fold 시각화
plt.figure(figsize=(18, 6))
plt.plot(last_val_index, last_val_true.values, label='실제값 (Validation True)', linewidth=2)
plt.plot(last_val_index, last_val_pred.values, label='예측값 (Validation Predicted)', alpha=0.7)
plt.title('마지막 Fold 검증셋 실제값 vs 예측값')
plt.xlabel('Time Index')
plt.ylabel('Heat Demand')
plt.legend()
plt.grid()
plt.show()


===== Fold 1 =====


Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.1
Python Version:     3.11.0
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          12
Memory Avail:       3.96 GB / 31.74 GB (12.5%)
Disk Space Avail:   244.59 GB / 476.28 GB (51.4%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to False. Reason: Skip dynamic_stacking when use_bag_holdout is enabled. (use_bag_holdout=True)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "c:\Users\Admin\Desktop\신윤식\기상청공모전\data\AutogluonModels_fold1"
Train Data Rows:    83221
Train Data Columns: 86
Tuning Data Rows:    83216
Tuning Data Columns: 86
Label Column:       heat_demand
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:       

In [None]:
# 마지막 fold의 검증 데이터 준비
X_val = df_val_final[features_final]
y_val = df_val_final[target]

# Autogluon predictor는 scikit-learn API를 직접 지원하지 않으므로, 예측 함수를 래핑
class PredictorWrapper:
    def __init__(self, predictor, features):
        self.predictor = predictor
        self.features = features
    def predict(self, X):
        # DataFrame이 들어오면 features만 추출
        return self.predictor.predict(X[self.features])

# 래퍼 생성
wrapped_predictor = PredictorWrapper(predictor, features_final)

# permutation importance 계산
result = permutation_importance(
    wrapped_predictor,
    X_val,
    y_val,
    n_repeats=10,
    random_state=42,
    scoring='neg_root_mean_squared_error'
)

# 중요도 정렬 및 시각화
sorted_idx = result.importances_mean.argsort()[::-1]
top_n = 20  # 상위 20개만 시각화

plt.figure(figsize=(10, 8))
plt.barh(
    np.array(features_final)[sorted_idx][:top_n][::-1],
    result.importances_mean[sorted_idx][:top_n][::-1],
    xerr=result.importances_std[sorted_idx][:top_n][::-1],
    color='skyblue'
)
plt.xlabel('Permutation Importance (Decrease in RMSE)')
plt.title('Permutation Feature Importance (Top 20)')
plt.tight_layout()
plt.show()

# 중요도 점수 표 출력
importance_df = pd.DataFrame({
    'feature': np.array(features_final)[sorted_idx],
    'importance_mean': result.importances_mean[sorted_idx],
    'importance_std': result.importances_std[sorted_idx]
})
print("\n===== Permutation Importance (Top 20) =====")
print(importance_df.head(20))

NameError: name 'df_val_final' is not defined