In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
#!pip install autogluon

Collecting autogluon
  Downloading autogluon-1.3.1-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.core==1.3.1 (from autogluon.core[all]==1.3.1->autogluon)
  Downloading autogluon.core-1.3.1-py3-none-any.whl.metadata (12 kB)
Collecting autogluon.features==1.3.1 (from autogluon)
  Downloading autogluon.features-1.3.1-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.tabular==1.3.1 (from autogluon.tabular[all]==1.3.1->autogluon)
  Downloading autogluon.tabular-1.3.1-py3-none-any.whl.metadata (14 kB)
Collecting autogluon.multimodal==1.3.1 (from autogluon)
  Downloading autogluon.multimodal-1.3.1-py3-none-any.whl.metadata (13 kB)
Collecting autogluon.timeseries==1.3.1 (from autogluon.timeseries[all]==1.3.1->autogluon)
  Downloading autogluon.timeseries-1.3.1-py3-none-any.whl.metadata (12 kB)
Collecting boto3<2,>=1.10 (from autogluon.core==1.3.1->autogluon.core[all]==1.3.1->autogluon)
  Downloading boto3-1.38.30-py3-none-any.whl.metadata (6.6 kB)
Collecting autogluon.common==1.3

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from autogluon.tabular import TabularPredictor
from sklearn.metrics import  mean_squared_error
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

In [6]:
#전처리
df = pd.read_csv('/content/gdrive/MyDrive/train_heat.csv',encoding='CP949')


#값중 -99있으면 null로 대체
df = df.replace(-99, np.nan)
df = df.drop(columns='Unnamed: 0')
#08~18시 외의 값을 null로 처리
# 시(hour) 추출
hour = df['train_heat.tm'].astype(str).str[-2:].astype(int)
# 해당 조건에 대해 0으로 대체
df.loc[~hour.between(8, 18) , 'train_heat.si'] = 0
df['train_heat.tm'] = pd.to_datetime(df['train_heat.tm'].astype(str), format='%Y%m%d%H') #datetime형태로 변환
df = df.sort_values('train_heat.tm').set_index('train_heat.tm')#순서정렬~
#선형보간
def linear_impute(series):
    return series.interpolate(method='linear')
# 결측치가 있는 컬럼만 자동으로 찾기
cols_to_impute = df.columns[df.isnull().any()].tolist()

# 결측치 있는 컬럼에 대해 선형보간 적용
for col in cols_to_impute:
    df[col] = linear_impute(df[col])

# 결과 확인
print(df.isnull().sum())

train_heat.branch_id      0
train_heat.ta             0
train_heat.wd             0
train_heat.ws             0
train_heat.rn_day         0
train_heat.rn_hr1         0
train_heat.hm             0
train_heat.si             0
train_heat.ta_chi         0
train_heat.heat_demand    0
dtype: int64


In [13]:
#파생변수만들기
#Autoglone은 시계열 반영안해서 파생변수 만들어야됨.
# train_heat.tm은 2021010101 이런 숫자형태니까 datetime으로 변환

# 1. 날짜 파생변수
df['year'] = df.index.year
df['month'] = df.index.month
df['day'] = df.index.day
df['hour'] = df.index.hour
df['weekday'] = df.index.weekday   # 0:월요일, ..., 6:일요일

#2.계절성 2분할 파생변수
# 간단히 10~4월은 'warm', 나머지는 'cold' 로 구분
df['heating_season'] = df['month'].apply(lambda x: 1 if x in [10, 11, 12, 1, 2, 3, 4] else 0)

#3. 온도 Category
df['temp_category'] = df['train_heat.ta'].apply(lambda x: 1 if x >= 20 else 0)


#4. Peak시간대 4분할
def peak_time_category(hour):
    if 0 <= hour <= 6:
        return 0  # 심야
    elif 6 <= hour <= 112:
        return 1  # 오전
    elif 12 <= hour <= 18:
        return 2  # 오후
    else:
        return 3  # 저녁

df['peak_time'] = df['hour'].apply(peak_time_category)


#5.Lag데이터 #lag생기면 선형보간으로 채움
for lag in [1, 2, 3]:
    lag_col = f'ta_lag_{lag}'
    df[lag_col] = df['train_heat.ta'].shift(lag)
    df[lag_col] = df[lag_col].interpolate(method='linear')  # 또는 .ewm().mean()


#6.HDD CDD
base_temp = 18.0  # 한국 실내 쾌적 온도 기준?

df['HDD'] = (base_temp - df['train_heat.ta']).clip(lower=0)
df['CDD'] = (df['train_heat.ta'] - base_temp).clip(lower=0)


#7. 열수요편차
#branch_id별로 온도(train_heat.ta)의 평균을 구한다.
#각 행별 train_heat.ta와 그 branch_id 그룹 평균 온도의 차이(편차)를 계산한다.
#이 편차 값을 새로운 파생변수로 추가한다.
# branch_id별 평균 온도 계산 후 절댓값 편차 변수 생성
df['branch_temp_abs_deviation'] = (df['train_heat.ta'] - df.groupby('train_heat.branch_id')['train_heat.ta'].transform('mean')).abs()

df = df.query('`train_heat.branch_id` in ["A", "B", "D"]')
df_train = df.query('year == 2021')
df_test = df.query('year == 2022')
df_train = df_train.reset_index()
df_test =  df_test.reset_index()

# 예측 대상
target = 'train_heat.heat_demand'


# 11. 사용 feature
features = [
    "train_heat.branch_id", "train_heat.ta", "train_heat.wd", "train_heat.ws", "train_heat.rn_day",
    "train_heat.rn_hr1", "train_heat.hm", "train_heat.si", "train_heat.ta_chi",
    "month", "weekday", "heating_season", "temp_category", "peak_time",
    "ta_lag_1", "ta_lag_2", "ta_lag_3", "HDD", "CDD", "branch_temp_abs_deviation"
]

# 12. 원핫 인코딩 대상
categorical_cols = ["train_heat.branch_id", "month", "weekday", "heating_season", "temp_category", "peak_time"]

# 13. 원핫 인코딩
df_train_encoded = pd.get_dummies(df_train[features + [target]], columns=categorical_cols)
df_test_encoded = pd.get_dummies(df_test[features + [target]], columns=categorical_cols)

# 14. 열 맞추기
df_test_encoded = df_test_encoded.reindex(columns=df_train_encoded.columns, fill_value=0)

# 15. 스케일링 대상 열
features_encoded = [col for col in df_train_encoded.columns if col != target]

# 16. MinMax Scaling
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(df_train_encoded[features_encoded])
X_test_scaled = scaler.transform(df_test_encoded[features_encoded])

# 17. 최종 DataFrame
df_train_scaled = pd.DataFrame(X_train_scaled, columns=features_encoded)
df_train_scaled[target] = df_train_encoded[target].values

df_test_scaled = pd.DataFrame(X_test_scaled, columns=features_encoded)
df_test_scaled[target] = df_test_encoded[target].values

# 18. AutoGluon 학습
predictor = TabularPredictor(label=target, problem_type='regression').fit(
    train_data=df_train_scaled,
    presets='best_quality'
)


No path specified. Models will be saved in: "AutogluonModels/ag-20250605_045123"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.1
Python Version:     3.11.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Sun Mar 30 16:01:29 UTC 2025
CPU Count:          2
Memory Avail:       10.94 GB / 12.67 GB (86.3%)
Disk Space Avail:   183.28 GB / 225.83 GB (81.2%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validatio

[36m(_ray_fit pid=4656)[0m [1000]	valid_set's rmse: 29.0919
[36m(_ray_fit pid=4656)[0m [2000]	valid_set's rmse: 28.1408[32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)[0m
[36m(_ray_fit pid=4656)[0m [3000]	valid_set's rmse: 27.5585[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=4656)[0m [4000]	valid_set's rmse: 27.268[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=4657)[0m [5000]	valid_set's rmse: 27.0154[32m [repeated 3x across cluster][0m
[36m(_ray_fit pid=4656)[0m [7000]	valid_set's rmse: 26.8622[32m [repeated 3x across cluster][0m
[36m(_ray_fit pid=4656)[0m [8000]	valid_set's rmse: 26.8038[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=4656)[0m [10000]	valid_set's rmse: 26.7374[32m [repeated 4x across cluster][0m
[36m(_ray_fit pid=5063)

[36m(_dystack pid=4414)[0m 	-27.7883	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=4414)[0m 	293.28s	 = Training   runtime
[36m(_dystack pid=4414)[0m 	99.01s	 = Validation runtime
[36m(_dystack pid=4414)[0m Fitting model: LightGBM_BAG_L1 ... Training model for up to 267.97s of the 564.88s of remaining time.
[36m(_dystack pid=4414)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (2 workers, per: cpus=1, gpus=0, memory=0.31%)


[36m(_ray_fit pid=6254)[0m [1000]	valid_set's rmse: 30.3836[32m [repeated 4x across cluster][0m
[36m(_ray_fit pid=6254)[0m [2000]	valid_set's rmse: 30.1298[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=6254)[0m [3000]	valid_set's rmse: 30.0272[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=6257)[0m [5000]	valid_set's rmse: 29.2876[32m [repeated 4x across cluster][0m
[36m(_ray_fit pid=6490)[0m [1000]	valid_set's rmse: 32.7944
[36m(_ray_fit pid=6490)[0m [2000]	valid_set's rmse: 32.497
[36m(_ray_fit pid=6546)[0m [3000]	valid_set's rmse: 30.6647[32m [repeated 4x across cluster][0m
[36m(_ray_fit pid=6490)[0m [5000]	valid_set's rmse: 32.2867[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=6490)[0m [6000]	valid_set's rmse: 32.2461
[36m(_ray_fit pid=6490)[0m [7000]	valid_set's rmse: 32.2621
[36m(_ray_fit pid=6714)[0m [2000]	valid_set's rmse: 30.4957[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=6714)[0m [3000]	valid_set's rmse:

[36m(_dystack pid=4414)[0m 	-30.418	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=4414)[0m 	151.78s	 = Training   runtime
[36m(_dystack pid=4414)[0m 	21.18s	 = Validation runtime
[36m(_dystack pid=4414)[0m Fitting model: RandomForestMSE_BAG_L1 ... Training model for up to 108.51s of the 405.42s of remaining time.
[36m(_dystack pid=4414)[0m 	-31.2191	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=4414)[0m 	113.98s	 = Training   runtime
[36m(_dystack pid=4414)[0m 	2.13s	 = Validation runtime
[36m(_dystack pid=4414)[0m Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.00s of the 283.60s of remaining time.
[36m(_dystack pid=4414)[0m 	Ensemble Weights: {'LightGBMXT_BAG_L1': 0.8, 'RandomForestMSE_BAG_L1': 0.2}
[36m(_dystack pid=4414)[0m 	-27.5493	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=4414)[0m 	0.04s	 = Training   runtime
[36m(_dystack pid=4414)[0m 	0.0s	 = Validation runtime
[36m(

In [16]:
# 3. 예측
df_test_features = df_test_scaled.drop(columns=[target])  # target만 제거
preds = predictor.predict(df_test_features)

# 4. 결과 저장 및 평가
df_test_scaled['predicted_demand'] = preds
y_true = df_test_scaled[target]
y_pred = df_test_scaled['predicted_demand']

print("RMSE:", mean_squared_error(y_true, y_pred) ** 0.5)

RMSE: 39.423834963145524
