In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error


import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm


import warnings
warnings.filterwarnings('ignore')

In [3]:
# 데이터 전처리
# ---------------------
df = pd.read_csv('/content/gdrive/MyDrive/train_heat.csv', encoding='CP949')
df = df.replace(-99, np.nan)
df = df.drop(columns='Unnamed: 0')

df['train_heat.tm'] = pd.to_datetime(df['train_heat.tm'].astype(str), format='%Y%m%d%H')
df = df.sort_values('train_heat.tm').set_index('train_heat.tm')

hour = df.index.hour
df.loc[~((hour >= 8) & (hour <= 18)), 'train_heat.si'] = 0

# 선형보간 함수 정의
def linear_impute(series):
    return series.interpolate(method='linear')

# 결측치가 있는 컬럼 자동 탐색 및 선형보간 적용
cols_to_impute = df.columns[df.isnull().any()].tolist()
for col in cols_to_impute:
    df[col] = linear_impute(df[col])

# 파생 변수 생성
df['year'] = df.index.year
df['month'] = df.index.month
df['day'] = df.index.day
df['hour'] = df.index.hour
df['weekday'] = df.index.weekday

df['heating_season'] = df['month'].apply(lambda x: 1 if x in [10, 11, 12, 1, 2, 3, 4] else 0)
df['temp_category'] = df['train_heat.ta'].apply(lambda x: 1 if x >= 20 else 0)

def peak_time_category(hour):
    if 0 <= hour <= 6:
        return 0
    elif 6 < hour <= 12:
        return 1
    elif 12 < hour <= 18:
        return 2
    else:
        return 3

df['peak_time'] = df['hour'].apply(peak_time_category)

for lag in [1, 2, 3]:
    lag_col = f'ta_lag_{lag}'
    df[lag_col] = df['train_heat.ta'].shift(lag)
    df[lag_col] = df[lag_col].interpolate(method='linear', limit_direction='both')  # 또는 .ewm().mean()

base_temp = 18.0
df['HDD'] = (base_temp - df['train_heat.ta']).clip(lower=0)
df['CDD'] = (df['train_heat.ta'] - base_temp).clip(lower=0)
df['branch_temp_abs_deviation'] = (df['train_heat.ta'] - df.groupby('train_heat.branch_id')['train_heat.ta'].transform('mean')).abs()

df = df.query('`train_heat.branch_id` in ["A", "B", "D"]')

# 타겟 변수
target = 'train_heat.heat_demand'

# 피처 선택
features = [
    "train_heat.ta", "train_heat.wd", "train_heat.ws", "train_heat.rn_day", "train_heat.rn_hr1",
    "train_heat.hm", "train_heat.si", "train_heat.ta_chi",
    "ta_lag_1", "ta_lag_2", "ta_lag_3", "HDD", "CDD", "branch_temp_abs_deviation"
]

# 범주형 인코딩
df = pd.get_dummies(df, columns=["month", "weekday", "heating_season", "temp_category", "peak_time"])

# 연도별 분리
df_train = df[df['year'] == 2021]
df_test = df[df['year'] == 2022]

X_train = df_train[features + [col for col in df.columns if col.startswith(('month_', 'weekday_', 'heating_season_', 'temp_category_', 'peak_time_'))]]
y_train = df_train[target]
X_test = df_test[X_train.columns]
y_test = df_test[target]

# 스케일링
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#결측치확인
print(np.isnan(X_train).sum(), np.isinf(X_train).sum())
print(np.isnan(y_train).sum(), np.isinf(y_train).sum())


0 0
0 0


In [9]:
# Dataset 정의
# ---------------------
class TimeSeriesDataset(Dataset):
    def __init__(self, X, y, window_size):
        self.X = X
        self.y = y
        self.window_size = window_size

    def __len__(self):
        return len(self.X) - self.window_size

    def __getitem__(self, idx):
        x_seq = self.X[idx:idx+self.window_size]
        y_target = self.y[idx + self.window_size]
        return torch.tensor(x_seq, dtype=torch.float32), torch.tensor(y_target, dtype=torch.float32)

# ---------------------
# CNN + Transformer 모델
# ---------------------
class CNNTransformer(nn.Module):
    def __init__(self, input_dim, d_model=128, nhead=8, num_layers=3):
        super().__init__()
        self.cnn = nn.Conv1d(in_channels=input_dim, out_channels=d_model, kernel_size=3, padding=1)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(d_model * window_size, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
      x = x.permute(0, 2, 1)        # [B, F, T]
      x = self.cnn(x)               # [B, D, T]
      x = x.permute(0, 2, 1)        # [B, T, D]
      x = self.transformer(x)       # [B, T, D]
      x = x.permute(0, 2, 1)        # [B, D, T]  ← 추가!
      return self.head(x)           # → [B, 1]

# ---------------------
# 학습 파라미터 및 실행
# ---------------------
window_size = 48  # 24시간 단위로 학습
train_dataset = TimeSeriesDataset(X_train, y_train.values, window_size)
test_dataset = TimeSeriesDataset(X_test, y_test.values, window_size)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNNTransformer(input_dim=X_train.shape[1]).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
criterion = nn.MSELoss()

# ---------------------
# 학습 루프
# ---------------------
epochs = 20
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device).unsqueeze(1)
        optimizer.zero_grad()
        preds = model(x_batch)
        loss = criterion(preds, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")

# ---------------------
# 평가
# ---------------------
model.eval()
preds, trues = [], []
with torch.no_grad():
    for x_batch, y_batch in test_loader:
        x_batch = x_batch.to(device)
        pred = model(x_batch).cpu().squeeze()
        preds.append(pred)
        trues.append(y_batch)

y_pred = torch.cat(preds).numpy()
y_true = torch.cat(trues).numpy()

print("RMSE:", mean_squared_error(y_true, y_pred) ** 0.5)

Epoch 1/20, Loss: 7335.8108
Epoch 2/20, Loss: 5524.2522
Epoch 3/20, Loss: 5396.4467
Epoch 4/20, Loss: 5354.6651
Epoch 5/20, Loss: 5301.1365
Epoch 6/20, Loss: 5256.9323
Epoch 7/20, Loss: 5204.3312
Epoch 8/20, Loss: 5191.0447
Epoch 9/20, Loss: 5137.0592
Epoch 10/20, Loss: 5140.5681
Epoch 11/20, Loss: 5151.2264
Epoch 12/20, Loss: 5107.1200
Epoch 13/20, Loss: 5112.4250
Epoch 14/20, Loss: 5088.7737
Epoch 15/20, Loss: 5083.4311
Epoch 16/20, Loss: 5094.9944
Epoch 17/20, Loss: 5068.3824
Epoch 18/20, Loss: 5052.5831
Epoch 19/20, Loss: 5071.7730
Epoch 20/20, Loss: 5028.6116
RMSE: 77.84707439709922


In [12]:
# Dataset 정의
# ---------------------
class TimeSeriesDataset(Dataset):
    def __init__(self, X, y, window_size):
        self.X = X
        self.y = y
        self.window_size = window_size

    def __len__(self):
        return len(self.X) - self.window_size

    def __getitem__(self, idx):
        x_seq = self.X[idx:idx+self.window_size]
        y_target = self.y[idx + self.window_size]
        return torch.tensor(x_seq, dtype=torch.float32), torch.tensor(y_target, dtype=torch.float32)

# ---------------------
# CNN + Transformer 모델
# ---------------------
class CNNTransformer(nn.Module):
    def __init__(self, input_dim, d_model=128, nhead=8, num_layers=3):
        super().__init__()
        self.cnn = nn.Conv1d(in_channels=input_dim, out_channels=d_model, kernel_size=3, padding=1)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.head = nn.Sequential(
            nn.AdaptiveAvgPool1d(1),  # [B, D, T] → [B, D, 1]
            nn.Flatten(),             # [B, D, 1] → [B, D]
            nn.Linear(d_model, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        x = x.permute(0, 2, 1)  # [B, F, T] → [B, T, F] → CNN expects [B, F, T]
        x = self.cnn(x)         # [B, D, T]
        x = x.permute(0, 2, 1)  # [B, D, T] → [B, T, D] (for transformer)
        x = self.transformer(x) # [B, T, D]
        x = x.permute(0, 2, 1)  # [B, T, D] → [B, D, T] (for pooling)
        x = self.head(x)        # [B, D, T] → [B, D, 1] → [B, D] → [B, 1]
        return x

# ---------------------
# 학습 파라미터 및 실행
# ---------------------
window_size = 48  # 24시간 단위로 학습
train_dataset = TimeSeriesDataset(X_train, y_train.values, window_size)
test_dataset = TimeSeriesDataset(X_test, y_test.values, window_size)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNNTransformer(input_dim=X_train.shape[1]).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
criterion = nn.MSELoss()

# ---------------------
# 학습 루프
# ---------------------
epochs = 20
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device).unsqueeze(1)
        optimizer.zero_grad()
        preds = model(x_batch)
        loss = criterion(preds, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")

# ---------------------
# 평가
# ---------------------
model.eval()
preds, trues = [], []
with torch.no_grad():
    for x_batch, y_batch in test_loader:
        x_batch = x_batch.to(device)
        pred = model(x_batch).cpu().squeeze()
        preds.append(pred)
        trues.append(y_batch)

y_pred = torch.cat(preds).numpy()
y_true = torch.cat(trues).numpy()

print("RMSE:", mean_squared_error(y_true, y_pred) ** 0.5)

Epoch 1/20, Loss: 16449.7594
Epoch 2/20, Loss: 5619.9570
Epoch 3/20, Loss: 5411.8238
Epoch 4/20, Loss: 5358.6079
Epoch 5/20, Loss: 5318.5205
Epoch 6/20, Loss: 5254.1346
Epoch 7/20, Loss: 5215.4381
Epoch 8/20, Loss: 5202.4277
Epoch 9/20, Loss: 5213.3516
Epoch 10/20, Loss: 5182.5410
Epoch 11/20, Loss: 5168.2464
Epoch 12/20, Loss: 5107.3182
Epoch 13/20, Loss: 5112.1123
Epoch 14/20, Loss: 5123.1483
Epoch 15/20, Loss: 5097.3559
Epoch 16/20, Loss: 5083.6693
Epoch 17/20, Loss: 5071.6963
Epoch 18/20, Loss: 5102.3098
Epoch 19/20, Loss: 5041.2941
Epoch 20/20, Loss: 5048.3259
RMSE: 77.89128165470446
