In [1]:
# TFT 모델을 위한 데이터 전처리

In [58]:
# 1. 데이터 로드
import pandas as pd
import numpy as np
import os
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

train_df = pd.read_csv('../DATA/train.csv')
# train_df.head()

# 2. 데이터 전처리
# train : num_date_time,건물번호,일시,기온(°C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh)
#   - 1_20240601 00,1,20240601 00,18.3,0.0,2.6,82.0,0.0,0.0,5794.8
# test : num_date_time,건물번호,일시,기온(°C),강수량(mm),풍속(m/s),습도(%)
#   - 1_20240825 00,1,20240825 00,26.5,0.0,0.7,80.0
# target : 전력소비량(kWh)
train_df.set_index('num_date_time', inplace=True)
train_df.columns = ['building_num', 'datetime', 'temp', 'rain', 'wind', 'humid', 'sun', 'solar', 'power']

# 2.1 날짜시간 변환
# '20240601 00' 형식의 문자열을 시간 인덱스로 변환 (예: 20240601 00 → 2024060100 → int)
train_df['time_idx'] = train_df['datetime'].apply(lambda x: int(str(x).replace(' ', '')))
train_df['datetime'] = pd.to_datetime(train_df['datetime'])
train_df['month'] = train_df['datetime'].dt.month
train_df['day'] = train_df['datetime'].dt.day
train_df['hour'] = train_df['datetime'].dt.hour
train_df['weekday'] = train_df['datetime'].dt.weekday
train_df['is_weekend'] = (train_df['datetime'].dt.weekday >= 5).astype(int)
train_df['season'] = ((train_df['datetime'].dt.month % 12) // 3 + 1).astype(str)

train_df.head()

Unnamed: 0_level_0,building_num,datetime,temp,rain,wind,humid,sun,solar,power,time_idx,month,day,hour,weekday,is_weekend,season
num_date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1_20240601 00,1,2024-06-01 00:00:00,18.3,0.0,2.6,82.0,0.0,0.0,5794.8,2024060100,6,1,0,5,1,3
1_20240601 01,1,2024-06-01 01:00:00,18.3,0.0,2.7,82.0,0.0,0.0,5591.85,2024060101,6,1,1,5,1,3
1_20240601 02,1,2024-06-01 02:00:00,18.1,0.0,2.6,80.0,0.0,0.0,5338.17,2024060102,6,1,2,5,1,3
1_20240601 03,1,2024-06-01 03:00:00,18.0,0.0,2.6,81.0,0.0,0.0,4554.42,2024060103,6,1,3,5,1,3
1_20240601 04,1,2024-06-01 04:00:00,17.8,0.0,1.3,81.0,0.0,0.0,3602.25,2024060104,6,1,4,5,1,3


In [62]:
# 3. 결측치 처리
train_df.isnull().sum()

# 4. 저장
train_df.to_csv('../DATA/train_prep.csv', index=True)

In [61]:
# Test 데이터 전처리
# 1. 데이터 로드
test_df = pd.read_csv('../DATA/test.csv')

# 2. 데이터 전처리
# 2.1 컬럼명 변경
test_df.columns = ['num_date_time', 'building_num', 'datetime', 'temp', 'rain', 'wind', 'humid']

# 2.2 날짜시간 변환
test_df['time_idx'] = test_df['datetime'].apply(lambda x: int(str(x).replace(' ', '')))
test_df['datetime'] = pd.to_datetime(test_df['datetime'])

# 2.3 시간 특성 생성
test_df['month'] = test_df['datetime'].dt.month
test_df['day'] = test_df['datetime'].dt.day
test_df['hour'] = test_df['datetime'].dt.hour
test_df['weekday'] = test_df['datetime'].dt.weekday
test_df['is_weekend'] = (test_df['datetime'].dt.weekday >= 5).astype(int)
test_df['season'] = ((test_df['datetime'].dt.month % 12) // 3 + 1).astype(str)

# 3. 타겟 컬럼 추가
test_df['answer'] = 0

# 2.4 저장
test_df.to_csv('../DATA/test_prep.csv', index=True)
test_df.head()

Unnamed: 0,num_date_time,building_num,datetime,temp,rain,wind,humid,time_idx,month,day,hour,weekday,is_weekend,season,answer
0,1_20240825 00,1,2024-08-25 00:00:00,26.5,0.0,0.7,80.0,2024082500,8,25,0,6,1,3,0
1,1_20240825 01,1,2024-08-25 01:00:00,26.1,0.0,0.0,80.0,2024082501,8,25,1,6,1,3,0
2,1_20240825 02,1,2024-08-25 02:00:00,25.9,0.0,0.3,83.0,2024082502,8,25,2,6,1,3,0
3,1_20240825 03,1,2024-08-25 03:00:00,25.7,0.0,1.1,83.0,2024082503,8,25,3,6,1,3,0
4,1_20240825 04,1,2024-08-25 04:00:00,25.5,0.0,1.0,86.0,2024082504,8,25,4,6,1,3,0
