## Library version check

In [None]:
import sys
import sktime
import tqdm as tq
import lightgbm as lgb
import matplotlib
import seaborn as sns
import sklearn as skl
import pandas as pd
import numpy as np
import math

-------------------------- Python & library version --------------------------
Python version: 3.8.16 | packaged by conda-forge | (default, Feb  1 2023, 16:01:13) 
[Clang 14.0.6 ]
pandas version: 1.5.3
numpy version: 1.23.5
matplotlib version: 3.7.0
tqdm version: 4.64.1
sktime version: 0.16.1
lightgbm version: 4.3.0
seaborn version: 0.13.2
scikit-learn version: 1.2.2
------------------------------------------------------------------------------


In [2]:
import matplotlib.pyplot as plt
from tqdm import tqdm
from datetime import datetime
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.utils.plotting import plot_series
from lightgbm import LGBMRegressor

pd.set_option('display.max_columns', 30)

In [3]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
info = pd.read_csv('./data/building_info.csv')


#전력소비량 이상치 처리(건물번호를 기준)
for num in tqdm(range(train['건물번호'].nunique())):
    train.loc[train['건물번호'] == num+1, '전력소비량(kWh)'] = train.loc[train['건물번호'] == num+1, '전력소비량(kWh)'].clip(train.loc[train['건물번호'] == num+1, '전력소비량(kWh)'].quantile(.03), train.loc[train['건물번호'] == num+1, '전력소비량(kWh)'].quantile(.97))

info = info.iloc[:,:4]
train = train.merge(info,on='건물번호',how='left')
test = test.merge(info,on='건물번호',how='left')

## 변수들을 영문명으로 변경
cols = ['num_date_time', 'build_num', 'date_time', 'temp' ,'prec', 'wind', 'hum', 'isolation', 'sunshine', 'power','use', 'area_1', 'area_2']
train.columns = cols

def summer_cos(date):
    start_date = datetime.strptime("2024-06-01 00:00:00", "%Y-%m-%d %H:%M:%S")
    end_date = datetime.strptime("2024-09-14 00:00:00", "%Y-%m-%d %H:%M:%S")

    period = (end_date - start_date).total_seconds()

    return math.cos(2 * math.pi * (date - start_date).total_seconds() / period)

def summer_sin(date):
    start_date = datetime.strptime("2024-06-01 00:00:00", "%Y-%m-%d %H:%M:%S")
    end_date = datetime.strptime("2024-09-14 00:00:00", "%Y-%m-%d %H:%M:%S")

    period = (end_date - start_date).total_seconds()

    return math.sin(2 * math.pi * (date - start_date).total_seconds() / period)

# 시간 관련 변수들 생성
date = pd.to_datetime(train.date_time)
train['date_time'] = pd.to_datetime(train['date_time'])
train['date'] = date.dt.date
train['hour'] = date.dt.hour
train['day'] = date.dt.day
train['weekday'] = date.dt.weekday
train['month'] = date.dt.month

train.loc[13238:13826, 'power'] += 3500
train.loc[19161:20343, 'power'] -= 4000
train = train.drop(index=range(114240, 114408))
train = train.reset_index(drop=True)

100%|██████████| 100/100 [00:00<00:00, 461.37it/s]


In [4]:
def batch_interpolate_building_power(df, targets, target_col='power', method='time'):
    """
    여러 건물/시점(단일 or 구간)에 대해 시간 기반 보간을 한 번에 적용하는 함수.

    Parameters:
        df (pd.DataFrame): 전체 데이터프레임
        targets (list of tuples): 
            [(build_num, start_time), (build_num, start_time, end_time), ...] 형태의 리스트
            - end_time이 없으면 단일 시점 처리
        target_col (str): 보간할 컬럼명
        method (str): pandas.interpolate method

    Returns:
        pd.DataFrame: 보간이 적용된 원본 데이터프레임
    """
    for item in targets:
        # 튜플 길이에 따라 단일 시점/구간 처리
        if len(item) == 2:
            build_num, start_time = item
            end_time = None
        elif len(item) == 3:
            build_num, start_time, end_time = item
        else:
            raise ValueError("targets는 (build_num, start_time) 또는 (build_num, start_time, end_time) 형식이어야 합니다.")

        # 대상 건물 데이터 추출
        building = df[df['build_num'] == build_num].sort_values('date_time').copy()

        # 결측 처리
        if end_time is None:
            mask_missing = (building['date_time'] == pd.Timestamp(start_time))
        else:
            mask_missing = (
                (building['date_time'] >= pd.Timestamp(start_time)) &
                (building['date_time'] <= pd.Timestamp(end_time))
            )
        building.loc[mask_missing, target_col] = np.nan

        # 시간 기반 보간
        building.set_index('date_time', inplace=True)
        building[target_col] = building[target_col].interpolate(method=method)
        building.reset_index(inplace=True)

        # 원본 반영
        df.loc[df['build_num'] == build_num, target_col] = building[target_col].values

    return df

targets = [
    (3, '2024-07-17 14:00'),
    (7, '2024-08-06 03:00'),
    (18, '2024-07-17 14:00'),
    (30, '2024-07-13 20:00'),
    (30, '2024-07-25 00:00'),
    (42, '2024-07-17 14:00'),
    (47, '2024-07-17 14:00'),
    (55, '2024-07-17 14:00'),
    (76, '2024-08-22 21:00'),
    (81, '2024-06-27 14:00'),
    (81, '2024-07-17 14:00'),
    (82, '2024-07-17 14:00'),
    (83, '2024-07-17 14:00'), 
    (5, '2024-08-04 06:00', '2024-08-04 08:00'), 
    (18, '2024-06-11 17:00', '2024-06-11 18:00'), 
    (18, '2024-08-08 15:00', '2024-08-08 16:00'), 
    (28, '2024-07-17 14:00', '2024-07-17 15:00'), 
    (38, '2024-07-17 14:00', '2024-07-17 15:00'), 
    (41, '2024-07-17 09:00', '2024-07-17 15:00'), 
    (60, '2024-07-17 14:00', '2024-07-17 15:00'), 
    (62, '2024-07-17 13:00', '2024-07-17 15:00'), 
    (69, '2024-07-17 14:00', '2024-07-17 15:00'),  
    (76, '2024-06-20 12:00', '2024-06-20 16:00'),  
    (78, '2024-07-17 13:00', '2024-07-17 14:00'),

    # (81, '2024-07-25 13:00', '2024-07-25 17:00'), 
    # (81, '2024-07-26 13:00', '2024-07-26 17:00'), 
    # (81, '2024-07-29 13:00', '2024-07-29 17:00'), 
    # (81, '2024-07-30 13:00', '2024-07-30 17:00'), 
    # (81, '2024-08-01 13:00', '2024-08-01 17:00'), 
    # (81, '2024-08-02 13:00', '2024-08-02 17:00'), 
    # (81, '2024-08-05 13:00', '2024-08-05 17:00'), 
    # (81, '2024-08-06 13:00', '2024-08-06 16:00'), 
    # (81, '2024-08-07 13:00', '2024-08-07 17:00'), 
    # (81, '2024-08-09 13:00', '2024-08-09 17:00'), 

    # (81, '2024-08-12 10:00', '2024-08-12 15:00'), 
    # (81, '2024-08-13 13:00', '2024-08-13 17:00'), 
    # (81, '2024-08-14 10:00', '2024-08-14 17:00'), 
    # (81, '2024-08-16 10:00', '2024-08-16 17:00'), 
    # (81, '2024-08-19 10:00', '2024-08-19 17:00'), 
    # (81, '2024-08-23 11:00', '2024-08-23 16:00'), 

    (89, '2024-07-12 08:00', '2024-07-12 10:00'), 
    (97, '2024-07-17 13:00', '2024-07-17 15:00'), 
]

train = batch_interpolate_building_power(train, targets)

In [5]:
# 교체할 건물과 기간
build_num = 17
target_start = pd.Timestamp("2024-06-25 15:00")
target_end = pd.Timestamp("2024-06-26 09:00")

# 복사할 원본 패턴 (전 주 동일 구간)
source_start = target_start - pd.Timedelta(days=7)
source_end = target_end - pd.Timedelta(days=7)

# 마스크
mask_source = (
    (train['build_num'] == build_num) &
    (train['date_time'] >= source_start) &
    (train['date_time'] <= source_end)
)
mask_target = (
    (train['build_num'] == build_num) &
    (train['date_time'] >= target_start) &
    (train['date_time'] <= target_end)
)

# 패턴 추출
pattern_data = train.loc[mask_source].sort_values('date_time')['power'].values
target_indices = train.loc[mask_target].sort_values('date_time').index

# 길이 맞춰서 덮어쓰기
length = min(len(pattern_data), len(target_indices))
train.loc[target_indices[:length], 'power'] = pattern_data[:length]

In [6]:
def batch_apply_pattern_scaling(df, tasks, target_col='power'):
    """
    여러 패턴 복사 작업을 한 번에 처리.
    Parameters:
        df (pd.DataFrame): 전체 데이터프레임
        tasks (list of tuples): 
            [
                (build_num, source_start, source_end, value_start, value_end, target_start, target_end),
                ...
            ]
        target_col (str): 수정할 컬럼명
    Returns:
        pd.DataFrame: 수정된 데이터프레임
    """
    for build_num, source_start, source_end, value_start_time, value_end_time, target_start, target_end in tasks:
        # 1. 원본 패턴 추출
        pattern_mask = (
            (df['build_num'] == build_num) &
            (df['date_time'] >= pd.Timestamp(source_start)) &
            (df['date_time'] <= pd.Timestamp(source_end))
        )
        P_source = df.loc[pattern_mask].sort_values('date_time')[target_col].values
        if len(P_source) == 0:
            continue  # 패턴 없으면 스킵

        # 2. 시작/종료 값
        V_start = df.loc[
            (df['build_num'] == build_num) & (df['date_time'] == pd.Timestamp(value_start_time)),
            target_col
        ].values[0]
        V_end = df.loc[
            (df['build_num'] == build_num) & (df['date_time'] == pd.Timestamp(value_end_time)),
            target_col
        ].values[0]

        # 3. 정규화 및 스케일링
        P_min, P_max = P_source.min(), P_source.max()
        P_scaled = (P_source - P_min) / (P_max - P_min + 1e-8)
        P_target = V_start + (V_end - V_start) * P_scaled

        # 4. 대상 구간 인덱스
        target_mask = (
            (df['build_num'] == build_num) &
            (df['date_time'] >= pd.Timestamp(target_start)) &
            (df['date_time'] <= pd.Timestamp(target_end))
        )
        target_indices = df.loc[target_mask].sort_values('date_time').index

        # 5. 길이 맞춰 삽입
        length = min(len(P_target), len(target_indices))
        df.loc[target_indices[:length], target_col] = P_target[:length]

    return df

tasks = [
    (7, '2024-06-30 10:00', '2024-07-01 11:00', '2024-07-07 09:00', '2024-07-08 12:00', '2024-07-07 10:00', '2024-07-08 11:00'),
    (7, '2024-07-05 14:00', '2024-07-05 23:00', '2024-07-12 13:00', '2024-07-13 00:00', '2024-07-12 14:00', '2024-07-12 23:00'),
    # (17, '2024-06-18 15:00', '2024-06-19 09:00', '2024-06-25 14:00', '2024-06-26 10:00', '2024-06-25 15:00', '2024-06-26 09:00')
]

train = batch_apply_pattern_scaling(train, tasks)

In [7]:
def batch_fill_hourly_means(df, tasks, target_col='power'):
    """
    여러 건물/기간/시간대 평균을 다른 날짜로 삽입하는 배치 함수.

    Parameters:
        df (pd.DataFrame): 전체 데이터프레임
        tasks (list of tuples): 
            [
                (build_num, source_dates, source_hours, target_date),
                ...
            ]
            - source_dates: ('start_date','end_date') or ['date1','date2',...]
            - source_hours: [hour1, hour2, ...]
            - target_date: 단일 날짜
        target_col (str): 수정할 컬럼명 (기본 'power')
    Returns:
        pd.DataFrame: 수정된 데이터프레임
    """
    for build_num, source_dates, source_hours, target_date in tasks:
        building = df[df['build_num'] == build_num].copy()

        # 날짜 마스크 생성
        if isinstance(source_dates, (tuple, list)) and len(source_dates) == 2 and not isinstance(source_dates[0], (pd.Timestamp, str)):
            # 범위일 경우
            start_date, end_date = pd.to_datetime(source_dates[0]).date(), pd.to_datetime(source_dates[1]).date()
            mask_range = (
                (building['date_time'].dt.date >= start_date) &
                (building['date_time'].dt.date <= end_date) &
                (building['date_time'].dt.hour.isin(source_hours))
            )
        else:
            # 날짜 리스트일 경우
            date_list = [pd.to_datetime(d).date() for d in source_dates]
            mask_range = (
                (building['date_time'].dt.date.isin(date_list)) &
                (building['date_time'].dt.hour.isin(source_hours))
            )

        # 시간별 평균 계산
        hourly_means = (
            building[mask_range]
            .groupby(building['date_time'].dt.hour)[target_col]
            .mean()
            .to_dict()
        )

        # 타겟 날짜에 삽입
        for hour, mean_val in hourly_means.items():
            mask_fill = (
                (df['build_num'] == build_num) &
                (df['date_time'].dt.date == pd.to_datetime(target_date).date()) &
                (df['date_time'].dt.hour == hour)
            )
            df.loc[mask_fill, target_col] = mean_val

    return df

tasks = [
    (67, ('2024-06-03', '2024-06-07'), [16, 17, 18], '2024-06-10'),          # 6/3~6/7 오후 4~6시 평균 → 6/10
    (67, ('2024-07-29', '2024-07-31'), [15, 16], '2024-08-01'),              # 7/29~7/31 오후 3~4시 평균 → 8/1
    (67, ['2024-08-13', '2024-08-14', '2024-08-16'], [16, 17], '2024-08-12'), # 8/13,14,16 오후 4~5시 평균 → 8/12
    (80, ('2024-07-01', '2024-07-05'), [11,12,13,14,19,20], '2024-07-08')    # 7/1~7/5 11~14시,19~20시 평균 → 7/8
]

train = batch_fill_hourly_means(train, tasks)

In [8]:
def scale_power_segments(train: pd.DataFrame, segments: list):
    """
    주어진 구간의 power 값을 앞뒤 하루 같은 시간대 min-max 스케일로 보정.
    보정된 값은 train['power']에 덮어씌움.

    Parameters:
        train (pd.DataFrame): 'build_num', 'date_time', 'power' 컬럼 포함 데이터프레임
        segments (list): [(build_num, start_datetime, end_datetime), ...] 형식의 튜플 리스트
    Returns:
        pd.DataFrame: 보정된 train 데이터프레임
    """
    train['date_time'] = pd.to_datetime(train['date_time'])
    
    for build_num, start_str, end_str in segments:
        target_start = pd.Timestamp(start_str)
        target_end = pd.Timestamp(end_str)

        # 대상 건물 데이터
        building_data = train[train['build_num'] == build_num].sort_values(by='date_time')
        target_mask = (building_data['date_time'] >= target_start) & (building_data['date_time'] <= target_end)
        
        # 참조 구간: 앞뒤 하루 동일 시간대
        ref_mask = (
            ((building_data['date_time'] >= target_start - pd.Timedelta(days=1)) & (building_data['date_time'] <= target_end - pd.Timedelta(days=1))) |
            ((building_data['date_time'] >= target_start + pd.Timedelta(days=1)) & (building_data['date_time'] <= target_end + pd.Timedelta(days=1)))
        )
        ref_data = building_data.loc[ref_mask, 'power']
        if ref_data.empty:
            continue  # 참조 데이터가 없으면 스킵

        ref_min, ref_max = ref_data.min(), ref_data.max()
        target_data = building_data.loc[target_mask, 'power']
        if target_data.empty or target_data.max() == target_data.min():
            continue  # 대상 데이터가 없거나 변동이 없으면 스킵

        # 스케일 조정
        scaled = (target_data - target_data.min()) / (target_data.max() - target_data.min())  # 0~1 정규화
        scaled = scaled * (ref_max - ref_min) + ref_min

        # train에 덮어쓰기
        train.loc[target_mask & (train['build_num'] == build_num), 'power'] = scaled

    return train

segments = [
    (30, '2024-06-20 06:00', '2024-06-20 23:00'),
    (30, '2024-07-06 06:00', '2024-07-06 23:00'),
]

train = scale_power_segments(train, segments)

In [9]:
def fill_with_weekly_pattern(train: pd.DataFrame, build_num: int, start_str: str, end_str: str):
    """
    특정 구간을 앞주+다음주 동일 시각 데이터 평균으로 채우고 train['power']에 덮어씀.
    """
    train['date_time'] = pd.to_datetime(train['date_time'])
    building_data = train[train['build_num'] == build_num].sort_values(by='date_time')

    # 대상 구간
    target_start = pd.Timestamp(start_str)
    target_end = pd.Timestamp(end_str)
    target_mask = (building_data['date_time'] >= target_start) & (building_data['date_time'] <= target_end)
    target_range = building_data.loc[target_mask, ['date_time']].copy()
    if target_range.empty:
        print(f"⚠️ 대상 구간({start_str}~{end_str}) 데이터 없음")
        return train

    # 앞주 & 다음주 동일 시각 데이터 가져오기
    week_offset = pd.Timedelta(days=7)
    ref1 = building_data.set_index('date_time').loc[target_start - week_offset : target_end - week_offset, ['power']].reset_index()
    ref2 = building_data.set_index('date_time').loc[target_start + week_offset : target_end + week_offset, ['power']].reset_index()

    # 두 주 패턴 align (길이가 다를 경우 보정)
    if len(ref1) != len(target_range):
        ref1 = ref1.reindex(range(len(target_range)), method='nearest')
    if len(ref2) != len(target_range):
        ref2 = ref2.reindex(range(len(target_range)), method='nearest')

    # 두 주 평균 패턴 생성
    ref_mean = (ref1['power'].values + ref2['power'].values) / 2
    target_range['power_filled'] = ref_mean

    # 덮어쓰기
    for idx, row in target_range.iterrows():
        train.loc[
            (train['build_num'] == build_num) & (train['date_time'] == row['date_time']),
            'power'
        ] = row['power_filled']

    return train

# 7월 20일 02시 ~ 7월 22일 10시, 건물 49
train = fill_with_weekly_pattern(
    train, 
    build_num=43, 
    start_str="2024-07-20 02:00", 
    end_str="2024-07-22 10:00"
)

train = fill_with_weekly_pattern(
    train, 
    build_num=53, 
    start_str="2024-06-14 16:00", 
    end_str="2024-06-17 09:00"
)

train = fill_with_weekly_pattern(
    train, 
    build_num=67, 
    start_str="2024-07-27 00:00", 
    end_str="2024-07-28 00:00"
)

train = fill_with_weekly_pattern(
    train, 
    build_num=94, 
    start_str="2024-07-27 00:00", 
    end_str="2024-07-28 00:00"
)

In [10]:
def add_value_to_segment(train: pd.DataFrame, build_num: int, start_str: str, end_str: str, add_value: float):
    """
    특정 건물의 지정 구간에 일정 값을 더해 train['power']에 덮어씀.
    """
    train['date_time'] = pd.to_datetime(train['date_time'])
    target_start = pd.Timestamp(start_str)
    target_end = pd.Timestamp(end_str)

    mask = (
        (train['build_num'] == build_num) &
        (train['date_time'] >= target_start) &
        (train['date_time'] <= target_end)
    )

    train.loc[mask, 'power'] = train.loc[mask, 'power'] + add_value
    return train

train = add_value_to_segment(
    train,
    build_num=53,
    start_str="2024-08-18 16:00",
    end_str="2024-08-19 07:00",
    add_value=400
)

train = add_value_to_segment(
    train,
    build_num=67,
    start_str="2024-06-01 00:00",
    end_str="2024-06-03 09:00",
    add_value=780
)

# train = add_value_to_segment(
#     train,
#     build_num=10,
#     start_str="2024-06-01 00:00",
#     end_str="2024-07-04 07:00",
#     add_value=900
# )

In [11]:
def fill_with_prev_next_day_avg(train: pd.DataFrame, build_num: int, start_str: str, end_str: str):
    """
    특정 건물의 지정 구간을 하루 전/하루 뒤 동일 시간대의 평균 값으로 채움.
    train['power']에 덮어씀.
    """
    train['date_time'] = pd.to_datetime(train['date_time'])
    building_data = train[train['build_num'] == build_num].sort_values(by='date_time')

    # 대상 구간
    target_start = pd.Timestamp(start_str)
    target_end = pd.Timestamp(end_str)
    target_mask = (building_data['date_time'] >= target_start) & (building_data['date_time'] <= target_end)
    target_times = building_data.loc[target_mask, 'date_time']
    if target_times.empty:
        print(f"⚠️ 대상 구간({start_str}~{end_str}) 데이터 없음")
        return train

    # 하루 전/하루 뒤 동일 시간대 구간
    prev_day_mask = (building_data['date_time'] >= target_start - pd.Timedelta(days=1)) & (building_data['date_time'] <= target_end - pd.Timedelta(days=1))
    next_day_mask = (building_data['date_time'] >= target_start + pd.Timedelta(days=1)) & (building_data['date_time'] <= target_end + pd.Timedelta(days=1))
    prev_data = building_data.loc[prev_day_mask, ['date_time', 'power']]
    next_data = building_data.loc[next_day_mask, ['date_time', 'power']]

    if prev_data.empty and next_data.empty:
        print(f"⚠️ 참조 데이터 없음({start_str}~{end_str})")
        return train

    # 평균 패턴 계산 (있으면 합쳐서 평균)
    ref_values = []
    if not prev_data.empty:
        ref_values.append(prev_data['power'].values)
    if not next_data.empty:
        ref_values.append(next_data['power'].values)
    ref_mean = sum(ref_values) / len(ref_values)  # 두 날짜 평균

    # 길이가 다르면 맞춰서 채움
    ref_mean_series = pd.Series(ref_mean)
    ref_mean_series = ref_mean_series.reindex(range(len(target_times)), method='nearest')

    # 덮어쓰기
    train.loc[
        (train['build_num'] == build_num) & (train['date_time'] >= target_start) & (train['date_time'] <= target_end),
        'power'
    ] = ref_mean_series.values

    return train

train = fill_with_prev_next_day_avg(
    train,
    build_num=70,
    start_str="2024-06-04 09:00",
    end_str="2024-06-05 09:00"
)

In [12]:
def fill_power_with_holiday_pattern(train, build_num, date_ranges):
    """
    특정 건물의 지정 날짜 구간 전력 사용량을
    전후 holiday 패턴(같은 시간대 평균)으로 대체하는 함수.

    Parameters:
        train (pd.DataFrame): 전체 데이터프레임
        build_num (int): 건물 번호
        date_ranges (list of tuples): [(start_date, end_date), ...] 형식의 구간 리스트 (문자열 or Timestamp)
    Returns:
        pd.DataFrame: power가 덮어씌워진 원본 train DataFrame
    """
    # 데이터 정렬 및 시간 변환
    train['date_time'] = pd.to_datetime(train['date_time'])
    building_data = train[train['build_num'] == build_num].sort_values(by='date_time').copy()
    building_data['hour'] = building_data['date_time'].dt.hour

    # holiday 날짜
    holiday_dates = building_data[building_data['holiday'] == 1]['date_time']

    for start_date, end_date in date_ranges:
        target_start = pd.Timestamp(start_date)
        target_end = pd.Timestamp(end_date)

        # 전후 holiday 추출
        prev_holiday = holiday_dates[holiday_dates < target_start].max()
        next_holiday = holiday_dates[holiday_dates > target_end].min()
        if pd.isna(prev_holiday) or pd.isna(next_holiday):
            continue  # holiday 없으면 skip

        # 전후 holiday 패턴
        prev_pattern = building_data[building_data['date_time'].dt.date == prev_holiday.date()]
        next_pattern = building_data[building_data['date_time'].dt.date == next_holiday.date()]
        holiday_pattern = (prev_pattern.groupby('hour')['power'].mean() +
                           next_pattern.groupby('hour')['power'].mean()) / 2

        # 대체
        target_mask = (building_data['date_time'] >= target_start) & (building_data['date_time'] <= target_end)
        building_data.loc[target_mask, 'power'] = building_data.loc[target_mask].apply(
            lambda row: holiday_pattern.loc[row['hour']] if row['hour'] in holiday_pattern.index else row['power'],
            axis=1
        )

    # train에 반영
    train.loc[building_data.index, 'power'] = building_data['power']
    return train

In [13]:
import pandas as pd

def copy_pattern_by_days(
    df,
    build_num,
    target_start,
    target_end,
    offset_days,          # 예: -7(이전 주), +7(다음 주), +3(3일 뒤) 등
    col='power',
    dt_col='date_time',
    inplace=False
):
    """
    [target_start ~ target_end] 구간의 값을
    (offset_days 만큼 이동한 구간)의 패턴으로 덮어쓰기.
    """
    _df = df if inplace else df.copy()

    ts, te = pd.to_datetime(target_start), pd.to_datetime(target_end)
    ss, se = ts + pd.Timedelta(days=offset_days), te + pd.Timedelta(days=offset_days)

    m_src = (_df['build_num'] == build_num) & (_df[dt_col] >= ss) & (_df[dt_col] <= se)
    m_tgt = (_df['build_num'] == build_num) & (_df[dt_col] >= ts) & (_df[dt_col] <= te)

    src_vals = _df.loc[m_src].sort_values(dt_col)[col].values
    tgt_idx  = _df.loc[m_tgt].sort_values(dt_col).index

    if len(src_vals) == 0 or len(tgt_idx) == 0:
        return _df  # 소스/타겟이 없으면 그대로 반환

    n = min(len(src_vals), len(tgt_idx))
    _df.loc[tgt_idx[:n], col] = src_vals[:n]
    return _df


def batch_copy_patterns_by_days(
    df,
    jobs,                 # [(build_num, t_start, t_end, offset_days), ...] 또는 dict 리스트
    col='power',
    dt_col='date_time',
    inplace=False,
    verbose=False
):
    """
    여러 건을 한 번에 처리하는 배치 함수.
    jobs 원소 형태:
      - 튜플: (build_num, target_start, target_end, offset_days)
      - 딕셔너리: {
            "build_num": ...,
            "target_start": ...,
            "target_end": ...,
            # 아래 중 하나
            "offset_days": ...,
            "week_offset": ...  # 있으면 7*week_offset으로 변환
        }
      ※ offset_days가 있으면 week_offset보다 우선
    """
    def _parse(job):
        if isinstance(job, (list, tuple)) and len(job) == 4:
            b, ts, te, od = job
            return b, ts, te, od
        if isinstance(job, dict):
            b  = job['build_num']
            ts = job['target_start']
            te = job['target_end']
            if 'offset_days' in job:
                od = job['offset_days']
            elif 'week_offset' in job:
                od = 7 * job['week_offset']
            else:
                raise ValueError("dict job에는 'offset_days' 또는 'week_offset' 중 하나가 필요합니다.")
            return b, ts, te, od
        raise ValueError("jobs 항목은 (build_num, start, end, offset_days) 튜플 또는 해당 키를 가진 dict여야 합니다.")

    _df = df if inplace else df.copy()

    for job in jobs:
        b, ts, te, od = _parse(job)
        if verbose:
            print(f"[batch] build_num={b}, target=({ts}~{te}), offset_days={od}")
        _df = copy_pattern_by_days(
            _df, b, ts, te, od, col=col, dt_col=dt_col, inplace=True
        )
    return _df


jobs = [
    (5, "2024-08-04 00:00", "2024-08-04 23:00", -7),
    (6, "2024-08-15 00:00", "2024-08-15 23:00", -4),
    (6, "2024-08-16 00:00", "2024-08-16 23:00", -7),
    (6, "2024-08-17 00:00", "2024-08-17 23:00", -7),
    (6, "2024-08-18 00:00", "2024-08-18 23:00", -7),
    (7, "2024-07-07 10:00", "2024-07-08 11:00", -7),
    (8,  "2024-07-21 08:00", "2024-07-21 11:00", -7),
    (8,  "2024-08-24 00:00", "2024-08-24 23:00", -7),
    (12, "2024-07-21 00:00", "2024-07-21 23:00", +7),
    (12, "2024-08-24 00:00", "2024-08-24 23:00", -7),
    (17, "2024-06-25 15:00", "2024-06-26 09:00", -7),
    (20, "2024-06-01 00:00", "2024-06-01 23:00", +7),
    (25, "2024-07-04 12:00", "2024-07-04 14:00", +7),
    (26, "2024-06-17 14:00", "2024-06-18 11:00", -7),
    (29, "2024-06-15 22:00", "2024-06-15 23:00", -7),
    (29, "2024-06-27 00:00", "2024-06-27 01:00", -7),
    (30, "2024-08-04 00:00", "2024-08-04 23:00", -1),
    (30, "2024-08-05 00:00", "2024-08-05 23:00", -1),
    (30, "2024-08-07 00:00", "2024-08-07 23:00", -1),
    (40, "2024-07-14 00:00", "2024-07-14 01:00", -1),
    (41, "2024-06-22 01:00", "2024-06-22 04:00", -7),
    (41, "2024-07-17 00:00", "2024-07-17 23:00", -7),
    (42, "2024-07-17 00:00", "2024-07-17 23:00", -1),
    (43, "2024-06-10 17:00", "2024-06-10 18:00", -7),
    (43, "2024-08-12 16:00", "2024-08-12 17:00", -7),
    (43, "2024-07-20 00:00", "2024-07-21 23:00", -7)
]

train = batch_copy_patterns_by_days(train, jobs, col='power', dt_col='date_time', inplace=False, verbose=False)

In [14]:
isolation = pd.pivot_table(train, values = 'isolation', index = ['build_num', 'hour', 'month'], aggfunc = np.mean).reset_index()
sunshine = pd.pivot_table(train, values = 'sunshine', index = ['build_num', 'hour', 'month'], aggfunc = np.mean).reset_index()

## 공휴일 변수 추가
train['holiday'] = train.apply(lambda x : 0 if x['weekday']<5 else 1, axis = 1)
train.loc[('20240606'<=train.date_time)&(train.date_time<'20240607'),'holiday'] = 1
train.loc[('20240815'<=train.date_time)&(train.date_time<'20240816'),'holiday'] = 1

# 규칙 정의 함수
def apply_holiday_rules(row):
    bn = row['build_num']
    wd = row['weekday']
    day = row['day']
    week = (row['day'] - 1) // 7 + 1  # 몇째 주인지 계산

    # 📌 개별 규칙 적용
    if bn == 2:   # 상용: 토요일 쉼 → holiday = 1 if 토요일 else 0
        return 1 if wd == 5 else 0
    elif bn == 7:   # 건물기타: 일요일 쉼
        return 1 if wd == 6 else 0
    elif bn == 18:  # 백화점: 일요일 쉼
        return 1 if wd == 6 else 0
    # elif bn == 19:  # 백화점: 둘째주 월요일 쉼
    #     return 1 if wd == 0 and week == 2 else 0
    elif bn == 25:  # 아파트: 토요일에 적게 씀 (평일로 간주)
        return 0
    # elif bn == 26:  # 건물기타: 주말에 더 씀 → 평일로 간주
    #     return 0 if wd in [5, 6] else 1
    elif bn == 26:  # 건물기타: 주말에 더 씀 → 평일로 간주
        return 0
    elif bn == 27:  # 백화점: 둘째, 넷째주 일요일 쉼
        return 1 if wd == 6 and week in [2, 4] else 0
    elif bn == 29:  # 백화점: 매달 10일 쉼
        return 1 if day == 10 else 0
    elif bn == 31:  # 아파트: 휴일 없음 → 평일 취급
        return 0
    elif bn == 32:  # 백화점: 둘째, 넷째주 월요일 쉼
        return 1 if wd == 0 and week in [2, 4] else 0
    elif bn == 34:  # 백화점: 휴일 없음 → 평일 취급
        return 0
    elif bn == 35:  # 전화국: 휴일 없음 -> 평일 취급
        return 0
    elif bn == 36:  # 전화국: 휴일 없음 -> 평일 취급
        return 1 if wd in [5, 6] else 0
    elif bn == 40:  # 백화점: 둘째, 넷째주 월요일 쉼
        return 1 if wd == 6 and week in [2, 4] else 0
    elif bn == 41:
        return 0
    # elif bn == 45:
    #     return 1 if day == 10 else 0
    elif bn == 54:
        return 0
    elif bn == 57:
        return 0
    elif bn == 58:
        return 0
    elif bn == 59:
        return 1 if wd == 6 and week in [2, 4] else 0
    elif bn == 61:
        return 0
    elif bn == 63:
        return 1 if wd == 6 and week in [2, 4] else 0
    elif bn in [97]:  # 토요일쉼
        return 1 if wd == 5 else 0
    elif bn in [1,4,9,10,11,19]:
        return 0
    elif bn in [10,28,30,33,45,65,70,71,73,74,76,77,78,79,82,84,85,88,89,91,92,93,95,96,98,99,100]:
        return 0
    else:
        # 기본 규칙 유지
        return row['holiday']

# 규칙 적용
train['holiday'] = train.apply(apply_holiday_rules, axis=1)

single_day_holidays = [
    (19, '2024-06-10'),
    (19, '2024-07-08'),
    (19, '2024-08-19'),
    (23, '2024-06-07'),
    (23, '2024-08-16'),
    (29, '2024-06-23'),
    (29, '2024-07-28'),
    (45, '2024-06-10'),
    (45, '2024-07-08'),
    (45, '2024-08-19'),
    (49, '2024-08-22'),
    (54, '2024-06-17'),
    (54, '2024-07-01'),
    (54, '2024-08-19'),
    (56, '2024-06-07'),
    (56, '2024-08-16'),
    (67, '2024-07-26'),
    (74, '2024-06-17'),
    (74, '2024-07-01'),
    (79, '2024-06-17'),
    (79, '2024-07-01'),
    (79, '2024-08-19'),
    (94, '2024-06-07'),
    (94, '2024-08-16'),
    (95, '2024-07-08'),
    (95, '2024-08-05'),
]

for build_num, date_str in single_day_holidays:
    target_date = pd.to_datetime(date_str).date()  # 날짜만 비교
    train.loc[
        (train['build_num'] == build_num) &
        (train['date_time'].dt.date == target_date),
        'holiday'
    ] = 1

single_day_no_holiday = [
    (67, '2024-06-06')
]

for build_num, date_str in single_day_no_holiday:
    target_date = pd.to_datetime(date_str).date()  # 날짜만 비교
    train.loc[
        (train['build_num'] == build_num) &
        (train['date_time'].dt.date == target_date),
        'holiday'
    ] = 0

date_ranges = [
    ('2024-07-26 00:00:00', '2024-07-26 23:59:59'),
]
train = fill_power_with_holiday_pattern(train, build_num=67, date_ranges=date_ranges)

## 건물별, 요일별, 시간별 전력소비량 평균
power_mean_1 = pd.pivot_table(train, values = 'power', index = ['build_num', 'hour', 'weekday'], aggfunc = np.mean).reset_index()
tqdm.pandas()
train['target_mean_1'] = train.progress_apply(lambda x : power_mean_1.loc[(power_mean_1.build_num == x['build_num']) & (power_mean_1.hour == x['hour']) & (power_mean_1.weekday == x['weekday']) ,'power'].values[0], axis = 1)

## 건물별, 요일별, 시간별 전력소비량 표준편차
power_std_1 = pd.pivot_table(train, values = 'power', index = ['build_num', 'hour', 'weekday'], aggfunc = np.std).reset_index()
tqdm.pandas()
train['target_std_1'] = train.progress_apply(lambda x : power_std_1.loc[(power_std_1.build_num == x['build_num']) & (power_std_1.hour == x['hour']) & (power_std_1.weekday == x['weekday']) ,'power'].values[0], axis = 1)

## 건물별, 요일별, 시간별 전력소비량 표준편차
power_min_1 = pd.pivot_table(train, values = 'power', index = ['build_num', 'hour', 'weekday'], aggfunc = np.min).reset_index()
tqdm.pandas()
train['target_min_1'] = train.progress_apply(lambda x : power_min_1.loc[(power_min_1.build_num == x['build_num']) & (power_min_1.hour == x['hour']) & (power_min_1.weekday == x['weekday']) ,'power'].values[0], axis = 1)

## 건물별, 요일별, 시간별 전력소비량 표준편차
power_max_1 = pd.pivot_table(train, values = 'power', index = ['build_num', 'hour', 'weekday'], aggfunc = np.max).reset_index()
tqdm.pandas()
train['target_max_1'] = train.progress_apply(lambda x : power_max_1.loc[(power_max_1.build_num == x['build_num']) & (power_max_1.hour == x['hour']) & (power_max_1.weekday == x['weekday']) ,'power'].values[0], axis = 1)

## https://dacon.io/competitions/official/235680/codeshare/2366?page=1&dtype=recent
train['sin_hour'] = np.sin(2*np.pi*train.hour/24)
train['cos_hour'] = np.cos(2*np.pi*train.hour/24)
train['sin_date'] = -np.sin(2 * np.pi * (train['month']+train['day']/31)/12)
train['cos_date'] = -np.cos(2 * np.pi * (train['month']+train['day']/31)/12)
train['sin_weekday'] = -np.sin(2 * np.pi * (train['weekday']+1)/7.0)
train['cos_weekday'] = -np.cos(2 * np.pi * (train['weekday']+1)/7.0)

#summer_sin, cos
train['summer_sin'] = train['date_time'].apply(summer_sin)
train['summer_cos'] = train['date_time'].apply(summer_cos)

## 화씨 온도
train['temp_F'] = (train['temp'] * 9/5) + 32

## 체감 온도
train['temp2'] = 13.12 + 0.6215*train['temp'] - 11.37*(train['wind']*3.6)**0.16 + 0.3965*(train['wind']*3.6)**0.16*train['temp']

## https://dacon.io/competitions/official/235736/codeshare/2743?page=1&dtype=recent
train['THI'] = 9/5*train['temp'] - 0.55*(1-train['hum']/100)*(9/5*train['hum']-26)+32
train['WC']=13.12+0.6215*train['temp']-13.947*train['wind']**0.16+0.486*train['temp']*train['wind']**0.16

train['is_rain'] = (train['prec'] > 0).astype(int)
train['log_temp'] = np.log1p(train['temp'])
train['wind_power'] = train['wind'] ** 2
train['dew_point'] = train['temp'] - (100 - train['hum']) / 5
train['solar_per_hour'] = train['isolation'] / (train['sunshine'] + 1e-3)
def CDH(xs):
    ys = []
    for i in range(len(xs)):
        if i < 11:
            ys.append(np.sum(xs[:(i+1)]-26))
        else:
            ys.append(np.sum(xs[(i-11):(i+1)]-26))
    return np.array(ys)

cdhs = np.array([])
for num in range(1,101,1):
    temp = train[train['build_num'] == num]
    cdh = CDH(temp['temp'].values)
    cdhs = np.concatenate([cdhs, cdh])
train['CDH'] = cdhs

## min temperature
train = train.merge(train.groupby(['build_num','date'])['temp'].min().reset_index().rename(columns = {'temp':'min_temp'}), on = ['build_num','date'], how = 'left')

## max temperature
train = train.merge(train.groupby(['build_num','date'])['temp'].max().reset_index().rename(columns = {'temp':'max_temp'}), on = ['build_num','date'], how = 'left')

## min windspeed
train = train.merge(train.groupby(['build_num','date'])['wind'].min().reset_index().rename(columns = {'wind':'min_wind'}), on = ['build_num','date'], how = 'left')

## max windspeed
train = train.merge(train.groupby(['build_num','date'])['wind'].max().reset_index().rename(columns = {'wind':'max_wind'}), on = ['build_num','date'], how = 'left')

## min humidity
train = train.merge(train.groupby(['build_num','date'])['hum'].min().reset_index().rename(columns = {'hum':'min_hum'}), on = ['build_num','date'], how = 'left')

## max humidity
train = train.merge(train.groupby(['build_num','date'])['hum'].max().reset_index().rename(columns = {'hum':'max_hum'}), on = ['build_num','date'], how = 'left')

## mean THI
train = train.merge(train.groupby(['build_num','date'])['THI'].mean().reset_index().rename(columns = {'THI':'mean_THI'}), on = ['build_num','date'], how = 'left')

## mean CDH
train = train.merge(train.groupby(['build_num','date'])['CDH'].mean().reset_index().rename(columns = {'CDH':'mean_CDH'}), on = ['build_num','date'], how = 'left')

train = train.merge(train.groupby(['build_num','date'])['log_temp'].min().reset_index().rename(columns = {'log_temp':'min_log_temp'}), on = ['build_num','date'], how = 'left')

train = train.merge(train.groupby(['build_num','date'])['log_temp'].max().reset_index().rename(columns = {'log_temp':'max_log_temp'}), on = ['build_num','date'], how = 'left')

train = train.merge(train.groupby(['build_num','date'])['WC'].mean().reset_index().rename(columns = {'WC':'mean_WC'}), on = ['build_num','date'], how = 'left')

train['z_score'] = train['target_mean_1'] / train['target_std_1']
train.drop(columns=['date','day','weekday'], inplace=True)
train.head()

100%|██████████| 203832/203832 [00:48<00:00, 4190.52it/s]
100%|██████████| 203832/203832 [00:49<00:00, 4134.05it/s]
100%|██████████| 203832/203832 [00:49<00:00, 4100.26it/s]
100%|██████████| 203832/203832 [00:50<00:00, 4038.99it/s]


Unnamed: 0,num_date_time,build_num,date_time,temp,prec,wind,hum,isolation,sunshine,power,use,area_1,area_2,hour,month,...,dew_point,solar_per_hour,CDH,min_temp,max_temp,min_wind,max_wind,min_hum,max_hum,mean_THI,mean_CDH,min_log_temp,max_log_temp,mean_WC,z_score
0,1_20240601 00,1,2024-06-01 00:00:00,18.3,0.0,2.6,82.0,0.0,0.0,5794.8,호텔,82912.71,77586.0,0,6,...,14.7,0.0,-7.7,17.6,24.8,1.2,3.9,36.0,85.0,53.788788,-53.0625,2.923162,3.250374,21.342341,11.353622
1,1_20240601 01,1,2024-06-01 01:00:00,18.3,0.0,2.7,82.0,0.0,0.0,5591.85,호텔,82912.71,77586.0,1,6,...,14.7,0.0,-15.4,17.6,24.8,1.2,3.9,36.0,85.0,53.788788,-53.0625,2.923162,3.250374,21.342341,10.540611
2,1_20240601 02,1,2024-06-01 02:00:00,18.1,0.0,2.6,80.0,0.0,0.0,5338.17,호텔,82912.71,77586.0,2,6,...,14.1,0.0,-23.3,17.6,24.8,1.2,3.9,36.0,85.0,53.788788,-53.0625,2.923162,3.250374,21.342341,9.252619
3,1_20240601 03,1,2024-06-01 03:00:00,18.0,0.0,2.6,81.0,0.0,0.0,4554.42,호텔,82912.71,77586.0,3,6,...,14.2,0.0,-31.3,17.6,24.8,1.2,3.9,36.0,85.0,53.788788,-53.0625,2.923162,3.250374,21.342341,7.926321
4,1_20240601 04,1,2024-06-01 04:00:00,17.8,0.0,1.3,81.0,0.0,0.0,3602.25,호텔,82912.71,77586.0,4,6,...,14.0,0.0,-39.5,17.6,24.8,1.2,3.9,36.0,85.0,53.788788,-53.0625,2.923162,3.250374,21.342341,7.884329


In [19]:
# test = pd.read_csv('./data/test.csv')
# train set과 동일한 전처리 과정
test['일시'] = pd.to_datetime(test['일시'])
cols = ['num_date_time', 'build_num', 'date_time', 'temp' , 'prec', 'wind', 'hum', 'use', 'area_1', 'area_2']
test.columns = cols

# 시간 관련 변수들 생성
date = pd.to_datetime(test.date_time)
test['date'] = date.dt.date
test['hour'] = date.dt.hour
test['day'] = date.dt.day
test['weekday'] = date.dt.weekday
test['month'] = date.dt.month

test['holiday'] = test.apply(lambda x : 0 if x['weekday']<5 else 1, axis = 1)
test['holiday'] = test.apply(apply_holiday_rules, axis=1)

# for k, nums in CLUSTER.items():
#     test.loc[test.build_num.isin(nums), 'cluster'] = k

## 일조
tqdm.pandas()
test['isolation'] = np.round(test.progress_apply(lambda x : isolation.loc[(isolation.build_num == x['build_num']) & (isolation.hour == x['hour']) & (isolation.month == x['month']) ,'isolation'].values[0], axis = 1), 1)

## 일사
tqdm.pandas()
test['sunshine'] = np.round(test.progress_apply(lambda x : sunshine.loc[(sunshine.build_num == x['build_num']) & (sunshine.hour == x['hour']) & (sunshine.month == x['month']) ,'sunshine'].values[0], axis = 1), 2)

## 건물별, 요일별, 시간별 전력소비량 평균
tqdm.pandas()
test['target_mean_1'] = test.progress_apply(lambda x : power_mean_1.loc[(power_mean_1.build_num == x['build_num']) & (power_mean_1.hour == x['hour']) & (power_mean_1.weekday == x['weekday']) ,'power'].values[0], axis = 1)

## 건물별, 요일별, 시간별 전력소비량 표준편차
tqdm.pandas()
test['target_std_1'] = test.progress_apply(lambda x : power_std_1.loc[(power_std_1.build_num == x['build_num']) & (power_std_1.hour == x['hour']) & (power_std_1.weekday == x['weekday']) ,'power'].values[0], axis = 1)

tqdm.pandas()
test['target_min_1'] = test.progress_apply(lambda x : power_min_1.loc[(power_min_1.build_num == x['build_num']) & (power_min_1.hour == x['hour']) & (power_min_1.weekday == x['weekday']) ,'power'].values[0], axis = 1)

tqdm.pandas()
test['target_max_1'] = test.progress_apply(lambda x : power_max_1.loc[(power_max_1.build_num == x['build_num']) & (power_max_1.hour == x['hour']) & (power_max_1.weekday == x['weekday']) ,'power'].values[0], axis = 1)

test['sin_hour'] = np.sin(2*np.pi*test.hour/24)
test['cos_hour'] = np.cos(2*np.pi*test.hour/24)
test['sin_date'] = -np.sin(2 * np.pi * (test['month']+test['day']/31)/12)
test['cos_date'] = -np.cos(2 * np.pi * (test['month']+test['day']/31)/12)
test['sin_weekday'] = -np.sin(2 * np.pi * (test['weekday']+1)/7.0)
test['cos_weekday'] = -np.cos(2 * np.pi * (test['weekday']+1)/7.0)

#summer_sin, cos
test['summer_sin'] = test['date_time'].apply(summer_sin)
test['summer_cos'] = test['date_time'].apply(summer_cos)

## 화씨 온도
test['temp_F'] = (test['temp'] * 9/5) + 32

## 체감 온도
test['temp2'] = 13.12 + 0.6215*test['temp'] - 11.37*(test['wind']*3.6)**0.16 + 0.3965*(test['wind']*3.6)**0.16*test['temp']

test['THI'] = 9/5*test['temp'] - 0.55*(1-test['hum']/100)*(9/5*test['hum']-26)+32
test['WC']=13.12+0.6215*test['temp']-13.947*test['wind']**0.16+0.486*test['temp']*test['wind']**0.16

test['is_rain'] = (test['prec'] > 0).astype(int)
test['log_temp'] = np.log1p(test['temp'])
test['wind_power'] = test['wind'] ** 2
test['dew_point'] = test['temp'] - (100 - test['hum']) / 5
test['solar_per_hour'] = test['isolation'] / (test['sunshine'] + 1e-3)
cdhs = np.array([])
for num in range(1,101,1):
    temp = test[test['build_num'] == num]
    cdh = CDH(temp['temp'].values)
    cdhs = np.concatenate([cdhs, cdh])
test['CDH'] = cdhs

## min temperature
test = test.merge(test.groupby(['build_num','date'])['temp'].min().reset_index().rename(columns = {'temp':'min_temp'}), on = ['build_num','date'], how = 'left')

## max temperature
test = test.merge(test.groupby(['build_num','date'])['temp'].max().reset_index().rename(columns = {'temp':'max_temp'}), on = ['build_num','date'], how = 'left')

## min windspeed
test = test.merge(test.groupby(['build_num','date'])['wind'].min().reset_index().rename(columns = {'wind':'min_wind'}), on = ['build_num','date'], how = 'left')

## max windspeed
test = test.merge(test.groupby(['build_num','date'])['wind'].max().reset_index().rename(columns = {'wind':'max_wind'}), on = ['build_num','date'], how = 'left')

## min humidity
test = test.merge(test.groupby(['build_num','date'])['hum'].min().reset_index().rename(columns = {'hum':'min_hum'}), on = ['build_num','date'], how = 'left')

## max humidity
test = test.merge(test.groupby(['build_num','date'])['hum'].max().reset_index().rename(columns = {'hum':'max_hum'}), on = ['build_num','date'], how = 'left')

## mean THI
test = test.merge(test.groupby(['build_num','date'])['THI'].mean().reset_index().rename(columns = {'THI':'mean_THI'}), on = ['build_num','date'], how = 'left')

## mean CDH
test = test.merge(test.groupby(['build_num','date'])['CDH'].mean().reset_index().rename(columns = {'CDH':'mean_CDH'}), on = ['build_num','date'], how = 'left')

test = test.merge(test.groupby(['build_num','date'])['log_temp'].min().reset_index().rename(columns = {'log_temp':'min_log_temp'}), on = ['build_num','date'], how = 'left')

test = test.merge(test.groupby(['build_num','date'])['log_temp'].max().reset_index().rename(columns = {'log_temp':'max_log_temp'}), on = ['build_num','date'], how = 'left')

test = test.merge(test.groupby(['build_num','date'])['WC'].mean().reset_index().rename(columns = {'WC':'mean_WC'}), on = ['build_num','date'], how = 'left')

test['z_score'] = test['target_mean_1'] / test['target_std_1']
# test.drop(['date_time','date','day','weekday'], axis = 1, inplace = True)
test.drop(['date','day','weekday'], axis = 1, inplace = True)

test.head()

100%|██████████| 16800/16800 [00:03<00:00, 4423.81it/s]
100%|██████████| 16800/16800 [00:03<00:00, 4470.59it/s]
100%|██████████| 16800/16800 [00:04<00:00, 4115.90it/s]
100%|██████████| 16800/16800 [00:04<00:00, 4116.48it/s]
100%|██████████| 16800/16800 [00:03<00:00, 4228.00it/s]
100%|██████████| 16800/16800 [00:03<00:00, 4223.16it/s]


Unnamed: 0,num_date_time,build_num,date_time,temp,prec,wind,hum,use,area_1,area_2,hour,month,holiday,isolation,sunshine,...,dew_point,solar_per_hour,CDH,min_temp,max_temp,min_wind,max_wind,min_hum,max_hum,mean_THI,mean_CDH,min_log_temp,max_log_temp,mean_WC,z_score
0,1_20240825 00,1,2024-08-25 00:00:00,26.5,0.0,0.7,80.0,호텔,82912.71,77586.0,0,8,0,0.0,0.0,...,22.5,0.0,0.5,25.0,32.6,0.0,3.6,60.0,90.0,69.47355,18.695833,3.258097,3.514526,30.722386,7.36546
1,1_20240825 01,1,2024-08-25 01:00:00,26.1,0.0,0.0,80.0,호텔,82912.71,77586.0,1,8,0,0.0,0.0,...,22.1,0.0,0.6,25.0,32.6,0.0,3.6,60.0,90.0,69.47355,18.695833,3.258097,3.514526,30.722386,6.376051
2,1_20240825 02,1,2024-08-25 02:00:00,25.9,0.0,0.3,83.0,호텔,82912.71,77586.0,2,8,0,0.0,0.0,...,22.5,0.0,0.5,25.0,32.6,0.0,3.6,60.0,90.0,69.47355,18.695833,3.258097,3.514526,30.722386,6.212821
3,1_20240825 03,1,2024-08-25 03:00:00,25.7,0.0,1.1,83.0,호텔,82912.71,77586.0,3,8,0,0.0,0.0,...,22.3,0.0,0.2,25.0,32.6,0.0,3.6,60.0,90.0,69.47355,18.695833,3.258097,3.514526,30.722386,6.006666
4,1_20240825 04,1,2024-08-25 04:00:00,25.5,0.0,1.0,86.0,호텔,82912.71,77586.0,4,8,0,0.0,0.0,...,22.7,0.0,-0.3,25.0,32.6,0.0,3.6,60.0,90.0,69.47355,18.695833,3.258097,3.514526,30.722386,11.326145


In [20]:
test['hum'] = test['hum'].astype(float)

In [21]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder 생성
le = LabelEncoder()

# fit_transform: train에 대해 레이블 인코딩
train['use'] = le.fit_transform(train['use'])
test['use'] = le.transform(test['use'])

# 예시: '호텔' → 2, '병원' → 1 이런 식으로 변환됨
print(dict(zip(le.classes_, le.transform(le.classes_))))

{'IDC(전화국)': 0, '건물기타': 1, '공공': 2, '백화점': 3, '병원': 4, '상용': 5, '아파트': 6, '연구소': 7, '학교': 8, '호텔': 9}


In [22]:
train.to_csv('./data/train_p_final2.csv', index=False)
test.to_csv('./data/test_p_final2.csv', index=False)