In [46]:
import os
print(os.getcwd())
print(os.listdir())

d:\workspace\hit_ml_dl\energy
['data', 'sol01.ipynb', 'source']


In [47]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from datetime import datetime
import warnings

In [48]:
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [49]:
# SMAPE 계산 함수
def smape(gt, preds):
    gt = np.array(gt)
    preds = np.array(preds)
    v = 2 * abs(preds - gt) / (abs(preds) + abs(gt))
    score = np.mean(v) * 100
    return score

In [50]:
# 커스텀 손실 함수
def weighted_mse(alpha=1):
    def weighted_mse_fixed(label, pred):
        residual = (label - pred).astype("float64")
        grad = np.where(residual > 0, -2 * alpha * residual, -2 * residual)
        hess = np.where(residual > 0, 2 * alpha, 2)
        return grad, hess
    return weighted_mse_fixed

In [51]:
# 데이터 로드
train = pd.read_csv("./data/train.csv", encoding='utf-8')
test = pd.read_csv("./data/test.csv", encoding='utf-8')
building_info = pd.read_csv("./data/building_info.csv", encoding='utf-8')
sample_submission = pd.read_csv("./data/sample_submission.csv", encoding='utf-8')

In [52]:
# 열 이름 번역
train = train.rename(columns={
    '건물번호': 'building_number', '일시': 'date_time', '기온(°C)': 'temperature',
    '강수량(mm)': 'rainfall', '풍속(m/s)': 'windspeed', '습도(%)': 'humidity',
    '일조(hr)': 'sunshine', '일사(MJ/m2)': 'solar_radiation', '전력소비량(kWh)': 'power_consumption'
})
test = test.rename(columns={
    '건물번호': 'building_number', '일시': 'date_time', '기온(°C)': 'temperature',
    '강수량(mm)': 'rainfall', '풍속(m/s)': 'windspeed', '습도(%)': 'humidity'
})
building_info = building_info.rename(columns={
    '건물번호': 'building_number', '건물유형': 'building_type', '연면적(m2)': 'total_area',
    '냉방면적(m2)': 'cooling_area', '태양광용량(kW)': 'solar_power_capacity',
    'ESS저장용량(kWh)': 'ess_capacity', 'PCS용량(kW)': 'pcs_capacity'
})

In [53]:
# 건물 유형 번역
translation_dict = {
    '건물기타': 'Other Buildings', '공공': 'Public', '학교': 'University', '백화점': 'Department Store',
    '병원': 'Hospital', '상용': 'Commercial', '아파트': 'Apartment', '연구소': 'Research Institute',
    'IDC(전화국)': 'IDC', '호텔': 'Hotel'
}
building_info['building_type'] = building_info['building_type'].replace(translation_dict)

In [54]:
# 데이터 병합
train = pd.merge(train, building_info, on='building_number', how='left')
test = pd.merge(test, building_info, on='building_number', how='left')

In [55]:
# 날짜 및 시간 처리
train['date_time'] = pd.to_datetime(train['date_time'], format='%Y%m%d %H')
train['hour'] = train['date_time'].dt.hour
train['day'] = train['date_time'].dt.day
train['month'] = train['date_time'].dt.month
train['day_of_week'] = train['date_time'].dt.dayofweek

test['date_time'] = pd.to_datetime(test['date_time'], format='%Y%m%d %H')
test['hour'] = test['date_time'].dt.hour
test['day'] = test['date_time'].dt.day
test['month'] = test['date_time'].dt.month
test['day_of_week'] = test['date_time'].dt.dayofweek

In [56]:
# 이상치 제거
train = train[train['power_consumption'] > 0]

In [57]:
# 휴일 여부
holi_weekday = ['2024-06-06', '2024-08-15']
train['holiday'] = np.where((train.day_of_week >= 5) | (train.date_time.dt.strftime('%Y-%m-%d').isin(holi_weekday)), 1, 0)
test['holiday'] = np.where((test.day_of_week >= 5) | (test.date_time.dt.strftime('%Y-%m-%d').isin(holi_weekday)), 1, 0)

In [58]:
# 주기성 피처
train['sin_hour'] = np.sin(2 * np.pi * train['hour'] / 23.0)
train['cos_hour'] = np.cos(2 * np.pi * train['hour'] / 23.0)
test['sin_hour'] = np.sin(2 * np.pi * test['hour'] / 23.0)
test['cos_hour'] = np.cos(2 * np.pi * test['hour'] / 23.0)

In [59]:
# CDH, THI, WCT 계산
def calculate_cdh(dataframe):
    cdhs = []
    for i in range(1, 101):
        temp = dataframe[dataframe['building_number'] == i]['temperature'].values
        cdh = np.cumsum(temp - 26)
        cdhs.append(cdh)
    return np.concatenate(cdhs)

train['CDH'] = calculate_cdh(train)
test['CDH'] = calculate_cdh(test)
train['THI'] = 9/5 * train['temperature'] - 0.55 * (1 - train['humidity']/100) * train['humidity'] + 32
test['THI'] = 9/5 * test['temperature'] - 0.55 * (1 - test['humidity']/100) * test['humidity'] + 32
train['WCT'] = 13.12 + 0.6125 * train['temperature'] - 11.37 * (train['windspeed'] ** 0.16) + 0.3965 * (train['windspeed'] ** 0.16) * train['temperature']
test['WCT'] = 13.12 + 0.6125 * test['temperature'] - 11.37 * (test['windspeed'] ** 0.16) + 0.3965 * (test['windspeed'] ** 0.16) * test['temperature']

In [60]:
# 통계 피처
power_mean = pd.pivot_table(train, values='power_consumption', index=['building_number', 'hour'], aggfunc=np.mean).reset_index()
power_mean.columns = ['building_number', 'hour', 'hour_mean']
train = train.merge(power_mean, on=['building_number', 'hour'], how='left')
test = test.merge(power_mean, on=['building_number', 'hour'], how='left')

In [61]:
# 학습 데이터 준비
X = train.drop(['power_consumption', 'date_time', 'building_type', 'rainfall', 'sunshine', 'solar_radiation'], axis=1)
y = train['power_consumption']
# Remove the problematic line
# Continue with the rest of your code

In [None]:
# 건물 유형별 모델 학습
type_list = train['building_type'].unique()
kf = KFold(n_splits=5, shuffle=True, random_state=42)
submission = pd.read_csv('./data/sample_submission.csv')

for btype in type_list:
    train_type = train[train['building_type'] == btype]
    test_type = test[test['building_type'] == btype]
    
    X_type = train_type.drop(['power_consumption', 'date_time', 'building_type', 'rainfall', 'sunshine', 'solar_radiation'], axis=1)
    y_type = train_type['power_consumption']
    test_X_type = test_type.drop(['date_time', 'building_type', 'rainfall', 'sunshine', 'solar_radiation'], axis=1)
    
    model = XGBRegressor(
        n_estimators=1000, learning_rate=0.05, max_depth=10, subsample=0.7,
        colsample_bytree=0.5, min_child_weight=3, random_state=42, objective=weighted_mse(3)
    )
    
    model.fit(X_type, np.log(y_type))
    pred = np.exp(model.predict(test_X_type))
    test.loc[test['building_type'] == btype, 'power_consumption'] = pred

FileNotFoundError: [Errno 2] No such file or directory: 'sample_submission.csv'

In [None]:
# 제출 파일 생성
submission['answer'] = test['power_consumption']
submission.to_csv('submission.csv', index=False)

print("제출 파일 'submission.csv'가 생성되었습니다.")  