In [2]:
import pandas as pd
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor
from datetime import datetime, timedelta

train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')
sample_submission = pd.read_csv('./sample_submission.csv')
building_df = pd.read_csv('./building_info.csv')

from datetime import datetime, timedelta

def classify_weekdays(start_date, end_date):
    date_format = "%Y-%m-%d"
    start = datetime.strptime(start_date, date_format)
    end = datetime.strptime(end_date, date_format)
    
    classification = []
    current_date = start
    
    while current_date <= end:
        if current_date.weekday() < 5:  # 0: 월요일, 1: 화요일, ..., 4: 금요일
            classification.append(0)  # 평일
        else:
            classification.append(1)  # 주말
        
        current_date += timedelta(days=1)
    
    return classification

# 예시 데이터 프레임 생성
data = {'date': pd.date_range(start='2022-06-01', end='2022-08-24')}
train = pd.DataFrame(data)
t_data = {'date': pd.date_range(start='2022-08-25', end='2022-08-31')}
test = pd.DataFrame(t_data)

# 주말/평일 분류한 결과를 'holidays' 열에 추가
start_date = "2022-06-01"
end_date = "2022-08-24"
test_start_date = "2022-08-25"
test_end_date = "2022-08-31"
train['holidays'] = classify_weekdays(start_date, end_date)
train_df['holidays'] = train['holidays']
test['holidays'] = classify_weekdays(test_start_date, test_end_date)
test_df['holidays'] = test['holidays']
#added
train_df['일시'] = pd.to_datetime(train_df['일시'], format='%Y%m%d %H')
test_df['일시'] = pd.to_datetime(test_df['일시'], format='%Y%m%d %H')




# date time feature 생성
train_df['hour'] = train_df['일시'].dt.hour
train_df['day'] = train_df['일시'].dt.day
train_df['month'] = train_df['일시'].dt.month
train_df['year'] = train_df['일시'].dt.year

test_df['hour'] = test_df['일시'].dt.hour
test_df['day'] = test_df['일시'].dt.day
test_df['month'] = test_df['일시'].dt.month
test_df['year'] = test_df['일시'].dt.year

train_df['sin_time'] = np.sin(2*np.pi*train_df.hour/24)
train_df['cos_time'] = np.cos(2*np.pi*train_df.hour/24)

test_df['sin_time'] = np.sin(2*np.pi*test_df.hour/24)
test_df['cos_time'] = np.cos(2*np.pi*test_df.hour/24)
train_df['THI'] = 9/5*train_df['기온(C)'] - 0.55*(1-train_df['습도(%)']/100)*(9/5*train_df['습도(%)']-26)+32
test_df['THI'] = 9/5*test_df['기온(C)'] - 0.55*(1-test_df['습도(%)']/100)*(9/5*test_df['습도(%)']-26)+32
##############################################################


train_df = train_df.drop(columns = ['일조(hr)', '일사(MJ/m2)', '강수량(mm)', '풍속(m/s)'])
test_df = test_df.drop(columns = ['강수량(mm)', '풍속(m/s)'])
train_data = TabularDataset(train_df)
train_data.head()

label = '전력소비량(kWh)'
####################################################

# 주어진 날짜 범위에 따라 데이터를 K-fold로 분할하는 함수
def split_data_by_date_ranges(data, date_ranges):
    split_data = []
    for start_date, end_date in date_ranges:
        fold_data = data[(data['일시'] >= start_date) & (data['일시'] <= end_date)]
        split_data.append(fold_data)
    return split_data

# 주어진 날짜 범위들에 따라 K-fold로 데이터 분할
date_ranges = [
    ("2022-06-01", "2022-06-17"),
    ("2022-06-18", "2022-07-04"),
    ("2022-07-05", "2022-07-21"),
    ("2022-07-22", "2022-08-07"),
    ("2022-08-08", "2022-08-24")
]

train_folds = split_data_by_date_ranges(train_df, date_ranges)

# K-fold 교차 검증 수행 및 앙상블 모델 학습
ensemble_predictor = TabularPredictor(label=label, problem_type='regression')
for fold_idx, fold_data in enumerate(train_folds):
    print(f"Training fold {fold_idx + 1}")
    
    # 예시 데이터 프레임 생성
    data = {'일시': pd.date_range(start=date_ranges[fold_idx][0], end=date_ranges[fold_idx][1])}  # 일시 열 생성
    fold_train = pd.DataFrame(data)
    
    # 주말/평일 분류한 결과를 'holidays' 열에 추가
    holidays = classify_weekdays(date_ranges[fold_idx][0], date_ranges[fold_idx][1])
    fold_train['holidays'] = holidays
    
    # 추가 전처리 및 변수 생성 코드
    fold_train['hour'] = fold_train['일시'].dt.hour
    fold_train['day'] = fold_train['일시'].dt.day
    fold_train['month'] = fold_train['일시'].dt.month
    fold_train['year'] = fold_train['일시'].dt.year
    fold_train['sin_time'] = np.sin(2*np.pi*fold_train.hour/24)
    fold_train['cos_time'] = np.cos(2*np.pi*fold_train.hour/24)
    fold_train['THI'] = 9/5*fold_train['기온(C)'] - 0.55*(1-fold_train['습도(%)']/100)*(9/5*fold_train['습도(%)']-26)+32
    fold_train = fold_train.drop(columns=['일조(hr)', '일사(MJ/m2)', '강수량(mm)', '풍속(m/s)'])
    
    fold_train_data = TabularDataset(fold_train)
    ensemble_predictor.fit(fold_train_data)

# 테스트 데이터에 모델을 적용하여 예측 결과 생성
test_df['일시'] = pd.to_datetime(test_df['일시'], format='%Y%m%d %H')
test_df['hour'] = test_df['일시'].dt.hour
test_df['day'] = test_df['일시'].dt.day
test_df['month'] = test_df['일시'].dt.month
test_df['year'] = test_df['일시'].dt.year
test_df['sin_time'] = np.sin(2*np.pi*test_df.hour/24)
test_df['cos_time'] = np.cos(2*np.pi*test_df.hour/24)
test_df['THI'] = 9/5*test_df['기온(C)'] - 0.55*(1-test_df['습도(%)']/100)*(9/5*test_df['습도(%)']-26)+32
test_data = TabularDataset(test_df)

predictions = ensemble_predictor.predict(test_data)

# 결과를 sample_submission 형식에 맞게 저장
sample_submission['answer'] = predictions['전력소비량(kWh)']
sample_submission.to_csv('./baseline_submission_kfold.csv', index=False)

No path specified. Models will be saved in: "AutogluonModels\ag-20230819_142234\"


Training fold 1


KeyError: '기온(C)'