In [110]:
# Notebook setup: library imports, a run timestamp, and display/warning options.
#import lightgbm
import random
import pandas as pd
import numpy as np
import os
import time
import matplotlib.pyplot as plt
# Local-time stamp formatted as MMDD-HHMM. In the visible part of this notebook
# it is only referenced by commented-out output file names.
t = time.strftime('%m%d-%H%M', time.localtime(time.time()))
import warnings
# NOTE: this silences every warning, including pandas deprecation warnings.
warnings.filterwarnings(action='ignore')

# Show up to 30 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 30)

def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both Python's ``random`` module and NumPy's global generator.

    Parameters
    ----------
    seed : int
        Value handed to every seeding call.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

seed_everything(42)  # fix the random seeds

# Load the training data and replace the Korean column headers with
# English names in a single chained expression.
df = pd.read_csv('./data/train.csv').rename(columns={
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption',
})

# Preview the first rows of the renamed frame.
df.head()

Unnamed: 0,num_date_time,building_number,date_time,temperature,rainfall,windspeed,humidity,sunshine,solar_radiation,power_consumption
0,1_20220601 00,1,20220601 00,18.6,,0.9,42.0,,,1085.28
1,1_20220601 01,1,20220601 01,18.0,,1.1,45.0,,,1047.36
2,1_20220601 02,1,20220601 02,17.7,,1.5,45.0,,,974.88
3,1_20220601 03,1,20220601 03,16.7,,1.4,48.0,,,953.76
4,1_20220601 04,1,20220601 04,18.4,,2.8,43.0,,,986.4


In [111]:
def dmw(df, cdh_window=12, cdh_base=26):
    """Add calendar, holiday and weather-derived features to the hourly data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``building_number``, ``date_time`` (strings shaped like
        ``'20220601 00'``), ``temperature`` and ``humidity``. Rows of each
        building are expected in chronological order. The caller's frame is
        not modified.
    cdh_window : int, default 12
        Number of most recent rows (current row included) summed into ``CDH``.
    cdh_base : float, default 26
        Temperature subtracted from every reading before it is summed.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with missing values replaced by 0, ``date_time``
        reduced to an integer ``YYYYMMDD``, and the added columns ``hour``,
        ``day`` (weekday, Monday=0), ``month``, ``week`` (ISO week),
        ``holiday``, ``CDH``, ``sin_time``, ``cos_time`` and ``THI``.
    """
    df = df.fillna(0)

    # An explicit format avoids relying on pandas' format guessing.
    date = pd.to_datetime(df['date_time'], format='%Y%m%d %H')
    df['hour'] = date.dt.hour
    df['day'] = date.dt.weekday
    df['month'] = date.dt.month
    df['week'] = date.dt.isocalendar().week

    # Weekends (Saturday=5, Sunday=6) count as holidays.
    df['holiday'] = (df['day'] >= 5).astype('int64')
    df['date_time'] = df['date_time'].str[:8].astype('int64')

    # Extra weekday holidays given as (month, weekday, ISO week). For the 2022
    # data these select 2022-06-01, 2022-06-06 and 2022-08-15.
    for month, weekday, week in ((6, 2, 22), (6, 0, 23), (8, 0, 33)):
        is_holiday = (df['month'] == month) & (df['day'] == weekday) & (df['week'] == week)
        df.loc[is_holiday, 'holiday'] = 1

    # CDH (presumably "cooling degree hours"): trailing sum of
    # (temperature - cdh_base) over the last `cdh_window` rows of the same
    # building; shorter windows are used at the start of each building.
    # Grouping keeps every value on its own row, so the result no longer
    # depends on buildings being numbered 1..100 or stored contiguously.
    df['CDH'] = df.groupby('building_number', sort=False)['temperature'].transform(
        lambda temps: (temps - cdh_base).rolling(cdh_window, min_periods=1).sum()
    )

    ## https://dacon.io/competitions/official/235680/codeshare/2366?page=1&dtype=recent
    # Cyclical encoding of the hour of day.
    df['sin_time'] = np.sin(2*np.pi*df.hour/24)
    df['cos_time'] = np.cos(2*np.pi*df.hour/24)

    ## https://dacon.io/competitions/official/235736/codeshare/2743?page=1&dtype=recent
    # Temperature-humidity (discomfort) index. Both temperature terms use the
    # temperature in Fahrenheit form (9/5*T); the earlier version plugged the
    # humidity into the second term.
    temp_f_term = 9/5*df['temperature']
    df['THI'] = temp_f_term - 0.55*(1-df['humidity']/100)*(temp_f_term-26)+32
    return df

# NOTE: this rebinds the name `dmw` from the function to its result, so the
# function can only be called again after its definition has been re-run.
dmw = dmw(df)

# Building 95 on Wednesdays (day == 2), split by hour of day.
b95_wednesday = (dmw.building_number == 95) & (dmw.day == 2)
b95_wednesday_16h = b95_wednesday & (dmw.hour == 16)
b95_wednesday_17h = b95_wednesday & (dmw.hour == 17)

# Replacement values: the Wednesday totals divided by 12.
# NOTE(review): 12 and the 0.36 subtracted for hour 17 look like "mean of the
# other Wednesdays once the bad reading is removed" - confirm against the data.
esangch_95_1 = dmw.loc[b95_wednesday_16h, 'power_consumption']
sum16 = esangch_95_1.sum()/12
esangch_95_2 = dmw.loc[b95_wednesday_17h, 'power_consumption']
sum17 = (esangch_95_2.sum()-0.36)/12

# Overwrite only the July / ISO-week-30 Wednesday readings at 16:00 and 17:00.
july_week_30 = (dmw.month == 7) & (dmw.week == 30)
dmw.loc[b95_wednesday_16h & july_week_30, 'power_consumption'] = sum16
dmw.loc[b95_wednesday_17h & july_week_30, 'power_consumption'] = sum17

# Persist the feature-engineered frame.
dmw.to_csv('./data/dmw_train.csv', index=False)

In [112]:
# Per-building hold-out: the last 168 rows of every building (7 days if the
# rows are hourly) become the "pretest" set, everything before them is train.
# The pieces are collected in lists and concatenated once, instead of
# re-copying a growing DataFrame on every iteration.
train_parts = []
pretest_parts = []
for i in range(1,101,1):
    buff = dmw.loc[(dmw.building_number == i)]
    train_parts.append(buff[:-168])
    pretest_parts.append(buff[-168:])
train_df1 = pd.concat(train_parts)
pretest_df1 = pd.concat(pretest_parts)

train, pretest 빌딩별로 split

In [113]:
def preprocessing(df, target_df, lower_quantile=0.00):
    """Attach power-consumption statistics of ``target_df`` to ``df``.

    Mean and standard deviation of ``power_consumption`` are computed from
    ``target_df`` per (hour, day) and per hour, then left-joined onto ``df``.
    The statistics cover every row of ``target_df``; the caller in this
    notebook passes the rows of one building at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to enrich; needs ``hour`` and ``day``. It may or may not contain
        ``power_consumption`` - the added columns are named the same either way.
    target_df : pandas.DataFrame
        Source of the statistics; needs ``hour``, ``day`` and
        ``power_consumption``.
    lower_quantile : float, default 0.00
        Rows whose ``power_consumption`` is not strictly above this quantile
        are left out of the statistics. With the default 0.0 only the rows
        equal to the minimum are excluded.

    Returns
    -------
    pandas.DataFrame
        ``df`` (row order kept, index reset by the merges) with the columns
        ``day_hour_mean``, ``day_hour_std``, ``hour_mean`` and ``hour_std``
        appended. Keys without a statistic get NaN.
    """
    threshold = target_df['power_consumption'].quantile(lower_quantile)
    filtered_df = target_df[target_df['power_consumption'] > threshold]

    merge_keys = ['hour', 'day']

    # String aggregator names pin the groupby implementations (sample std,
    # ddof=1) and avoid the deprecated NumPy-callable path. Naming the result
    # columns here removes the dependence on merge suffix collisions.
    day_hour_stats = (
        filtered_df.groupby(merge_keys)['power_consumption']
        .agg(day_hour_mean='mean', day_hour_std='std')
        .reset_index()
    )
    hour_stats = (
        filtered_df.groupby(merge_keys[:-1])['power_consumption']
        .agg(hour_mean='mean', hour_std='std')
        .reset_index()
    )
    # Median variants were tried earlier and disabled; they could be added
    # to the two aggregations above in the same way.

    df = df.merge(day_hour_stats, on=merge_keys, how='left')
    df = df.merge(hour_stats, on=merge_keys[:-1], how='left')
    return df


In [114]:
def data_train(df):
    """Add weekday-level max/min power features and the THI category (train).

    For every ``date_time`` value the max and min ``power_consumption`` are
    taken; these are then averaged per ``day`` (weekday) and stored as
    ``max_power`` / ``min_power``.

    Side effects
    ------------
    ``max_power`` and ``min_power`` are written into the caller's frame, and
    infinities in it are replaced by NaN in place. This is intentional:
    the notebook passes the same frame to ``data_test`` afterwards, which
    reads those two columns from it.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``date_time``, ``day``, ``power_consumption`` and ``THI``.

    Returns
    -------
    pandas.DataFrame
        New frame with NaN replaced by 0, a categorical ``THI_cat`` column
        (bins (0,68], (68,75], (75,80], (80,200] labelled 1-4; values outside
        become NaN) and ``date_time`` dropped.
    """
    # String names use pandas' own aggregations and avoid the deprecated
    # NumPy-callable path.
    per_date = df.groupby(['date_time'])['power_consumption']
    df['max_power'] = per_date.transform('max')
    df['min_power'] = per_date.transform('min')

    per_weekday = df.groupby(['day'])
    df['max_power'] = per_weekday['max_power'].transform('mean')
    df['min_power'] = per_weekday['min_power'].transform('mean')

    # Kept in place on purpose, see "Side effects" above.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df = df.fillna(0)
    df['THI_cat'] = pd.cut(df['THI'], bins = [0,68,75,80,200],labels=[1,2,3,4])
    return df.drop(columns=['date_time'])

def data_test(df, target_df):
    """Add the training frame's weekday max/min power features to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to enrich; needs ``day``, ``THI`` and ``date_time``.
    target_df : pandas.DataFrame
        Training frame that already carries ``max_power`` and ``min_power``
        (as written by ``data_train``) together with ``day``.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in the original order, with ``max_power`` and
        ``min_power`` (per-weekday median of the training values), NaN and
        infinities replaced by 0, a categorical ``THI_cat`` column and
        ``date_time`` dropped.
    """
    weekday_power = (
        target_df.groupby(['day'])[['max_power', 'min_power']]
        .median()
        .reset_index()
    )

    # Left join: a weekday missing from the training frame must not silently
    # remove rows; such rows end up with 0 after the fill below.
    df = pd.merge(df, weekday_power, on=['day'], how='left')
    df = df.replace([np.inf, -np.inf], np.nan).fillna(0)
    df['THI_cat'] = pd.cut(df['THI'], bins = [0,68,75,80,200],labels=[1,2,3,4])

    return df.drop(columns=['date_time'])


In [115]:
def change_type(df):
    """Cast ``THI_cat``, ``hour`` and ``holiday`` to the pandas category dtype.

    The columns are converted in place on ``df``, and the same frame object
    is returned.
    """
    for column in ('THI_cat', 'hour', 'holiday'):
        df[column] = df[column].astype('category')
    return df
    

In [116]:
# Columns that are not used as features.
train_first = train_df1.drop(columns=['num_date_time','sunshine','solar_radiation'])
pretest_first = pretest_df1.drop(columns=['num_date_time','sunshine','solar_radiation'])

# to_csv does not create missing directories.
os.makedirs('./pretest', exist_ok=True)

for i in range(1,101,1):
    # Rows of building i with a fresh 0..n-1 index and no building column.
    tr = train_first.loc[train_first.building_number==i].reset_index(drop=True)
    tr = tr.drop(columns=['building_number'])
    te = pretest_first.loc[pretest_first.building_number==i].reset_index(drop=True)
    te = te.drop(columns=['building_number'])

    # Both frames get statistics computed from the training rows only.
    pr_train_df = preprocessing(tr,tr)
    pr_pretest_df = preprocessing(te,tr)

    # data_train writes max_power/min_power into pr_train_df, which data_test
    # then reads - the order of these two calls matters.
    mm_train_df = data_train(pr_train_df)
    mm_pretest_df = data_test(pr_pretest_df, pr_train_df)

    mm_train_df = change_type(mm_train_df)
    mm_pretest_df = change_type(mm_pretest_df)

    mm_train_df.to_csv(f'./pretest/train_building{i}.csv',index =False)
    mm_pretest_df.to_csv(f'./pretest/pretest_building{i}.csv',index =False)
    


KeyError: 'hoilday'

In [None]:
# train_first = train_df1.drop(columns=['num_date_time','sunshine','solar_radiation'])
# pretest_first = pretest_df1.drop(columns=['num_date_time','sunshine','solar_radiation'])

# i = 
# tr = train_first.loc[train_first.building_number==i].reset_index()
# tr = tr.drop(columns=['index','building_number'])
# te = pretest_first.loc[pretest_first.building_number==i].reset_index()
# te = te.drop(columns=['index','building_number'])

# pr_train_df = preprocessing(tr,tr)
# pr_pretest_df = preprocessing(te,tr)

# mm_train_df = data_train(pr_train_df2)
# mm_pretest_df = data_test(pr_pretest_df2, pr_train_df2)

In [None]:
# Spot-check the exported training file of a single building.
i = 75
dd = pd.read_csv(f'./pretest/train_building{i}.csv')
# NOTE: the recorded output lists hour and holiday as int64, i.e. the category
# dtype set before export is not carried through the CSV file.
dd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1872 entries, 0 to 1871
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   temperature        1872 non-null   float64
 1   rainfall           1872 non-null   float64
 2   windspeed          1872 non-null   float64
 3   humidity           1872 non-null   float64
 4   power_consumption  1872 non-null   float64
 5   hour               1872 non-null   int64  
 6   day                1872 non-null   int64  
 7   month              1872 non-null   int64  
 8   week               1872 non-null   int64  
 9   holiday            1872 non-null   int64  
 10  CDH                1872 non-null   float64
 11  sin_time           1872 non-null   float64
 12  cos_time           1872 non-null   float64
 13  THI                1872 non-null   float64
 14  day_hour_mean      1872 non-null   float64
 15  day_hour_std       1872 non-null   float64
 16  hour_mean          1872 

In [None]:
# i = 75
# dd = pd.read_csv(f'./pretest/train_building{i}.csv')
# dd

train, pretest 저장

In [None]:
# ## test용
# merged_train_df.to_csv(f'./data/train_split_wsw_{t}.csv')
# merged_pretest_df.to_csv(f'./data/pretest_wsw_{t}.csv')

빌딩별로 csv생성

In [None]:
# #train,test
# train_df  = pd.read_csv((f'./data/train_split_wsw_{t}.csv')).drop(columns='Unnamed: 0')
# pretest_df  = pd.read_csv((f'./data/pretest_wsw_{t}.csv')).drop(columns='Unnamed: 0')
# for i in range(1,101,1):
#     tr = train_df.loc[train_df.building_number==i].reset_index()
#     tr = tr.drop(columns=['index'])
#     te = pretest_df.loc[pretest_df.building_number==i].reset_index()
#     te = te.drop(columns=['index'])
#     tr.to_csv(f'./pretest/train_building{i}.csv')
#     te.to_csv(f'./pretest/pretest_building{i}.csv')

In [None]:
# #혹시 valid를 나눠야 할 수도 있을 때만 사용
# train_df  = pd.read_csv((f'./split/train_fianl_wsw_{t}.csv')).drop(columns='Unnamed: 0')
# test_df  = pd.read_csv((f'./split/test_fianl_wsw_{t}.csv')).drop(columns='Unnamed: 0')
# for i in range(1,101,1):
#     tr = train_df.loc[train_df.building_number==i].reset_index()
#     tr = tr.drop(columns=['index'])
#     te = test_df.loc[test_df.building_number==i].reset_index()
#     te = te.drop(columns=['index'])
#     tr.to_csv(f'./split/train_building{i}.csv')
#     te.to_csv(f'./split/test_building{i}.csv')