In [12]:
#import lightgbm
import random
import pandas as pd
import numpy as np
import os
import time
t = time.strftime('%m%d-%H%M', time.localtime(time.time()))
import warnings
warnings.filterwarnings(action='ignore')

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정
#데이터 불러오기
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

train_df = train_df.rename(columns={
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption'
})
test_df = test_df.rename(columns={
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption'
})


test_df.head()

Unnamed: 0,num_date_time,building_number,date_time,temperature,rainfall,windspeed,humidity
0,1_20220825 00,1,20220825 00,23.5,0.0,2.2,72
1,1_20220825 01,1,20220825 01,23.0,0.0,0.9,72
2,1_20220825 02,1,20220825 02,22.7,0.0,1.5,75
3,1_20220825 03,1,20220825 03,22.1,0.0,1.3,78
4,1_20220825 04,1,20220825 04,21.8,0.0,1.0,77


In [13]:
def dmw(df):
    df = df.fillna(0)
    date = pd.to_datetime(df.date_time)
    df['hour'] = date.dt.hour
    df['day'] = date.dt.weekday
    df['month'] = date.dt.month
    df['week'] = date.dt.isocalendar().week
        
    df['holiday'] = df.apply(lambda x: 0 if x['day'] < 5 else 1, axis=1)
    df['date_time'] = df['date_time'].apply(lambda x : int(x[0:8]))
    df.loc[((df['month'] == 6) & (df['day'] == 2) &(df['week'] == 22), 'holiday')] = 1
    df.loc[((df['month'] == 6) & (df['day'] == 0) &(df['week'] == 23), 'holiday')] = 1
    df.loc[((df['month'] == 8) & (df['day'] == 0) &(df['week'] == 33), 'holiday')] = 1
    
    def CDH(xs):
        ys = []
        for i in range(len(xs)):
            if i < 11:
                ys.append(np.sum(xs[:(i+1)]-26))
            else:
                ys.append(np.sum(xs[(i-11):(i+1)]-26))
        return np.array(ys)
    cdhs = np.array([])
    
    for num in range(1,101,1):
        temp = df[df['building_number'] == num]
        cdh = CDH(temp['temperature'].values)
        cdhs = np.concatenate([cdhs, cdh])
    df['CDH'] = cdhs
    ## https://dacon.io/competitions/official/235680/codeshare/2366?page=1&dtype=recent
    df['sin_time'] = np.sin(2*np.pi*df.hour/24)
    df['cos_time'] = np.cos(2*np.pi*df.hour/24)
    ## https://dacon.io/competitions/official/235736/codeshare/2743?page=1&dtype=recent
    df['THI'] = 9/5*df['temperature'] - 0.55*(1-df['humidity']/100)*(9/5*df['humidity']-26)+32
    return df

tr_dmw = pd.read_csv('./data/dmw_train.csv')
te_dmw = dmw(test_df)


In [14]:
import pandas as pd

outlier_index = {
4: [1318],
7: [442, 1578, 1579],
11: [1300, 1301, 1493, 1658, 1665],
17: list(range(1249, 1272)),
22: [830],
28: [1370],
34: [1653],
35: [1653, 1654],
56: [184],
58: list(range(810, 838)),
70: list(range(1412, 1584)),
75: [202] + list(range(343, 355)) + list(range(463, 496)) + [1433],
91: [184],
92: [1281],
98: [1489],
100: [185, 686]
}


def outlier(df,i):
    df['outlier'] = 0
    for k ,v in outlier_index.items():
        if i == k:
            df['outlier'] = df.index.isin(v).astype(int)
    
    return df

In [15]:
def preprocessing(df, target_df):
    
    filtered_df = target_df[target_df['outlier'] < 1]

    #######################################
    ## 건물별, 요일별, 전력시간별 소비량 평균 넣어주기
    power_mean = pd.pivot_table(filtered_df, values = 'power_consumption', index = ['hour', 'day'], aggfunc = np.mean).reset_index()
    #######################################
    ## 건물별, 요일별, 전력시간별 소비량 표준편차 넣어주기
    power_std = pd.pivot_table(filtered_df, values = 'power_consumption', index = ['hour', 'day'], aggfunc = np.std).reset_index()
    #######################################
    ## 건물별 시간별 전력소비량 평균 넣어주기
    #power_hour_mean = pd.pivot_table(filtered_df, values = 'power_consumption', index = ['hour'], aggfunc = np.mean).reset_index()
    #######################################
    ## 건물별 시간별 전력소비량 표준편차 넣어주기
    #power_hour_std = pd.pivot_table(filtered_df, values = 'power_consumption', index = ['hour'], aggfunc = np.std).reset_index()
    #######################################
    #추가
    #power_median = pd.pivot_table(merged_train_df1, values = 'power_consumption', index = ['building_number', 'hour', 'day'], aggfunc = np.median).reset_index()
    #power_hour_median = pd.pivot_table(merged_train_df1, values = 'power_consumption', index = ['building_number', 'hour'], aggfunc = np.median).reset_index()
    #######################################
     # 병합을 위한 키 설정
    merge_keys = ['hour', 'day']  
    # 데이터프레임 병합
    df = df.merge(power_mean[merge_keys + ['power_consumption']], on=merge_keys, how='left', suffixes=('', '_day_hour_mean'))
    df = df.merge(power_std[merge_keys + ['power_consumption']], on=merge_keys, how='left', suffixes=('', '_day_hour_std'))
    #df = df.merge(power_hour_mean[merge_keys[:-1] + ['power_consumption']], on=merge_keys[:-1], how='left', suffixes=('', '_hour_mean'))
    #df = df.merge(power_hour_std[merge_keys[:-1] + ['power_consumption']], on=merge_keys[:-1], how='left', suffixes=('', '_hour_std'))
    
    #추가
    #df = df.merge(power_median[merge_keys + ['power_consumption']], on=merge_keys, how='left', suffixes=('', '_day_hour_median'))
    #df = df.merge(power_hour_median[merge_keys[:-1] + ['power_consumption']], on=merge_keys[:-1], how='left', suffixes=('', '_hour_median'))
    
    df = df.rename(columns = {'power_consumption_day_hour_mean':'day_hour_mean','power_consumption_day_hour_std':'day_hour_std'})
    #,'power_consumption_hour_mean':'hour_mean','power_consumption_hour_std':'hour_std'
    
    return df

In [1]:
def data_train(df):
    filtered_df = df[df['outlier'] < 1]
    
    grouped = filtered_df.groupby(['date_time'])
    df['max_power'] = grouped['power_consumption'].transform(np.max)
    df['min_power'] = grouped['power_consumption'].transform(np.min)
    
    grouped2 = df.groupby(['day'])
    df['max_power'] = grouped2['max_power'].transform(np.mean)
    df['min_power'] = grouped2['min_power'].transform(np.mean)

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df = df.fillna(0)
    df['THI_cat'] = pd.cut(df['THI'], bins = [0,68,75,80,200],labels=[1,2,3,4])
    return df.drop(columns=['date_time'])

def data_test(df, target_df):
    
    
    grouped_train = target_df.groupby(['day'])
    df['max_power'] = grouped_train['max_power'].transform(np.max)
    df['min_power'] = grouped_train['min_power'].transform(np.max)
   
   
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df = df.fillna(0)
    df['THI_cat'] = pd.cut(df['THI'], bins = [0,68,75,80,200],labels=[1,2,3,4])

    return df.drop(columns=['date_time'])

In [17]:
def change_type(df):
    df['THI_cat'] = df['THI_cat'].astype('category')
    
    df['holiday'] = df['holiday'].astype('category')
    
    return df
    

빌딩별로 csv생성

In [20]:
train_first = tr_dmw.drop(columns=['num_date_time','sunshine','solar_radiation'])
test_first = te_dmw.drop(columns=['num_date_time'])
for i in range(1,101,1):
    tr = train_first.loc[train_first.building_number==i].reset_index()
    tr = tr.drop(columns=['index','building_number'])
    te = test_first.loc[test_first.building_number==i].reset_index()
    te = te.drop(columns=['index','building_number'])
    
    outlier_train_df = outlier(tr,i)
    outlier_test_df = outlier(te,i)
    
    pr_train_df = preprocessing(outlier_train_df,outlier_train_df)
    pr_test_df = preprocessing(outlier_test_df,outlier_train_df)

    
    mm_train_df = data_train(pr_train_df)
    mm_test_df = data_test(pr_test_df, pr_train_df)
       
    mm_train_df = change_type(mm_train_df)
    mm_test_df = change_type(mm_test_df)
    
    mm_train_df.to_csv(f'./submit/train_building{i}.csv',index =False)
    mm_test_df = mm_test_df.rename(columns = {'power_consumption':'day_hour_mean'})
    mm_test_df.to_csv(f'./submit/test_building{i}.csv',index =False)


In [21]:
buff = pd.read_csv('./submit/test_building2.csv')
buff

Unnamed: 0,temperature,rainfall,windspeed,humidity,hour,day,month,week,holiday,CDH,sin_time,cos_time,THI,outlier,day_hour_mean,day_hour_std,max_power,min_power,THI_cat
0,23.5,0.0,2.2,72,0,3,8,34,0,-2.5,0.000000,1.000000,58.3456,0,1242.450000,51.422896,2516.820000,938.370000,1
1,23.0,0.0,0.9,72,1,3,8,34,0,-5.5,0.258819,0.965926,57.4456,0,1236.180000,55.879404,2516.820000,938.370000,1
2,22.7,0.0,1.5,75,2,3,8,34,0,-8.8,0.500000,0.866025,57.8725,0,1227.810000,48.237425,2516.820000,938.370000,1
3,22.1,0.0,1.3,78,3,3,8,34,0,-12.7,0.707107,0.707107,57.9376,0,1205.670000,58.753431,2516.820000,938.370000,1
4,21.8,0.0,1.0,77,4,3,8,34,0,-16.9,0.866025,0.500000,56.9961,0,1196.070000,58.792121,2516.820000,938.370000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,23.5,0.0,2.0,75,19,2,8,35,0,-36.5,-0.965926,0.258819,59.3125,0,2200.569231,468.551499,2553.230769,919.024615,1
164,22.4,0.0,2.0,80,20,2,8,35,0,-34.0,-0.866025,0.500000,59.3400,0,2026.190769,487.192856,2553.230769,919.024615,1
165,21.7,0.0,1.6,81,21,2,8,35,0,-32.6,-0.707107,0.707107,58.5409,0,1689.895385,496.291062,2553.230769,919.024615,1
166,21.1,0.0,1.0,83,22,2,8,35,0,-33.0,-0.500000,0.866025,58.4421,0,967.126154,206.599418,2553.230769,919.024615,1
