# 전력 사용량 예측 경진대회

# 모델링 파트 정리

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn as sk
from sklearn.cluster import KMeans

from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.model_selection import KFold

from bayes_opt import BayesianOptimization

import datetime
import warnings
warnings.filterwarnings("ignore")

In [4]:
print(pd.__version__)
print(np.__version__)
print(sns.__version__)
print(sk.__version__)

1.2.4
1.19.5
0.11.1
0.23.2


## Cluster Model: 6.19229

In [5]:
PATH = r'C:\Users\Wyatt\wyatt37\data\energy'

In [6]:
train_col = ['num', 'date_time', 'target', 'temp', 'wind', 'humid', 'rain', 'sunny', 'cooler', 'solar']
test_col = ['num', 'date_time', 'temp', 'wind', 'humid', 'rain', 'sunny', 'cooler', 'solar']

In [7]:
train = pd.read_csv(PATH + '\\train.csv', encoding='cp949', names=train_col, header=0)
test = pd.read_csv(PATH + '\\test.csv', encoding='cp949', names=test_col, header=0)
sub = pd.read_csv(PATH + '\\sample_submission.csv', encoding='cp949')

In [8]:
def make_time(train):
    train['datetime'] = pd.to_datetime(train.date_time)
    
    # 1차 time / 지지
    train['month'] = train.datetime.dt.month                    # 월(숫자)
    train['day'] = train.datetime.dt.day                        # 일(숫자)
    train['hour'] = train.datetime.dt.hour                      # 시(숫자)
    train['weekday'] = train.datetime.dt.weekday                # 시(숫자)
    train['dayofyear'] = train.datetime.dt.dayofyear            # 연 기준 몇일째(숫자)
    train['weekofyear'] = train.datetime.dt.isocalendar().week  # 연 기준 몇주째(숫자) (=week)
    
    # 2차 time / 지지
    train['month_start'] = train.datetime.dt.is_month_start
    train['month_end'] = train.datetime.dt.is_month_end
    
    # 여기까지
    
    train.drop(['date_time', 'datetime'], axis=1, inplace=True)
    
    train.weekofyear = train.weekofyear.astype(int)
    
    return train

In [9]:
train = make_time(train)
test = make_time(test)

In [10]:
# cooler와 solar를 넣어주겠습니다.
for i in range(1, 61):
    test.loc[test.num == i, 'cooler'] = train.loc[train.num == i, 'cooler'].mean()
    test.loc[test.num == i, 'solar'] = train.loc[train.num == i, 'solar'].mean()

In [11]:
test[['temp', 'wind', 'humid', 'rain', 'sunny']] = test[['temp', 'wind', 'humid', 'rain', 'sunny']].interpolate(method='values').round(1)

In [12]:
# 6월 6일은 현충일
# 8월 15일은 광복절
# 8월 17일은 임시공휴일

# 휴일이 많지 않으니, 주말에 포함시키자.

train.loc[train.weekday >= 5, 'holiday'] = True
train.loc[train.weekday < 5, 'holiday'] = False
train.loc[(train.month == 8) & (train.day == 17), 'holiday'] = True

test.loc[test.weekday >= 5, 'holiday'] = True
test.loc[test.weekday < 5, 'holiday'] = False

train.holiday = train.holiday.astype('bool')
test.holiday = test.holiday.astype('bool')

### Clustering

In [13]:
train_origin = train.copy()

In [14]:
# 정확한 군집화를 위한 스케일링
for i in range(1, 61):
    mean_ = train.loc[train.num == i].target.mean()
    std_ = train.loc[train.num == i].target.std()
    train.loc[train.num == i, 'target'] = ((train.loc[train.num == i, 'target']) - mean_) / std_

In [15]:
# 요일에 따른 평균 값 산출
weekday_mean = (
    train.groupby(['num', 'weekday'])['target'].mean()
    .reset_index()
    .pivot('num', 'weekday', 'target')
    .reset_index()
)

In [16]:
# 시간에 따른 평균 값 산출
hour_mean = (
    train.groupby(['num', 'hour'])['target'].mean()
    .reset_index()
    .pivot('num', 'hour', 'target')
    .reset_index()
    .drop('num', axis=1)
)

In [17]:
# 요일별, 시간별 평균 전력 사용량 테이블 구축
cl_df = pd.concat([weekday_mean, hour_mean], axis=1)

In [18]:
# 보기 편하게 컬럼 이름 지정
columns = (
    ['num']
    + ['day_mean_' + str(i) for i in range(7)]
    + ['hour_mean_' + str(i) for i in range(24)]
)

cl_df.columns = columns

In [19]:
# 엘보우 포인트를 통해 kmeans 클러스터링
model = KMeans(n_clusters = 4, random_state = 42)
pred = model.fit_predict(cl_df.iloc[:, 1:])

In [20]:
# 값 할당
cl_df['km_cluster'] = pred

In [21]:
# 기존 train set과 merge
train_cl = pd.merge(train, cl_df[['num', 'km_cluster']], how='left', on='num')

In [22]:
test_cl = test.copy()

In [23]:
# test set에도 클러스터 값 할당
for i in range(1, 61):
    test_cl.loc[test_cl.num == i, 'km_cluster'] = (
        train_cl.loc[train_cl.num == i, 'km_cluster'].max()
    )

### Modeling

In [24]:
X = pd.concat([train_origin, train_cl['km_cluster']], axis=1)
y = pd.concat([test, test_cl['km_cluster']], axis=1)

In [25]:
others = [31, 32, 33, 34, 39, 59, 1, 3, 5, 9, 15]
for other, km_num in zip(others, range(4, len(others) + 4)):
    X.loc[X.num == other, 'km_cluster'] = km_num
    y.loc[y.num == other, 'km_cluster'] = km_num

In [26]:
X.km_cluster.value_counts()

0     63240
2     24480
3     12240
4      2040
5      2040
6      2040
7      2040
8      2040
9      2040
10     2040
11     2040
12     2040
13     2040
14     2040
Name: km_cluster, dtype: int64

In [27]:
train_cl_0 = X.loc[X.km_cluster == 0,
                          ~X.columns.isin(['target', 'km_cluster', 'date_time'])].reset_index(drop=True)
train_cl_1 = X.loc[X.km_cluster == 1,
                          ~X.columns.isin(['target', 'km_cluster', 'date_time'])].reset_index(drop=True)
train_cl_2 = X.loc[X.km_cluster == 2,
                          ~X.columns.isin(['target', 'km_cluster', 'date_time'])].reset_index(drop=True)
train_cl_3 = X.loc[X.km_cluster == 3,
                          ~X.columns.isin(['target', 'km_cluster', 'date_time'])].reset_index(drop=True)

# 별도 모델링
# 건물별로 모델링을 할 때는 단일 값인 cooler, solar, num을 제거합니다.
for i in range(4, len(others) + 4):
    globals()['train_cl_{}'.format(i)] = X.loc[X.km_cluster == i,
                                               ~X.columns.isin(['target', 'km_cluster', 'date_time', 'cooler', 'solar', 'num'])].reset_index(drop=True)

In [28]:
train_cl_0_y = X.loc[X.km_cluster == 0, 'target'].reset_index(drop=True)
train_cl_1_y = X.loc[X.km_cluster == 1, 'target'].reset_index(drop=True)
train_cl_2_y = X.loc[X.km_cluster == 2, 'target'].reset_index(drop=True)
train_cl_3_y = X.loc[X.km_cluster == 3, 'target'].reset_index(drop=True)

# 별도 모델링
for i in range(4, len(others) + 4):
    globals()['train_cl_{}_y'.format(i)] = X.loc[X.km_cluster == i, 'target'].reset_index(drop=True)

In [29]:
test_cl_0 = y.loc[y.km_cluster == 0, ~y.columns.isin(['km_cluster', 'date_time'])]
test_cl_1 = y.loc[y.km_cluster == 1, ~y.columns.isin(['km_cluster', 'date_time'])]
test_cl_2 = y.loc[y.km_cluster == 2, ~y.columns.isin(['km_cluster', 'date_time'])]
test_cl_3 = y.loc[y.km_cluster == 3, ~y.columns.isin(['km_cluster', 'date_time'])]

# 별도 모델링
# 건물별로 모델링을 할 때는 단일 값인 cooler, solar, num을 제거합니다.
for i in range(4, len(others) + 4):
    globals()['test_cl_{}'.format(i)] = y.loc[y.km_cluster == i, ~y.columns.isin(['km_cluster', 'date_time', 'cooler', 'solar', 'num'])]

In [30]:
train_x_sets = [train_cl_0, train_cl_2, train_cl_3,
               train_cl_4, train_cl_5, train_cl_6, train_cl_7,
               train_cl_8, train_cl_9, train_cl_10, train_cl_11,
               train_cl_12, train_cl_13, train_cl_14]
train_y_sets = [train_cl_0_y, train_cl_2_y, train_cl_3_y,
               train_cl_4_y, train_cl_5_y, train_cl_6_y, train_cl_7_y,
               train_cl_8_y, train_cl_9_y, train_cl_10_y, train_cl_11_y,
               train_cl_12_y, train_cl_13_y, train_cl_14_y]
test_x_sets = [test_cl_0, test_cl_2, test_cl_3,
              test_cl_4, test_cl_5, test_cl_6, test_cl_7,
              test_cl_8, test_cl_9, test_cl_10, test_cl_11,
              test_cl_12, test_cl_13, test_cl_14]

In [31]:
preds = []
fe = []
oob = []
for x, y, t in zip(train_x_sets, train_y_sets, test_x_sets):
    rf = RandomForestRegressor(random_state=42,
                               oob_score=True,
                               n_estimators=200,
                               n_jobs = -1)
    rf.fit(x, y)
    pred = rf.predict(t)
    preds.append(pred)
    fe.append(rf.feature_importances_)
    oob.append(rf.oob_score_)

In [32]:
pred_set = []
for te, p in zip(test_x_sets, preds):
    te['pred'] = p
    pred_set.append(te)

In [33]:
pred_set = pd.concat(pred_set).sort_index()

In [34]:
sub.answer = pred_set.pred

In [35]:
cluster_pred = pred_set.pred

## XGB: 6.39996

In [38]:
preds = []
fe = []
for i in range(1, 61):
    # 건물별로 데이터 할당
    x_train_building = train.loc[train.num == i, ~train.columns.isin(['target', 'num', 'cooler', 'solar'])]
    y_train_building = train.loc[train.num == i, 'target']
    x_test_building = test.loc[test.num == i, ~test.columns.isin(['num', 'cooler', 'solar'])]

    # kfold
    cross = KFold(n_splits=5, shuffle=True, random_state=42)
    folds = []
    for train_idx, valid_idx in cross.split(x_train_building, y_train_building):
        folds.append((train_idx, valid_idx))
        
    models={}
    for fold in range(5):
        train_idx, valid_idx = folds[fold]
        X_tra = x_train_building.iloc[train_idx, :]
        y_tra = y_train_building.iloc[train_idx]
        X_val = x_train_building.iloc[valid_idx, :]
        y_val = y_train_building.iloc[valid_idx]

        model=XGBRegressor(learning_rate=0.027, random_state=42, n_estimators=10000)
        model.fit(X_tra, y_tra, eval_set=[(X_tra, y_tra), (X_val, y_val)],
                  eval_metric='rmse',
                  early_stopping_rounds=30, verbose=0)
        models[fold]=model
    
    pred = models[0].predict(x_test_building) / 5
    for k in range(1, 5):
        pred += models[k].predict(x_test_building) / 5
    
    preds.append(pred.tolist())
    fe.append(model.feature_importances_)

In [39]:
xgb_preds = sum(preds, [])

## LGBM: 6.34797

In [40]:
preds = []
fes = []

for i in range(1, 61): 
    # 건물별로 데이터 할당
    x_train_building = train.loc[train.num == i, ~train.columns.isin(['target', 'num', 'cooler', 'solar'])]
    y_train_building = train.loc[train.num == i, 'target']
    x_test_building = test.loc[test.num == i, ~test.columns.isin(['num', 'cooler', 'solar'])]
    
    cross = KFold(n_splits=5, shuffle=True, random_state=42)
    folds = []
    for train_idx, valid_idx in cross.split(x_train_building, y_train_building):
        folds.append((train_idx, valid_idx))
        
    models={}
    for fold in range(5):
        train_idx, valid_idx = folds[fold]
        X_tra = x_train_building.iloc[train_idx, :]
        y_tra = y_train_building.iloc[train_idx]
        X_val = x_train_building.iloc[valid_idx, :]
        y_val = y_train_building.iloc[valid_idx]

        model=LGBMRegressor(learning_rate=0.027, random_state=42, n_estimators=10000)
        model.fit(X_tra, y_tra, eval_set=[(X_tra, y_tra), (X_val, y_val)],
                  eval_metric='smape',
                  early_stopping_rounds=30, verbose=0)
        models[fold]=model
    
    pred = models[0].predict(x_test_building) / 5
    fe = models[0].feature_importances_ / 5
    for k in range(1, 5):
        pred += models[k].predict(x_test_building) / 5
        fe += models[k].feature_importances_ / 5
    
    preds.append(pred.tolist())
    fes.append(fe)

In [41]:
lgbm_preds = sum(preds, [])

## 앙상블 Best3: 6.11069

In [42]:
best_3_ens_preds = (np.array(cluster_pred) + np.array(xgb_preds) + np.array(lgbm_preds)) / 3

In [43]:
sub.answer = best_3_ens_preds

In [44]:
#sub.to_csv('submission/210625_final_newbest3-ens.csv', index=False)
sub

Unnamed: 0,num_date_time,answer
0,1 2020-08-25 00,2892.107278
1,1 2020-08-25 01,2890.838846
2,1 2020-08-25 02,2889.665891
3,1 2020-08-25 03,2888.122274
4,1 2020-08-25 04,2885.626592
...,...,...
10075,60 2020-08-31 19,1376.812282
10076,60 2020-08-31 20,1324.986924
10077,60 2020-08-31 21,1217.368370
10078,60 2020-08-31 22,1187.763003


- **PUBLIC SCORE / RANK(RATIO):**  6.11069 / 21 (6.25%)
- **PRIVATE SCORE / RANK(RATIO):** 5.58900 / 13 (3.86%)