In [2]:
import random
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from catboost import CatBoostRegressor
from sktime.forecasting.model_selection import temporal_train_test_split

def seed_everything(seed):
    """Seed the global random number generators used in this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both Python's ``random`` module and NumPy's legacy global RNG.

    Args:
        seed: Integer seed applied to every generator.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up,
    # so setting it here likely only affects child processes - confirm if
    # hash determinism in this kernel matters.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(0) # Fix the seed

def weighted_mse(alpha = 1):
    """Build an asymmetric squared-error objective.

    The returned callable computes the first and second derivatives (with
    respect to ``pred``) of a squared error in which samples with
    ``label > pred`` are weighted by ``alpha`` and all others by 1.

    Args:
        alpha: Weight applied where the residual ``label - pred`` is positive.

    Returns:
        A function ``(label, pred) -> (grad, hess)``.
    """
    def weighted_mse_fixed(label, pred):
        err = (label - pred).astype("float")
        under_predicted = err > 0
        # Plain squared-error gradient, used where the model did not under-predict.
        base_grad = -2*err
        grad = np.where(under_predicted, -2*alpha*err, base_grad)
        hess = np.where(under_predicted, 2*alpha, 2.0)
        return grad, hess
    return weighted_mse_fixed

def mae(y, pred):
    """Return the mean absolute error between ``y`` and ``pred``."""
    abs_errors = np.abs(y - pred)
    return np.mean(abs_errors)
# Code for measuring the score
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``100 / n * sum(2 * |y_pred - y_true| / (|y_true| + |y_pred|))``.
    A term whose truth and prediction are both zero is scored as 0 (a perfect
    prediction) instead of producing 0/0 = NaN and poisoning the whole mean.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predictions, same length as ``y_true``.

    Returns:
        The SMAPE score as a float in the range [0, 200].
    """
    # Coerce to float arrays so lists work and arithmetic is positional.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = 2 * np.abs(y_pred - y_true)
    denominator = np.abs(y_true) + np.abs(y_pred)

    # Divide only where the denominator is non-zero; the remaining slots keep
    # the 0.0 they were initialised with.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return 100/len(y_true) * np.sum(ratio)

def validate(valid_x, valid_y, model):
    """Score a fitted model on a validation set.

    Args:
        valid_x: Features passed to ``model.predict``.
        valid_y: Ground-truth targets matching ``valid_x``.
        model: Any object exposing a ``predict`` method.

    Returns:
        Tuple ``(smape_score, mae_score)``.
    """
    predictions = model.predict(valid_x)
    return SMAPE(valid_y, predictions), mae(valid_y, predictions)

In [3]:
# Building ids (1-100) partitioned into five clusters. Consecutive runs are
# written as ranges; each entry expands to a plain list of ints.
clusters = [
    [3, 32, *range(61, 69)],
    [8, *range(17, 32), *range(45, 54), 55, 56, *range(58, 61), *range(69, 81), *range(82, 85)],
    [1, 2, *range(4, 8), 9, 11, 12, 16, *range(33, 37), 54, 57, 81, *range(85, 101)],
    [10, 13, 14, 15],
    [*range(37, 45)],
]

# Columns removed from each building's frames before fitting;
# drop_cols[k] is applied to the buildings listed in clusters[k].
drop_cols = [
    # cluster 0
    [
        'week', 'THI', 'CDH', 'max_power', 'min_power',
        'day_hour_mean', 'day_hour_std', 'hour_mean', 'hour_std',
        'outlier', 'sensory', 'Rain_cat',
    ],
    # cluster 1
    [
        'week', 'THI', 'CDH', 'max_power', 'min_power',
        'day_hour_mean', 'day_hour_std', 'hour_mean', 'hour_std',
        'sensory', 'Rain_cat',
    ],
    # cluster 2
    ['week', 'THI', 'CDH', 'max_power', 'min_power'],
    # cluster 3
    [
        'week', 'month', 'holiday', 'THI', 'CDH',
        'day_hour_mean', 'day_hour_std', 'hour_mean', 'hour_std',
        'max_power', 'min_power', 'THI_cat', 'sensory', 'Rain_cat',
    ],
    # cluster 4
    ['THI', 'hour_mean', 'hour_std', 'outlier'],
]

# Sweep the number of boosting iterations and report the mean SMAPE / MAE
# over all buildings on the held-out "pretest" split.

# Load and split every building's data once up front: the frames do not depend
# on n_estimator, so re-reading the CSVs on each sweep iteration is wasted I/O.
building_data = {}
for cluster, drop_col in zip(clusters, drop_cols):
    for i in cluster:
        train_df = pd.read_csv(f'./pretest/train_building{i}.csv').drop(columns=drop_col)
        pretest_df = pd.read_csv(f'./pretest/pretest_building{i}.csv').drop(columns=drop_col)
        building_data[i] = (
            train_df.drop(columns=['power_consumption']),    # x_train
            train_df['power_consumption'],                   # y_train
            pretest_df.drop(columns=['power_consumption']),  # x_valid
            pretest_df['power_consumption'],                 # y_valid
        )

for n_estimator in [100,200,300,400]:
    smape_scores = []
    mae_scores = []

    for cluster in clusters:
        # One progress bar per cluster; one model per building.
        for i in tqdm(cluster):
            x_train, y_train, x_valid, y_valid = building_data[i]

            model_catboost = CatBoostRegressor(iterations=n_estimator, verbose=False)
            model_catboost.fit(x_train, y_train)
            smape_score, mae_score = validate(x_valid, y_valid, model_catboost)
            smape_scores.append(smape_score)
            mae_scores.append(mae_score)

    # Unweighted average across buildings.
    smape_mean = np.mean(smape_scores)
    mae_mean = np.mean(mae_scores)

    print(f"n_estimator: {n_estimator}")
    print(f'SMAPE: {smape_mean}\nMAE: {mae_mean}')


100%|██████████| 10/10 [00:01<00:00,  6.73it/s]
100%|██████████| 45/45 [00:06<00:00,  7.46it/s]
100%|██████████| 33/33 [00:05<00:00,  6.01it/s]
100%|██████████| 4/4 [00:00<00:00,  7.88it/s]
100%|██████████| 8/8 [00:01<00:00,  5.74it/s]


n_estimator: 100
SMAPE: 5.862831303791805
MAE: 111.96320758944267


100%|██████████| 10/10 [00:02<00:00,  4.23it/s]
100%|██████████| 45/45 [00:10<00:00,  4.30it/s]
100%|██████████| 33/33 [00:09<00:00,  3.32it/s]
100%|██████████| 4/4 [00:00<00:00,  4.54it/s]
100%|██████████| 8/8 [00:02<00:00,  3.18it/s]


n_estimator: 200
SMAPE: 5.75246218356308
MAE: 109.7239054054223


100%|██████████| 10/10 [00:03<00:00,  2.99it/s]
100%|██████████| 45/45 [00:15<00:00,  2.97it/s]
100%|██████████| 33/33 [00:15<00:00,  2.19it/s]
100%|██████████| 4/4 [00:01<00:00,  3.07it/s]
100%|██████████| 8/8 [00:03<00:00,  2.22it/s]


n_estimator: 300
SMAPE: 5.69779291148119
MAE: 108.9770746005359


100%|██████████| 10/10 [00:04<00:00,  2.36it/s]
100%|██████████| 45/45 [00:18<00:00,  2.38it/s]
100%|██████████| 33/33 [00:18<00:00,  1.82it/s]
100%|██████████| 4/4 [00:01<00:00,  2.42it/s]
100%|██████████| 8/8 [00:04<00:00,  1.67it/s]

n_estimator: 400
SMAPE: 5.654175886228478
MAE: 108.52367413633856





In [5]:

# Train a seed ensemble per building on the full training data and collect
# the averaged test predictions into one flat submission vector.

# Unused in this cell; retained in case other cells reference it.
color = ['red', 'green' , 'blue', 'black', 'm']
# One slot per building id; slot i-1 holds building i's averaged prediction.
preds = [0 for _ in range(100)]


for k in range(5):
    cluster = clusters[k]
    drop_col = drop_cols[k]
    c = color[k]
    for i in tqdm(cluster):
        train_df = pd.read_csv(f'./submit/train_building{i}.csv').drop(columns=drop_col)
        test_df = pd.read_csv(f'./submit/test_building{i}.csv').drop(columns=drop_col)

        t_x = train_df.drop(columns=['power_consumption'])
        t_y = train_df['power_consumption']

        # Per-seed predictions, keyed by seed.
        seed_preds = {}
        for seed in [0,1,2,3,4,5,6]:
            # Pass the seed through: without random_seed every fit uses
            # CatBoost's fixed default seed, so all seven models would be
            # identical and the averaging below would be a no-op.
            model = CatBoostRegressor(iterations=400, random_seed=seed, verbose=False)
            model.fit(t_x, t_y)
            y_pred = model.predict(test_df)
            seed_preds[seed] = y_pred

        # Data frame holding one column of predictions per seed.
        pred_df = pd.DataFrame(seed_preds)
        pred = pred_df.mean(axis=1)
        preds[i-1] = pred

# Assumes every building yields the same number of test rows so the stacked
# array is rectangular - TODO confirm. Flattening then concatenates the
# buildings in id order.
np_preds = np.array(preds)
submit = np_preds.flatten()


100%|██████████| 10/10 [00:30<00:00,  3.05s/it]
100%|██████████| 45/45 [02:11<00:00,  2.93s/it]
100%|██████████| 33/33 [02:05<00:00,  3.80s/it]
100%|██████████| 4/4 [00:11<00:00,  2.92s/it]
100%|██████████| 8/8 [00:33<00:00,  4.14s/it]


In [6]:
########################################
# Output path of the submission CSV written by the final cell.
csv_name = './submission/CatBoost_wsw_clust5.csv'
########################################

In [7]:
# Fill the sample submission's 'answer' column with the predictions and save it.
submission = pd.read_csv('./data/sample_submission.csv')
submission['answer'] = submit

# Create the output directory first so a missing folder does not discard the
# results of the long training run above. Skipped when csv_name is a bare
# filename (dirname is '').
out_dir = os.path.dirname(csv_name)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
submission.to_csv(csv_name, index = False)