In [6]:
import random
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from catboost import CatBoostRegressor
from sktime.forecasting.model_selection import temporal_train_test_split

def seed_everything(seed):
    """Seed the random number generators this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both the standard-library ``random`` module and NumPy's global
    generator with ``seed``.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Both seeders take the same argument, so apply them in one pass.
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

seed_everything(0) # Fix the random seed

def weighted_mse(alpha = 1):
    """Build an asymmetric squared-error objective.

    Args:
        alpha: Multiplier applied to the gradient and hessian wherever the
            residual ``label - pred`` is positive (the prediction is below
            the label). Other positions use a multiplier of 1.

    Returns:
        A function ``(label, pred) -> (grad, hess)`` giving the element-wise
        first and second derivatives of the weighted squared error with
        respect to ``pred``.
    """
    def weighted_mse_fixed(label, pred):
        diff = (label - pred).astype("float")
        # Positions where the prediction undershoots the label.
        undershoot = diff > 0
        gradient = np.where(undershoot, -2*alpha*diff, -2*diff)
        hessian = np.where(undershoot, 2*alpha, 2.0)
        return gradient, hessian
    return weighted_mse_fixed

def mae(y, pred):
    """Return the mean absolute error between ``y`` and ``pred``."""
    absolute_errors = abs(y - pred)
    return np.mean(absolute_errors)
# Code for measuring the score
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``100 / n * sum(2 * |y_pred - y_true| / (|y_true| + |y_pred|))``.
    Terms where both the true value and the prediction are exactly zero
    would be ``0 / 0``; they are counted as zero error so a single such
    point does not turn the whole score into NaN.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, same length as ``y_true``.

    Returns:
        The SMAPE score as a float in the range [0, 200].

    Raises:
        ZeroDivisionError: If ``y_true`` is empty.
    """
    # Work on plain float arrays so the masked divide below behaves the same
    # for lists, NumPy arrays and pandas Series.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = 2 * np.abs(y_pred - y_true)
    denominator = np.abs(y_true) + np.abs(y_pred)

    # Divide only where the denominator is non-zero; the remaining entries
    # keep the 0.0 they were initialised with.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return 100/len(y_true) * np.sum(ratio)

def validate(valid_x, valid_y, model):
    """Score a fitted model on a validation set.

    Args:
        valid_x: Features passed to ``model.predict``.
        valid_y: Targets the predictions are compared against.
        model: Object exposing a ``predict`` method.

    Returns:
        A ``(smape_score, mae_score)`` tuple.
    """
    predictions = model.predict(valid_x)
    return SMAPE(valid_y, predictions), mae(valid_y, predictions)

In [7]:
# Building ids grouped into 9 clusters. Every id from 1 to 100 appears in
# exactly one cluster. clusters[k] is paired by position with drop_cols[k].
clusters = [
[32, 61, 62, 63, 64, 65, 66, 67, 68],
[8, 24, 25, 26, 27, 28, 29, 30, 31, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 56, 58, 59, 60, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 83, 84],
[1, 2, 4, 5, 6, 7, 9, 11, 12, 16, 33, 34, 35, 36, 54, 57, 81, 96, 97, 98, 99, 100],
[13, 15],
[37, 38, 39, 40, 41, 42, 43, 44],
[85, 86, 87, 88, 89, 90, 91, 92],
[17, 18, 19, 20, 21, 22, 23],
[93, 94],
[3, 10, 14, 95]
]

# Columns removed from each building's CSV before training, one list per
# cluster: drop_cols[k] is applied to every building listed in clusters[k].
drop_cols = [
[ 'week', 'THI', 'CDH', 'max_power', 'min_power','day_hour_mean','day_hour_std', 'hour_mean', 'hour_std','outlier','sensory','Rain_cat', 'close'],
[ 'week', 'THI', 'CDH', 'max_power', 'min_power' ,'day_hour_mean','day_hour_std','hour_mean', 'hour_std','sensory','Rain_cat', 'close'],
[ 'week', 'THI', 'CDH', 'max_power', 'min_power' , 'close'],
[ 'week','month','holiday', 'THI', 'CDH', 'day_hour_mean', 'day_hour_std','hour_mean', 'hour_std', 'max_power', 'min_power', 'THI_cat', 'sensory','Rain_cat', 'close'],
[ 'THI','hour_mean', 'hour_std', 'close'],
[ 'THI', 'CDH', 'Rain_cat', 'sensory'],
[ 'windspeed', 'week', 'CDH', 'sin_time', 'cos_time', 'THI', 'sensory', 'THI_cat','day_hour_mean', 'day_hour_std', 'hour_mean', 'hour_std', 'max_power', 'min_power','Rain_cat','close'],
[ 'week', 'THI', 'CDH', 'THI_cat', 'close' ],
[ 'week', 'month', 'THI', 'CDH', 'close', 'outlier', 'day_hour_mean', 'day_hour_std','hour_mean', 'hour_std', 'Rain_cat', 'cos_time', 'sin_time']
]

# Sweep the number of CatBoost iterations. For each candidate, fit one model
# per building on its pretest training file, score it on the matching pretest
# file, and print the SMAPE / MAE averaged over all buildings.
for n_estimator in (100, 200, 300, 400):
    smape_scores, mae_scores = [], []

    # clusters and drop_cols are parallel lists, so walk them together.
    for cluster, drop_col in zip(clusters, drop_cols):
        for i in tqdm(cluster):
            train_df = pd.read_csv(f'./pretest/train_building{i}.csv').drop(columns=drop_col)
            pretest_df = pd.read_csv(f'./pretest/pretest_building{i}.csv').drop(columns=drop_col)

            # Separate the features from the power_consumption target.
            x_train, y_train = train_df.drop(columns=['power_consumption']), train_df['power_consumption']
            x_valid, y_valid = pretest_df.drop(columns=['power_consumption']), pretest_df['power_consumption']

            model_catboost = CatBoostRegressor(iterations=n_estimator, verbose=False)
            model_catboost.fit(x_train, y_train)

            smape_score, mae_score = validate(x_valid, y_valid, model_catboost)
            smape_scores.append(smape_score)
            mae_scores.append(mae_score)

    # Unweighted mean over buildings.
    smape_mean, mae_mean = np.mean(smape_scores), np.mean(mae_scores)

    print(f"n_estimator: {n_estimator}")
    print(f'SMAPE: {smape_mean}\nMAE: {mae_mean}')


100%|██████████| 9/9 [00:01<00:00,  7.42it/s]
100%|██████████| 38/38 [00:05<00:00,  7.06it/s]
100%|██████████| 22/22 [00:04<00:00,  5.46it/s]
100%|██████████| 2/2 [00:00<00:00,  7.60it/s]
100%|██████████| 8/8 [00:01<00:00,  5.62it/s]
100%|██████████| 8/8 [00:01<00:00,  6.22it/s]
100%|██████████| 7/7 [00:00<00:00,  7.87it/s]
100%|██████████| 2/2 [00:00<00:00,  5.51it/s]
100%|██████████| 4/4 [00:00<00:00,  6.14it/s]


n_estimator: 100
SMAPE: 5.472086882183638
MAE: 104.263124365366


100%|██████████| 9/9 [00:02<00:00,  3.74it/s]
100%|██████████| 38/38 [00:08<00:00,  4.22it/s]
100%|██████████| 22/22 [00:06<00:00,  3.37it/s]
100%|██████████| 2/2 [00:00<00:00,  4.37it/s]
100%|██████████| 8/8 [00:02<00:00,  3.12it/s]
100%|██████████| 8/8 [00:02<00:00,  3.51it/s]
100%|██████████| 7/7 [00:01<00:00,  4.12it/s]
100%|██████████| 2/2 [00:00<00:00,  3.33it/s]
100%|██████████| 4/4 [00:01<00:00,  3.38it/s]


n_estimator: 200
SMAPE: 5.4015404491887065
MAE: 102.49110733148403


100%|██████████| 9/9 [00:03<00:00,  2.72it/s]
100%|██████████| 38/38 [00:12<00:00,  2.94it/s]
100%|██████████| 22/22 [00:08<00:00,  2.50it/s]
100%|██████████| 2/2 [00:00<00:00,  3.05it/s]
100%|██████████| 8/8 [00:03<00:00,  2.28it/s]
100%|██████████| 8/8 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:02<00:00,  3.32it/s]
100%|██████████| 2/2 [00:00<00:00,  2.35it/s]
100%|██████████| 4/4 [00:01<00:00,  2.69it/s]


n_estimator: 300
SMAPE: 5.325602921694737
MAE: 101.53733801421717


100%|██████████| 9/9 [00:03<00:00,  2.42it/s]
100%|██████████| 38/38 [00:15<00:00,  2.48it/s]
100%|██████████| 22/22 [00:11<00:00,  1.92it/s]
100%|██████████| 2/2 [00:00<00:00,  2.38it/s]
100%|██████████| 8/8 [00:05<00:00,  1.49it/s]
100%|██████████| 8/8 [00:04<00:00,  1.90it/s]
100%|██████████| 7/7 [00:02<00:00,  2.33it/s]
100%|██████████| 2/2 [00:01<00:00,  1.54it/s]
100%|██████████| 4/4 [00:01<00:00,  2.01it/s]

n_estimator: 400
SMAPE: 5.250085104671692
MAE: 100.49054019982881





In [8]:

# Train the final per-building models and build the flat prediction vector.
#
# For every building, several CatBoost models are trained with different
# random seeds and their test-set predictions are averaged. The seed has to be
# passed to the model explicitly; without it every fit uses the same default
# configuration and the average is just a repeat of one model's output.

# One colour per cluster. Not used in this cell; kept in case plotting code
# elsewhere in the notebook refers to it.
color = [ 'red','green','blue','black', 'orange', 'm', 'yellow', 'purple', 'gray']
# Slot i-1 holds the averaged prediction for building i.
preds = [0 for _ in range(100)]

# clusters and drop_cols are parallel lists, so walk them together.
for cluster, drop_col in zip(clusters, drop_cols):
    for i in tqdm(cluster):
        train_df = pd.read_csv(f'./submit/train_building{i}.csv').drop(columns=drop_col)
        test_df = pd.read_csv(f'./submit/test_building{i}.csv').drop(columns=drop_col)

        t_x = train_df.drop(columns=['power_consumption'])
        t_y = train_df['power_consumption']

        # Predictions keyed by seed; turned into one column per seed below.
        seed_preds = {}
        for seed in [0,1,2,3,4,5,6]:
            model = CatBoostRegressor(iterations=400, random_seed=seed, verbose=False)
            model.fit(t_x, t_y)
            seed_preds[seed] = model.predict(test_df)

        pred_df = pd.DataFrame(seed_preds)   # data frame holding the per-seed predictions
        pred = pred_df.mean(axis=1)          # average across seeds for each test row
        preds[i-1] = pred

# Stack the per-building predictions in building-id order and flatten them.
np_preds = np.array(preds)
submit = np_preds.flatten()


100%|██████████| 9/9 [00:27<00:00,  3.01s/it]
100%|██████████| 38/38 [01:51<00:00,  2.94s/it]
100%|██████████| 22/22 [01:24<00:00,  3.83s/it]
100%|██████████| 2/2 [00:05<00:00,  2.97s/it]
100%|██████████| 8/8 [00:32<00:00,  4.12s/it]
100%|██████████| 8/8 [00:28<00:00,  3.59s/it]
100%|██████████| 7/7 [00:20<00:00,  2.87s/it]
100%|██████████| 2/2 [00:08<00:00,  4.16s/it]
100%|██████████| 4/4 [00:13<00:00,  3.47s/it]


In [9]:
########################################
# Path the submission CSV is written to.
csv_name = './submission/CatBoost_wsw_clust9_2.csv'
########################################

In [10]:
# Put the flattened predictions into the sample submission's 'answer' column
# and write the result to csv_name.
submission = pd.read_csv('./data/sample_submission.csv')
submission['answer'] = submit

# Create the output directory first so a missing folder does not throw away
# the predictions computed above. A bare file name has no directory part.
out_dir = os.path.dirname(csv_name)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
submission.to_csv(csv_name, index = False)