In [1]:
import random
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from catboost import CatBoostRegressor
from sktime.forecasting.model_selection import temporal_train_test_split

def seed_everything(seed):
    """Seed the random number generators this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both Python's ``random`` module and NumPy's global RNG.

    Args:
        seed: Integer seed applied to every generator.

    Note:
        ``PYTHONHASHSEED`` is read when an interpreter starts, so assigning it
        here influences child processes rather than the running kernel.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Both seeders take the seed as their only argument.
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)

seed_everything(0) # Fix the random seed so reruns of the notebook are reproducible

def weighted_mse(alpha = 1):
    """Build an asymmetric squared-error objective.

    Samples whose residual ``label - pred`` is positive (under-prediction) are
    weighted by ``alpha``; all other samples keep weight 1.

    Args:
        alpha: Weight applied to the loss where ``label > pred``.

    Returns:
        A function ``(label, pred) -> (grad, hess)`` giving the first and second
        derivatives of the weighted squared error with respect to ``pred``.
    """
    def weighted_mse_fixed(label, pred):
        diff = (label - pred).astype("float")
        # Per-sample weight: alpha where the model under-predicts, 1 elsewhere.
        weight = np.where(diff > 0, alpha, 1)
        grad = (-2 * weight) * diff
        hess = 2.0 * weight
        return grad, hess
    return weighted_mse_fixed

def mae(y, pred):
    """Return the mean absolute error between targets ``y`` and predictions ``pred``."""
    absolute_errors = np.abs(y - pred)
    return np.mean(absolute_errors)
# Scoring code (evaluation metric)
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``100 / n * sum(2 * |y_pred - y_true| / (|y_true| + |y_pred|))``.

    A term where both the target and the prediction are exactly zero is a
    perfect prediction whose ratio is 0/0; it is counted as zero error rather
    than letting the resulting NaN poison the whole score.

    Args:
        y_true: 1-D array-like of ground-truth values.
        y_pred: 1-D array-like of predictions, same length as ``y_true``.

    Returns:
        The SMAPE score as a float.

    Raises:
        ZeroDivisionError: If ``y_true`` is empty.
    """
    # Work on plain float arrays so the metric is positional for any array-like.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = 2 * np.abs(y_pred - y_true)
    denominator = np.abs(y_true) + np.abs(y_pred)

    # Divide only where the denominator is non-zero; masked entries keep the 0
    # from `out`, which is exact because the numerator is also 0 there.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return 100/len(y_true) * np.sum(ratio)

def validate(valid_x, valid_y, model):
    """Score ``model`` on a validation set.

    Args:
        valid_x: Validation features passed to ``model.predict``.
        valid_y: Ground-truth targets for ``valid_x``.
        model: Fitted estimator exposing ``predict``.

    Returns:
        Tuple ``(smape_score, mae_score)``.
    """
    predictions = model.predict(valid_x)
    return SMAPE(valid_y, predictions), mae(valid_y, predictions)

In [3]:
drop_col = ['week', 'CDH',  'THI','day_hour_mean','day_hour_std']

# Load each building's train / pretest split once. The data does not depend on
# n_estimator, so re-reading the CSV files inside the search loop is wasted work.
building_splits = []
for i in range(1,101,1):
    train_df = pd.read_csv(f'./pretest/train_building{i}.csv').drop(columns=drop_col)
    pretest_df = pd.read_csv(f'./pretest/pretest_building{i}.csv').drop(columns=drop_col)
    building_splits.append((
        train_df.drop(columns=['power_consumption']),    # x_train
        train_df['power_consumption'],                   # y_train
        pretest_df.drop(columns=['power_consumption']),  # x_valid
        pretest_df['power_consumption'],                 # y_valid
    ))

# Fit one model per building for every candidate iteration count and report
# the scores averaged over all buildings.
for n_estimator in [100,200,300,400]:
    smape_scores = []
    mae_scores = []
    for x_train, y_train, x_valid, y_valid in building_splits:
        model_catboost = CatBoostRegressor(iterations=n_estimator, verbose=False)
        model_catboost.fit(x_train, y_train)
        smape_score, mae_score = validate(x_valid, y_valid, model_catboost)
        smape_scores.append(smape_score)
        mae_scores.append(mae_score)

    smape_mean = np.mean(smape_scores)
    mae_mean = np.mean(mae_scores)

    print(f"n_estimator: {n_estimator}")
    print(f'SMAPE: {smape_mean}\nMAE: {mae_mean}')


n_estimator: 100
SMAPE: 6.017793588800727
MAE: 114.01722809143133
n_estimator: 200
SMAPE: 5.886380354045253
MAE: 111.73339453936953
n_estimator: 300
SMAPE: 5.855412350273274
MAE: 110.85018189224655
n_estimator: 400
SMAPE: 5.79102319474743
MAE: 109.81080044692604


In [4]:
drop_col = ['week', 'CDH',  'THI','day_hour_mean','day_hour_std']
seeds = [0,1,2,3,4,5,6]
building_preds = []   # one seed-averaged prediction vector per building, in building order

for i in tqdm(range(1,101,1)):
    train_df = pd.read_csv(f'./submit/train_building{i}.csv').drop(columns=drop_col)
    test_df = pd.read_csv(f'./submit/test_building{i}.csv').drop(columns=drop_col)

    t_x = train_df.drop(columns=['power_consumption'])
    t_y = train_df['power_consumption']

    seed_preds = []   # predictions of each seeded model for this building
    for seed in seeds:
        # Pass the seed to the model: without random_seed every model in this
        # loop is configured identically, so averaging them is not a seed ensemble.
        model = CatBoostRegressor(iterations=400, random_seed=seed, verbose=False)
        model.fit(t_x, t_y)
        seed_preds.append(model.predict(test_df))

    # Element-wise mean over the seeded models.
    building_preds.append(np.mean(seed_preds, axis=0))

# Concatenate once at the end instead of growing an array with np.append,
# which copies the whole array on every iteration.
preds = np.concatenate(building_preds)
    


  0%|          | 0/100 [00:00<?, ?it/s]


KeyError: "['hour_mean', 'hour_std', 'max_power', 'min_power'] not found in axis"

In [None]:
# Destination path of the submission CSV.
csv_name = "./submission/CatBoost_wsw_col.csv"

In [None]:
# Fill the sample submission's 'answer' column with the predictions and save it
# to csv_name without the index column.
submission = pd.read_csv('./data/sample_submission.csv').assign(answer=preds)
submission.to_csv(f'{csv_name}', index=False)