In [None]:
import os
import sys

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn as skl
import sktime
import tqdm as tq
import xgboost as xgb
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.utils.plotting import plot_series
from tqdm import tqdm
from xgboost import XGBRegressor

def export_datetime(x) :
    """Return, as a string, the first 11 characters of the second '_'-separated field of ``x``.

    Presumably ``x`` looks like ``'<building>_<YYYYMMDD HH>'`` -- confirm against the data.
    """
    stamp = x.split('_')[1]
    return str(stamp[:11])


def export_month(x) :
    """Return characters 4-5 of the second '_'-separated field of ``x`` as an int.

    Presumably this is the month of a ``YYYYMMDD HH`` stamp -- confirm against the data.
    """
    stamp = x.split('_')[1]
    month_digits = stamp[4:6]
    return int(month_digits)

def export_date(x):
    """Return characters 6-7 of the second '_'-separated field of ``x`` as an int.

    Presumably this is the day of month of a ``YYYYMMDD HH`` stamp -- confirm against the data.
    """
    stamp = x.split('_')[1]
    day_digits = stamp[6:8]
    return int(day_digits)

def export_time_cos(x):
    """Cosine of the 24-hour cyclic encoding of characters 9-10 of the second '_' field of ``x``.

    The two characters are parsed as an integer ``h`` and ``cos(h / 24 * 2 * pi)`` is returned.
    """
    hour = int(x.split('_')[1][9:11])
    day_fraction = hour / 24
    return np.cos(day_fraction * 2 * np.pi)

def export_time_sin(x):
    """Sine of the 24-hour cyclic encoding of characters 9-10 of the second '_' field of ``x``.

    The two characters are parsed as an integer ``h`` and ``sin(h / 24 * 2 * pi)`` is returned.
    """
    hour = int(x.split('_')[1][9:11])
    day_fraction = hour / 24
    return np.sin(day_fraction * 2 * np.pi)

def SMAPE(real, pred):
    """Symmetric mean absolute percentage error, in percent.

    Each absolute error is divided by the mean of ``|real|`` and ``|pred|``,
    scaled by 100, and the result is averaged with ``.mean()``.

    NOTE(review): a later cell defines another ``SMAPE`` without the ``/ 2`` in
    the denominator; once that cell runs it shadows this definition.
    """
    abs_error = abs(real - pred)
    mean_magnitude = (abs(real) + abs(pred)) / 2
    per_point = abs_error / mean_magnitude * 100
    return per_point.mean()

# Raise pandas' 'display.max_columns' option to 30 for this session.
pd.set_option('display.max_columns', 30)

In [None]:
# Load the training data

train = pd.read_csv('Data/train.csv')#, encoding = 'cp949')

# Calendar features parsed from the `num_date_time` key column.
train['date_time'] = train['num_date_time'].apply(export_datetime)
date = pd.to_datetime(train.date_time)
train['hour'] = date.dt.hour
train['day'] = date.dt.weekday
train['month'] = date.dt.month
# `Series.dt.weekofyear` was removed in pandas 2.0; `isocalendar().week` yields
# the same ISO week number (cast from UInt32 to a plain integer column).
train['week'] = date.dt.isocalendar().week.astype(int)
# Weekend flag (weekday 5/6 -> 1), vectorised instead of a row-wise apply.
train['holiday'] = np.where(train['day'] < 5, 0, 1)
# Cyclic encoding of the hour of day.
train['sin_time'] = np.sin(2*np.pi*train.hour/24)
train['cos_time'] = np.cos(2*np.pi*train.hour/24)

# Derived feature: THI (the CDH feature is computed further down in this cell).
# NOTE(review): the usual temperature-humidity index uses temperature in the last
# factor, i.e. (9/5*T - 26); here humidity is used instead. Left unchanged so that
# train and test features stay identical -- confirm, and change both cells together.
train['THI'] = 9/5*train['기온(C)'] - 0.55*(1-train['습도(%)']/100)*(9/5*train['습도(%)']-26)+32
def CDH(xs):
    """Trailing-window sum of ``xs - 26``.

    For every position ``i`` the result holds the sum of ``xs[j] - 26`` over the
    window ending at ``i`` that spans at most the 12 most recent entries
    (fewer at the start of the array).

    Parameters
    ----------
    xs : numpy array of numbers (slicing and ``- 26`` are applied to it directly).

    Returns
    -------
    numpy array with one window sum per element of ``xs``.
    """
    window_sums = [
        np.sum(xs[max(pos - 11, 0):(pos + 1)] - 26)
        for pos in range(len(xs))
    ]
    return np.array(window_sums)

# CDH per building. Grouping by building number assigns every row the value
# computed from its own building's temperature series, independent of row order
# and of which building numbers are present (the previous positional
# concatenation over buildings 1..100 silently relied on both).
train['CDH'] = train.groupby('건물번호')['기온(C)'].transform(lambda temps: CDH(temps.values))

# Drop the raw columns that are not used as model inputs.
train = train.drop(columns = ['일조(hr)', '일사(MJ/m2)', 'num_date_time', 'hour', 'date_time'])
# Rename the target to 'power'. pop-then-assign keeps it as the LAST column,
# which train_test_split relies on (it slices features positionally with 2:-1).
train['power'] = train.pop('전력소비량(kWh)')
# Missing weather readings are filled with 0.
train['강수량(mm)']= train['강수량(mm)'].fillna(0)
train['풍속(m/s)']= train['풍속(m/s)'].fillna(0)
train['습도(%)']= train['습도(%)'].fillna(0)

## save the preprocessed data
# The index is written as the first CSV column; the loading cells strip it again with .iloc[:,1:].
train.to_csv('./data/train_preprocessed.csv')

In [None]:
# Load the test data and derive the same features as for the training frame.
test = pd.read_csv('./data/test.csv')
test['date_time'] = test['num_date_time'].apply(export_datetime)
date_test = pd.to_datetime(test.date_time)

test['hour'] = date_test.dt.hour
test['day'] = date_test.dt.weekday
test['month'] = date_test.dt.month
# `Series.dt.weekofyear` was removed in pandas 2.0; `isocalendar().week` yields
# the same ISO week number (cast from UInt32 to a plain integer column).
test['week'] = date_test.dt.isocalendar().week.astype(int)

# NOTE(review): the usual temperature-humidity index uses temperature in the last
# factor, i.e. (9/5*T - 26); here humidity is used instead. Left unchanged so that
# train and test features stay identical -- confirm, and change both cells together.
test['THI'] = 9/5*test['기온(C)'] - 0.55*(1-test['습도(%)']/100)*(9/5*test['습도(%)']-26)+32

# CDH per building (uses the CDH function defined in the training cell). Grouping
# by building number keeps each value aligned with its own row, independent of
# row order and of which building numbers are present.
test['CDH'] = test.groupby('건물번호')['기온(C)'].transform(lambda temps: CDH(temps.values))
# Cyclic encoding of the hour of day.
test['sin_time'] = np.sin(2*np.pi*test.hour/24)
test['cos_time'] = np.cos(2*np.pi*test.hour/24)
# Weekend flag (weekday 5/6 -> 1), vectorised instead of a row-wise apply.
test['holiday'] = np.where(test['day'] < 5, 0, 1)

# Keep the columns in this exact order: downstream code selects features by position.
test = test[['건물번호', '일시', '기온(C)', '강수량(mm)', '풍속(m/s)', '습도(%)',
        'day', 'month', 'week', 'holiday','sin_time','cos_time', 'THI', 'CDH',  ]]
# The index is written as the first CSV column; the loading cells strip it again with .iloc[:,1:].
test.to_csv('./data/test_preprocessed.csv')

In [None]:
import pandas as pd
from sktime.utils.plotting import plot_series
from sktime.forecasting.model_selection import temporal_train_test_split
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
import numpy as np
import sklearn
from tqdm import tqdm

# Configure here which features (if any) should be removed
def pop_feat(train_copy, log = False, train = True, drop_cols = ()) :
    """Return a copy of ``train_copy`` with optional feature removal and log target.

    Parameters
    ----------
    train_copy : DataFrame to prepare. It is not modified; a new frame is returned.
    log : when truthy (and ``train`` is truthy) the 'power' column is replaced by
        its natural logarithm.
    train : whether the frame is the training frame, i.e. whether a 'power'
        target column is expected to be transformed.
    drop_cols : iterable of column names to remove, e.g. ``['week', 'holiday']``.
        This replaces the previous practice of (un)commenting ``pop`` lines.
        Pass the same names for the train and the test frame so their feature
        columns stay aligned.

    Returns
    -------
    DataFrame -- the prepared copy.

    Raises
    ------
    KeyError if a name in ``drop_cols`` is not a column of the frame.
    """
    # drop() returns a new frame even for an empty list, so the caller's frame
    # is never mutated by the log transform below.
    train_copy = train_copy.drop(columns = list(drop_cols))

    if log and train :
        train_copy['power'] = np.log(train_copy['power'])
    return train_copy

def train_test_split(train, num, val_hour) :
    """Split one building's rows into training and validation parts.

    The rows whose '건물번호' equals ``num`` are selected; the target is the
    'power' column and the features are the columns at positions 2 up to (not
    including) the last one.

    Parameters
    ----------
    train : DataFrame holding all buildings.
    num : building number to select.
    val_hour : passed as ``test_size`` to ``temporal_train_test_split``;
        0 disables the split.

    Returns
    -------
    (y_train, y_valid, x_train, x_valid); with ``val_hour == 0`` the full
    target/features are returned and both validation entries are ``None``.
    """
    building_rows = train.loc[train['건물번호'] == num]
    y = building_rows['power']
    x = building_rows.iloc[:, 2:-1]

    if val_hour != 0 :
        y_train, y_valid, x_train, x_valid = temporal_train_test_split(y = y, X = x, test_size = val_hour)
        print('train data shape\nx:{}, y:{}'.format(x_train.shape, y_train.shape))
        return y_train, y_valid, x_train, x_valid

    # No hold-out requested: everything is training data.
    return y, None, x, None

# SMAPE metric used for validation scores
def SMAPE(true, pred):
    """Mean of ``|true - pred| / (|true| + |pred|)``, multiplied by 100.

    NOTE(review): unlike the ``SMAPE`` defined in the first cell, the denominator
    is not divided by 2, so this returns half of that value. Being defined later,
    this version is the one in effect once this cell has run.
    """
    abs_error = np.abs(true - pred)
    magnitude = np.abs(true) + np.abs(pred)
    return np.mean(abs_error / magnitude) * 100
#### The actual objective is wrapped in a function that takes alpha as an argument,
#### so that the alpha value can be adjusted easily.
# custom objective function for forcing model not to underestimate

def weighted_mse(alpha = 1):
    """Build a squared-error objective whose under-predictions are weighted by ``alpha``.

    Returns a function ``(label, pred) -> (grad, hess)``. Where the residual
    ``label - pred`` is positive, gradient and hessian are those of the squared
    error multiplied by ``alpha``; elsewhere they are the plain squared-error ones.
    """
    def weighted_mse_fixed(label, pred):
        residual = (label - pred).astype("float")
        underestimated = residual > 0
        grad = np.where(underestimated, -2*alpha*residual, -2*residual)
        hess = np.where(underestimated, 2*alpha, 2.0)
        return grad, hess
    return weighted_mse_fixed

In [None]:
# Reload the preprocessed frames. The preprocessing cells call to_csv without
# index=False, so the first CSV column is the saved index; .iloc[:,1:] drops it.
train = pd.read_csv('./data/train_preprocessed.csv').iloc[:,1:]
test =  pd.read_csv('./data/test_preprocessed.csv').iloc[:,1:]
log = False # whether to log-transform the target
# pop_feat applies the optional log transform only when its third argument is True.
train = pop_feat(train, log, True)
test = pop_feat(test, log, False)

In [None]:
save_path = 'exp/XGB_log_false_use_gscv_params' #'exp/XGB_use_stat_log_false'
try :
    # Use the parameters found by grid search when the file is available.
    # NOTE(review): the backslash-separated literal only resolves to a nested
    # path on Windows; it matches the literal used where the file is written,
    # so change both places together if portability is needed.
    params_xgb = pd.read_csv(r"exp\XGB_log_false_gscv\gscv_params.csv").iloc[:,1:]
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) :
    # Missing or unreadable parameter file: continue without tuned parameters.
    # Only I/O and parse failures are tolerated; any other error now surfaces
    # instead of being swallowed by a bare `except`.
    params_xgb = None

# Train one model per building (optionally for several seeds), collect the
# validation/test predictions and write a summary CSV and a submission file
# into one sub-directory of save_root per seed.
# NOTE(review): fit_XGB is not defined in the visible part of this notebook;
# it is expected to return (smape_val, r2_val, ans_val, pred_val, pred_test).
save_root = 'exp/XGB_log_false_1' #'exp/XGB_log_false_use_gscv_params_seed'
# Per-seed concatenated validation / test predictions (inputs for a seed ensemble).
pred_val_enss = []
test_enss = []
# Use a range larger than 1 to build a seed ensemble
for seed in range(1) :
    print(seed)
    save_path = os.path.join(save_root,str(seed))
    os.makedirs(save_path, exist_ok = True)
    summary_list = []
    ans_val_list = []
    pred_val_list = []
    pred_test_list = []

    # Building numbers are 1-based, hence i+1 below.
    for i in tqdm(range(100)) :
        # Per-building tuned parameters are currently disabled:
#         param_xgb = params_xgb.iloc[i]
        param_xgb = None
        print(i)
        summary_dict = {}
        smape_val, r2_val, ans_val, pred_val, pred_test = fit_XGB(train, test, i+1, save_path, param_xgb, seed) 

        summary_dict['건물번호'] = i+1
        summary_dict['SMAPE_val'] = smape_val
        summary_dict['R2_val'] = r2_val
        summary_list.append(summary_dict)
        ans_val_list.append(ans_val)
        pred_val_list.append(pred_val)
        pred_test_list.append(pred_test)
    # Stack the per-building arrays in building order.
    ans_val_whole = np.concatenate(ans_val_list)
    pred_val_whole = np.concatenate(pred_val_list)
    pred_test_whole = np.concatenate(pred_test_list)
    # Overall validation score; it is embedded in the summary file name below.
    smape_val = SMAPE(ans_val_whole, pred_val_whole)
    pd.DataFrame(summary_list).to_csv(os.path.join(save_path,'summary_log_{}.csv'.format(smape_val)), index = False)
    # NOTE(review): this reads from 'Data/' while the preprocessed files live
    # under './data/' -- the two differ on case-sensitive file systems; confirm.
    sub = pd.read_csv('Data/sample_submission.csv')
    # Assumes the submission rows are ordered by building, like pred_test_whole -- TODO confirm.
    sub['answer'] = pred_test_whole
    sub.to_csv(os.path.join(save_path,'sub.csv'), index = False)
    
    pred_val_enss.append(pred_val_whole)
    test_enss.append(pred_test_whole)

In [None]:
# Grid search per building: collect validation/test predictions and the best
# parameters of every building, then save the parameters to CSV.
# NOTE(review): fit_XGB_gscv is not defined in the visible part of this notebook;
# it is expected to return (smape_val, r2_val, ans_val, pred_val, pred_test, params).
summary_list = []
ans_val_list = []
pred_val_list = []
pred_test_list = []
save_path = 'exp/XGB_log_false_gscv'
os.makedirs(save_path, exist_ok = True)
df_params = pd.DataFrame(columns = ['n_estimators', 'eta', 'min_child_weight','max_depth', 'colsample_bytree', 'subsample'])
# One single-row frame per building is collected here and concatenated once
# after the loop, instead of re-concatenating the growing frame every iteration.
param_frames = [df_params]
# `params_xgb` is deliberately not assigned in this cell: the tuned parameters
# are written to CSV below and loaded from there by the training cell.
for i in tqdm(range(100)) :

    print(i)
    summary_dict = {}

    # Building numbers are 1-based, hence i+1.
    smape_val, r2_val, ans_val, pred_val, pred_test, params = fit_XGB_gscv(train, test, i+1, save_path )

    summary_dict['건물번호'] = i+1
    summary_dict['SMAPE_val'] = smape_val
    summary_dict['R2_val'] = r2_val
    summary_list.append(summary_dict)
    ans_val_list.append(ans_val)
    pred_val_list.append(pred_val)
    pred_test_list.append(pred_test)
    param_frames.append(pd.DataFrame(params, index = [0]))
# Stack the per-building arrays in building order.
ans_val_whole = np.concatenate(ans_val_list)
pred_val_whole = np.concatenate(pred_val_list)
pred_test_whole = np.concatenate(pred_test_list)
df_params = pd.concat(param_frames, axis = 0)
# NOTE(review): the backslash-separated literal only resolves to a nested path
# on Windows; it matches the literal used where the file is read back, so
# change both places together if portability is needed.
df_params.to_csv(r"exp\XGB_log_false_gscv\gscv_params.csv")

In [None]:
def fit_pycaret(train, test, num, path = ('./')) :#, param = None):
    """Select the best of several regressors for one building and predict with it.

    The last 168 rows of the building's series are held out for validation.
    ``setup`` / ``compare_models`` pick the best model on the remaining rows,
    which is then used to predict the validation span and the building's test rows.
    A figure with the series and predictions is saved to ``<path>/results/<num>``.

    Parameters
    ----------
    train : preprocessed training DataFrame (all buildings).
    test : preprocessed test DataFrame (all buildings); feature columns are
        taken positionally from column 2 onwards.
    num : building number to fit.
    path : directory under which the ``results`` folder is created.

    Returns
    -------
    (best_model_type, smape_val, r2_val, y_valid, pred, y_pred_test)
        ``best_model_type`` is ``str(type(best_model))``; the remaining values
        are the validation SMAPE and R2, the validation target, the validation
        prediction and the test prediction (all back-transformed with ``exp``
        when the global ``log`` is truthy).

    Notes
    -----
    * Reads the notebook-level global ``log``.
    * ``setup`` and ``compare_models`` are not imported in the visible cells --
      presumably they come from ``pycaret.regression``; confirm.
    """
    # `import sklearn` alone does not guarantee that the metrics submodule is loaded.
    from sklearn.metrics import r2_score

    y_train, y_valid, x_train, x_valid = train_test_split(train, num, 168)
    # Attach the target on a copy so the split result itself is left untouched.
    train_data = x_train.copy()
    train_data['power'] = y_train

    setup(data = train_data, target ='power', session_id=123, fold_strategy = 'timeseries')
    best_model = compare_models(include = ['rf', 'catboost','lightgbm', 'xgboost', 'et', 'gbr'], sort = 'MAPE')

    # Validation prediction, indexed like the validation target so that the
    # element-wise metrics below align row by row.
    pred = pd.Series(np.asarray(best_model.predict(x_valid)), index = y_valid.index)

    # Test prediction for THIS building (previously hard-coded to building 1,
    # so every building received building 1's test predictions).
    x_test_i = test[test['건물번호'] == num]
    y_pred_test = best_model.predict(x_test_i.iloc[:,2:])
    # Place the test span directly after the validation span on the plot's x axis.
    test_start = y_valid.index.max() + 1
    test_series = pd.Series(y_pred_test, index = np.arange(test_start, test_start + len(y_pred_test)))

    plot_series(y_train, y_valid, pred, test_series, markers=[',', ',', ',', ','])
    plt.title(num)
    # Create the output folder under `path` (not the global save_path), so the
    # directory that is created is the one the figure is saved into.
    results_dir = os.path.join(path, 'results')
    os.makedirs(results_dir, exist_ok = True)
    plt.savefig(os.path.join(results_dir, str(num)))
    plt.show()

    # Undo the log transform of the target; truthiness check matches pop_feat.
    if log :
        pred = np.exp(pred)
        y_valid = np.exp(y_valid)
        y_pred_test = np.exp(y_pred_test)
    smape_val = SMAPE(y_valid, pred)
    r2_val = r2_score(y_valid, pred)
    best_model_type = str(type(best_model))
    print('best model: {}'.format(best_model_type ))
    print('SMAPE : {}'.format(smape_val))
    return best_model_type, smape_val, r2_val, y_valid, pred, y_pred_test

In [None]:

# Run the PyCaret model selection for every building and write a summary CSV
# and a submission file into save_path.
# The first CSV column is the index saved by the preprocessing cells; .iloc[:,1:] drops it.
train = pd.read_csv('./data/train_preprocessed.csv').iloc[:,1:]
test =  pd.read_csv('./data/test_preprocessed.csv').iloc[:,1:]
# Whether to log-transform the target; fit_pycaret reads this global as well.
log = False
train = pop_feat(train, log, True)
test = pop_feat(test, log, False)
save_path = 'exp/pycater_feat_eng' #'exp/XGB_use_stat_log_false'
os.makedirs(save_path, exist_ok = True)
summary_list = []
ans_val_list = []
pred_val_list = []
pred_test_list = []

# Building numbers are 1-based, hence i+1 below.
for i in tqdm(range(100)) :
    summary_dict = {}
    best_model_type, smape_val, r2_val, ans_val, pred_val, pred_test = fit_pycaret(train, test, i+1, save_path) 

    summary_dict['건물번호'] = i+1
    summary_dict['Best_model'] = best_model_type
    summary_dict['SMAPE_val'] = smape_val
    summary_dict['R2_val'] = r2_val
    summary_list.append(summary_dict)
    ans_val_list.append(ans_val)
    pred_val_list.append(pred_val)
    pred_test_list.append(pred_test)
# Stack the per-building results in building order.
ans_val_whole = np.concatenate(ans_val_list)
pred_val_whole = np.concatenate(pred_val_list)
pred_test_whole = np.concatenate(pred_test_list)
# Overall validation score; it is embedded in the summary file name below.
smape_val = SMAPE(ans_val_whole, pred_val_whole)
pd.DataFrame(summary_list).to_csv(os.path.join(save_path,'summary_log_{}.csv'.format(smape_val)), index = False)
# NOTE(review): this reads from 'Data/' while the preprocessed files live
# under './data/' -- the two differ on case-sensitive file systems; confirm.
sub = pd.read_csv('Data/sample_submission.csv')
# Assumes the submission rows are ordered by building, like pred_test_whole -- TODO confirm.
sub['answer'] = pred_test_whole
sub.to_csv(os.path.join(save_path,'sub.csv'), index = False)