In [None]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import os
import glob
import random
import xgboost as xgb
from xgboost import XGBRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")

## Baseline

In [None]:
# Read the training table from the project's data folder.
train = pd.read_csv(filepath_or_buffer='./data/train/train.csv')

In [None]:
# Preview the final five rows of the training table.
train.tail(n=5)

In [None]:
# Read the sample submission, which is filled in with predictions further down.
submission = pd.read_csv(filepath_or_buffer='./data/sample_submission.csv')

In [None]:
# Preview the final five rows of the submission template.
submission.tail(n=5)

In [None]:
def preprocess_data(data, is_train=True):
    """Select the model's feature columns and, for training data, attach targets.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Hour', 'TARGET', 'DHI', 'DNI', 'WS', 'RH'
        and 'T'. Any other columns are ignored. The input is not modified.
    is_train : bool, default True
        If truthy, build the training frame (features + 'Target1' + 'Target2').
        If falsy, return only the trailing feature rows used for prediction.

    Returns
    -------
    pandas.DataFrame
        Training mode: the seven feature columns plus 'Target1' ('TARGET'
        48 rows ahead) and 'Target2' ('TARGET' 96 rows ahead), with rows
        containing NaN dropped and the final 96 rows removed.
        Test mode: the last 48 rows of the seven feature columns.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``data``.
    """
    # Look-ahead step, in rows. Presumably one day of half-hourly samples,
    # since the two targets are later written to the "Day7"/"Day8" rows of
    # the submission -- TODO confirm against the data description.
    shift_rows = 48
    feature_cols = ['Hour', 'TARGET', 'DHI', 'DNI', 'WS', 'RH', 'T']

    # Select first, then copy, so the column assignments below act on an
    # independent frame rather than on a slice of another frame.
    temp = data[feature_cols].copy()

    if not is_train:
        # Prediction input: only the most recent `shift_rows` rows.
        return temp.iloc[-shift_rows:, :]

    # shift(-k) moves values k rows up, so each row is paired with the TARGET
    # observed k rows later. The trailing rows left empty by the shift are
    # forward-filled here and discarded again by the final slice.
    temp['Target1'] = temp['TARGET'].shift(-shift_rows).ffill()
    temp['Target2'] = temp['TARGET'].shift(-2 * shift_rows).ffill()
    temp = temp.dropna()

    # The last 2 * shift_rows rows have no genuine 'Target2' value.
    return temp.iloc[:-2 * shift_rows]


# Build the training frame: feature columns plus the two shifted target columns.
df_train = preprocess_data(data=train, is_train=True)
# Show the first 48 rows of the result.
df_train.head(48)

In [None]:
# Read test files 0.csv through 80.csv and keep the trailing feature rows of each.
df_test = [
    preprocess_data(pd.read_csv(f'./data/test/{file_no}.csv'), is_train=False)
    for file_no in range(81)
]

# Stack the per-file feature frames into one prediction input.
X_test = pd.concat(df_test)
X_test.shape

In [None]:
# Features are every column except the two trailing target columns.
feature_frame = df_train.iloc[:, :-2]
first_target = df_train.iloc[:, -2]
second_target = df_train.iloc[:, -1]

# Same split settings (and seed) for both targets.
split_kwargs = {'test_size': 0.3, 'random_state': 0}

X_train_1, X_valid_1, Y_train_1, Y_valid_1 = train_test_split(
    feature_frame, first_target, **split_kwargs
)
X_train_2, X_valid_2, Y_train_2, Y_valid_2 = train_test_split(
    feature_frame, second_target, **split_kwargs
)

In [None]:
# Quantile levels 0.1 through 0.9 in steps of 0.1.
quantiles = [tenth / 10 for tenth in range(1, 10)]

In [None]:
# CatBoost Regressor

def catboost(q, X_train, Y_train, X_valid, Y_valid, X_test):
    """Tune, fit and apply a CatBoost quantile regressor for quantile level ``q``.

    Steps:
      1. Grid-search ``iterations`` and ``learning_rate`` with 2-fold CV on
         the training data, scored by the pinball loss at level ``q``.
      2. Train one final model with the best parameters on the training data,
         using ``(X_valid, Y_valid)`` as the evaluation set.
      3. Predict ``X_test`` and round the predictions to two decimals.

    Parameters
    ----------
    q : float
        Quantile level, interpolated into CatBoost's ``Quantile:alpha=`` spec.
    X_train, Y_train
        Training features and target.
    X_valid, Y_valid
        Hold-out features and target, passed as ``eval_set`` to the final fit.
    X_test
        Features to predict.

    Returns
    -------
    (pandas.Series, CatBoostRegressor)
        Rounded predictions for ``X_test`` (default integer index) and the
        fitted final model.
    """
    quantile_objective = f'Quantile:alpha={q}'

    base_params = dict(
        # Evaluate with the same alpha that is optimised. A bare 'Quantile'
        # metric uses CatBoost's default alpha (0.5), which would score the
        # validation set at the median whatever q is.
        eval_metric=quantile_objective,
        loss_function=quantile_objective,
        grow_policy='Depthwise',
        random_seed=2021,
        depth=6,
        early_stopping_rounds=1000,
        l2_leaf_reg=30,
    )
    cat_model = CatBoostRegressor(**base_params)

    parameters = {'iterations': [5000, 6000],
                  'learning_rate': [0.01, 0.02]}

    def neg_pinball_loss(estimator, X, y):
        """Negated mean pinball loss at level q (GridSearchCV maximises scores)."""
        residual = np.asarray(y, dtype=float) - np.asarray(estimator.predict(X), dtype=float)
        return -float(np.mean(np.maximum(q * residual, (q - 1) * residual)))

    # Score candidates with the loss the model is actually trained on; squared
    # error would favour mean-like predictions for every quantile level.
    # refit=False: the winning configuration is trained exactly once below,
    # with an evaluation set, so refitting inside the search is wasted work.
    grid_model = GridSearchCV(cat_model, param_grid=parameters,
                              scoring=neg_pinball_loss,
                              cv=2,
                              refit=False)

    grid_model.fit(X_train, Y_train)
    best_loss = -1 * grid_model.best_score_
    print('최적 평균 Pinball loss 값:', np.round(best_loss, 4))
    print('최적 파라미터:', grid_model.best_params_)

    model = CatBoostRegressor(**base_params, **grid_model.best_params_)

    model.fit(X_train, Y_train,
              eval_set=(X_valid, Y_valid), verbose=500)

    pred = pd.Series(model.predict(X_test).round(2))
    return pred, model

In [None]:
# Target 예측

def train_data(X_train, Y_train, X_valid, Y_valid, X_test):
    """Fit one CatBoost quantile model per level in the module-level ``quantiles``.

    Parameters
    ----------
    X_train, Y_train, X_valid, Y_valid, X_test
        Forwarded unchanged to ``catboost`` for every quantile level.

    Returns
    -------
    (list, pandas.DataFrame)
        The fitted models in quantile order, and a frame with one column of
        test predictions per quantile level (columns labelled by the level).
        With no quantile levels, an empty list and an empty frame.
    """
    cat_models = []
    quantile_preds = []

    for q in quantiles:
        print(q)
        pred, model = catboost(q, X_train, Y_train, X_valid, Y_valid, X_test)
        cat_models.append(model)
        quantile_preds.append(pred)

    if not quantile_preds:
        # pd.concat rejects an empty list; return an empty frame in that case.
        return cat_models, pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    cat_actual_pred = pd.concat(quantile_preds, axis=1)
    cat_actual_pred.columns = quantiles

    return cat_models, cat_actual_pred

In [None]:
# Target1: fit every quantile model, then preview the first 48 prediction rows.
models_1, results_1 = train_data(
    X_train=X_train_1,
    Y_train=Y_train_1,
    X_valid=X_valid_1,
    Y_valid=Y_valid_1,
    X_test=X_test,
)
results_1.sort_index().head(48)

In [None]:
# Target2: fit every quantile model, then preview the first 48 prediction rows.
models_2, results_2 = train_data(
    X_train=X_train_2,
    Y_train=Y_train_2,
    X_valid=X_valid_2,
    Y_valid=Y_valid_2,
    X_test=X_test,
)
results_2.sort_index().head(48)

In [None]:
# Sanity check: print the shapes of both prediction tables on one line.
print(*(frame.shape for frame in (results_1, results_2)))

In [None]:
# Copy the Target1 predictions into the rows whose id contains "Day7" and the
# Target2 predictions into the rows whose id contains "Day8", filling every
# column from "q_0.1" to the last one.
quantile_columns = slice("q_0.1", None)
for day_tag, day_results in (("Day7", results_1), ("Day8", results_2)):
    day_rows = submission.id.str.contains(day_tag)
    submission.loc[day_rows, quantile_columns] = day_results.sort_index().values
submission

In [None]:
# Write the completed submission to disk without the index column.
submission.to_csv(path_or_buf='./data/submission_yeonjung_cb_210208_v1.csv', index=False)