In [None]:
# IPython magic: turn off the jedi backend of the tab completer for this session.
%config Completer.use_jedi = False

In [None]:
import os, sys, shutil, glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import lightgbm as lgb
from lightgbm import LGBMRegressor

In [None]:
# import modules
import sklearn
from sklearn.metrics import make_scorer, mean_squared_log_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split as tts
from sklearn import ensemble
from sklearn.preprocessing import OneHotEncoder as OHE
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import scipy as sp
import math
import random

In [None]:
import logging

# Module-level logger; the helper functions further down report progress through it.
logger = logging.getLogger(__name__)
# Show INFO-and-above records, each prefixed with a timestamp.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)

## DATA Load

In [None]:
# Paths are resolved relative to the directory the notebook is started from.
base_dir = os.getcwd()
# NOTE: despite the name, `train_dir` is the path of the training CSV file itself.
train_dir = os.path.join(base_dir, 'train/train.csv')

In [None]:
# Load the raw training table.
base_df = pd.read_csv(train_dir)

# Minutes elapsed since midnight, derived from the Hour and Minute columns.
base_df['Time'] = base_df['Hour']*60 + base_df['Minute']

# Preview last: a notebook only renders the final expression of a cell, so a
# bare `.head()` in the middle of the cell never shows anything.
base_df.head()

## PREPROCESS

In [None]:
def shift_dataset(inputdata, step=48):
    """Attach two look-ahead copies of TARGET to every row.

    Parameters
    ----------
    inputdata : pandas.DataFrame
        Frame with a 'TARGET' column, ordered so that "later" means
        "further down". The frame itself is not modified.
    step : int, default 48
        Look-ahead distance in rows. 'TARGET1' is TARGET `step` rows ahead,
        'TARGET2' is TARGET `2 * step` rows ahead.
        (48 rows presumably correspond to one day of records - TODO confirm.)

    Returns
    -------
    pandas.DataFrame
        A copy of the input with 'TARGET1' and 'TARGET2' added and the last
        `2 * step` rows removed, because those rows have no real TARGET2.
    """
    shift_df = inputdata.copy()

    # shift(-k) pulls the value found k rows below onto the current row.
    # `.ffill()` replaces `fillna(method='ffill')`, which newer pandas
    # releases deprecate and then remove.
    shift_df['TARGET1'] = shift_df['TARGET'].shift(-step).ffill()
    shift_df['TARGET2'] = shift_df['TARGET'].shift(-2 * step).ffill()

    # Drop the tail whose look-ahead values were only forward-filled.
    return shift_df.iloc[:-2 * step]

In [None]:
def make_ghi(inputdata):
    """Add 'sum_energy', 'theta' and 'GHI' feature columns.

    Parameters
    ----------
    inputdata : pandas.DataFrame
        Needs 'Hour', 'DHI' and 'DNI' columns. Not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with
        * 'sum_energy' = DHI + DNI,
        * 'theta'      = hour-based angle in degrees (0 for hours 6/19, rising
                         in 10-degree steps to 60 for hours 12/13, and 0 for
                         every other hour),
        * 'GHI'        = DNI * cos(theta) + DHI.
    """
    df = inputdata.copy()

    # Total incoming energy: diffuse plus direct irradiance.
    df['sum_energy'] = df['DHI'] + df['DNI']

    # Solar altitude (degrees), approximated by a symmetric lookup on the hour.
    hour = df['Hour']
    hour_pairs = [(6, 19), (7, 18), (8, 17), (9, 16), (10, 15), (11, 14), (12, 13)]
    angles_deg = [0, 10, 20, 30, 40, 50, 60]
    conditions = [(hour == morning) | (hour == evening) for morning, evening in hour_pairs]

    # Hours not listed above fall through to np.select's default of 0.
    df['theta'] = np.select(conditions, angles_deg)

    # np.cos expects radians while theta is kept in degrees, so convert first;
    # feeding the raw degree values yields meaningless (even negative) factors.
    # NOTE(review): theta is labelled as solar *altitude*; the usual GHI formula
    # uses the cosine of the *zenith* angle (= sine of the altitude). Confirm
    # which angle is intended before relying on this feature.
    df['GHI'] = df['DNI'] * np.cos(np.deg2rad(df['theta'])) + df['DHI']

    return df

In [None]:
# Dew point (Td) feature
def make_dp(inputdata):
    """Add a dew-point column 'DP' computed with the Magnus formula.

    Parameters
    ----------
    inputdata : pandas.DataFrame
        Needs 'T' (temperature) and 'RH' (relative humidity in percent)
        columns. Not modified.
        (T is presumably in degrees Celsius, matching the constants - confirm.)

    Returns
    -------
    pandas.DataFrame
        Copy of the input with 'DP' added. At RH == 100 the result equals T.
    """
    # Work on a copy, like the sibling helpers do, so the caller's frame does
    # not silently gain a 'DP' column.
    tempdf = inputdata.copy()

    # Magnus coefficients.
    b = 17.62
    c = 243.12

    gamma = b * tempdf['T'] / (c + tempdf['T']) + np.log(tempdf['RH'] / 100)
    tempdf['DP'] = (c * gamma) / (b - gamma)

    return tempdf

In [None]:
def set_dataset(inputdata, dataset_type=None):
    """Run the feature pipeline on a copy of `inputdata`.

    When `dataset_type` is truthy (e.g. 'train') the frame first goes through
    shift_dataset, which adds the look-ahead targets; otherwise that step is
    skipped. make_ghi and make_dp are then applied in that order and the
    resulting frame is returned.
    """
    prepared = inputdata.copy()

    # Only labelled data gets the shifted target columns.
    prepared = shift_dataset(prepared) if dataset_type else prepared

    return make_dp(make_ghi(prepared))

In [None]:
# Build the training frame; the truthy 'train' flag makes set_dataset add the
# shifted TARGET1 / TARGET2 label columns.
var_df = set_dataset(base_df, 'train')
logger.info(f"Setting Basic Dataset Completed --- shape:{var_df.shape}")

### Make an arbitrary (hold-out) test set

In [None]:
needed_vars = ['Time', 'DHI', 'DNI', 'WS', 'RH', 'T', 'DP', 'GHI', 'TARGET']

# The test set is 81 CSV files named 0.csv ... 80.csv inside ./test
base_dir = os.getcwd()
test_dir = os.path.join(base_dir, 'test')

df_test = []
for i in range(81):

    file_path = os.path.join(test_dir, f'{i}.csv')
    temp = pd.read_csv(file_path)

    # Minutes since midnight, same feature as on the training frame.
    temp['Time'] = temp['Hour'].mul(60).add(temp['Minute'])

    # No dataset_type: the test files get features only, no shifted targets.
    fin_testset = set_dataset(temp)
    # Keep the Day == 6 rows and only the model input columns.
    fin_testset = fin_testset.loc[fin_testset['Day'] == 6, needed_vars]

    df_test.append(fin_testset)

In [None]:
def make_arbi_test(input_test, input_train, seed=None):
    """Carve a hold-out set out of `input_train` that resembles `input_test`.

    For each frame in `input_test`, the 75th percentile of 'DHI' and of 'T'
    over its TARGET > 0 rows defines a +/-20% window per column. 40 training
    rows are sampled from the rows falling inside both windows; when fewer
    than 400 rows qualify, the 40 rows are sampled from the whole training
    frame instead. Up to 2000 training rows with TARGET == 0 are added on top.
    Duplicated picks are collapsed before splitting.

    Parameters
    ----------
    input_test : sequence of pandas.DataFrame
        Frames with 'TARGET', 'DHI' and 'T' columns.
    input_train : pandas.DataFrame
        Frame with 'DHI', 'T' and 'TARGET' columns. Not modified.
    seed : int, optional
        Seed for a private random generator. None (default) keeps drawing
        from the module-level `random` state, as before.

    Returns
    -------
    (testset, remaining_train) : tuple of pandas.DataFrame
        The sampled rows and `input_train` without those rows.
    """
    rng = random if seed is None else random.Random(seed)
    org_trainset_shape = input_train.shape
    all_index = list(input_train.index)

    def _window(center):
        # Order the bounds explicitly: for a negative center (e.g. sub-zero T)
        # 0.8 * center is the *upper* end, and an unordered (lo, hi) pair would
        # describe an empty interval that no training row can ever match.
        low, high = center * 0.8, center * 1.2
        return min(low, high), max(low, high)

    # similarity based calling
    fin_testindex = []
    for temp_testdf in input_test:
        positive_rows = temp_testdf.loc[temp_testdf['TARGET'] > 0.0]

        # quantile(0.75) is the '75%' entry of describe(), looked up directly
        # instead of through a positional integer on a label-indexed Series.
        dhi_low, dhi_high = _window(positive_rows['DHI'].quantile(0.75))
        t_low, t_high = _window(positive_rows['T'].quantile(0.75))

        # between() is inclusive on both ends; NaN bounds match nothing.
        candids = input_train.loc[input_train['DHI'].between(dhi_low, dhi_high) &
                                  input_train['T'].between(t_low, t_high)]

        pool = all_index if len(candids) < 400 else list(candids.index)
        fin_testindex.extend(rng.sample(pool, min(40, len(pool))))

    logger.info(f"Called similar train data")

    # data with zeros (capped so a short frame does not raise in sample())
    zero_index = list(input_train.loc[input_train['TARGET'] == 0].index)
    fin_testindex.extend(rng.sample(zero_index, min(2000, len(zero_index))))

    logger.info(f"Called train data with zero target value")

    # drop duplicates, keeping first-seen order so a seeded run is repeatable
    selected = pd.Index(fin_testindex).unique()

    testset = input_train.loc[selected]
    input_train = input_train.drop(index=selected)

    logger.info(f"Origin Trainset: {org_trainset_shape} | Trainset after sampling: {input_train.shape} | Testset: {testset.shape}")

    return testset, input_train

In [None]:
# make_arbi_test samples through the `random` module; seed it so that a
# Restart & Run All reproduces the same hold-out split (2021 matches the
# random_state used for the train/validation split).
random.seed(2021)
testset, trainset = make_arbi_test(df_test, var_df)

## Variable Selection

In [None]:
# Model input columns (the same list used when the test files were loaded above).
needed_vars = ['Time', 'DHI', 'DNI', 'WS', 'RH', 'T', 'DP', 'GHI', 'TARGET']

In [None]:
# Hold-out split: features and the two look-ahead label columns.
test_X = testset[needed_vars]
test_Y = testset[['TARGET1', 'TARGET2']]

In [None]:
# Remaining training rows: features and the two look-ahead label columns.
df_vars = trainset[needed_vars]
df_label = trainset[['TARGET1', 'TARGET2']]

In [None]:
def split_df(test_size = 0.2, *datasets):
    """Split features and labels into train/validation parts.

    `datasets[0]` is the feature frame and `datasets[1]` the label frame. Both
    are split together with a fixed random_state of 2021, and the label parts
    are then separated into their first and second column (each kept as a
    one-column frame).

    Returns (train_x, val_x, train_y_t1, train_y_t2, val_y_t1, val_y_t2).
    """
    features, labels = datasets[0], datasets[1]

    train_x, val_x, train_y, val_y = tts(
        features, labels, test_size = test_size, random_state = 2021
    )

    # Column slices (not integer positions) so the results stay DataFrames.
    first_col, second_col = slice(0, 1), slice(1, 2)

    return (
        train_x,
        val_x,
        train_y.iloc[:, first_col],
        train_y.iloc[:, second_col],
        val_y.iloc[:, first_col],
        val_y.iloc[:, second_col],
    )

In [None]:
# 80/20 train/validation split; *_t1 holds the TARGET1 labels, *_t2 the TARGET2 labels.
train_x, val_x, train_y_t1, train_y_t2, val_y_t1, val_y_t2 = split_df(0.2, df_vars, df_label)

# Modeling

In [None]:
# Fit a single LightGBM quantile regressor and return it
def lgbm_model(q, X_train, Y_train, X_valid, Y_valid):
    """Train one LGBMRegressor with the quantile objective at level `q`.

    Parameters
    ----------
    q : float
        Quantile level, passed to LightGBM as `alpha`.
    X_train, Y_train
        Training features and labels.
    X_valid, Y_valid
        Validation data; used as the eval set that drives early stopping.

    Returns
    -------
    LGBMRegressor
        The fitted model.
    """
    # NOTE(review): per the LightGBM docs, bagging_fraction is only applied
    # when bagging_freq is non-zero; bagging_freq is not set here, so this
    # setting probably has no effect - confirm whether bagging was intended.
    model = LGBMRegressor(objective='quantile', alpha=q, max_depth=128, boosting='gbdt',
                          n_estimators=750, num_leaves=152, bagging_fraction=0.5, learning_rate=0.02)

    # The `early_stopping_rounds` / `verbose` arguments of fit() no longer
    # exist in LightGBM 4; the equivalent callbacks express the same thing:
    # stop after 512 rounds without improvement, log every 500 rounds.
    model.fit(X_train, Y_train, eval_metric=['quantile'],
              eval_set=[(X_valid, Y_valid)],
              callbacks=[lgb.early_stopping(stopping_rounds=512),
                         lgb.log_evaluation(period=500)])

    return model

In [None]:
def train_lgbm(X_train, Y_train, X_valid, Y_valid, quantiles=None):
    """Train one quantile model per requested quantile level.

    Parameters
    ----------
    X_train, Y_train, X_valid, Y_valid
        Forwarded unchanged to lgbm_model for every quantile.
    quantiles : iterable of float, optional
        Quantile levels to fit. Defaults to 0.1, 0.2, ..., 0.9.

    Returns
    -------
    list
        The fitted models, in the same order as `quantiles`.
    """
    if quantiles is None:
        quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    LGBM_models = []
    for q in quantiles:
        print(f"Qunatile: {q}")
        LGBM_models.append(lgbm_model(q, X_train, Y_train, X_valid, Y_valid))

    return LGBM_models

In [None]:
# One model per quantile level for the TARGET1 label.
models_1 = train_lgbm(train_x, train_y_t1, val_x, val_y_t1)

In [None]:
# One model per quantile level for the TARGET2 label.
models_2 = train_lgbm(train_x, train_y_t2, val_x, val_y_t2)

In [None]:
def quantile_loss(gt, pred, tau):
    """Mean pinball (quantile) loss of `pred` against `gt` at level `tau`.

    Each residual gt - pred is weighted by `tau` and by `tau - 1`; the larger
    of the two products is kept per element and the results are averaged.
    """
    residual = gt - pred
    weighted_by_tau = tau * residual
    weighted_by_tau_minus_one = (tau - 1) * residual
    return np.mean(np.maximum(weighted_by_tau, weighted_by_tau_minus_one))

In [None]:
def test_pred(model_lists, input_gt, input_testset, quantiles):
    
    model = model_lists
    input_testset = input_testset
    gt = input_gt.values.reshape(-1)
    
    preds = [model_q.predict(input_testset) for i, model_q in enumerate(model)]
    losses = [quantile_loss(gt, pred, quantiles[i]) for i, pred in enumerate(preds)]
    
    fin_loss = sum(losses)/len(losses)
    
    logger.info(f"Test on testset completed --- loss: {fin_loss:.4f}")
    
    return

## Validation Set Prediction

In [None]:
# Quantile levels, in the same order in which train_lgbm fitted the models.
quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# Average quantile loss on the validation split, for the TARGET1 and TARGET2 models.
test_pred(models_1, val_y_t1, val_x, quantiles)
test_pred(models_2, val_y_t2, val_x, quantiles)

## Testset Prediction

In [None]:
# Same scoring on the hold-out rows; column 0 of test_Y is TARGET1, column 1 is TARGET2.
test_pred(models_1, test_Y.iloc[:, 0:1], test_X, quantiles)
test_pred(models_2, test_Y.iloc[:, 1:2], test_X, quantiles)

# SUBMISSION

In [None]:
# Folder holding the per-file test CSVs.
test_dir = os.path.join(base_dir, 'test')
# Directory listing; NOTE(review): `lists` is not referenced by any later cell shown here.
lists = os.listdir(test_dir)

In [None]:
df_test = []

for i in range(81):

    file_path = os.path.join(test_dir, f'{i}.csv')
    temp = pd.read_csv(file_path)

    # Minutes since midnight, same feature as on the training frame.
    temp['Time'] = temp['Hour'].mul(60).add(temp['Minute'])

    # Features only (no dataset_type, so no shifted targets).
    fin_testset = set_dataset(temp)
    # Keep the Day == 6 rows and only the model input columns.
    fin_testset = fin_testset.loc[fin_testset['Day'] == 6, needed_vars]

    df_test.append(fin_testset)

# Stack the per-file frames in file order into one prediction input.
X_test = pd.concat(df_test)
X_test.shape

In [None]:
# One column per fitted quantile model (in training order), one row per test
# sample, rounded to 4 decimals. Iterating over the model lists instead of a
# hard-coded range(9) keeps this in step with however many models were trained.
test_pred1 = pd.DataFrame(np.array([model.predict(X_test) for model in models_1]).transpose()).round(4)
test_pred2 = pd.DataFrame(np.array([model.predict(X_test) for model in models_2]).transpose()).round(4)

In [None]:
# Load the submission template that the predictions are written into.
submission = pd.read_csv(os.path.join(base_dir, 'sample_submission.csv'))

In [None]:
# Rows whose id mentions "Day7" take the TARGET1 predictions, "Day8" rows take
# the TARGET2 predictions; every column from 'q_0.1' onwards is overwritten.
for day_tag, day_preds in (("Day7", test_pred1), ("Day8", test_pred2)):
    day_rows = submission["id"].str.contains(day_tag)
    submission.loc[day_rows, "q_0.1":] = day_preds.values

# Spot-check a few filled rows.
submission.iloc[10:20]

In [None]:
def fill_zeros(inputdata, target_hours):
    """Zero the prediction columns of rows that belong to the given hours.

    The hour label of a row is read from its 'id': the text after the first
    '.', then the last '_'-separated piece, then everything before the first
    'h' (for example '0.csv_Day7_12h30m' -> '12').

    Parameters
    ----------
    inputdata : pandas.DataFrame
        Needs an 'id' column of strings and a 'q_0.1' column; every column
        from 'q_0.1' to the last one is overwritten for the matching rows.
        The frame is modified IN PLACE.
    target_hours : list-like of str
        Hour labels to zero out, compared as exact strings.

    Returns
    -------
    pandas.DataFrame
        The same `inputdata` object that was passed in.
    """
    # Vectorised id parsing, evaluated once. (The row-wise apply() used before
    # ran the same Python-level parser over the whole frame twice.)
    # An id that does not have the expected shape yields a missing label and
    # is simply left untouched.
    hour_labels = (
        inputdata['id']
        .str.split('.').str[1]
        .str.split('_').str[-1]
        .str.split('h').str[0]
    )
    rows_to_zero = hour_labels.isin(target_hours)

    inputdata.loc[rows_to_zero, 'q_0.1':] = 0.0
    logger.info(f"Filled target spaces with zeros, final dataset set!")

    return inputdata

In [None]:
# Hour labels (as they appear inside the submission ids) whose predictions are forced to zero.
target_hours = ['0', '1', '2', '3', '4', '20', '21', '22', '23']

# fill_zeros edits `submission` in place and returns it, so `fin_result` and
# `submission` refer to the same frame afterwards.
fin_result = fill_zeros(submission, target_hours)

### Save result

In [None]:
save_path = './submission/submission_0118_v0.csv'

# Refuse to overwrite an earlier submission. FileExistsError is a subclass of
# Exception, so anything that caught the previous generic Exception still does.
if os.path.exists(save_path):
    raise FileExistsError("Same submission file already exists!")

# to_csv does not create missing parent folders; make sure ./submission exists.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
fin_result.to_csv(save_path, index=False)