In [1]:
import os
# Move the working directory one level up (to the project root) so that the
# relative `data/...` paths and the `utils` package used below resolve.
# NOTE: not idempotent - each re-run of this cell climbs one more directory.
parent_dir = os.path.dirname(os.getcwd())
os.chdir(parent_dir)

In [8]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from optuna.samplers import TPESampler
from utils.group_ts_split import GroupTimeSeriesSplit, PurgedGroupTimeSeriesSplit

In [4]:
# Load the train and test tables; the first CSV column becomes the row index.
read_options = {'index_col': 0}
data_train = pd.read_csv('data/extra_train.csv', **read_options)
data_test = pd.read_csv('data/extra_test.csv', **read_options)
data_train

Unnamed: 0,Month,Hour,cons_target,Solar Generation [W/kW],Outdoor Drybulb Temperature [C],Relative Humidity [%],Diffuse Solar Radiation [W/m2],Direct Solar Radiation [W/m2],day_year,number,...,net,net_target,net+1,net-23,net_target+1,net_target-23,diffuse_solar_radiation+1,direct_solar_radiation+1,relative_humidity+1,drybulb_temp+1
0,8.0,22.0,0.441346,0.000000,19.4,81.0,0.000000,0.000000,23,1,...,3.557083,0.617429,1.411333,2.275800,0.431845,0.506612,0.000000,0.000000,81.0,20.0
1,8.0,23.0,0.170776,0.000000,20.0,81.0,0.000000,0.000000,24,1,...,1.411333,0.431845,0.979417,0.851167,0.394489,0.383397,0.000000,0.000000,87.0,19.4
2,8.0,24.0,0.116313,0.000000,19.4,87.0,0.000000,0.000000,25,1,...,0.979417,0.394489,0.900975,0.834600,0.387705,0.381964,0.000000,0.000000,87.0,19.4
3,8.0,1.0,0.106422,0.000000,19.4,87.0,0.000000,0.000000,26,1,...,0.900975,0.387705,0.922433,0.838167,0.389561,0.382273,0.000000,0.000000,90.0,19.4
4,8.0,2.0,0.109127,0.000000,19.4,90.0,0.000000,0.000000,27,1,...,0.922433,0.389561,0.910483,1.478433,0.388527,0.437649,0.000000,0.000000,90.0,18.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43675,7.0,17.0,0.802751,31.179167,21.1,81.0,5.389072,5.669881,8754,5,...,3.839883,0.862123,2.713900,3.463783,0.720845,0.814933,3.295837,4.875197,84.0,20.6
43676,7.0,18.0,0.552807,4.070833,20.6,84.0,3.295837,4.875197,8755,5,...,2.713900,0.720845,0.935833,0.457167,0.497751,0.437693,0.000000,0.000000,81.0,21.1
43677,7.0,19.0,0.189487,0.000000,21.1,81.0,0.000000,0.000000,8756,5,...,0.935833,0.497751,2.155017,0.714717,0.650722,0.470007,0.000000,0.000000,79.0,21.7
43678,7.0,20.0,0.436347,0.000000,21.7,79.0,0.000000,0.000000,8757,5,...,2.155017,0.650722,2.316400,1.659000,0.670971,0.588487,0.000000,0.000000,76.0,21.3


In [9]:
# Columns fed to the model.
features = [
    'Month',
    'Hour',
    'hour_x',
    'hour_y',
    'month_x',
    'month_y',
    'net_target-23',
    'diffuse_solar_radiation+1',
    'relative_humidity+1',
    'drybulb_temp+1',
]
# Column the models are trained to predict.
target = 'net_target+1'
# Quantile levels: 0.001, then 0.05 ... 0.95 in steps of 0.05, then 0.999 (21 levels).
qts = np.r_[0.001, np.arange(0.05, 0.951, 0.05), 0.999]
# One group label per training row, used by the group-aware CV splitter.
groups = data_train['day_year'].to_numpy()

In [10]:
# Seeded TPE sampler created once at notebook level; `run_lgb` hands this same
# instance to every Optuna study it creates.
sampler = TPESampler(seed = 123)

def objective(trial, X_temp, y_temp, tau, cv_groups=None, categoricals="auto",
              num_boost_round=1000):
    """Optuna objective: cross-validated pinball loss of a LightGBM quantile model.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    X_temp : pandas.DataFrame
        Feature matrix; fold rows are selected positionally with ``.iloc``.
    y_temp : array-like
        Target values aligned row-by-row with ``X_temp``.
    tau : float
        Quantile level to fit. It is passed to LightGBM as ``alpha`` and used
        in the pinball loss, so the model and the score target the same level.
    cv_groups : array-like, optional
        One group label per row, handed to the splitter. When omitted, the
        notebook-level ``groups`` array is used.
    categoricals : list or "auto", optional
        Forwarded to ``lgb.Dataset(categorical_feature=...)``. ``"auto"`` is
        LightGBM's own default.
    num_boost_round : int, optional
        Upper bound on boosting rounds. It must exceed the early-stopping
        patience (100), otherwise early stopping can never trigger.

    Returns
    -------
    float
        Mean pinball loss at level ``tau`` over all validation folds.

    Raises
    ------
    ValueError
        If the splitter yields no folds.

    Notes
    -----
    The booster of the last fold is stored in ``trial.user_attrs["best_booster"]``
    so that ``callback`` can copy it onto the study. A Booster is not
    JSON-serializable, so this relies on Optuna's in-memory storage.
    """
    # Hyperparameters to optimise. `alpha` is NOT tuned: for the quantile
    # objective it is the quantile level itself and must equal `tau`.
    params = {
        "objective": "quantile",
        "metric": "quantile",
        "boosting_type": "gbdt",
        "alpha": tau,
        "num_leaves": trial.suggest_int("num_leaves", 10, 100),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0, step=0.1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0, step=0.1),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
    }

    fold_groups = groups if cv_groups is None else cv_groups
    # Positional indexing below must not depend on the Series' index labels.
    y_all = np.asarray(y_temp)

    # Group-aware time-series cross validation (splitter defaults).
    tfold = GroupTimeSeriesSplit()
    fold_losses = []
    model = None
    for train_index, test_index in tfold.split(X_temp, y_all, fold_groups):
        X_train = X_temp.iloc[train_index]
        X_val = X_temp.iloc[test_index]
        y_train = y_all[train_index]
        y_val = y_all[test_index]
        train_dataset = lgb.Dataset(X_train,
                                    y_train,
                                    categorical_feature=categoricals,
                                    free_raw_data=False)
        val_dataset = lgb.Dataset(X_val,
                                  y_val,
                                  categorical_feature=categoricals,
                                  free_raw_data=False)
        model = lgb.train(params=params,
                          train_set=train_dataset,
                          num_boost_round=num_boost_round,
                          valid_sets=[val_dataset],
                          # log_evaluation(0) silences per-iteration logging and
                          # replaces the removed `verbose_eval=False` argument.
                          callbacks=[lgb.early_stopping(100),
                                     lgb.log_evaluation(period=0)])

        # Pinball loss: under-prediction (y > pred) is weighted by tau,
        # over-prediction by (1 - tau).
        error = y_val - model.predict(X_val)
        fold_losses.append(np.mean(np.maximum(tau * error, (tau - 1.0) * error)))

    if model is None:
        raise ValueError("GroupTimeSeriesSplit produced no folds; cannot evaluate the trial")

    # Keep the last fold's booster so the study callback can retrieve it.
    trial.set_user_attr("best_booster", model)
    return float(np.mean(fold_losses))

def callback(study, trial):
    """Optuna callback that keeps the booster of the best trial on the study.

    When the trial that just finished is the study's current best trial, its
    ``"best_booster"`` user attribute is copied to the study's user attributes
    under the same key. Otherwise nothing happens.
    """
    best_number = study.best_trial.number
    if best_number != trial.number:
        return
    booster = trial.user_attrs["best_booster"]
    study.set_user_attr(key="best_booster", value=booster)

In [8]:
# tune and fit one LightGBM quantile model (called once per quantile level)
def run_lgb(data, datat, features, target, quantile, params, seed=42, n_trials=50):
    """Tune a single quantile model with Optuna and predict on train and test.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame; must contain ``features`` and ``target``.
    datat : pandas.DataFrame
        Test frame; must contain ``features``.
    features : list of str
        Feature column names.
    target : str
        Target column name.
    quantile : float
        Quantile level forwarded to ``objective`` as ``tau``.
    params : object
        Unused; kept only so existing callers keep working.
    seed : int, optional
        Unused; the study draws from the notebook-level ``sampler``, which
        carries its own seed.
    n_trials : int, optional
        Number of Optuna trials to run (default 50).

    Returns
    -------
    tuple
        ``(y_pred, y_pred_test)``: predictions of the best trial's booster on
        ``data[features]`` and ``datat[features]``.
    """
    X_temp = data[features]
    y_temp = data[target]

    # Bind the data and the quantile level; Optuna calls the objective with the trial only.
    def tuned_objective(trial):
        return objective(trial, X_temp, y_temp, quantile)

    study_model = optuna.create_study(direction='minimize', sampler=sampler)
    study_model.optimize(tuned_objective, n_trials=n_trials, callbacks=[callback])
    # `callback` copies the best trial's booster onto the study.
    best_model = study_model.user_attrs["best_booster"]

    y_pred = best_model.predict(data[features])
    y_pred_test = best_model.predict(datat[features])
    return y_pred, y_pred_test

# produce one forecast column per quantile level in `qts`
def run_lgb_ensemble(data, datat, features, target, seed=42):
    """Fit one tuned quantile model per level in ``qts`` and collect the forecasts.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame; must contain ``features``, ``target`` and a
        ``timestamp`` column (used as the index of the result).
    datat : pandas.DataFrame
        Test frame; must contain ``features`` and a ``timestamp`` column.
    features : list of str
        Feature column names.
    target : str
        Target column name.
    seed : int, optional
        Forwarded to ``run_lgb`` as its ``seed`` argument.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(y_preds, y_preds_test)``: one row per input row and one column per
        quantile level (column labels are the values of ``qts``).
    """
    y_preds = []
    y_preds_test = []
    for quantile in qts:
        # `params` is a required slot in run_lgb's signature; pass both by
        # keyword so `seed` does not land in the `params` position.
        y_pred, y_pred_test = run_lgb(data, datat, features, target, quantile,
                                      params=None, seed=seed)
        y_preds.append(y_pred)
        y_preds_test.append(y_pred_test)
    # Each list entry is one quantile's prediction vector; transpose to (rows, quantiles).
    y_preds = np.array(y_preds).T
    y_preds_test = np.array(y_preds_test).T
    y_preds = pd.DataFrame(y_preds, columns=qts, index=data.timestamp)
    y_preds_test = pd.DataFrame(y_preds_test, columns=qts, index=datat.timestamp)
    return y_preds, y_preds_test

In [9]:
# Tune and fit one model per quantile level; collect train and test forecasts.
output, output_test = run_lgb_ensemble(
    data=data_train,
    datat=data_test,
    features=features,
    target=target,
    seed=42,
)

Training until validation scores don't improve for 50 rounds
[100]	training's quantile: 0.000201374	valid_1's quantile: 0.000232327
Early stopping, best iteration is:
[107]	training's quantile: 0.000197298	valid_1's quantile: 0.000229806
Training until validation scores don't improve for 50 rounds
[100]	training's quantile: 0.00656907	valid_1's quantile: 0.00854172
[200]	training's quantile: 0.00625145	valid_1's quantile: 0.00849752
Early stopping, best iteration is:
[235]	training's quantile: 0.00618738	valid_1's quantile: 0.00849725
Training until validation scores don't improve for 50 rounds
[100]	training's quantile: 0.0115056	valid_1's quantile: 0.0155339
[200]	training's quantile: 0.0110391	valid_1's quantile: 0.0154192
[300]	training's quantile: 0.0107591	valid_1's quantile: 0.015362
[400]	training's quantile: 0.0105533	valid_1's quantile: 0.0153307
Early stopping, best iteration is:
[371]	training's quantile: 0.0106125	valid_1's quantile: 0.0153184
Training until validation sco

In [12]:
def post_process(data):
    """Return a tidied copy of a quantile-forecast frame.

    The quantile columns are sorted in ascending order and their labels rounded
    to 3 decimals. The index is converted to datetimes, and an ``'hour'``
    column holding the hour of each timestamp is appended as the last column.

    The input frame is not modified. The function is idempotent: an existing
    ``'hour'`` column is dropped and rebuilt, and string column labels (as
    produced by a CSV round-trip) are converted back to floats. Re-running the
    cell or feeding in a reloaded frame therefore works.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are quantile levels (floats or numeric strings),
        optionally plus ``'hour'``, and whose index is convertible by
        ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        New frame with sorted, rounded quantile columns followed by ``'hour'``.
    """
    # Drop a previously added 'hour' column; mixing it with float labels would
    # break the sort below.
    is_quantile = [label != 'hour' for label in data.columns]
    result = data.loc[:, is_quantile].copy()
    result.columns = [float(label) for label in result.columns]
    # sort column names; the copy detaches the selection from `result`'s parent
    # so the assignments below do not warn.
    result = result[sorted(result.columns)].copy()
    result.columns = np.round(result.columns.to_numpy(dtype=float), 3)
    result.index = pd.to_datetime(result.index)
    result['hour'] = result.index.hour
    return result

# Tidy both forecast frames. The names are rebound to the processed versions,
# so re-running this cell feeds already-processed frames back into post_process.
output = post_process(output)
output_test = post_process(output_test)

In [13]:
# Persist the quantile forecasts, keeping the index as the first CSV column.
# `to_csv` does not create missing parent folders, so make sure the target exists.
os.makedirs('./data/quantile', exist_ok=True)
output.to_csv('./data/quantile/year_qs.csv', index=True)
output_test.to_csv('./data/quantile/year_qs_test.csv', index=True)

In [5]:
# Reload the saved training-set forecasts; the first CSV column becomes the index.
saved_forecast_path = './data/quantile/year_qs.csv'
output = pd.read_csv(saved_forecast_path, index_col=0)
output

Unnamed: 0_level_0,0.001,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,...,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,0.999,hour
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-08-02 00:00:00,0.303144,0.369353,0.376349,0.380305,0.382202,0.382707,0.386298,0.387832,0.387259,0.388268,...,0.399557,0.402965,0.402966,0.409924,0.421649,0.438896,0.460093,0.489065,0.747136,0
2021-08-02 01:00:00,0.305496,0.369540,0.377217,0.382645,0.382997,0.384135,0.384699,0.385413,0.386054,0.385496,...,0.389481,0.392013,0.397967,0.398482,0.411848,0.424562,0.448965,0.494651,0.743658,1
2021-08-02 02:00:00,0.286406,0.371241,0.378363,0.381176,0.383090,0.384410,0.386936,0.387737,0.388410,0.388770,...,0.395280,0.403726,0.403535,0.408394,0.414624,0.428989,0.438694,0.495129,0.743658,2
2021-08-02 03:00:00,0.286969,0.371241,0.379088,0.383727,0.384219,0.385751,0.387749,0.387367,0.389014,0.388077,...,0.394311,0.395869,0.404769,0.410257,0.416182,0.426240,0.438164,0.490161,0.740722,3
2021-08-02 04:00:00,0.269961,0.351769,0.362449,0.374904,0.375802,0.374120,0.382955,0.387517,0.391618,0.388278,...,0.405398,0.407918,0.412246,0.419744,0.426758,0.437952,0.455525,0.499027,0.741087,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-30 19:00:00,0.307131,0.381668,0.404760,0.413892,0.424813,0.441078,0.439176,0.443101,0.446479,0.463930,...,0.479411,0.492119,0.500108,0.497599,0.522463,0.539418,0.566452,0.635146,0.820193,19
2022-07-30 20:00:00,0.313776,0.384778,0.418112,0.438087,0.446400,0.462508,0.463368,0.468091,0.473173,0.490476,...,0.512539,0.519381,0.525012,0.522898,0.559484,0.571380,0.597751,0.671609,0.817159,20
2022-07-30 21:00:00,0.332102,0.392807,0.432249,0.449053,0.467606,0.480640,0.491217,0.512607,0.520690,0.540575,...,0.585383,0.594138,0.599300,0.618007,0.639146,0.651856,0.683629,0.750670,0.845875,21
2022-07-30 22:00:00,0.327744,0.389125,0.416536,0.430872,0.443294,0.449306,0.465738,0.476574,0.482941,0.494904,...,0.527539,0.534826,0.542192,0.550477,0.576443,0.602120,0.618810,0.689448,0.841164,22


In [6]:
# Display the training frame again (same table as shown right after loading).
data_train

Unnamed: 0,Month,Hour,cons_target,Solar Generation [W/kW],Outdoor Drybulb Temperature [C],Relative Humidity [%],Diffuse Solar Radiation [W/m2],Direct Solar Radiation [W/m2],day_year,number,...,net,net_target,net+1,net-23,net_target+1,net_target-23,diffuse_solar_radiation+1,direct_solar_radiation+1,relative_humidity+1,drybulb_temp+1
0,8.0,22.0,0.441346,0.000000,19.4,81.0,0.000000,0.000000,23,1,...,3.557083,0.617429,1.411333,2.275800,0.431845,0.506612,0.000000,0.000000,81.0,20.0
1,8.0,23.0,0.170776,0.000000,20.0,81.0,0.000000,0.000000,24,1,...,1.411333,0.431845,0.979417,0.851167,0.394489,0.383397,0.000000,0.000000,87.0,19.4
2,8.0,24.0,0.116313,0.000000,19.4,87.0,0.000000,0.000000,25,1,...,0.979417,0.394489,0.900975,0.834600,0.387705,0.381964,0.000000,0.000000,87.0,19.4
3,8.0,1.0,0.106422,0.000000,19.4,87.0,0.000000,0.000000,26,1,...,0.900975,0.387705,0.922433,0.838167,0.389561,0.382273,0.000000,0.000000,90.0,19.4
4,8.0,2.0,0.109127,0.000000,19.4,90.0,0.000000,0.000000,27,1,...,0.922433,0.389561,0.910483,1.478433,0.388527,0.437649,0.000000,0.000000,90.0,18.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43675,7.0,17.0,0.802751,31.179167,21.1,81.0,5.389072,5.669881,8754,5,...,3.839883,0.862123,2.713900,3.463783,0.720845,0.814933,3.295837,4.875197,84.0,20.6
43676,7.0,18.0,0.552807,4.070833,20.6,84.0,3.295837,4.875197,8755,5,...,2.713900,0.720845,0.935833,0.457167,0.497751,0.437693,0.000000,0.000000,81.0,21.1
43677,7.0,19.0,0.189487,0.000000,21.1,81.0,0.000000,0.000000,8756,5,...,0.935833,0.497751,2.155017,0.714717,0.650722,0.470007,0.000000,0.000000,79.0,21.7
43678,7.0,20.0,0.436347,0.000000,21.7,79.0,0.000000,0.000000,8757,5,...,2.155017,0.650722,2.316400,1.659000,0.670971,0.588487,0.000000,0.000000,76.0,21.3
