In [1]:
import os
# Move the working directory to the parent of the directory the kernel started
# in, so that relative paths such as 'data/...' and 'models/...' resolve.
# The starting directory is remembered the first time this cell runs; without
# that, every re-run of the cell would climb one more level up the tree.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.dirname(NOTEBOOK_DIR))

In [161]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from optuna.samplers import TPESampler
from utils.group_ts_split import GroupTimeSeriesSplit, PurgedGroupTimeSeriesSplit

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.calibration import calibration_curve

In [162]:
# Read the train and test tables; the first CSV column becomes the row index.
data_train, data_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data/extra_train.csv', 'data/extra_test.csv')
)
data_train

Unnamed: 0,Month,Hour,cons_target,Solar Generation [W/kW],Outdoor Drybulb Temperature [C],Relative Humidity [%],Diffuse Solar Radiation [W/m2],Direct Solar Radiation [W/m2],day_year,number,...,net,net_target,net+1,net-23,net_target+1,net_target-23,diffuse_solar_radiation+1,direct_solar_radiation+1,relative_humidity+1,drybulb_temp+1
0,8.0,24.0,0.441346,0.000000,19.4,81.0,0.000000,0.000000,23,1,...,3.557083,0.617429,1.411333,2.275800,0.431845,0.506612,0.000000,0.000000,81.0,20.0
1,8.0,1.0,0.170776,0.000000,20.0,81.0,0.000000,0.000000,24,1,...,1.411333,0.431845,0.979417,0.851167,0.394489,0.383397,0.000000,0.000000,87.0,19.4
2,8.0,2.0,0.116313,0.000000,19.4,87.0,0.000000,0.000000,25,1,...,0.979417,0.394489,0.900975,0.834600,0.387705,0.381964,0.000000,0.000000,87.0,19.4
3,8.0,3.0,0.106422,0.000000,19.4,87.0,0.000000,0.000000,26,1,...,0.900975,0.387705,0.922433,0.838167,0.389561,0.382273,0.000000,0.000000,90.0,19.4
4,8.0,4.0,0.109127,0.000000,19.4,90.0,0.000000,0.000000,27,1,...,0.922433,0.389561,0.910483,1.478433,0.388527,0.437649,0.000000,0.000000,90.0,18.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43675,7.0,19.0,0.802751,31.179167,21.1,81.0,5.389072,5.669881,8754,5,...,3.839883,0.862123,2.713900,3.463783,0.720845,0.814933,3.295837,4.875197,84.0,20.6
43676,7.0,20.0,0.552807,4.070833,20.6,84.0,3.295837,4.875197,8755,5,...,2.713900,0.720845,0.935833,0.457167,0.497751,0.437693,0.000000,0.000000,81.0,21.1
43677,7.0,21.0,0.189487,0.000000,21.1,81.0,0.000000,0.000000,8756,5,...,0.935833,0.497751,2.155017,0.714717,0.650722,0.470007,0.000000,0.000000,79.0,21.7
43678,7.0,22.0,0.436347,0.000000,21.7,79.0,0.000000,0.000000,8757,5,...,2.155017,0.650722,2.316400,1.659000,0.670971,0.588487,0.000000,0.000000,76.0,21.3


In [163]:
# Column to predict and the predictor columns fed to the quantile models.
target = 'net_target+1'
features = [
    'Month',
    'Hour',
    'hour_x',
    'hour_y',
    'month_x',
    'month_y',
    'net_target-23',
    'diffuse_solar_radiation+1',
    'relative_humidity+1',
    'drybulb_temp+1',
]
# Quantile levels: 0.001, then 0.05 to 0.95 in steps of 0.05, then 0.999.
# Rounding to 3 decimals strips the floating-point noise np.arange introduces.
qts = np.round(np.r_[0.001, np.arange(0.05, 0.951, 0.05), 0.999], 3)
# day_year value of every training row, as a plain array.
groups = data_train['day_year'].values

In [164]:
# Fit a single LightGBM quantile model; call once per quantile level.
def run_lgb(data, datat, features, target, quantile, params=None, seed=42,
            model_dir='models/lag_minus_24'):
    """Train one LightGBM quantile regressor, save it, and predict with it.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame; must contain the ``features`` columns and ``target``.
    datat : pandas.DataFrame
        Second frame to predict on; must contain the ``features`` columns.
    features : list of str
        Predictor column names.
    target : str
        Name of the column to fit.
    quantile : float
        Quantile level, passed to LightGBM as ``alpha``.
    params : dict, optional
        LightGBM parameter overrides merged on top of the defaults below.
        ``objective``, ``metric``, ``alpha`` and ``seed`` are always set by
        this function and cannot be overridden here. A non-dict value is
        ignored, so callers that pass something else in this positional slot
        keep getting the default configuration.
    seed : int, default 42
        Value stored under the ``seed`` key of the LightGBM parameters.
    model_dir : str, default 'models/lag_minus_24'
        Directory the model is written to as ``lgb_<quantile>.txt``; it is
        created if it does not exist.

    Returns
    -------
    tuple
        ``(y_pred, y_pred_test)``: predictions for ``data`` (in-sample) and
        for ``datat``.
    """
    lgb_params = {
        'objective': 'quantile',
        'boosting': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'bagging_fraction': 0.7,
        'bagging_freq': 1,
        'feature_fraction': 0.7,
        'verbose': -1,
    }
    if isinstance(params, dict):
        lgb_params.update(params)
    # These keys define the quantile task itself, so they are applied last.
    lgb_params['objective'] = 'quantile'
    lgb_params['metric'] = 'quantile'
    lgb_params['alpha'] = quantile
    lgb_params['seed'] = seed

    X_train = data[features]
    lgb_train = lgb.Dataset(X_train, data[target])
    best_model = lgb.train(lgb_params, lgb_train, num_boost_round=200)

    # Make sure the destination exists so saving works on a fresh checkout.
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    best_model.save_model(os.path.join(model_dir, 'lgb_' + str(quantile) + '.txt'))

    y_pred = best_model.predict(X_train)
    y_pred_test = best_model.predict(datat[features])
    return y_pred, y_pred_test

# Produce one forecast column per quantile level (all levels in `qts` by default)
# and return them as DataFrames for the train and test frames.
def run_lgb_ensemble(data, datat, features, target, seed=42, quantiles=None, params=None):
    """Fit one quantile model per level and collect the forecasts.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame; must contain ``features``, ``target`` and the
        ``timestamp``, ``number`` and ``day_year`` columns.
    datat : pandas.DataFrame
        Frame to forecast; must contain ``features`` and the ``timestamp``,
        ``number`` and ``day_year`` columns.
    features : list of str
        Predictor column names forwarded to ``run_lgb``.
    target : str
        Target column name forwarded to ``run_lgb``.
    seed : int, default 42
        Seed forwarded to every ``run_lgb`` call.
    quantiles : sequence of float, optional
        Quantile levels to fit. Defaults to the notebook-level ``qts``.
    params : dict, optional
        Forwarded unchanged to ``run_lgb`` in its ``params`` slot.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(y_preds, y_preds_test)``, each indexed by the source frame's
        ``timestamp`` with one column per quantile level plus ``number`` and
        ``day_year``.
    """
    if quantiles is None:
        quantiles = qts
    quantiles = np.asarray(quantiles)

    preds_train = []
    preds_test = []
    for quantile in quantiles:
        # `params` and `seed` each go in their own positional slot; passing the
        # seed as the sixth argument would land it in `params` and leave the
        # model seed at its default.
        y_pred, y_pred_test = run_lgb(data, datat, features, target, quantile, params, seed)
        preds_train.append(y_pred)
        preds_test.append(y_pred_test)

    def _to_frame(preds, source):
        # Stack per-quantile vectors as columns and attach the identifier columns.
        frame = pd.DataFrame(np.array(preds).T, columns=quantiles, index=source.timestamp)
        frame['number'] = source['number'].values
        frame['day_year'] = source['day_year'].values
        return frame

    return _to_frame(preds_train, data), _to_frame(preds_test, datat)

In [165]:

# Fit every quantile model on the training frame and forecast both frames.
output, output_test = run_lgb_ensemble(
    data=data_train,
    datat=data_test,
    features=features,
    target=target,
    seed=42,
)

In [97]:
def post_process(data):
    """Return a copy of ``data`` with a datetime index and an ``hour`` column.

    The index is parsed with ``pd.to_datetime`` and ``hour`` is taken from the
    parsed index. The input frame is left untouched, so the function can be
    applied to the same object more than once without side effects on the
    caller's data.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose index holds values that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        New frame with a ``DatetimeIndex`` and an added ``hour`` column.
    """
    result = data.copy()
    result.index = pd.to_datetime(result.index)
    result['hour'] = result.index.hour
    return result

# Give both forecast frames a datetime index and an hour column.
output, output_test = (post_process(frame) for frame in (output, output_test))

In [154]:
# Display the training-set forecast frame (long frames are truncated in the rendered view).
output

Unnamed: 0_level_0,0.001,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,...,0.7,0.75,0.8,0.85,0.9,0.95,0.999,number,day_year,hour
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-07-31 21:00:00,0.322883,0.383452,0.413785,0.431443,0.444508,0.456197,0.461880,0.472461,0.491571,0.494161,...,0.556131,0.572204,0.602189,0.641947,0.686733,0.746369,0.860502,1,23,21
2021-08-01 22:00:00,0.338289,0.361590,0.371128,0.373164,0.375278,0.378782,0.380464,0.382798,0.385171,0.386062,...,0.420209,0.425590,0.452466,0.470609,0.525271,0.589980,0.787205,1,24,22
2021-08-01 23:00:00,0.345352,0.361546,0.370288,0.373332,0.375271,0.378129,0.379995,0.382583,0.382851,0.386352,...,0.415988,0.416176,0.430124,0.441633,0.500803,0.532917,0.744893,1,25,23
2021-08-01 00:00:00,0.331988,0.360662,0.368349,0.371216,0.373884,0.375455,0.376025,0.378864,0.379295,0.382737,...,0.400082,0.404158,0.415416,0.426367,0.455668,0.490970,0.728661,1,26,0
2021-08-01 01:00:00,0.324950,0.379630,0.410492,0.420451,0.427570,0.434322,0.435179,0.436782,0.437027,0.438343,...,0.450469,0.458898,0.475228,0.488172,0.510236,0.549509,0.800668,1,27,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-30 16:00:00,0.300362,0.422021,0.455104,0.530801,0.542858,0.588118,0.600280,0.637452,0.641795,0.649856,...,0.725742,0.760222,0.793425,0.811300,0.835018,0.875491,0.807802,5,8754,16
2022-07-30 17:00:00,0.319181,0.380770,0.396219,0.411757,0.422813,0.429266,0.433115,0.441603,0.442054,0.451250,...,0.493604,0.510791,0.529246,0.557017,0.608167,0.660644,0.841968,5,8755,17
2022-07-30 18:00:00,0.323339,0.384611,0.411142,0.430256,0.440105,0.455556,0.465402,0.474146,0.477585,0.499881,...,0.539979,0.553502,0.576317,0.605207,0.644079,0.688531,0.855566,5,8756,18
2022-07-30 19:00:00,0.323339,0.392862,0.424677,0.451279,0.480485,0.496076,0.519573,0.536870,0.545165,0.565358,...,0.622803,0.637520,0.661215,0.679893,0.709380,0.734730,0.835429,5,8757,19


In [13]:
# Write the train and test forecast frames to CSV, keeping the index as the first column.
output.to_csv(path_or_buf='./data/quantile/year_qs.csv', index=True)
output_test.to_csv(path_or_buf='./data/quantile/year_qs_test.csv', index=True)