In [1]:
import os
# Make the parent of the current working directory the new working directory,
# so the relative paths used further down (utils/, data/, models/) resolve from there.
# NOTE(review): this is relative to wherever the kernel currently is, so
# re-running the cell climbs one more level each time.
os.chdir(os.path.join(*os.path.split(os.getcwd())[:1]))

In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
import joblib
from optuna.samplers import TPESampler
from utils.group_ts_split import GroupTimeSeriesSplit, PurgedGroupTimeSeriesSplit

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.calibration import calibration_curve
from utils.util_functions import import_data

In [3]:
data_train = import_data('data/extra_train.csv')
# Tidy the column labels: strip square brackets and swap spaces for underscores
for old_text, new_text in (('[', ''), (']', ''), (' ', '_')):
    data_train.columns = data_train.columns.str.replace(old_text, new_text, regex=False)
data_train

Memory usage of dataframe is 8.66 MB
Memory usage after optimization is: 2.29 MB
Decreased by 73.6%


Unnamed: 0,Month,Hour,cons_target,Solar_Generation_W/kW,Outdoor_Drybulb_Temperature_C,Relative_Humidity_%,Diffuse_Solar_Radiation_W/m2,Direct_Solar_Radiation_W/m2,day_year,number,...,net_target,net+1,net-23,net_target+1,net_target-23,net_target_diff,diffuse_solar_radiation+1,direct_solar_radiation+1,relative_humidity+1,drybulb_temp+1
0,8,0,0.441406,0.000000,19.406250,81.0,0.000000,0.000000,23,1,...,0.617188,1.411133,2.275391,0.431885,0.506836,-0.029327,0.000000,0.000,81.0,20.000000
1,8,1,0.170776,0.000000,20.000000,81.0,0.000000,0.000000,24,1,...,0.431885,0.979492,0.851074,0.394531,0.383301,-0.185547,0.000000,0.000,87.0,19.406250
2,8,2,0.116333,0.000000,19.406250,87.0,0.000000,0.000000,25,1,...,0.394531,0.900879,0.834473,0.387695,0.382080,-0.037354,0.000000,0.000,87.0,19.406250
3,8,3,0.106445,0.000000,19.406250,87.0,0.000000,0.000000,26,1,...,0.387695,0.922363,0.838379,0.389648,0.382324,-0.006783,0.000000,0.000,90.0,19.406250
4,8,4,0.109131,0.000000,19.406250,90.0,0.000000,0.000000,27,1,...,0.389648,0.910645,1.478516,0.388428,0.437744,0.001856,0.000000,0.000,90.0,18.906250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43675,7,19,0.802734,31.171875,21.093750,81.0,5.390625,5.667969,8754,5,...,0.862305,2.714844,3.462891,0.720703,0.814941,0.090393,3.294922,4.875,84.0,20.593750
43676,7,20,0.552734,4.070312,20.593750,84.0,3.294922,4.875000,8755,5,...,0.720703,0.936035,0.457275,0.497803,0.437744,-0.141235,0.000000,0.000,81.0,21.093750
43677,7,21,0.189453,0.000000,21.093750,81.0,0.000000,0.000000,8756,5,...,0.497803,2.154297,0.714844,0.650879,0.469971,-0.223145,0.000000,0.000,79.0,21.703125
43678,7,22,0.436279,0.000000,21.703125,79.0,0.000000,0.000000,8757,5,...,0.650879,2.316406,1.659180,0.670898,0.588379,0.152954,0.000000,0.000,76.0,21.296875


In [4]:
# Columns of data_train fed to the models
features = [
    'Month', 'Hour', 'hour_x', 'hour_y', 'month_x', 'month_y',
    'net_target-23', 'diffuse_solar_radiation+1', 'relative_humidity+1', 'drybulb_temp+1',
]
# Column the models are trained to predict
target = 'net_target+1'
# Quantile levels: 0.001, then 0.05 up to 0.95 in steps of 0.05, then 0.999,
# rounded to 3 decimals (these values later become column labels and file names)
qts = np.round(
    np.concatenate([[0.001], np.arange(0.05, 0.951, 0.05), [0.999]]),
    3,
)
# One 'day_year' value per training row (not used by the cells shown here)
groups = data_train['day_year'].values

In [6]:
# Fit one LightGBM quantile model and return its predictions on the training rows
def run_lgb(data, features, target, quantile, params=None, seed=42):
    """Train a LightGBM model for a single quantile level and save it to disk.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    features : list of str
        Column names used as model inputs.
    target : str
        Name of the column to predict.
    quantile : float
        Quantile level; passed to LightGBM as ``alpha``.
    params : dict, optional
        LightGBM parameters layered on top of the defaults below. The dict is
        not modified. Anything that is not a dict is ignored rather than
        rejected, so legacy positional calls keep working.
    seed : int, default 42
        Value stored under the ``seed`` parameter.

    Returns
    -------
    Predictions of the fitted model for ``data[features]``, i.e. for the same
    rows the model was trained on.

    Side effects
    ------------
    Writes the model to ``models/lag_minus_24/lgb_<quantile>.txt``.
    """
    lgb_params = {
        'boosting': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'bagging_fraction': 0.7,
        'bagging_freq': 1,
        'feature_fraction': 0.7,
        'verbose': -1,
    }
    # Caller overrides go on top of the defaults (previously they were discarded).
    if isinstance(params, dict):
        lgb_params.update(params)
    # These entries define what this function is for, so they are applied last
    # and always win over anything supplied through ``params``.
    lgb_params['objective'] = 'quantile'
    lgb_params['metric'] = 'quantile'
    lgb_params['alpha'] = quantile
    lgb_params['seed'] = seed

    X_temp = data[features]
    y_temp = data[target]
    lgb_train = lgb.Dataset(X_temp, y_temp)
    best_model = lgb.train(lgb_params, lgb_train, num_boost_round=200)
    best_model.save_model('models/lag_minus_24/lgb_' + str(quantile) + '.txt')
    y_pred = best_model.predict(X_temp)
    return y_pred

# Fit one quantile model per level in `qts` and collect the predictions in a single frame
def run_lgb_ensemble(data,  features, target, seed=42):
    """Run ``run_lgb`` once for every quantile level in the module-level ``qts``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the feature columns, the target column, and the
        ``number`` and ``day_year`` columns.
    features : list of str
        Column names used as model inputs.
    target : str
        Name of the column to predict.
    seed : int, default 42
        Seed forwarded to every ``run_lgb`` call.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``data``, with one prediction column per entry of ``qts``
        (labelled by the quantile level) followed by ``number`` and
        ``day_year`` copied from ``data``.
    """
    # Pass the seed by keyword: positionally it would land in run_lgb's
    # `params` slot and the requested seed would never be used.
    y_preds = [
        run_lgb(data, features, target, quantile, params=None, seed=seed)
        for quantile in qts
    ]
    # Stack as (rows, quantiles) so that each quantile becomes a column.
    y_preds = pd.DataFrame(np.array(y_preds).T, columns=qts, index=data.index)
    y_preds['number'] = data['number'].values
    y_preds['day_year'] = data['day_year'].values
    return y_preds

In [8]:

# Train one model per quantile level on the full training frame and keep the
# resulting predictions (made on those same training rows) in `output`.
output = run_lgb_ensemble(data_train, features, target, seed=42)

In [97]:
def post_process(data):
    """Give `data` a datetime index and an 'hour' column derived from it.

    The frame passed in is modified in place and the same object is returned.
    """
    timestamps = pd.to_datetime(data.index)
    data.index = timestamps
    # Hour of day taken from the freshly converted index
    data['hour'] = timestamps.hour
    return data

# Convert the index of the prediction frame to datetimes and add its 'hour' column.
output = post_process(output)
# NOTE(review): `output_test` is not assigned in any cell visible here —
# presumably it came from a cell that is not shown or was deleted; confirm it
# exists after a fresh Restart & Run All.
output_test = post_process(output_test)

In [154]:
output

Unnamed: 0_level_0,0.001,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,...,0.7,0.75,0.8,0.85,0.9,0.95,0.999,number,day_year,hour
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-07-31 21:00:00,0.322883,0.383452,0.413785,0.431443,0.444508,0.456197,0.461880,0.472461,0.491571,0.494161,...,0.556131,0.572204,0.602189,0.641947,0.686733,0.746369,0.860502,1,23,21
2021-08-01 22:00:00,0.338289,0.361590,0.371128,0.373164,0.375278,0.378782,0.380464,0.382798,0.385171,0.386062,...,0.420209,0.425590,0.452466,0.470609,0.525271,0.589980,0.787205,1,24,22
2021-08-01 23:00:00,0.345352,0.361546,0.370288,0.373332,0.375271,0.378129,0.379995,0.382583,0.382851,0.386352,...,0.415988,0.416176,0.430124,0.441633,0.500803,0.532917,0.744893,1,25,23
2021-08-01 00:00:00,0.331988,0.360662,0.368349,0.371216,0.373884,0.375455,0.376025,0.378864,0.379295,0.382737,...,0.400082,0.404158,0.415416,0.426367,0.455668,0.490970,0.728661,1,26,0
2021-08-01 01:00:00,0.324950,0.379630,0.410492,0.420451,0.427570,0.434322,0.435179,0.436782,0.437027,0.438343,...,0.450469,0.458898,0.475228,0.488172,0.510236,0.549509,0.800668,1,27,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-30 16:00:00,0.300362,0.422021,0.455104,0.530801,0.542858,0.588118,0.600280,0.637452,0.641795,0.649856,...,0.725742,0.760222,0.793425,0.811300,0.835018,0.875491,0.807802,5,8754,16
2022-07-30 17:00:00,0.319181,0.380770,0.396219,0.411757,0.422813,0.429266,0.433115,0.441603,0.442054,0.451250,...,0.493604,0.510791,0.529246,0.557017,0.608167,0.660644,0.841968,5,8755,17
2022-07-30 18:00:00,0.323339,0.384611,0.411142,0.430256,0.440105,0.455556,0.465402,0.474146,0.477585,0.499881,...,0.539979,0.553502,0.576317,0.605207,0.644079,0.688531,0.855566,5,8756,18
2022-07-30 19:00:00,0.323339,0.392862,0.424677,0.451279,0.480485,0.496076,0.519573,0.536870,0.545165,0.565358,...,0.622803,0.637520,0.661215,0.679893,0.709380,0.734730,0.835429,5,8757,19


In [13]:
# Write both prediction frames to CSV, keeping the index as the first column
for frame, csv_path in (
    (output, './data/quantile/year_qs.csv'),
    (output_test, './data/quantile/year_qs_test.csv'),
):
    frame.to_csv(csv_path, index=True)