In [4]:
import os
# Change the working directory to the parent of the directory the kernel
# started in, so that relative paths such as 'data/...' resolve from there.
# The target is computed only once per kernel session: re-running this cell
# must not climb one more directory level each time.
if 'PROJECT_ROOT' not in globals():
    PROJECT_ROOT = os.path.dirname(os.getcwd())
os.chdir(PROJECT_ROOT)

In [5]:
import pandas as pd
import numpy as np
import lightgbm as lgb

# Columns passed to the models as predictors.
features = [
    'Month',
    'Hour',
    'hour_x',
    'hour_y',
    'month_x',
    'month_y',
    'net_target-1',
    'diffuse_solar_radiation+1',
    'relative_humidity+1',
    'drybulb_temp+1',
]

# Column the models are trained to forecast.
target = 'net_target'

# Quantile levels to forecast: 0.001, then 0.05 .. 0.95 in steps of 0.05,
# then 0.999 (21 levels in total).
qts = np.concatenate((
    [0.001],
    np.arange(0.05, 0.951, 0.05),
    [0.999],
))

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


In [6]:
# Load the training and test tables; the first CSV column becomes the index.
data_train = pd.read_csv('data/extra_train.csv', index_col=0)
data_test = pd.read_csv('data/extra_test.csv', index_col=0)

# Fail fast if a column needed further down is absent: the training frame is
# indexed by `features`, `target`, 'day_year' and 'timestamp'; the test frame
# by `features` and 'timestamp'.
missing_train = (set(features) | {target, 'day_year', 'timestamp'}) - set(data_train.columns)
missing_test = (set(features) | {'timestamp'}) - set(data_test.columns)
assert not missing_train, f'data/extra_train.csv is missing columns: {sorted(missing_train)}'
assert not missing_test, f'data/extra_test.csv is missing columns: {sorted(missing_test)}'

data_train

Unnamed: 0,Month,Hour,Day Type,cons_target-1,Solar Generation [W/kW],Outdoor Drybulb Temperature [C],Relative Humidity [%],Diffuse Solar Radiation [W/m2],Direct Solar Radiation [W/m2],building,...,net_target-1_min_lag3,net_target-1_std_lag3,cons_target,gen_target,cons_target-2,gen_target-2,diffuse_solar_radiation+1,drybulb_temp+1,relative_humidity+1,net_target-23
2,8.0,0.0,2.0,0.116313,0.0,19.4,87.0,0.0,0.0,2,...,0.3982,0.119400,0.106422,0.000000,0.170776,0.000000,0.000000,19.4,87.0,0.382273
3,8.0,1.0,2.0,0.106422,0.0,19.4,87.0,0.0,0.0,2,...,0.3884,0.023770,0.109127,0.000000,0.116313,0.000000,0.000000,19.4,90.0,0.437649
4,8.0,2.0,2.0,0.109127,0.0,19.4,90.0,0.0,0.0,2,...,0.3880,0.003506,0.107621,0.000000,0.106422,0.000000,0.000000,18.9,90.0,0.418428
5,8.0,3.0,2.0,0.107621,0.0,18.9,90.0,0.0,0.0,2,...,0.3877,0.000930,0.120207,0.000000,0.109127,0.000000,0.000000,18.3,93.0,0.466455
6,8.0,4.0,2.0,0.120207,0.0,18.3,93.0,0.0,0.0,2,...,0.3887,0.004715,0.156445,14.095833,0.107621,0.000000,2.772589,18.9,90.0,0.341153
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43653,7.0,19.0,7.0,0.092567,0.0,18.3,90.0,0.0,0.0,1,...,0.4750,0.217000,0.144716,0.000000,0.703661,2.858333,0.000000,18.9,87.0,0.525154
43654,7.0,20.0,7.0,0.144716,0.0,18.9,87.0,0.0,0.0,1,...,0.4410,0.209100,0.335914,0.000000,0.092567,0.000000,0.000000,19.4,84.0,0.596310
43655,7.0,21.0,7.0,0.335914,0.0,19.4,84.0,0.0,0.0,1,...,0.4410,0.079400,0.223369,0.000000,0.144716,0.000000,0.000000,18.9,90.0,0.457615
43656,7.0,22.0,7.0,0.223369,0.0,18.9,90.0,0.0,0.0,1,...,0.4749,0.059540,0.272052,0.000000,0.335914,0.000000,0.000000,20.5,76.0,0.453849


In [8]:
# train one LightGBM quantile model for a single quantile level
def run_lgb(data, datat, features, target, quantile, params, seed=42):
    """Fit a LightGBM model for one quantile and predict on both data sets.

    Parameters
    ----------
    data : pandas.DataFrame
        Labelled data. Must contain the `features` columns, the `target`
        column and a 'day_year' column, which drives the train/validation split.
    datat : pandas.DataFrame
        Data to forecast. Must contain the `features` columns.
    features : list of str
        Predictor column names.
    target : str
        Name of the column to fit.
    quantile : float
        Quantile level; passed to LightGBM as ``alpha`` and used (rounded to
        3 decimals) in the saved model's file name.
    params : dict
        Base LightGBM parameters. The dict is not modified: the seed, metric
        and alpha are set on a copy.
    seed : int, default 42
        Seed for NumPy's global RNG and for LightGBM.

    Returns
    -------
    tuple
        ``(y_pred, y_pred_test)``: predictions for every row of `data`
        (training and validation rows alike) and for every row of `datat`.

    Side effects
    ------------
    Writes the fitted model to ``models/lgb_<quantile>.txt`` (the folder is
    created if needed) and reseeds NumPy's global RNG.
    """
    # set seed
    np.random.seed(seed)
    # set params on a copy so the caller's dict does not accumulate
    # 'seed' / 'metric' / 'alpha' entries from the last model trained
    model_params = dict(params)
    model_params['seed'] = seed
    model_params['metric'] = 'quantile'
    model_params['alpha'] = quantile
    # train / validation split on the 'day_year' column
    # NOTE(review): 7000 is a hard-coded threshold whose meaning is not visible
    # here -- confirm it against how 'day_year' is built.
    train = data[data['day_year'] < 7000]
    valid = data[data['day_year'] >= 7000]
    # train
    lgb_train = lgb.Dataset(train[features], train[target])
    lgb_valid = lgb.Dataset(valid[features], valid[target])
    # NOTE(review): `early_stopping_rounds` and `verbose_eval` were removed from
    # lgb.train() in LightGBM 4.0; switch to the lgb.early_stopping /
    # lgb.log_evaluation callbacks when upgrading.
    model = lgb.train(model_params, lgb_train, valid_sets=[lgb_train, lgb_valid], num_boost_round=1000, early_stopping_rounds=50, verbose_eval=100)
    # save model in the model folder, with the name of the quantile;
    # np.round accepts both NumPy scalars and plain Python floats
    os.makedirs('models', exist_ok=True)
    model.save_model('models/lgb_' + str(np.round(quantile, 3)) + '.txt')
    # predict with the early-stopped number of trees
    y_pred = model.predict(data[features], num_iteration=model.best_iteration)
    y_pred_test = model.predict(datat[features], num_iteration=model.best_iteration)
    return y_pred, y_pred_test

# train one model per quantile level and collect the forecasts, one column per quantile
def run_lgb_ensemble(data, datat, features, target, params, seed=42, quantiles=None):
    """Run `run_lgb` for every quantile level and assemble the predictions.

    Parameters
    ----------
    data, datat : pandas.DataFrame
        Labelled data and data to forecast; both must have a 'timestamp'
        column, which becomes the index of the returned frames.
    features : list of str
        Predictor column names, forwarded to `run_lgb`.
    target : str
        Target column name, forwarded to `run_lgb`.
    params : dict
        Base LightGBM parameters. A fresh copy is handed to each `run_lgb`
        call, so this dict is left unchanged.
    seed : int, default 42
        Seed forwarded to every `run_lgb` call.
    quantiles : sequence of float, optional
        Quantile levels to fit. Defaults to the notebook-level `qts`.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(y_preds, y_preds_test)``: one row per row of `data` / `datat`,
        one column per quantile level (labelled with the level).

    Raises
    ------
    ValueError
        If `quantiles` is empty.
    """
    if quantiles is None:
        quantiles = qts
    # NumPy scalars keep `run_lgb` working even when a plain list is passed
    quantiles = np.asarray(quantiles, dtype=float)
    if quantiles.size == 0:
        raise ValueError('quantiles must contain at least one level')

    y_preds = []
    y_preds_test = []
    for quantile in quantiles:
        # hand over a copy: per-quantile settings must not leak into `params`
        y_pred, y_pred_test = run_lgb(data, datat, features, target, quantile, dict(params), seed)
        y_preds.append(y_pred)
        y_preds_test.append(y_pred_test)
    # stacked arrays are (n_quantiles, n_rows); transpose to one column per quantile
    y_preds = np.array(y_preds).T
    y_preds_test = np.array(y_preds_test).T
    y_preds = pd.DataFrame(y_preds, columns=quantiles, index=data.timestamp)
    y_preds_test = pd.DataFrame(y_preds_test, columns=quantiles, index=datat.timestamp)
    return y_preds, y_preds_test

In [11]:
# Base LightGBM settings; the same dict is handed to the ensemble runner
# for every quantile level.
params = dict(
    objective='quantile',
    boosting='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    bagging_fraction=0.7,
    bagging_freq=1,
    feature_fraction=0.7,
    verbose=-1,
)

# Fit all quantile models and collect in-sample and test forecasts.
output, output_test = run_lgb_ensemble(
    data=data_train,
    datat=data_test,
    features=features,
    target=target,
    params=params,
    seed=42,
)

Training until validation scores don't improve for 50 rounds
[100]	training's quantile: 0.00019888	valid_1's quantile: 0.000265543
Early stopping, best iteration is:
[128]	training's quantile: 0.000180011	valid_1's quantile: 0.000260544
Training until validation scores don't improve for 50 rounds
[100]	training's quantile: 0.00559065	valid_1's quantile: 0.00751702
Early stopping, best iteration is:
[119]	training's quantile: 0.00543929	valid_1's quantile: 0.00748835
Training until validation scores don't improve for 50 rounds
[100]	training's quantile: 0.00949362	valid_1's quantile: 0.0127763
[200]	training's quantile: 0.00885746	valid_1's quantile: 0.0126423
Early stopping, best iteration is:
[186]	training's quantile: 0.00889786	valid_1's quantile: 0.0126362
Training until validation scores don't improve for 50 rounds
[100]	training's quantile: 0.0124695	valid_1's quantile: 0.0170324
[200]	training's quantile: 0.0118041	valid_1's quantile: 0.0169228
[300]	training's quantile: 0.01149

In [12]:
def post_process(data, enforce_monotonic=False):
    """Tidy a quantile-forecast frame.

    Returns a new frame whose quantile columns are sorted in ascending order
    and labelled with the level rounded to 3 decimals, whose index is a
    DatetimeIndex, and which carries an extra 'hour' column taken from that
    index. The input frame is not modified.

    The function is idempotent: an 'hour' column left by a previous call is
    ignored and rebuilt, so re-running the notebook cell is safe.

    Parameters
    ----------
    data : pandas.DataFrame
        One column per quantile level (numeric labels), index convertible by
        ``pd.to_datetime``.
    enforce_monotonic : bool, default False
        If True, sort the values of each row so that forecasts never decrease
        as the quantile level increases. Independently fitted quantile models
        can cross; the default leaves the values as they are.

    Returns
    -------
    pandas.DataFrame
    """
    # sort column names, skipping an 'hour' column added by an earlier call
    # (mixing it with the float labels would make sorting and rounding fail)
    quantile_cols = sorted(
        col for col in data.columns if not (isinstance(col, str) and col == 'hour')
    )
    # explicit copy: keeps the caller's frame untouched and avoids
    # SettingWithCopyWarning on the assignments below
    result = data[quantile_cols].copy()
    if enforce_monotonic:
        result = pd.DataFrame(
            np.sort(result.to_numpy(dtype=float), axis=1),
            index=result.index,
            columns=result.columns,
        )
    # strip float noise such as 0.15000000000000002 from the labels
    result.columns = np.round(np.asarray(result.columns, dtype=float), 3)
    result.index = pd.to_datetime(result.index)
    result['hour'] = result.index.hour
    return result

# Apply the same clean-up to the in-sample forecasts and the test forecasts.
output, output_test = (post_process(frame) for frame in (output, output_test))

In [13]:
# Write both forecast tables to disk, keeping the datetime index as the first
# column. The folder is created first: to_csv does not create missing
# directories and would raise on a fresh checkout.
os.makedirs('./data/quantile', exist_ok=True)
output.to_csv('./data/quantile/year_qs.csv', index=True)
output_test.to_csv('./data/quantile/year_qs_test.csv', index=True)

In [15]:
# Show every column of one test row (position 20) as a quick sanity check.
output_test.iloc[20, :]

0.001     0.310481
0.05      0.369237
0.1       0.380590
0.15      0.392508
0.2       0.395841
0.25      0.407541
0.3       0.410353
0.35      0.412127
0.4       0.417983
0.45      0.426125
0.5       0.424063
0.55      0.435589
0.6       0.445733
0.65      0.449849
0.7       0.472793
0.75      0.479409
0.8       0.506465
0.85      0.540945
0.9       0.601305
0.95      0.663160
0.999     0.815909
hour     20.000000
Name: 2021-08-02 20:00:00, dtype: float64