In [2]:
import os
# Change the working directory to the project root (the parent of the initial working directory).
# The root is resolved only once per kernel session, so re-running this cell
# does not climb one directory higher each time.
if 'PROJECT_ROOT' not in globals():
    PROJECT_ROOT = os.path.dirname(os.getcwd())
os.chdir(PROJECT_ROOT)

In [78]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import lightgbm as lgb
from scipy.interpolate import interp1d
from scipy.stats import norm
# import a minmax scaler
from sklearn.preprocessing import MinMaxScaler

# Columns passed to the model as inputs.
features = ['hour', 'month', 'yearly_cons', 'age_house_precise', 'year_construction', 'weekday', 'floor_area']
# Every column name that is treated as categorical, whether or not it is used as a feature.
all_categoricals = ['month','weekday','hour','employment','social_class','ppl','ppl_over15','ppl_home_over15','ppl_under15','ppl_home_under15','house_type','ownership','age_house','#bedrooms','heating_0',
 'heating_1','heating_2','heating_3','heating_4','heating_5','heating_6',
 'heat_water_0','heat_water_1','heat_water_2','heat_water_3','heat_water_4','heat_water_5','heat_water_6','heat_water_7','cooking']
# Keep only the categoricals that are actual features. Filtering `features` in order
# (instead of intersecting two sets) gives the same list on every kernel start;
# set iteration order for strings changes with hash randomization.
categoricals = [col for col in features if col in all_categoricals]
# Name of the column to predict.
target = 'demand'
# Quantile levels to forecast: 0.001, then 0.05 ... 0.95 in steps of 0.05, then 0.999 (21 levels).
qts = np.concatenate([[0.001],np.arange(0.05,0.951,0.05),[0.999]])

In [84]:
# Folder that holds the input CSV files
input_dir = './data/first/'
# ID of the single series kept for testing
ID_test = 4644
data_train = pd.read_csv(input_dir + 'full_test.csv', parse_dates=['datetime'])
# index by timestamp while keeping 'datetime' available as a regular column
data_train.index = data_train['datetime']
# restrict to the rows of the chosen ID
data_test = data_train.loc[data_train['ID'] == ID_test]
# set aside the final 168 rows (168 = 7 * 24; presumably one week of hourly rows)
# and keep everything before them in data_test
data_test_lastweek, data_test = data_test.iloc[-168:], data_test.iloc[:-168]
# normalize the demand column with minmax
# scaler = MinMaxScaler()
# data_test['demand'] = scaler.fit_transform(data_test['demand'].values.reshape(-1,1))
# data_test_lastweek['demand'] = scaler.transform(data_test_lastweek['demand'].values.reshape(-1,1))
# # save the scaler
# with open('data/scalers/minmax.pkl', 'wb') as f:
#     pickle.dump(scaler, f)


In [85]:
# train one lgb quantile model per call; it is run once for each quantile
def run_lgb(data, datat, features, target, quantile, params, seed=42, split_date='2010-05-01'):
    """Train a single LightGBM quantile model and predict with it.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'datetime' column plus the `features` and `target`
        columns. Rows with datetime < `split_date` are used for training,
        the remaining rows for early-stopping validation.
    datat : pandas.DataFrame
        Frame containing the `features` columns to predict on.
    features : list
        Column names used as model inputs.
    target : str
        Name of the column to predict.
    quantile : float
        Quantile level, passed to LightGBM as the 'alpha' parameter.
    params : dict
        LightGBM parameters. The dict is NOT modified; 'seed', 'metric' and
        'alpha' are set on a private copy.
    seed : int, default 42
        Seed for numpy's global RNG and for LightGBM.
    split_date : default '2010-05-01'
        Boundary between training rows (before) and validation rows (from).

    Returns
    -------
    tuple
        (y_pred, y_pred_test): predictions for every row of `data` (training
        and validation rows alike) and for every row of `datat`, both made at
        the best iteration found by early stopping.
    """
    # set seed
    np.random.seed(seed)
    # work on a copy so the caller's dict is not polluted with seed/metric/alpha
    model_params = {**params, 'seed': seed, 'metric': 'quantile', 'alpha': quantile}
    # chronological train / validation split
    is_train = data['datetime'] < split_date
    train = data[is_train]
    valid = data[~is_train] if False else data[data['datetime'] >= split_date]
    lgb_train = lgb.Dataset(train[features], train[target])
    lgb_valid = lgb.Dataset(valid[features], valid[target])
    # NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments were
    # removed from lgb.train in LightGBM 4.0 in favour of callbacks - confirm the
    # LightGBM version this notebook is pinned to before upgrading.
    model = lgb.train(model_params, lgb_train, valid_sets=[lgb_train, lgb_valid], num_boost_round=1000, early_stopping_rounds=50, verbose_eval=100)
    # predict on the full input frame and on the held-out frame
    y_pred = model.predict(data[features], num_iteration=model.best_iteration)
    y_pred_test = model.predict(datat[features], num_iteration=model.best_iteration)
    return y_pred, y_pred_test

# produce one forecast per quantile in qts (uniform weights) and output dataframes with one column per quantile
def run_lgb_ensemble(data, datat, features, target, params, seed=42, quantiles=None):
    """Train one quantile model per quantile level and collect the forecasts.

    Parameters
    ----------
    data, datat, features, target, seed
        Forwarded unchanged to `run_lgb` for every quantile level.
    params : dict
        LightGBM parameters. Each model receives its own copy, so the
        caller's dict is left untouched.
    quantiles : sequence of float, optional
        Quantile levels to fit. Defaults to the module-level `qts`.

    Returns
    -------
    tuple of pandas.DataFrame
        (y_preds, y_preds_test): predictions for `data` and for `datat`.
        Each frame has one row per input row and one column per quantile
        level, labelled with the level itself.

    Raises
    ------
    ValueError
        If no quantile level is given.
    """
    levels = np.asarray(qts if quantiles is None else quantiles, dtype=float)
    if levels.size == 0:
        raise ValueError('at least one quantile level is required')

    fitted = []
    forecast = []
    for level in levels:
        # a fresh copy per model keeps any in-place edits of the params away from the caller
        y_pred, y_pred_test = run_lgb(data, datat, features, target, level, dict(params), seed)
        fitted.append(y_pred)
        forecast.append(y_pred_test)

    # stack the per-quantile vectors side by side: rows = samples, columns = quantile levels
    y_preds = pd.DataFrame(np.column_stack(fitted), columns=levels)
    y_preds_test = pd.DataFrame(np.column_stack(forecast), columns=levels)
    return y_preds, y_preds_test

In [86]:
# Display the quantile forecasts held in `output`.
# NOTE(review): in the visible notebook `output` is first assigned by the
# run_lgb_ensemble cell further down, so on a fresh kernel (Restart & Run All)
# this cell would raise NameError unless `output` is defined in a part of the
# notebook not shown here - confirm and consider moving this cell below it.
output

Unnamed: 0,0.001,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,...,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,0.999,hour
0,0.004047,0.007103,0.006967,0.009830,0.010794,0.012456,0.015734,0.016462,0.018337,0.020451,...,0.031006,0.040184,0.050877,0.043874,0.048157,0.056826,0.062624,0.075071,0.527291,0
1,0.003828,0.004712,0.004507,0.004870,0.005467,0.005716,0.006717,0.007250,0.010821,0.012456,...,0.022438,0.026998,0.034206,0.026613,0.032489,0.033562,0.046392,0.052562,0.527291,1
2,0.003739,0.004543,0.004458,0.004625,0.005416,0.005567,0.006635,0.007093,0.010615,0.012112,...,0.021932,0.026998,0.034206,0.026613,0.032103,0.033371,0.045864,0.052562,0.527291,2
3,0.003660,0.004528,0.004676,0.004664,0.005312,0.005557,0.006724,0.007151,0.010562,0.012728,...,0.021932,0.026998,0.034206,0.026613,0.032103,0.033371,0.045864,0.052562,0.527291,3
4,0.003528,0.003725,0.003758,0.004194,0.004451,0.004801,0.005977,0.006736,0.010408,0.012986,...,0.021932,0.026998,0.034206,0.026613,0.032103,0.033371,0.045864,0.052562,0.527291,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8611,0.004140,0.022820,0.035591,0.043309,0.031009,0.044161,0.036389,0.043767,0.039832,0.043408,...,0.070687,0.088597,0.100885,0.106669,0.119091,0.146025,0.174031,0.261485,0.670724,19
8612,0.005215,0.025046,0.034290,0.036745,0.031432,0.044161,0.036389,0.043767,0.039832,0.043408,...,0.070687,0.088597,0.097056,0.104445,0.113700,0.135144,0.156598,0.234950,0.652722,20
8613,0.005836,0.025854,0.039435,0.041356,0.035334,0.047916,0.042172,0.047216,0.042428,0.046103,...,0.070687,0.087839,0.094194,0.091319,0.101148,0.121384,0.139626,0.222498,0.655452,21
8614,0.007457,0.031578,0.039486,0.039211,0.039466,0.039056,0.041601,0.044899,0.041184,0.044070,...,0.057155,0.057955,0.070333,0.079368,0.091574,0.095077,0.140687,0.166880,0.655452,22


In [87]:
# LightGBM settings shared by every quantile model
params = dict(
    objective='quantile',
    boosting='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    bagging_fraction=0.7,
    bagging_freq=1,
    feature_fraction=0.7,
    verbose=-1,
)
# fit one model per quantile; `output` covers data_test, `output_test` covers data_test_lastweek
output, output_test = run_lgb_ensemble(
    data=data_test,
    datat=data_test_lastweek,
    features=features,
    target=target,
    params=params,
    seed=42,
)

Training until validation scores don't improve for 50 rounds
[100]	training's quantile: 0.00112821	valid_1's quantile: 0.000883768
Early stopping, best iteration is:
[92]	training's quantile: 0.00113152	valid_1's quantile: 0.000882475
Training until validation scores don't improve for 50 rounds
[100]	training's quantile: 0.0512927	valid_1's quantile: 0.040384
[200]	training's quantile: 0.0498416	valid_1's quantile: 0.0399725
Early stopping, best iteration is:
[185]	training's quantile: 0.0500074	valid_1's quantile: 0.0399175
Training until validation scores don't improve for 50 rounds
[100]	training's quantile: 0.0969417	valid_1's quantile: 0.076713
Early stopping, best iteration is:
[104]	training's quantile: 0.0968262	valid_1's quantile: 0.076706
Training until validation scores don't improve for 50 rounds
[100]	training's quantile: 0.138386	valid_1's quantile: 0.108853
[200]	training's quantile: 0.133812	valid_1's quantile: 0.109131
Early stopping, best iteration is:
[173]	training'

In [88]:
def post_process(data):
    """Return a tidy copy of a quantile-forecast frame.

    The quantile columns are sorted in ascending order, their labels are
    rounded to 3 decimals, and an 'hour' column equal to `index % 24` is
    appended as the last column.

    The input frame is not modified. An 'hour' column already present in
    `data` is discarded and recomputed, so the function can be applied to
    its own output (e.g. when the notebook cell is re-run).

    Parameters
    ----------
    data : pandas.DataFrame
        One column per quantile level (numeric labels), optionally plus an
        'hour' column. Assumes an integer index whose value modulo 24 is the
        hour of day - TODO confirm against the caller.

    Returns
    -------
    pandas.DataFrame
        New frame with sorted, rounded quantile columns followed by 'hour'.
    """
    # leave out a previously added 'hour' column so the numeric labels can be sorted and rounded
    quantile_cols = sorted(
        col for col in data.columns
        if not (isinstance(col, str) and col == 'hour')
    )
    # explicit copy: the assignments below must not touch (or warn about) the caller's frame
    result = data[quantile_cols].copy()
    result.columns = np.round(np.asarray(quantile_cols, dtype=float), 3)
    result['hour'] = result.index % 24
    return result

# Replace the raw ensemble outputs with their post-processed versions
# (quantile columns sorted and rounded, plus an 'hour' column).
output = post_process(output)
output_test = post_process(output_test)

In [89]:
# Folder that receives the quantile forecasts and the matching input frames.
output_dir = './data/quantile/'
# to_csv does not create missing folders, so make sure the target exists first
os.makedirs(output_dir, exist_ok=True)
output.to_csv(output_dir + 'year_qs.csv', index=False)
output_test.to_csv(output_dir + 'year_qs_test.csv', index=False)
# save data_test and data_test_lastweek for later use (file names are prefixed with the ID)
data_test.to_csv(output_dir + '{}data_test.csv'.format(ID_test), index=False)
data_test_lastweek.to_csv(output_dir + '{}data_test_lastweek.csv'.format(ID_test), index=False)

In [44]:
# timestamps of the rows in the held-out frame whose hour is 0
data_test_lastweek.loc[data_test_lastweek['hour'] == 0].index

DatetimeIndex(['2010-07-09', '2010-07-10', '2010-07-11', '2010-07-12',
               '2010-07-13', '2010-07-14', '2010-07-15'],
              dtype='datetime64[ns]', name='datetime', freq=None)

In [45]:
# Display the post-processed quantile forecasts for data_test_lastweek
# (one column per quantile level plus the 'hour' column).
output_test

Unnamed: 0,0.001,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,...,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,0.999,hour
0,0.210782,0.297217,0.376889,0.415789,0.466874,0.514720,0.588888,0.580264,0.610399,0.667911,...,0.828792,0.923618,1.058736,1.196110,1.318222,1.559211,1.938083,2.569329,9.534807,0
1,0.210782,0.244901,0.270879,0.279707,0.311995,0.327328,0.369006,0.400038,0.410428,0.463758,...,0.521119,0.597033,0.671905,0.728459,0.739545,0.905610,1.113475,1.332178,9.534807,1
2,0.210782,0.235962,0.260991,0.267025,0.298839,0.321912,0.358810,0.404994,0.413844,0.433277,...,0.478435,0.505260,0.580209,0.584621,0.575351,0.663469,0.761573,0.798620,9.534807,2
3,0.209403,0.232752,0.260232,0.262646,0.293621,0.317519,0.340165,0.378871,0.386367,0.416072,...,0.463775,0.500469,0.566530,0.534426,0.539023,0.594908,0.618915,0.691040,9.534807,3
4,0.209403,0.231932,0.259945,0.264271,0.297110,0.314998,0.340261,0.364507,0.373659,0.406107,...,0.454913,0.499754,0.566893,0.530083,0.526829,0.557894,0.600408,0.691040,9.534807,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,0.212276,0.689565,0.753732,0.894608,1.024743,1.166347,1.508663,1.643778,1.870733,2.097732,...,2.532150,2.841900,2.837437,3.209722,3.485280,3.874037,4.507554,5.048666,10.476686,19
164,0.244275,0.742705,0.816659,0.929086,1.083407,1.223784,1.366440,1.544540,1.759519,1.962421,...,2.418147,2.678356,2.782819,3.060741,3.474532,3.845037,4.294941,5.170104,10.476686,20
165,0.256073,0.723718,0.897426,1.154073,1.316847,1.434545,1.713670,1.903882,2.060417,2.293244,...,2.912906,3.356329,3.408846,3.637994,3.962979,4.461031,4.425510,5.232768,10.476686,21
166,0.256073,0.732097,0.894927,1.172990,1.342170,1.490202,1.818806,1.985217,2.254235,2.476751,...,3.138579,3.354216,3.484439,3.683045,3.989262,4.025737,4.209906,4.582091,9.808737,22
