In [2]:
%matplotlib inline

# plotting
import matplotlib as mpl
mpl.style.use('ggplot')
import matplotlib.pyplot as plt

# math and data manipulation
import numpy as np
import pandas as pd

# set random seeds 
from numpy.random import seed
from dateutil.parser import parse
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, LabelEncoder, Normalizer, StandardScaler
from sklearn.metrics import *
from sklearn.model_selection import train_test_split, GroupKFold
from sklearn.externals import joblib
# modeling
import lightgbm as lgb
import xgboost as xgb
# progress bar
from tqdm import tqdm
import gc

In [3]:
train = pd.read_csv('../data/consumption_train.csv',parse_dates=['timestamp'],index_col=0)
train.head(5)

Unnamed: 0,series_id,timestamp,consumption,temperature
0,103088,2014-12-24 00:00:00,101842.233424,
1,103088,2014-12-24 01:00:00,105878.048906,
2,103088,2014-12-24 02:00:00,91619.105008,
3,103088,2014-12-24 03:00:00,94473.706203,
4,103088,2014-12-24 04:00:00,96976.755526,


In [4]:
print (train.shape)
print (train.series_id.nunique())

(509376, 4)
758


In [5]:
test = pd.read_csv('../data/cold_start_test.csv',index_col=0,parse_dates=['timestamp'])
test.head(5)

Unnamed: 0,series_id,timestamp,consumption,temperature
0,102781,2013-02-27 00:00:00,15295.740389,17.0
1,102781,2013-02-27 01:00:00,15163.209562,18.25
2,102781,2013-02-27 02:00:00,15022.264079,18.0
3,102781,2013-02-27 03:00:00,15370.420458,17.0
4,102781,2013-02-27 04:00:00,15303.103213,16.9


In [6]:
my_submission = pd.read_csv('../data/submission_format.csv')
my_submission.head(3)

Unnamed: 0,pred_id,series_id,timestamp,temperature,consumption,prediction_window
0,0,102781,2013-03-03 00:00:00,19.93125,0.0,daily
1,1,102781,2013-03-04 00:00:00,20.034375,0.0,daily
2,2,102781,2013-03-05 00:00:00,19.189583,0.0,daily


In [7]:
meta = pd.read_csv('../data/meta.csv')
meta.head(5)

Unnamed: 0,series_id,surface,base_temperature,monday_is_day_off,tuesday_is_day_off,wednesday_is_day_off,thursday_is_day_off,friday_is_day_off,saturday_is_day_off,sunday_is_day_off
0,100003,x-large,low,False,False,False,False,False,True,True
1,100004,x-large,low,False,False,False,False,False,True,True
2,100006,x-small,low,False,False,False,False,False,True,True
3,100008,x-small,low,False,False,False,False,False,True,True
4,100010,x-small,low,False,False,False,False,False,True,True


In [8]:
for col in meta.columns:
    if col != 'series_id':
        clf = LabelEncoder()
        meta[col] = clf.fit_transform(meta[col])

In [9]:
meta.head(5)

Unnamed: 0,series_id,surface,base_temperature,monday_is_day_off,tuesday_is_day_off,wednesday_is_day_off,thursday_is_day_off,friday_is_day_off,saturday_is_day_off,sunday_is_day_off
0,100003,3,1,0,0,0,0,0,1,1
1,100004,3,1,0,0,0,0,0,1,1
2,100006,4,1,0,0,0,0,0,1,1
3,100008,4,1,0,0,0,0,0,1,1
4,100010,4,1,0,0,0,0,0,1,1


In [10]:
train = pd.merge(train,meta,how="left")
test = pd.merge(test,meta,how="left")
my_submission = pd.merge(my_submission,meta,how="left")

In [11]:
def create_lagged_features(df, lag=1):
    if not type(df) == pd.DataFrame:
        df = pd.DataFrame(df, columns=['consumption'])
    
    def _rename_lag(ser, j):
        ser.name = ser.name + f'_{j}'
        return ser
        
    # add a column lagged by `i` steps
    if len(df) > lag:
        for i in range(1, lag + 1):
            df = df.join(df.consumption.shift(i).pipe(_rename_lag, i))
    else:
        for i in range(1, lag):
            df = df.join(df.consumption.shift(i).pipe(_rename_lag, i))
        df.columns = ['consumption_'+str(i) for i in range(1,lag+1)]

    df.dropna(inplace=True)
    return df

In [12]:
days = {0:'monday',1:'tuesday',2:'wednesday',3:'thursday',4:'friday',5:'saturday',6:'sunday'}
daysnum = {'monday':1,'tuesday':2,'wednesday':3,'thursday':4,'friday':5,'saturday':6,'sunday':7}

In [12]:
def prepare_training_data(df, lag):
    """ Converts a series of consumption data into a
        lagged, scaled sample.
    """
    consumption_series = df.consumption
    # scale training data
    scaler = MinMaxScaler(feature_range=(0, 1))
    consumption_vals = scaler.fit_transform(consumption_series.values.reshape(-1, 1))
    
    # convert consumption series to lagged features
    consumption_lagged = create_lagged_features(consumption_vals, lag=lag)
    
    consumption_lagged['series_id'] = df.series_id.iloc[0]
    consumption_lagged['surface'] = df.surface.iloc[0]
    consumption_lagged['base_temp'] = df.base_temperature.iloc[0]
    
    consumption_lagged['day_of_week'] = [days[(df.timestamp.iloc[i+lag-1].dayofweek+1)%7] for i in range(0,len(consumption_lagged))]
    consumption_lagged['is_off'] = consumption_lagged.day_of_week.apply(lambda x: df[x+'_is_day_off'].iloc[0])
    
    consumption_lagged['day_of_week'] = consumption_lagged.day_of_week.apply(lambda x: daysnum[x])
    # X, y format taking the first column (original time series) to be the y
    if len(consumption_series) > lag:
        X = consumption_lagged.drop('consumption', axis=1)
        y = list(consumption_lagged.consumption.values)
    elif len(consumption_series) == lag:
        X = consumption_lagged
        y = []
        
    return X, y, scaler

In [30]:
lag = 72

In [31]:
X24 = pd.DataFrame()
y24 = []
for ser_id, ser_data in tqdm(train.groupby('series_id')):

    # prepare the data
    X, y, scaler = prepare_training_data(ser_data, lag)
    X24 = pd.concat([X24,X],axis=0)
    y24 += y


  0%|          | 0/758 [00:00<?, ?it/s][A
  0%|          | 1/758 [00:02<25:42,  2.04s/it][A
  0%|          | 2/758 [00:02<17:08,  1.36s/it][A
  0%|          | 3/758 [00:03<14:01,  1.11s/it][A
  1%|          | 4/758 [00:03<12:21,  1.02it/s][A
  1%|          | 5/758 [00:04<11:35,  1.08it/s][A
  1%|          | 6/758 [00:05<10:56,  1.15it/s][A
  1%|          | 7/758 [00:05<10:29,  1.19it/s][A
  1%|          | 8/758 [00:06<10:12,  1.22it/s][A
  1%|          | 9/758 [00:07<09:59,  1.25it/s][A
  1%|▏         | 10/758 [00:07<09:57,  1.25it/s]Exception in thread Thread-6:
Traceback (most recent call last):
  File "/home/victor/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/victor/anaconda3/lib/python3.6/site-packages/tqdm/_tqdm.py", line 148, in run
    for instance in self.tqdm_cls._instances:
  File "/home/victor/anaconda3/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed si

In [32]:
from numba import jit
import math

@jit
def smape_fast(y_true, y_pred):
    out = 0
    for i in range(y_true.shape[0]):
        a = y_true[i]
        b = y_pred[i]
        #if b < 1:
        #    b = 0
        c = a
        if c == 0:
            continue
        out += math.fabs(a - b) / c
    out *= (100.0 / y_true.shape[0])
    return out

def lgb_smape(preds, df):
    labels = df.get_label()
    labels, preds = np.array(labels), np.array(preds) #np.expm1(np.array(labels)), np.expm1(np.array(preds))
    return 'mape', smape_fast(labels, preds), False

def mape(y_true,y_pred):
    l = []
    for i in range(len(y_true)):
        if y_true[i] != 0:
            l.append(np.abs(y_true[i]-y_pred[i])/y_true[i]*100)
    return np.mean(l)

In [33]:
train_cols = list(X24.columns)
train_cols.remove('series_id')
X24['target'] = y24

In [34]:
group_kfold = GroupKFold(n_splits=5)
for train_index, val_index in group_kfold.split(X24, y24, X24.series_id):
    pass

In [35]:
lgb_train = lgb.Dataset(X24[train_cols].iloc[train_index], label=X24['target'].iloc[train_index], free_raw_data=False)
lgb_val = lgb.Dataset(X24[train_cols].iloc[val_index], label=X24['target'].iloc[val_index], free_raw_data=False, reference=lgb_train)

In [36]:
param = {}
param['application'] = 'regression_l2'
param['learning_rate'] = 0.05

In [37]:
model = lgb.train(param, lgb_train, 1000, valid_sets=[lgb_train,lgb_val],early_stopping_rounds=50,feval=lgb_smape)

[1]	training's mape: 1268.83	valid_1's mape: 1575.6
Train until valid scores didn't improve in 50 rounds.
[2]	training's mape: 1213.79	valid_1's mape: 1505.5
[3]	training's mape: 1161.34	valid_1's mape: 1438.65
[4]	training's mape: 1111.75	valid_1's mape: 1375.73
[5]	training's mape: 1063.54	valid_1's mape: 1314.84
[6]	training's mape: 1018.84	valid_1's mape: 1258.12
[7]	training's mape: 977.536	valid_1's mape: 1205.39
[8]	training's mape: 936.814	valid_1's mape: 1153.98
[9]	training's mape: 898.387	valid_1's mape: 1105.52
[10]	training's mape: 861.781	valid_1's mape: 1057.92
[11]	training's mape: 826.724	valid_1's mape: 1013.62
[12]	training's mape: 794.076	valid_1's mape: 972.19
[13]	training's mape: 761.946	valid_1's mape: 932.119
[14]	training's mape: 732.208	valid_1's mape: 894.694
[15]	training's mape: 703.001	valid_1's mape: 858.046
[16]	training's mape: 676.314	valid_1's mape: 823.633
[17]	training's mape: 649.573	valid_1's mape: 790.637
[18]	training's mape: 625.683	valid_1's 

[152]	training's mape: 107.275	valid_1's mape: 108.145
[153]	training's mape: 107.266	valid_1's mape: 108.132
[154]	training's mape: 107.193	valid_1's mape: 108.043
[155]	training's mape: 106.798	valid_1's mape: 107.603
[156]	training's mape: 106.752	valid_1's mape: 107.548
[157]	training's mape: 106.551	valid_1's mape: 107.38
[158]	training's mape: 106.141	valid_1's mape: 107.127
[159]	training's mape: 105.89	valid_1's mape: 106.909
[160]	training's mape: 105.933	valid_1's mape: 106.959
[161]	training's mape: 105.541	valid_1's mape: 106.501
[162]	training's mape: 105.47	valid_1's mape: 106.427
[163]	training's mape: 105.357	valid_1's mape: 106.298
[164]	training's mape: 105.399	valid_1's mape: 106.287
[165]	training's mape: 105.18	valid_1's mape: 106.081
[166]	training's mape: 105.146	valid_1's mape: 106.062
[167]	training's mape: 104.827	valid_1's mape: 105.72
[168]	training's mape: 104.767	valid_1's mape: 105.649
[169]	training's mape: 104.555	valid_1's mape: 105.639
[170]	training'

[302]	training's mape: 92.8692	valid_1's mape: 95.221
[303]	training's mape: 92.8731	valid_1's mape: 95.2593
[304]	training's mape: 92.7647	valid_1's mape: 95.1037
[305]	training's mape: 92.7392	valid_1's mape: 95.0849
[306]	training's mape: 92.6617	valid_1's mape: 95.0439
[307]	training's mape: 92.3664	valid_1's mape: 94.6996
[308]	training's mape: 92.3521	valid_1's mape: 94.7409
[309]	training's mape: 92.2459	valid_1's mape: 94.598
[310]	training's mape: 92.234	valid_1's mape: 94.6066
[311]	training's mape: 92.2817	valid_1's mape: 94.6393
[312]	training's mape: 92.2146	valid_1's mape: 94.5813
[313]	training's mape: 92.2451	valid_1's mape: 94.625
[314]	training's mape: 92.2802	valid_1's mape: 94.653
[315]	training's mape: 92.2206	valid_1's mape: 94.5589
[316]	training's mape: 92.2207	valid_1's mape: 94.6012
[317]	training's mape: 92.215	valid_1's mape: 94.6187
[318]	training's mape: 92.1859	valid_1's mape: 94.6042
[319]	training's mape: 92.1388	valid_1's mape: 94.4153
[320]	training's

[452]	training's mape: 86.4114	valid_1's mape: 90.6309
[453]	training's mape: 86.3475	valid_1's mape: 90.8202
[454]	training's mape: 86.3226	valid_1's mape: 90.7877
[455]	training's mape: 86.2209	valid_1's mape: 90.7156
[456]	training's mape: 86.2094	valid_1's mape: 90.7139
[457]	training's mape: 86.2263	valid_1's mape: 90.7406
[458]	training's mape: 86.1715	valid_1's mape: 90.7961
[459]	training's mape: 86.1971	valid_1's mape: 90.8344
[460]	training's mape: 86.2048	valid_1's mape: 90.9337
[461]	training's mape: 86.207	valid_1's mape: 90.9495
[462]	training's mape: 86.1595	valid_1's mape: 90.8928
[463]	training's mape: 86.1478	valid_1's mape: 90.8891
[464]	training's mape: 86.1137	valid_1's mape: 90.871
[465]	training's mape: 86.1253	valid_1's mape: 90.8802
[466]	training's mape: 86.1049	valid_1's mape: 90.8519
[467]	training's mape: 86.0962	valid_1's mape: 90.856
[468]	training's mape: 86.0592	valid_1's mape: 90.8448
[469]	training's mape: 85.9535	valid_1's mape: 90.4759
[470]	trainin

[602]	training's mape: 82.377	valid_1's mape: 88.9468
[603]	training's mape: 82.3739	valid_1's mape: 88.9429
[604]	training's mape: 82.3243	valid_1's mape: 88.8863
[605]	training's mape: 82.3403	valid_1's mape: 88.948
[606]	training's mape: 82.3014	valid_1's mape: 88.8919
[607]	training's mape: 82.323	valid_1's mape: 88.8952
[608]	training's mape: 82.3207	valid_1's mape: 88.8994
[609]	training's mape: 82.3194	valid_1's mape: 88.9167
[610]	training's mape: 82.3151	valid_1's mape: 88.9265
[611]	training's mape: 82.2643	valid_1's mape: 88.9192
[612]	training's mape: 82.2407	valid_1's mape: 88.9031
[613]	training's mape: 82.2443	valid_1's mape: 88.9204
[614]	training's mape: 82.2476	valid_1's mape: 88.9347
[615]	training's mape: 82.2413	valid_1's mape: 88.9462
[616]	training's mape: 82.2207	valid_1's mape: 88.9044
[617]	training's mape: 82.2285	valid_1's mape: 88.9286
[618]	training's mape: 82.2104	valid_1's mape: 88.8985
[619]	training's mape: 82.1907	valid_1's mape: 88.8784
[620]	trainin

[752]	training's mape: 79.9135	valid_1's mape: 87.0442
[753]	training's mape: 79.9098	valid_1's mape: 87.0356
[754]	training's mape: 79.9183	valid_1's mape: 87.0551
[755]	training's mape: 79.9012	valid_1's mape: 87.0393
[756]	training's mape: 79.8836	valid_1's mape: 87.0153
[757]	training's mape: 79.7413	valid_1's mape: 86.9263
[758]	training's mape: 79.7028	valid_1's mape: 86.9356
[759]	training's mape: 79.6957	valid_1's mape: 87.0243
[760]	training's mape: 79.699	valid_1's mape: 87.0409
[761]	training's mape: 79.6906	valid_1's mape: 87.0524
[762]	training's mape: 79.7066	valid_1's mape: 87.0763
[763]	training's mape: 79.71	valid_1's mape: 87.1004
[764]	training's mape: 79.7081	valid_1's mape: 87.1074
[765]	training's mape: 79.6879	valid_1's mape: 87.0828
[766]	training's mape: 79.6787	valid_1's mape: 87.0684
[767]	training's mape: 79.6403	valid_1's mape: 86.9971
[768]	training's mape: 79.644	valid_1's mape: 86.9641
[769]	training's mape: 79.6369	valid_1's mape: 86.9805
[770]	training

[902]	training's mape: 77.9779	valid_1's mape: 86.6021
[903]	training's mape: 77.9807	valid_1's mape: 86.6197
[904]	training's mape: 77.9857	valid_1's mape: 86.6244
[905]	training's mape: 77.991	valid_1's mape: 86.6456
[906]	training's mape: 77.988	valid_1's mape: 86.6405
[907]	training's mape: 77.9755	valid_1's mape: 86.6297
[908]	training's mape: 77.9724	valid_1's mape: 86.6243
Early stopping, best iteration is:
[858]	training's mape: 78.3561	valid_1's mape: 85.7204


In [38]:
pred = model.predict(X24[train_cols].iloc[val_index])

In [39]:
print (mean_absolute_error(X24['target'].iloc[val_index],pred))
print (mape(list(X24['target'].iloc[val_index]),pred))

0.05196879611425471
86.94005098212098


In [40]:
'''
Minmax
lag = 24
0.059817944178918274
123.11137794094829
lag = 48
110
'''

'\nMinmax\nlag = 24\n0.059817944178918274\n123.11137794094829\nlag = 48\n110\n'

In [41]:
y_final = []
yhat_final = []
y_final_inv = []
yhat_final_inv = []
for ser_id, ser_data in tqdm(test.groupby('series_id')):

    # prepare the data
    if len(ser_data) > lag:
        X, y, scaler = prepare_training_data(ser_data, lag)
        yhat = model.predict(X[train_cols])

        y_final_inv += list(y)
        yhat_final_inv += list(yhat)
        yhat = scaler.inverse_transform(np.array(yhat).reshape(-1, 1)).ravel()  
        y = scaler.inverse_transform(np.array(y).reshape(-1, 1)).ravel()
        y_final += list(y)
        yhat_final += list(yhat)

100%|██████████| 625/625 [04:40<00:00,  2.23it/s]


In [42]:
print (mean_absolute_error(y_final_inv,yhat_final_inv))
print (mape(y_final_inv,yhat_final_inv))
print (mape(y_final,yhat_final))

0.07039256517667151
848.4761483355933
47.18367279955582


In [43]:
#explainer = shap.KernelExplainer(model = model.predict, data=X)
#shap_values = explainer.shap_values(X)
import eli5

In [44]:
train_cols

In [47]:
importances = model.feature_importance()
indices = np.argsort(importances)[::-1]

for i in range(len(train_cols)):
    print (train_cols[indices[i]], importances[indices[i]])

consumption_1 2075
consumption_24 1356
consumption_72 878
consumption_48 853
day_of_week 829
consumption_2 823
consumption_25 801
consumption_49 628
consumption_23 589
consumption_26 548
surface 507
consumption_3 451
consumption_71 443
consumption_13 391
consumption_50 385
consumption_4 368
consumption_12 365
consumption_47 359
consumption_27 340
consumption_6 339
consumption_22 336
consumption_18 334
consumption_11 309
consumption_36 301
consumption_14 295
consumption_51 293
consumption_70 292
consumption_60 289
consumption_19 279
consumption_5 276
consumption_28 271
consumption_16 271
consumption_21 270
consumption_17 264
consumption_10 256
consumption_46 255
consumption_7 251
consumption_8 249
consumption_59 249
consumption_66 248
consumption_52 240
consumption_45 239
consumption_69 237
consumption_61 234
consumption_20 234
consumption_43 229
consumption_37 227
consumption_54 222
consumption_35 218
consumption_9 216
consumption_68 215
consumption_65 215
consumption_38 213
consumptio

In [48]:
del X24, y24, lgb_train, lgb_val, y, y_final, y_final_inv, yhat, yhat_final, yhat_final_inv

In [49]:
def train_lagged_ts(lag_day):
    
    lag = lag_day * 24
    trainX = pd.DataFrame()
    trainy = []
    for ser_id, ser_data in tqdm(train.groupby('series_id')):

        # prepare the data
        X, y, scaler = prepare_training_data(ser_data, lag)
        trainX = pd.concat([trainX,X],axis=0)
        trainy += y
    
    for ser_id, ser_data in tqdm(test.groupby('series_id')):

        if len(ser_data) > lag:
            # prepare the data
            X, y, scaler = prepare_training_data(ser_data, lag)
            trainX = pd.concat([trainX,X],axis=0)
            trainy += y
  
    train_cols = list(trainX.columns)
    train_cols.remove('series_id')
    trainX['target'] = trainy
    
    model = lgb.LGBMRegressor(n_estimators=1000, silent=False, learning_rate=.05)
    #lgb_train = lgb.Dataset(trainX[train_cols], label=trainX['target'], free_raw_data=False)
    #model = lgb.train(param,lgb_train, 200,feval=lgb_smape)
    model = model.fit(trainX[train_cols], trainX['target'],eval_metric = lgb_smape)
    joblib.dump(model,'../lgb{}.pkl'.format(lag_day))
    return True

In [50]:
for i in range(1,8):
    train_lagged_ts(i)

100%|██████████| 758/758 [04:14<00:00,  2.97it/s]
100%|██████████| 625/625 [02:32<00:00,  4.09it/s]
100%|██████████| 758/758 [06:26<00:00,  1.96it/s]
100%|██████████| 625/625 [05:10<00:00,  2.01it/s]
100%|██████████| 758/758 [09:21<00:00,  1.35it/s]
100%|██████████| 625/625 [06:37<00:00,  1.57it/s]
100%|██████████| 758/758 [12:23<00:00,  1.02it/s]
100%|██████████| 625/625 [08:02<00:00,  1.29it/s]
100%|██████████| 758/758 [15:43<00:00,  1.25s/it]
100%|██████████| 625/625 [09:14<00:00,  1.13it/s]
100%|██████████| 758/758 [20:11<00:00,  1.60s/it]
100%|██████████| 625/625 [09:55<00:00,  1.05it/s]
100%|██████████| 758/758 [24:43<00:00,  1.96s/it] 
100%|██████████| 625/625 [11:16<00:00,  1.08s/it]


In [None]:
'''
### Results

with (0,1) scaling
1. 2 layers LSTM (24,12) =========> .465 => submission1
2. 3 layers LSTM (24,12,6) =======> .53 => submission2
3. 2 layers LSTM (12,6) with lag 12 ======> .408 => submission4
4. 2 layers LSTM (24,24) with lag 24 ======> .61 => submission5
5. 3 layers LSTM (24,24,12) with lag 24 ======> .39 => submission6 (current best)

4. 2 layers LSTM (12,6) with lag 12 bidirectional ======> .43 => submission3
5. 2 layers LSTM (24,12) with lag 24 bidirectional =====> 1.3
6. 2 layers LSTM (6,3) with lag 6 bidirectional =====> .45
7. 2 layers LSTM (12,12) with lag 12 bidirectional =====> .42
8. 2 layers LSTM (12,12) with lag 12 bidirectional with dropout =====>
9. 2 layers LSTM (12,12,6) with lag 12 bidirectional =====>
10. 2 layers LSTM (12,12,6) with lag 12 bidirectional with dropout =====>

with (-1,1) scaling
11. 2 layers LSTM (12,6) with lag 12 ======> .526
11. 2 layers LSTM (12,12) with lag 12 =========> 
12. 3 layers LSTM (24,12,6) =======> 
14. 2 layers LSTM (12,6) with lag 12 bidirectional ======> 
15. 2 layers LSTM (6,3) with lag 6 bidirectional =====>


In [13]:
models = {}
for i in range(1,8):
    models[i] = joblib.load('../lgb{}.pkl'.format(i))

In [None]:
def generate_hourly_forecast(num_pred_hours, consumption, model, scaler, lag):
    """ Uses last hour's prediction to generate next for num_pred_hours, 
        initialized by most recent cold start prediction. Inverts scale of 
        predictions before return.
    """
    # allocate prediction frame
    preds_scaled = np.zeros(num_pred_hours)
    
    # initial X is last lag values from the cold start
    X = scaler.transform(consumption.values.reshape(-1, 1))[-lag:]
    
    # forecast
    for i in range(num_pred_hours):
        # predict scaled value for next time step
        yhat = model.predict(X.reshape(1, 1, lag), batch_size=1)[0][0][0]
        preds_scaled[i] = yhat
        
        # update X to be latest data plus prediction
        X = pd.Series(X.ravel()).shift(-1).fillna(yhat).values

    # revert scale back to original range
    hourly_preds = scaler.inverse_transform(preds_scaled.reshape(-1, 1)).ravel()
    return hourly_preds

In [14]:
pred_window_to_num_preds = {'hourly': 24, 'daily': 7, 'weekly': 2}
pred_window_to_num_pred_hours = {'hourly': 24, 'daily': 7 * 24, 'weekly': 2 * 7 * 24}

In [None]:
%%time

num_test_series = my_submission.series_id.nunique()


for ser_id, pred_df in tqdm(my_submission.groupby('series_id'), 
                            total=num_test_series, 
                            desc="Forecasting from Cold Start Data"):
        
    # get info about this series' prediction window
    pred_window = pred_df.prediction_window.unique()[0]
    num_preds = pred_window_to_num_preds[pred_window]
    num_pred_hours = pred_window_to_num_pred_hours[pred_window]
    
    # prepare cold start data
    series_data = test[test.series_id == ser_id].consumption
    cold_X, cold_y, scaler = prepare_training_data(series_data, lag)
    cold_y = cold_y.reshape(cold_y.shape[0],1,1)
    # fine tune our lstm model to this site using cold start data    
    model.fit(cold_X, cold_y, epochs=1, batch_size=batch_size, verbose=0, shuffle=False)
    
    # make hourly forecasts for duration of pred window
    preds = generate_hourly_forecast(num_pred_hours, series_data, model, scaler, lag)
    
    # reduce by taking sum over each sub window in pred window
    reduced_preds = [pred.sum() for pred in np.split(preds, num_preds)]
    
    # store result in submission DataFrame
    ser_id_mask = my_submission.series_id == ser_id
    my_submission.loc[ser_id_mask, 'consumption'] = reduced_preds

In [None]:
my_submission.head(10)

In [None]:
my_submission.to_csv("../data/submmission16.csv",index=False)