In [None]:
# --- Input data and time axis ---
DATA_PATH = "data/no_exogenous/data.csv"   # CSV read by load_data()
FREQ = "h"                                 # frequency alias handed to DataFrame.asfreq
YEAR = 2021                                # rows of this year form the test set

# --- Sequence windowing and wavelet decomposition ---
LOOK_BACK = 36      # number of past rows in each LSTM input window
WAVELET = "db4"     # wavelet name passed to pywt.wavedec / pywt.waverec
LEVELS = 3          # decomposition depth -> one 'cA' plus LEVELS 'cD*' components

# --- LSTM training ---
LSTM_EPOCHS = 200
LSTM_BATCH_SIZE = 128

# --- Random forest that combines the component forecasts ---
RF_MAX_DEPTH = 10
RF_N_ESTIMATORS = 100

# --- Destination of the forecast CSV (depends on YEAR) ---
OUTPUT_PATH = "data/predictions_lstm_wavelet_rf_{}.csv".format(YEAR)

In [None]:
import random

import numpy as np
import pandas as pd
import pywt
import tensorflow as tf
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential

In [None]:
def load_data():
    """Load the CSV at DATA_PATH and return it indexed by timestamp with calendar columns.

    The 'timestamp' column is parsed as dates and becomes the index. The frame is
    then conformed to the FREQ grid with ``asfreq`` and every row containing a NaN
    (which includes grid slots that ``asfreq`` had to insert) is dropped.

    Returns:
        pandas.DataFrame: the remaining rows plus the index-derived columns
        'hour', 'day_of_week', 'month', 'year', 'is_weekend' (1 when
        dayofweek >= 5, else 0), 'quarter' and 'day_of_year'.
    """
    frame = pd.read_csv(DATA_PATH, parse_dates=['timestamp'])
    frame = frame.set_index('timestamp')
    frame = frame.asfreq(FREQ)
    frame = frame.dropna()

    stamps = frame.index
    # Insertion order of this mapping fixes the order of the new columns.
    calendar = {
        'hour': stamps.hour,
        'day_of_week': stamps.dayofweek,
        'month': stamps.month,
        'year': stamps.year,
        'is_weekend': (stamps.dayofweek >= 5).astype(int),
        'quarter': stamps.quarter,
        'day_of_year': stamps.dayofyear,
    }
    for column, values in calendar.items():
        frame[column] = values
    return frame

def decompose_signal(series):
    """Split ``series`` into one reconstructed signal per wavelet coefficient band.

    ``series`` is decomposed with ``pywt.wavedec`` using WAVELET at depth LEVELS.
    For each band, all other coefficient arrays are replaced by zeros and the
    signal is rebuilt with ``pywt.waverec``; the reconstruction is cut to
    ``len(series)`` samples.

    Returns:
        dict: keys 'cA', then 'cD{LEVELS}' down to 'cD1' (the order in which
        ``wavedec`` returns its coefficient arrays); each value is the
        reconstruction that keeps only that band.
    """
    coeffs = pywt.wavedec(series, WAVELET, level=LEVELS)
    labels = ['cA'] + [f'cD{lvl}' for lvl in range(LEVELS, 0, -1)]
    n_samples = len(series)

    comps = {}
    for keep, label in enumerate(labels):
        # Keep only the coefficients of the current band, zero out the rest.
        isolated = [band if pos == keep else np.zeros_like(band)
                    for pos, band in enumerate(coeffs)]
        # Trim the reconstruction back to the input length.
        comps[label] = pywt.waverec(isolated, WAVELET)[:n_samples]
    return comps

def create_seq(X, y, look_back=None):
    """Build sliding input windows over ``X`` paired with the target that follows each.

    For every position ``i`` in ``range(look_back, len(X))`` the window is
    ``X[i - look_back:i]`` and its target is ``y[i]``, so a window never contains
    the row whose target it is paired with.

    Args:
        X: array-like with one row per time step.
        y: indexable with at least ``len(X)`` entries; ``y[i]`` is the target
            for time step ``i``.
        look_back: number of rows per window. Defaults to the module-level
            ``LOOK_BACK`` when omitted, which matches the previous behaviour.

    Returns:
        tuple: ``(Xs, ys)`` where ``Xs`` has shape
        ``(len(X) - look_back, look_back, *X.shape[1:])`` and ``ys`` has one
        entry per window. When ``X`` has no more than ``look_back`` rows, both
        arrays are empty but ``Xs`` keeps its trailing dimensions.

    Raises:
        ValueError: if ``look_back`` is not a positive integer.
    """
    if look_back is None:
        look_back = LOOK_BACK
    if look_back <= 0:
        raise ValueError(f"look_back must be positive, got {look_back}")

    X = np.asarray(X)
    ends = range(look_back, len(X))
    if len(ends) == 0:
        # Not enough rows for a single window: return correctly shaped empties
        # so callers can still inspect Xs.shape[1:].
        empty_X = np.empty((0, look_back) + X.shape[1:], dtype=X.dtype)
        empty_y = np.empty((0,), dtype=np.asarray(y).dtype)
        return empty_X, empty_y

    Xs = [X[end - look_back:end] for end in ends]
    ys = [y[end] for end in ends]
    return np.array(Xs), np.array(ys)

In [None]:
# Split by calendar year: the test set is YEAR, the training set is the years
# strictly between YEAR - 5 and YEAR.
data = load_data()
year_col = data['year']
train = data[(year_col < YEAR) & (year_col > YEAR - 5)]
test = data[year_col == YEAR]

# Both scalers are fitted on the training rows only and then reused for the test rows.
calendar_cols = ['hour', 'day_of_week', 'month', 'is_weekend', 'quarter', 'day_of_year']
scaler_feat = StandardScaler().fit(train[calendar_cols])
scaler_tgt = StandardScaler().fit(train[['value']])

train_feat = scaler_feat.transform(train[calendar_cols])
test_feat = scaler_feat.transform(test[calendar_cols])
train_tgt = scaler_tgt.transform(train[['value']]).flatten()

# One windowed dataset per wavelet component: the inputs are always the scaled
# calendar features plus the scaled target, only the label series changes.
components = decompose_signal(train_tgt)
train_matrix = np.hstack([train_feat, train_tgt.reshape(-1, 1)])
datasets = {}
for name, comp in components.items():
    datasets[name] = create_seq(train_matrix, comp)

In [None]:
# Seed every RNG that weight initialisation, dropout and batch shuffling may draw
# from, so that Restart & Run All trains the same models each time (the random
# forest below is already pinned with random_state=42).
# NOTE(review): seeding alone may not give bit-identical results on every
# hardware backend - confirm if exact reproducibility is required.
LSTM_SEED = 42
random.seed(LSTM_SEED)
np.random.seed(LSTM_SEED)
tf.random.set_seed(LSTM_SEED)

# Train one LSTM per wavelet component; all models share the same architecture.
models = {}
for name, (Xs, ys) in datasets.items():
    m = Sequential([
        # The first two LSTM layers emit the full sequence so the next LSTM
        # layer receives one vector per time step.
        LSTM(128, return_sequences=True, input_shape=(LOOK_BACK, Xs.shape[2])),
        LSTM(64, return_sequences=True),
        LSTM(36),
        Dropout(0.2),
        Dense(1)  # single output: the component value at the step after the window
    ])
    m.compile('adam', 'mse')
    m.fit(Xs, ys, epochs=LSTM_EPOCHS, batch_size=LSTM_BATCH_SIZE, verbose=1)
    models[name] = m

In [None]:
# Random forest that maps [scaled calendar features, wavelet components] to the
# scaled target. Components are appended in sorted-name order.
rf_columns = [train_feat]
for name in sorted(components.keys()):
    rf_columns.append(components[name].reshape(-1, 1))
X_rf = np.hstack(rf_columns)
y_rf = train_tgt

rf = RandomForestRegressor(
    n_estimators=RF_N_ESTIMATORS,
    max_depth=RF_MAX_DEPTH,
    random_state=42,
)
rf.fit(X_rf, y_rf)

In [None]:
# Recursive one-step-ahead forecast over the test rows (all values stay in
# scaled units). `buf` is the rolling window of the last LOOK_BACK rows of
# [calendar features, target]; it starts with the tail of the training data and
# is advanced with each new prediction.
buf = np.hstack([train_feat, train_tgt.reshape(-1,1)])[-LOOK_BACK:]
# Sorted-name order matches the column order the random forest was fitted with.
component_order = sorted(models.keys())
preds = []
for i in range(len(test_feat)):
    inp = buf.reshape(1, LOOK_BACK, buf.shape[1])
    # verbose=0: predict() is called once per component per test row, and the
    # default progress output would flood the notebook.
    comp_pred = [models[name].predict(inp, verbose=0)[0,0] for name in component_order]
    rf_in = np.hstack([test_feat[i], comp_pred]).reshape(1, -1)
    p = rf.predict(rf_in)[0]
    preds.append(p)
    # Slide the window: drop the oldest row, append this step's features + prediction.
    buf = np.vstack([buf[1:], np.hstack([test_feat[i], p])])


In [None]:
# Convert the scaled forecasts back to the original units. The result goes into
# a new name so `preds` keeps the scaled values and re-running this cell does
# not apply the inverse transform a second time.
preds_unscaled = scaler_tgt.inverse_transform(np.asarray(preds).reshape(-1, 1)).ravel()

# Replace the observed test values with the forecasts, keeping the calendar columns.
out = test.copy()
out['value'] = preds_unscaled

# Write the timestamp index as well; with index=False the file would carry no
# timestamps for the predicted rows.
out.to_csv(OUTPUT_PATH, index=True, index_label='timestamp')