In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Bidirectional
from keras.models import load_model
from keras import callbacks, regularizers, optimizers
from keras.regularizers import L1L2
from keras_tuner import RandomSearch, Objective

In [None]:
# RUN IT: define a function check_stationarity(series)
from statsmodels.tsa.stattools import adfuller

def check_stationarity(series):
    """Run the Augmented Dickey-Fuller test on ``series`` and print the outcome.

    Prints the ADF statistic, the p-value and every critical value, followed
    by a colored verdict. The verdict is "Stationary" only when the p-value
    is at most 0.05 and the statistic lies below the 5% critical value.

    Parameters
    ----------
    series : object exposing ``.values``
        The observations handed to ``adfuller``.
    """
    adf_result = adfuller(series.values)
    adf_statistic = adf_result[0]
    p_value = adf_result[1]
    critical_values = adf_result[4]

    print('ADF Statistic: %f' % adf_statistic)
    print('p-value: %f' % p_value)
    print('Critical Values:')
    for level, threshold in critical_values.items():
        print('\t%s: %.3f' % (level, threshold))

    # Both criteria must agree before the series is called stationary.
    looks_stationary = (p_value <= 0.05) & (critical_values['5%'] > adf_statistic)
    if not looks_stationary:
        print("\x1b[31mNon-stationary\x1b[0m")
        return
    print("\u001b[32mStationary\u001b[0m")

# Data Preparation

Useful links
* https://towardsdatascience.com/breaking-the-curse-of-small-data-sets-in-machine-learning-part-2-894aa45277f4
* cross validation: https://github.com/Saktan/RNN-LSTM-with-Cross-Validation-for-Bitcoin-Price-Prediction/blob/master/Major_pro_final_1.ipynb
* https://www.sciencedirect.com/science/article/abs/pii/S0925231218311639: the preprocessing in this paper includes noise reduction, but according to http://cs230.stanford.edu/projects_fall_2021/reports/102851552.pdf, noise can be useful, so I will not remove the noise.
* An example of using the time series generator: https://medium.com/@cdabakoglu/time-series-forecasting-arima-lstm-prophet-with-python-e73a750a9887. However, that implementation is not well designed.
* https://www.kaggle.com/code/amar09/lstm-for-univariate-ts-forecasting
* Comparison of ARIMA and LSTM on univariate feature: https://acikerisim.sakarya.edu.tr/bitstream/handle/20.500.12619/45547/10.3846jbem.2019.10190.pdf?sequence=1&isAllowed=y
* LSTM is worse than ARIMA: https://thesis.eur.nl/pub/53546/Cracan_Thesis.pdf
* The two references above show that the ARIMA-LSTM hybrid model performs best
* The setting of the recurrent dropout layer: https://stackoverflow.com/questions/44924690/keras-the-difference-between-lstm-dropout-and-lstm-recurrent-dropout

In [None]:
# Original weekly booking totals, log-transformed below.
# `squeeze=True` was removed from `pd.read_csv` in pandas 2.0; squeezing the
# single-column frame afterwards is the documented replacement and works on
# older versions as well.
series = pd.read_csv('./data/weekly_sum.csv', index_col=0, parse_dates=True).squeeze('columns')
# Positional week index sized from the data instead of a hard-coded 312.
data = pd.DataFrame({'bookings': series.values}, index=range(len(series)))
data['bookings'] = np.log(data['bookings'])

In [None]:
# The data after log transformation.
# `squeeze=True` was removed from `pd.read_csv` in pandas 2.0; squeezing the
# single-column frame afterwards is the documented replacement.
series = pd.read_csv('./data/preprocessed_data.csv', index_col=0, parse_dates=True).squeeze('columns')
# Positional week index sized from the data instead of a hard-coded 312.
data = pd.DataFrame({'bookings': series.values}, index=range(len(series)))

# Hold out the last 52 observations as the test set.
test_size = 52
data_train, data_test = data[:-test_size], data[-test_size:]

print(data_train.shape, data_test.shape)
check_stationarity(data)

## Transform data into the problem scale

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Learn the min-max range from the training split only, then apply the same
# transform to the test split.
scaler = MinMaxScaler()
scaled_train_data = scaler.fit_transform(data_train)
scaled_test_data = scaler.transform(data_test)

# Flat 1-D array: scaled training values followed by scaled test values.
scaled_data = np.concatenate([np.ravel(scaled_train_data), np.ravel(scaled_test_data)])

## Transform data into supervised learning

In [None]:
# only generate on scaled train data. The test date will use the 5th year scaled values as X.

from keras.preprocessing.sequence import TimeseriesGenerator

def data_generator(data, n_input, n_features):
    """Wrap ``data`` in a ``TimeseriesGenerator`` that uses ``data`` as its own target.

    Every sample is a window of ``n_input`` consecutive entries; batches
    contain a single sample. ``n_features`` is accepted but not used.
    """
    return TimeseriesGenerator(data=data, targets=data, length=n_input, batch_size=1)

## Inverse versions

In [None]:
# Performance report used for every model in this notebook.
def lstm_report(y_true, y_pred):
    """Return ``(rmse, mse, mae, mape)`` for ``y_pred`` measured against ``y_true``."""
    squared_error = mean_squared_error(y_true, y_pred)
    absolute_error = mean_absolute_error(y_true, y_pred)
    percentage_error = mean_absolute_percentage_error(y_true, y_pred)
    # RMSE is the square root of the MSE computed above.
    return np.sqrt(squared_error), squared_error, absolute_error, percentage_error

# Hyperparameter Tuning

* hyperparameters:
* the length of the feature. 1, 13, 52. - choose the best
* the architecture: the neuron numbers, the hidden layer numbers, dropout layers, etc. Save the best model.

## Self-defined functions of building models

In [None]:
# Training windows: 13 consecutive scaled values -> the next value.
d_train = data_generator(scaled_train_data, 13, 1)
# Test windows: prepend only the last 13 training values (one window length) so
# that every generated target is a test-set value. Concatenating along axis 0
# keeps the 2-D layout of `scaled_train_data`; `np.append` without an axis
# flattened the data to 1-D, giving batches shaped differently from `d_train`.
d_test = data_generator(np.concatenate([scaled_train_data[-13:], scaled_test_data], axis=0), 13, 1)

### create models

In [None]:
def create_vanilla(n_input, neuron):
    """Build and compile a single-layer LSTM regressor.

    The network is one ReLU-activated LSTM with ``neuron`` units reading
    inputs of shape ``(n_input, 1)``, followed by a one-unit dense output.
    It is compiled with Adam and mean-squared-error loss, tracking ``'mse'``.
    """
    layers = [
        LSTM(neuron, activation='relu', input_shape=(n_input, 1)),
        Dense(1),
    ]
    lstm_model = Sequential(layers)
    lstm_model.compile(optimizer='adam', loss=keras.losses.MeanSquaredError(), metrics=['mse'])
    return lstm_model

### callbacks

In [None]:
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

def create_callbacks():
    """Build the training callbacks, all of which monitor the training loss.

    Returns
    -------
    list
        ``[EarlyStopping, ReduceLROnPlateau, ModelCheckpoint]``. The checkpoint
        writes the best weights seen so far to ``./model/lstm.h5``.
    """
    early_stopping = EarlyStopping(monitor='loss', patience=5, verbose=1)
    # The learning-rate floor must sit below the starting rate: the models in
    # this notebook compile with optimizer='adam' (default rate 0.001), so the
    # previous min_lr=0.001 meant the rate could never be reduced. The patience
    # is also shorter than the early-stopping patience; with both at 5 training
    # stopped in the same epoch the first reduction would have happened.
    reduce_lr = ReduceLROnPlateau(monitor='loss', mode='min', patience=2, min_lr=1e-5, verbose=1)
    model_checkpoint = ModelCheckpoint(
        filepath='./model/lstm.h5',
        monitor='loss',
        save_weights_only=True,
        save_best_only=True,
        verbose=1,
    )
    # Returned directly rather than via a local named `callbacks`, which
    # shadowed the `callbacks` module imported from keras.
    return [early_stopping, reduce_lr, model_checkpoint]

### Prediction

In [None]:
# Walk-forward validation: every prediction is made from *observed* values.
# Note that this function reads the notebook-level `scaled_data` and `scaler`.
def predict_lstm(n_input, model, test_data):
    """Predict each test step from the ``n_input`` real observations before it.

    Parameters
    ----------
    n_input : int
        Window length fed to the model.
    model : object with ``predict(batch, verbose=0)``
        Fitted model taking a batch shaped ``(1, n_input, 1)``.
    test_data : pandas object
        Hold-out data; only its length and index are used.

    Returns
    -------
    pandas.Series
        Predictions on the original scale (via ``scaler.inverse_transform``),
        indexed like ``test_data``.

    Raises
    ------
    ValueError
        If ``scaled_data`` holds fewer than ``len(test_data) + n_input`` values.
    """
    n_test = len(test_data)
    needed = n_test + n_input
    if len(scaled_data) < needed:
        # A negative slice start would silently clamp and misalign the windows.
        raise ValueError(
            f"scaled_data has {len(scaled_data)} values, need at least {needed}")

    # The last `n_input` pre-test values followed by the test horizon. The
    # horizon length comes from `test_data` instead of a hard-coded 52.
    history = scaled_data[-needed:]

    lstm_predictions_scaled = []
    for i in range(n_test):
        current_batch = history[i:i + n_input].reshape((1, n_input, 1))
        lstm_predictions_scaled.append(model.predict(current_batch, verbose=0)[0])

    lstm_predictions = scaler.inverse_transform(lstm_predictions_scaled)
    return pd.Series(np.ravel(lstm_predictions), index=test_data.index)

## hp0: the validation method

In [None]:
# The old ("classical") way to test the vanilla model: each new prediction is
# fed back in as an input for the next step, instead of using the real data.
# Note that this function reads the notebook-level `scaled_train_data` and `scaler`.
def predict_lstm0(n_input, model, test_data):
    """Forecast recursively, starting from the last ``n_input`` training values.

    Parameters
    ----------
    n_input : int
        Window length fed to the model.
    model : object with ``predict(batch, verbose=0)``
        Fitted model taking a batch shaped ``(1, n_input, 1)``.
    test_data : pandas object
        Hold-out data; only its length and index are used.

    Returns
    -------
    pandas.Series
        Predictions on the original scale (via ``scaler.inverse_transform``),
        indexed like ``test_data``.
    """
    current_batch = scaled_train_data[-n_input:].reshape((1, n_input, 1))

    lstm_predictions_scaled = []
    for _ in range(len(test_data)):
        lstm_pred = model.predict(current_batch, verbose=0)[0]
        lstm_predictions_scaled.append(lstm_pred)
        # Slide the window: drop the oldest value, append the new prediction.
        current_batch = np.append(current_batch[:, 1:, :], [[lstm_pred]], axis=1)

    lstm_predictions = scaler.inverse_transform(lstm_predictions_scaled)
    # Flatten by actual length and index by `test_data` instead of assuming
    # exactly 52 steps and the notebook-level `data_test`.
    return pd.Series(np.ravel(lstm_predictions), index=test_data.index)

In [None]:
# Build and train the 13-step baseline here: this section uses `vanilla`, which
# was otherwise first created in a later cell, so running the notebook top to
# bottom on a fresh kernel raised a NameError at this point.
vanilla = create_vanilla(13, 512)
vanilla.fit(d_train, epochs=20)

# Recursive ("classical") forecast, compared against the held-out year.
lstm_predictions = predict_lstm0(13, vanilla, data_test)

plt.figure()
plt.plot(data_test, label = 'y_true')
plt.plot(lstm_predictions, label = 'y_pred')
plt.title('Univariate Vanilla LSTM Predictions, input_length = 13')
plt.legend()
plt.show()

In [None]:
# Results table: one row per model, one column per error metric.
model_selection = pd.DataFrame(columns=['rmse', 'mse', 'mae', 'mape'])

model_selection.loc['Vanilla LSTM, Classical Prediction'] = list(
    lstm_report(data_test, predict_lstm0(13, vanilla, data_test))
)
model_selection

## hp1: input size

* Let's check the first hyperparameter: the length of the input window

In [None]:
# One 512-unit vanilla model per candidate input length: 1, 13 and 52.
vanilla_1, vanilla_2, vanilla_3 = [create_vanilla(length, 512) for length in (1, 13, 52)]

In [None]:
# Train each candidate for 20 epochs on windows of its own input length
# (1, 13 and 52 steps respectively), all built from the scaled training data.
vanilla_1.fit(data_generator(scaled_train_data, 1, 1),epochs=20)
vanilla_2.fit(data_generator(scaled_train_data, 13, 1),epochs=20)
vanilla_3.fit(data_generator(scaled_train_data, 52, 1),epochs=20)

In [None]:
# Walk-forward test metrics for each candidate input length.
input_len_tune = pd.DataFrame(columns=['rmse', 'mse', 'mae', 'mape'])
for window, fitted_model in ((1, vanilla_1), (13, vanilla_2), (52, vanilla_3)):
    scores = lstm_report(data_test, predict_lstm(window, fitted_model, data_test))
    input_len_tune.loc[f"vanilla LSTM, n_input = {str(window)}"] = list(scores)
input_len_tune

In [None]:
# Plot the results.
# From the plots we can see that with input length = 1 or 52 the model is
# unable to capture the patterns.
for window, fitted_model in ((1, vanilla_1), (13, vanilla_2), (52, vanilla_3)):
    lstm_predictions = predict_lstm(window, fitted_model, data_test)

    plt.figure()
    plt.plot(data_test, label = 'y_true')
    plt.plot(lstm_predictions, label = 'y_pred')
    plt.title(f'Vanilla LSTM Predictions, input length = {window}')
    plt.legend()
    plt.show()

In [None]:
# n_input = 13 was judged to capture the patterns best in the comparison above,
# so build and train the 13-step vanilla model as the new baseline.
vanilla = create_vanilla(13, 512)
vanilla.fit(d_train, epochs=20)

In [None]:
# Training-loss curve of the baseline model, one point per epoch.
loss_history = vanilla.history.history['loss']
epochs = np.arange(1, len(loss_history) + 1)

plt.figure()
plt.title('Loss History of Univariate Vanilla LSTM')
plt.xlabel("Epochs")
plt.ylabel("Loss")
# Plot against 1-based epoch numbers so each point sits on its tick; the
# curve used to be drawn at x = 0..19 under ticks labelled 1..20.
plt.plot(epochs, loss_history, label = "loss")
plt.xticks(epochs)
plt.legend()
plt.show()

In [None]:
# Walk-forward test metrics of the baseline vanilla model.
model_selection.loc['Vanilla LSTM'] = list(
    lstm_report(data_test, predict_lstm(13, vanilla, data_test))
)
model_selection

In [None]:
# Walk-forward predictions of the baseline vanilla model against the test data.
lstm_predictions = predict_lstm(13, vanilla, data_test)

plt.figure()
plt.plot(data_test, label = 'y_true')
plt.plot(lstm_predictions, label = 'y_pred')
plt.title('Univariate Vanilla LSTM Predictions, input_length = 13')
plt.legend()
plt.show()

### Save model

In [None]:
# Save the baseline model, then load it back from the same path. The loaded
# model is not bound to a name, so it only appears as this cell's output.
vanilla.save('./model/vanilla LSTM.h5')
load_model('./model/vanilla LSTM.h5')

## hp2: architecture

* Random search over: the number of neurons in each hidden layer, the regularizers (L1L2 plus a dropout layer), and the learning rate of the Adam optimizer
* Tuned hyperparameter: adding more layers
* The random search approach did not work, so I will tune the architecture manually instead.

### Stacked LSTM 1, hidden layer = 2

In [None]:
# Vanilla: unit = 512
# Stack: hidden layer = 1, 2, 3
# A dropout variant of the stack2 model is added later because its performance
# decreased, which might indicate overfitting.

def create_2_stack(n_input, neuron1, neuron2):
    """Build and compile a two-layer stacked LSTM regressor.

    The first LSTM (``neuron1`` units, ReLU) reads inputs of shape
    ``(n_input, 1)`` and returns the full sequence; the second LSTM
    (``neuron2`` units, default activation) feeds a one-unit dense output.
    Compiled with Adam and mean-squared-error loss, tracking ``'mse'``.
    """
    layers = [
        LSTM(neuron1, activation='relu', input_shape=(n_input, 1), return_sequences=True),
        LSTM(neuron2),
        Dense(1),
    ]
    lstm_model = Sequential(layers)
    lstm_model.compile(optimizer='adam', loss=keras.losses.MeanSquaredError(), metrics=['mse'])
    return lstm_model

In [None]:
# Train the two-layer stacked LSTM on 13-step windows.
stack1 = create_2_stack(13, 512, 512)
stack1.fit(d_train, epochs=20)

# Plot the loss against 1-based epoch numbers so each point sits on its tick;
# the curve used to be drawn at x = 0..19 under ticks labelled 1..20.
loss_history = stack1.history.history['loss']
epochs = np.arange(1, len(loss_history) + 1)

plt.figure()
plt.title('Loss History of Univariate Stacked LSTM')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.plot(epochs, loss_history, label = "loss")
plt.xticks(epochs)
plt.legend()
plt.show()

# Walk-forward predictions against the test data.
lstm_predictions = predict_lstm(13, stack1, data_test)

plt.figure()
plt.plot(data_test, label = 'y_true')
plt.plot(lstm_predictions, label = 'y_pred')
plt.title('Univariate Stacked LSTM Predictions, input_length = 13')
plt.legend()
plt.show()

In [None]:
# Walk-forward test metrics of the two-layer stacked model.
model_selection.loc['Stack LSTM, 2 hidden layers'] = list(
    lstm_report(data_test, predict_lstm(13, stack1, data_test))
)
model_selection

### Stacked LSTM, hidden layer = 3

In [None]:
def create_3_stack(n_input, neuron1, neuron2, neuron3):
    """Build and compile a three-layer stacked LSTM regressor.

    The first LSTM (``neuron1`` units, ReLU) reads inputs of shape
    ``(n_input, 1)``; it and the second LSTM (``neuron2`` units) return full
    sequences, and the third LSTM (``neuron3`` units) feeds a one-unit dense
    output. Compiled with Adam and mean-squared-error loss, tracking ``'mse'``.
    """
    layers = [
        LSTM(neuron1, activation='relu', input_shape=(n_input, 1), return_sequences=True),
        LSTM(neuron2, return_sequences=True),
        LSTM(neuron3),
        Dense(1),
    ]
    lstm_model = Sequential(layers)
    lstm_model.compile(optimizer='adam', loss=keras.losses.MeanSquaredError(), metrics=['mse'])
    return lstm_model

In [None]:
# Train the three-layer stacked LSTM on 13-step windows.
stack2 = create_3_stack(13, 512, 512, 512)
stack2.fit(d_train, epochs=20)

# Plot the loss against 1-based epoch numbers so each point sits on its tick;
# the curve used to be drawn at x = 0..19 under ticks labelled 1..20.
loss_history = stack2.history.history['loss']
epochs = np.arange(1, len(loss_history) + 1)

plt.figure()
plt.title('Loss History of Univariate Stacked LSTM')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.plot(epochs, loss_history, label = "loss")
plt.xticks(epochs)
plt.legend()
plt.show()

# Walk-forward predictions against the test data.
lstm_predictions = predict_lstm(13, stack2, data_test)

plt.figure()
plt.plot(data_test, label = 'y_true')
plt.plot(lstm_predictions, label = 'y_pred')
plt.title('Univariate Stacked LSTM Predictions, input_length = 13')
plt.legend()
plt.show()

In [None]:
# Walk-forward test metrics of the three-layer stacked model.
model_selection.loc['Stack LSTM, 3 hidden layers'] = list(
    lstm_report(data_test, predict_lstm(13, stack2, data_test))
)
model_selection

### Stacked LSTM, hidden layer = 3, dropout layer

In [None]:
def create_3_stack2(n_input, neuron1, neuron2, neuron3, dropout_rate):
    """Build and compile a three-layer stacked LSTM with a regularized last layer.

    Same layout as the plain three-layer stack, except that the third LSTM
    uses ``recurrent_dropout=dropout_rate`` and an
    ``L1L2(l1=0.001, l2=0.001)`` recurrent regularizer. Compiled with Adam
    and mean-squared-error loss, tracking ``'mse'``.
    """
    regularized_top = LSTM(
        neuron3,
        recurrent_dropout = dropout_rate,
        recurrent_regularizer=L1L2(l1=0.001, l2=0.001),
    )
    layers = [
        LSTM(neuron1, activation='relu', input_shape=(n_input, 1), return_sequences=True),
        LSTM(neuron2, return_sequences=True),
        regularized_top,
        Dense(1),
    ]
    lstm_model = Sequential(layers)
    lstm_model.compile(optimizer='adam', loss=keras.losses.MeanSquaredError(), metrics=['mse'])
    return lstm_model

In [None]:
# Train the three-layer stacked LSTM with recurrent dropout 0.1 on 13-step windows.
stack3 = create_3_stack2(13, 512, 512, 512, 0.1)
stack3.fit(d_train, epochs=20)

# Plot the loss against 1-based epoch numbers so each point sits on its tick;
# the curve used to be drawn at x = 0..19 under ticks labelled 1..20.
loss_history = stack3.history.history['loss']
epochs = np.arange(1, len(loss_history) + 1)

plt.figure()
plt.title('Loss History of Univariate Stacked LSTM')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.plot(epochs, loss_history, label = "loss")
plt.xticks(epochs)
plt.legend()
plt.show()

# Walk-forward predictions against the test data.
lstm_predictions = predict_lstm(13, stack3, data_test)

plt.figure()
plt.plot(data_test, label = 'y_true')
plt.plot(lstm_predictions, label = 'y_pred')
plt.title('Univariate Stacked LSTM Predictions, input_length = 13')
plt.legend()
plt.show()

In [None]:
# Walk-forward test metrics of the regularized three-layer stacked model.
model_selection.loc['Stack LSTM, 3 hidden layers, dropout rate = 0.1'] = list(
    lstm_report(data_test, predict_lstm(13, stack3, data_test))
)
model_selection

## [Expired] hp3: stationarity

The idea was to train on the stationary (differenced) data and compare the result with the one obtained on the non-stationary data using the same hyperparameters. However, the differenced predictions cannot be compared with ARIMA, because there is no way to invert the differencing of the predictions.

In [None]:
# The data after log transformation (reloaded for the stationarity experiment).
# `squeeze=True` was removed from `pd.read_csv` in pandas 2.0; squeezing the
# single-column frame afterwards is the documented replacement.
series = pd.read_csv('./data/preprocessed_data.csv', index_col=0, parse_dates=True).squeeze('columns')
# Positional week index sized from the data instead of a hard-coded 312.
data = pd.DataFrame({'bookings': series.values}, index=range(len(series)))

# Hold out the last 52 observations as the test set.
test_size = 52
data_train, data_test = data[:-test_size], data[-test_size:]

print(data_train.shape, data_test.shape)
check_stationarity(data)

In [None]:
# First-difference the series. The first observation has no predecessor and is
# dropped, so the differenced data is indexed from 1 (week 2) to the last week.
diff_series = data.squeeze().diff().dropna()
# Index sized from the data instead of the hard-coded range(1, 312).
diff_data = pd.DataFrame({'bookings': diff_series.values}, index=range(1, len(diff_series) + 1))
check_stationarity(diff_data)

# Hold out the last 52 differences as the test set.
test_size = 52
diff_data_train, diff_data_test = diff_data[:-test_size], diff_data[-test_size:]

print(diff_data_train.shape, diff_data_test.shape)

In [None]:
from sklearn.preprocessing import MinMaxScaler
diff_scaler = MinMaxScaler()

# Fit on the training split only, as is done for the undifferenced data.
# Fitting on the full `diff_data` let the test-set minimum and maximum leak
# into the scaling of the training data.
diff_scaler.fit(diff_data_train)
diff_scaled_train_data = diff_scaler.transform(diff_data_train)
diff_scaled_test_data = diff_scaler.transform(diff_data_test)
# Flat 1-D array: scaled training differences followed by scaled test differences.
diff_scaled_data = np.append(diff_scaled_train_data, diff_scaled_test_data)
# Generators are only built on the scaled training data; the test data uses
# the preceding scaled values as X.

In [None]:
# Training windows on the differenced series: 13 values -> the next value.
diff_d_train = data_generator(diff_scaled_train_data, 13, 1)
# Test windows: the last 13 training values followed by the test split, so
# that every target is a test-set value. Previously the generator was built
# on the whole flattened series, so most of its targets were training data
# and its batches had a different layout from `diff_d_train`.
diff_d_test = data_generator(
    np.concatenate([diff_scaled_train_data[-13:], diff_scaled_test_data], axis=0), 13, 1)

In [None]:
# Vanilla LSTM (13-step input) trained on the differenced, scaled series.
diff_vanilla = create_vanilla(13, 512)
diff_vanilla.fit(diff_d_train, epochs=20)

# Plot the loss against 1-based epoch numbers so each point sits on its tick;
# the curve used to be drawn at x = 0..19 under ticks labelled 1..20.
loss_history = diff_vanilla.history.history['loss']
epochs = np.arange(1, len(loss_history) + 1)

plt.figure()
# The model trained in this cell is the vanilla one, not a stacked one.
plt.title('Loss History of Univariate Vanilla LSTM')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.plot(epochs, loss_history, label = "loss")
plt.xticks(epochs)
plt.legend()
plt.show()

# Walk-forward prediction on the differenced scale. `predict_lstm` reads the
# notebook-level `scaled_data` and `scaler`, which describe the undifferenced
# series, so it would feed this model inputs from a different scale than it
# was trained on. The same procedure is therefore run here with the
# differenced data and its own scaler.
window = 13
n_test = len(diff_data_test)
diff_history = diff_scaled_data[-(n_test + window):]
diff_predictions_scaled = [
    diff_vanilla.predict(diff_history[i:i + window].reshape((1, window, 1)), verbose=0)[0]
    for i in range(n_test)
]
lstm_predictions = pd.Series(
    np.ravel(diff_scaler.inverse_transform(diff_predictions_scaled)),
    index=diff_data_test.index,
)

# Compare against the differenced test data, which is what the model predicts.
plt.figure()
plt.plot(diff_data_test, label = 'y_true')
plt.plot(lstm_predictions, label = 'y_pred')
plt.title('Univariate Vanilla LSTM Predictions, input_length = 13')
plt.legend()
plt.show()

# hp 4: model type

In [None]:
def create_bidirectional(n_input, neuron):
    """Build and compile a single-layer bidirectional LSTM regressor.

    A ReLU-activated LSTM with ``neuron`` units (declared with input shape
    ``(n_input, 1)``) is wrapped in ``Bidirectional`` and followed by a
    one-unit dense output. Compiled with Adam and mean-squared-error loss,
    tracking ``'mse'``.
    """
    recurrent = LSTM(neuron, activation="relu", input_shape=(n_input, 1))
    lstm_model = Sequential([Bidirectional(recurrent), Dense(1)])
    lstm_model.compile(optimizer='adam', loss=keras.losses.MeanSquaredError(), metrics=['mse'])
    return lstm_model

In [None]:
# Train the single-layer bidirectional LSTM on 13-step windows.
bidirectional = create_bidirectional(13, 512)
bidirectional.fit(d_train, epochs=20)

# Plot the loss against 1-based epoch numbers so each point sits on its tick;
# the curve used to be drawn at x = 0..19 under ticks labelled 1..20.
loss_history = bidirectional.history.history['loss']
epochs = np.arange(1, len(loss_history) + 1)

plt.figure()
# Title typo fixed ("Biodirectional").
plt.title('Loss History of Univariate Bidirectional LSTM')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.plot(epochs, loss_history, label = "loss")
plt.xticks(epochs)
plt.legend()
plt.show()

In [None]:
# Walk-forward predictions of the single-layer bidirectional model against the test data.
lstm_predictions = predict_lstm(13, bidirectional, data_test)

plt.figure()
plt.plot(data_test, label = 'y_true')
plt.plot(lstm_predictions, label = 'y_pred')
plt.title('Univariate Bidirectional LSTM Predictions, input_length = 13')
plt.legend()
plt.show()

In [None]:
# (rmse, mse, mae, mape) of the single-layer bidirectional model on the test data.
lstm_report(data_test, predict_lstm(13, bidirectional, data_test))

In [None]:
# Persist the single-layer bidirectional model.
# NOTE(review): this path has no extension, unlike the '.h5' paths used
# elsewhere in this notebook - confirm the intended save format.
bidirectional.save('./model/bidirectional_r')

## BI-LSTM with multiple layers = 2

In [None]:
def create_stacked_bidirectional(n_input, neuron):
    """Build and compile a two-layer bidirectional LSTM regressor.

    The first bidirectional LSTM (``neuron`` units, ReLU, declared with input
    shape ``(n_input, 1)``) returns the full sequence; the second
    bidirectional LSTM (``neuron`` units) feeds a one-unit dense output.
    Compiled with Adam and mean-squared-error loss, tracking ``'mse'``.
    """
    layers = [
        Bidirectional(LSTM(neuron, activation="relu", input_shape=(n_input, 1), return_sequences=True)),
        Bidirectional(LSTM(neuron)),
        Dense(1),
    ]
    lstm_model = Sequential(layers)
    lstm_model.compile(optimizer='adam', loss=keras.losses.MeanSquaredError(), metrics=['mse'])
    return lstm_model

In [None]:
# Train the two-layer bidirectional LSTM on 13-step windows.
bidirectional2 = create_stacked_bidirectional(13, 512)
bidirectional2.fit(d_train, epochs=20)

# Plot the loss against 1-based epoch numbers so each point sits on its tick;
# the curve used to be drawn at x = 0..19 under ticks labelled 1..20.
loss_history = bidirectional2.history.history['loss']
epochs = np.arange(1, len(loss_history) + 1)

plt.figure()
# Title typo fixed ("Biodirectional").
plt.title('Loss History of Univariate Bidirectional LSTM')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.plot(epochs, loss_history, label = "loss")
plt.xticks(epochs)
plt.legend()
plt.show()

In [None]:
# Walk-forward predictions of the two-layer bidirectional model against the test data.
lstm_predictions = predict_lstm(13, bidirectional2, data_test)

plt.figure()
plt.plot(data_test, label = 'y_true')
plt.plot(lstm_predictions, label = 'y_pred')
plt.title('Univariate Bidirectional LSTM Predictions, hidden layer = 2')
plt.legend()
plt.show()

In [None]:
# (rmse, mse, mae, mape) of the two-layer bidirectional model on the test data.
lstm_report(data_test, predict_lstm(13, bidirectional2, data_test))

In [None]:
# Persist the two-layer bidirectional model.
# NOTE(review): this path has no extension, unlike the '.h5' paths used
# elsewhere in this notebook - confirm the intended save format.
bidirectional2.save('./model/bidirectional_r2')

## BI-LSTM with multiple layers = 3

In [None]:
def create_stacked_bidirectional2(n_input, neuron):
    """Build and compile a three-layer bidirectional LSTM regressor.

    The first bidirectional LSTM (``neuron`` units, ReLU, declared with input
    shape ``(n_input, 1)``) and the second one return full sequences; the
    third bidirectional LSTM feeds a one-unit dense output. Compiled with
    Adam and mean-squared-error loss, tracking ``'mse'``.
    """
    layers = [
        Bidirectional(LSTM(neuron, activation="relu", input_shape=(n_input, 1), return_sequences=True)),
        Bidirectional(LSTM(neuron, return_sequences=True)),
        Bidirectional(LSTM(neuron)),
        Dense(1),
    ]
    lstm_model = Sequential(layers)
    lstm_model.compile(optimizer='adam', loss=keras.losses.MeanSquaredError(), metrics=['mse'])
    return lstm_model

In [None]:
# Train the three-layer bidirectional LSTM on 13-step windows.
bidirectional3 = create_stacked_bidirectional2(13, 512)
bidirectional3.fit(d_train, epochs=20)

# Plot the loss against 1-based epoch numbers so each point sits on its tick;
# the curve used to be drawn at x = 0..19 under ticks labelled 1..20.
loss_history = bidirectional3.history.history['loss']
epochs = np.arange(1, len(loss_history) + 1)

plt.figure()
# Title typo fixed ("Biodirectional").
plt.title('Loss History of Univariate Bidirectional LSTM')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.plot(epochs, loss_history, label = "loss")
plt.xticks(epochs)
plt.legend()
plt.show()

In [None]:
# Walk-forward predictions of the three-layer bidirectional model against the test data.
lstm_predictions = predict_lstm(13, bidirectional3, data_test)

plt.figure()
plt.plot(data_test, label = 'y_true')
plt.plot(lstm_predictions, label = 'y_pred')
# `bidirectional3` has three bidirectional layers; the title previously said 2.
plt.title('Univariate Bidirectional LSTM Predictions, hidden layer = 3')
plt.legend()
plt.show()

In [None]:
# (rmse, mse, mae, mape) of the three-layer bidirectional model on the test data.
lstm_report(data_test, predict_lstm(13, bidirectional3, data_test))

In [None]:
# Persist the three-layer bidirectional model.
# NOTE(review): this path has no extension, unlike the '.h5' paths used
# elsewhere in this notebook - confirm the intended save format.
bidirectional3.save('./model/bidirectional_r3')

# Final Performance change

In [None]:
lstm_predictions = predict_lstm(13, bidirectional, data_test)

# Exponentiate to undo the log transform, so the errors are on the original scale.
actual_bookings = np.exp(data_test.values).ravel()
predicted_bookings = np.exp(lstm_predictions.values)

# NOTE: `exp_mse` holds the *root* mean squared error; the name is kept as is.
exp_mse = np.sqrt(mean_squared_error(actual_bookings, predicted_bookings))
# MAE is already on the scale of the data, so no square root is applied
# (the previous version took np.sqrt of the MAE, which has no meaning).
exp_mae = mean_absolute_error(actual_bookings, predicted_bookings)
print(f'{exp_mse}, {exp_mae}')

# One row per test observation; built directly instead of via
# concatenate / reshape / transpose / rename.
compare = pd.DataFrame({'prediction': predicted_bookings, 'bookings': actual_bookings})
# 'residual' is the ratio prediction / actual, so 1.0 means a perfect prediction.
compare['residual'] = compare['prediction']/compare['bookings']
plt.figure()
plt.hist(compare['residual'])
plt.show()

In [None]:
# Share of predictions whose ratio to the actual value lies in [0.80, 1.20],
# and the mean ratio over all test observations.
residual_rate = int(compare['residual'].between(0.80, 1.20).sum())/len(compare)
agg_residual_rate = compare['residual'].values.mean()
print(f'the residual that is lower than 20% is {residual_rate}, the average residual is {agg_residual_rate}.')
compare.head(10)

# Save Models

In [None]:
# Save every trained model to its own HDF5 file.
for trained_model, target_path in (
    (vanilla, './model/univariate_vanilla.h5'),
    (stack1, './model/univariate_stack1.h5'),
    (stack2, './model/univariate_stack2.h5'),
    (stack3, './model/univariate_stack3.h5'),
    (bidirectional, './model/bidirectional.h5'),
):
    trained_model.save(target_path)


# Not Work

### save model