# Data Preparation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from keras import callbacks, regularizers, optimizers
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout, Bidirectional
from keras.regularizers import L1L2
from keras_tuner import RandomSearch, Objective
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from keras.preprocessing.sequence import TimeseriesGenerator

In [None]:
# RUN IT: define a function check_stationarity(series)
from statsmodels.tsa.stattools import adfuller

def check_stationarity(series):
    """Run an Augmented Dickey-Fuller test on ``series`` and print the outcome.

    Prints the ADF statistic, the p-value and the critical values, then a
    coloured verdict: "Stationary" (green) when the p-value is at most 0.05
    and the statistic lies below the 5% critical value, otherwise
    "Non-stationary" (red).

    Args:
        series: object exposing ``.values`` (e.g. a pandas Series) holding the
            observations to test.
    """
    outcome = adfuller(series.values)
    adf_stat, p_value, critical_values = outcome[0], outcome[1], outcome[4]

    print(f'ADF Statistic: {adf_stat:f}')
    print(f'p-value: {p_value:f}')
    print('Critical Values:')
    for level, threshold in critical_values.items():
        print(f'\t{level}: {threshold:.3f}')

    # Both criteria must hold for the series to be reported as stationary.
    is_stationary = p_value <= 0.05 and adf_stat < critical_values['5%']
    if is_stationary:
        print("\u001b[32mStationary\u001b[0m")
    else:
        print("\x1b[31mNon-stationary\x1b[0m")

In [None]:
# Load the cleaned line-item export.
raw_data = pd.read_csv('./data/cleaned_raw_data.csv', index_col=None)

# Row positions flagged by the original author as negative values that distort
# the weekly sums (e.g. 100904, 101990); they are removed before aggregating.
negative_rows = [14597, 17480, 28433, 29224, 271980, 100904, 101990, 212036, 211017, 198306]
raw_data = raw_data.drop(raw_data.index[negative_rows], axis=0)

# Total bookings per (week, area, product platform).
group_keys = ['Week', 'ENT Area', 'Product Platform']
weekly_data = raw_data.loc[:, group_keys + ['Line Item Net New TCV']]
weekly_sum = (
    weekly_data
    .groupby(group_keys)
    .agg({'Line Item Net New TCV': 'sum'})
    .reset_index()
)
# weekly_sum.to_csv('./data/weekly_sum_categorical.csv', index = None)

In [None]:
raw_data.info()

In [None]:
for x in weekly_sum.columns:
    print(x ,':', len(weekly_sum[x].unique()))

# the sum of HQ is 0, so not show here


## One Hot Encoding the Categorical Features

In [None]:
# One-hot encode the two categorical columns and attach week + bookings.
weekly_sum = pd.read_csv('./data/weekly_sum_categorical.csv')
ohe = OneHotEncoder(categories='auto')
feature_arr = ohe.fit_transform(weekly_sum[['ENT Area','Product Platform']]).toarray()
# Column names of the encoded matrix (one per category of each input column).
feature_labels = ohe.get_feature_names_out()
features = pd.DataFrame(feature_arr, columns=feature_labels)
features['bookings'] = weekly_sum['Line Item Net New TCV']
features.insert(0,'week', weekly_sum['Week'])
# features.to_csv('./data/encoded_data.csv')

## Pivot table by Product and Area

In [None]:
weekly_sum = pd.read_csv('./data/weekly_sum_categorical.csv' )
# One row per week, one column per product platform. margins=True appends an
# 'All' total row and column; .iloc[:-1] drops the total row and keeps the
# total column. The string aggregator 'sum' replaces the np.sum callable,
# which recent pandas versions warn about.
product_sum = pd.pivot_table(weekly_sum, values='Line Item Net New TCV', index='Week', columns='Product Platform', aggfunc='sum', fill_value=0, margins=True).iloc[:-1, :]
# product_sum.to_csv('./data/product sum.csv')
# Same layout, with one column per ENT area.
area_sum = pd.pivot_table(weekly_sum, values='Line Item Net New TCV', index='Week', columns='ENT Area', aggfunc='sum', fill_value=0, margins=True).iloc[:-1, :]
# area_sum.to_csv('./data/area sum.csv')

# Plot the data by classes

In [None]:
def viz_categorical_transition(df, col, row_n, col_n):
    """Plot the weekly mean of 'Line Item Net New TCV' for each category of ``col``.

    One subplot is drawn per distinct value of ``df[col]`` on a
    ``row_n`` x ``col_n`` grid; grid cells left over are hidden.

    Args:
        df: DataFrame with the columns ``col``, "Week" and
            "Line Item Net New TCV".
        col: name of the categorical column to split on.
        row_n: number of subplot rows.
        col_n: number of subplot columns.

    Raises:
        ValueError: if the grid has fewer cells than there are categories.
    """
    value_col = "Line Item Net New TCV"
    categories = df[col].unique().tolist()
    if len(categories) > row_n * col_n:
        raise ValueError(
            f"{len(categories)} categories in '{col}' do not fit in a "
            f"{row_n}x{col_n} subplot grid"
        )

    # squeeze=False keeps `axes` a 2-D array even for a 1x1 grid, so ravel()
    # is always valid.
    fig, axes = plt.subplots(row_n, col_n, figsize=(12, 9), squeeze=False)
    ax = axes.ravel()

    for i, cate in enumerate(categories):
        # Select the value column *before* averaging: the frame also carries
        # non-numeric category columns, which GroupBy.mean() refuses to
        # average on pandas >= 2.0.
        weekly_mean = df[df[col] == cate].groupby("Week")[[value_col]].mean()
        weekly_mean.plot(ax=ax[i])
        ax[i].set_title(cate)
        ax[i].set_xlabel("")

    # Hide the grid cells that received no category.
    for spare in ax[len(categories):]:
        spare.set_visible(False)

    plt.tight_layout()


In [None]:
viz_categorical_transition(weekly_sum, 'ENT Area', 3,3)

In [None]:
viz_categorical_transition(weekly_sum, 'Product Platform', 3,2)

# Training Separately

## Self-defined Functions

In [None]:
def data_generator(data, targets, n_input):
    """Build a TimeseriesGenerator over ``data``/``targets``.

    Each sample is a window of ``n_input`` consecutive rows of ``data``;
    the generator is created with a batch size of 1.
    """
    return TimeseriesGenerator(
        data=data,
        targets=targets,
        length=n_input,
        batch_size=1,
    )

In [None]:
# define a function to report performance
def lstm_report(y_true, y_pred):
    """Return ``(rmse, mse, mae, mape)`` for predictions against ground truth.

    Args:
        y_true: observed target values.
        y_pred: predicted target values, aligned with ``y_true``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return (
        np.sqrt(mse),  # RMSE is derived from the same MSE value
        mse,
        mean_absolute_error(y_true, y_pred),
        mean_absolute_percentage_error(y_true, y_pred),
    )

# Area as Features

* Note: the Area and Product sections reuse the same training/test variable names. Before training a model, re-run the notebook from the cell that defines the corresponding dataset.

## Data Preprocessing

In [None]:
# Read the weekly per-area totals and prepare them for supervised learning.
# The target variable is the total bookings for each week (downstream cells
# take it from the last column).
# NOTE(review): .squeeze("columns") only has an effect on a single-column
# frame; the 2-D .iloc slicing below implies a DataFrame is expected here.
area_sum = pd.read_csv('./data/area sum.csv', index_col=0, parse_dates=True).squeeze("columns")

# Hold out the last `test_size` rows (weeks) as the test set.
test_size = 52
data_train, data_test =  area_sum.iloc[ :-test_size,:], area_sum.iloc[-test_size:,:]

# Log transformation; the +1 keeps zero entries finite.
data_train, data_test = np.log(data_train+1), np.log(data_test+1)

### Log transformation and normalization

In [None]:
# Split the dataset for normalization: every column except the last is a
# feature, the last column is the target (reshaped to a column vector as the
# scaler expects 2-D input).
train_X = data_train.iloc[:,:-1]
train_y = np.array(data_train.iloc[:,-1]).reshape(-1,1)
test_X = data_test.iloc[:,:-1]
test_y = np.array(data_test.iloc[:,-1]).reshape(-1,1)

# Normalization: separate min-max scalers for features and target.

scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

# Both scalers are fitted on the training rows only and then applied to the
# test rows.
scaler_X.fit(train_X)
scaled_train_X = scaler_X.transform(train_X)
scaled_test_X = scaler_X.transform(test_X)

scaler_y.fit(train_y)
scaled_train_y = scaler_y.transform(train_y)
scaled_test_y = scaler_y.transform(test_y)

# Train and test features stacked back in their original row order; this is
# the `data` argument passed to predict_lstm.
scaled_X = np.append(scaled_train_X, scaled_test_X, axis=0)

print(scaled_train_X.shape, scaled_test_X.shape, scaled_X.shape)

## Baseline Modeling

In [None]:
# baseline
def create_vanilla(n_input, n_features, neuron):
    """Build and compile a single-layer LSTM regressor.

    Args:
        n_input: number of time steps per input window.
        n_features: number of features per time step.
        neuron: number of units in the LSTM layer.

    Returns:
        A compiled Sequential model (Adam optimizer, MSE loss) with one
        output unit.
    """
    network = Sequential([
        LSTM(neuron, activation='relu', input_shape=(n_input, n_features)),
        Dense(1),
    ])
    network.compile(
        optimizer='adam',
        loss=keras.losses.MeanSquaredError(),
        metrics=['mse'],
    )
    return network

In [None]:
# walking forward validation
def predict_lstm(n_input, n_feature, model, data_test, data, scaler=None):
    """Predict every test row from the ``n_input`` feature rows that precede it.

    ``data`` holds the scaled features of the training rows followed by the
    test rows. For test row ``i`` the model receives the window of
    ``n_input`` rows ending just before that row, so each prediction uses
    observed features only.

    Args:
        n_input: window length (time steps) the model was trained with.
        n_feature: number of feature columns in ``data``.
        model: fitted model exposing ``predict(batch, verbose=0)``.
        data_test: test frame; only its length and index are used.
        data: 2-D array of scaled features, train rows followed by test rows.
        scaler: fitted target scaler used to undo the target normalisation.
            Defaults to the notebook-level ``scaler_y``.

    Returns:
        pandas Series of predictions (target scaling undone), indexed like
        ``data_test``.

    Raises:
        ValueError: if ``data`` has fewer than ``len(data_test) + n_input``
            rows, in which case the windows could not be aligned.
    """
    if scaler is None:
        scaler = scaler_y  # target scaler defined by the preprocessing cell

    # Derive the horizon from the test frame instead of hard-coding 52.
    n_test = data_test.shape[0]
    start = data.shape[0] - n_test - n_input
    if start < 0:
        raise ValueError(
            f"data has {data.shape[0]} rows but {n_test + n_input} are needed "
            f"for {n_test} test rows with a window of {n_input}"
        )

    lstm_predictions_scaled = []
    for i in range(n_test):
        # Window of the n_input rows immediately preceding test row i.
        window = data[start + i:start + i + n_input, :]
        current_batch = window.reshape((1, n_input, n_feature))
        lstm_pred = model.predict(current_batch, verbose=0)[0]
        lstm_predictions_scaled.append(lstm_pred)

    scaled = np.asarray(lstm_predictions_scaled).reshape(-1, 1)
    lstm_predictions = scaler.inverse_transform(scaled)
    return pd.Series(np.asarray(lstm_predictions).reshape(-1), index=data_test.index)

In [None]:
vanilla_area = create_vanilla(13, 9, 512)
vanilla_area.fit(data_generator(scaled_train_X, scaled_train_y, 13),epochs=20)

In [None]:
plt.figure()
plt.title('Loss History of Multivariate Vanilla LSTM')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.plot(vanilla_area.history.history['loss'], label = "loss")
plt.xticks(np.arange(1,21,1))
plt.legend()
plt.show()

## Baseline Evaluation

In [None]:
model_selection = pd.DataFrame(columns=['rmse', 'mse', 'mae', 'mape'])

model_selection.loc['Multivariate Vanilla LSTM, Area'] = \
                    list(lstm_report(test_y, 
                         predict_lstm(13, 9, vanilla_area, data_test, scaled_X)))
model_selection

In [None]:
lstm_true = pd.Series(test_y.reshape(-1,), index = data_test.index,)
lstm_predictions = predict_lstm(13, 9, vanilla_area, data_test, scaled_X)

plt.figure()
plt.plot(lstm_true, label = 'y_true')
plt.plot(lstm_predictions, label = 'y_pred')
plt.title('Multivariate Vanilla LSTM Predictions, Area')
plt.legend()
plt.show()

## Stacked LSTM Modelling

In [None]:
# stacked model
def create_2_stacked(n_input, n_features, neuron1, neuron2):
    """Build and compile a two-layer stacked LSTM regressor.

    Args:
        n_input: number of time steps per input window.
        n_features: number of features per time step.
        neuron1: units in the first LSTM layer (returns full sequences).
        neuron2: units in the second LSTM layer.

    Returns:
        A compiled Sequential model (Adam optimizer, MSE loss) with one
        output unit.
    """
    network = Sequential([
        # The first layer emits the whole sequence so the next LSTM can consume it.
        LSTM(neuron1, activation='relu', input_shape=(n_input, n_features), return_sequences=True),
        LSTM(neuron2),
        Dense(1),
    ])
    network.compile(
        optimizer='adam',
        loss=keras.losses.MeanSquaredError(),
        metrics=['mse'],
    )
    return network

In [None]:
stacked_area = create_2_stacked(13, 9, 512, 512)
stacked_area.fit(data_generator(scaled_train_X, scaled_train_y, 13),epochs=20)

plt.figure()
plt.title('Loss History of Multivariate Stacked LSTM')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.plot(stacked_area.history.history['loss'], label = "loss")
plt.xticks(np.arange(1,21,1))
plt.legend()
plt.show()

lstm_true = pd.Series(test_y.reshape(-1,), index = data_test.index,)
lstm_predictions = predict_lstm(13, 9, stacked_area, data_test, scaled_X)

plt.figure()
plt.plot(lstm_true, label = 'y_true')
plt.plot(lstm_predictions, label = 'y_pred')
plt.title('Multivariate Stacked LSTM Predictions, Area')
plt.legend()
plt.show()

In [None]:
model_selection.loc['Multivariate Stacked LSTM, Area, Hidden Layer = 2'] = \
                    list(lstm_report(test_y, 
                         predict_lstm(13, 9, stacked_area, data_test, scaled_X)))
model_selection

## Stacked LSTM, Hidden layer = 3

In [None]:
# stacked model
def create_3_stacked(n_input, n_features, neuron1, neuron2, neuron3):
    """Build and compile a three-layer stacked LSTM regressor.

    Args:
        n_input: number of time steps per input window.
        n_features: number of features per time step.
        neuron1: units in the first LSTM layer (returns full sequences).
        neuron2: units in the second LSTM layer (returns full sequences).
        neuron3: units in the third LSTM layer.

    Returns:
        A compiled Sequential model (Adam optimizer, MSE loss) with one
        output unit.
    """
    network = Sequential([
        LSTM(neuron1, activation='relu', input_shape=(n_input, n_features), return_sequences=True),
        LSTM(neuron2, return_sequences=True),
        LSTM(neuron3),
        Dense(1),
    ])
    network.compile(
        optimizer='adam',
        loss=keras.losses.MeanSquaredError(),
        metrics=['mse'],
    )
    return network

In [None]:
stacked_area2 = create_3_stacked(13, 9, 512, 512, 512)
stacked_area2.fit(data_generator(scaled_train_X, scaled_train_y, 13),epochs=20)

plt.figure()
plt.title('Loss History of Multivariate Stacked LSTM')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.plot(stacked_area2.history.history['loss'], label = "loss")
plt.xticks(np.arange(1,21,1))
plt.legend()
plt.show()

lstm_true = pd.Series(test_y.reshape(-1,), index = data_test.index,)
lstm_predictions = predict_lstm(13, 9, stacked_area2, data_test, scaled_X)

plt.figure()
plt.plot(lstm_true, label = 'y_true')
plt.plot(lstm_predictions, label = 'y_pred')
plt.title('Multivariate Stacked LSTM Predictions, Area')
plt.legend()
plt.show()

In [None]:
model_selection.loc['Multivariate Stacked LSTM, Area, Hidden Layer = 3'] = \
                    list(lstm_report(test_y, 
                         predict_lstm(13, 9, stacked_area2, data_test, scaled_X)))
model_selection

## Stacked LSTM, Hidden Layer = 2, with regularizers

In [None]:
# might be overfitting. Let's add regularizers to stacked_area

def create_2_stacked2(n_input, n_features, neuron1, neuron2, dropout_rate):
    """Build and compile a regularised two-layer stacked LSTM regressor.

    Same layout as the plain two-layer stack, but the second LSTM layer uses
    recurrent dropout and an L1L2 recurrent regulariser (l1 = l2 = 0.001).

    Args:
        n_input: number of time steps per input window.
        n_features: number of features per time step.
        neuron1: units in the first LSTM layer (returns full sequences).
        neuron2: units in the second LSTM layer.
        dropout_rate: recurrent dropout rate of the second LSTM layer.

    Returns:
        A compiled Sequential model (Adam optimizer, MSE loss) with one
        output unit.
    """
    recurrent_penalty = L1L2(l1=0.001, l2=0.001)
    network = Sequential([
        LSTM(neuron1, activation='relu', input_shape=(n_input, n_features), return_sequences=True),
        LSTM(neuron2, recurrent_dropout=dropout_rate, recurrent_regularizer=recurrent_penalty),
        Dense(1),
    ])
    network.compile(
        optimizer='adam',
        loss=keras.losses.MeanSquaredError(),
        metrics=['mse'],
    )
    return network


In [None]:
# create_2_stacked2 takes (n_input, n_features, neuron1, neuron2, dropout_rate);
# the earlier call passed a third layer size and raised TypeError.
stacked_area3 = create_2_stacked2(13, 9, 512, 512, 0.1)
stacked_area3.fit(data_generator(scaled_train_X, scaled_train_y, 13),epochs=20)

# Training-loss curve.
plt.figure()
plt.title('Loss History of Multivariate Stacked LSTM')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.plot(stacked_area3.history.history['loss'], label = "loss")
plt.xticks(np.arange(1,21,1))
plt.legend()
plt.show()

# Predictions on the held-out weeks against the observed (log-scale) target.
lstm_true = pd.Series(test_y.reshape(-1,), index = data_test.index,)
lstm_predictions = predict_lstm(13, 9, stacked_area3, data_test, scaled_X)

plt.figure()
plt.plot(lstm_true, label = 'y_true')
plt.plot(lstm_predictions, label = 'y_pred')
plt.title('Multivariate Stacked LSTM Predictions, Area')
plt.legend()
plt.show()

In [None]:
# watch out!!! The title in the output is wrong
model_selection.loc['Multivariate Stacked LSTM, Area, Hidden Layer = 2, with regularizer'] = \
                    list(lstm_report(test_y, 
                         predict_lstm(13, 9, stacked_area3, data_test, scaled_X)))
model_selection

# getting worse.

# Product as Features

## Vanilla
### Data Preparation

In [None]:
# read the dataset, transfer to supervised learning. The target variable is the total bookings for each week.
product_sum = pd.read_csv('./data/product sum.csv', index_col=0, parse_dates=True).squeeze("columns")

test_size = 52
data_train, data_test =  product_sum.iloc[ :-test_size,:], product_sum.iloc[-test_size:,:]

# log transformation
data_train, data_test = np.log(data_train+1), np.log(data_test+1)

#### Log transformation and normalization

In [None]:
# split the dataset for normalization
train_X = data_train.iloc[:,:-1]
train_y = np.array(data_train.iloc[:,-1]).reshape(-1,1)
test_X = data_test.iloc[:,:-1]
test_y = np.array(data_test.iloc[:,-1]).reshape(-1,1)

# normalization

scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

scaler_X.fit(train_X)
scaled_train_X = scaler_X.transform(train_X)
scaled_test_X = scaler_X.transform(test_X)

scaler_y.fit(train_y)
scaled_train_y = scaler_y.transform(train_y)
scaled_test_y = scaler_y.transform(test_y)

scaled_X = np.append(scaled_train_X, scaled_test_X, axis=0)

print(scaled_train_X.shape, scaled_test_X.shape, scaled_X.shape)

In [None]:
vanilla_product = create_vanilla(13, 6, 512)
vanilla_product.fit(data_generator(scaled_train_X, scaled_train_y, 13),epochs=20)

In [None]:
plt.figure()
plt.title('Loss History of Multivariate Vanilla LSTM')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.plot(vanilla_product.history.history['loss'], label = "loss")
plt.xticks(np.arange(1,21,1))
plt.legend()
plt.show()

### Evaluation

In [None]:
# model_selection = pd.DataFrame(columns=['rmse', 'mse', 'mae', 'mape'])

model_selection.loc['Multivariate Vanilla LSTM, Product'] = \
                    list(lstm_report(test_y, 
                         predict_lstm(13, 6, vanilla_product, data_test, scaled_X)))
model_selection

In [None]:
lstm_true = pd.Series(test_y.reshape(-1,), index = data_test.index,)
lstm_predictions = predict_lstm(13, 6, vanilla_product, data_test, scaled_X)

plt.figure()
plt.plot(lstm_true, label = 'y_true')
plt.plot(lstm_predictions, label = 'y_pred')
plt.title('Multivariate Vanilla LSTM Predictions, Product')
plt.legend()
plt.show()

## No Unit 42
### Vanilla

In [None]:
# Probably the performance influenced by Unit 42. Let's delete it.

# read the dataset, transfer to supervised learning. The target variable is the total bookings for each week.
product_sum = pd.read_csv('./data/product sum.csv', index_col=0, parse_dates=True).squeeze("columns")

test_size = 52
data_train, data_test =  product_sum.iloc[ :-test_size,:], product_sum.iloc[-test_size:,:]

# log transformation
data_train, data_test = np.log(data_train+1), np.log(data_test+1)
#### Log transformation and normalization
# split the dataset for normalization
train_X = data_train.iloc[:,:-2]
train_y = np.array(data_train.iloc[:,-1]).reshape(-1,1)
test_X = data_test.iloc[:,:-2]
test_y = np.array(data_test.iloc[:,-1]).reshape(-1,1)

# normalization

scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

scaler_X.fit(train_X)
scaled_train_X = scaler_X.transform(train_X)
scaled_test_X = scaler_X.transform(test_X)

scaler_y.fit(train_y)
scaled_train_y = scaler_y.transform(train_y)
scaled_test_y = scaler_y.transform(test_y)

scaled_X = np.append(scaled_train_X, scaled_test_X, axis=0)

print(scaled_train_X.shape, scaled_test_X.shape, scaled_X.shape)

In [None]:
vanilla_product_no42 = create_vanilla(13, 5, 512)
vanilla_product_no42.fit(data_generator(scaled_train_X, scaled_train_y, 13),epochs=20)

# Training-loss curve.
plt.figure()
plt.title('Loss History of Multivariate Vanilla LSTM')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.plot(vanilla_product_no42.history.history['loss'], label = "loss")
plt.xticks(np.arange(1,21,1))
plt.legend()
plt.show()
# model_selection = pd.DataFrame(columns=['rmse', 'mse', 'mae', 'mape'])

# Recompute truth and predictions for THIS model; otherwise the plot below
# shows whatever an earlier cell left in lstm_true / lstm_predictions.
lstm_true = pd.Series(test_y.reshape(-1,), index = data_test.index,)
lstm_predictions = predict_lstm(13, 5, vanilla_product_no42, data_test, scaled_X)

plt.figure()
plt.plot(lstm_true, label = 'y_true')
plt.plot(lstm_predictions, label = 'y_pred')
plt.title('Multivariate Vanilla LSTM Predictions, Product')
plt.legend()
plt.show()

In [None]:
model_selection.loc['Multivariate Vanilla LSTM, Product, No Unit 42'] = \
                    list(lstm_report(test_y, 
                         predict_lstm(13, 5, vanilla_product_no42, data_test, scaled_X)))
model_selection

### Stacked LSTM, hidden layer = 2

In [None]:
# Probably the performance influenced by Unit 42. Let's delete it.

# read the dataset, transfer to supervised learning. The target variable is the total bookings for each week.
product_sum = pd.read_csv('./data/product sum.csv', index_col=0, parse_dates=True).squeeze("columns")

test_size = 52
data_train, data_test =  product_sum.iloc[ :-test_size,:], product_sum.iloc[-test_size:,:]

# log transformation
data_train, data_test = np.log(data_train+1), np.log(data_test+1)
#### Log transformation and normalization
# split the dataset for normalization
train_X = data_train.iloc[:,:-2]
train_y = np.array(data_train.iloc[:,-1]).reshape(-1,1)
test_X = data_test.iloc[:,:-2]
test_y = np.array(data_test.iloc[:,-1]).reshape(-1,1)

# normalization

scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

scaler_X.fit(train_X)
scaled_train_X = scaler_X.transform(train_X)
scaled_test_X = scaler_X.transform(test_X)

scaler_y.fit(train_y)
scaled_train_y = scaler_y.transform(train_y)
scaled_test_y = scaler_y.transform(test_y)

scaled_X = np.append(scaled_train_X, scaled_test_X, axis=0)

print(scaled_train_X.shape, scaled_test_X.shape, scaled_X.shape)

In [None]:
stack_product_no42 = create_2_stacked(13, 5, 512, 512)
stack_product_no42.fit(data_generator(scaled_train_X, scaled_train_y, 13),epochs=20)

# Training-loss curve.
plt.figure()
plt.title('Loss History of Multivariate Stacked LSTM, Product')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.plot(stack_product_no42.history.history['loss'], label = "loss")
plt.xticks(np.arange(1,21,1))
plt.legend()
plt.show()
# model_selection = pd.DataFrame(columns=['rmse', 'mse', 'mae', 'mape'])

# Recompute truth and predictions for THIS model; otherwise the plot below
# shows whatever an earlier cell left in lstm_true / lstm_predictions.
lstm_true = pd.Series(test_y.reshape(-1,), index = data_test.index,)
lstm_predictions = predict_lstm(13, 5, stack_product_no42, data_test, scaled_X)

plt.figure()
plt.plot(lstm_true, label = 'y_true')
plt.plot(lstm_predictions, label = 'y_pred')
plt.title('Multivariate Stacked LSTM Predictions, Product')
plt.legend()
plt.show()

In [None]:
model_selection.loc['Multivariate Stacked LSTM, Product, No Unit 42, Hidden Layer = 2'] = \
                    list(lstm_report(test_y, 
                         predict_lstm(13, 5, stack_product_no42, data_test, scaled_X)))
model_selection

### Stacked LSTM, Hidden Layer = 3

In [None]:
stack_product_no42_2 = create_3_stacked(13, 5, 512, 512, 512)
stack_product_no42_2.fit(data_generator(scaled_train_X, scaled_train_y, 13),epochs=20)

# Training-loss curve.
plt.figure()
plt.title('Loss History of Multivariate Stacked LSTM, Product')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.plot(stack_product_no42_2.history.history['loss'], label = "loss")
plt.xticks(np.arange(1,21,1))
plt.legend()
plt.show()
# model_selection = pd.DataFrame(columns=['rmse', 'mse', 'mae', 'mape'])

# Recompute truth and predictions for THIS model; otherwise the plot below
# shows whatever an earlier cell left in lstm_true / lstm_predictions.
lstm_true = pd.Series(test_y.reshape(-1,), index = data_test.index,)
lstm_predictions = predict_lstm(13, 5, stack_product_no42_2, data_test, scaled_X)

plt.figure()
plt.plot(lstm_true, label = 'y_true')
plt.plot(lstm_predictions, label = 'y_pred')
plt.title('Multivariate Stacked LSTM Predictions, Product')
plt.legend()
plt.show()

In [None]:
model_selection.loc['Multivariate Stacked LSTM, Product, No Unit 42, Hidden Layer = 3'] = \
                    list(lstm_report(test_y, 
                         predict_lstm(13, 5, stack_product_no42_2, data_test, scaled_X)))
model_selection

### Stacked LSTM with Regularizer

In [None]:
# might be overfitting. Let's add regularizers to stacked_area

def create_3_stacked2(n_input, n_features, neuron1, neuron2, neuron3, dropout_rate):
    """Build and compile a regularised three-layer stacked LSTM regressor.

    Same layout as the plain three-layer stack, but the last LSTM layer uses
    recurrent dropout and an L1L2 recurrent regulariser (l1 = l2 = 0.001).

    Args:
        n_input: number of time steps per input window.
        n_features: number of features per time step.
        neuron1: units in the first LSTM layer (returns full sequences).
        neuron2: units in the second LSTM layer (returns full sequences).
        neuron3: units in the third LSTM layer.
        dropout_rate: recurrent dropout rate of the third LSTM layer.

    Returns:
        A compiled Sequential model (Adam optimizer, MSE loss) with one
        output unit.
    """
    recurrent_penalty = L1L2(l1=0.001, l2=0.001)
    network = Sequential([
        LSTM(neuron1, activation='relu', input_shape=(n_input, n_features), return_sequences=True),
        LSTM(neuron2, return_sequences=True),
        LSTM(neuron3, recurrent_dropout=dropout_rate, recurrent_regularizer=recurrent_penalty),
        Dense(1),
    ])
    network.compile(
        optimizer='adam',
        loss=keras.losses.MeanSquaredError(),
        metrics=['mse'],
    )
    return network


In [None]:
stacked_product4 = create_3_stacked2(13, 5, 512, 512, 512, 0.1)
stacked_product4.fit(data_generator(scaled_train_X, scaled_train_y, 13),epochs=20)

plt.figure()
plt.title('Loss History of Multivariate Stacked LSTM')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.plot(stacked_product4.history.history['loss'], label = "loss")
plt.xticks(np.arange(1,21,1))
plt.legend()
plt.show()

In [None]:
# Predictions of the regularised product-feature model on the held-out weeks.
lstm_true = pd.Series(test_y.reshape(-1,), index = data_test.index,)
lstm_predictions = predict_lstm(13, 5, stacked_product4, data_test, scaled_X)

plt.figure()
plt.plot(lstm_true, label = 'y_true')
plt.plot(lstm_predictions, label = 'y_pred')
# This model is trained on product features, so the title says "Product".
plt.title('Multivariate Stacked LSTM Predictions, Product')
plt.legend()
plt.show()

In [None]:
# Report the regularised product model. The model trained above is
# `stacked_product4`; `stacked_area4` is never defined.
model_selection.loc['Multivariate Stacked LSTM, Product, Hidden Layer = 3, with regularizer'] = \
                    list(lstm_report(test_y, 
                         predict_lstm(13, 5, stacked_product4, data_test, scaled_X)))
model_selection

# getting worse.

# Area + Product as Features

In [None]:
# read the dataset, transfer to supervised learning. The target variable is the total bookings for each week.
area_sum = pd.read_csv('./data/area sum.csv', index_col=0, parse_dates=True).squeeze("columns")
product_sum = pd.read_csv('./data/product sum.csv', index_col=0, parse_dates=True).squeeze("columns")
all_sum = pd.concat([pd.concat([area_sum.iloc[:,:-1], product_sum.iloc[:,:-2]], axis=1),
            product_sum.iloc[:,-1]],axis = 1)

test_size = 52
data_train, data_test =  all_sum.iloc[ :-test_size,:], all_sum.iloc[-test_size:,:]

In [None]:
# log transformation
data_train, data_test = np.log(data_train+1), np.log(data_test+1)
# split the dataset for normalization
train_X = data_train.iloc[:,:-1]
train_y = np.array(data_train.iloc[:,-1]).reshape(-1,1)
test_X = data_test.iloc[:,:-1]
test_y = np.array(data_test.iloc[:,-1]).reshape(-1,1)

# normalization

scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

scaler_X.fit(train_X)
scaled_train_X = scaler_X.transform(train_X)
scaled_test_X = scaler_X.transform(test_X)

scaler_y.fit(train_y)
scaled_train_y = scaler_y.transform(train_y)
scaled_test_y = scaler_y.transform(test_y)

scaled_X = np.append(scaled_train_X, scaled_test_X, axis=0)

print(scaled_train_X.shape, scaled_test_X.shape, scaled_X.shape)

In [None]:
data_train.describe()

## Hidden layer = 1

In [None]:
vanilla_all = create_vanilla(13, 14, 512)
vanilla_all.fit(data_generator(scaled_train_X, scaled_train_y, 13),epochs=20)
plt.figure()
plt.title('Loss History of Multivariate Vanilla LSTM')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.plot(vanilla_all.history.history['loss'], label = "loss")
plt.xticks(np.arange(1,21,1))
plt.legend()
plt.show()

In [None]:
lstm_true = pd.Series(test_y.reshape(-1,), index = data_test.index,)
lstm_predictions = predict_lstm(13, 14, vanilla_all, data_test, scaled_X)

plt.figure()
plt.plot(lstm_true, label = 'y_true')
plt.plot(lstm_predictions, label = 'y_pred')
plt.title('Multivariate Vanilla LSTM Predictions, Hybrid')
plt.legend()
plt.show()

In [None]:
model_selection.loc['Multivariate Vanilla LSTM, Product + Area'] = \
                    list(lstm_report(test_y, 
                         predict_lstm(13, 14, vanilla_all, data_test, scaled_X)))
model_selection

## Hidden layer = 2

In [None]:
stacked_all2 = create_2_stacked(13, 14, 512, 512)
stacked_all2.fit(data_generator(scaled_train_X, scaled_train_y, 13),epochs=20)
plt.figure()
plt.title('Loss History of Multivariate LSTM, hidden layer = 2')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.plot(stacked_all2.history.history['loss'], label = "loss")
plt.xticks(np.arange(1,21,1))
plt.legend()
plt.show()

In [None]:
lstm_true = pd.Series(test_y.reshape(-1,), index = data_test.index,)
lstm_predictions = predict_lstm(13, 14, stacked_all2, data_test, scaled_X)

plt.figure()
plt.plot(lstm_true, label = 'y_true')
plt.plot(lstm_predictions, label = 'y_pred')
plt.title('Multivariate LSTM Predictions, Hybrid, Hidden layer = 2')
plt.legend()
plt.show()

In [None]:
lstm_report(test_y, predict_lstm(13, 14, stacked_all2, data_test, scaled_X))

## Hidden layer = 3

In [None]:
# Three-layer stacked LSTM on the combined area + product features.
stacked_all3 = create_3_stacked(13, 14, 512, 512, 512)
stacked_all3.fit(data_generator(scaled_train_X, scaled_train_y, 13),epochs=20)
plt.figure()
# The title reflects the three LSTM layers of this model.
plt.title('Loss History of Multivariate LSTM, hidden layer = 3')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.plot(stacked_all3.history.history['loss'], label = "loss")
plt.xticks(np.arange(1,21,1))
plt.legend()
plt.show()

In [None]:
lstm_true = pd.Series(test_y.reshape(-1,), index = data_test.index,)
lstm_predictions = predict_lstm(13, 14, stacked_all3, data_test, scaled_X)

plt.figure()
plt.plot(lstm_true, label = 'y_true')
plt.plot(lstm_predictions, label = 'y_pred')
plt.title('Multivariate LSTM Predictions, Hybrid, Hidden layer = 3')
plt.legend()
plt.show()

In [None]:
lstm_report(test_y, predict_lstm(13, 14, stacked_all3, data_test, scaled_X))

# Bidirectional LSTM

## hidden layer = 1

In [None]:
def create_bidirectional(n_input, n_feature, neuron):
    """Build and compile a single-layer bidirectional LSTM regressor.

    Args:
        n_input: number of time steps per input window.
        n_feature: number of features per time step.
        neuron: number of units in each direction of the LSTM.

    Returns:
        A compiled Sequential model (Adam optimizer, MSE loss) with one
        output unit.
    """
    lstm_model = Sequential()
    # input_shape goes on the Bidirectional wrapper (the layer actually added
    # to the model); on the wrapped LSTM it is not registered with the model.
    lstm_model.add(Bidirectional(LSTM(neuron, activation='relu'), input_shape=(n_input, n_feature)))
    lstm_model.add(Dense(1))
    lstm_model.compile(optimizer='adam', loss=keras.losses.MeanSquaredError(), metrics=['mse'])
    return lstm_model

In [None]:
bidirectional_area = create_bidirectional(13, 9, 512)
bidirectional_area.fit(data_generator(scaled_train_X, scaled_train_y, 13),epochs=20)
plt.figure()
plt.title('Loss History of Multivariate Bidirectional LSTM')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.plot(bidirectional_area.history.history['loss'], label = "loss")
plt.xticks(np.arange(1,21,1))
plt.legend()
plt.show()

In [None]:
lstm_true = pd.Series(test_y.reshape(-1,), index = data_test.index,)
lstm_predictions = predict_lstm(13, 9, bidirectional_area, data_test, scaled_X)

plt.figure()
plt.plot(lstm_true, label = 'y_true')
plt.plot(lstm_predictions, label = 'y_pred')
plt.title('Multivariate Bidirectional LSTM Predictions, Area')
plt.legend()
plt.show()

In [None]:
list(lstm_report(test_y,predict_lstm(13, 9, bidirectional_area, data_test, scaled_X)))

In [None]:
bidirectional_area.save('./model/bidirectional_area')

In [None]:
# Store the bidirectional area model under its own row label; the earlier
# label overwrote the 'Multivariate Vanilla LSTM, Product + Area' result.
model_selection.loc['Multivariate Bidirectional LSTM, Area'] = \
                    list(lstm_report(test_y, 
                         predict_lstm(13, 9, bidirectional_area, data_test, scaled_X)))
model_selection

## Hidden layer = 2

In [None]:
def create_stacked_bidirectional2(n_input, n_feature, neuron):
    """Build and compile a two-layer stacked bidirectional LSTM regressor.

    Args:
        n_input: number of time steps per input window.
        n_feature: number of features per time step.
        neuron: number of units in each direction of every LSTM layer.

    Returns:
        A compiled Sequential model (Adam optimizer, MSE loss) with one
        output unit.
    """
    lstm_model = Sequential()
    # input_shape goes on the Bidirectional wrapper (the layer actually added
    # to the model); on the wrapped LSTM it is not registered with the model.
    lstm_model.add(Bidirectional(LSTM(neuron, activation='relu', return_sequences=True),
                                 input_shape=(n_input, n_feature)))
    lstm_model.add(Bidirectional(LSTM(neuron)))
    lstm_model.add(Dense(1))
    lstm_model.compile(optimizer='adam', loss=keras.losses.MeanSquaredError(), metrics=['mse'])
    return lstm_model

In [None]:
bidirectional_area2 = create_stacked_bidirectional2(13, 9, 512)
bidirectional_area2.fit(data_generator(scaled_train_X, scaled_train_y, 13),epochs=20)
plt.figure()
plt.title('Loss History of Multivariate Bidirectional LSTM, 2')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.plot(bidirectional_area2.history.history['loss'], label = "loss")
plt.xticks(np.arange(1,21,1))
plt.legend()
plt.show()

In [None]:
lstm_true = pd.Series(test_y.reshape(-1,), index = data_test.index,)
lstm_predictions = predict_lstm(13, 9, bidirectional_area2, data_test, scaled_X)

plt.figure()
plt.plot(lstm_true, label = 'y_true')
plt.plot(lstm_predictions, label = 'y_pred')
plt.title('Multivariate BI-LSTM Predictions, Area, Hidden layer = 2')
plt.legend()
plt.show()

In [None]:
list(lstm_report(test_y,predict_lstm(13, 9, bidirectional_area2, data_test, scaled_X)))

## Hidden layer = 3

In [None]:
def create_bidirectional3(n_input, n_feature, neuron):
    """Build and compile a three-layer stacked bidirectional LSTM regressor.

    Args:
        n_input: number of time steps per input window.
        n_feature: number of features per time step.
        neuron: number of units in each direction of every LSTM layer.

    Returns:
        A compiled Sequential model (Adam optimizer, MSE loss) with one
        output unit.
    """
    lstm_model = Sequential()
    # input_shape goes on the Bidirectional wrapper (the layer actually added
    # to the model); on the wrapped LSTM it is not registered with the model.
    lstm_model.add(Bidirectional(LSTM(neuron, activation='relu', return_sequences=True),
                                 input_shape=(n_input, n_feature)))
    lstm_model.add(Bidirectional(LSTM(neuron, return_sequences=True)))
    lstm_model.add(Bidirectional(LSTM(neuron)))
    lstm_model.add(Dense(1))
    lstm_model.compile(optimizer='adam', loss=keras.losses.MeanSquaredError(), metrics=['mse'])
    return lstm_model

In [None]:
bidirectional_area3 = create_bidirectional3(13, 9, 512)
bidirectional_area3.fit(data_generator(scaled_train_X, scaled_train_y, 13),epochs=20)
plt.figure()
plt.title('Loss History of Multivariate Bidirectional LSTM')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.plot(bidirectional_area3.history.history['loss'], label = "loss")
plt.xticks(np.arange(1,21,1))
plt.legend()
plt.show()

In [None]:
lstm_true = pd.Series(test_y.reshape(-1,), index = data_test.index,)
lstm_predictions = predict_lstm(13, 9, bidirectional_area3, data_test, scaled_X)

plt.figure()
plt.plot(lstm_true, label = 'y_true')
plt.plot(lstm_predictions, label = 'y_pred')
plt.title('Multivariate BI-LSTM Predictions, Area, Hidden layer = 3')
plt.legend()
plt.show()

In [None]:
list(lstm_report(test_y,predict_lstm(13, 9, bidirectional_area3, data_test, scaled_X)))

In [None]:
lstm_predictions =  predict_lstm(13, 9, bidirectional_area, data_test, scaled_X)

# Back to the original bookings scale. The data were transformed with
# log(x + 1), so the inverse is expm1 (exp(x) - 1), not a plain exp.
pred_bookings = np.expm1(lstm_predictions.values)
true_bookings = np.expm1(test_y).reshape(-1)

# exp_mse keeps its original name but holds the ROOT mean squared error.
exp_mse = np.sqrt(mean_squared_error(true_bookings, pred_bookings))
# MAE is already in booking units; it must not be square-rooted.
exp_mae = mean_absolute_error(true_bookings, pred_bookings)
print(f'{exp_mse}, {exp_mae}')

# Side-by-side table; 'residual' is the ratio prediction / actual bookings.
compare = pd.DataFrame({'prediction': pred_bookings, 'bookings': true_bookings})
compare['residual'] = compare['prediction']/compare['bookings']
plt.figure()
plt.hist(compare['residual'])
plt.show()

In [None]:
residual_rate = compare[(compare['residual'] >=0.80) & (compare['residual'] <=1.20)].shape[0]/compare.shape[0]
agg_residual_rate = np.mean(compare['residual'].values)

print(f'the residual that is lower than 20% is {residual_rate}, the average residual is {agg_residual_rate}..')
compare[-10:]

# Save Models

In [None]:
# Persist the trained models in HDF5 format.
vanilla_area.save('./model/vanilla_area.h5')
vanilla_product.save('./model/vanilla_product.h5')
vanilla_product_no42.save('./model/vanilla_product_no42.h5')
stacked_area.save('./model/stacked_area.h5')
stack_product_no42.save('./model/stack_product_no42.h5')
stack_product_no42_2.save('./model/stacked_product_no42_2.h5')
# The model variable is `stacked_product4` (the misspelt name raised
# NameError). The output path is left unchanged in case it is read elsewhere.
stacked_product4.save('./model/stacked_prodcut.h5')

# Random Search

* The results show that the product features are not as informative as the area features, and that the stacked models do not perform as well as the vanilla LSTM. We will therefore run a random search on the vanilla LSTM using the area features.