### Model 4: LSTM

__Deep learning for forecasting is used when:__
- The dataset is large (more than 10,000 data points).
- Every variant of the SARIMAX model takes a long time to fit.
- The residuals of the statistical model still show some correlation.
- There is more than one seasonal period.

#### Settings

In [14]:
import warnings
import pickle
import random
import gc
from itertools import product

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.callbacks import EarlyStopping

# main module for evaluation
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# main modules for designing ML pipelines
from mlxtend.feature_selection import ColumnSelector
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer 


# time-related feature engineering
from sklearn.kernel_approximation import Nystroem
from sklearn.preprocessing import SplineTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import FunctionTransformer
from category_encoders import TargetEncoder
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")
# Seed Python's and NumPy's global random generators so their draws are repeatable.
# NOTE(review): nothing here seeds the Keras/TensorFlow side — confirm whether a
# framework-level seed is also required for reproducible training runs.
random.seed(42)
np.random.seed(42)

#### Import Data

In [15]:
# Column to predict (kept as a one-element list).
TARGET = ['sales']

# Columns routed to the target encoder in the preprocessing pipeline.
TARGET_ENCODE_COLUMNS = ['family', 'cluster']

# Columns routed to the one-hot encoder in the preprocessing pipeline.
CATEGORY_COLUMNS = ['typeholiday','city', 'typestores', 'year']

# Cyclical calendar columns routed to the sine/cosine time pipeline.
TIME_COLUMNS = ['day_of_week', 'month']

# Raw feature columns selected from the training frame, in the order fed to the pipeline.
COL_NAMES_ORIGINAL = ['family', 'typeholiday','onpromotion', 'dcoilwtico', 'city', 'typestores',
                    'cluster', 'year', 'day_of_week','month']

# Column labels applied to the transformed feature matrix by the last pipeline step.
# The list is hard-coded, so its length and order have to match what the encoder
# step actually emits.
# NOTE(review): the names look like a captured feature-names listing from an earlier
# fit (every city / store type / year category enumerated) — confirm it still matches
# the categories present in the current training data.
COL_NAMES_AFTER_TRANS=['target_trans__family', 'target_trans__cluster',
       'category_trans__typeholiday_Additional',
       'category_trans__typeholiday_Bridge',
       'category_trans__typeholiday_Event',
       'category_trans__typeholiday_Holiday',
       'category_trans__typeholiday_NDay',
       'category_trans__typeholiday_Transfer',
       'category_trans__city_Ambato', 'category_trans__city_Babahoyo',
       'category_trans__city_Cayambe', 'category_trans__city_Cuenca',
       'category_trans__city_Daule', 'category_trans__city_El Carmen',
       'category_trans__city_Esmeraldas', 'category_trans__city_Guaranda',
       'category_trans__city_Guayaquil', 'category_trans__city_Ibarra',
       'category_trans__city_Latacunga', 'category_trans__city_Libertad',
       'category_trans__city_Loja', 'category_trans__city_Machala',
       'category_trans__city_Manta', 'category_trans__city_Playas',
       'category_trans__city_Puyo', 'category_trans__city_Quevedo',
       'category_trans__city_Quito', 'category_trans__city_Riobamba',
       'category_trans__city_Salinas',
       'category_trans__city_Santo Domingo',
       'category_trans__typestores_A', 'category_trans__typestores_B',
       'category_trans__typestores_C', 'category_trans__typestores_D',
       'category_trans__typestores_E', 'category_trans__year_2013',
       'category_trans__year_2014', 'category_trans__year_2015',
       'category_trans__year_2016', 'category_trans__year_2017',
       'time_trans__day_of_week_sin__day_of_week',
       'time_trans__day_of_week_cos__day_of_week',
       'time_trans__month_sin__month', 'time_trans__month_cos__month',
       'time_trans__day_of_week_sin__day_of_week day_of_week_cos__day_of_week',
       'time_trans__day_of_week_sin__day_of_week month_sin__month',
       'time_trans__day_of_week_sin__day_of_week month_cos__month',
       'time_trans__day_of_week_cos__day_of_week month_sin__month',
       'time_trans__day_of_week_cos__day_of_week month_cos__month',
       'time_trans__month_sin__month month_cos__month',
       'remainder__onpromotion', 'remainder__dcoilwtico']

In [16]:
# Load the prepared training frame from a local pickle file.
train_df = pd.read_pickle('processed_data/train_df.pkl')

# Separate the raw feature columns (X) from the target (y).
# TARGET is a one-element list, so y is a single-column DataFrame, not a Series.
X = train_df[COL_NAMES_ORIGINAL]
y = train_df[TARGET]

In [19]:
# Index the frame by date so rows can later be selected with date-string slices.
# X and y were extracted before this index change, so their indexes are
# overwritten explicitly to line up with train_df.
train_df.set_index('date', inplace = True)
X.index = train_df.index
y.index = train_df.index

In [None]:
def train_test_split(df, train_end_date, val_start_date, val_end_date):
    """Cut ``df`` into a training part and a validation part by slicing.

    Parameters
    ----------
    df : sliceable object (e.g. a date-indexed pandas DataFrame)
        Data to split; it is indexed with ``df[start:stop]``.
    train_end_date :
        Upper bound of the training slice (no lower bound is applied).
    val_start_date, val_end_date :
        Lower and upper bound of the validation slice.

    Returns
    -------
    tuple
        ``(train_data, validation_data)``.
    """
    training_window = slice(None, train_end_date)
    validation_window = slice(val_start_date, val_end_date)
    return df[training_window], df[validation_window]

#### Normalize the dataset

In [20]:
# Rescale the sales column to the [0, 1] range and store it as an extra column.
# reshape(-1, 1) turns the 1-D value array into the single-column 2-D layout
# passed to fit_transform.
# NOTE(review): the scaler is fitted on every row of train_df, i.e. without
# separating the validation period first — confirm this is intended.
scaler = MinMaxScaler(feature_range=(0, 1))
train_df['sales_normalized'] = scaler.fit_transform(train_df['sales'].values.reshape(-1,1))

#### Data Windowing

In [22]:
# Preview the first rows of the frame, including the new sales_normalized column.
train_df.head()

Unnamed: 0_level_0,family,sales,onpromotion,typeholiday,dcoilwtico,city,state,typestores,cluster,day_of_week,month,year,sales_normalized
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2013-01-01,AUTOMOTIVE,0.0,0,Holiday,93.14,Quito,Pichincha,D,13,2,1,2013,0.0
2013-01-01,BABY CARE,0.0,0,Holiday,93.14,Quito,Pichincha,D,13,2,1,2013,0.0
2013-01-01,BEAUTY,0.0,0,Holiday,93.14,Quito,Pichincha,D,13,2,1,2013,0.0
2013-01-01,BEVERAGES,0.0,0,Holiday,93.14,Quito,Pichincha,D,13,2,1,2013,0.0
2013-01-01,BOOKS,0.0,0,Holiday,93.14,Quito,Pichincha,D,13,2,1,2013,0.0


#### Data Preprocessing Pipeline 

In [18]:
# encoder instances shared by the preprocessing pipeline:
# one-hot encoding (categories unseen during fit are ignored, dense output) and target encoding
one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
target_encoder = TargetEncoder()

# pass-through callback handed to FunctionTransformer as ``feature_names_out``
def f_out(self, input_features):
    """Return ``input_features`` unchanged; the first argument is ignored."""
    feature_names = input_features
    return feature_names

# factory for the sine half of the cyclical encoding of a periodic feature
def sin_transformer(period):
    """Build a FunctionTransformer that maps x to sin(x / period * 2 * pi)."""
    def _sine_of_phase(x):
        return np.sin(x / period * 2 * np.pi)

    return FunctionTransformer(_sine_of_phase, feature_names_out=f_out)

def cos_transformer(period):
    """Build a FunctionTransformer that maps x to cos(x / period * 2 * pi)."""
    def _cosine_of_phase(x):
        return np.cos(x / period * 2 * np.pi)

    return FunctionTransformer(_cosine_of_phase, feature_names_out=f_out)

## time feat pipeline: avoids the jump between the first and last value of a periodic range
## (1) sine/cosine transformation turns the ordinal features into trigonometric features
##     (period 7 for day_of_week, period 12 for month); other columns are dropped
## (2) degree-2, interaction-only polynomial expansion without a bias column
##     adds the products between those trigonometric features
time_feat = make_pipeline(
                ColumnTransformer([
                            ("day_of_week_sin", sin_transformer(7), ["day_of_week"]),
                            ("day_of_week_cos", cos_transformer(7), ["day_of_week"]),
                            ("month_sin", sin_transformer(12), ["month"]),
                            ("month_cos", cos_transformer(12), ["month"])
                            ],remainder='drop'),
                PolynomialFeatures(degree=2, interaction_only=True, include_bias=False))

# building the pipeline to perform feature engineering
# step 'encoder': target-encodes, one-hot-encodes and time-transforms the three
#   column groups; columns not listed in any group are passed through unchanged.
# step 'pandarizer2': wraps the encoder output in a DataFrame labelled with the
#   hard-coded COL_NAMES_AFTER_TRANS list.
# NOTE(review): the DataFrame is built without an index argument, so the date index
#   of the input is presumably not carried over — confirm before aligning with y.
preprocess_pipe = Pipeline(steps=[
    ('encoder', ColumnTransformer(
                    transformers=[
                        ("target_trans", target_encoder, TARGET_ENCODE_COLUMNS),
                        ("category_trans", one_hot_encoder, CATEGORY_COLUMNS),
                        ("time_trans",time_feat,TIME_COLUMNS),
                                ],
                                remainder="passthrough", verbose_feature_names_out=True
                            )),
    ("pandarizer2", FunctionTransformer(lambda x: pd.DataFrame(x, columns = COL_NAMES_AFTER_TRANS)))
                            ],verbose = True)

In [None]:
# Prepare the training and validation sets

# Date boundaries: training runs up to 2017-06-30, validation covers 2017-07-01 to 2017-08-15.
train_end_date = '2017-06-30'
val_start_date = '2017-07-01'
val_end_date = '2017-08-15'

# Apply the same date split to the features and to the target.
X_train, X_val = train_test_split(X, train_end_date, val_start_date, val_end_date)
y_train, y_val = train_test_split(y, train_end_date, val_start_date, val_end_date)

# Fit the preprocessing on the training slice only (y_train is passed along for the fit).
preprocess_pipe.fit(X_train[COL_NAMES_ORIGINAL], y_train)

# Transform both slices with the fitted pipeline; the raw frames are replaced by the outputs.
X_train = preprocess_pipe.transform(X_train[COL_NAMES_ORIGINAL])
X_val = preprocess_pipe.transform(X_val[COL_NAMES_ORIGINAL])

#### Add Lag Features

In [None]:
# Add Lag Features

def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Reframe a time series as a supervised-learning table of shifted columns.

    Parameters
    ----------
    data : list, 1-D array, 2-D array or DataFrame
        Observations in time order, one row per step. A 1-D input is treated
        as a single variable.
    n_in : int, default 1
        Number of lagged steps ``t-n_in … t-1`` emitted as leading columns.
    n_out : int, default 1
        Number of steps ``t … t+n_out-1`` emitted as trailing columns.
    dropnan : bool, default True
        Drop the rows that contain NaN introduced by the shifting.

    Returns
    -------
    pandas.DataFrame
        One column per variable and step, named ``var<j>(t-<i>)``,
        ``var<j>(t)`` or ``var<j>(t+<i>)`` with ``j`` starting at 1.
    """
    df = pd.DataFrame(data)
    # Count the variables on the frame itself: a 1-D array has no shape[1],
    # and a nested list can hold more than one variable.
    n_vars = df.shape[1]
    cols, names = [], []
    # Input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [f'var{j + 1}(t-{i})' for j in range(n_vars)]
    # Forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [f'var{j + 1}(t)' for j in range(n_vars)]
        else:
            names += [f'var{j + 1}(t+{i})' for j in range(n_vars)]
    # Put the shifted copies side by side and label them.
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    if dropnan:
        agg = agg.dropna()
    return agg


In [None]:

# Reframe the normalised sales as (previous row -> current row) pairs.
# NOTE(review): `data` is not defined in the visible part of this file (the frame
# that holds 'sales_normalized' here is train_df) — confirm which object is meant.
# NOTE(review): the frame preview shows several rows per date (one per family), so a
# one-row lag is presumably not "the previous day of the same series" — confirm.
values = data['sales_normalized'].values
data_supervised = series_to_supervised(values, n_in=1)

# Define training and testing sets
# Positional 80/20 split: the first 80% of rows train, the remainder tests.
n_train = int(len(data_supervised) * 0.8)
train = data_supervised.values[:n_train, :]
test = data_supervised.values[n_train:, :]

# Split into input and outputs
# The last column is the prediction target; every column before it is an input.
train_X, train_y = train[:, :-1], train[:, -1]
test_X, test_y = test[:, :-1], test[:, -1]

# Reshape input to be 3D [samples, timesteps, features]
# A single timestep is used, so the middle axis has length 1.
train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))


#### LSTM

In [None]:
# LSTM Model
# One LSTM layer with 50 units followed by a single-output dense layer;
# the input shape is (timesteps, features) taken from the reshaped training array.
model = Sequential()
model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dense(1))
model.compile(loss='mae', optimizer='adam')

# Fit network
# Training stops early once val_loss has not improved for 10 epochs.
# shuffle=False keeps the samples in their original order.
# NOTE(review): the test split doubles as the validation set that drives early
# stopping — confirm that a separate hold-out is not needed for the final score.
early_stopping = EarlyStopping(monitor='val_loss', patience=10)
history = model.fit(train_X, train_y, epochs=100, batch_size=72, validation_data=(test_X, test_y), verbose=2, shuffle=False, callbacks=[early_stopping])

# Evaluate
# Predictions are on the same scale as train_y (no inverse scaling is applied here).
predictions = model.predict(test_X)

In [34]:
import pandas as pd
import numpy as np

# Synthetic example data: 365 consecutive daily timestamps starting 2020-01-01
dates = pd.date_range(start='2020-01-01', periods=365, freq='D')

# Two random integer series, drawn after fixing the NumPy seed so the
# numbers are the same on every run
np.random.seed(0)
sales_data = np.random.randint(100, 500, size=len(dates))
others = np.random.randint(100, 300, size=len(dates))

# Assemble the frame and make 'date' its index in a single expression
df = pd.DataFrame(
    {'date': dates, 'sales': sales_data, 'others': others}
).set_index('date')

In [None]:
import tensorflow as tf

class DataWindow():
    """Outline of the windowing helper; it has no behaviour yet."""

    def __init__(self):
        # Store raw data

        # Define window parameters

        # Placeholder body so that the outline is valid Python.
        pass

In [35]:
import tensorflow as tf

class DataWindow():
    """Slice a time-ordered table into (inputs, labels) windows.

    Each window spans ``input_width + shift`` consecutive rows. The first
    ``input_width`` rows are the model inputs and the last ``label_width``
    rows are the labels.
    """

    def __init__(self, input_width, label_width, shift, data_df, label_columns=None):
        """Store the data and derive the window geometry.

        Parameters
        ----------
        input_width : int
            Number of rows at the start of each window used as inputs.
        label_width : int
            Number of rows at the end of each window used as labels.
        shift : int
            Number of rows added after the inputs to reach the window end.
        data_df : pandas.DataFrame
            Source table; its column order defines ``column_indices``.
        label_columns : list of str, optional
            Columns kept in the labels. ``None`` keeps every column.
        """
        # Store the raw data
        self.data_df = data_df

        # Define the window parameters
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift

        # Define column indices
        self.label_columns = label_columns
        if label_columns is not None:
            self.label_columns_indices = {name: i for i, name in enumerate(label_columns)}
        self.column_indices = {name: i for i, name in enumerate(data_df.columns)}

        self.total_window_size = input_width + shift
        # Labels are the final ``label_width`` rows of the window, so the label
        # slice always has the length that ``set_shape`` asserts later on.
        self.label_start = self.total_window_size - label_width

    def make_dataset(self, data):
        """Return a ``tf.data`` dataset of ``(inputs, labels)`` batches.

        ``data`` is converted to a float32 array, cut into windows of
        ``total_window_size`` rows with stride 1, shuffled, grouped in batches
        of 32 and split by ``split_to_inputs_labels``.
        """
        data = np.array(data, dtype=np.float32)
        ds = tf.keras.preprocessing.timeseries_dataset_from_array(
            data=data,
            targets=None,
            sequence_length=self.total_window_size,
            sequence_stride=1,
            shuffle=True,
            batch_size=32
        )
        ds = ds.map(self.split_to_inputs_labels)
        return ds

    def split_to_inputs_labels(self, window):
        """Split a batch of windows into its input part and its label part.

        ``window`` is indexed as (batch, time, column). All columns are kept
        in the inputs; the labels keep only ``label_columns`` when that list
        was provided.
        """
        inputs = window[:, :self.input_width, :]
        labels = window[:, self.label_start:, :]
        # Without a column selection every column stays in the labels.
        if self.label_columns is not None:
            labels = tf.stack(
                [labels[:, :, self.column_indices[name]] for name in self.label_columns],
                axis=-1)
        # Slicing drops the static shape information; restore the known time dimension.
        inputs.set_shape([None, self.input_width, None])
        labels.set_shape([None, self.label_width, None])
        return inputs, labels

# Instantiate the DataWindow class
input_width = 30  # Number of consecutive rows (days) fed to the model as inputs
label_width = 1   # Number of rows (days) the labels are expected to span
shift = 1         # Rows added after the inputs; total window length = input_width + shift = 31
label_columns = ['sales']  # Only this column is kept in the labels

data_window = DataWindow(input_width, label_width, shift, df, label_columns)

# Create the TensorFlow datasets
# The same frame is passed again here; make_dataset windows the data it receives.
dataset = data_window.make_dataset(df)

In [36]:
# Take one batch from the dataset
# and print its input windows and labels as NumPy arrays for a visual check.
for inputs, labels in dataset.take(1):
    print("Inputs: \n", inputs.numpy())
    print("Labels: \n", labels.numpy())

Inputs: 
 [[[457. 229.]
  [496. 233.]
  [144. 298.]
  ...
  [163. 297.]
  [116. 291.]
  [206. 194.]]

 [[131. 235.]
  [476. 122.]
  [357. 179.]
  ...
  [243. 261.]
  [248. 215.]
  [327. 153.]]

 [[322. 176.]
  [223. 118.]
  [182. 213.]
  ...
  [471. 279.]
  [284. 281.]
  [177. 297.]]

 ...

 [[318. 193.]
  [464. 214.]
  [359. 261.]
  ...
  [229. 234.]
  [309. 166.]
  [468. 192.]]

 [[199. 217.]
  [277. 134.]
  [343. 151.]
  ...
  [373. 170.]
  [435. 138.]
  [488. 267.]]

 [[468. 238.]
  [301. 204.]
  [483. 191.]
  ...
  [186. 286.]
  [143. 136.]
  [460. 199.]]]
Labels: 
 [[[264.]]

 [[379.]]

 [[386.]]

 [[183.]]

 [[385.]]

 [[375.]]

 [[496.]]

 [[236.]]

 [[268.]]

 [[199.]]

 [[193.]]

 [[236.]]

 [[487.]]

 [[127.]]

 [[441.]]

 [[125.]]

 [[247.]]

 [[181.]]

 [[447.]]

 [[476.]]

 [[386.]]

 [[353.]]

 [[187.]]

 [[414.]]

 [[282.]]

 [[440.]]

 [[263.]]

 [[365.]]

 [[123.]]

 [[391.]]

 [[205.]]

 [[111.]]]
