https://machinelearningmastery.com/time-series-forecasting-long-short-term-memory-network-python/

## Set up 

In [0]:
%pip install keras

In [0]:
%restart_python

In [0]:
spark.sql('use catalog dbacademy')
spark.sql('use schema labuser9128531_1738705451')

In [0]:
path='/Volumes/dbacademy/labuser9128531_1738705451/air/shampoo.csv'
# Read the CSV file into a DataFrame
df = spark.read.csv(path, header=True, inferSchema=True)

In [0]:

# load and plot dataset
import pandas as pd
import numpy as np
from pandas import datetime
from matplotlib import pyplot

In [0]:
df=df.toPandas()
df.info()

In [0]:
df.head(5)

In [0]:
# Prefix every 'Month' label with '190'; the resulting strings are parsed
# later with the '%Y-%m' format.
date = ['190' + label[0] + label[1:] for label in df['Month']]
print(date)

In [0]:
date_df = pd.DataFrame(date)
df = pd.concat([df, date_df], axis=1)

In [0]:
df.head(5)

In [0]:
df.columns

In [0]:
df.rename(columns={0: 'Date'}, inplace=True)

In [0]:
df.columns

In [0]:
df['Date']=pd.to_datetime(df['Date'], format='%Y-%m')

In [0]:
df.info()

In [0]:
df.drop('Month', axis=1, inplace=True)

In [0]:
df.shape

In [0]:
df.head(5)

In [0]:
from matplotlib import pyplot as plt
plt.plot(df['Date'], df['Sales'])

## Step-by-step breakdown

### Train and test setup

In [0]:

# split data into train and test
series=df['Sales']
X = series.values
train, test = X[0:-12], X[-12:]

In [0]:
train.shape, test.shape

### Walk-forward model validation: current month will be used to predict the next month
A rolling forecast scenario will be used, also called walk-forward model validation.
Each time step of the test dataset will be walked one at a time. A model will be used to make a forecast for the time step, then the actual expected value from the test set will be taken and made available to the model for the forecast on the next time step.
This mimics a real-world scenario where new Shampoo Sales observations would be available each month and used in the forecasting of the following month.

Finally, all forecasts on the test dataset will be collected and an error score calculated to summarize the skill of the model. The root mean squared error (RMSE) will be used as it punishes large errors and results in a score that is in the same units as the forecast data, namely monthly shampoo sales.


In [0]:
# walk-forward validation with a persistence forecast
history = list(train)  # starts out holding the training observations only
print(history)
predictions = []
for observed in test:
    # the forecast for this step is the most recent value seen so far
    predictions.append(history[-1])
    # reveal the true observation so the next step can use it
    history.append(observed)
print(history)

In [0]:
predictions #start from the last element of the training data and go to the test data, the last one is the second last of the test data

### Performance

In [0]:
from sklearn.metrics import mean_squared_error
from math import sqrt
from matplotlib import pyplot

In [0]:
test[:5], predictions[:5]

In [0]:
# report performance
rmse = sqrt(mean_squared_error(test, predictions))
print('RMSE: %.3f' % rmse)
# line plot of observed vs predicted
pyplot.plot(test)
pyplot.plot(predictions)
pyplot.show()

### Establish baseline performance

https://machinelearningmastery.com/persistence-time-series-forecasting-with-python/

## LSTM

### Test all functions for data Prep

#### Transform to a supervised learning problem
The LSTM model in Keras assumes that your data is divided into input (X) and output (y) components.
For a time series problem, we can achieve this by using the observation from the last time step (t-1) as the input and the observation at the current time step (t) as the output.

In [0]:
from pandas import read_csv
from pandas import datetime
from pandas import DataFrame
from pandas import concat

# frame a sequence as a supervised learning problem
def timeseries_to_supervised(data, lag=1):
    """Frame a sequence as a supervised-learning table.

    The result has one column per lag step (the data shifted down by
    1..``lag`` rows) followed by the unshifted data, so each row pairs
    earlier values with the current value. With the default ``lag=1`` the
    previous value is the input and the current value is the output.
    Cells left empty by the shift are filled with 0.
    """
    frame = DataFrame(data)
    # one shifted copy per lag step, nearest lag first
    shifted = [frame.shift(step) for step in range(1, lag + 1)]
    # lag columns on the left, the current observation on the right
    table = concat([*shifted, frame], axis=1)
    return table.fillna(0)

In [0]:
df.head(5)

In [0]:
# transform to supervised learning
X = df.Sales.values
supervised = timeseries_to_supervised(X, 1)
print(supervised.head(),type(supervised))

#### Transform to Stationary
The Shampoo Sales dataset is not stationary. This means that there is a structure in the data that is dependent on the time. Specifically, there is an increasing trend in the data.

Stationary data is easier to model and will very likely result in more skillful forecasts.
The trend can be removed from the observations, then added back to forecasts later to return the prediction to the original scale and calculate a comparable error score.
A standard way to remove a trend is by differencing the data. That is, the observation from the previous time step (t-1) is subtracted from the current observation (t). This removes the trend and we are left with a difference series, or the changes to the observations from one time step to the next.

In [0]:
df.set_index('Date', inplace=True)

In [0]:
df.head(5)

In [0]:
df.Sales[:5]

In [0]:
from pandas import Series
# create a differenced series
def difference(dataset, interval=1):
    """Return the differenced series ``dataset[i] - dataset[i - interval]``.

    Args:
        dataset: Observations in time order. Objects exposing ``.iloc``
            (e.g. a pandas Series) are used as-is and indexed by position;
            any other sequence (list, NumPy array) is wrapped in a Series
            first so it can be indexed the same way.
        interval: Distance, in positions, between the two observations
            being subtracted.

    Returns:
        A Series holding ``len(dataset) - interval`` differences, with a
        fresh default index.
    """
    # Previously `.iloc` was required, so a plain list/array raised
    # AttributeError even though the arithmetic is container-agnostic.
    data = dataset if hasattr(dataset, "iloc") else Series(dataset)
    diff = [
        data.iloc[i] - data.iloc[i - interval]
        for i in range(interval, len(data))
    ]
    print(diff)
    return Series(diff)

In [0]:
# transform to be stationary
differenced = difference(df.Sales, 1)
print(differenced.head())

In [0]:
differenced.head()

In [0]:
len(df),len(differenced)

In [0]:
# invert differenced value
def inverse_difference(history, yhat, interval=1):
    """Undo one differencing step.

    Adds ``yhat`` (a differenced value) to the observation located
    ``interval`` positions from the end of ``history``.

    Args:
        history: Observations in time order (list, NumPy array, or pandas
            Series/DataFrame).
        yhat: The differenced value to convert back.
        interval: How many positions back from the end of ``history`` the
            base observation sits.

    Returns:
        ``yhat`` plus the selected observation.
    """
    # On a pandas object, `history[-interval]` is a *label* lookup: it raises
    # KeyError on an integer index and only works on other indexes through a
    # deprecated positional fallback. `.iloc` is always positional.
    if hasattr(history, "iloc"):
        return yhat + history.iloc[-interval]
    return yhat + history[-interval]

In [0]:
# invert transform: rebuild each observation from its difference by adding
# back the observation that precedes it in df.Sales
n_obs = len(df.Sales)
inverted = Series([
    inverse_difference(df.Sales, delta, n_obs - pos)
    for pos, delta in enumerate(differenced)
])
print(inverted.head(), len(inverted))

#### Transform to Scale
Like other neural networks, LSTMs expect data to be within the scale of the activation function used by the network.

The default activation function for LSTMs is the hyperbolic tangent (tanh), which outputs values between -1 and 1. This is the preferred range for the time series data.

To make the experiment fair, the scaling coefficients (min and max) values must be calculated on the training dataset and applied to scale the test dataset and any forecasts. This is to avoid contaminating the experiment with knowledge from the test dataset, which might give the model a small edge.

`MinMaxScaler` transforms the data to the range [-1, 1].

In [0]:
df.values[:5]

In [0]:
from sklearn.preprocessing import MinMaxScaler
# transform scale
X = df.values
X = X.reshape(len(X), 1)
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(X)
scaled_X = scaler.transform(X)

In [0]:
scaled_X[:5]

In [0]:
# invert transform
inverted_X = scaler.inverse_transform(scaled_X)
print(inverted_X[:5])
inverted_series = Series(inverted_X[:, 0])
print(inverted_series.head())

### LSTM development
The Long Short-Term Memory network (LSTM) is a type of Recurrent Neural Network (RNN).

A benefit of this type of network is that it can learn and remember over long sequences and does not rely on a pre-specified window of lagged observations as input.
In Keras, this is referred to as stateful, and **involves setting the “stateful” argument to “True” when defining an LSTM layer**.

By default, an LSTM layer in Keras maintains state between data within one batch. A batch of data is a fixed-sized number of rows from the training dataset that defines how many patterns to process before updating the weights of the network. **State in the LSTM layer between batches is cleared by default, therefore we must make the LSTM stateful.** This gives us fine-grained control over when state of the LSTM layer is cleared, by calling the reset_states() function.

The LSTM layer expects input to be in a matrix with the dimensions: [samples, time steps, features].

`Samples`: These are independent observations from the domain, typically rows of data.
`Time steps`: These are separate time steps of a given variable for a given observation.
`Features`: These are separate measures observed at the time of observation.

In [0]:
from pandas import DataFrame
from pandas import Series
from pandas import concat
from pandas import read_csv
from pandas import datetime
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from math import sqrt
from matplotlib import pyplot
import numpy
 

In [0]:
df.head(5)

In [0]:
raw_values=df.values
raw_values[:5]

In [0]:
diff_values = difference(df.Sales, 1)

In [0]:
diff_values[:5], type(diff_values), len(diff_values)

In [0]:
supervised = timeseries_to_supervised(diff_values, 1)
supervised_values = supervised.values

In [0]:
supervised_values[:5], type(supervised_values), len(supervised_values)

In [0]:
train, test = supervised_values[0:-12], supervised_values[-12:]

In [0]:
train.shape, test.shape

In [0]:
type(train), type(test)

In [0]:
train[:5]

In [0]:
# scale train and test data to [-1, 1]
def scale(train, test):
    """Scale ``train`` and ``test`` to [-1, 1] with a scaler fitted on ``train``.

    The scaler is fitted on the training array only and then applied to
    both arrays. The shape of the training array is printed before and
    after the reshape, and once more after scaling.

    Returns:
        ``(scaler, train_scaled, test_scaled)``.
    """
    # fit on the training rows only
    fitted = MinMaxScaler(feature_range=(-1, 1)).fit(train)

    # training rows: re-assert the (rows, columns) layout, then scale
    print(train.shape)
    n_rows, n_cols = train.shape[0], train.shape[1]
    train = train.reshape(n_rows, n_cols)
    print(train.shape)
    train_scaled = fitted.transform(train)
    print(train_scaled.shape)

    # test rows: same layout step, scaled with the train-fitted scaler
    test = test.reshape(test.shape[0], test.shape[1])
    test_scaled = fitted.transform(test)
    return fitted, train_scaled, test_scaled

scaler, train_scaled, test_scaled = scale(train, test)

In [0]:
train_scaled[:5]

In [0]:
# train_scaled is 2-D: every column but the last is an input feature,
# the last column is the target.
X_train = train_scaled[:, :-1]
y_train = train_scaled[:, -1]
print(X_train.shape, y_train.shape)

# Insert a length-1 time-step axis: (samples, features) -> (samples, 1, features)
train_rows, train_feats = X_train.shape
X_train = X_train.reshape(train_rows, 1, train_feats)
print(X_train.shape, y_train.shape)

In [0]:
# test_scaled is 2-D: every column but the last is an input feature,
# the last column is the target.
X_test = test_scaled[:, :-1]
y_test = test_scaled[:, -1]
print(X_test.shape, y_test.shape)

# Insert a length-1 time-step axis: (samples, features) -> (samples, 1, features)
test_rows, test_feats = X_test.shape
X_test = X_test.reshape(test_rows, 1, test_feats)
print(X_test.shape, y_test.shape)

In [0]:
# Verify the shape of training and testing data
print(f"X shape: {X_train.shape}")  # Should be (23, 1, 1)
print(f"y shape: {y_train.shape}")  # Should be (23,)
print(f"X_test shape: {X_test.shape}")  # Should be (num_samples_test, 1, 1)
print(f"y_test shape: {y_test.shape}")  # Should be (num_samples_test,)

In [0]:
assert not np.any(np.isnan(X_train)), "X contains NaN values"
assert not np.any(np.isnan(y_train)), "y contains NaN values"
assert not np.any(np.isnan(X_test)), "X_test contains NaN values"
assert not np.any(np.isnan(y_test)), "y_test contains NaN values"

In [0]:
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.layers import Input
from keras.callbacks import EarlyStopping
def fit_lstm(Xtrain, ytrain, batch_size, nb_epoch, neurons):
    """Build and train a stateful single-layer LSTM regressor.

    Args:
        Xtrain: Training inputs shaped (samples, time steps, features).
        ytrain: Training targets, one per sample.
        batch_size: Batch size fixed into the model's input shape and used
            for every training step.
        nb_epoch: Number of passes over the training data.
        neurons: Number of units in the LSTM layer.

    Returns:
        The trained Keras ``Sequential`` model.
    """
    model = Sequential()
    # A stateful LSTM needs the complete batch shape up front. The feature
    # count is read from the data instead of being hard-coded to 1.
    model.add(Input(batch_shape=(batch_size, Xtrain.shape[1], Xtrain.shape[2])))
    # NOTE: return_sequences=True is kept on purpose; it fixes the rank of
    # the prediction that the forecasting code indexes into.
    model.add(LSTM(units=neurons, return_sequences=True, stateful=True))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line.
    model.summary()
    # No EarlyStopping callback here: it was configured to monitor
    # 'val_loss', but no validation data is passed to fit(), and every
    # fit(epochs=1) call starts the callback from scratch, so it could never
    # stop training or restore weights -- it only emitted a warning per epoch.
    for _ in range(nb_epoch):
        # One epoch at a time, unshuffled, so the LSTM state can be cleared
        # manually between passes over the series.
        model.fit(Xtrain, ytrain, epochs=1, batch_size=batch_size,
                  verbose=1, shuffle=False)
        # model.layers[0] is the LSTM layer
        model.layers[0].reset_states()
    return model


In [0]:
model1=fit_lstm(X_train, y_train,1, 50, 32)

In [0]:
# # fit an LSTM network to training data
# from keras.models import Sequential
# from keras.layers import LSTM, Dense
# from keras.layers import Input
# from keras.callbacks import EarlyStopping
#  # define model
# model = Sequential()
# # Use Input layer to define the input shape
# batch_size=1
# model.add(Input(batch_shape=(batch_size, X.shape[1], 1)))
# # Add LSTM layer with stateful=True for time series
# model.add(LSTM(units=32, return_sequences=True, stateful=True))
# model.add(Dense(1))
# model.compile(loss='mean_squared_error', optimizer='adam')
# print(model.summary())

In [0]:
# from keras.callbacks import EarlyStopping
# early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [0]:
# try:
#     history = model.fit(X_train, y_train, epochs=50, batch_size=1, validation_data=(X_test, y_test), verbose=1, callbacks=[early_stop], shuffle=False)
# except Exception as e:
#     print(f"Error during training: {e}")

#### Forecast

In [0]:
# make a one-step forecast
def forecast_lstm(model, batch_size, X):
    """Make a one-step forecast for a single sample.

    Args:
        model: Object exposing ``predict(X, batch_size=...)``.
        batch_size: Batch size forwarded to ``model.predict``.
        X: One sample as a NumPy array. A 1-D array is treated as the
            features of a single time step; a 2-D array is treated as
            (time steps, features).

    Returns:
        ``yhat[0, 0]`` of the prediction returned by ``model.predict``.
    """
    if X.ndim == 2:
        # Already (time steps, features): just add the leading sample axis.
        # reshape(1, 1, len(X)) would move the time-step count into the
        # feature axis.
        X = X.reshape(1, X.shape[0], X.shape[1])
    else:
        X = X.reshape(1, 1, len(X))
    yhat = model.predict(X, batch_size=batch_size)
    return yhat[0, 0]

In [0]:
len(test_scaled)

In [0]:
# inverse scaling for a forecasted value
def invert_scale(scaler, X, value):
    """Map a scaled forecast back to the original scale.

    The scaler's inverse transform works on whole rows, so the input
    features ``X`` and the forecast ``value`` are joined into one row, the
    row is inverse-transformed, and the last entry is returned.

    Args:
        scaler: Object exposing ``inverse_transform(array_2d)``.
        X: Scaled input features for the sample (any shape; flattened).
        value: Scaled forecast (scalar or array; flattened).

    Returns:
        The last element of the inverse-transformed row.
    """
    # Flatten both parts before joining. Building the row from a Python list
    # of mixed-rank items (e.g. a 2-D X row plus a scalar value) makes
    # numpy.array() fail with an "inhomogeneous shape" error.
    row = numpy.concatenate([numpy.ravel(X), numpy.ravel(value)])
    inverted = scaler.inverse_transform(row.reshape(1, -1))
    return inverted[0, -1]

In [0]:
# walk-forward validation on the test data
predictions = list()
for i in range(len(test_scaled)):
    # make one-step forecast with the model trained above; it is bound to
    # `model1` (the bare name `model` is never defined and raised NameError)
    yhat = forecast_lstm(model1, 1, X_test[i])

    # invert scaling
    yhat = invert_scale(scaler, X_test[i], yhat)

    # invert differencing: the offset selects the raw observation that
    # immediately precedes the value being forecast
    yhat = inverse_difference(raw_values, yhat, len(test_scaled)+1-i)

    # store forecast
    predictions.append(yhat)
    expected = raw_values[len(train) + i + 1]
    # yhat/expected may be 1-element arrays; pull out the scalar explicitly
    # rather than relying on implicit array-to-float conversion in '%f'
    print('Month=%d, Predicted=%f, Expected=%f'
          % (i+1, numpy.ravel(yhat)[0], numpy.ravel(expected)[0]))

In [0]:
len(predictions)

In [0]:
# report performance
rmse = sqrt(mean_squared_error(raw_values[-12:], predictions))
print('Test RMSE: %.3f' % rmse)

In [0]:
# line plot of observed vs predicted
pyplot.plot(raw_values[-12:])
pyplot.plot(predictions)
pyplot.show()

In [0]:
# repeat experiment: train from scratch `repeats` times and collect one
# test RMSE per run
repeats = 30
error_scores = list()
for r in range(repeats):
    # fit the model
    lstm_model = fit_lstm(X_train, y_train, 1, 3000, 4)  # 3000 epochs, 4 neurons

    # walk-forward validation on the test data
    predictions = list()
    for i in range(len(test_scaled)):
        # make one-step forecast
        yhat = forecast_lstm(lstm_model, 1, X_test[i])

        # invert scaling
        yhat = invert_scale(scaler, X_test[i], yhat)

        # invert differencing
        yhat = inverse_difference(raw_values, yhat, len(test_scaled)+1-i)

        # store forecast
        predictions.append(yhat)

    # Score once per repeat, after all forecasts are collected. Inside the
    # inner loop this compared 12 targets against a partial prediction list
    # (a length-mismatch error on the first step).
    rmse = sqrt(mean_squared_error(raw_values[-12:], predictions))
    print('%d) Test RMSE: %.3f' % (r+1, rmse))
    error_scores.append(rmse)

In [0]:
# summarize results
results = DataFrame()
results['rmse'] = error_scores
print(results.describe())
results.boxplot()
pyplot.show()