In [None]:
"""
Train an LSTM network on a time series and export its forecasts.
Author: Valentin Todorov
"""

# Import the packages I'll need
import os

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras import backend as K
from keras import optimizers

from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error


### Provide input values to parameters
# Location of the input CSV. The original hard-coded directory stays the default;
# set the LSTM_DATA_DIR environment variable to point the notebook at another folder.
# os.path.join(..., "") guarantees a trailing separator, which matters because the
# file paths in this notebook are built by plain string concatenation.
cwd = os.getcwd()                                           # not referenced by the cells visible here
dataLocation = os.path.join(
    os.environ.get("LSTM_DATA_DIR", "/Users/valentin/Google Drive/Data/"), "")
inputFile = "file_for_lstm_model_logdiff"

# Read in the data
inputDf = pd.read_csv(dataLocation + inputFile + ".csv")

# Remove the forecasts from the data (this step was left disabled)
# inputData.Price[[inputData.Date > 2017-01-01 00:00:00]] = 0

# Format "date" field as date.
# Assign to a single column label: assigning a 1-D Series to a list-of-columns key
# (inputDf[['date']] = ...) is rejected by recent pandas releases with
# "Columns must be same length as key".
inputDf['date'] = pd.to_datetime(inputDf['date'])

# Convert the data frame to a Numpy array (every column, including 'date')
targetArr = inputDf.iloc[:, 0:].values

# Select random seed
randSeed = 7896

# Provide names of input features
inputDataFrame = inputDf
inputData = targetArr

# Create a Numpy array from the input data: drop the first column so that the
# array holds the target in column 0 followed by the predictors.
# NOTE(review): presumably the dropped first column is 'date' - confirm against the CSV layout.
dataframe = inputData[:, 1:]
# xVarColumns = [1, 2, 3]                                   # Select features: BY DEFAULT, it uses all features in columns 1:END for predictors
yVarColumns = [0]                                           # Select target: The target should always be in the first column
number_of_features = len(list(inputDf)) - 2                 # All columns of inputDf except the dropped first column and the target


In [None]:
########################################################
# Sanity checks of the data
########################################################

# Report how many predictors feed the network and the shape of the modelling
# array, then preview the first rows of the raw frame.
shape_caption = "\nThe analytical array shape is (includes the target):"
print(number_of_features, shape_caption, dataframe.shape)
inputDf.head(n = 10)


In [None]:
########################################################
# Transform the data
########################################################

# Fix random seed for reproducibility
# NOTE(review): this seeds NumPy's global RNG only; whether that is enough to make
# the Keras training run reproducible depends on the backend in use - TODO confirm.
np.random.seed(randSeed)

# Split the data in train and validation.
# trainSize is the number of records used in the training set (03/2000 - 12/2013)
trainSize = 166
if not 0 < trainSize < len(dataframe):
    raise ValueError(
        "trainSize (%d) must leave rows for both training and validation; "
        "the input has %d rows" % (trainSize, len(dataframe)))

# Normalize the data between 0 - 1.
# The scaler is fitted on the training rows ONLY and then applied to the validation
# rows. Fitting it on the full array would let the min/max of the validation period
# leak into the training inputs and make the validation error look better than it is.
# Validation values may therefore fall slightly outside [0, 1].
scaler = MinMaxScaler(feature_range = (0, 1), copy = True)
train = scaler.fit_transform(dataframe[0:trainSize, :])
validate = scaler.transform(dataframe[trainSize:, :])

# Full scaled array (train rows followed by validation rows), kept for later cells
dataset = np.vstack((train, validate))

# Column 0 is the target, the remaining columns are the predictors
trainX = train[:, 1:]
validateX = validate[:, 1:]

trainY = train[:, yVarColumns]
validateY = validate[:, yVarColumns]

dataframe_length = len(trainY)

## Modify the data for the LSTM network - The LSTM network expects the input data (X)
# to be provided with a specific array structure in the form of: [samples, time steps, features].
# Every row becomes one sample with a single time step.
trainX = trainX.reshape(trainX.shape[0], 1, trainX.shape[1])
validateX = validateX.reshape(validateX.shape[0], 1, validateX.shape[1])


In [None]:
########################################################
# Sanity checks of the data
########################################################

# Row counts of the training split, the validation split and the full scaled array
row_counts = (train.shape[0], validate.shape[0], dataset.shape[0])
print(*row_counts)

# Shape of the LSTM input: (samples, time steps, features)
print(np.shape(trainX))

# Show the second column of the scaled training rows
train[:, 1]


In [None]:
########################################################
# Create model
########################################################

## The LSTM network expects the input data (X) to be provided with a specific
# array structure in the form of: [samples, time steps, features]
# Define the network: one LSTM layer (10 units) fed one time step of
# number_of_features values, light dropout, and a single linear output unit.

modelFit = Sequential()
modelFit.add(LSTM(10,
                  #activation = 'sigmoid',
                  input_shape = (1, number_of_features)))
modelFit.add(Dropout(.05))
#modelFit.add(Dense(10, activation = 'relu'))
modelFit.add(Dense(1, activation = 'linear'))

# Before training the model, configure the learning process via the compile method
# Default settings of the optimization algorithms: https://keras.io/optimizers/
# sgd = optimizers.SGD(lr=0.01, momentum=0.0, decay=0.0, nesterov=False)
# adagrad = optimizers.Adagrad(lr=0.01, epsilon=1e-08, decay=0.0)
# NOTE(review): the `lr` / `decay` argument names are kept as written; newer Keras
# releases are believed to rename them - confirm against the installed version.
adam = optimizers.Adam(lr = 0.001, beta_1 = 0.9, beta_2 = 0.999, epsilon = 1e-08, decay = 0.0)

#sgd = optimizers.SGD(lr = 0.001, momentum = 0.0, decay = 0.0, nesterov = False)
# This is a regression problem (continuous target, MSE loss), so classification
# accuracy is not a meaningful metric. Track mean absolute error alongside the loss.
modelFit.compile(optimizer = adam,
                 loss = 'mean_squared_error',
                 metrics = ['mae'])

# summary() prints the table itself and returns None, so it is not wrapped in print()
modelFit.summary()

# Train the model; the validation set is only used to report the error after each epoch
modelEstimate = modelFit.fit(trainX, trainY,
                             epochs = 50,
                             batch_size = 50,
                             verbose = 1,
                             validation_data = (validateX, validateY))

# make predictions (still on the scaled 0-1 range)
trainPredict = modelFit.predict(trainX)
validatePredict = modelFit.predict(validateX)

# print the training and validation loss/metric recorded at each epoch
# print the number of layers of the network
print(modelEstimate.history)
print(len(modelFit.layers))


In [None]:
########################################################
# Accuracy evaluation of results
########################################################

# Invert the scaling. The scaler works on the full column layout (target followed
# by predictors), so the predictions are placed in the target's position next to
# the scaled predictors before the transform is reversed.
df_train = np.concatenate((trainPredict, train[:, 1:]), axis = 1)
df_validate = np.concatenate((validatePredict, validate[:, 1:]), axis = 1)

trainPredict2 = scaler.inverse_transform(df_train)
validatePredict2 = scaler.inverse_transform(df_validate)

# Plot the per-epoch training and validation loss (MSE)
fig, ax = plt.subplots()
for history_key in ('loss', 'val_loss'):
    ax.plot(modelEstimate.history[history_key])
ax.set_title('Model Error History')
ax.set_ylabel('Mean Squared Error')
ax.set_xlabel('Epochs')
ax.legend(['Training Error', 'Validation Error'])
plt.show()


In [None]:
########################################################
# Output final results
########################################################

# Column names for the forecast frame: the input columns without the first one,
# with the target's name replaced by the forecast label
names_list = list(inputDataFrame)[1:]
names_list[0] = 'lstm_forecast_price'

# Combine the final dataset - stack the training rows on top of the validation
# rows with a fresh 0..n-1 index, then apply the column names
combined_dataframe = pd.concat(
    [pd.DataFrame(trainPredict2), pd.DataFrame(validatePredict2)],
    ignore_index = True)
combined_dataframe.columns = names_list

# Actual (unscaled) target values taken from the first column of the modelling array
actual_value_target = pd.DataFrame(dataframe[:, 0], columns = ['actual_price'])

# Output the forecasts. Put the actual values next to the forecasts and write
# the result to a CSV file beside the input file
final_forecast_file = pd.concat([actual_value_target, combined_dataframe], axis = 1)
forecast_csv_path = dataLocation + inputFile + "_withForecasts" + ".csv"
final_forecast_file.to_csv(forecast_csv_path, sep = ',')
