# Develop an LSTM model for time series prediction

In [1]:
# interactive figures
%matplotlib widget 

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# model evaluation metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# data preprocessing
from sklearn.preprocessing import normalize

# predictive models
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.neural_network import MLPRegressor

# keras stuff
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

In [2]:
class time_series_prediction():
    """Frame a 1-D time series as a supervised-learning problem and compare models.

    Intended call order:
        1. ``sliding_window_1``  - build lagged inputs and one-step-ahead targets
        2. ``train_test_split``  - chronological split of the windowed data
        3. one or more of ``linear_regression``, ``support_vector_machine``,
           ``neural_net_mlp``, ``lstm``, ``naive_model``
        4. ``vis_results_time_series`` / ``vis_results_scatter``

    The model methods rely on names imported at notebook level
    (``LinearRegression``, ``LinearSVR``, ``MLPRegressor``, ``Sequential``,
    ``LSTM``, ``Dense``, ``mean_squared_error``, ``mean_absolute_error``,
    ``plt``, ``sns``).
    """

    def __init__(self,dates,one_d_time_series,lag_window_length,n_ahead_prediction):
        """Store the raw series and the windowing settings.

        Parameters
        ----------
        dates : sequence
            Time stamp / date for each data point (same length as the series).
        one_d_time_series : sequence of numbers
            The time series values.
        lag_window_length : int
            Number of past values used as input features.
        n_ahead_prediction : int
            Steps ahead to predict. Stored only; ``sliding_window_1`` always
            builds a one-step-ahead target.
        """
        # raw input data + settings for time series -> supervised learning ML problem
        self.one_d_time_series = np.array(one_d_time_series)      # converted to an array so positional slicing behaves the same for lists and pandas Series
        self.time_series_dates = np.array(dates)                  # time stamp / date for each data point
        self.lag_window_length = lag_window_length                # length of lag window
        self.n_ahead_prediction = n_ahead_prediction              # time ahead to predict

        # transformed data: set after calling .sliding_window_1()
        self.input_data = None
        self.target_data = None

        # testing and training data: set after calling .train_test_split()
        self.training_split = None
        self.X_test = None
        self.X_train = None
        self.y_test = None
        self.y_train = None

        # predictions from various models - set after calling each model's training method
        self.linear_reg_predictions = None
        self.svm_predictions = None
        self.neural_net_predictions = None
        self.lstm_predictions = None
        self.naive_predictions = None

        # cumprod results from predictions - set after calling .vis_results_time_series(second_plot='cumprod')
        self.real_vals_cumprod = None
        self.linear_reg_predictions_cumprod = None
        self.svm_predictions_cumprod = None
        self.neural_net_predictions_cumprod = None

    # ****************************************************************************************************************
    # data wrangling
    # ****************************************************************************************************************

    def sliding_window_1(self,verbose=0):
        """Transform the 1-D series into lagged inputs and one-step-ahead targets.

        Row ``i`` of ``self.input_data`` is ``series[i : i + lag]`` and
        ``self.target_data[i]`` is ``series[i + lag]``. Both are stored as floats.

        Parameters
        ----------
        verbose : int
            When equal to 1, print every ``inputs : target`` pair.

        Raises
        ------
        ValueError
            If the lag window is negative or longer than the series.
        """
        lag = self.lag_window_length
        num_rows = len(self.one_d_time_series) - lag
        if lag < 0 or num_rows < 0:
            raise ValueError(
                'lag_window_length (%s) must be between 0 and the series length (%s)'
                % (lag, len(self.one_d_time_series)))

        # index matrix: row i selects series[i], ..., series[i + lag]
        window_index = np.arange(num_rows)[:, None] + np.arange(lag + 1)[None, :]
        windows = self.one_d_time_series[window_index].astype(float)

        if verbose == 1:
            # show pattern
            for row in windows:
                print(row[0:lag],' : ',row[lag])

        # save results as class attributes: first `lag` columns are inputs, last column is the target
        self.input_data = windows[:,0:lag]
        self.target_data = windows[:,lag]

    def train_test_split(self,split):
        """Split the windowed data chronologically.

        The first ``split`` windows become the training set, the rest the test set.

        Raises
        ------
        RuntimeError
            If ``sliding_window_1`` has not been called yet.
        """
        if self.input_data is None or self.target_data is None:
            raise RuntimeError('call sliding_window_1() before train_test_split()')

        self.training_split = split
        self.X_train = self.input_data[0:split,:]
        self.X_test = self.input_data[split:,:]
        self.y_train = self.target_data[0:split]
        self.y_test = self.target_data[split:]

    def _test_dates(self):
        """Return the dates that line up with ``self.y_test``.

        Window ``i`` targets ``series[i + lag]``, so the test targets start
        ``lag`` points after the split index.
        """
        return self.time_series_dates[self.training_split+self.lag_window_length:]

    def test_train_plot(self):
        """Plot the raw series coloured by split, with the windowed test targets overlaid."""
        fig, ax = plt.subplots(figsize=(10,5))
        ax.plot(self.time_series_dates[0:self.training_split] ,self.one_d_time_series[0:self.training_split],'k-',label='Training data')
        ax.plot(self.time_series_dates[self.training_split:] ,self.one_d_time_series[self.training_split:],'r-',label='Testing data')
        # the windowed targets begin one lag window after the split, hence the shifted dates
        ax.plot(self._test_dates() ,self.y_test,'o',label='Windowed testing data')
        ax.legend(loc=0)
        ax.set_xticks([self.time_series_dates[x] for x in range(0,len(self.time_series_dates),150)])
        ax.tick_params(rotation=30)
        plt.tight_layout()

    # ****************************************************************************************************************
    # predictive models
    # ****************************************************************************************************************

    def _print_errors(self,predictions):
        """Print RMSE and MAE of ``predictions`` against ``self.y_test``."""
        mse = mean_squared_error(self.y_test,predictions)
        mae = mean_absolute_error(self.y_test,predictions)

        print('RMSE: ',np.sqrt(mse))
        print('MAE: ',mae)

    def linear_regression(self):
        """Fit a linear regression on the training windows and store test predictions."""
        print('Training multivariate linear regression:')
        # train model
        reg_model = LinearRegression().fit(self.X_train,self.y_train)
        print('\nLinear regression coefficients: \n',reg_model.coef_)

        # test model and evaluate
        predictions = reg_model.predict(self.X_test)
        self._print_errors(predictions)

        # save predictions
        self.linear_reg_predictions = predictions

    def support_vector_machine(self):
        """Fit a linear support vector regressor and store test predictions."""
        print('\nTraining support vector machine:')
        # train model
        svm_regres = LinearSVR(max_iter=1000,C=0.5).fit(self.X_train,self.y_train)

        # predict and evaluate
        svm_predictions = svm_regres.predict(self.X_test)
        self._print_errors(svm_predictions)

        # save predictions
        self.svm_predictions = svm_predictions

    def neural_net_mlp(self,verbose=0):
        """Fit an MLP with one hidden layer of ``2 * lag_window_length`` units.

        Parameters
        ----------
        verbose : int or bool
            Forwarded to ``MLPRegressor(verbose=...)``.
        """
        print('\nTraining neural network: ')
        # train neural network (trailing comma makes the layer spec an explicit 1-tuple)
        nn_regres = MLPRegressor(hidden_layer_sizes=(int(self.lag_window_length*2),),shuffle=False,random_state=1,
                                max_iter=1000,verbose=verbose).fit(self.X_train,self.y_train)

        # make predictions and evaluate
        nn_predictions = nn_regres.predict(self.X_test)
        self._print_errors(nn_predictions)

        # save predictions
        self.neural_net_predictions = nn_predictions

    def lstm(self,epochs=100,batch_size=1,units=4,verbose=2):
        """Fit a single-layer LSTM on this instance's training windows.

        Each window is presented as one timestep with ``lag_window_length``
        features. Test predictions are stored as a 1-D array so they align
        with ``self.y_test`` in the plotting and error helpers.

        Parameters
        ----------
        epochs, batch_size, verbose :
            Forwarded to ``model.fit``.
        units : int
            Number of LSTM units.
        """
        print('\nTraining LSTM: ')

        # reshape to (samples, timesteps=1, features)
        n_features = self.X_train.shape[1]
        trainX = np.reshape(self.X_train, (self.X_train.shape[0], 1, n_features))
        testX = np.reshape(self.X_test, (self.X_test.shape[0], 1, n_features))
        # targets come from this instance, not from a notebook-level variable
        trainY = self.y_train
        testY = self.y_test

        # NOTE(review): inputs are passed unscaled; the recorded run shows a loss
        # near 6.5e4 that barely decreases - scaling may be needed, TODO confirm.
        model = Sequential()
        model.add(LSTM(units, input_shape=(1, n_features)))
        model.add(Dense(1))
        model.compile(loss='mean_squared_error', optimizer='adam')
        model.fit(trainX, trainY, epochs=epochs, batch_size=batch_size, verbose=verbose)

        # make predictions; the Dense(1) output is (n, 1), flatten to (n,)
        trainPredict = np.ravel(model.predict(trainX))
        testPredict = np.ravel(model.predict(testX))

        # calculate root mean squared error
        trainScore = math.sqrt(mean_squared_error(trainY, trainPredict))
        print('Train Score: %.2f RMSE' % (trainScore))
        testScore = math.sqrt(mean_squared_error(testY, testPredict))
        print('Test Score: %.2f RMSE' % (testScore))

        # save predictions
        self.lstm_predictions = testPredict

    def naive_model(self):
        """Persistence baseline: the prediction for time ``t`` is the value at ``t-1``.

        ``self.naive_predictions`` has the same length as the series; its
        first entry is NaN because no previous value exists for it.
        """
        preds = np.full(len(self.one_d_time_series), np.nan)
        preds[1:] = self.one_d_time_series[:-1]
        self.naive_predictions = preds

    # ****************************************************************************************************************
    # visualize results
    # ****************************************************************************************************************
    def error(self,real_data,predicted_data):
        """Return the element-wise relative error ``(real - predicted) / real``."""
        return (real_data - predicted_data) / real_data

    def vis_results_time_series(self,second_plot='error'):
        """Plot test-period predictions against real values, plus a second panel.

        Models that have not been run (their predictions are still ``None``)
        are skipped.

        Parameters
        ----------
        second_plot : str
            ``'error'`` plots the relative error of the linear regression, SVM
            and MLP predictions. ``'cumprod'`` plots and stores cumulative
            products of ``value + 1``, which is only meaningful when the series
            holds percentage returns. Any other value leaves the panel empty.
        """
        test_dates = self._test_dates()
        tick_dates = [self.time_series_dates[x] for x in range(self.training_split,len(self.time_series_dates),28)]

        fig, ax = plt.subplots(2,1,figsize=(10,7),sharex=True)

        # original time series over the test period
        ax[0].plot(test_dates,self.one_d_time_series[self.training_split+self.lag_window_length:],'o-',linewidth=3,label='real values',markersize=5)

        # predicted y values
        prediction_lines = [
            (self.linear_reg_predictions,'o-','linear regression prediction'),
            (self.svm_predictions,'.--','svm prediction'),
            (self.neural_net_predictions,'.--','nn prediction'),
            (self.lstm_predictions,'.--','lstm prediction'),
        ]
        for predictions, fmt, label in prediction_lines:
            if predictions is not None:
                ax[0].plot(test_dates,predictions,fmt,label=label,markersize=5)

        ax[0].legend()
        ax[0].set_title('Real values vs model predictions')

        if second_plot == 'error':
            # the LSTM error is not drawn in this panel
            error_lines = [
                (self.linear_reg_predictions,'r-','linear reg error'),
                (self.svm_predictions,'-','svm error'),
                (self.neural_net_predictions,'-','nn error'),
            ]
            for predictions, fmt, label in error_lines:
                if predictions is not None:
                    ax[1].plot(test_dates,self.error(self.y_test,predictions),fmt,label=label)

            ax[1].set_title('Error signal for predictive models')
            ax[1].set_xlabel('Dates')
            ax[1].legend()
            ax[1].set_xticks(tick_dates)
            ax[1].tick_params(rotation=30)

        elif second_plot == 'cumprod':
            # cumulative product plots - only meaningful if the input data is percentage returns
            self.real_vals_cumprod = (self.y_test+1).cumprod()
            ax[1].plot(test_dates,self.real_vals_cumprod,'-',label='real vals cumprod')

            cumprod_lines = [
                ('linear_reg_predictions','linear reg cumprod'),
                ('svm_predictions','svm cumprod'),
                ('neural_net_predictions','nn cumprod'),
            ]
            for attr, label in cumprod_lines:
                predictions = getattr(self, attr)
                if predictions is None:
                    continue
                cumprod = (predictions + 1).cumprod()
                setattr(self, attr + '_cumprod', cumprod)   # e.g. self.svm_predictions_cumprod
                ax[1].plot(test_dates,cumprod,'-',label=label)

            ax[1].set_xticks(tick_dates)
            ax[1].tick_params(rotation=30)
            ax[1].legend()

        plt.tight_layout()

    def vis_results_scatter(self):
        """Scatter real test values against each model's predictions.

        One panel per model (linear regression, SVM, MLP) with a red identity
        line for reference. Panels of models that have not been run stay empty.
        """
        model_columns = ['linear_reg_predictions','svm_predictions','neural_net_predictions']

        # create dataframe to hold all results
        df_predictions = pd.DataFrame(index=self._test_dates(),columns=['Real_values'] + model_columns)
        df_predictions['Real_values'] = self.y_test

        fig, ax = plt.subplots(3,1,figsize=(7,10))
        for axis, column in zip(ax, model_columns):
            predictions = getattr(self, column)
            if predictions is None:
                continue
            df_predictions[column] = predictions
            sns.scatterplot(y=df_predictions['Real_values'],x=df_predictions[column],ax=axis)
            # identity line: points on it are perfect predictions
            sns.lineplot(x=self.y_test,y=self.y_test,ax=axis,color='red')

        # plot formatting
        plt.tight_layout()

In [8]:
########################################################################
# data
########################################################################
df = pd.read_csv('./test_data/AirPassengers.csv')
# The frame loaded above is `df`; these columns were previously read from an
# undefined `sp_500` variable, which raised NameError on a fresh kernel.
x = df['#Passengers']
dates = df['Month']

# percentage returns (not used by the run below; kept for experiments on returns)
x_pct = x.pct_change().fillna(0)

########################################################################
# initialize class object
########################################################################
LAG_WINDOW_LENGTH = 10   # number of past observations used as input features
TRAIN_SPLIT = 100        # number of windowed samples used for training

normal = time_series_prediction(dates,x,LAG_WINDOW_LENGTH,1) # pass dates, time series, lag window length, number of steps ahead to predict
normal.sliding_window_1(verbose=0) # time series to supervised learning ML problem
normal.train_test_split(split=TRAIN_SPLIT) # testing and training dataset split
normal.test_train_plot()    # visualize training split

########################################################################
# perform some prediction tasks
########################################################################
normal.linear_regression()
normal.support_vector_machine()
normal.neural_net_mlp()
normal.lstm()


normal.vis_results_time_series(second_plot='error')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Training multivariate linear regression:

Linear regression coefficients: 
 [ 0.26215382  0.2450409  -0.27074476 -0.02019418 -0.07375448  0.38694387
 -0.42111804  0.23977833 -0.39626002  1.0571671 ]
RMSE:  37.051892047287666
MAE:  31.448017880592797

Training support vector machine:
RMSE:  43.547451577664276
MAE:  33.15561210470844

Training neural network: 
RMSE:  88.23679464192749
MAE:  78.89174629726706

Training LSTM: 




Epoch 1/100
100/100 - 9s - loss: 65952.0781
Epoch 2/100
100/100 - 0s - loss: 65696.8438
Epoch 3/100
100/100 - 0s - loss: 65616.4375
Epoch 4/100
100/100 - 0s - loss: 65535.7656
Epoch 5/100
100/100 - 0s - loss: 65455.3008
Epoch 6/100
100/100 - 0s - loss: 65374.7852
Epoch 7/100
100/100 - 0s - loss: 65294.5195
Epoch 8/100
100/100 - 0s - loss: 65214.1797
Epoch 9/100
100/100 - 0s - loss: 65134.0117
Epoch 10/100
100/100 - 0s - loss: 65053.5781
Epoch 11/100
100/100 - 0s - loss: 64973.6094
Epoch 12/100
100/100 - 0s - loss: 64893.3555
Epoch 13/100
100/100 - 0s - loss: 64813.4492
Epoch 14/100
100/100 - 0s - loss: 64733.4609
Epoch 15/100
100/100 - 0s - loss: 64653.6602
Epoch 16/100
100/100 - 0s - loss: 64573.7734
Epoch 17/100
100/100 - 0s - loss: 64493.9258
Epoch 18/100
100/100 - 0s - loss: 64414.3281
Epoch 19/100
100/100 - 0s - loss: 64334.7305
Epoch 20/100
100/100 - 0s - loss: 64254.9492
Epoch 21/100
100/100 - 0s - loss: 64175.6250
Epoch 22/100
100/100 - 0s - loss: 64096.0469
Epoch 23/100
100/10

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [7]:
# preview the loaded AirPassengers frame (`sp_500` is not defined anywhere in this notebook)
df

Unnamed: 0,Month,#Passengers
0,1949-01,112
1,1949-02,118
2,1949-03,132
3,1949-04,129
4,1949-05,121
...,...,...
139,1960-08,606
140,1960-09,508
141,1960-10,461
142,1960-11,390


## Build LSTM

In [76]:
# reshape to (samples, timesteps=1, features), the layout passed to the LSTM layer below
n_features = normal.X_train.shape[1]
trainX = np.reshape(normal.X_train, (normal.X_train.shape[0], 1, n_features))
testX = np.reshape(normal.X_test, (normal.X_test.shape[0], 1, n_features))
trainY = normal.y_train
testY = normal.y_test

# create and fit the LSTM network
# input_shape follows the actual window width instead of a hard-coded 5,
# so it always matches the reshaped arrays whatever lag window `normal` uses
model = Sequential()
model.add(LSTM(4, input_shape=(1, n_features)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(trainX, trainY, epochs=100, batch_size=1, verbose=2)

# make predictions
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)

# calculate root mean squared error
trainScore = math.sqrt(mean_squared_error(trainY, trainPredict))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = math.sqrt(mean_squared_error(testY, testPredict))
print('Test Score: %.2f RMSE' % (testScore))

Epoch 1/100
3500/3500 - 2s - loss: 1.3003
Epoch 2/100
3500/3500 - 1s - loss: 0.4772
Epoch 3/100
3500/3500 - 1s - loss: 0.4081
Epoch 4/100
3500/3500 - 1s - loss: 0.3913
Epoch 5/100
3500/3500 - 1s - loss: 0.3805
Epoch 6/100
3500/3500 - 1s - loss: 0.3764
Epoch 7/100
3500/3500 - 1s - loss: 0.3747
Epoch 8/100
3500/3500 - 1s - loss: 0.3765
Epoch 9/100
3500/3500 - 1s - loss: 0.3737
Epoch 10/100
3500/3500 - 1s - loss: 0.3764
Epoch 11/100
3500/3500 - 1s - loss: 0.3683
Epoch 12/100
3500/3500 - 1s - loss: 0.3710
Epoch 13/100
3500/3500 - 1s - loss: 0.3645
Epoch 14/100
3500/3500 - 1s - loss: 0.3699
Epoch 15/100
3500/3500 - 1s - loss: 0.3657
Epoch 16/100
3500/3500 - 1s - loss: 0.3639
Epoch 17/100
3500/3500 - 1s - loss: 0.3678
Epoch 18/100
3500/3500 - 1s - loss: 0.3634
Epoch 19/100
3500/3500 - 1s - loss: 0.3622
Epoch 20/100
3500/3500 - 1s - loss: 0.3590
Epoch 21/100
3500/3500 - 1s - loss: 0.3586
Epoch 22/100
3500/3500 - 1s - loss: 0.3630
Epoch 23/100
3500/3500 - 1s - loss: 0.3635
Epoch 24/100
3500/35

In [75]:
# display the in-sample predictions returned by model.predict(trainX)
trainPredict

array([[2.7830205],
       [2.6712413],
       [2.2477574],
       ...,
       [3.8193555],
       [3.7082005],
       [3.7810779]], dtype=float32)

In [67]:
1

IndexError: tuple index out of range