# Multi-variate prediction

In [7]:
%matplotlib widget

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# model evalution metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_percentage_error

# data preprocessing
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler

# predictive models
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR

# cross validation and hyper-parameter search
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV

In [70]:
# this function determines whether predictions determine the correct movement for tomorrow.
def hit_rate(dates,original_values, predictions): # pass lists / arrays of dates, original values, and predictions
    # initialise dataframe
    df = pd.DataFrame(columns=['Date','Original Value','Daily PCT','Movement','Prediction','Predicted Movement'])

    # add known data as passed to function
    df['Date'] = dates#.to_list()
    df['Original Value'] = original_values#.to_list()
    df['Prediction'] = predictions#.to_list()

    # determine actually movement from time t to t+1 and predicted movement
    df['Daily PCT'] = df['Original Value'].pct_change() # percentange change between t and t+1
    df['Movement'] = df['Daily PCT'].apply(lambda x: 1 if x > 0 else 0)
    df['Predicted Movement'] = df['Prediction'].pct_change().apply(lambda x: 1 if x > 0 else 0)

    # calculate classification evaluation metrics
    y_true = df['Movement']
    y_pred = df['Predicted Movement']
    matrix = confusion_matrix(y_true,y_pred)
    accuracy = accuracy_score(y_true,y_pred)

    # display eval metrics
    print(f'Movement prediction accuracy: {round(accuracy*100,2)} %')
    print(f'Confusion matrix:\n{matrix}')
    return df

In [4]:
# some data
df = pd.read_csv('./test_data/GSPC.csv') # sp_500 = GSPC.csv, # airplaine = AirPassengers.csv
df = df.iloc[-2000:,:].reset_index(drop=True) # only look at last 2000 days
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       2000 non-null   object 
 1   Open       2000 non-null   float64
 2   High       2000 non-null   float64
 3   Low        2000 non-null   float64
 4   Close      2000 non-null   float64
 5   Adj Close  2000 non-null   float64
 6   Volume     2000 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 109.5+ KB


In [8]:
# multi variate train-test split
df.plot(subplots=True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

array([<AxesSubplot:>, <AxesSubplot:>, <AxesSubplot:>, <AxesSubplot:>,
       <AxesSubplot:>, <AxesSubplot:>], dtype=object)

# Brownlee example

In [69]:
# multivariate mlp example
from numpy import array
from numpy import hstack
from keras.models import Sequential
from keras.layers import Dense

# split a multivariate sequence into samples
def split_sequences(sequences, n_steps):
    X, y = list(), list()
    for i in range(len(sequences)):
        # find the end of this pattern
        end_ix = i + n_steps
        # check if we are beyond the dataset
        if end_ix > len(sequences):
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix-1, -1]
        X.append(seq_x)
        y.append(seq_y)
    return array(X), array(y)

# data wrangle into dataset for split_squences function: ie data transformation to supervised machine learning problem
df['target'] = df['Close'].shift(periods=-1) # predict next day closing price
df.dropna(inplace=True)

# min max scale each column - not ideal atm because we are causing information leakage by doing so prior to test train split
for column in ['Open','Close','High','Low','Close','Volume','target','Adj Close']:
    scaler = MinMaxScaler()
    df[column] =  scaler.fit_transform(df[column].to_numpy().reshape(-1, 1))

display(df)

# correct type = numpy
training_data_array = df.iloc[0:-200,1:].to_numpy() # keep OHLC and volume only
testing_data_array = df.iloc[-200:,1:].to_numpy() # keep OHLC and volume only

# choose a number of time steps
n_steps = 10

# convert into input/output
X_train, y_train = split_sequences(training_data_array, n_steps)
X_test, y_test = split_sequences(testing_data_array, n_steps)

# flatten input
n_input = X_train.shape[1] * X_train.shape[2]
X_train = X_train.reshape((X_train.shape[0], n_input))
X_test = X_test.reshape((X_test.shape[0], n_input))

# define model
model = Sequential()
model.add(Dense(500, activation='relu', input_dim=n_input))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# fit model
model.fit(X_train, y_train, epochs=2000, verbose=0, workers=10, use_multiprocessing=True)

# demonstrate prediction
predictions = []
for i in range(X_test.shape[0]):
    x_input = X_test[i].reshape(1,n_input)
    yhat = model.predict(x_input, verbose=0)
    predictions.append(yhat[0][0])
    print(f'Pred: {yhat} \t True: {y_test[i]}')

# some evaluation metric
mse = mean_squared_error(y_test,predictions)
mae = mean_absolute_error(y_test,predictions)
mape = mean_absolute_percentage_error(y_test,predictions)

print('MAPE:',mape)
print('RMSE: ',np.sqrt(mse))
print('MAE: ',mae)

# what is the accuracy of price movements for these predictions

# data to feed to hit_rate function:
dates = df['Date'].iloc[-191:]
original_values = y_test
nn_predictions =  predictions

# hit rate calculations
print('NN:')
df_nn = hit_rate(dates,original_values,nn_predictions)


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,target
0,2010-06-28,0.027099,0.026985,0.032892,0.028098,0.028098,0.436886,0.010085
1,2010-06-29,0.023619,0.020735,0.013186,0.010085,0.010085,0.777747,0.004394
2,2010-06-30,0.007018,0.008223,0.009464,0.004394,0.004394,0.615004,0.002589
3,2010-07-01,0.001875,0.000342,0.000000,0.002589,0.002589,0.823250,0.000000
4,2010-07-02,0.000000,0.000000,0.002727,0.000000,0.000000,0.447854,0.002962
...,...,...,...,...,...,...,...,...
1974,2018-05-01,0.878086,0.881734,0.877174,0.882143,0.882143,0.385678,0.871804
1975,2018-05-02,0.884218,0.884778,0.880591,0.871804,0.871804,0.454286,0.868594
1976,2018-05-03,0.869998,0.871880,0.860445,0.868594,0.868594,0.430048,0.886801
1977,2018-05-04,0.866393,0.890245,0.871692,0.886801,0.886801,0.350283,0.891779


Pred: [[0.78343385]] 	 True: 0.7859578673636097
Pred: [[0.7829141]] 	 True: 0.7881628372761657
Pred: [[0.7860372]] 	 True: 0.7849255124912574
Pred: [[0.78523743]] 	 True: 0.7844391552438182
Pred: [[0.7825949]] 	 True: 0.7650854014729908
Pred: [[0.76788414]] 	 True: 0.7667662768124847
Pred: [[0.766367]] 	 True: 0.7800182636225531
Pred: [[0.77681464]] 	 True: 0.7793535132680006
Pred: [[0.77910733]] 	 True: 0.7812451085373046
Pred: [[0.7806326]] 	 True: 0.7606536904672409
Pred: [[0.7649021]] 	 True: 0.7582432787161321
Pred: [[0.75979644]] 	 True: 0.7597674007983938
Pred: [[0.7572371]] 	 True: 0.7728139457699094
Pred: [[0.7695445]] 	 True: 0.7682363008914116
Pred: [[0.7682246]] 	 True: 0.7654961532788832
Pred: [[0.76554716]] 	 True: 0.767701255062652
Pred: [[0.7675527]] 	 True: 0.7683443655673238
Pred: [[0.76750845]] 	 True: 0.7694577363841487
Pred: [[0.7676885]] 	 True: 0.775559503344908
Pred: [[0.7718289]] 	 True: 0.7831582112448207
Pred: [[0.78189296]] 	 True: 0.7858065240688473
Pred: [

AttributeError: 'numpy.ndarray' object has no attribute 'to_list'

In [76]:
df_nn[['Original Value','Prediction']].plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<AxesSubplot:>