# Multi-variate prediction

In [4]:
%matplotlib widget

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# model evalution metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_percentage_error

# data preprocessing
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler

# predictive models
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR

# cross validation and hyper-parameter search
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV

In [2]:
# this function determines whether predictions determine the correct movement for tomorrow.
def hit_rate(dates,original_values, predictions): # pass lists / arrays of dates, original values, and predictions
    # initialise dataframe
    df = pd.DataFrame(columns=['Date','Original Value','Daily PCT','Movement','Prediction','Predicted Movement'])

    # add known data as passed to function
    df['Date'] = dates#.to_list()
    df['Original Value'] = original_values#.to_list()
    df['Prediction'] = predictions#.to_list()

    # determine actually movement from time t to t+1 and predicted movement
    df['Daily PCT'] = df['Original Value'].pct_change() # percentange change between t and t+1
    df['Movement'] = df['Daily PCT'].apply(lambda x: 1 if x > 0 else 0)
    df['Predicted Movement'] = df['Prediction'].pct_change().apply(lambda x: 1 if x > 0 else 0)

    # calculate classification evaluation metrics
    y_true = df['Movement']
    y_pred = df['Predicted Movement']
    matrix = confusion_matrix(y_true,y_pred)
    accuracy = accuracy_score(y_true,y_pred)

    # display eval metrics
    print(f'Movement prediction accuracy: {round(accuracy*100,2)} %')
    print(f'Confusion matrix:\n{matrix}')
    return df

In [3]:
# some data
df = pd.read_csv('./test_data/GSPC.csv') # sp_500 = GSPC.csv, # airplaine = AirPassengers.csv
df = df.iloc[-2000:,:].reset_index(drop=True) # only look at last 2000 days
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       2000 non-null   object 
 1   Open       2000 non-null   float64
 2   High       2000 non-null   float64
 3   Low        2000 non-null   float64
 4   Close      2000 non-null   float64
 5   Adj Close  2000 non-null   float64
 6   Volume     2000 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 109.5+ KB


In [4]:
# multi variate train-test split
df.plot(subplots=True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

array([<AxesSubplot:>, <AxesSubplot:>, <AxesSubplot:>, <AxesSubplot:>,
       <AxesSubplot:>, <AxesSubplot:>], dtype=object)

# Brownlee example

In [7]:
# multivariate mlp example
from numpy import array
from numpy import hstack
from keras.models import Sequential
from keras.layers import Dense

# split a multivariate sequence into samples
def split_sequences(sequences, n_steps):
    X, y = list(), list()
    for i in range(len(sequences)):
        # find the end of this pattern
        end_ix = i + n_steps
        # check if we are beyond the dataset
        if end_ix > len(sequences):
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix-1, -1]
        X.append(seq_x)
        y.append(seq_y)
    return array(X), array(y)

# data wrangle into dataset for split_squences function: ie data transformation to supervised machine learning problem
df['target'] = df['Close'].shift(periods=-1) # predict next day closing price
df.dropna(inplace=True)

# min max scale each column - not ideal atm because we are causing information leakage by doing so prior to test train split
for column in ['Open','Close','High','Low','Close','Volume','target','Adj Close']:
    scaler = MinMaxScaler()
    df[column] =  scaler.fit_transform(df[column].to_numpy().reshape(-1, 1))

display(df)

# correct type = numpy
training_data_array = df.iloc[0:-200,1:].to_numpy() # keep OHLC and volume only
testing_data_array = df.iloc[-200:,1:].to_numpy() # keep OHLC and volume only

# choose a number of time steps
n_steps = 10

# convert into input/output
X_train, y_train = split_sequences(training_data_array, n_steps)
X_test, y_test = split_sequences(testing_data_array, n_steps)

# flatten input
n_input = X_train.shape[1] * X_train.shape[2]
X_train = X_train.reshape((X_train.shape[0], n_input))
X_test = X_test.reshape((X_test.shape[0], n_input))

# define model
model = Sequential()
model.add(Dense(500, activation='relu', input_dim=n_input))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# fit model
model.fit(X_train, y_train, epochs=2000, verbose=0, workers=10, use_multiprocessing=True)

# demonstrate prediction
predictions = []
for i in range(X_test.shape[0]):
    x_input = X_test[i].reshape(1,n_input)
    yhat = model.predict(x_input, verbose=0)
    predictions.append(yhat[0][0])
    # print(f'Pred: {yhat} \t True: {y_test[i]}')

# some evaluation metric
mse = mean_squared_error(y_test,predictions)
mae = mean_absolute_error(y_test,predictions)
mape = mean_absolute_percentage_error(y_test,predictions)

print('MAPE:',mape)
print('RMSE: ',np.sqrt(mse))
print('MAE: ',mae)

# what is the accuracy of price movements for these predictions

# data to feed to hit_rate function:
dates = df['Date'].iloc[-191:]
original_values = y_test
nn_predictions =  predictions

# hit rate calculations
print('NN:')
df_nn = hit_rate(dates,original_values,nn_predictions)


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,target
0,2010-06-28,0.027099,0.026985,0.032892,0.028098,0.028098,0.436886,0.010085
1,2010-06-29,0.023619,0.020735,0.013186,0.010085,0.010085,0.777747,0.004394
2,2010-06-30,0.007018,0.008223,0.009464,0.004394,0.004394,0.615004,0.002589
3,2010-07-01,0.001875,0.000342,0.000000,0.002589,0.002589,0.823250,0.000000
4,2010-07-02,0.000000,0.000000,0.002727,0.000000,0.000000,0.447854,0.002962
...,...,...,...,...,...,...,...,...
1993,2018-05-29,0.911871,0.911844,0.905100,0.901091,0.901091,0.412615,0.919548
1994,2018-05-30,0.910414,0.921991,0.919020,0.919548,0.919548,0.385861,0.909420
1995,2018-05-31,0.920498,0.918274,0.918069,0.909420,0.909420,0.488459,0.925282
1996,2018-06-01,0.919259,0.926116,0.927859,0.925282,0.925282,0.404587,0.931903


MAPE: 0.012622203585981514
RMSE:  0.014993646117358667
MAE:  0.011156568030627364
NN:
Movement prediction accuracy: 46.07 %
Confusion matrix:
[[23 56]
 [47 65]]


In [6]:
df_nn[['Original Value','Prediction']].plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<AxesSubplot:>

# try some smoothing and other TIs

In [28]:
# import data

df = pd.read_csv('./test_data/GSPC.csv') # sp_500 = GSPC.csv, # airplaine = AirPassengers.csv
df = df.iloc[-100:,0:2].reset_index(drop=True) # only look at last 2000 days
df.info()

# some simple moving averages
# df['MA_5'] = df['Open'].rolling(window=5).mean()
# df['MA_10'] = df['Open'].rolling(window=10).mean()
# df['MA_15'] = df['Open'].rolling(window=15).mean()
df['alpha 0.1'] = df['Open'].ewm(alpha=0.1).mean()
df['alpha 0.5'] = df['Open'].ewm(alpha=0.5).mean()
df['alpha 0.9'] = df['Open'].ewm(alpha=0.9).mean()



df.plot(subplots=False,figsize=(10,4),xlabel='Timestep',ylabel='Open price')
plt.tight_layout()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    100 non-null    object 
 1   Open    100 non-null    float64
dtypes: float64(1), object(1)
memory usage: 1.7+ KB


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [25]:
df

Unnamed: 0,Date,Open,EMA_1,EMA_5,EMA_10
0,2018-01-12,2770.179932,2770.179932,2770.179932,2770.179932
1,2018-01-16,2798.959961,2785.327316,2789.366618,2796.343595
2,2018-01-17,2784.989990,2785.202841,2786.865688,2786.115122
3,2018-01-18,2802.399902,2790.203440,2795.150602,2800.772890
4,2018-01-19,2802.600098,2793.230633,2798.995503,2802.417394
...,...,...,...,...,...
95,2018-05-31,2720.979980,2707.745799,2714.903501,2719.170855
96,2018-06-01,2718.699951,2708.841254,2716.801726,2718.747041
97,2018-06-04,2741.669922,2712.124229,2729.235824,2739.377634
98,2018-06-05,2748.459961,2715.757909,2738.847893,2747.551728


In [45]:
df = pd.read_csv('./test_data/AirPassengers.csv')

df['log'] = df['#Passengers'].apply(lambda x: np.log(x))
df['shift'] = df['#Passengers'].shift(periods=1)
df['shift_log'] = df['log'].shift(periods=1)
df['diff'] = df['#Passengers'] - df['shift']
df['diff_log'] = df['log'] - df['shift_log']

df.loc[:,['#Passengers','diff','diff_log']].plot(subplots=True,figsize=(10,12),xlabel='Timestep',grid=True)

df

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Unnamed: 0,Month,#Passengers,log,shift,shift_log,diff,diff_log
0,1949-01,112,4.718499,,,,
1,1949-02,118,4.770685,112.0,4.718499,6.0,0.052186
2,1949-03,132,4.882802,118.0,4.770685,14.0,0.112117
3,1949-04,129,4.859812,132.0,4.882802,-3.0,-0.022990
4,1949-05,121,4.795791,129.0,4.859812,-8.0,-0.064022
...,...,...,...,...,...,...,...
139,1960-08,606,6.406880,622.0,6.432940,-16.0,-0.026060
140,1960-09,508,6.230481,606.0,6.406880,-98.0,-0.176399
141,1960-10,461,6.133398,508.0,6.230481,-47.0,-0.097083
142,1960-11,390,5.966147,461.0,6.133398,-71.0,-0.167251


In [61]:
df = pd.read_csv('./test_data/chart_c.csv')

df['log'] = df['In-Game'].apply(lambda x: np.log(x))
df['shift'] = df['In-Game'].shift(periods=1)
df['shift_log'] = df['log'].shift(periods=1)
df['diff'] = df['In-Game'] - df['shift']
df['diff_log'] = df['log'] - df['shift_log']

df.loc[:,['In-Game','diff','diff_log']].plot(subplots=True,figsize=(10,12),xlabel='Timestep',grid=True)

df

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Unnamed: 0,DateTime,Users,Users Trend,In-Game,log,shift,shift_log,diff,diff_log
0,2021-09-03 08:50:00,16995985,1.948013e+07,3664115,15.114097,,,,
1,2021-09-03 09:00:00,17066265,1.949637e+07,3667309,15.114969,3664115.0,15.114097,3194.0,0.000871
2,2021-09-03 09:10:00,17136411,1.951287e+07,3684229,15.119572,3667309.0,15.114969,16920.0,0.004603
3,2021-09-03 09:20:00,17241081,1.952978e+07,3707665,15.125913,3684229.0,15.119572,23436.0,0.006341
4,2021-09-03 09:30:00,17327721,1.954682e+07,3719017,15.128970,3707665.0,15.125913,11352.0,0.003057
...,...,...,...,...,...,...,...,...,...
2011,2021-09-17 08:00:00,16465737,1.828497e+07,3578403,15.090427,3548782.0,15.082115,29621.0,0.008312
2012,2021-09-17 08:10:00,16563757,1.829027e+07,3619142,15.101748,3578403.0,15.090427,40739.0,0.011320
2013,2021-09-17 08:20:00,16669358,1.829550e+07,3660871,15.113212,3619142.0,15.101748,41729.0,0.011464
2014,2021-09-17 08:30:00,16761577,1.830059e+07,3693956,15.122209,3660871.0,15.113212,33085.0,0.008997


In [63]:
df.loc[662:663,'diff_log'] = 0
df.loc[1670:1672,'diff_log'] = 0

df.loc[:,['In-Game','diff','diff_log']].plot(subplots=True,figsize=(10,12),xlabel='Timestep',grid=True)
df.to_csv('./test_data/chart_c_stationary.csv')


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Take this green signal above:
- it has been logged and difference and looks stationary
- maybe try some noise reduction on this now as well
- then forecast - its univariate
- then invert the differencing and log
- evaluate performance

In [60]:
df = pd.read_csv('./test_data/GSPC.csv')

df['log'] = df['Open'].apply(lambda x: np.log(x))
df['shift'] = df['Open'].shift(periods=1)
df['shift_log'] = df['log'].shift(periods=1)
df['diff'] = df['Open'] - df['shift']
df['diff_log'] = df['log'] - df['shift_log']

df.loc[:,['Open','diff','diff_log']].plot(subplots=True,figsize=(10,12),xlabel='Timestep',grid=True)

df

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,log,shift,shift_log,diff,diff_log
0,1950-01-03,16.660000,16.660000,16.660000,16.660000,16.660000,1260000,2.813011,,,,
1,1950-01-04,16.850000,16.850000,16.850000,16.850000,16.850000,1890000,2.824351,16.660000,2.813011,0.190000,0.011340
2,1950-01-05,16.930000,16.930000,16.930000,16.930000,16.930000,2550000,2.829087,16.850000,2.824351,0.080000,0.004737
3,1950-01-06,16.980000,16.980000,16.980000,16.980000,16.980000,2010000,2.832036,16.930000,2.829087,0.050000,0.002949
4,1950-01-09,17.080000,17.080000,17.080000,17.080000,17.080000,2520000,2.837908,16.980000,2.832036,0.100000,0.005872
...,...,...,...,...,...,...,...,...,...,...,...,...
17213,2018-05-31,2720.979980,2722.500000,2700.679932,2705.270020,2705.270020,4235370000,7.908747,2702.429932,7.901907,18.550048,0.006841
17214,2018-06-01,2718.699951,2736.929932,2718.699951,2734.620117,2734.620117,3684130000,7.907909,2720.979980,7.908747,-2.280029,-0.000838
17215,2018-06-04,2741.669922,2749.159912,2740.540039,2746.870117,2746.870117,3376510000,7.916322,2718.699951,7.907909,22.969971,0.008413
17216,2018-06-05,2748.459961,2752.610107,2739.510010,2748.800049,2748.800049,3517790000,7.918796,2741.669922,7.916322,6.790039,0.002474
