# Code up class to perform different tasks

In [1]:
# interactive figures
%matplotlib widget 

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# model evalution metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# data preprocessing
from sklearn.preprocessing import normalize

# predictive models
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.neural_network import MLPRegressor


# 1.0 Class for univariate one-step ahead forecasting 

In [2]:
class time_series_prediction():

    def __init__(self,dates,one_d_time_series,lag_window_length,n_ahead_prediction):

        # raw input data + settings for time series -> supervised learning ML problem
        self.one_d_time_series = np.array(one_d_time_series)      # time series array, to array ensure index works as expected for class methods
        self.time_series_dates = np.array(dates)                  # time stamp / date for each data point
        self.lag_window_length = lag_window_length                # length of lag window
        self.n_ahead_prediction = n_ahead_prediction              # time ahead to predict

        # transfromed data: set after calling .sliding_window_1()
        self.input_data = None
        self.target_data = None

        # testing and training data: set after calling .train_test_split()
        self.training_split = None
        self.X_test = None
        self.X_train = None
        self.y_test = None
        self.y_train = None

        # predictions from various models - set after calling each models training
        self.linear_reg_predictions = None
        self.svm_predictions = None
        self.neural_net_predictions = None
        self.naive_predictions = None

        # cumprod results from predictions - set after calling .vis_results_time_series()
        self.real_vals_cumprod = None
        self.linear_reg_predictions_cumprod = None
        self.svm_predictions_cumprod = None
        self.neural_net_predictions_cumprod = None
    

# ****************************************************************************************************************
    # data wrangling
# ****************************************************************************************************************

    # method to transfroms 1-D time series to supervised ML problem: one step ahead forecasting   
    def sliding_window_1(self,verbose):
        # initialize input array
        num_rows = len(self.one_d_time_series) - self.lag_window_length
        array = np.zeros((num_rows, self.lag_window_length + 1))
        
        # loop through data and populate array
        for i in range(num_rows):
            # input features
            array[i,0:self.lag_window_length+1] = self.one_d_time_series[i:i+self.lag_window_length+1]
            # target feature/s
            array[i,-1] = self.one_d_time_series[i+self.lag_window_length]
            
            if verbose == 1:
                # show pattern
                print(array[i,0:self.lag_window_length],' : ',array[i,self.lag_window_length])

        # save results as a class attribute
        self.input_data = array[:,0:self.lag_window_length]
        self.target_data = array[:,self.lag_window_length]

    # method to perform a training and testing split for dataset with only a single column of target variables
    def train_test_split(self,split):
        self.training_split = split
        self.X_train = self.input_data[0:split,:]
        self.X_test = self.input_data[split:,:]
        self.y_train = self.target_data[0:split]
        self.y_test = self.target_data[split:]

    # method to plot testing and training split of data
    def test_train_plot(self):
        fig, ax = plt.subplots(figsize=(10,5))
        ax.plot(self.time_series_dates[0:self.training_split] ,self.one_d_time_series[0:self.training_split],'k-',label='Training data') # replace returns with sp_500 for other data plotting
        ax.plot(self.time_series_dates[self.training_split:] ,self.one_d_time_series[self.training_split:],'r-',label='Testing data')
        ax.plot(self.time_series_dates[self.training_split+self.lag_window_length:] ,self.y_test,'o',label='Windowed testing data') # important to match time by start 5 (length of time window) after where segmented our testing and training data
        plt.legend(loc=0) 
        ax.set_xticks([self.time_series_dates[x] for x in range(0,len(self.time_series_dates),150)])
        ax.tick_params(rotation=30) 
        plt.tight_layout()

# ****************************************************************************************************************
    # predictive models
# ****************************************************************************************************************

    def linear_regression(self):
        print('Training multivariate linear regression:')
        # train model
        reg_model = LinearRegression().fit(self.X_train,self.y_train)
        print('\nLinear regression coefficients: \n',reg_model.coef_)

        # test model
        predictions = reg_model.predict(self.X_test)

        # evaluate: use sklearn metric methods to calc rmse and mae
        mse = mean_squared_error(self.y_test,predictions)
        mae = mean_absolute_error(self.y_test,predictions)

        print('RMSE: ',np.sqrt(mse))
        print('MAE: ',mae)

        # save predictions
        self.linear_reg_predictions = predictions

    def support_vector_machine(self):
        print('\nTraining support vector machine:')
        # train model
        svm_regres = LinearSVR(max_iter=1000,C=0.5).fit(self.X_train,self.y_train)

        # predict
        svm_predictions = svm_regres.predict(self.X_test)

        # evaluate
        mse = mean_squared_error(self.y_test,svm_predictions[:])
        mae = mean_absolute_error(self.y_test,svm_predictions[:])

        print('RMSE: ',np.sqrt(mse))
        print('MAE: ',mae)

        # save predictions
        self.svm_predictions = svm_predictions

    def neural_net_mlp(self,verbose=0):
        print('\nTraining neural network: ')
        # train neural network
        nn_regres = MLPRegressor(hidden_layer_sizes=(20),shuffle=False,random_state=1, 
                                max_iter=1000,verbose=verbose).fit(self.X_train,self.y_train)

        # make predictions
        nn_predictions = nn_regres.predict(self.X_test)

        # evaluate
        mse = mean_squared_error(self.y_test,nn_predictions[:])
        mae = mean_absolute_error(self.y_test,nn_predictions[:])

        print('RMSE: ',np.sqrt(mse))
        print('MAE: ',mae)

        # save predictions
        self.neural_net_predictions = nn_predictions

    def naive_model(self): # t's prediction is t-1's value, note that this means you miss the first time point
        preds = np.zeros(len(self.one_d_time_series)-1)
        preds[0] = np.nan()
        preds[1:] = self.one_d_time_series[0:-2]
        self.naive_predictions = preds

# ****************************************************************************************************************
    # visualize results
# ****************************************************************************************************************
    def error(self,real_data,predicted_data):
        error = np.zeros(len(real_data))
        error = (real_data - predicted_data) / real_data
        return error

    # visualize orignal time series signal aswell as predictions    
    def vis_results_time_series(self,second_plot='error'):
        # plot prediction against actual + training data
        fig, ax = plt.subplots(2,1,figsize=(10,7),sharex=True)

        # original time series
        ax[0].plot(self.time_series_dates[self.training_split+self.lag_window_length:],self.one_d_time_series[self.training_split+self.lag_window_length:],'o-',linewidth=3,label='real values',markersize=5) 

        # predicted y values
        ax[0].plot(self.time_series_dates[self.training_split+self.lag_window_length:],self.linear_reg_predictions,'o-',label='linear regression prediction',markersize=5)
        # ax[0].plot(self.time_series_dates,self.naive_predictions,'.--',label='naive prediction',markersize=5)
        ax[0].plot(self.time_series_dates[self.training_split+self.lag_window_length:],self.svm_predictions,'.--',label='svm prediction',markersize=5)
        ax[0].plot(self.time_series_dates[self.training_split+self.lag_window_length:],self.neural_net_predictions,'.--',label='nn prediction',markersize=5)

        ax[0].legend()
        ax[0].set_title('Real values vs model predictions')

        # plot error plot
        if second_plot == 'error':
            error_linreg = self.error(self.y_test,self.linear_reg_predictions)
            # error_naive = error(np.array(test_data[:,-1]),naive_predictions)
            error_svm = self.error(self.y_test,self.svm_predictions)
            error_nn = self.error(self.y_test,self.neural_net_predictions)

            ax[1].plot(self.time_series_dates[self.training_split+self.lag_window_length:],error_linreg,'r-',label='linear reg error')
            # ax[1].plot(self.time_series_dates,error_naive[1:],'-',label='naive error')
            ax[1].plot(self.time_series_dates[self.training_split+self.lag_window_length:],error_svm,'-',label='svm error')
            ax[1].plot(self.time_series_dates[self.training_split+self.lag_window_length:],error_nn,'-',label='nn error')
            ax[1].set_title('Error signal for predictive models')
            ax[1].set_xlabel('Dates')
            ax[1].legend()
            # ax[1].set_ylim([-10,10])
            ax[1].set_xticks([self.time_series_dates[x] for x in range(self.training_split,len(self.time_series_dates),28)])
            ax[1].tick_params(rotation=30)
        
        elif second_plot == 'cumprod':

            # plot cummulative prod plots - this should only be done if input data is percentage retunrs
            self.real_vals_cumprod = (self.y_test+1).cumprod()
            self.linear_reg_predictions_cumprod = (self.linear_reg_predictions + 1).cumprod()
            self.svm_predictions_cumprod = (self.svm_predictions + 1).cumprod()
            self.neural_net_predictions_cumprod = (self.neural_net_predictions + 1).cumprod()

            ax[1].plot(self.time_series_dates[self.training_split+self.lag_window_length:],self.real_vals_cumprod,'-',label='real vals cumprod')
            ax[1].plot(self.time_series_dates[self.training_split+self.lag_window_length:],self.linear_reg_predictions_cumprod,'-',label='linear reg cumprod')
            ax[1].plot(self.time_series_dates[self.training_split+self.lag_window_length:],self.svm_predictions_cumprod,'-',label='svm cumprod')
            ax[1].plot(self.time_series_dates[self.training_split+self.lag_window_length:],self.neural_net_predictions_cumprod,'-',label='nn cumprod')

            ax[1].set_xticks([self.time_series_dates[x] for x in range(self.training_split,len(self.time_series_dates),28)])
            ax[1].tick_params(rotation=30)
            ax[1].legend()

        # titles and save figures
        # title_string = 'S&P500 predictions _ y is '+str(column)+'_ window len is '+ str(window_length)
        # fig.suptitle(title_string)
        
        # fig_name = '../results/univariate_single_step_ahead/'+title_string+'.png'
        # plt.savefig(fig_name,facecolor='w')
        plt.tight_layout()

    # visualize predictions against real values using scatter plot
    def vis_results_scatter(self):

        # create dataframe to hold all results
        df_predictions = pd.DataFrame(index=self.time_series_dates[self.training_split+self.lag_window_length:],columns=['Real_values','linear_reg_predictions','svm_predictions','neural_net_predictions'])
        df_predictions['Real_values'] = self.y_test
        df_predictions['linear_reg_predictions'] = self.linear_reg_predictions
        df_predictions['svm_predictions'] = self.svm_predictions
        df_predictions['neural_net_predictions'] = self.neural_net_predictions

        # scatter plot with hues
        fig, ax = plt.subplots(3,1,figsize=(7,10))
        sns.scatterplot(y=df_predictions['Real_values'],x=df_predictions['linear_reg_predictions'],ax=ax[0])
        sns.lineplot(x=self.y_test,y=self.y_test,ax=ax[0],color='red')

        sns.scatterplot(y=df_predictions['Real_values'],x=df_predictions['svm_predictions'],ax=ax[1])
        sns.lineplot(x=self.y_test,y=self.y_test,ax=ax[1],color='red')

        sns.scatterplot(y=df_predictions['Real_values'],x=df_predictions['neural_net_predictions'],ax=ax[2])
        sns.lineplot(x=self.y_test,y=self.y_test,ax=ax[2],color='red')

        # plot formatting
        plt.tight_layout()

# 2.0 Import some test data

In [3]:
# import some data
sp_500 = pd.read_csv('../test_data/GSPC.csv')
sp_500

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1950-01-03,16.660000,16.660000,16.660000,16.660000,16.660000,1260000
1,1950-01-04,16.850000,16.850000,16.850000,16.850000,16.850000,1890000
2,1950-01-05,16.930000,16.930000,16.930000,16.930000,16.930000,2550000
3,1950-01-06,16.980000,16.980000,16.980000,16.980000,16.980000,2010000
4,1950-01-09,17.080000,17.080000,17.080000,17.080000,17.080000,2520000
...,...,...,...,...,...,...,...
17213,2018-05-31,2720.979980,2722.500000,2700.679932,2705.270020,2705.270020,4235370000
17214,2018-06-01,2718.699951,2736.929932,2718.699951,2734.620117,2734.620117,3684130000
17215,2018-06-04,2741.669922,2749.159912,2740.540039,2746.870117,2746.870117,3376510000
17216,2018-06-05,2748.459961,2752.610107,2739.510010,2748.800049,2748.800049,3517790000


# 3.0 Example of using class

In [4]:
# initialize class object
normal = time_series_prediction(sp_500['Date'][-2000:],sp_500['Volume'][-2000:]/1e9,5,1) # pass time series, lag window length, a number of steps ahead to predict
normal.sliding_window_1(verbose=0) # time series to supervised learning ML problem
normal.train_test_split(split=1200) # testing and training dataset split
normal.test_train_plot()    # visualize training split

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [5]:
# perform some prediction tasks
normal.linear_regression()
normal.support_vector_machine()
normal.neural_net_mlp()
# normal.naive_model()

Training multivariate linear regression:

Linear regression coefficients: 
 [0.10077061 0.04071057 0.03651907 0.14864236 0.43403925]
RMSE:  0.5491520347058507
MAE:  0.366115072166878

Training support vector machine:
RMSE:  0.5535767665711491
MAE:  0.363937042880904

Training neural network: 
RMSE:  0.5542544123599602
MAE:  0.3655117729032284


In [6]:
normal.vis_results_time_series(second_plot='error')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

- even with the volume data which seems more stationary than open price data, the forecasts are still dominated by t-1

In [7]:
# plot predicted vs real value scatter plots
# normal.vis_results_scatter()

# Play around with Zander's standardization stuff

In [8]:
# some misc data
x = sp_500['Open'][-2000:]
dates = sp_500['Date'][-2000:]
# percentage returns
x_pct = x.pct_change().fillna(0)
x_pct

# create new df hold both
df = pd.DataFrame(columns=['Dates','Open','pct_change','pct_change_cumprod','log_transform'])
df['Dates'] = dates
df['Open'] =  x
df['pct_change'] = x_pct
df['pct_change_cumprod'] = (x_pct + 1).cumprod()
df['log_transform'] = np.log(df['Open'] )

df.reset_index(inplace=True,drop=True)

# plot
df.plot(subplots=True,sharex=True,figsize=(7,7))
plt.tight_layout()

# view data
df

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Unnamed: 0,Dates,Open,pct_change,pct_change_cumprod,log_transform
0,2010-06-28,1077.500000,0.000000,1.000000,6.982399
1,2010-06-29,1071.099976,-0.005940,0.994060,6.976441
2,2010-06-30,1040.560059,-0.028513,0.965717,6.947514
3,2010-07-01,1031.099976,-0.009091,0.956937,6.938381
4,2010-07-02,1027.650024,-0.003346,0.953736,6.935030
...,...,...,...,...,...
1995,2018-05-31,2720.979980,0.006864,2.525271,7.908747
1996,2018-06-01,2718.699951,-0.000838,2.523155,7.907909
1997,2018-06-04,2741.669922,0.008449,2.544473,7.916322
1998,2018-06-05,2748.459961,0.002477,2.550775,7.918796


- unsure what the log transform is required for

In [9]:
normal = time_series_prediction(df['Dates'],df['pct_change'],10,1) # pass time series, lag window length, a number of steps ahead to predict
normal.sliding_window_1(verbose=0) # time series to supervised learning ML problem
normal.train_test_split(split=1500) # testing and training dataset split
normal.test_train_plot() 

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [10]:
# perform some prediction tasks
normal.linear_regression()
normal.support_vector_machine()
normal.neural_net_mlp()

Training multivariate linear regression:

Linear regression coefficients: 
 [ 0.0399997  -0.03879977 -0.00294204 -0.00418228 -0.00415419 -0.0923927
 -0.0174214  -0.07352073  0.02916561 -0.01048691]
RMSE:  0.006783875497107386
MAE:  0.00434815357749492

Training support vector machine:
RMSE:  0.006664687686071732
MAE:  0.004260153648427088

Training neural network: 
RMSE:  0.007033618242642029
MAE:  0.004595420014010832


In [11]:
normal.vis_results_time_series(second_plot='cumprod')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [12]:
# plot predicted vs real value scatter plots
# normal.vis_results_scatter()

# Try denoising use fft 

In [13]:
# import scipy fft functions
from scipy.fft import fft, ifft, fftfreq

In [30]:
# apply discrete fourier transform
signal = np.array(df['pct_change'])#np.array(sp_500['Open'][-2000:])
fft_coefficients = fft(signal)
fft_coefficients

array([1.01462727-0.j        , 0.07742948-0.12909129j,
       0.08978698+0.29639753j, ..., 0.02326016+0.07084956j,
       0.08978698-0.29639753j, 0.07742948+0.12909129j])

In [34]:
# plot amplitude vs frequency 
n = len(signal)

# get frequencies and psd
freqs = fftfreq(signal.shape[0])
psd = np.abs(fft_coefficients)/n # psd is amplitude/N

# plot psd
fig,ax = plt.subplots(figsize=(10,5))
ax.plot(freqs[0:int(n/2)],psd[0:int(n/2)])
ax.set_ylabel('Power spectrum')
ax.set_xlabel('Frequencies')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0.5, 0, 'Frequencies')

In [33]:
# plot inverse fourier transform as sanity check
inverse_fft = ifft(fft_coefficients)
fig,ax = plt.subplots(figsize=(10,5))
ax.plot(range(0,len(inverse_fft)),inverse_fft,'-',label='Inverse fourier')
ax.plot(range(0,len(inverse_fft)),signal,'.',label='Real data')
ax.legend()
ax.tick_params(rotation=30)
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

  return array(a, dtype, copy=False, order=order)


In [37]:
# try denoise data
psd_indices = psd > 0.0004 # mask
fft_filtered = fft_coefficients*psd_indices

inverse_transform_filtered = ifft(fft_filtered)

# plot this
fig,ax = plt.subplots(figsize=(12,5))
ax.plot(range(0,len(inverse_fft)),signal,'-',label='Real data')
ax.plot(range(0,len(inverse_fft)),inverse_transform_filtered,'-',label='Inverse fourier filtered')
ax.legend()
ax.tick_params(rotation=30)
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

  return array(a, dtype, copy=False, order=order)


## now train on filtered data

In [40]:
fft_denoised = time_series_prediction(df['Dates'],inverse_transform_filtered,10,1) # pass time series, lag window length, a number of steps ahead to predict
fft_denoised.sliding_window_1(verbose=0) # time series to supervised learning ML problem
fft_denoised.train_test_split(split=1500) # testing and training dataset split
fft_denoised.test_train_plot() 

  array[i,0:self.lag_window_length+1] = self.one_d_time_series[i:i+self.lag_window_length+1]
  array[i,-1] = self.one_d_time_series[i+self.lag_window_length]


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)


In [41]:
# perform some prediction tasks
fft_denoised.linear_regression()
fft_denoised.support_vector_machine()
fft_denoised.neural_net_mlp()

Training multivariate linear regression:

Linear regression coefficients: 
 [-0.06694886  0.00248511 -0.54125478  0.36460841 -0.51534226  0.19963451
 -0.72572638  0.26936598 -0.63479409  0.28630021]
RMSE:  0.0020272530362578436
MAE:  0.0015866217625426359

Training support vector machine:
RMSE:  0.0023151390174558136
MAE:  0.001827396063214591

Training neural network: 
RMSE:  0.002971738483058432
MAE:  0.0023715682383264676


In [43]:
fft_denoised.vis_results_time_series(second_plot='cumprod')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

  return array(a, dtype, copy=False, order=order)


In [120]:
# compare results for denoised and normal data
fig,ax = plt.subplots(figsize=(10,5))

ax.plot(normal.time_series_dates[normal.training_split+normal.lag_window_length:],normal.real_vals_cumprod,'-',label='real vals cumprod')
ax.plot(fft_denoised.time_series_dates[fft_denoised.training_split+fft_denoised.lag_window_length:],fft_denoised.linear_reg_predictions_cumprod,'-',label='linear reg cumprod - denoised')
ax.plot(fft_denoised.time_series_dates[fft_denoised.training_split+fft_denoised.lag_window_length:],fft_denoised.svm_predictions_cumprod,'-',label='svm cumprod - denoised')
ax.plot(fft_denoised.time_series_dates[fft_denoised.training_split+fft_denoised.lag_window_length:],fft_denoised.neural_net_predictions_cumprod,'-',label='nn cumprod - denoised')

ax.plot(normal.time_series_dates[normal.training_split+normal.lag_window_length:],normal.linear_reg_predictions_cumprod,'--',label='linear reg cumprod')
ax.plot(normal.time_series_dates[normal.training_split+normal.lag_window_length:],normal.svm_predictions_cumprod,'--',label='svm cumprod')
ax.plot(normal.time_series_dates[normal.training_split+normal.lag_window_length:],normal.neural_net_predictions_cumprod,'--',label='nn cumprod')

ax.set_xticks([normal.time_series_dates[x] for x in range(normal.training_split,len(normal.time_series_dates),28)])
ax.tick_params(rotation=30)
ax.legend()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.legend.Legend at 0x1eb6f98bbe0>

# Wavelet denoising

In [119]:
import pywt

# single level wavelet denoising
data = sp_500['Volume'][-2000:]/1e9
plt.figure(figsize=(15,5))
data.plot()

x = np.array(data)                
(ca, cd) = pywt.dwt(x, "sym20")                
cat = pywt.threshold(ca, 0.5, mode="hard")                
cdt = pywt.threshold(cd, 0.5, mode="soft")                
tx = pywt.idwt(cat, cdt, "sym20")

plt.plot(sp_500['Volume'][-2000:].index,tx,'-.')
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [407]:
pywt.wavelist(kind='discrete')

['bior1.1',
 'bior1.3',
 'bior1.5',
 'bior2.2',
 'bior2.4',
 'bior2.6',
 'bior2.8',
 'bior3.1',
 'bior3.3',
 'bior3.5',
 'bior3.7',
 'bior3.9',
 'bior4.4',
 'bior5.5',
 'bior6.8',
 'coif1',
 'coif2',
 'coif3',
 'coif4',
 'coif5',
 'coif6',
 'coif7',
 'coif8',
 'coif9',
 'coif10',
 'coif11',
 'coif12',
 'coif13',
 'coif14',
 'coif15',
 'coif16',
 'coif17',
 'db1',
 'db2',
 'db3',
 'db4',
 'db5',
 'db6',
 'db7',
 'db8',
 'db9',
 'db10',
 'db11',
 'db12',
 'db13',
 'db14',
 'db15',
 'db16',
 'db17',
 'db18',
 'db19',
 'db20',
 'db21',
 'db22',
 'db23',
 'db24',
 'db25',
 'db26',
 'db27',
 'db28',
 'db29',
 'db30',
 'db31',
 'db32',
 'db33',
 'db34',
 'db35',
 'db36',
 'db37',
 'db38',
 'dmey',
 'haar',
 'rbio1.1',
 'rbio1.3',
 'rbio1.5',
 'rbio2.2',
 'rbio2.4',
 'rbio2.6',
 'rbio2.8',
 'rbio3.1',
 'rbio3.3',
 'rbio3.5',
 'rbio3.7',
 'rbio3.9',
 'rbio4.4',
 'rbio5.5',
 'rbio6.8',
 'sym2',
 'sym3',
 'sym4',
 'sym5',
 'sym6',
 'sym7',
 'sym8',
 'sym9',
 'sym10',
 'sym11',
 'sym12',
 'sym13',

In [107]:
import matplotlib.pyplot as plt
import pywt
import sys

# Data format:
# Raw data should be in a .txt file with two columns, separated by tabs:
#  - The first column should be a time-series index
#  - The second column should contain the data to be filtered

# Get data:
data = sp_500['Open'][-2000:]#/1e9
index = sp_500['Open'][-2000:].index

# Create wavelet object and define parameters
w = pywt.Wavelet('sym8') # sym family look good too
maxlev = pywt.dwt_max_level(len(data), w.dec_len)
# maxlev = 1 # Override if desired
print("maximum level is " + str(maxlev))
threshold = 0.5 # Threshold for filtering

# Decompose into wavelet components, to the level selected:
coeffs = pywt.wavedec(data, w, level=maxlev)

plt.figure(figsize=(8,15))
for i in range(1, len(coeffs)):
    plt.subplot(maxlev, 1, i)
    plt.plot(coeffs[i],label='Original coefficients')
    coeffs[i] = pywt.threshold(coeffs[i], threshold*max(coeffs[i]),mode='hard')
    plt.plot(coeffs[i],label='Thresholded coefficients')
    plt.ylabel('Scale: '+str(maxlev-i+1))
    plt.legend()
    plt.tight_layout()

datarec = pywt.waverec(coeffs, w)

plt.figure(figsize=(15,5))
plt.plot(index, data,label='Raw signal')
plt.plot(index, datarec,label="De-noised signal using wavelet techniques")
plt.legend()
plt.tight_layout()
plt.show()

maximum level is 7


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

so first perform a wavelet transform of the open data then computes returns and do forecasting

In [108]:

# create new df for normal data
df_normal = pd.DataFrame(columns=['Dates','Open','pct_change','pct_change_cumprod'])
df_normal['Dates'] = sp_500['Date'][-2000:]
df_normal['Open'] =   sp_500['Open'][-2000:]
df_normal['pct_change'] = df_normal['Open'].pct_change().fillna(0)
df_normal['pct_change_cumprod'] = (df_normal['pct_change']  + 1).cumprod()

df_normal.reset_index(inplace=True,drop=True)

# create new df for wavelet denoised data
df_denoised= pd.DataFrame(columns=['Dates','Open','pct_change','pct_change_cumprod'])
df_denoised['Dates'] = sp_500['Date'][-2000:]
df_denoised['Open'] =   datarec
df_denoised['pct_change'] = df_denoised['Open'].pct_change().fillna(0)
df_denoised['pct_change_cumprod'] = (df_denoised['pct_change']  + 1).cumprod()

df_denoised.reset_index(inplace=True,drop=True)

In [109]:
########################################################################
# forecasting on normal data
########################################################################

normal = time_series_prediction(df_normal['Dates'],df_normal['pct_change'],5,1) # pass time series, lag window length, a number of steps ahead to predict
normal.sliding_window_1(verbose=0) # time series to supervised learning ML problem
normal.train_test_split(split=1500) # testing and training dataset split
normal.test_train_plot()    # visualize training split

# perform some prediction tasks
normal.linear_regression()
normal.support_vector_machine()
normal.neural_net_mlp()

#visualize results
normal.vis_results_time_series(second_plot='cumprod')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Training multivariate linear regression:

Linear regression coefficients: 
 [-0.10334584 -0.01491812 -0.07344753  0.03088411 -0.01014383]
RMSE:  0.006749482367231604
MAE:  0.004326723581147357

Training support vector machine:
RMSE:  0.006648038074642522
MAE:  0.0042460717215129316

Training neural network: 
RMSE:  0.0066358447310458785
MAE:  0.004320974885499472


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [110]:
########################################################################
# forecasting on denoised data
########################################################################

denoised = time_series_prediction(df_denoised['Dates'],df_denoised['pct_change'],5,1) # pass time series, lag window length, a number of steps ahead to predict
denoised.sliding_window_1(verbose=0) # time series to supervised learning ML problem
denoised.train_test_split(split=1500) # testing and training dataset split
denoised.test_train_plot() 

# perform some prediction tasks
denoised.linear_regression()
denoised.support_vector_machine()
denoised.neural_net_mlp()

#visualize results
denoised.vis_results_time_series(second_plot='cumprod')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Training multivariate linear regression:

Linear regression coefficients: 
 [ 0.03753123 -0.15570638 -0.07851535  0.14137091  0.25596489]
RMSE:  0.0048424349108406465
MAE:  0.0017908449987325303

Training support vector machine:
RMSE:  0.004947773368530203
MAE:  0.0015063315082727289

Training neural network: 
RMSE:  0.004466321693126055
MAE:  0.002059845947976687


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [112]:
# compare results for denoised and normal data
fig,ax = plt.subplots(figsize=(10,5))

ax.plot(normal.time_series_dates[normal.training_split+normal.lag_window_length:],normal.real_vals_cumprod,'-',label='real vals cumprod',linewidth=3)
ax.plot(denoised.time_series_dates[denoised.training_split+denoised.lag_window_length:],denoised.linear_reg_predictions_cumprod,'-',label='linear reg cumprod - denoised')
ax.plot(denoised.time_series_dates[denoised.training_split+denoised.lag_window_length:],denoised.svm_predictions_cumprod,'-',label='svm cumprod - denoised')
ax.plot(denoised.time_series_dates[denoised.training_split+denoised.lag_window_length:],denoised.neural_net_predictions_cumprod,'-',label='nn cumprod - denoised')

ax.plot(normal.time_series_dates[normal.training_split+normal.lag_window_length:],normal.linear_reg_predictions_cumprod,'--',label='linear reg cumprod')
ax.plot(normal.time_series_dates[normal.training_split+normal.lag_window_length:],normal.svm_predictions_cumprod,'--',label='svm cumprod')
ax.plot(normal.time_series_dates[normal.training_split+normal.lag_window_length:],normal.neural_net_predictions_cumprod,'--',label='nn cumprod')

ax.set_xticks([normal.time_series_dates[x] for x in range(normal.training_split,len(normal.time_series_dates),28)])
ax.tick_params(rotation=30)
ax.legend()
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Summary of takeaways from denoising using signal processing techniques:
- Question, should you denoise before are after computing returns? 
    - Fourier transform needs to be computed on stationary signal(the sinusoids continue through infinity ie stationary), therefore you must do returns first?
    - Wavelet transform can be computed for non-stationary signals - see nice denosing of s&p 500 open prices

- How do we compare forecasting results for different denosing results?
    - RMSE or MAE against the denoised signals means we are comparing the forecasting results of fourier and wavelet denoised against different signals?
    - If we look at cumulative returns over testing dataset, then do we compare against the original cummulative returns?

- Some hyperparameters for wavelet transform:
    - type of wavelet, should be chosen based on data, all papers I've read have used the haar wavelet. Sym look better in my results.
    - Once the dwt transform is applied then a thresholding approach can be applied to set low coefficients to zero. Then iDWT taken to retrieve denoised signal. This threshold value and type of thresholding are another hyperparameter. 
    - The level of decomposition is also a hyperparameter. 

- Some hyperparameters for fourier transform:
    - thresholding value of different frequencies.