# Code up class to perform different tasks

In [14]:
# interactive figures
%matplotlib widget 

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# model evalution metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# data preprocessing
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler

# predictive models
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR

# cross validation and hyper-parameter search
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV


# 1.0 Class for univariate one-step ahead forecasting 

In [2]:
class time_series_prediction():

    def __init__(self,dates,one_d_time_series,lag_window_length,n_ahead_prediction):

        # raw input data + settings for time series -> supervised learning ML problem
        self.one_d_time_series = np.array(one_d_time_series)      # time series array, to array ensure index works as expected for class methods
        self.time_series_dates = np.array(dates)                  # time stamp / date for each data point
        self.lag_window_length = lag_window_length                # length of lag window
        self.n_ahead_prediction = n_ahead_prediction              # time ahead to predict

        # transfromed data: set after calling .sliding_window_1()
        self.input_data = None
        self.target_data = None

        # testing and training data: set after calling .train_test_split()
        self.training_split = None
        self.X_test = None
        self.X_train = None
        self.y_test = None
        self.y_train = None

        # predictions from various models - set after calling each models training
        self.linear_reg_predictions = None
        self.svm_predictions = None
        self.neural_net_predictions = None
        self.naive_predictions = None

        # cumprod results from predictions - set after calling .vis_results_time_series()
        self.real_vals_cumprod = None
        self.linear_reg_predictions_cumprod = None
        self.svm_predictions_cumprod = None
        self.neural_net_predictions_cumprod = None
    

# ****************************************************************************************************************
    # data wrangling
# ****************************************************************************************************************

    # method to transfroms 1-D time series to supervised ML problem: one step ahead forecasting   
    def sliding_window_1(self,verbose):
        # initialize input array
        num_rows = len(self.one_d_time_series) - self.lag_window_length
        array = np.zeros((num_rows, self.lag_window_length + 1))
        
        # loop through data and populate array
        for i in range(num_rows):
            # input features
            array[i,0:self.lag_window_length+1] = self.one_d_time_series[i:i+self.lag_window_length+1]
            # target feature/s
            array[i,-1] = self.one_d_time_series[i+self.lag_window_length]
            
            if verbose == 1:
                # show pattern
                print(array[i,0:self.lag_window_length],' : ',array[i,self.lag_window_length])

        # save results as a class attribute
        self.input_data = array[:,0:self.lag_window_length]
        self.target_data = array[:,self.lag_window_length]

    # method to perform a training and testing split for dataset with only a single column of target variables
    def train_test_split(self,split):
        # sequentially splits data for testing and training
        self.training_split = split
        self.X_train = self.input_data[0:split,:]
        self.X_test = self.input_data[split:,:]
        self.y_train = self.target_data[0:split]
        self.y_test = self.target_data[split:]

        # generate different folds from training data for cross validation during hyperparameter tuning

        # different folds for cross validation
        tscv = TimeSeriesSplit(n_splits=5)

        # visualize cross validation splits
        fig,ax = plt.subplots(5,1,sharex=True)
        i = 0
        training_data = self.one_d_time_series[0:self.training_split]
        for tr_index, val_index in tscv.split(training_data): # training and validation splits for 5 folds
            # print(tr_index, val_index)
            ax[i].plot(tr_index,training_data[tr_index[0]:tr_index[-1]+1],'b-',label='training set')
            ax[i].plot(val_index,training_data[val_index[0]:val_index[-1]+1],'r-',label='validation set')
            ax[i].legend()
            i += 1
        ax[0].set_title('Cross validation sets for hyperparameter tuning')
        plt.tight_layout()
        plt.show()




# ****************************************************************************************************************
    # predictive models
# ****************************************************************************************************************

    def linear_regression(self):
        print('Training multivariate linear regression:')
        # train model
        reg_model = LinearRegression().fit(self.X_train,self.y_train)
        print('\nLinear regression coefficients: \n',reg_model.coef_)

        # test model
        predictions = reg_model.predict(self.X_test)

        # evaluate: use sklearn metric methods to calc rmse and mae
        mse = mean_squared_error(self.y_test,predictions)
        mae = mean_absolute_error(self.y_test,predictions)

        print('RMSE: ',np.sqrt(mse))
        print('MAE: ',mae)

        # save predictions
        self.linear_reg_predictions = predictions

    def support_vector_machine(self,model_tunning=True,C=None,kernel=None,epsilon=None):
        print('\nTraining support vector machine:')

        if model_tunning == False: #hyperparameter are known
            # train model
            svm_regres = SVR(max_iter=1000,C=C, kernel=kernel, epsilon=epsilon).fit(self.X_train,self.y_train)
            print('Model params: ', svm_regres.get_params())
            # predict on test set
            svm_predictions = svm_regres.predict(self.X_test)

            # evaluate
            mse = mean_squared_error(self.y_test,svm_predictions[:])
            mae = mean_absolute_error(self.y_test,svm_predictions[:])

            print('RMSE: ',np.sqrt(mse))
            print('MAE: ',mae)

            # save predictions
            self.svm_predictions = svm_predictions
        
        else: # must hyperparameter tune model

            # define model: support vector machine for regression
            model = SVR()

            # hyperparameter values to check
            param_grid = [
            {'C': [0.1, 1, 10, 100], 'kernel': ['linear','rbf','sigmoid'],'epsilon':[0.1,1,10,100]},
            ]

            # perform grid search, using cross validaiton
            tscv = TimeSeriesSplit(n_splits=5)
            gsearch = GridSearchCV(estimator=model, cv=tscv, param_grid=param_grid, scoring = 'neg_mean_squared_error',verbose=4,n_jobs=-3)
            gsearch.fit(self.X_train, self.y_train)
            print('best_score: ', gsearch.best_score_)
            print('best_model: ', gsearch.best_estimator_)
            print('best_params: ',gsearch.best_params_)

            # predict on test set
            svm_predictions = gsearch.best_estimator_.predict(self.X_test)

            # evaluate
            mse = mean_squared_error(self.y_test,svm_predictions[:])
            mae = mean_absolute_error(self.y_test,svm_predictions[:])

            print('RMSE: ',np.sqrt(mse))
            print('MAE: ',mae)

            # save predictions
            self.svm_predictions = svm_predictions

    def neural_net_mlp(self,verbose=0,model_tunning=True,hidden_layer_sizes=None,activation=None,learning_rate=None,learning_rate_init=None):
        print('\nTraining neural network: ')

        if model_tunning == False:
            # train neural network
            nn_regres = MLPRegressor(hidden_layer_sizes=hidden_layer_sizes,activation=activation,learning_rate=learning_rate,learning_rate_init=learning_rate_init,shuffle=False,random_state=1,max_iter=1000,verbose=verbose).fit(self.X_train,self.y_train)
            print('Model params:', nn_regres.get_params())
            # make predictions
            nn_predictions = nn_regres.predict(self.X_test)

            # evaluate
            mse = mean_squared_error(self.y_test,nn_predictions[:])
            mae = mean_absolute_error(self.y_test,nn_predictions[:])

            print('RMSE: ',np.sqrt(mse))
            print('MAE: ',mae)

            # save predictions
            self.neural_net_predictions = nn_predictions
        
        else: # perform hyperparameter tuning
            MLP = MLPRegressor(shuffle=False,max_iter=1000) # must set shuffle to false to avoid leakage of information due to sequance problem

            # hyperparameter values to check
            param_grid = [
            {'hidden_layer_sizes': [(10,),(100,),(1000,)], 'activation': ['logistic', 'tanh', 'relu'],'learning_rate': ['constant', 'invscaling', 'adaptive'], 'learning_rate_init':[0.001,0.01,1]}
 ]
            # perform grid search, using cross validaiton
            tscv = TimeSeriesSplit(n_splits=5)
            gsearch = GridSearchCV(estimator=MLP, cv=tscv, param_grid=param_grid, scoring = 'neg_mean_squared_error',verbose=4,n_jobs=-3)
            gsearch.fit(self.X_train, self.y_train)
            print('best_score: ', gsearch.best_score_)
            print('best_model: ', gsearch.best_estimator_)
            print('best_params: ',gsearch.best_params_)

            # model
            mlp_predictions = gsearch.best_estimator_.predict(self.X_test)

            # evaluate
            mse = mean_squared_error(self.y_test,mlp_predictions)
            mae = mean_absolute_error(self.y_test,mlp_predictions)

            print('RMSE: ',np.sqrt(mse))
            print('MAE: ',mae)

             # save predictions
            self.neural_net_predictions = mlp_predictions

    def naive_model(self): # t's prediction is t-1's value, note that this means you miss the first time point
        preds = np.zeros(len(self.one_d_time_series)-1)
        preds[0] = np.nan()
        preds[1:] = self.one_d_time_series[0:-2]
        self.naive_predictions = preds

# ****************************************************************************************************************
    # visualize results
# ****************************************************************************************************************
    def error(self,real_data,predicted_data):
        error = np.zeros(len(real_data))
        error = (real_data - predicted_data) / real_data
        return error

    # visualize orignal time series signal aswell as predictions    
    def vis_results_time_series(self,second_plot='error'):
        # plot prediction against actual + training data
        fig, ax = plt.subplots(2,1,figsize=(10,7),sharex=True)

        # original time series
        ax[0].plot(self.time_series_dates[self.training_split+self.lag_window_length:],self.one_d_time_series[self.training_split+self.lag_window_length:],'o-',linewidth=3,label='real values',markersize=5) 

        # predicted y values
        ax[0].plot(self.time_series_dates[self.training_split+self.lag_window_length:],self.linear_reg_predictions,'o-',label='linear regression prediction',markersize=5)
        # ax[0].plot(self.time_series_dates,self.naive_predictions,'.--',label='naive prediction',markersize=5)
        ax[0].plot(self.time_series_dates[self.training_split+self.lag_window_length:],self.svm_predictions,'.--',label='svm prediction',markersize=5)
        ax[0].plot(self.time_series_dates[self.training_split+self.lag_window_length:],self.neural_net_predictions,'.--',label='nn prediction',markersize=5)

        ax[0].legend()
        ax[0].set_title('Real values vs model predictions')

        # plot error plot
        if second_plot == 'error':
            error_linreg = self.error(self.y_test,self.linear_reg_predictions)
            # error_naive = error(np.array(test_data[:,-1]),naive_predictions)
            error_svm = self.error(self.y_test,self.svm_predictions)
            error_nn = self.error(self.y_test,self.neural_net_predictions)

            ax[1].plot(self.time_series_dates[self.training_split+self.lag_window_length:],error_linreg,'r-',label='linear reg error')
            # ax[1].plot(self.time_series_dates,error_naive[1:],'-',label='naive error')
            ax[1].plot(self.time_series_dates[self.training_split+self.lag_window_length:],error_svm,'-',label='svm error')
            ax[1].plot(self.time_series_dates[self.training_split+self.lag_window_length:],error_nn,'-',label='nn error')
            ax[1].set_title('Error signal for predictive models')
            ax[1].set_xlabel('Dates')
            ax[1].legend()
            # ax[1].set_ylim([-10,10])
            ax[1].set_xticks([self.time_series_dates[x] for x in range(self.training_split,len(self.time_series_dates),28)])
            ax[1].tick_params(rotation=30)
        
        elif second_plot == 'cumprod':

            # plot cummulative prod plots - this should only be done if input data is percentage retunrs
            self.real_vals_cumprod = (self.y_test+1).cumprod()
            self.linear_reg_predictions_cumprod = (self.linear_reg_predictions + 1).cumprod()
            self.svm_predictions_cumprod = (self.svm_predictions + 1).cumprod()
            self.neural_net_predictions_cumprod = (self.neural_net_predictions + 1).cumprod()

            ax[1].plot(self.time_series_dates[self.training_split+self.lag_window_length:],self.real_vals_cumprod,'-',label='real vals cumprod')
            ax[1].plot(self.time_series_dates[self.training_split+self.lag_window_length:],self.linear_reg_predictions_cumprod,'-',label='linear reg cumprod')
            ax[1].plot(self.time_series_dates[self.training_split+self.lag_window_length:],self.svm_predictions_cumprod,'-',label='svm cumprod')
            ax[1].plot(self.time_series_dates[self.training_split+self.lag_window_length:],self.neural_net_predictions_cumprod,'-',label='nn cumprod')

            ax[1].set_xticks([self.time_series_dates[x] for x in range(self.training_split,len(self.time_series_dates),28)])
            ax[1].tick_params(rotation=30)
            ax[1].legend()

        # titles and save figures
        # title_string = 'S&P500 predictions _ y is '+str(column)+'_ window len is '+ str(window_length)
        # fig.suptitle(title_string)
        
        # fig_name = '../results/univariate_single_step_ahead/'+title_string+'.png'
        # plt.savefig(fig_name,facecolor='w')
        plt.tight_layout()

    # visualize predictions against real values using scatter plot
    def vis_results_scatter(self):

        # create dataframe to hold all results
        df_predictions = pd.DataFrame(index=self.time_series_dates[self.training_split+self.lag_window_length:],columns=['Real_values','linear_reg_predictions','svm_predictions','neural_net_predictions'])
        df_predictions['Real_values'] = self.y_test
        df_predictions['linear_reg_predictions'] = self.linear_reg_predictions
        df_predictions['svm_predictions'] = self.svm_predictions
        df_predictions['neural_net_predictions'] = self.neural_net_predictions

        # scatter plot with hues
        fig, ax = plt.subplots(3,1,figsize=(7,10))
        sns.scatterplot(y=df_predictions['Real_values'],x=df_predictions['linear_reg_predictions'],ax=ax[0])
        sns.lineplot(x=self.y_test,y=self.y_test,ax=ax[0],color='red')

        sns.scatterplot(y=df_predictions['Real_values'],x=df_predictions['svm_predictions'],ax=ax[1])
        sns.lineplot(x=self.y_test,y=self.y_test,ax=ax[1],color='red')

        sns.scatterplot(y=df_predictions['Real_values'],x=df_predictions['neural_net_predictions'],ax=ax[2])
        sns.lineplot(x=self.y_test,y=self.y_test,ax=ax[2],color='red')

        # plot formatting
        plt.tight_layout()

    # method to plot testing and training split of data
    def test_train_plot(self):
        fig, ax = plt.subplots(figsize=(10,5))
        ax.plot(self.time_series_dates[0:self.training_split] ,self.one_d_time_series[0:self.training_split],'k-',label='Training data') # replace returns with sp_500 for other data plotting
        ax.plot(self.time_series_dates[self.training_split:] ,self.one_d_time_series[self.training_split:],'r-',label='Testing data')
        ax.plot(self.time_series_dates[self.training_split+self.lag_window_length:] ,self.y_test,'o',label='Windowed testing data') # important to match time by start 5 (length of time window) after where segmented our testing and training data
        plt.legend(loc=0) 
        ax.set_xticks([self.time_series_dates[x] for x in range(0,len(self.time_series_dates),150)])
        ax.tick_params(rotation=30) 
        ax.set_title('Test traing split')
        plt.tight_layout()

# 2.0 Import some test data

In [3]:
# import some data
sp_500 = pd.read_csv('./test_data/GSPC.csv') 
sp_500

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1950-01-03,16.660000,16.660000,16.660000,16.660000,16.660000,1260000
1,1950-01-04,16.850000,16.850000,16.850000,16.850000,16.850000,1890000
2,1950-01-05,16.930000,16.930000,16.930000,16.930000,16.930000,2550000
3,1950-01-06,16.980000,16.980000,16.980000,16.980000,16.980000,2010000
4,1950-01-09,17.080000,17.080000,17.080000,17.080000,17.080000,2520000
...,...,...,...,...,...,...,...
17213,2018-05-31,2720.979980,2722.500000,2700.679932,2705.270020,2705.270020,4235370000
17214,2018-06-01,2718.699951,2736.929932,2718.699951,2734.620117,2734.620117,3684130000
17215,2018-06-04,2741.669922,2749.159912,2740.540039,2746.870117,2746.870117,3376510000
17216,2018-06-05,2748.459961,2752.610107,2739.510010,2748.800049,2748.800049,3517790000


# 3.0 Example of using class

In [7]:
# initialize class object
normal = time_series_prediction(sp_500['Date'][-2000:],sp_500['Volume'][-2000:]/1e9,5,1) # pass: ime series dates, univariate time series, lag window length, a number of steps ahead to predict
normal.sliding_window_1(verbose=0) # time series to supervised ML problem
normal.train_test_split(split=1200) # testing and training dataset split
normal.test_train_plot()    # visualize training split

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [8]:
# perform some prediction tasks
normal.linear_regression()
normal.support_vector_machine(model_tunning=True)
normal.neural_net_mlp(model_tunning=True)
# normal.naive_model()

Training multivariate linear regression:

Linear regression coefficients: 
 [ 0.01592332 -0.04798209  0.01470487 -0.03113864  0.01997245 -0.00756195
  0.01128548 -0.02102782  0.00054607  0.02279579  0.01401249  0.03641352
 -0.01377384  0.03288997  0.03230216 -0.05591278  0.02344589  0.02939591
  0.01446684 -0.05212056  0.03234467  0.04403855  0.04477377 -0.01877487
 -0.01662812 -0.04726332  0.03443288 -0.00166518  0.03537087  0.01852641
 -0.01398799  0.03454426  0.0537333  -0.03814313 -0.01196013 -0.03994896
  0.03781376 -0.00117192  0.00285367  0.01492658  0.02168905 -0.00160041
 -0.01661934 -0.01063986  0.00086509  0.08185323  0.02305453  0.03169653
  0.14131047  0.41711918]
RMSE:  0.5568350714633039
MAE:  0.37348103238908165

Training support vector machine:
Fitting 5 folds for each of 48 candidates, totalling 240 fits
[Parallel(n_jobs=-3)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=-3)]: Done 150 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-3)]: Done

In [9]:
# visualize results
normal.vis_results_time_series(second_plot='error')


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

- even with the volume data which seems more stationary than open price data, the forecasts are still dominated by t-1

In [56]:
# plot predicted vs real value scatter plots
normal.vis_results_scatter()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# 4.0 Play around with standardization and prediction returns

In [10]:
# some misc data
x = sp_500['Open'][-2000:]
dates = sp_500['Date'][-2000:]
# percentage returns
x_pct = x.pct_change().fillna(0)
x_pct

# create new df hold both
df = pd.DataFrame(columns=['Dates','Open','pct_change','pct_change_cumprod']) # ,'log_transform'
df['Dates'] = dates
df['Open'] =  x
df['pct_change'] = x_pct
df['pct_change_cumprod'] = (x_pct + 1).cumprod()
# df['log_transform'] = np.log(df['Open'] )

df.reset_index(inplace=True,drop=True)

# plot
df.plot(subplots=True,sharex=True,figsize=(7,7))
plt.tight_layout()

# view data
df

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Unnamed: 0,Dates,Open,pct_change,pct_change_cumprod
0,2010-06-28,1077.500000,0.000000,1.000000
1,2010-06-29,1071.099976,-0.005940,0.994060
2,2010-06-30,1040.560059,-0.028513,0.965717
3,2010-07-01,1031.099976,-0.009091,0.956937
4,2010-07-02,1027.650024,-0.003346,0.953736
...,...,...,...,...
1995,2018-05-31,2720.979980,0.006864,2.525271
1996,2018-06-01,2718.699951,-0.000838,2.523155
1997,2018-06-04,2741.669922,0.008449,2.544473
1998,2018-06-05,2748.459961,0.002477,2.550775


- unsure what the log transform is required for

## 4.1 predicting using returns data

In [38]:
scaler = MinMaxScaler()
df['pct_change_normalised'] = scaler.fit_transform(df['pct_change'].to_numpy().reshape(-1, 1))

normal = time_series_prediction(df['Dates'],df['pct_change_normalised'],10,1) # pass time series, lag window length, a number of steps ahead to predict
normal.sliding_window_1(verbose=0) # time series to supervised learning ML problem
normal.train_test_split(split=1500) # testing and training dataset split
normal.test_train_plot() 

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [39]:
# perform some prediction tasks
normal.linear_regression()
normal.support_vector_machine(model_tunning=True)
normal.neural_net_mlp(model_tunning=True)

Training multivariate linear regression:

Linear regression coefficients: 
 [ 0.0399997  -0.03879977 -0.00294204 -0.00418228 -0.00415419 -0.0923927
 -0.0174214  -0.07352073  0.02916561 -0.01048691]
RMSE:  0.060951546047454785
MAE:  0.03906715020243121

Training support vector machine:
Fitting 5 folds for each of 48 candidates, totalling 240 fits
[Parallel(n_jobs=-3)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=-3)]: Done  74 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-3)]: Done 152 out of 240 | elapsed:    1.4s remaining:    0.8s
[Parallel(n_jobs=-3)]: Done 240 out of 240 | elapsed:    1.7s finished
[Parallel(n_jobs=-3)]: Using backend LokyBackend with 14 concurrent workers.
best_score:  -0.0072276237637182664
best_model:  SVR(C=0.1, kernel='sigmoid')
best_params:  {'C': 0.1, 'epsilon': 0.1, 'kernel': 'sigmoid'}
RMSE:  0.05981790295000705
MAE:  0.03859594759057162

Training neural network: 
Fitting 5 folds for each of 81 candidates, totalling 405 fits
[P

In [41]:
normal.vis_results_time_series(second_plot='error')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

some remarks on predictions using returns of open price:
- prediction accuracy of models look terrible. Is it even worth comparing feature engineering approaches if predictions are this bad?
- evaluating models using cummulative gains seems reasonable

# 5.0 Denosing using fourier transform 

In [157]:
# import scipy fft functions
from scipy.fft import fft, ifft, fftfreq

In [168]:
# apply discrete fourier transform
signal = np.array(sp_500['Volume'][-2100:]/1e9) # data
fft_coefficients = fft(signal) # fourier transform
fft_coefficients

array([7841.0835      -0.j        ,  344.33876287 -95.53936485j,
       -101.66935261-291.99051477j, ...,  133.3537504 +170.01192444j,
       -101.66935261+291.99051477j,  344.33876287 +95.53936485j])

In [169]:
# plot orignal signal and inverse fourier transform, shows you can transform signal to frequency domain, then back to time domain
inverse_fft = ifft(fft_coefficients)
fig,ax = plt.subplots(figsize=(10,4))
ax.plot(sp_500['Date'][-2100:],inverse_fft,'-',label='Inverse fourier data')
ax.plot(range(0,len(inverse_fft)),signal,'.',label='Real data')
ax.set_xlabel('Days')
ax.set_xticks([sp_500['Date'][-2100:].iloc[x] for x in range(0,len(sp_500['Date'][-2100:]),120)])
ax.legend()
ax.tick_params(rotation=30,labelsize=15)
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

  return array(a, dtype, copy=False, order=order)


In [170]:
# plot amplitude vs frequency 
n = len(signal)

# get frequencies and psd
freqs = fftfreq(signal.shape[0]) # x axis of amplitude vs frequency graphs
psd = np.abs(fft_coefficients)/n # psd is amplitude/N, psd or power spectrum density is the magnitude of the coefficients resulting from fourier transform

# plot psd
fig,ax = plt.subplots(figsize=(10,5))
ax.plot(freqs[1:int(n/2)],psd[1:int(n/2)])
ax.set_ylabel('Power spectrum',fontsize=15)
ax.set_xlabel('Frequencies',fontsize=15)
ax.set_title('FFT')
ax.tick_params(labelsize=15)
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

observations from coefficient magnitude vs frequency graph:
- most frequencies have low amplitude
- can denoise signal by setting coefficients with low amplitude to zero - ie a thresholding approach. Here the threshold might be something like 0.06
- fyi: frequency = 1 / #days therefore 

In [178]:
# Threshold coefficients to denoise signal
psd_indices = psd > 0.06 # mask
fft_filtered = fft_coefficients*psd_indices

# low pass filtering
freq_indices = freqs < 0.003 
fft_filtered = fft_coefficients*freq_indices

# inverse transform filter coefficients
inverse_transform_filtered = ifft(fft_filtered)

# plot this
fig,ax = plt.subplots(figsize=(10,5))
ax.plot(sp_500['Date'][-2100:],signal,'-',label='Real data')
ax.plot(sp_500['Date'][-2100:],inverse_transform_filtered,'-',label='Inverse fourier filtered')
ax.legend()
ax.set_title('Threshold = 0.06')
ax.set_xlabel('Days',fontsize=15)
ax.set_xticks([sp_500['Date'][-2100:].iloc[x] for x in range(0,len(sp_500['Date'][-2100:]),120)])
ax.tick_params(rotation=30,labelsize=15)

plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

  return array(a, dtype, copy=False, order=order)


In [180]:
# put together artifical data, here the training set is denoised and the testing set is left unchanged
df_data = pd.DataFrame(columns=['Dates','artificial_data'])
df_data['Dates'] = sp_500['Date'][-2000:]
df_data['artificial_data'] = np.concatenate((np.real(inverse_transform_filtered)[-2000:-500],sp_500['Volume'][-500:].to_numpy()/1e9),axis=None)

# plot this
fig,ax = plt.subplots(figsize=(10,5))
ax.plot(sp_500['Date'][-2000:],sp_500['Volume'][-2000:]/1e9,'-',label='Real data')
ax.plot(df_data['Dates'],df_data['artificial_data'],'-',label='Artificial testing and training data')
ax.legend()
ax.set_title('Threshold = 0.06')
ax.set_xlabel('Days',fontsize=15)
ax.set_xticks([sp_500['Date'][-2000:].iloc[x] for x in range(0,len(sp_500['Date'][-2000:]),120)])
ax.tick_params(rotation=30,labelsize=15)

plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## 5.1 now run predictions by training on filtered data

In [181]:
# initializing predicition class object for denoise
fft_denoised = time_series_prediction(sp_500['Date'][-2000:],df_data['artificial_data'],5,1) # pass time series, lag window length, a number of steps ahead to predict
fft_denoised.sliding_window_1(verbose=0) # time series to supervised learning ML problem
fft_denoised.train_test_split(split=1500) # testing and training dataset split
fft_denoised.test_train_plot() 

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [182]:
# perform some prediction tasks
fft_denoised.linear_regression()
fft_denoised.support_vector_machine(model_tunning=True,C= 100, epsilon= 0.1, kernel= 'linear') # these values come from first training model on normal data
fft_denoised.neural_net_mlp(model_tunning=True,activation= 'tanh', hidden_layer_sizes= (1000,), learning_rate= 'constant', learning_rate_init= 0.001) # these values come from first training model on normal data

Training multivariate linear regression:

Linear regression coefficients: 
 [0.12485823 0.04414433 0.06316444 0.17700903 0.47536663]
RMSE:  0.5346242481632774
MAE:  0.34899960149011056

Training support vector machine:
Fitting 5 folds for each of 48 candidates, totalling 240 fits
[Parallel(n_jobs=-3)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=-3)]: Done  75 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-3)]: Done 152 out of 240 | elapsed:    1.5s remaining:    0.8s
[Parallel(n_jobs=-3)]: Done 240 out of 240 | elapsed:    3.3s finished
best_score:  -0.08275972620987947
best_model:  SVR(C=100, kernel='linear')
best_params:  {'C': 100, 'epsilon': 0.1, 'kernel': 'linear'}
RMSE:  0.5356419260767656
MAE:  0.34698266067530154

Training neural network: 
Fitting 5 folds for each of 81 candidates, totalling 405 fits
[Parallel(n_jobs=-3)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=-3)]: Done 140 tasks      | elapsed:    5.0s
[Parallel(n_

In [183]:
fft_denoised.vis_results_time_series(second_plot='error')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## 5.2 Compare denoised results to normal 

In [186]:
# run predictions on volume data without denoising

# initializing predicition class object for denoise
normal = time_series_prediction(sp_500['Date'][-2000:],sp_500['Volume'][-2000:]/1e9,5,1) # pass time series, lag window length, a number of steps ahead to predict
normal.sliding_window_1(verbose=0) # time series to supervised learning ML problem
normal.train_test_split(split=1500) # testing and training dataset split
normal.test_train_plot() 

# perform some prediction tasks
normal.linear_regression()
normal.support_vector_machine(model_tunning=False,C= 100, epsilon= 0.1, kernel= 'linear')
normal.neural_net_mlp(model_tunning=False,activation= 'relu', hidden_layer_sizes= (100,), learning_rate= 'adaptive', learning_rate_init= 0.01)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Training multivariate linear regression:

Linear regression coefficients: 
 [0.09282714 0.02080688 0.04822414 0.15111368 0.44476971]
RMSE:  0.5274919871056635
MAE:  0.3471744942971835

Training support vector machine:
Model params:  {'C': 100, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'linear', 'max_iter': 1000, 'shrinking': True, 'tol': 0.001, 'verbose': False}
RMSE:  0.6609905743797734
MAE:  0.44956223654614363

Training neural network: 
Model params: {'activation': 'relu', 'alpha': 0.0001, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (100,), 'learning_rate': 'adaptive', 'learning_rate_init': 0.01, 'max_fun': 15000, 'max_iter': 1000, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': 1, 'shuffle': False, 'solver': 'adam', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
RMSE:  0.5354893

In [187]:
# compare results for denoised and normal data: pulling data from predicito objects
fig,ax = plt.subplots(figsize=(10,5))

ax.plot(normal.time_series_dates[normal.training_split+normal.lag_window_length:],normal.y_test,'-',label='real vals')
ax.plot(fft_denoised.time_series_dates[fft_denoised.training_split+fft_denoised.lag_window_length:],fft_denoised.linear_reg_predictions,'-',label='linear reg - denoised')
ax.plot(fft_denoised.time_series_dates[fft_denoised.training_split+fft_denoised.lag_window_length:],fft_denoised.svm_predictions,'-',label='svm - denoised')
ax.plot(fft_denoised.time_series_dates[fft_denoised.training_split+fft_denoised.lag_window_length:],fft_denoised.neural_net_predictions,'-',label='nn - denoised')

ax.plot(normal.time_series_dates[normal.training_split+normal.lag_window_length:],normal.linear_reg_predictions,'--',label='linear reg ')
ax.plot(normal.time_series_dates[normal.training_split+normal.lag_window_length:],normal.svm_predictions,'--',label='svm ')
ax.plot(normal.time_series_dates[normal.training_split+normal.lag_window_length:],normal.neural_net_predictions,'--',label='nn ')

ax.set_xticks([normal.time_series_dates[x] for x in range(normal.training_split,len(normal.time_series_dates),28)])
ax.tick_params(rotation=30)
ax.legend()
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

How to compare results before and after denoising?
- rmse for predictions of denoised data cant be compared to remse of predictions using normal data because you are comparing against two different signals, one noisy and one denoised.

# 6.0 Wavelet denoising

Drawbacks of fourier transform / denoising:
- requires stationary data
- no localization of when different frequencies occured
- thresholding fourier trasnform coefficients requires setting a hyperparameter - the threshold 

Benefits of wavelets transform:
- data does not need to be stationary
- localization of when frequencies occur

Drawbacks of wavelets for denoising:
- more hyperparameters, threshold value as well as selecting wavelet type

## 6.1 Wavelet transform / decomposition of time series signal

Wavelet denoisng method:
- First perform a wavelet transform of the open data, denoise by thresholding coefficients, then computes returns of denoised signal. Compute returns and perform forecasting. Transform predictions to value and compare. 

In [54]:

import pywt
import sys

# Data format:
# Raw data should be in a .txt file with two columns, separated by tabs:
#  - The first column should be a time-series index
#  - The second column should contain the data to be filtered

# Time series / data:
data = sp_500['Open'][-5000:] 

index = sp_500['Open'][-5000:].index

# Create wavelet object and define parameters
w = pywt.Wavelet('sym8') # sym family look good too sym8, this is where you should change the wavelet type, haar wavelet is simply 'haar'
maxlev = pywt.dwt_max_level(len(data), w.dec_len)
print("maximum level is " + str(maxlev))
threshold = 0.8 # Threshold for filtering coefficients as part of denoising, the higher this value the more coefficients you set to zero, ie more of the original signal you truncate away / denoise

# Decompose into wavelet components, to the level selected:
coeffs = pywt.wavedec(data, w, level=4)

# Threshold the wavelet coefficients, thereby removing noise

# plt.figure(figsize=(8,15))
for i in range(1, len(coeffs)):
    # plt.subplot(maxlev, 1, i)
    # plt.plot(coeffs[i],label='Original coefficients')
    coeffs[i] = pywt.threshold(coeffs[i], threshold*max(coeffs[i]),mode='hard')
    # plt.plot(coeffs[i],label='Thresholded coefficients')
    # plt.ylabel('Scale: '+str(maxlev-i+1))
    # plt.legend()
    plt.tight_layout()

# inverse transform coefficient to reconstruct time series signal, minus noise
datarec = pywt.waverec(coeffs, w)

plt.figure(figsize=(15,5))
plt.plot(index, data,label='Raw signal')
plt.plot(index, datarec,label="De-noised signal using wavelet techniques")
plt.legend()
plt.tight_layout()
plt.show()

# Distance measures between true signal and denoised


maximum level is 8


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [48]:
coeffs = pywt.wavedec(data, w, level=maxlev)
coeffs_array = np.array(coeffs)
for i in range(len(coeffs)):
    print(i,' : ',len(coeffs[i]))

0  :  34
1  :  34
2  :  53
3  :  92
4  :  170
5  :  326
6  :  638
7  :  1261
8  :  2507
  coeffs_array = np.array(coeffs)


## 6.2 Predictions: normal data vs wavelet denoised

predict sp500 open price one day ahead, using precentage returns

In [5]:
# some data processing: 

# create new df for normal data
df_normal = pd.DataFrame(columns=['Dates','Open','pct_change','pct_change_cumprod'])
df_normal['Dates'] = sp_500['Date'][-5000:]
df_normal['Open'] =   sp_500['Open'][-5000:]
df_normal['pct_change'] = df_normal['Open'].pct_change().fillna(0)
df_normal['pct_change_cumprod'] = (df_normal['pct_change']  + 1).cumprod()

df_normal.reset_index(inplace=True,drop=True)

# create new df for wavelet denoised data
df_denoised= pd.DataFrame(columns=['Dates','Open','pct_change','pct_change_cumprod'])
df_denoised['Dates'] = sp_500['Date'][-5000:]
df_denoised['Open'] =   datarec
df_denoised['pct_change'] = df_denoised['Open'].pct_change().fillna(0)
df_denoised['pct_change_cumprod'] = (df_denoised['pct_change']  + 1).cumprod()

df_denoised.reset_index(inplace=True,drop=True)

In [6]:
fig,ax = plt.subplots(figsize=(10,7))
df_normal.plot(subplots=True,ax=ax)
fig,ax = plt.subplots(figsize=(10,7))
df_denoised.plot(subplots=True,ax=ax)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

  plot_obj.generate()


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

  plot_obj.generate()


array([<AxesSubplot:>, <AxesSubplot:>, <AxesSubplot:>], dtype=object)

## 6.2 Perform prediction on data

In [7]:
########################################################################
# forecasting on normal data
########################################################################

normal = time_series_prediction(df_normal['Dates'],df_normal['pct_change'],10,1) # pass time series, lag window length, a number of steps ahead to predict
normal.sliding_window_1(verbose=0) # time series to supervised learning ML problem
normal.train_test_split(split=4500) # testing and training dataset split
normal.test_train_plot()    # visualize training split

# perform some prediction tasks
normal.linear_regression()
normal.support_vector_machine(model_tunning=True)
normal.neural_net_mlp(model_tunning=True)

#visualize results
normal.vis_results_time_series(second_plot='cumprod')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Training multivariate linear regression:

Linear regression coefficients: 
 [ 0.00827728 -0.01519066  0.01269019 -0.03919226 -0.00528929 -0.05123576
 -0.011838    0.00053218 -0.05485617 -0.0432033 ]
RMSE:  0.006622549754153012
MAE:  0.004311865851622058

Training support vector machine:
Fitting 5 folds for each of 48 candidates, totalling 240 fits
[Parallel(n_jobs=-3)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=-3)]: Done  84 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-3)]: Done 213 out of 240 | elapsed:    1.6s remaining:    0.1s
[Parallel(n_jobs=-3)]: Done 240 out of 240 | elapsed:    1.6s finished
[Parallel(n_jobs=-3)]: Using backend LokyBackend with 14 concurrent workers.
best_score:  -0.0002024966760961271
best_model:  SVR(C=0.1, epsilon=100, kernel='linear')
best_params:  {'C': 0.1, 'epsilon': 100, 'kernel': 'linear'}
RMSE:  0.011372765579198503
MAE:  0.009777494110736069

Training neural network: 
Fitting 5 folds for each of 81 candidates, totall

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [8]:
########################################################################
# forecasting on denoised data
########################################################################

denoised = time_series_prediction(df_denoised['Dates'],df_denoised['pct_change'],10,1) # pass time series, lag window length, a number of steps ahead to predict
denoised.sliding_window_1(verbose=0) # time series to supervised learning ML problem
denoised.train_test_split(split=4500) # testing and training dataset split
denoised.test_train_plot() 

# perform some prediction tasks
denoised.linear_regression()
denoised.support_vector_machine(model_tunning=True,C= 0.1, epsilon= 10, kernel= 'linear')
denoised.neural_net_mlp(model_tunning=True,activation= 'relu', hidden_layer_sizes= (100,), learning_rate= 'adaptive', learning_rate_init= 0.001)

#visualize results
denoised.vis_results_time_series(second_plot='cumprod')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Training multivariate linear regression:

Linear regression coefficients: 
 [ 0.0237145  -0.02246668  0.04740009  0.00701641  0.03584964 -0.03737366
 -0.03726144  0.05776496  0.03446961  0.17203609]
RMSE:  0.004456301705215824
MAE:  0.0016611758073728459

Training support vector machine:
Fitting 5 folds for each of 48 candidates, totalling 240 fits
[Parallel(n_jobs=-3)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=-3)]: Done  70 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-3)]: Done 240 out of 240 | elapsed:    1.4s finished
[Parallel(n_jobs=-3)]: Using backend LokyBackend with 14 concurrent workers.
best_score:  -0.00023433945042440927
best_model:  SVR(C=0.1, epsilon=10, kernel='linear')
best_params:  {'C': 0.1, 'epsilon': 10, 'kernel': 'linear'}
RMSE:  0.015381430721325244
MAE:  0.014829075969529875

Training neural network: 
Fitting 5 folds for each of 81 candidates, totalling 405 fits
[Parallel(n_jobs=-3)]: Done 140 tasks      | elapsed:    4.5s
[Paral

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## 6.3 Compare results for normal vs denoised

In [114]:
# compare results for denoised and normal data
fig,ax = plt.subplots(figsize=(10,5))

ax.plot(normal.time_series_dates[normal.training_split+normal.lag_window_length:],normal.real_vals_cumprod,'-',label='real vals cumprod',linewidth=3)
ax.plot(denoised.time_series_dates[denoised.training_split+denoised.lag_window_length:],denoised.linear_reg_predictions_cumprod,'-',label='linear reg cumprod - denoised')
# ax.plot(denoised.time_series_dates[denoised.training_split+denoised.lag_window_length:],denoised.svm_predictions_cumprod,'-',label='svm cumprod - denoised')
ax.plot(denoised.time_series_dates[denoised.training_split+denoised.lag_window_length:],denoised.neural_net_predictions_cumprod,'-',label='nn cumprod - denoised')

ax.plot(normal.time_series_dates[normal.training_split+normal.lag_window_length:],normal.linear_reg_predictions_cumprod,'--',label='linear reg cumprod')
# ax.plot(normal.time_series_dates[normal.training_split+normal.lag_window_length:],normal.svm_predictions_cumprod,'--',label='svm cumprod')
ax.plot(normal.time_series_dates[normal.training_split+normal.lag_window_length:],normal.neural_net_predictions_cumprod,'--',label='nn cumprod')

ax.set_xticks([normal.time_series_dates[x] for x in range(normal.training_split,len(normal.time_series_dates),28)])
ax.tick_params(rotation=30)
ax.legend()
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## 6.4 transform return predictions back to price data

When testing denoising methods, we need to compare against predictions without denoising. But once you denoise the original signal, you cant compare the RMSE metric of the denoised results to that of the normal (without denoising) prediction method because these metrics are computed against different based y_true values. So: 


- 1) transform price to returns
- 2) predict returns with and without denoising
- 3) convert returns to price and compute rmse, with and without denosing

In [115]:
# define some new dataframes to hold all data
df_normal_results = df_normal.iloc[4521:,:]
df_denoised_results = df_denoised.iloc[4521:,:]

# no denoising
df_normal_results['linear_reg_prices'] = df_normal['Open'][4521] * normal.linear_reg_predictions_cumprod
df_normal_results['svm_reg_prices'] = df_normal['Open'][4521] * normal.svm_predictions_cumprod
df_normal_results['nn_reg_prices'] = df_normal['Open'][4521] * normal.neural_net_predictions_cumprod

# with denoising
df_denoised_results['linear_reg_prices'] = df_denoised['Open'][4521] * denoised.linear_reg_predictions_cumprod
df_denoised_results['svm_reg_prices'] = df_denoised['Open'][4521] * denoised.svm_predictions_cumprod
df_denoised_results['nn_reg_prices'] = df_denoised['Open'][4521] * denoised.neural_net_predictions_cumprod

# plot results
plt.figure(figsize=(10,4))
plt.plot(df_normal_results['Dates'],df_normal_results['Open'],label='Real open data')

plt.plot(df_normal_results['Dates'],df_normal_results['linear_reg_prices'],label='linear normal')
# plt.plot(df_normal_results['Dates'],df_normal_results['svm_reg_prices'],label='svm normal')
plt.plot(df_normal_results['Dates'],df_normal_results['nn_reg_prices'],label='nn normal')

plt.plot(df_normal_results['Dates'],df_denoised_results['linear_reg_prices'],'--',label='linear denoised')
# plt.plot(df_normal_results['Dates'],df_denoised_results['svm_reg_prices'],label='svm denoised')
plt.plot(df_normal_results['Dates'],df_denoised_results['nn_reg_prices'],'--',label='nn denoised')


plt.xticks([df_normal_results['Dates'].iloc[x] for x in range(0,len(df_normal_results['Dates'][:]),28)])
plt.tick_params(rotation=30)
plt.legend()
plt.tight_layout()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_normal_results['linear_reg_prices'] = df_normal['Open'][4521] * normal.linear_reg_predictions_cumprod
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_normal_results['svm_reg_prices'] = df_normal['Open'][4521] * normal.svm_predictions_cumprod
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [116]:
# compute evaluation metrics: look at RMSE between cummulative gains of real data vs predictions with and without denoising

# data
y_true = df_normal_results['Open']

# no denoising
y_pred_1 = df_normal_results['linear_reg_prices']
y_pred_2 = df_normal_results['svm_reg_prices']
y_pred_3 = df_normal_results['nn_reg_prices']

# with denoising
y_pred_4 = df_denoised_results['linear_reg_prices']
y_pred_5 = df_denoised_results['svm_reg_prices']
y_pred_6 = df_denoised_results['nn_reg_prices']

# metrics

rmse_linear_normal = mean_squared_error(y_true,y_pred_1)
rmse_svm_normal = mean_squared_error(y_true,y_pred_2)
rmse_ann_normal = mean_squared_error(y_true,y_pred_3)

rmse_linear_denoised = mean_squared_error(y_true,y_pred_4)
rmse_svn_denoised = mean_squared_error(y_true,y_pred_5)
rmse_ann_denoised = mean_squared_error(y_true,y_pred_6)

# print metrics
print('Linear normal - RMSE cumulatic gains:\t',rmse_linear_normal**0.5)
print('SVM normal - RMSE cumulatic gains:\t',rmse_svm_normal**0.5)
print('ANN normal - RMSE cumulatic gains:\t',rmse_ann_normal**0.5)

print('Linear denoised - RMSE cumulatic gains:\t',rmse_linear_denoised**0.5)
print('SVM denoised - RMSE cumulatic gains:\t',rmse_svn_denoised**0.5)
print('ANN denoised - RMSE cumulatic gains:\t',rmse_ann_denoised**0.5)



Linear normal - RMSE cumulatic gains:	 273.7957156081383
SVM normal - RMSE cumulatic gains:	 73853.8022660274
ANN normal - RMSE cumulatic gains:	 166.67343763907598
Linear denoised - RMSE cumulatic gains:	 178.58542289582076
SVM denoised - RMSE cumulatic gains:	 809173.9539076653
ANN denoised - RMSE cumulatic gains:	 306.7646744121744


Linear normal - RMSE cumulatic gains:	 103.78796952459727
SVM normal - RMSE cumulatic gains:	 2119.0777666368795
ANN normal - RMSE cumulatic gains:	 229.97477972358672
Linear denoised - RMSE cumulatic gains:	 49.60068080046763
SVM denoised - RMSE cumulatic gains:	 14452.351946378596
ANN denoised - RMSE cumulatic gains:	 115.447881267899

Summary of takeaways from denoising using signal processing techniques:
- Question, should you denoise before are after computing returns? 
    - Fourier transform needs to be computed on stationary signal(the sinusoids continue through infinity ie stationary), therefore you must do returns first?
    - Wavelet transform can be computed for non-stationary signals - see nice denosing of s&p 500 open prices

- How do we compare forecasting results for different denosing results?
    - RMSE or MAE against the denoised signals means we are comparing the forecasting results of fourier and wavelet denoised against different signals?
    - If we look at cumulative returns over testing dataset, then do we compare against the original cummulative returns?

- Some hyperparameters for wavelet transform:
    - type of wavelet, should be chosen based on data, all papers I've read have used the haar wavelet. Sym look better in my results.
    - Once the dwt transform is applied then a thresholding approach can be applied to set low coefficients to zero. Then iDWT taken to retrieve denoised signal. This threshold value and type of thresholding are another hyperparameter. 
    - The level of decomposition is also a hyperparameter. 

- Some hyperparameters for fourier transform:
    - thresholding value of different frequencies.

# some random extras

In [28]:
import pywt

# single level wavelet denoising
data = sp_500['Volume'][-2000:]/1e9
plt.figure(figsize=(15,5))
data.plot()

x = np.array(data)                
(ca, cd) = pywt.dwt(x, "sym20")                
cat = pywt.threshold(ca, 0.5, mode="hard")                
cdt = pywt.threshold(cd, 0.5, mode="soft")                
tx = pywt.idwt(cat, cdt, "sym20")

plt.plot(sp_500['Volume'][-2000:].index,tx,'-.')
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [29]:
pywt.wavelist(kind='discrete')

['bior1.1',
 'bior1.3',
 'bior1.5',
 'bior2.2',
 'bior2.4',
 'bior2.6',
 'bior2.8',
 'bior3.1',
 'bior3.3',
 'bior3.5',
 'bior3.7',
 'bior3.9',
 'bior4.4',
 'bior5.5',
 'bior6.8',
 'coif1',
 'coif2',
 'coif3',
 'coif4',
 'coif5',
 'coif6',
 'coif7',
 'coif8',
 'coif9',
 'coif10',
 'coif11',
 'coif12',
 'coif13',
 'coif14',
 'coif15',
 'coif16',
 'coif17',
 'db1',
 'db2',
 'db3',
 'db4',
 'db5',
 'db6',
 'db7',
 'db8',
 'db9',
 'db10',
 'db11',
 'db12',
 'db13',
 'db14',
 'db15',
 'db16',
 'db17',
 'db18',
 'db19',
 'db20',
 'db21',
 'db22',
 'db23',
 'db24',
 'db25',
 'db26',
 'db27',
 'db28',
 'db29',
 'db30',
 'db31',
 'db32',
 'db33',
 'db34',
 'db35',
 'db36',
 'db37',
 'db38',
 'dmey',
 'haar',
 'rbio1.1',
 'rbio1.3',
 'rbio1.5',
 'rbio2.2',
 'rbio2.4',
 'rbio2.6',
 'rbio2.8',
 'rbio3.1',
 'rbio3.3',
 'rbio3.5',
 'rbio3.7',
 'rbio3.9',
 'rbio4.4',
 'rbio5.5',
 'rbio6.8',
 'sym2',
 'sym3',
 'sym4',
 'sym5',
 'sym6',
 'sym7',
 'sym8',
 'sym9',
 'sym10',
 'sym11',
 'sym12',
 'sym13',