# Initial time series forecasting: S&P 500

- simple one step ahead forecasting using a sliding window approach
- tested linear reg, naive, svm and nn models. 
- No hpyerparameter tuning for svm or nn yet. 

In [14]:
# import some shit
%matplotlib widget
# %matplotlib notebook 

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# model evalution metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# data preprocessing
from sklearn.preprocessing import normalize

# %matplotlib inline
# import some data
sp_500 = pd.read_csv('../test_data/GSPC.csv')
sp_500

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1950-01-03,16.660000,16.660000,16.660000,16.660000,16.660000,1260000
1,1950-01-04,16.850000,16.850000,16.850000,16.850000,16.850000,1890000
2,1950-01-05,16.930000,16.930000,16.930000,16.930000,16.930000,2550000
3,1950-01-06,16.980000,16.980000,16.980000,16.980000,16.980000,2010000
4,1950-01-09,17.080000,17.080000,17.080000,17.080000,17.080000,2520000
...,...,...,...,...,...,...,...
17213,2018-05-31,2720.979980,2722.500000,2700.679932,2705.270020,2705.270020,4235370000
17214,2018-06-01,2718.699951,2736.929932,2718.699951,2734.620117,2734.620117,3684130000
17215,2018-06-04,2741.669922,2749.159912,2740.540039,2746.870117,2746.870117,3376510000
17216,2018-06-05,2748.459961,2752.610107,2739.510010,2748.800049,2748.800049,3517790000


In [15]:
# plot the data
sp_500.plot(x='Date',figsize=(10,10),subplots=True)
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# Calculate returns on open price as data for prediction tasks

In [16]:
# calculate returns on 
returns = sp_500[['Date','Open']]
returns['Shift'] = sp_500['Open'].shift(periods=-1) # use pandas shift method to create shift daily open price one time period forward
returns['returns'] = returns['Shift'] - returns['Open'] # calculates the 24hr return
returns.drop(labels=returns.index[-1],axis=0,inplace=True) # need to delete last row as the shift value is nan and therefore no return could be calculated
display(returns)

fig, ax = plt.subplots(figsize=(12,5))
returns.plot(x='Date',y='returns',ax=ax)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  returns['Shift'] = sp_500['Open'].shift(periods=-1) # use pandas shift method to create shift daily open price one time period forward
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  returns['returns'] = returns['Shift'] - returns['Open'] # calculates the 24hr return
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,Date,Open,Shift,returns
0,1950-01-03,16.660000,16.850000,0.190000
1,1950-01-04,16.850000,16.930000,0.080000
2,1950-01-05,16.930000,16.980000,0.050000
3,1950-01-06,16.980000,17.080000,0.100000
4,1950-01-09,17.080000,17.030001,-0.049999
...,...,...,...,...
17212,2018-05-30,2702.429932,2720.979980,18.550048
17213,2018-05-31,2720.979980,2718.699951,-2.280029
17214,2018-06-01,2718.699951,2741.669922,22.969971
17215,2018-06-04,2741.669922,2748.459961,6.790039


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<AxesSubplot:xlabel='Date'>

# 0. Useful functions

In [17]:
# write function to perform sliding window over data: transform time series in supervised learning problem

def slide_window(data, slide_step_size): # column of data, integer
    # initialize input array
    num_rows = len(data) - slide_step_size
    array = np.zeros((num_rows, slide_step_size + 1))
    
    # loop through data and populate array
    for i in range(num_rows):
        # input features
        array[i,0:slide_step_size+1] = data[i:i+slide_step_size+1]
        # target feature
        array[i,-1] = data[i+slide_step_size]
        # show pattern
        print(array[i,0:slide_step_size],' : ',array[i,slide_step_size])
    return array 

dummy_data = [1,2,3,4,5,6,7,8,9,10]
dummy_result = slide_window(dummy_data,5)  

[1. 2. 3. 4. 5.]  :  6.0
[2. 3. 4. 5. 6.]  :  7.0
[3. 4. 5. 6. 7.]  :  8.0
[4. 5. 6. 7. 8.]  :  9.0
[5. 6. 7. 8. 9.]  :  10.0


# 1. Data preporation: 

Re-framing time series data as supervised machine learning problem using a sliding window approach.

In [18]:
# inputs to select and prepare data
column = 'returns' # which stock price feature to use for prediction
window_length = 5 # how many previous time steps is used to set up supvervised learning problem

In [19]:
# testing training data split
# training_data = slide_window(list(sp_500[column][-2000:-500]),window_length)
# print('\n*************************************************************************************************\n')
# test_data = slide_window(list(sp_500[column][-500:]),window_length)

# rather use returns data
training_data = slide_window(list(returns[column][-2000:-500]),window_length)
print('\n*************************************************************************************************\n')
test_data = slide_window(list(returns[column][-500:]),window_length)

# fourier filtered results
inverse_transform_filtered_real = np.real(inverse_transform_filtered) # only take real part, this comes from taking the inverse fourier transform after filtering out low psd frequencies

training_data = slide_window(inverse_transform_filtered_real[-2000:-500],window_length)
print('\n*************************************************************************************************\n')
test_data = slide_window(inverse_transform_filtered_real[-500:],window_length)

15.839844   9.830078   4.369873]  :  8.130126999999902
[16.680176 15.839844  9.830078  4.369873  8.130127]  :  4.959961000000021
[15.839844  9.830078  4.369873  8.130127  4.959961]  :  -2.8300779999999577
[ 9.830078  4.369873  8.130127  4.959961 -2.830078]  :  3.9599609999995664
[ 4.369873  8.130127  4.959961 -2.830078  3.959961]  :  0.020019000000502274
[ 8.130127  4.959961 -2.830078  3.959961  0.020019]  :  15.630126999999902
[ 4.959961 -2.830078  3.959961  0.020019 15.630127]  :  6.5500489999999445
[-2.830078  3.959961  0.020019 15.630127  6.550049]  :  -5.910155999999915
[ 3.959961  0.020019 15.630127  6.550049 -5.910156]  :  0.11010699999997087
[ 0.020019 15.630127  6.550049 -5.910156  0.110107]  :  2.239990999999918
[15.630127  6.550049 -5.910156  0.110107  2.239991]  :  2.119873000000098
[ 6.550049 -5.910156  0.110107  2.239991  2.119873]  :  -10.51001000000042
[ -5.910156   0.110107   2.239991   2.119873 -10.51001 ]  :  8.270019000000502
[  0.110107   2.239991   2.119873 -10.51

NameError: name 'inverse_transform_filtered' is not defined

In [10]:
training_data.shape

(1495, 6)

In [11]:
test_data.shape

(495, 6)

In [12]:
# normalize training data
training_data = normalize(training_data)
test_data = normalize(test_data)

# 2. Visualize data

In [16]:
# plot training and testing data
fig, ax = plt.subplots(figsize=(10,5))

returns[column][-2000:-500].plot(ax=ax,style='k-',label='Training data') # replace returns with sp_500 for other data plotting
returns[column][-500:].plot(ax=ax,style='r-',label='test data')
plt.plot(returns[column][-500+window_length:].index,test_data[:,-1],'o',label='test data') # important to match time by start 5 (length of time window) after where segmented our testing and training data
plt.legend(loc=0)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.legend.Legend at 0x245c710b070>

# 3. Predictive modelling

Train three models:
- baseline model: naive model which says tomorrow's price is today's price.
- multivariate regression
- support vector regression

## 3.1 Linear regression

In [22]:
# Train simple linear regression model and lets see what we get
from sklearn.linear_model import LinearRegression

# data
X = training_data[:,0:-1]
Y = training_data[:,-1]

# model
reg_model = LinearRegression().fit(X,Y)

# view trained model parameters
reg_model.coef_

array([-0.0812423 , -0.02592546, -0.04534536,  0.01252938,  0.01347756])

#### 3.1.1 Look at linear regression coefficients

In [23]:
for i in range(len(reg_model.coef_)):
    print(reg_model.coef_[i])

-0.08124230356015348
-0.02592545715684523
-0.04534536491860347
0.012529375658548267
0.013477564837446668


For non-stationary data, eg only using Open price:
- The weights of the previous time step is extremely strong / large, relative to previous time steps / features. ie this model is saying the best predictor for tomorrow is today. Essentially, predict tomorrow's price as todays price.

For stationary data, eg using returns:
- all weights are more similar in magnitude. This means that the model is not simply relying on X_t-1 to predict X_t

#### 3.1.2 Test model on test data, ie last 500 days

In [24]:
df_test_data = pd.DataFrame(data=test_data)
df_test_data

Unnamed: 0,0,1,2,3,4,5
0,-17.820068,-15.100098,0.950196,-11.239991,11.839844,-2.619873
1,-15.100098,0.950196,-11.239991,11.839844,-2.619873,9.609863
2,0.950196,-11.239991,11.839844,-2.619873,9.609863,4.560059
3,-11.239991,11.839844,-2.619873,9.609863,4.560059,3.050049
4,11.839844,-2.619873,9.609863,4.560059,3.050049,11.010010
...,...,...,...,...,...,...
490,-24.360108,16.959961,-7.339843,-18.489991,-2.680175,18.550048
491,16.959961,-7.339843,-18.489991,-2.680175,18.550048,-2.280029
492,-7.339843,-18.489991,-2.680175,18.550048,-2.280029,22.969971
493,-18.489991,-2.680175,18.550048,-2.280029,22.969971,6.790039


In [25]:
# use trained model to predict

# loop through each test data pattern and predict result
predicted_results = []
print('Prediction\tReal values')
for i in range(500-window_length):
    X_test = test_data[i,0:-1]
    prediction = reg_model.predict(X_test.reshape(1,-1))
    predicted_results.append(prediction)
    print(prediction,'\t',test_data[i,-1])
    
# full prediction
predictions = reg_model.predict(test_data[:,0:-1])

Prediction	Real values
[2.62498472] 	 -2.619873000000098
[2.63495951] 	 9.609863000000132
[0.58412512] 	 4.56005899999991
[1.71698176] 	 3.0500489999999445
[-0.42138642] 	 11.010009999999966
[0.75363941] 	 -72.36010799999985
[-1.06443491] 	 -24.779906999999866
[-1.37928712] 	 36.0198969999999
[3.73305588] 	 30.479980999999952
[3.77735616] 	 26.17016599999988
[6.43251134] 	 -4.290038999999979
[0.77740432] 	 -10.620116999999937
[-4.29001821] 	 15.989990000000034
[-2.06765212] 	 6.5500489999999445
[-0.43459777] 	 24.75
[1.12453905] 	 7.780029000000013
[1.37630769] 	 14.31005899999991
[-1.49072157] 	 4.069823999999699
[-0.48232777] 	 7.25
[-1.90252878] 	 -3.08984399999963
[-0.32830636] 	 1.75
[-0.80186699] 	 2.31005899999991
[0.48467823] 	 6.809814000000188
[0.34257703] 	 -6.43994100000009
[0.90954321] 	 7.239990000000034
[0.31614132] 	 -4.739990000000034
[0.76473764] 	 0.8400879999999233
[0.04745587] 	 -3.7600099999999657
[1.32039136] 	 2.780029000000013
[0.29706548] 	 4.319824000000153
[

### 3.1.3 Model evaluation

In [26]:
# use sklearn metric methods to calc rmse and mae
mse = mean_squared_error(test_data[:,-1],predictions)
mae = mean_absolute_error(test_data[:,-1],predictions)

print('RMSE: ',np.sqrt(mse))
print('MAE: ',mae)

RMSE:  16.86634481747447
MAE:  10.622687756850034


Results without filtering:
- RMSE:  0.40624498456287134
- MAE:  0.33153581191446285

Results with filtering:
- RMSE:  0.3814258615492118
- MAE:  0.3079928467008209

## 3.2 Naive model

In [27]:
#  a naive time series predictive model predicts the next results as the current ie x[i+1] = x[i]
# ie this model will return n-1 values for n time stamps
def naive_model(data):
    preds = np.zeros(len(data))
    preds[1:] = data[0:-1]
    return  preds

### 3.2.1 predict and evaluate

In [28]:
# call naive model function
naive_predictions = naive_model(test_data[:,-1])

# evaluate predictions
mse = mean_squared_error(test_data[1:,-1],naive_predictions[1:])
mae = mean_absolute_error(test_data[1:,-1],naive_predictions[1:])

print('RMSE: ',np.sqrt(mse))
print('MAE: ',mae)

RMSE:  23.712971341110972
MAE:  14.81026027530362


## 3.3 Support vector machine

In [29]:
from sklearn.svm import LinearSVR

# train model
svm_regres = LinearSVR(max_iter=1000).fit(training_data[:,0:-1],training_data[:,-1])

# predict
svm_predictions = svm_regres.predict(test_data[:,0:-1])

# evaluate
mse = mean_squared_error(test_data[:,-1],svm_predictions[:])
mae = mean_absolute_error(test_data[:,-1],svm_predictions[:])

print('RMSE: ',np.sqrt(mse))
print('MAE: ',mae)

RMSE:  16.782414569385743
MAE:  10.558048698419523


Results without filtering:
- RMSE:  0.4054163757214758
- MAE:  0.3283511990590936

Results with filtering:
- RMSE:  0.3816363957338525
- MAE:  0.30785723389532904

## 3.4 Multilayer perceptron neural network

In [30]:
from sklearn.neural_network import MLPRegressor

# train neural network
nn_regres = MLPRegressor(hidden_layer_sizes=(100,100,100),shuffle=False,random_state=1, 
                         max_iter=1000,verbose=1).fit(training_data[:,0:-1],training_data[:,-1])

# make predictions
nn_predictions = nn_regres.predict(test_data[:,0:-1])

# evaluate
mse = mean_squared_error(test_data[:,-1],nn_predictions[:])
mae = mean_absolute_error(test_data[:,-1],nn_predictions[:])

print('RMSE: ',np.sqrt(mse))
print('MAE: ',mae)

Iteration 1, loss = 103.32976534
Iteration 2, loss = 99.75439713
Iteration 3, loss = 98.16790533
Iteration 4, loss = 97.14273911
Iteration 5, loss = 96.17799503
Iteration 6, loss = 95.26168915
Iteration 7, loss = 94.35631261
Iteration 8, loss = 93.49320139
Iteration 9, loss = 92.64089224
Iteration 10, loss = 91.79803645
Iteration 11, loss = 90.98854293
Iteration 12, loss = 90.09149357
Iteration 13, loss = 89.22519723
Iteration 14, loss = 88.36181790
Iteration 15, loss = 87.42986131
Iteration 16, loss = 86.51468218
Iteration 17, loss = 85.55915134
Iteration 18, loss = 84.56239894
Iteration 19, loss = 83.61265671
Iteration 20, loss = 82.60543071
Iteration 21, loss = 81.63646444
Iteration 22, loss = 80.61610209
Iteration 23, loss = 79.65987101
Iteration 24, loss = 78.65449402
Iteration 25, loss = 77.59611495
Iteration 26, loss = 76.62123003
Iteration 27, loss = 75.60010314
Iteration 28, loss = 74.55825619
Iteration 29, loss = 73.63472442
Iteration 30, loss = 72.57114777
Iteration 31, loss

Results without filtering:
- RMSE:  0.5356554382964639
- MAE:  0.4010986581137097

Results with filtering:
- RMSE:  0.47539037753112584
- MAE:  0.35201690512468486

# 4. Plot results of different models

In [31]:
# this function computes the error between two vectors / arrays, give the one is a vector of predicted values and the other is of real values
def error(real_data,predicted_data):
    error = np.zeros(len(real_data))
    error = (real_data - predicted_data) / real_data
    return error

In [32]:
# plot prediction against actual + training data
fig, ax = plt.subplots(2,1,figsize=(9,10),sharex=True)

# test and real y values data
# returns[column][-500:].plot(ax=ax[0],style='o-',linewidth=3,label='real values',markersize=5) # for plotting unormalized values
ax[0].plot(returns[column][-500+window_length:].index,test_data[:,-1],'o-',linewidth=3,label='real values',markersize=5) # plotting normalized training data
# predict y values
ax[0].plot(returns[column][-500+window_length:].index,predicted_results[:],'o-',label='linear regression prediction',markersize=5)
ax[0].plot(returns[column][-500+window_length+1:].index,naive_predictions[1:],'.--',label='naive prediction',markersize=5)
ax[0].plot(returns[column][-500+window_length:].index,svm_predictions[:],'.--',label='svm prediction',markersize=5)
ax[0].plot(returns[column][-500+window_length:].index,nn_predictions[:],'.--',label='nn prediction',markersize=5)

ax[0].legend()
ax[0].set_title('Real values vs model predictions')

# plot error plot
error_linreg = error(np.array(test_data[:,-1]),predictions)
error_naive = error(np.array(test_data[:,-1]),naive_predictions)
error_svm = error(np.array(test_data[:,-1]),svm_predictions)
error_nn = error(np.array(test_data[:,-1]),nn_predictions)

ax[1].plot(returns[column][-500+window_length:].index,error_linreg,'r-',label='linear reg error')
ax[1].plot(returns[column][-500+window_length+1:].index,error_naive[1:],'-',label='naive error')
ax[1].plot(returns[column][-500+window_length:].index,error_svm[:],'-',label='svm error')
ax[1].plot(returns[column][-500+window_length:].index,error_nn[:],'-',label='nn error')
ax[1].set_title('Error signal for predictive models')
ax[1].set_xlabel('Days since 1950 for s&p500')
ax[1].legend()
ax[1].set_ylim([-10,10])


# titles and save figures
title_string = 'S&P500 predictions _ y is '+str(column)+'_ window len is '+ str(window_length)
fig.suptitle(title_string)
plt.tight_layout()
fig_name = '../results/univariate_single_step_ahead/'+title_string+'.png'
plt.savefig(fig_name,facecolor='w')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

  error = (real_data - predicted_data) / real_data


## 4.1 Discussion of results

- from the graph above we can see that our suspiciouns have been confirmed: the linear regression is essentially saying the next value is the previous. This can be seen as the predicted results is almost just the actual results shift forward.
- another validation of the above hypothesis is the fact that the naive model performs near identically to linear regression
- if you zoom in you can see all models seem to favour yesterday's result as the best for today. 
- as you expand and contract the slide window, the linear reg and svm models perform very similary. The neural network however smoothens the results a lot more the longer the sliding window.

# Denoising data using FFT

In [33]:
# import scipy fft functions
from scipy.fft import fft, ifft, fftfreq

In [34]:
# dataframe to np array
length_of_time = -2000
signal = np.array(returns['returns'][length_of_time:])
signal

array([  2.400024,  -6.400024, -30.539917, ...,  22.969971,   6.790039,
         4.790039])

In [35]:
# plot original signal
fig,ax = plt.subplots(figsize=(12,5))
ax.plot(returns['returns'][length_of_time:].index,signal,'-')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[<matplotlib.lines.Line2D at 0x21409a63400>]

In [36]:
# apply discrete fourier transform
fft_coefficients = fft(signal)
fft_coefficients

array([1678.150024    -0.j        ,  145.60226937 +12.50087687j,
        183.89946634+709.8176786j , ..., -105.58107383 +34.61480548j,
        183.89946634-709.8176786j ,  145.60226937 -12.50087687j])

In [37]:
# plot amplitude vs frequency 
n = len(signal)

# get frequencies and psd
freqs = fftfreq(signal.shape[0])
psd = np.abs(fft_coefficients)/n # psd is amplitude/N

# plot psd
fig,ax = plt.subplots()
ax.plot(freqs[0:int(n/2)],psd[0:int(n/2)])
ax.set_ylabel('Power spectrum')
ax.set_xlabel('Frequencies')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0.5, 0, 'Frequencies')

- Volume data: we can see there is a low frequency component that is very large and many low amplitude frequency components.
- Returns data: looks very much like a random process

In [38]:
sp_500['Volume'][length_of_time:]

15218    3896410000
15219    6136700000
15220    5067080000
15221    6435770000
15222    3968500000
            ...    
17213    4235370000
17214    3684130000
17215    3376510000
17216    3517790000
17217    3651640000
Name: Volume, Length: 2000, dtype: int64

In [39]:
# plot inverse fourier transform as sanity check
inverse_fft = ifft(fft_coefficients)
fig,ax = plt.subplots(figsize=(10,5))
ax.plot(returns['returns'][length_of_time:].index,inverse_fft,'-',label='Inverse fourier')
ax.plot(returns['returns'][length_of_time:].index,returns['returns'][length_of_time:],'.',label='Real data')
ax.legend()
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

  return array(a, dtype, copy=False, order=order)


## Denoise by removing frequencies with low amplitude

In [40]:
# try denoise data
psd_indices = psd > 0.6 # mask
fft_filtered = fft_coefficients*psd_indices

inverse_transform_filtered = ifft(fft_filtered)

# plot this
fig,ax = plt.subplots(figsize=(12,5))
ax.plot(returns['returns'][length_of_time:].index,returns['returns'][length_of_time:],'-',label='Real data')
ax.plot(returns['returns'][length_of_time:].index,inverse_transform_filtered,'-',label='Inverse fourier filtered')
ax.legend()
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

  return array(a, dtype, copy=False, order=order)


"The Fourier transform is only able to char-
acterize truly periodic and stationary signals, as time is stripped out via the
integration in (2.18a). For a signal with non-stationary frequency content, such
as a musical composition, it is important to simultaneously characterize the
frequency content and its evolution in time." - Steve Burton text book

In [41]:
# see which frequencies are left over
blah = freqs*psd_indices

for i in range(len(blah)):
    if np.abs(blah[i]) > 0:
        print(blah[i])

0.0275
0.052000000000000005
0.07200000000000001
0.0935
0.10350000000000001
0.1075
0.11800000000000001
0.121
0.132
0.146
0.1555
0.1695
0.1715
0.17550000000000002
0.176
0.1845
0.1875
0.1925
0.2085
0.2565
0.275
0.28250000000000003
0.2915
0.2985
0.3005
0.303
0.309
0.314
0.3325
0.3355
0.3375
0.34500000000000003
0.363
0.3795
0.386
0.4045
0.41600000000000004
0.4225
0.445
-0.445
-0.4225
-0.41600000000000004
-0.4045
-0.386
-0.3795
-0.363
-0.34500000000000003
-0.3375
-0.3355
-0.3325
-0.314
-0.309
-0.303
-0.3005
-0.2985
-0.2915
-0.28250000000000003
-0.275
-0.2565
-0.2085
-0.1925
-0.1875
-0.1845
-0.176
-0.17550000000000002
-0.1715
-0.1695
-0.1555
-0.146
-0.132
-0.121
-0.11800000000000001
-0.1075
-0.10350000000000001
-0.0935
-0.07200000000000001
-0.052000000000000005
-0.0275


# Spectogram

In [42]:
from scipy import signal

f, t, Sxx = signal.spectrogram(sp_500['Volume'][-2000:], fs=1,noverlap=49,nperseg=50)

fig,ax = plt.subplots()
ax.pcolormesh(t, f, Sxx, shading='gouraud')
plt.ylabel('Frequency [Hz]')
plt.xlabel('Time [days]')
plt.ylim([0,0.1])
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [43]:
Sxx

array([[5.63692933e+16, 1.32385057e+17, 4.26608158e+16, ...,
        6.73891146e+16, 4.24321582e+16, 4.41197199e+16],
       [3.93966393e+18, 3.72354054e+18, 3.66739112e+18, ...,
        8.57639685e+16, 1.08110694e+17, 1.21823104e+17],
       [1.55935367e+16, 7.92197875e+16, 3.17023750e+17, ...,
        1.50629460e+18, 1.49136756e+18, 1.53511289e+18],
       ...,
       [1.07690148e+17, 1.70599939e+17, 2.31434011e+17, ...,
        9.53720439e+16, 1.18116493e+17, 1.33473627e+17],
       [4.12520827e+17, 2.54366254e+17, 1.73914317e+17, ...,
        3.25827455e+16, 2.49922579e+16, 2.07734163e+16],
       [6.72550316e+16, 2.19158849e+16, 5.25357651e+15, ...,
        2.62991654e+17, 2.73245010e+17, 2.73310669e+17]])

# Multi resolution analysis: wavelet transfroms

In [44]:
import pywt

In [45]:
data = sp_500['Volume'][-2000:]/1e9
index = sp_500['Volume'][-2000:].index

# Create wavelet object and define parameters
w = pywt.Wavelet('sym5')
maxlev = pywt.dwt_max_level(len(data), w.dec_len) # max level of decomposition ie scale, is determined by length of signal and discrete wavelet
# maxlev = 2 # Override if desired
print("maximum level is " + str(maxlev))
threshold = 0.25 # Threshold for filtering

# Decompose into wavelet components, to the level selected:
coeffs = pywt.wavedec(data, 'sym5', level=maxlev)

#cA = pywt.threshold(cA, threshold*max(cA))
plt.figure(figsize=(10,15))
for i in range(1, len(coeffs)):
    plt.subplot(maxlev, 1, i)
    plt.plot(coeffs[i],label='Wavelet convolution result')
    coeffs[i] = pywt.threshold(coeffs[i], threshold*max(coeffs[i]))
    plt.plot(coeffs[i],label='Wavelet convolution after threshold applied')
    plt.legend()
    plt.title('Scale '+str(len(coeffs)-i))

# inverse transfer: filtered wavelet decomposition back to time-domain signal 
datarec = pywt.waverec(coeffs, 'sym5')

plt.figure()
plt.subplot(2, 1, 1)
plt.plot(index[:], data[:])
plt.title("Raw signal")
plt.subplot(2, 1, 2)
plt.plot(index[:], datarec[:])
plt.title("De-noised signal using wavelet techniques")

plt.tight_layout()
plt.show()

maximum level is 7


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [46]:
coeffs[6]

array([ 0.85932648, -0.75737687,  0.76355347, -0.34913312, -0.16279403,
       -0.76114776,  0.        ,  0.        ,  0.        ,  0.        ,
       -0.        ,  0.        ,  0.12235676,  0.26248507, -0.06482176,
       -0.        ,  0.        , -0.        ,  0.        , -0.        ,
       -0.        , -0.63698799,  0.        ,  0.        ,  0.38769524,
       -0.56960617,  0.        , -0.10405331,  0.        ,  0.07282566,
        0.89455122,  0.16268321, -0.        ,  0.42022552, -0.32984069,
       -0.13124772,  0.212767  , -0.        ,  0.        , -0.        ,
        0.        ,  0.        , -0.04108364,  0.43650019, -0.25840387,
        1.35908357,  0.78448203,  0.        ,  0.72527156, -0.        ,
        0.        , -0.        ,  0.        ,  0.        ,  0.        ,
       -0.46010958, -0.        , -0.        , -0.        ,  0.        ,
        0.        ,  0.        , -0.        , -0.        ,  0.        ,
       -0.        , -0.37279977, -0.        , -0.        ,  0.  