# A class for univariate time series forecasting tasks

In [1]:
# interactive figures
%matplotlib widget 

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# model evaluation metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# data preprocessing
from sklearn.preprocessing import normalize

# predictive models
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.neural_network import MLPRegressor


# 1.0 Class for univariate one-step ahead forecasting 

In [37]:
class time_series_prediction():
    """Frame a 1-D time series as a supervised-learning problem and compare forecasters.

    Intended call order:

    1. ``sliding_window_1()``  - build lagged inputs and one-step-ahead targets
    2. ``train_test_split()``  - chronological split of the windowed rows
    3. ``linear_regression()`` / ``support_vector_machine()`` / ``neural_net_mlp()``
    4. ``vis_results_time_series()`` / ``vis_results_scatter()``

    Parameters
    ----------
    dates : array-like
        Time stamp / date for each data point (same length as the series).
    one_d_time_series : array-like
        The values to forecast.
    lag_window_length : int
        Number of past values used as input features for each target.
    n_ahead_prediction : int
        Stored on the instance only; every method in this class performs
        one-step-ahead forecasting and does not read it.
    """

    # (attribute holding the predictions, label on the time series plot,
    #  line style on the time series plot, label on the cumulative product plot)
    _MODELS = (
        ('linear_reg_predictions','linear regression prediction','o-','linear reg cumprod'),
        ('svm_predictions','svm prediction','.--','svm cumprod'),
        ('neural_net_predictions','nn prediction','.--','nn cumprod'),
    )

    def __init__(self,dates,one_d_time_series,lag_window_length,n_ahead_prediction):

        # raw input data + settings for time series -> supervised learning ML problem
        self.one_d_time_series = np.array(one_d_time_series)      # to array so positional indexing works whatever the caller passed (e.g. a pandas Series)
        self.time_series_dates = np.array(dates)                  # time stamp / date for each data point
        self.lag_window_length = lag_window_length                # length of lag window
        self.n_ahead_prediction = n_ahead_prediction              # time ahead to predict (currently unused)

        # transformed data: set after calling .sliding_window_1()
        self.input_data = None
        self.target_data = None

        # testing and training data: set after calling .train_test_split()
        self.training_split = None
        self.X_test = None
        self.X_train = None
        self.y_test = None
        self.y_train = None

        # predictions from various models
        self.linear_reg_predictions = None
        self.svm_predictions = None
        self.neural_net_predictions = None
        self.naive_predictions = None

    # ****************************************************************************************************************
    # internal helpers
    # ****************************************************************************************************************

    def _require_split(self):
        """Raise a clear error if train_test_split() has not been called yet."""
        if self.training_split is None:
            raise RuntimeError('Call train_test_split() before using this method.')

    def _test_dates(self):
        """Return the dates that line up with ``y_test``.

        Windowed row ``i`` targets raw index ``i + lag_window_length``, so the
        first test target sits ``lag_window_length`` points after the split.
        """
        self._require_split()
        return self.time_series_dates[self.training_split+self.lag_window_length:]

    def _fitted_models(self):
        """Return ``_MODELS`` entries (plus their predictions) for every model that has been run."""
        fitted = [entry + (getattr(self,entry[0]),)
                  for entry in self._MODELS
                  if getattr(self,entry[0]) is not None]
        if not fitted:
            raise RuntimeError('No predictions available: run at least one model first.')
        return fitted

    def _report_errors(self,predictions):
        """Print RMSE and MAE of ``predictions`` against ``y_test``."""
        mse = mean_squared_error(self.y_test,predictions)
        mae = mean_absolute_error(self.y_test,predictions)

        print('RMSE: ',np.sqrt(mse))
        print('MAE: ',mae)

    # ****************************************************************************************************************
    # data wrangling
    # ****************************************************************************************************************

    def sliding_window_1(self,verbose=0):
        """Transform the 1-D series into a one-step-ahead supervised-learning dataset.

        Row ``i`` of ``input_data`` is ``series[i : i + lag_window_length]`` and
        ``target_data[i]`` is ``series[i + lag_window_length]``.

        Parameters
        ----------
        verbose : int, default 0
            If 1, print every ``inputs : target`` pair.

        Raises
        ------
        ValueError
            If the series is shorter than ``lag_window_length``.
        """
        lag = self.lag_window_length
        num_rows = len(self.one_d_time_series) - lag
        if num_rows < 0:
            raise ValueError('Time series is shorter than the lag window length.')

        # row i indexes series[i : i + lag + 1]: `lag` input values followed by the target
        window_index = np.arange(num_rows)[:,None] + np.arange(lag + 1)
        array = np.asarray(self.one_d_time_series,dtype=float)[window_index]

        if verbose == 1:
            # show pattern
            for row in array:
                print(row[0:lag],' : ',row[lag])

        # save results as a class attribute
        self.input_data = array[:,0:lag]
        self.target_data = array[:,lag]

    def train_test_split(self,split):
        """Split the windowed data chronologically: rows ``[0, split)`` train, the rest test.

        Parameters
        ----------
        split : int
            Number of windowed rows (not raw data points) in the training set.

        Raises
        ------
        RuntimeError
            If ``sliding_window_1()`` has not been called yet.
        """
        if self.input_data is None or self.target_data is None:
            raise RuntimeError('Call sliding_window_1() before train_test_split().')

        self.training_split = split
        self.X_train = self.input_data[0:split,:]
        self.X_test = self.input_data[split:,:]
        self.y_train = self.target_data[0:split]
        self.y_test = self.target_data[split:]

    def test_train_plot(self):
        """Plot the raw series coloured by split, with the windowed test targets overlaid."""
        self._require_split()
        split = self.training_split
        num_days = np.arange(len(self.one_d_time_series)) # integer positions stand in for the x-axis

        fig, ax = plt.subplots(figsize=(10,5))
        ax.plot(num_days[:split],self.one_d_time_series[:split],'k-',label='Training data')
        ax.plot(num_days[split:],self.one_d_time_series[split:],'r-',label='Testing data')
        # test targets start lag_window_length points after the split (see _test_dates)
        ax.plot(num_days[split+self.lag_window_length:],self.y_test,'o',label='Windowed testing data')
        ax.legend(loc=0)

    # ****************************************************************************************************************
    # predictive models
    # ****************************************************************************************************************

    def linear_regression(self):
        """Fit a linear regression on the training split, print its test errors and store the predictions."""
        print('Training multivariate linear regression:')
        # train model
        reg_model = LinearRegression().fit(self.X_train,self.y_train)
        print('\nLinear regression coefficients: \n',reg_model.coef_)

        # test model, evaluate and save predictions
        predictions = reg_model.predict(self.X_test)
        self._report_errors(predictions)
        self.linear_reg_predictions = predictions

    def support_vector_machine(self):
        """Fit a linear support vector regressor, print its test errors and store the predictions."""
        print('\nTraining support vector machine:')
        # train model
        # NOTE(review): no random_state is passed, so results may vary slightly between runs - confirm whether that is intended
        svm_regres = LinearSVR(max_iter=10000).fit(self.X_train,self.y_train)

        # predict, evaluate and save predictions
        svm_predictions = svm_regres.predict(self.X_test)
        self._report_errors(svm_predictions)
        self.svm_predictions = svm_predictions

    def neural_net_mlp(self,verbose=0):
        """Fit a three-hidden-layer MLP regressor, print its test errors and store the predictions.

        Parameters
        ----------
        verbose : int, default 0
            Passed through to ``MLPRegressor(verbose=...)``.
        """
        print('\nTraining neural network: ')
        # train neural network; shuffle=False keeps the samples in chronological order
        nn_regres = MLPRegressor(hidden_layer_sizes=(100,100,100),shuffle=False,random_state=1,
                                max_iter=1000,verbose=verbose).fit(self.X_train,self.y_train)

        # predict, evaluate and save predictions
        nn_predictions = nn_regres.predict(self.X_test)
        self._report_errors(nn_predictions)
        self.neural_net_predictions = nn_predictions

    def naive_model(self):
        """Persistence forecast: the prediction for time ``t`` is the value at ``t-1``.

        ``naive_predictions`` has the same length as the series; its first entry
        is NaN because the first time point has no predecessor.
        """
        preds = np.full(len(self.one_d_time_series),np.nan)
        preds[1:] = self.one_d_time_series[:-1]
        self.naive_predictions = preds

    # ****************************************************************************************************************
    # visualize results
    # ****************************************************************************************************************

    def error(self,real_data,predicted_data):
        """Return the element-wise relative error ``(real - predicted) / real``.

        Entries where ``real_data`` is 0 come out as inf/nan (NumPy division semantics).
        """
        real_data = np.asarray(real_data,dtype=float)
        predicted_data = np.asarray(predicted_data,dtype=float)
        return (real_data - predicted_data) / real_data

    def vis_results_time_series(self,tick_stride=28):
        """Plot test-set values against every fitted model's predictions.

        Top panel: real values and predictions over the test dates.
        Bottom panel: cumulative product of ``(1 + value)`` for the same series;
        this is only meaningful when the input series is percentage returns.

        Parameters
        ----------
        tick_stride : int, default 28
            Show one x-axis tick every ``tick_stride`` test dates.
        """
        test_dates = self._test_dates()
        models = self._fitted_models()

        fig, ax = plt.subplots(2,1,figsize=(15,10),sharex=True)

        # original time series and predicted values
        ax[0].plot(test_dates,self.y_test,'o-',linewidth=3,label='real values',markersize=5)
        for _, ts_label, style, _, preds in models:
            ax[0].plot(test_dates,preds,style,label=ts_label,markersize=5)
        ax[0].legend()
        ax[0].set_title('Real values vs model predictions')

        # cumulative product plots - this should only be done if input data is percentage returns
        ax[1].plot(test_dates,(self.y_test + 1).cumprod(),'-',label='real vals cumprod')
        for _, _, _, cumprod_label, preds in models:
            ax[1].plot(test_dates,(preds + 1).cumprod(),'-',label=cumprod_label)
        ax[1].set_title('Cumulative product of (1 + value)')

        # ticks are taken from the plotted dates only, so every tick is a date that is on the axis
        ax[1].set_xticks(list(test_dates[::tick_stride]))
        ax[1].tick_params(axis='x',rotation=30)
        ax[1].legend()
        plt.tight_layout()

    def vis_results_scatter(self):
        """Scatter real test values against each fitted model's predictions, one panel per model.

        A red identity line (prediction == real value) is drawn on each panel.
        """
        models = self._fitted_models()

        # dataframe holding all results; the column names become the axis labels
        df_predictions = pd.DataFrame({'Real_values': self.y_test},index=self._test_dates())
        for attr, _, _, _, preds in models:
            df_predictions[attr] = preds

        # end points of the identity line, ignoring any non-finite targets
        finite_targets = self.y_test[np.isfinite(self.y_test)]

        fig, axes = plt.subplots(len(models),1,figsize=(7,7*len(models)),squeeze=False)
        for axis, (attr, _, _, _, _) in zip(axes[:,0],models):
            sns.scatterplot(y=df_predictions['Real_values'],x=df_predictions[attr],ax=axis)
            if finite_targets.size:
                low, high = finite_targets.min(), finite_targets.max()
                axis.plot([low,high],[low,high],color='red')

        # plot formatting
        plt.tight_layout()

# 2.0 Import some test data

In [4]:
# read the S&P 500 test data (GSPC.csv) into a DataFrame and display it
sp_500 = pd.read_csv('../test_data/GSPC.csv')
sp_500

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1950-01-03,16.660000,16.660000,16.660000,16.660000,16.660000,1260000
1,1950-01-04,16.850000,16.850000,16.850000,16.850000,16.850000,1890000
2,1950-01-05,16.930000,16.930000,16.930000,16.930000,16.930000,2550000
3,1950-01-06,16.980000,16.980000,16.980000,16.980000,16.980000,2010000
4,1950-01-09,17.080000,17.080000,17.080000,17.080000,17.080000,2520000
...,...,...,...,...,...,...,...
17213,2018-05-31,2720.979980,2722.500000,2700.679932,2705.270020,2705.270020,4235370000
17214,2018-06-01,2718.699951,2736.929932,2718.699951,2734.620117,2734.620117,3684130000
17215,2018-06-04,2741.669922,2749.159912,2740.540039,2746.870117,2746.870117,3376510000
17216,2018-06-05,2748.459961,2752.610107,2739.510010,2748.800049,2748.800049,3517790000


# 3.0 Example of using class

In [103]:
# build a forecaster on the last 2000 rows of traded volume
normal = time_series_prediction(
    dates=sp_500['Date'].iloc[-2000:],
    one_d_time_series=sp_500['Volume'].iloc[-2000:],
    lag_window_length=5,      # number of past values used as input features
    n_ahead_prediction=1,     # number of steps ahead to predict
)
# time series -> supervised learning problem
normal.sliding_window_1(verbose=0)
# first 1200 windowed rows train, the remainder test
normal.train_test_split(split=1200)
# visualize the split
normal.test_train_plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [104]:
# fit each model on the training split; each call prints its test-set RMSE and MAE
# and stores its predictions on the object
normal.linear_regression()
normal.support_vector_machine()
normal.neural_net_mlp()
# normal.naive_model()

Training multivariate linear regression:

Linear regression coefficients: 
 [0.10077061 0.04071057 0.03651907 0.14864236 0.43403925]
RMSE:  549152034.7058507
MAE:  366115072.16687787

Training support vector machine:
RMSE:  592994583.5140637
MAE:  394600158.67625666

Training neural network: 
RMSE:  579135949.6703322
MAE:  394829788.2229638


In [106]:
# plot the test-set values against each model's predictions over time
normal.vis_results_time_series()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [105]:
# scatter the real test values against each model's predictions
normal.vis_results_scatter()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# Experiment with Zander's standardization ideas (percentage returns and log transform)

In [38]:
# opening prices for the last 2000 rows
x = sp_500['Open'].iloc[-2000:]

# percentage returns; the first row has no previous value, so its NaN is replaced by 0
x_pct = x.pct_change().fillna(0)

# one frame holding the raw prices next to each transform
df = pd.DataFrame({
    'Open': x,
    'pct_change': x_pct,
    'pct_change_cumprod': (x_pct + 1).cumprod(),
    'log_transform': np.log(x),
})

# one subplot per column on a shared x-axis
df.plot(subplots=True,sharex=True,figsize=(12,12))
plt.tight_layout()

# view data
df

  fig = plt.figure(**fig_kw)


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Unnamed: 0,Open,pct_change,pct_change_cumprod,log_transform
15218,1077.500000,0.000000,1.000000,6.982399
15219,1071.099976,-0.005940,0.994060,6.976441
15220,1040.560059,-0.028513,0.965717,6.947514
15221,1031.099976,-0.009091,0.956937,6.938381
15222,1027.650024,-0.003346,0.953736,6.935030
...,...,...,...,...
17213,2720.979980,0.006864,2.525271,7.908747
17214,2718.699951,-0.000838,2.523155,7.907909
17215,2741.669922,0.008449,2.544473,7.916322
17216,2748.459961,0.002477,2.550775,7.918796


In [39]:
# build a forecaster on the percentage returns computed above
normal = time_series_prediction(
    dates=sp_500['Date'].iloc[-2000:],
    one_d_time_series=df['pct_change'],
    lag_window_length=10,     # number of past values used as input features
    n_ahead_prediction=1,     # number of steps ahead to predict
)
# time series -> supervised learning problem
normal.sliding_window_1(verbose=0)
# first 1200 windowed rows train, the remainder test
normal.train_test_split(split=1200)
# visualize the split
normal.test_train_plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [40]:
# fit each model on the training split; each call prints its test-set RMSE and MAE
# and stores its predictions on the object
normal.linear_regression()
normal.support_vector_machine()
normal.neural_net_mlp()

Training multivariate linear regression:

Linear regression coefficients: 
 [ 0.04934315 -0.03010505  0.00353447 -0.00763261 -0.01381228 -0.09801618
 -0.01311948 -0.08629862  0.05020159 -0.02946175]
RMSE:  0.007862288446575811
MAE:  0.005315734888004195

Training support vector machine:
RMSE:  0.007774426254777921
MAE:  0.005249901777256953

Training neural network: 
RMSE:  0.007749995201242111
MAE:  0.0052516049606334565


In [41]:
# plot the test-set values against each model's predictions over time
normal.vis_results_time_series()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [42]:
# scatter the real test values against each model's predictions
normal.vis_results_scatter()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …