# 1D time series prediction on airplane dataset

Purpose of this notebook is to implement first order differencing to make a non-stationary timeseries stationary

In [1]:
# interactive figures
%matplotlib widget 

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# ml training code
from one_dimensional_time_series_forecasting import time_series_prediction

# model evalution metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# data preprocessing
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler

In [2]:
# import some data
df = pd.read_csv('./test_data/GSPC.csv') 
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1950-01-03,16.660000,16.660000,16.660000,16.660000,16.660000,1260000
1,1950-01-04,16.850000,16.850000,16.850000,16.850000,16.850000,1890000
2,1950-01-05,16.930000,16.930000,16.930000,16.930000,16.930000,2550000
3,1950-01-06,16.980000,16.980000,16.980000,16.980000,16.980000,2010000
4,1950-01-09,17.080000,17.080000,17.080000,17.080000,17.080000,2520000
...,...,...,...,...,...,...,...
17213,2018-05-31,2720.979980,2722.500000,2700.679932,2705.270020,2705.270020,4235370000
17214,2018-06-01,2718.699951,2736.929932,2718.699951,2734.620117,2734.620117,3684130000
17215,2018-06-04,2741.669922,2749.159912,2740.540039,2746.870117,2746.870117,3376510000
17216,2018-06-05,2748.459961,2752.610107,2739.510010,2748.800049,2748.800049,3517790000


In [3]:
# plot original data
fig,ax = plt.subplots(figsize=(10,4))
ax.plot(df['Date'][-2000:],df['Open'][-2000:])
# ax.set_xticks([df['Date'][i] for i in range(-2000,0,100)])
ax.tick_params(rotation=30)
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# 1. Baseline attempt: no feature engineering, just raw data

In [4]:
# initialize class object
normal = time_series_prediction(df['Date'][-2000:],df['Open'][-2000:],15,1)#time_series_prediction(sp_500['Date'][-4000:],sp_500['Volume'][-4000:]/1e9,5,1) # pass: ime series dates, univariate time series, lag window length, a number of steps ahead to predict
normal.sliding_window_1(verbose=0) # time series to supervised ML problem
normal.train_test_split(split=1700) # testing and training dataset split
normal.test_train_plot()    # visualize training split

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [5]:
# perform some prediction tasks
normal.linear_regression()
normal.support_vector_machine(model_tunning=True)
normal.neural_net_mlp(model_tunning=True)
normal.naive_model()

Training multivariate linear regression:

Linear regression coefficients: 
 [ 0.05007065 -0.06226744  0.01745346 -0.00422591 -0.02827593  0.07965269
 -0.05773014  0.007217   -0.00402506  0.07055558 -0.03092398  0.00667209
 -0.04963946 -0.01003295  1.01485027]
RMSE:  19.560159846776852
MAE:  12.322090373811996

Training support vector machine:
Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-3)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=-3)]: Done  70 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-3)]: Done 193 tasks      | elapsed:   47.5s
[Parallel(n_jobs=-3)]: Done 240 out of 240 | elapsed:  1.9min finished


best_score:  -207.892857633265
best_model:  SVR(C=0.1, epsilon=1, kernel='linear')
best_params:  {'C': 0.1, 'epsilon': 1, 'kernel': 'linear'}
RMSE:  19.01673931764518
MAE:  12.04376928185388

Training neural network: 
Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-3)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=-3)]: Done 104 tasks      | elapsed:   31.2s
[Parallel(n_jobs=-3)]: Done 233 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-3)]: Done 378 out of 405 | elapsed:  2.9min remaining:   12.3s
[Parallel(n_jobs=-3)]: Done 405 out of 405 | elapsed:  3.1min finished


best_score:  -652.3411363380995
best_model:  MLPRegressor(hidden_layer_sizes=(1000,), learning_rate='invscaling',
             max_iter=1000, shuffle=False)
best_params:  {'activation': 'relu', 'hidden_layer_sizes': (1000,), 'learning_rate': 'invscaling', 'learning_rate_init': 0.001}
RMSE:  46.4912467049308
MAE:  37.151353508575646

Naive model results:
RMSE:  19.356019896320362
MAE:  12.232913589473679


In [6]:
# visualize results
normal.vis_results_time_series(second_plot='error')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [8]:
tabulated_results_0 = normal.results()
tabulated_results_0.plot()
display(tabulated_results_0)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Unnamed: 0,date,Value,Linear,SVM,NN,Naive
0,2010-06-28,1077.500000,0.000000,0.000000,0.000000,0.000000
1,2010-06-29,1071.099976,0.000000,0.000000,0.000000,0.000000
2,2010-06-30,1040.560059,0.000000,0.000000,0.000000,0.000000
3,2010-07-01,1031.099976,0.000000,0.000000,0.000000,0.000000
4,2010-07-02,1027.650024,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...
1995,2018-05-31,2720.979980,2702.131994,2702.121418,2704.226755,2702.429932
1996,2018-06-01,2718.699951,2721.203405,2720.072371,2707.802451,2720.979980
1997,2018-06-04,2741.669922,2718.407681,2718.889494,2711.788773,2718.699951
1998,2018-06-05,2748.459961,2743.839444,2744.588739,2710.605048,2741.669922


# Attempt 1: creating stationary series by means of differencing

In [9]:
# import some data
df = pd.read_csv('./test_data/GSPC.csv') 

# difference data
df['diff'] = df['Open'].diff(periods=1)

# drop rows with nans ie first row
df.dropna(inplace=True)

# view dataframe
display(df)

# plot this new signal
# df.plot()

# forecasting on new dataset

# initialize class object
differenced = time_series_prediction(df['Date'][-2000:],df['diff'][-2000:],15,1)#time_series_prediction(sp_500['Date'][-4000:],sp_500['Volume'][-4000:]/1e9,5,1) # pass: ime series dates, univariate time series, lag window length, a number of steps ahead to predict
differenced.sliding_window_1(verbose=0) # time series to supervised ML problem
differenced.train_test_split(split=1700) # testing and training dataset split
differenced.test_train_plot()    # visualize training split

# perform some prediction tasks
differenced.linear_regression()
differenced.support_vector_machine(model_tunning=True)
differenced.neural_net_mlp(model_tunning=True)
differenced.naive_model()

# visualize results
differenced.vis_results_time_series(second_plot='error')

# invert transform difference strategy


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,diff
1,1950-01-04,16.850000,16.850000,16.850000,16.850000,16.850000,1890000,0.190000
2,1950-01-05,16.930000,16.930000,16.930000,16.930000,16.930000,2550000,0.080000
3,1950-01-06,16.980000,16.980000,16.980000,16.980000,16.980000,2010000,0.050000
4,1950-01-09,17.080000,17.080000,17.080000,17.080000,17.080000,2520000,0.100000
5,1950-01-10,17.030001,17.030001,17.030001,17.030001,17.030001,2160000,-0.049999
...,...,...,...,...,...,...,...,...
17213,2018-05-31,2720.979980,2722.500000,2700.679932,2705.270020,2705.270020,4235370000,18.550048
17214,2018-06-01,2718.699951,2736.929932,2718.699951,2734.620117,2734.620117,3684130000,-2.280029
17215,2018-06-04,2741.669922,2749.159912,2740.540039,2746.870117,2746.870117,3376510000,22.969971
17216,2018-06-05,2748.459961,2752.610107,2739.510010,2748.800049,2748.800049,3517790000,6.790039


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Training multivariate linear regression:

Linear regression coefficients: 
 [-0.04031065 -0.04966156  0.01214086 -0.00735327 -0.00278153  0.02404239
 -0.05268452  0.00484618 -0.00205696 -0.00035472 -0.06768289 -0.03786283
 -0.0446718   0.00557925  0.01307656]
RMSE:  19.513315063621693
MAE:  12.150138198834048

Training support vector machine:
Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-3)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=-3)]: Done 148 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-3)]: Done 213 out of 240 | elapsed:   21.8s remaining:    2.7s
[Parallel(n_jobs=-3)]: Done 240 out of 240 | elapsed:  4.5min finished
[Parallel(n_jobs=-3)]: Using backend LokyBackend with 14 concurrent workers.


best_score:  -201.61083609315875
best_model:  SVR(C=0.1, epsilon=10, kernel='sigmoid')
best_params:  {'C': 0.1, 'epsilon': 10, 'kernel': 'sigmoid'}
RMSE:  19.284661752069116
MAE:  11.997911921778167

Training neural network: 
Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-3)]: Done 105 tasks      | elapsed:   30.1s
[Parallel(n_jobs=-3)]: Done 233 tasks      | elapsed:  1.6min


best_score:  -205.56712144012786
best_model:  MLPRegressor(activation='logistic', hidden_layer_sizes=(10,),
             learning_rate_init=1, max_iter=1000, shuffle=False)
best_params:  {'activation': 'logistic', 'hidden_layer_sizes': (10,), 'learning_rate': 'constant', 'learning_rate_init': 1}
RMSE:  19.359220074067068
MAE:  12.291250940208549

Naive model results:
RMSE:  27.522005656726424
MAE:  16.738004585964877


[Parallel(n_jobs=-3)]: Done 405 out of 405 | elapsed:  2.7min finished


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [15]:
df.iloc[-2000:,:]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,diff
15218,2010-06-28,1077.500000,1082.599976,1071.449951,1074.569946,1074.569946,3896410000,2.400024
15219,2010-06-29,1071.099976,1071.099976,1035.180054,1041.239990,1041.239990,6136700000,-6.400024
15220,2010-06-30,1040.560059,1048.079956,1028.329956,1030.709961,1030.709961,5067080000,-30.539917
15221,2010-07-01,1031.099976,1033.579956,1010.909973,1027.369995,1027.369995,6435770000,-9.460083
15222,2010-07-02,1027.650024,1032.949951,1015.929993,1022.580017,1022.580017,3968500000,-3.449952
...,...,...,...,...,...,...,...,...
17213,2018-05-31,2720.979980,2722.500000,2700.679932,2705.270020,2705.270020,4235370000,18.550048
17214,2018-06-01,2718.699951,2736.929932,2718.699951,2734.620117,2734.620117,3684130000,-2.280029
17215,2018-06-04,2741.669922,2749.159912,2740.540039,2746.870117,2746.870117,3376510000,22.969971
17216,2018-06-05,2748.459961,2752.610107,2739.510010,2748.800049,2748.800049,3517790000,6.790039


In [17]:
# invert transform difference strategy
df_new = df.iloc[-2000:,:]
df_new.reset_index(inplace=True)

def invert_first_difference(prediction_split,lag_window,predictions,df_original):
    # first real value to work from
    beginnning_value = df_original['Open'][prediction_split+lag_window]#df_original['#Passengers'][prediction_split+lag_window] #
    beginning_date = df_original['Date'][prediction_split+lag_window]#df_original['Month'][prediction_split+lag_window]
    print(f'Beginning: {beginnning_value} at date: {beginning_date}')

    # determined predicted values
    total_dates = df_original.shape[0]
    total_prediction_range =  df_original.shape[0]-prediction_split-lag_window
    count = 0
    previous_value = beginnning_value
    inverted = []
    for date in range(total_prediction_range):
        real_value = previous_value + predictions[date]
        inverted.append(real_value)
        previous_value = real_value

    # set all values before prediction start to zero
    zeros = [0 for i in range(0,prediction_split+lag_window)]

    # append prediction results
    inverted_predictions = np.append(zeros,inverted)

    # tabulate
    df_results = pd.DataFrame(columns=['Date','Value','Pred Value'])
    df_results['Month'] = df_original['Date']#[-2000:]
    df_results['Value'] = df_original['Open']#[-2000:]
    df_results['Pred Value'] = inverted_predictions

    return df_results

# invert results for different models
df_results_lin = invert_first_difference(1700,15,differenced.linear_reg_predictions,df_new)
df_results_svm = invert_first_difference(1700,15,differenced.svm_predictions,df_new)
df_results_nn = invert_first_difference(1700,15,differenced.neural_net_predictions,df_new)

# add all these dfs together
df_results = pd.DataFrame(columns=['date','Original Values','Inverted linear','Inverted svm','Inverted NN'])
df_results['date'] = df_results_lin['Month']
df_results['Original Values'] = df_results_lin['Value']
df_results['Inverted linear'] = df_results_svm['Pred Value']
df_results['Inverted svm'] = df_results_svm['Pred Value']
df_results['Inverted NN'] = df_results_nn['Pred Value']


# plot results
df_results.plot()

# RMSE of this method
mse_lin = mean_squared_error(df_results['Original Values'].iloc[1715:],df_results['Inverted linear'].iloc[1715:])
mse_svm = mean_squared_error(df_results['Original Values'].iloc[1715:],df_results['Inverted svm'].iloc[1715:])
mse_nn = mean_squared_error(df_results['Original Values'].iloc[1715:],df_results['Inverted NN'].iloc[1715:])

print(f'RMSE linear: {mse_lin**0.5}')
print(f'RMSE svm: {mse_svm**0.5}')
print(f'RMSE nn: {mse_nn**0.5}')



Beginning: 2342.689941 at date: 2017-04-20
Beginning: 2342.689941 at date: 2017-04-20
Beginning: 2342.689941 at date: 2017-04-20


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

RMSE linear: 121.90521902102694
RMSE svm: 121.90521902102694
RMSE nn: 309.6412671317006


In [None]:
tabulated_predictions.plot()