# 1D time series prediction with differencing

Purpose of this notebook is to implement first order differencing to make a non-stationary timeseries stationary

In [30]:
# interactive figures
%matplotlib widget 

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# ml training code
from one_dimensional_time_series_forecasting import time_series_prediction
from one_dimensional_time_series_forecasting import invert_scaling
from one_dimensional_time_series_forecasting import hit_rate

# model evalution metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error


# data preprocessing
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler

In [31]:
# import some data
df = pd.read_csv('./test_data/AirPassengers.csv') # sp_500 = GSPC.csv, # airplaine = AirPassengers.csv
# df = df.iloc[-2000:,:].reset_index(drop=True) # only look at last 2000 days
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Month        144 non-null    object
 1   #Passengers  144 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ KB


In [3]:
# plot original data
fig,ax = plt.subplots(figsize=(10,4))
ax.plot(df['Month'],df['#Passengers'])
# ax.set_xticks([df['Date'][i] for i in range(-2000,0,100)])
ax.tick_params(rotation=30)
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# Attempt 1: creating stationary series by means of differencing

In [4]:
df.plot(subplots=True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

array([<AxesSubplot:>], dtype=object)

In [32]:
# import some data
df = pd.read_csv('./test_data/AirPassengers.csv') # sp_500 = GSPC.csv, # airplaine = AirPassengers.csv
# df = df.iloc[-2000:,:].reset_index(drop=True) # only look at last 2000 days

# difference data
df['#Passengers_log'] = df['#Passengers'].apply(lambda x: np.log(x))
df['diff'] = df['#Passengers'].diff(periods=1)
df['diff'].iloc[0] = 0
df['log_diff'] = df['#Passengers_log'].diff(periods=1)
df['log_diff'].iloc[0] = 0

# forecasting on new dataset
window_length = 15
split = 44

# scale data
scaler = MinMaxScaler()
scaled_training_data = scaler.fit_transform(df['log_diff'][0:-split].to_numpy().reshape(-1,1)).flatten()
scaled_test_data = scaler.transform(df['log_diff'][-split:].to_numpy().reshape(-1,1)).flatten()
data = np.append(scaled_training_data,scaled_test_data)

# initialize class object
differenced = time_series_prediction(df['Month'],data,window_length,1)#time_series_prediction(sp_500['Date'][-4000:],sp_500['Volume'][-4000:]/1e9,5,1) # pass: ime series dates, univariate time series, lag window length, a number of steps ahead to predict
differenced.sliding_window_1(verbose=0) # time series to supervised ML problem
differenced.train_test_split(split=split) # testing and training dataset split
differenced.test_train_plot(ylabel='Difference of Passengers',steps=5)    # visualize training split

# perform some prediction tasks
differenced.linear_regression()
differenced.support_vector_machine(model_tunning=True)
differenced.neural_net_mlp(model_tunning=True)
differenced.naive_model()

# visualize results
differenced.vis_results_time_series(second_plot='error',ylabel='Difference of Passengers',steps=5)

# collect results together nicely
tabulated_results_1 = differenced.results()
tabulated_results_1.plot(x='date',figsize=(10,4))
display(tabulated_results_1)

# invert scaling and compute eval metric in orginal feature space
print('NN:')
inverted_predictions_nn, inverted_testing_data = invert_scaling(scaler,data[-split:],differenced.neural_net_predictions)

print('SVM:')
inverted_predictions_svm, inverted_testing_data = invert_scaling(scaler,data[-split:],differenced.svm_predictions)

print('Linear Regression:')
inverted_predictions_linear, inverted_testing_data = invert_scaling(scaler,data[-split:],differenced.linear_reg_predictions)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Training multivariate linear regression:

Linear regression coefficients: 
 [ 0.21073579 -0.0449212   0.25049905  0.6353662  -0.11156824 -0.25964539
 -0.1466502  -0.29718165 -0.19454909 -0.23468218 -0.1305246  -0.2978432
 -0.33582033 -0.24718047 -0.3497452 ]
MAPE: 0.7674516028163971
RMSE:  0.08909471430350961
MAE:  0.07107917275146734

Training support vector machine:
Fitting 5 folds for each of 48 candidates, totalling 240 fits
best_score:  -0.10784331863466261
best_model:  SVR(C=1, max_iter=5000, tol=0.0001)
best_params:  {'C': 1, 'epsilon': 0.1, 'kernel': 'rbf'}
MAPE: 1.1619504722221636
RMSE:  0.10790625104072399
MAE:  0.09100554574403859

Training neural network: 
Fitting 5 folds for each of 108 candidates, totalling 540 fits
best_score:  -0.10107684114723763
best_model:  MLPRegressor(hidden_layer_sizes=(1000,), max_iter=5000, shuffle=False)
best_params:  {'activation': 'relu', 'hidden_layer_sizes': (1000,), 'learning_rate': 'constant', 'learning_rate_init': 0.001}
MAPE: 0.67087755

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Unnamed: 0,date,Value,Linear,SVM,NN,Naive
0,1949-01,0.435636,,,,
1,1949-02,0.567622,,,,
2,1949-03,0.719198,,,,
3,1949-04,0.377492,,,,
4,1949-05,0.273715,,,,
...,...,...,...,...,...,...
139,1960-08,0.369726,0.449845,0.432486,0.43109,0.816712
140,1960-09,-0.010503,0.079455,0.139762,0.050319,0.369726
141,1960-10,0.190097,0.110038,0.23444,0.09974,-0.010503
142,1960-11,0.012632,0.1964,0.236828,0.189341,0.190097


NN:
MAPE: 0.5023379745506396
RMSE:  0.033872106708021876
MAE:  0.026459788241225243
SVM:
MAPE: 0.548427093784819
RMSE:  0.04266499391087777
MAE:  0.03598263323558667
Linear Regression:
MAPE: 0.5601587698267818
RMSE:  0.03522711063158002
MAE:  0.028103955455621176


In [18]:
df_lin.plot(subplots=True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

array([<AxesSubplot:>, <AxesSubplot:>, <AxesSubplot:>, <AxesSubplot:>,
       <AxesSubplot:>], dtype=object)

In [27]:
# invert difference + log
def invert_first_difference_with_log(prediction_split,lag_window,predictions,df_original):
    # first real value to work from
    beginnning_value = df_original['#Passengers_log'].iloc[-prediction_split] # this must be the column that is logged, before differencing
    beginning_date = df_original['Month'].iloc[-prediction_split]
    print(f'Beginning: {beginnning_value} at date: {beginning_date}')

    # determined predicted values
    total_dates = df_original.shape[0]
    total_prediction_range =  prediction_split
    count = 0
    previous_value = beginnning_value
    inverted = []
    for date in range(total_prediction_range):
        real_value = previous_value + predictions[date]
        inverted.append(real_value)
        previous_value = real_value

    # set all values before prediction start to zero
    zeros = [None for i in range(0,total_dates-prediction_split)]

    # append prediction results
    inverted_predictions = np.append(zeros,inverted)

    # tabulate
    df_results = pd.DataFrame(columns=['Date','Value','Pred Value'])
    df_results['Month'] = df_original['Month']
    df_results['Value'] = df_original['#Passengers']
    df_results['Pred Value'] = inverted_predictions
    df_results['Pred Value'] = df_results['Pred Value'][-prediction_split:].apply(lambda x: np.exp(x)) # inverting the log
    return df_results

In [28]:
# invert transform difference strategy

def invert_first_difference(prediction_split,lag_window,predictions,df_original):
    # first real value to work from
    beginnning_value = df_original['#Passengers'].iloc[-prediction_split]
    beginning_date = df_original['Month'].iloc[-prediction_split]
    print(f'Beginning: {beginnning_value} at date: {beginning_date}')

    # determined predicted values
    total_dates = df_original.shape[0]
    total_prediction_range =  prediction_split
    count = 0
    previous_value = beginnning_value
    inverted = []
    for date in range(total_prediction_range):
        real_value = previous_value + predictions[date]
        inverted.append(real_value)
        previous_value = real_value

    # set all values before prediction start to zero
    zeros = [None for i in range(0,total_dates-prediction_split)]

    # append prediction results
    inverted_predictions = np.append(zeros,inverted)

    # tabulate
    df_results = pd.DataFrame(columns=['Date','Value','Pred Value'])
    df_results['Month'] = df_original['Month']
    df_results['Value'] = df_original['#Passengers']
    df_results['Pred Value'] = inverted_predictions

    return df_results

In [29]:
# invert results for different models
df_results_lin = invert_first_difference_with_log(split,window_length,inverted_predictions_linear,df)
df_results_svm = invert_first_difference_with_log(split,window_length,inverted_predictions_svm,df)
df_results_nn = invert_first_difference_with_log(split,window_length,inverted_predictions_nn,df)

# add all these dfs together
df_results = pd.DataFrame(columns=['date','Original Values','Inverted linear','Inverted svm','Inverted NN'])
df_results['date'] = df_results_lin['Month']
df_results['Original Values'] = df_results_lin['Value']#[-split:].apply(lambda x: np.exp(x))
df_results['Inverted linear'] = df_results_lin['Pred Value']#[-split:].apply(lambda x: np.exp(x))
df_results['Inverted svm'] = df_results_svm['Pred Value']#[-split:].apply(lambda x: np.exp(x))
df_results['Inverted NN'] = df_results_nn['Pred Value']#[-split:].apply(lambda x: np.exp(x))

# plot results
df_results.plot(figsize=(10,4),x='date')

# RMSE of this method
mse_lin = mean_squared_error(df_results['Original Values'].iloc[-split:],df_results['Inverted linear'].iloc[-split:])
mse_svm = mean_squared_error(df_results['Original Values'].iloc[-split:],df_results['Inverted svm'].iloc[-split:])
mse_nn = mean_squared_error(df_results['Original Values'].iloc[-split:],df_results['Inverted NN'].iloc[-split:])

# MAE of this method
mae_lin = mean_absolute_error(df_results['Original Values'].iloc[-split:],df_results['Inverted linear'].iloc[-split:])
mae_svm = mean_absolute_error(df_results['Original Values'].iloc[-split:],df_results['Inverted svm'].iloc[-split:])
mae_nn = mean_absolute_error(df_results['Original Values'].iloc[-split:],df_results['Inverted NN'].iloc[-split:])

# MAPE of this method
mape_lin = mean_absolute_percentage_error(df_results['Original Values'].iloc[-split:],df_results['Inverted linear'].iloc[-split:])
mape_svm = mean_absolute_percentage_error(df_results['Original Values'].iloc[-split:],df_results['Inverted svm'].iloc[-split:])
mape_nn = mean_absolute_percentage_error(df_results['Original Values'].iloc[-split:],df_results['Inverted NN'].iloc[-split:])


print(f'RMSE linear: {mse_lin**0.5}')
print(f'RMSE svm: {mse_svm**0.5}')
print(f'RMSE nn: {mse_nn**0.5}\n')

print(f'MAE linear: {mae_lin}')
print(f'MAE svm: {mae_svm}')
print(f'MAE nn: {mae_nn}\n')

print(f'mape linear: {mape_lin}')
print(f'mape svm: {mape_svm}')
print(f'mape nn: {mape_nn}\n')

# what is the accuracy of price movements for these predictions

# data to feed to hit_rate function:
dates = df_results['date'].iloc[split+window_length:]
original_values = df_results['Original Values'].iloc[split+window_length:]
lin_predictions = df_results['Inverted linear'].iloc[split+window_length:]
svm_predictions = df_results['Inverted svm'].iloc[split+window_length:]
nn_predictions =  df_results['Inverted NN'].iloc[split+window_length:]

# hit rate calculations
print('Linear Regression:')
df_lin = hit_rate(dates,original_values,lin_predictions)

print('SVM:')
df_svm = hit_rate(dates,original_values,svm_predictions)

print('NN:')
df_nn = hit_rate(dates,original_values,nn_predictions)


Beginning: 5.872117789475416 at date: 1957-05
Beginning: 5.872117789475416 at date: 1957-05
Beginning: 5.872117789475416 at date: 1957-05


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

RMSE linear: 186.5833576778767
RMSE svm: 91.47986368991317
RMSE nn: 70.70434817188756

MAE linear: 161.28321373205327
MAE svm: 78.90695506119542
MAE nn: 62.285526514037954

mape linear: 0.3715627093293579
mape svm: 0.18695778264822913
mape nn: 0.14580218181022128

Linear Regression:
Movement prediction accuracy: 70.59 %
Confusion matrix:
[[36  2]
 [23 24]]
SVM:
Movement prediction accuracy: 69.41 %
Confusion matrix:
[[37  1]
 [25 22]]
NN:
Movement prediction accuracy: 70.59 %
Confusion matrix:
[[38  0]
 [25 22]]


In [70]:
df_results.plot(x='date',y='Original Values')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<AxesSubplot:xlabel='date'>

In [71]:
df_results

Unnamed: 0,date,Original Values,Inverted linear,Inverted svm,Inverted NN
0,1949-01,,,,
1,1949-02,,,,
2,1949-03,,,,
3,1949-04,,,,
4,1949-05,,,,
...,...,...,...,...,...
139,1960-08,606.0,935.289980,736.187643,631.613389
140,1960-09,508.0,812.427411,654.911083,545.074820
141,1960-10,461.0,714.289695,604.830813,481.264736
142,1960-11,390.0,649.821217,559.107708,436.331054
