# 1D time series prediction on airplane dataset

In [5]:
# interactive figures
%matplotlib widget 

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# ml training code
from one_dimensional_time_series_forecasting import time_series_prediction

# model evalution metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# data preprocessing
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler

In [6]:
# import some data
df = pd.read_csv('./test_data/AirPassengers.csv') 
df

Unnamed: 0,Month,#Passengers
0,1949-01,112
1,1949-02,118
2,1949-03,132
3,1949-04,129
4,1949-05,121
...,...,...
139,1960-08,606
140,1960-09,508
141,1960-10,461
142,1960-11,390


In [7]:
# plot original data
fig,ax = plt.subplots(figsize=(10,4))
ax.plot(df['Month'],df['#Passengers'])
ax.set_xticks([df['Month'].iloc[i] for i in range(0,144,7)])
ax.tick_params(rotation=30)
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# 1. Baseline attempt: no feature engineering, just raw data

In [8]:
# initialize class object
normal = time_series_prediction(df['Month'],df['#Passengers'],15,1)#time_series_prediction(sp_500['Date'][-4000:],sp_500['Volume'][-4000:]/1e9,5,1) # pass: ime series dates, univariate time series, lag window length, a number of steps ahead to predict
normal.sliding_window_1(verbose=0) # time series to supervised ML problem
normal.train_test_split(split=100) # testing and training dataset split
normal.test_train_plot()    # visualize training split

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [9]:
# perform some prediction tasks
normal.linear_regression()
normal.support_vector_machine(model_tunning=True)
normal.neural_net_mlp(model_tunning=True)
normal.naive_model()

Training multivariate linear regression:

Linear regression coefficients: 
 [ 0.17068564 -0.43619066 -0.52110009  0.85529166  0.21998151 -0.16326028
  0.15272419 -0.13551272  0.0870167  -0.15020303  0.17336489 -0.15420544
 -0.00593946  0.20685781  0.70047134]
RMSE:  16.37160479513395
MAE:  12.641974322064023

Training support vector machine:
Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-3)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=-3)]: Done  76 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-3)]: Done 240 out of 240 | elapsed:  1.1min finished


best_score:  -223.00536321094188
best_model:  SVR(C=1, kernel='linear')
best_params:  {'C': 1, 'epsilon': 0.1, 'kernel': 'linear'}
RMSE:  17.889176273260063
MAE:  13.473920953157533

Training neural network: 
Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-3)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=-3)]: Done 140 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-3)]: Done 378 out of 405 | elapsed:    6.8s remaining:    0.4s


best_score:  -145.64311475272146
best_model:  MLPRegressor(hidden_layer_sizes=(1000,), learning_rate='adaptive',
             max_iter=1000, shuffle=False)
best_params:  {'activation': 'relu', 'hidden_layer_sizes': (1000,), 'learning_rate': 'adaptive', 'learning_rate_init': 0.001}
RMSE:  68.02779693049509
MAE:  57.6045770407595

Naive model results:
RMSE:  52.4913786024544
MAE:  44.724137931034484


[Parallel(n_jobs=-3)]: Done 405 out of 405 | elapsed:    7.8s finished


In [10]:
# visualize results
normal.vis_results_time_series(second_plot='error')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [28]:
tabulated_results_0 = normal.results()
tabulated_results_0.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<AxesSubplot:>

# Attempt 1: creating stationary series by means of differencing

In [11]:
# import some data
df = pd.read_csv('./test_data/AirPassengers.csv') 

# difference data
df['diff'] = df['#Passengers'].diff(periods=1)

# drop rows with nans ie first row
df.dropna(inplace=True)

# view dataframe
display(df)

# plot this new signal
df.plot()

# forecasting on new dataset

# initialize class object
differenced = time_series_prediction(df['Month'],df['diff'],15,1)#time_series_prediction(sp_500['Date'][-4000:],sp_500['Volume'][-4000:]/1e9,5,1) # pass: ime series dates, univariate time series, lag window length, a number of steps ahead to predict
differenced.sliding_window_1(verbose=0) # time series to supervised ML problem
differenced.train_test_split(split=100) # testing and training dataset split
differenced.test_train_plot()    # visualize training split

# perform some prediction tasks
differenced.linear_regression()
differenced.support_vector_machine(model_tunning=True)
differenced.neural_net_mlp(model_tunning=True)
differenced.naive_model()

# visualize results
differenced.vis_results_time_series(second_plot='error')

# invert transform difference strategy


Unnamed: 0,Month,#Passengers,diff
1,1949-02,118,6.0
2,1949-03,132,14.0
3,1949-04,129,-3.0
4,1949-05,121,-8.0
5,1949-06,135,14.0
...,...,...,...
139,1960-08,606,-16.0
140,1960-09,508,-98.0
141,1960-10,461,-47.0
142,1960-11,390,-71.0


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Training multivariate linear regression:

Linear regression coefficients: 
 [ 0.19437291 -0.10516455  0.26541158  0.78592115 -0.06110684 -0.29280778
 -0.11420274 -0.26473439 -0.1249843  -0.23385576 -0.04899373 -0.25623595
 -0.2401588  -0.14526642 -0.27417022]
RMSE:  17.375050901788562
MAE:  13.644170005307734

Training support vector machine:
Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-3)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=-3)]: Done 144 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-3)]: Done 213 out of 240 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=-3)]: Done 240 out of 240 | elapsed:    9.5s finished
[Parallel(n_jobs=-3)]: Using backend LokyBackend with 14 concurrent workers.


best_score:  -145.20012700729689
best_model:  SVR(C=0.1, epsilon=1, kernel='linear')
best_params:  {'C': 0.1, 'epsilon': 1, 'kernel': 'linear'}
RMSE:  18.47804566138797
MAE:  13.902152508796105

Training neural network: 
Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-3)]: Done 140 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-3)]: Done 405 out of 405 | elapsed:    6.9s finished


best_score:  -129.88116089038886
best_model:  MLPRegressor(hidden_layer_sizes=(1000,), learning_rate='adaptive',
             max_iter=1000, shuffle=False)
best_params:  {'activation': 'relu', 'hidden_layer_sizes': (1000,), 'learning_rate': 'adaptive', 'learning_rate_init': 0.001}
RMSE:  18.551498306140108
MAE:  15.159907533057384

Naive model results:
RMSE:  61.74313148613606
MAE:  52.357142857142854


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [27]:
# invert transform difference strategy

def invert_first_difference(prediction_split,lag_window,predictions,df_original):
    # first real value to work from
    beginnning_value = df_original['#Passengers'][prediction_split+lag_window]
    beginning_date = df_original['Month'][prediction_split+lag_window]
    print(f'Beginning: {beginnning_value} at date: {beginning_date}')

    # determined predicted values
    total_dates = df_original.shape[0]
    total_prediction_range = df_original.shape[0]-prediction_split-lag_window
    count = 0
    previous_value = beginnning_value
    inverted = []
    for date in range(total_prediction_range):
        real_value = previous_value + predictions[date]
        inverted.append(real_value)
        previous_value = real_value

    # set all values before prediction start to zero
    zeros = [0 for i in range(0,prediction_split+lag_window)]

    # append prediction results
    inverted_predictions = np.append(zeros,inverted)

    # tabulate
    df_results = pd.DataFrame(columns=['Month','Value','Pred Value'])
    df_results['Month'] = df_original['Month']
    df_results['Value'] = df_original['#Passengers']
    df_results['Pred Value'] = inverted_predictions

    return df_results

# invert results for different models
df_results_lin = invert_first_difference(100,15,differenced.linear_reg_predictions,df)
df_results_svm = invert_first_difference(100,15,differenced.svm_predictions,df)
df_results_nn = invert_first_difference(100,15,differenced.neural_net_predictions,df)

# add all these dfs together
df_results = pd.DataFrame(columns=['date','Original Values','Inverted linear','Inverted svm','Inverted NN'])
df_results['date'] = df_results_lin['Month']
df_results['Original Values'] = df_results_lin['Value']
df_results['Inverted linear'] = df_results_svm['Pred Value']
df_results['Inverted svm'] = df_results_svm['Pred Value']
df_results['Inverted NN'] = df_results_nn['Pred Value']


# plot results
df_results.plot()

# RMSE of this method
mse_lin = mean_squared_error(df_results['Original Values'].iloc[115:],df_results['Inverted linear'].iloc[115:])
mse_svm = mean_squared_error(df_results['Original Values'].iloc[115:],df_results['Inverted svm'].iloc[115:])
mse_nn = mean_squared_error(df_results['Original Values'].iloc[115:],df_results['Inverted NN'].iloc[115:])

print(f'RMSE linear: {mse_lin**0.5}')
print(f'RMSE svm: {mse_svm**0.5}')
print(f'RMSE nn: {mse_nn**0.5}')



Beginning: 505 at date: 1958-08
Beginning: 505 at date: 1958-08
Beginning: 505 at date: 1958-08


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

RMSE linear: 40.85544441930767
RMSE svm: 40.85544441930767
RMSE nn: 27.878520210903293


In [21]:
tabulated_predictions.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<AxesSubplot:>