# 1D time series prediction on airplane dataset

Purpose of this notebook is to implement first order differencing to make a non-stationary timeseries stationary

In [1]:
# interactive figures
%matplotlib widget 

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# ml training code
from one_dimensional_time_series_forecasting import time_series_prediction

# model evalution metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# data preprocessing
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler

In [50]:
# import some data
df = pd.read_csv('./test_data/monthly-car-sales.csv') # sp_500 = GSPC.csv, # airplaine = AirPassengers.csv
# df = df.iloc[-2000:,:].reset_index(drop=True) # only look at last 2000 days
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Month   108 non-null    object
 1   Sales   108 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 1.8+ KB


In [51]:
# plot original data
fig,ax = plt.subplots(figsize=(10,4))
ax.plot(df['Month'],df['Sales'])
# ax.set_xticks([df['Date'][i] for i in range(-2000,0,100)])
ax.tick_params(rotation=30)
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# 1. Baseline attempt: no feature engineering, just raw data

In [52]:
window_length = 24
split = 12

In [53]:
# initialize class object
normal = time_series_prediction(df['Month'],df['Sales'],window_length,1)#time_series_prediction(sp_500['Date'][-4000:],sp_500['Volume'][-4000:]/1e9,5,1) # pass: ime series dates, univariate time series, lag window length, a number of steps ahead to predict
normal.sliding_window_1(verbose=0) # time series to supervised ML problem
normal.train_test_split(split=split) # testing and training dataset split
normal.test_train_plot()    # visualize training split

# perform some prediction tasks
normal.linear_regression()
normal.support_vector_machine(model_tunning=True)
normal.neural_net_mlp(model_tunning=True)
normal.naive_model()

# visualize results
normal.vis_results_time_series(second_plot='error')

# collect results together nicely
tabulated_results_0 = normal.results()
tabulated_results_0.plot()
display(tabulated_results_0)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Training multivariate linear regression:

Linear regression coefficients: 
 [ 0.29772767  0.0297656  -0.20719679  0.27396527 -0.23778335  0.16212262
 -0.20931553  0.49549408 -0.35280125  0.07677138  0.0266807  -0.25181929
  0.27417877  0.31350865 -0.06785588 -0.04478725 -0.05784719  0.06532542
 -0.18701502  0.03758605 -0.0996454   0.05197084 -0.10323055  0.46866853]
MAPE: 0.08148850613853602
RMSE:  2052.4380619834737
MAE:  1477.3111972770294

Training support vector machine:
Fitting 5 folds for each of 48 candidates, totalling 240 fits




best_score:  -4453.244065875203
best_model:  SVR(C=0.1, epsilon=100, kernel='linear', max_iter=5000, tol=0.0001)
best_params:  {'C': 0.1, 'epsilon': 100, 'kernel': 'linear'}
MAPE: 0.10329933710448204
RMSE:  2266.7204633259453
MAE:  1753.2648247633376

Training neural network: 
Fitting 5 folds for each of 108 candidates, totalling 540 fits
best_score:  -2005.245548367521
best_model:  MLPRegressor(hidden_layer_sizes=(500,), learning_rate_init=0.01, max_iter=5000,
             shuffle=False)
best_params:  {'activation': 'relu', 'hidden_layer_sizes': (500,), 'learning_rate': 'constant', 'learning_rate_init': 0.01}
MAPE: 0.06857799803851418
RMSE:  1517.1648532255522
MAE:  1279.2445399030435

Naive model results:
MAPE: 0.16990394422632252
RMSE:  3783.9662392785694
MAE:  3235.6666666666665


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Unnamed: 0,date,Value,Linear,SVM,NN,Naive
0,1960-01,6550,,,,
1,1960-02,8728,,,,
2,1960-03,12026,,,,
3,1960-04,14395,,,,
4,1960-05,14587,,,,
...,...,...,...,...,...,...
103,1968-08,16722,18968.518156,18465.60298,16597.92928,18024
104,1968-09,14385,13058.721517,12710.162711,14780.265167,16722
105,1968-10,21342,18365.477407,17192.270015,19052.606857,14385
106,1968-11,17180,17982.198651,16031.604425,18776.162999,21342


# Attempt 1: creating stationary series by means of differencing

In [54]:
# import some data
df = pd.read_csv('./test_data/monthly-car-sales.csv') # sp_500 = GSPC.csv, # airplaine = AirPassengers.csv
# df = df.iloc[-2000:,:].reset_index(drop=True) # only look at last 2000 days

# difference data
df['diff'] = df['Sales'].diff(periods=1)
df['diff'].iloc[0] = 0

# drop rows with nans ie first row
# df.dropna(inplace=True)

# view dataframe
display(df)

# plot this new signal

# forecasting on new dataset

# initialize class object
differenced = time_series_prediction(df['Month'],df['diff'],window_length,1)#time_series_prediction(sp_500['Date'][-4000:],sp_500['Volume'][-4000:]/1e9,5,1) # pass: ime series dates, univariate time series, lag window length, a number of steps ahead to predict
differenced.sliding_window_1(verbose=0) # time series to supervised ML problem
differenced.train_test_split(split=split) # testing and training dataset split
differenced.test_train_plot()    # visualize training split

# perform some prediction tasks
differenced.linear_regression()
differenced.support_vector_machine(model_tunning=True)
differenced.neural_net_mlp(model_tunning=True)
differenced.naive_model()

# visualize results
differenced.vis_results_time_series(second_plot='error')

# collect results together nicely
tabulated_results_1 = differenced.results()
tabulated_results_1.plot()
display(tabulated_results_1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,Month,Sales,diff
0,1960-01,6550,0.0
1,1960-02,8728,2178.0
2,1960-03,12026,3298.0
3,1960-04,14395,2369.0
4,1960-05,14587,192.0
...,...,...,...
103,1968-08,16722,-1302.0
104,1968-09,14385,-2337.0
105,1968-10,21342,6957.0
106,1968-11,17180,-4162.0


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Training multivariate linear regression:

Linear regression coefficients: 
 [ 0.07458201 -0.2326626  -0.27186266 -0.05489436 -0.325289   -0.06173408
 -0.21378186  0.03553357 -0.47421157 -0.09526182 -0.1452776  -0.16156883
  0.10697646 -0.16949373 -0.51078554 -0.47346244 -0.44949253 -0.4432674
 -0.54284952 -0.3916856  -0.45419859 -0.39141679 -0.48636122 -0.42965421]
MAPE: 0.6566109111809849
RMSE:  2064.3306423730564
MAE:  1616.6534713877957

Training support vector machine:
Fitting 5 folds for each of 48 candidates, totalling 240 fits
best_score:  -3174.215283249339
best_model:  SVR(C=100, kernel='sigmoid', max_iter=5000, tol=0.0001)
best_params:  {'C': 100, 'epsilon': 0.1, 'kernel': 'sigmoid'}
MAPE: 0.7895313773686379
RMSE:  3289.7797291069037
MAE:  2736.202937299762

Training neural network: 
Fitting 5 folds for each of 108 candidates, totalling 540 fits
best_score:  -1786.5248635124397
best_model:  MLPRegressor(learning_rate_init=0.01, max_iter=5000, shuffle=False)
best_params:  {'ac

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Unnamed: 0,date,Value,Linear,SVM,NN,Naive
0,1960-01,0.0,,,,
1,1960-02,2178.0,,,,
2,1960-03,3298.0,,,,
3,1960-04,2369.0,,,,
4,1960-05,192.0,,,,
...,...,...,...,...,...,...
103,1968-08,-1302.0,1791.094182,-106.138636,707.692762,-3060.0
104,1968-09,-2337.0,-3131.75589,552.892975,-2433.893425,-1302.0
105,1968-10,6957.0,4720.996279,996.118135,4339.079598,-2337.0
106,1968-11,-4162.0,-2319.356608,236.561039,1001.539216,6957.0


In [66]:
# invert transform difference strategy

def invert_first_difference(prediction_split,lag_window,predictions,df_original):
    # first real value to work from
    beginnning_value = df_original['Sales'].iloc[-prediction_split]
    beginning_date = df_original['Month'].iloc[-prediction_split]
    print(f'Beginning: {beginnning_value} at date: {beginning_date}')

    # determined predicted values
    total_dates = df_original.shape[0]
    total_prediction_range =  prediction_split
    count = 0
    previous_value = beginnning_value
    inverted = []
    for date in range(total_prediction_range):
        real_value = previous_value + predictions[date]
        inverted.append(real_value)
        previous_value = real_value

    # set all values before prediction start to zero
    zeros = [None for i in range(0,total_dates-prediction_split)]

    # append prediction results
    inverted_predictions = np.append(zeros,inverted)

    # tabulate
    df_results = pd.DataFrame(columns=['Date','Value','Pred Value'])
    df_results['Month'] = df_original['Month']
    df_results['Value'] = df_original['Sales']
    df_results['Pred Value'] = inverted_predictions

    return df_results

# invert results for different models
df_results_lin = invert_first_difference(split,window_length,differenced.linear_reg_predictions,df)
df_results_svm = invert_first_difference(split,window_length,differenced.svm_predictions,df)
df_results_nn = invert_first_difference(split,window_length,differenced.neural_net_predictions,df)

# add all these dfs together
df_results = pd.DataFrame(columns=['date','Original Values','Inverted linear','Inverted svm','Inverted NN'])
df_results['date'] = df_results_lin['Month']
df_results['Original Values'] = df_results_lin['Value']
df_results['Inverted linear'] = df_results_svm['Pred Value']
df_results['Inverted svm'] = df_results_svm['Pred Value']
df_results['Inverted NN'] = df_results_nn['Pred Value']


# plot results
df_results.plot()

# RMSE of this method
mse_lin = mean_squared_error(df_results['Original Values'].iloc[-split:],df_results['Inverted linear'].iloc[-split:])
mse_svm = mean_squared_error(df_results['Original Values'].iloc[-split:],df_results['Inverted svm'].iloc[-split:])
mse_nn = mean_squared_error(df_results['Original Values'].iloc[-split:],df_results['Inverted NN'].iloc[-split:])

print(f'RMSE linear: {mse_lin**0.5}')
print(f'RMSE svm: {mse_svm**0.5}')
print(f'RMSE nn: {mse_nn**0.5}')



Beginning: 13210 at date: 1968-01
Beginning: 13210 at date: 1968-01
Beginning: 13210 at date: 1968-01


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

RMSE linear: 4990.64906278671
RMSE svm: 4990.64906278671
RMSE nn: 6501.102431206107


In [57]:
tabulated_results_1.iloc[-split:,:]

Unnamed: 0,date,Value,Linear,SVM,NN,Naive
96,1968-01,-503.0,-321.515171,-644.203622,-3257.807265,-2406.0
97,1968-02,1041.0,202.424615,138.765856,-949.629372,-503.0
98,1968-03,5888.0,6729.53241,1412.146219,5725.079774,1041.0
99,1968-04,1586.0,2237.192766,1074.597488,2546.384563,5888.0
100,1968-05,4374.0,3346.299357,311.377972,3398.788392,1586.0
101,1968-06,-5015.0,-3403.287005,-493.038533,-3149.95965,4374.0
102,1968-07,-3060.0,-8110.423566,-1234.679551,-9016.849505,-5015.0
103,1968-08,-1302.0,1791.094182,-106.138636,707.692762,-3060.0
104,1968-09,-2337.0,-3131.75589,552.892975,-2433.893425,-1302.0
105,1968-10,6957.0,4720.996279,996.118135,4339.079598,-2337.0


In [58]:
df.iloc[-split:,:]

Unnamed: 0,Month,Sales,diff
96,1968-01,13210,-503.0
97,1968-02,14251,1041.0
98,1968-03,20139,5888.0
99,1968-04,21725,1586.0
100,1968-05,26099,4374.0
101,1968-06,21084,-5015.0
102,1968-07,18024,-3060.0
103,1968-08,16722,-1302.0
104,1968-09,14385,-2337.0
105,1968-10,21342,6957.0
