In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf

In [2]:
# Fetch the example time series straight from the course repository
# (this cell needs network access).
url = 'https://raw.githubusercontent.com/BI-DS/EBA-3530/main/Lecture_3/time_series.csv'
df = pd.read_csv(url, sep=',')

# Report how many observations (rows) were loaded.
print('The size of the timeseries is {}'.format(len(df)))

The size of the timeseries is 100


# Time series forecasting using AR models
Let's use a timeseries data set to do in-sample forecasting and out-of-sample forecasting. To that end, we use two different models:
\begin{align}
M1:& y_t = \beta_0 + \beta_1 y_{t-1} + \epsilon_t \\
M2:& y_t = \beta_0 + \beta_1 y_{t-1} + \beta_2 y_{t-2} + \epsilon_t
\end{align}

Let's see what the time series looks like!

In [None]:
# Two panels side by side: the raw series on the left and its
# autocorrelation function (first 30 lags) on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 4))
series_ax, acf_ax = axes

series_ax.plot(df['y'])
series_ax.set(title='Time series data', xlabel='Time units')

# NOTE(review): alpha=1 is kept as in the original; presumably it is meant to
# suppress the confidence band around the ACF -- confirm this is intended.
plot_acf(df['y'], ax=acf_ax, lags=30, alpha=1)
plt.show()

**Question:** What can you say about the data set? 

**Tips:** Think about the four components in timeseries data

## Differencing the timeseries

In [None]:
# First- and second-order differences of y, computed with Series.diff().
# Every differencing pass leaves a NaN at the start of the series,
# so those entries are dropped after each pass.
diff_1 = df['y'].diff().dropna()
diff_2 = diff_1.diff().dropna()

# Show both differenced series next to each other.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 4))
panels = zip(axes,
             (diff_1, diff_2),
             ('1st order differencing', '2nd order differencing'))
for ax, series, title in panels:
  ax.plot(series)
  ax.set_title(title)
plt.show()

**Question:** Which series looks more mean-reverting?

**Question:** Plot the ACF and PACF for both `diff_1` and `diff_2` and compare them.

## Fitting AR models (In-sample)

In [None]:
# Both models are fitted to the twice-differenced series. ARIMA is handed the
# underlying numpy array (via `.values`) instead of the pandas Series.
# NOTE(review): `statsmodels.tsa.arima_model.ARIMA` is the legacy implementation;
# newer statsmodels releases provide `statsmodels.tsa.arima.model.ARIMA` instead,
# with a different forecast/plot API -- confirm the statsmodels version in use.
endog = diff_2.values

# M1: AR(1), i.e. order (p, d, q) = (1, 0, 0)
ar1 = ARIMA(endog, order=(1, 0, 0)).fit()
print(ar1.summary())

# M2: AR(2), i.e. order (p, d, q) = (2, 0, 0)
ar2 = ARIMA(endog, order=(2, 0, 0)).fit()
print(ar2.summary())

**Question:** Calculate AIC and BIC using the formula that I showed in lecture 3.

**Tip:** Use the deviance formula from lecture 2, dropping the constant term.

### Plot in-sample fit

In [None]:
# Plot the fitted AR(2) model's predictions for the sample it was fitted on.
# NOTE(review): dynamic=False presumably makes every prediction use the observed
# lagged values (one-step-ahead) instead of earlier predictions -- confirm
# against the statsmodels plot_predict documentation.
ar2.plot_predict(dynamic=False)
plt.show()

**Question:** Comment on the blue and orange lines.

## Fitting AR models (Out-of-sample)

In [None]:
# Hold-out evaluation of the AR(2) model.
# Every entry of `threshold` is one train/test split point, given as a position
# in the twice-differenced series `diff_2`: observations before the position
# are used for fitting, the remaining ones are forecast and scored.
# With a single entry this is one hold-out split; add more entries to evaluate
# several splits (one figure is produced per entry).
threshold = [85]
for th in threshold:
  # Split by position (explicit .iloc). The training part is also kept as a
  # Series so its original time index is still available for plotting.
  train = diff_2.iloc[:th]
  y_tr = train.values
  y_te = diff_2.iloc[th:]

  # fit the model on the training part only
  # (this rebinds `ar2`, replacing the in-sample fit from the earlier cell)
  ar2 = ARIMA(y_tr, order=(2, 0, 0)).fit()

  # forecast as many steps ahead as there are held-out observations;
  # alpha=0.05 asks for the 95% interval around the forecast
  horizon = y_te.shape[0]
  fc, se, conf = ar2.forecast(horizon, alpha=0.05)

  # wrap the results in Series on the test index so they align with y_te
  forecast = pd.Series(fc, index=y_te.index)
  lower_bound = pd.Series(conf[:, 0], index=y_te.index)
  upper_bound = pd.Series(conf[:, 1], index=y_te.index)

  # calculate your favorite metric,
  # e.g. MSE as in slides or RMSE (the square root of the MSE)
  mse = np.mean((forecast - y_te)**2)
  rmse = mse**0.5

  # Plot. The training data is drawn against its own index labels: diff_2's
  # index does not start at 0 after differencing, so plotting the bare array
  # would shift the training curve relative to the test data and forecast.
  plt.figure(figsize=(12,5))
  plt.plot(train.index, y_tr, label='training')
  plt.plot(y_te, label='actual')
  plt.plot(forecast, label='forecast')
  plt.fill_between(lower_bound.index, lower_bound, upper_bound,
                   color='k', alpha=.15, label='95% confidence interval')
  plt.title('Forecast vs Actual\n mse: {0:.4f} and rmse: {1:.4f}'\
            .format(mse,rmse))
  plt.xlabel('Time units')
  plt.ylabel('2nd order difference of y')
  plt.legend(loc='upper left', fontsize=8)
  plt.show()

**Question:** Add more values to the `threshold` list (e.g. `threshold = [70, 75, 80, 85, 90]`) to run a K-fold cross-validation, for example with K=5.

**Question:** Comment on the difference between the in-sample and out-of-sample fit (the two plots produced by the code cells above).