# BIG DATA ANALYTICS PROGRAMMING : AR model

---
References
- https://github.com/ritvikmath/Time-Series-Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
register_matplotlib_converters()

from statsmodels.tsa.ar_model import AutoReg

from statsmodels.tsa.arima.model import ARIMA

from datetime import datetime, timedelta

## 데이터 로드

In [None]:
# Load the ice cream production CSV into a DataFrame
df_ice_cream = pd.read_csv(filepath_or_buffer='data/ice_cream.csv')

In [None]:
# Preview the first rows of the freshly loaded data
df_ice_cream.head()

In [None]:
# Replace the raw column headers with readable names (modifies the frame in place)
readable_names = {'DATE': 'date', 'IPN31152N': 'production'}
df_ice_cream.rename(columns=readable_names, inplace=True)

In [None]:
# Parse the date column into pandas datetime values
df_ice_cream['date'] = pd.to_datetime(df_ice_cream['date'])

In [None]:
# Make the date column the index of the frame (in place)
df_ice_cream.set_index(keys='date', inplace=True)

In [None]:
# Keep only the rows dated 2010-01-01 or later
start_date = pd.Timestamp('2010-01-01')
df_ice_cream = df_ice_cream.loc[start_date:]

In [None]:
# Show the first rows after restricting the data to 2010 onwards
df_ice_cream.head()

In [None]:
# Preview the first rows again (the frame has not changed since the previous cell)
df_ice_cream.head()

In [None]:
# Inspect the index, which was set to the date column earlier
df_ice_cream.index

In [None]:
# Ask pandas which frequency it infers from the date index
pd.infer_freq(df_ice_cream.index)

In [None]:
# Attach an explicit frequency to the date index, using the one pandas infers.
inferred_freq = pd.infer_freq(df_ice_cream.index)
if inferred_freq is None:
    # infer_freq returns None when the dates are not regularly spaced; asfreq
    # cannot reindex to an unknown frequency, so fail with a clear message
    # instead of passing None through.
    raise ValueError('Could not infer a regular frequency from the date index')
df_ice_cream = df_ice_cream.asfreq(inferred_freq)

In [None]:
# Plot the production series with a faint dashed line at each January 1st
# from 2011 through 2020
plt.figure(figsize=(10, 4))
plt.plot(df_ice_cream['production'])
plt.ylabel('Production', fontsize=16)
plt.title('Ice Cream Production over Time', fontsize=20)
for year in range(2011, 2021):
    new_year = pd.Timestamp(year=year, month=1, day=1)
    plt.axvline(new_year, color='k', linestyle='--', alpha=0.2)

## ACF, PACF 계산
 - ACF와 PACF를 확인한 뒤, AR, MA 모델 중 적합한 것을 선택하고 차수(p, q)를 결정
 - ACF가 시차 q 이후 급격히 0에 가까워진다(절단) = MA(q)
 - PACF가 시차 p 이후 급격히 0에 가까워진다(절단) = AR(p)

In [None]:
# Autocorrelation plot of the production series, requesting lags=100
acf_plot = plot_acf(df_ice_cream['production'], lags=100)

In [None]:
# Partial autocorrelation plot of the production series (default lags)
pacf_plot = plot_pacf(df_ice_cream['production'])

PACF를 보면 시차(lag) 3 이후에 값이 급격히 떨어지므로, AR 차수 p는 3으로 결정

In [None]:
# Split the production series chronologically: everything up to and including
# train_end is training data; the test set starts the day after train_end and
# runs through test_end (label slices are inclusive on both ends).
train_end = datetime(2016, 12, 1)
test_end = datetime(2019, 12, 1)
train_data = df_ice_cream['production'].loc[:train_end]
test_data = df_ice_cream['production'].loc[train_end + timedelta(days=1):test_end]

In [None]:
# order=(p, d, q): p = AR order, d = number of differences, q = MA order.
# (3, 0, 0) is therefore a pure AR(3) model on the undifferenced series.
model = ARIMA(endog=train_data, order=(3, 0, 0))

In [None]:
# Fit the model that was constructed on the training data
model_fit = model.fit()

In [None]:
# Display the summary of the fitted model
model_fit.summary()

In [None]:
# The forecast window spans the first through the last timestamp of the test set
pred_start_date, pred_end_date = test_data.index[0], test_data.index[-1]

In [None]:
# Predict the whole test window with the fitted model, then take the
# error as actual minus predicted
predictions = model_fit.predict(start=pred_start_date, end=pred_end_date)
residuals = test_data.sub(predictions)

In [None]:
# Plot the forecast errors over the test window
plt.figure(figsize=(10, 4))
plt.plot(residuals)
plt.ylabel('Error', fontsize=16)
plt.title('Residuals from AR Model', fontsize=20)

In [None]:
# Compare the actual test data with the model's predictions
plt.figure(figsize=(10, 4))
plt.plot(test_data)
plt.plot(predictions)
plt.legend(('Data', 'Predictions'), fontsize=16)
plt.title('Ice Cream Production over Time', fontsize=20)
plt.ylabel('Production', fontsize=16)
# Draw a dashed marker at January 1st of every year present in the test set.
# The years are taken from the test index instead of a hard-coded 2019-2020
# range, which did not match the test window (after 2016-12-01 through
# 2019-12-01): it skipped the earlier year boundaries and put a marker
# beyond the end of the plotted data.
for year in sorted(set(test_data.index.year)):
    plt.axvline(pd.to_datetime(str(year)+'-01-01'), color='k', linestyle='--', alpha=0.2)

In [None]:
# Mean absolute percentage error (as a fraction), rounded to 4 decimals
mape = np.mean(np.abs(residuals / test_data))
print('MAPE', round(mape, 4))

In [None]:
# Root mean squared error of the forecasts
mean_squared_error = np.mean(residuals.pow(2))
print('RMSE', np.sqrt(mean_squared_error))

## Rolling Forecast Origin
- 예측하는 지점을 일정하게 유지하면 좋지 않을까? (예: 마지막 데이터 이후로 3년)
<a href="https://www.researchgate.net/figure/Forecast-on-a-rolling-origin-cross-validation_fig1_326835034"><img style="width:400px" src="https://www.researchgate.net/profile/Alireza_Shojaei2/publication/326835034/figure/fig1/AS:669569765097494@1536649280728/Forecast-on-a-rolling-origin-cross-validation.ppm" alt="Forecast on a rolling origin cross-validation."/></a>

In [None]:
# Rolling forecast origin: for every timestamp in the test set, refit an AR(3)
# model on all data strictly before that timestamp and predict only that one
# point.
#
# The forecasts are collected in a list and turned into a float Series indexed
# exactly like test_data. This replaces growing a dtype-less empty pd.Series()
# one row at a time through .loc, which relies on dtype/index inference (and
# warns on some pandas versions) and reallocates on every iteration.
#
# Loop-local names (rolling_train, rolling_fit) are used so that the
# train_data / model / model_fit objects from the static fit above are not
# silently overwritten by the last rolling iteration.
rolling_forecasts = []
for end_date in test_data.index:
    history_end = end_date - timedelta(days=1)
    print(history_end)  # progress: last day included in this iteration's training data
    rolling_train = df_ice_cream.production[:history_end]
    rolling_fit = ARIMA(rolling_train, order=(3, 0, 0)).fit()
    # Request exactly one prediction, for end_date itself
    one_step = rolling_fit.predict(start=end_date, end=end_date)
    rolling_forecasts.append(one_step.loc[end_date])
predictions_rolling = pd.Series(rolling_forecasts, index=test_data.index, dtype=float)

In [None]:
# Rolling-forecast error: actual minus predicted
residuals_rolling = test_data.sub(predictions_rolling)

In [None]:
# Plot the rolling-forecast errors over the test window
plt.figure(figsize=(10, 4))
plt.plot(residuals_rolling)
plt.ylabel('Error', fontsize=16)
plt.title('Residuals from AR Model', fontsize=20)

In [None]:
# Compare the actual test data with the rolling one-step predictions
plt.figure(figsize=(10, 4))
plt.plot(test_data)
plt.plot(predictions_rolling)
plt.legend(('Data', 'Predictions'), fontsize=16)
plt.title('Ice Cream Production over Time', fontsize=20)
plt.ylabel('Production', fontsize=16)
# Draw a dashed marker at January 1st of every year present in the test set.
# The years are taken from the test index instead of a hard-coded 2019-2020
# range, which did not match the test window (after 2016-12-01 through
# 2019-12-01): it skipped the earlier year boundaries and put a marker
# beyond the end of the plotted data.
for year in sorted(set(test_data.index.year)):
    plt.axvline(pd.to_datetime(str(year)+'-01-01'), color='k', linestyle='--', alpha=0.2)

In [None]:
# Error metrics for the rolling forecast: MAPE (as a fraction, 4 decimals) and RMSE
mape_rolling = np.mean(np.abs(residuals_rolling / test_data))
print('MAPE', round(mape_rolling, 4))
mse_rolling = np.mean(residuals_rolling.pow(2))
print('RMSE', np.sqrt(mse_rolling))