# Time Series Modeling (ARIMA Methods) Assignment

In [0]:
import numpy as np
import pandas as pd
import plotly.express as px
from statsmodels.tsa.ar_model import AR
from statsmodels.tsa.arima_model import ARMA
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [0]:
# linechart
def ilinechart(df,x,y,groups=None, title=''):
    fig = px.line(df,x=x,y=y, color=groups, title=title, template='none').update(layout=dict(title=dict(x=0.5)))
    
    # for item in range(len(fig.data)):
    #     fig.data[item].update(name=fig.data[item]['name'].split('=')[1])
    
    fig.show();

### Import the Amazon stock price data set (AMZN_data.csv) and plot a line chart with the observed daily closing prices.

In [0]:
df=pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/Data%20Sets%20Time%20Series%20Analysis/Time%20Series%20-%20Day%204/AMZN_data.csv')

In [4]:
df.head()

Unnamed: 0,date,open,high,low,close,volume,Name
0,2013-02-08,261.4,265.25,260.555,261.95,3879078,AMZN
1,2013-02-11,263.2,263.25,256.6,257.21,3403403,AMZN
2,2013-02-12,259.19,260.16,257.0,258.7,2938660,AMZN
3,2013-02-13,261.53,269.96,260.3,269.47,5292996,AMZN
4,2013-02-14,267.37,270.65,265.4,269.24,3462780,AMZN


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 7 columns):
date      1259 non-null object
open      1259 non-null float64
high      1259 non-null float64
low       1259 non-null float64
close     1259 non-null float64
volume    1259 non-null int64
Name      1259 non-null object
dtypes: float64(4), int64(1), object(2)
memory usage: 69.0+ KB


In [0]:
df.date = pd.to_datetime(df.date)

### Run an Autoregression (AR) model on the series and add the results to a column in the dataframe.

In [0]:
model = AR(df.close).fit()
df['01_AR']=model.predict()

### Plot a multi-line chart comparing the AR model's results with the observed values.

In [0]:
melted = pd.melt(df,id_vars='date', value_vars=['01_AR','close'], var_name='Variable',value_name='Value')

In [9]:
ilinechart(melted, 'date','Value', groups='Variable', title='AR')

### Run a Moving Average (MA) model on the series and add the results to a column in the dataframe.

In [0]:
model = ARMA(df.close,order=(0,1)).fit()
df['02_MA']=model.predict()

In [0]:
melted = pd.melt(df,id_vars='date', value_vars=['01_AR','02_MA','close'], var_name='Variable',value_name='Value')

In [12]:
ilinechart(melted, 'date','Value', groups='Variable', title='MA')

### Print the model summary and add the model's results to the line chart with observations and the AR model results.

In [13]:
# plot above
print(model.summary())

                              ARMA Model Results                              
Dep. Variable:                  close   No. Observations:                 1259
Model:                     ARMA(0, 1)   Log Likelihood               -8047.769
Method:                       css-mle   S.D. of innovations            144.300
Date:                Thu, 09 Jan 2020   AIC                          16101.538
Time:                        16:48:59   BIC                          16116.952
Sample:                             0   HQIC                         16107.331
                                                                              
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
const         576.9791      8.063     71.560      0.000     561.176     592.782
ma.L1.close     0.9834      0.004    226.363      0.000       0.975       0.992
                                    Roots       

### Run an Autoregressive Moving Average (ARMA) model on the series and add the results to a column in the dataframe.

Hint: You will need to make the time series stationary in order to run the ARMA model on it. The most common way to do this is by differncing, or subtracting the previous observed value from the current one.

In [0]:
# Remove NA's
df.dropna(inplace=True)

In [15]:
df.isnull().sum()

date      0
open      0
high      0
low       0
close     0
volume    0
Name      0
01_AR     0
02_MA     0
dtype: int64

In [0]:
# Create a stationary dataset
df['diff_1'] = df.close.diff()

In [19]:
model = ARMA(df.diff_1, order=(0,1)).fit()
df['03_ARMA']=model.predict()
print(model.summary())

                              ARMA Model Results                              
Dep. Variable:                 diff_1   No. Observations:                 1235
Model:                     ARMA(0, 1)   Log Likelihood               -4657.714
Method:                       css-mle   S.D. of innovations             10.512
Date:                Thu, 09 Jan 2020   AIC                           9321.428
Time:                        16:49:18   BIC                           9336.784
Sample:                             0   HQIC                          9327.204
                                                                              
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.9322      0.297      3.143      0.002       0.351       1.514
ma.L1.diff_1    -0.0084      0.028     -0.302      0.762      -0.063       0.046
                                    Roots   


An unsupported index was provided and will be ignored when e.g. forecasting.



In [0]:
# de-difference
df['03_ARMA'] = df['03_ARMA'] + df.close.shift(1)

### Print the model summary and generate a multi-line chart that compares the ARMA model's results with those from the AR and MA models.

Hint: You will need to de-difference the model results by adding the previous observed values to them.

In [21]:
melted = pd.melt(df,id_vars='date', value_vars=['01_AR','02_MA','03_ARMA','close'], var_name='Variable',value_name='Value')
ilinechart(melted, 'date','Value', groups='Variable', title='ARMA, AR, MA Comparison')

                              ARMA Model Results                              
Dep. Variable:                 diff_1   No. Observations:                 1235
Model:                     ARMA(0, 1)   Log Likelihood               -4657.714
Method:                       css-mle   S.D. of innovations             10.512
Date:                Thu, 09 Jan 2020   AIC                           9321.428
Time:                        16:49:28   BIC                           9336.784
Sample:                             0   HQIC                          9327.204
                                                                              
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.9322      0.297      3.143      0.002       0.351       1.514
ma.L1.diff_1    -0.0084      0.028     -0.302      0.762      -0.063       0.046
                                    Roots   

### Run an Autoregressive Integrated Moving Average (ARIMA) model on the series and add the results to a column in the dataframe.

In [22]:
model = ARIMA(df.close, order=(3,1,1)).fit()
df['04_ARIMA'] = model.predict(typ='levels')


An unsupported index was provided and will be ignored when e.g. forecasting.


An unsupported index was provided and will be ignored when e.g. forecasting.



### Print the model summary and generate a multi-line chart that compares the ARIMA model's results to the observed values.

In [23]:
print(model.summary())

                             ARIMA Model Results                              
Dep. Variable:                D.close   No. Observations:                 1234
Model:                 ARIMA(3, 1, 1)   Log Likelihood               -4650.829
Method:                       css-mle   S.D. of innovations             10.485
Date:                Thu, 09 Jan 2020   AIC                           9313.658
Time:                        16:49:54   BIC                           9344.366
Sample:                             1   HQIC                          9325.209
                                                                              
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const             0.9378      0.281      3.342      0.001       0.388       1.488
ar.L1.D.close     0.2114      0.286      0.739      0.460      -0.349       0.772
ar.L2.D.close     0.0337      0.030     

### Run an Seasonal Autoregressive Integrated Moving Average (SARIMA) model on the series and add the results to a column in the dataframe.

In [29]:
model = SARIMAX(df.close,exog=None,order=(1,1,1), seasonal_order=(0,0,0,0)).fit()
df['05_SAR'] = model.predict(typ='levels')


An unsupported index was provided and will be ignored when e.g. forecasting.


Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.


Non-invertible starting MA parameters found. Using zeros as starting parameters.



### Print the model summary and generate a multi-line chart that compares the SARMA model's results with the observed values.

In [30]:
print(model.summary())

                           Statespace Model Results                           
Dep. Variable:                  close   No. Observations:                 1235
Model:               SARIMAX(1, 1, 1)   Log Likelihood               -4659.250
Date:                Thu, 09 Jan 2020   AIC                           9324.499
Time:                        16:51:27   BIC                           9339.853
Sample:                             0   HQIC                          9330.275
                               - 1235                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.0004     26.650  -1.59e-05      1.000     -52.234      52.233
ma.L1         -0.0004     26.657  -1.57e-05      1.000     -52.247      52.247
sigma2       111.4115      1.305     85.394      0.0

### Run an Seasonal Autoregressive Integrated Moving Average with Exogenous Factors (SARIMAX) model on the series and add the results to a column in the dataframe.

Use the daily Open prices as the exogenous factors.

In [32]:
model = SARIMAX(df.close,exog=df.volume,order=(1,1,1), seasonal_order=(1,1,1,1)).fit()
df['06_SARX'] = model.predict(typ='levels')
print(model.summary())


An unsupported index was provided and will be ignored when e.g. forecasting.



                                 Statespace Model Results                                
Dep. Variable:                             close   No. Observations:                 1235
Model:             SARIMAX(1, 1, 1)x(1, 1, 1, 1)   Log Likelihood               -4894.305
Date:                           Thu, 09 Jan 2020   AIC                           9800.609
Time:                                   16:52:00   BIC                           9831.312
Sample:                                        0   HQIC                          9812.159
                                          - 1235                                         
Covariance Type:                             opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
volume      9.591e-08    5.9e-08      1.627      0.104   -1.96e-08    2.11e-07
ar.L1         -0.0872      0.010     -8.983


Maximum Likelihood optimization failed to converge. Check mle_retvals



### Print the model summary and generate a multi-line chart that compares the SARMAX model's results with the SARIMA model's values.

In [33]:
print(model.summary())
melted = pd.melt(df,id_vars='date', value_vars=['05_SAR','06_SARX','close'], var_name='Variable',value_name='Value')
ilinechart(melted, 'date','Value', groups='Variable', title='SARIMA v SARIMAX Comparison')

                                 Statespace Model Results                                
Dep. Variable:                             close   No. Observations:                 1235
Model:             SARIMAX(1, 1, 1)x(1, 1, 1, 1)   Log Likelihood               -4894.305
Date:                           Thu, 09 Jan 2020   AIC                           9800.609
Time:                                   16:52:19   BIC                           9831.312
Sample:                                        0   HQIC                          9812.159
                                          - 1235                                         
Covariance Type:                             opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
volume      9.591e-08    5.9e-08      1.627      0.104   -1.96e-08    2.11e-07
ar.L1         -0.0872      0.010     -8.983

### Evaluate the performance of all the models and compare to each other, using Mean Absolute Error and Root Mean Squared Error as your evaluation metrics.

In [36]:
models = ['01_AR','02_MA','03_ARMA','04_ARIMA','05_SAR','06_SARX']

for model in models:
    diff = df['close'] -df[model]
    mae = diff.abs().mean()
    rmse = np.sqrt(np.mean(diff**2))
    print('Model:',model,'-MAE:',mae,'|RMSE:',rmse)

Model: 01_AR -MAE: 6.633815444101258 |RMSE: 10.39914721824394
Model: 02_MA -MAE: 123.70680543825279 |RMSE: 144.03365094764152
Model: 03_ARMA -MAE: 6.648952205097708 |RMSE: 10.515058519899386
Model: 04_ARIMA -MAE: 6.664026313282706 |RMSE: 10.485154396958935
Model: 05_SAR -MAE: 6.870197274250821 |RMSE: 12.91766811183753
Model: 06_SARX -MAE: 8.817726841490657 |RMSE: 15.231662879553252
