# Time Series Modeling (ARIMA Methods) Assignment

In [1]:
import pandas as pd
import plotly.express as px
from statsmodels.tsa.ar_model import AR
from statsmodels.tsa.arima_model import ARMA
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX


pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.



### Import the Amazon stock price data set (AMZN_data.csv) and plot a line chart with the observed daily closing prices.

In [140]:
data = pd.read_csv("https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/Data%20Sets%20Time%20Series%20Analysis/Time%20Series%20-%20Day%204/AMZN_data.csv")
data.head()

Unnamed: 0,date,open,high,low,close,volume,Name
0,2013-02-08,261.4,265.25,260.555,261.95,3879078,AMZN
1,2013-02-11,263.2,263.25,256.6,257.21,3403403,AMZN
2,2013-02-12,259.19,260.16,257.0,258.7,2938660,AMZN
3,2013-02-13,261.53,269.96,260.3,269.47,5292996,AMZN
4,2013-02-14,267.37,270.65,265.4,269.24,3462780,AMZN


In [141]:
data.date = pd.to_datetime(data.date)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    1259 non-null   datetime64[ns]
 1   open    1259 non-null   float64       
 2   high    1259 non-null   float64       
 3   low     1259 non-null   float64       
 4   close   1259 non-null   float64       
 5   volume  1259 non-null   int64         
 6   Name    1259 non-null   object        
dtypes: datetime64[ns](1), float64(4), int64(1), object(1)
memory usage: 69.0+ KB


In [142]:
data = data[["date", "close", "open"]]
data.columns = ["Date", "Observed", "Open"]

In [143]:
data.head()

Unnamed: 0,Date,Observed,Open
0,2013-02-08,261.95,261.4
1,2013-02-11,257.21,263.2
2,2013-02-12,258.7,259.19
3,2013-02-13,269.47,261.53
4,2013-02-14,269.24,267.37


In [144]:
def ilinechart(df, x, y, groups=None, title=''):
  fig = px.line(df, x=x, y=y, color=groups, title=title, 
                template='none').update(layout=dict(title=dict(x=0.5)))

  fig.show()

In [145]:
ilinechart(data, "Date", "Observed", title="Observed Daily Closing Prices")

### Run an Autoregression (AR) model on the series and add the results to a column in the dataframe.

In [146]:
model = AR(data.Observed).fit()
data["AR"] = model.predict()

In [147]:
model.k_ar

23

### Plot a multi-line chart comparing the AR model's results with the observed values.

In [148]:
melted = pd.melt(
    data,
    id_vars="Date",
    value_vars=["Observed", "AR"],
    var_name="Variable",
    value_name="Value"
)

ilinechart(melted, "Date", "Value", groups="Variable", title="Observed v. AR")

### Run a Moving Average (MA) model on the series and add the results to a column in the dataframe.

In [149]:
model = ARMA(data.Observed, order=(0,1)).fit()
data["MA"] = model.predict()

### Print the model summary and add the model's results to the line chart with observations and the AR model results.

In [150]:
print(model.summary())

melted = pd.melt(
    data,
    id_vars="Date",
    value_vars=["Observed", "AR", "MA"],
    var_name="Variable",
    value_name="Value"
)

ilinechart(melted, "Date", "Value", groups="Variable", title="Observed v. AR v. MA")

                              ARMA Model Results                              
Dep. Variable:               Observed   No. Observations:                 1259
Model:                     ARMA(0, 1)   Log Likelihood               -8047.769
Method:                       css-mle   S.D. of innovations            144.300
Date:                Thu, 17 Dec 2020   AIC                          16101.538
Time:                        17:24:04   BIC                          16116.952
Sample:                             0   HQIC                         16107.331
                                                                              
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const            576.9791      8.063     71.560      0.000     561.176     592.782
ma.L1.Observed     0.9834      0.004    226.363      0.000       0.975       0.992
                                    

### Run an Autoregressive Moving Average (ARMA) model on the series and add the results to a column in the dataframe.

Hint: You will need to make the time series stationary in order to run the ARMA model on it. The most common way to do this is by differncing, or subtracting the previous observed value from the current one.

In [151]:
data["Observed_Diff"] = data.Observed - data.Observed.shift()

In [152]:
data.Observed_Diff.isnull().mean()

0.0007942811755361397

In [153]:
model = ARMA(data.Observed, order=(0,1)).fit()
data["ARMA"] = model.predict()

### Print the model summary and generate a multi-line chart that compares the ARMA model's results with those from the AR and MA models.

Hint: You will need to de-difference the model results by adding the previous observed values to them.

In [162]:
print(model.summary())

melted = pd.melt(
    data[2:],
    id_vars="Date",
    value_vars=["Observed", "AR", "MA", "ARMA"],
    var_name="Variable",
    value_name="Value"
)

ilinechart(melted, "Date", "Value", groups="Variable", title="Observed v. Models")

                           Statespace Model Results                           
Dep. Variable:               Observed   No. Observations:                 1259
Model:               SARIMAX(0, 1, 1)   Log Likelihood               -4452.504
Date:                Thu, 17 Dec 2020   AIC                           8911.009
Time:                        17:25:30   BIC                           8926.420
Sample:                             0   HQIC                          8916.801
                               - 1259                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Open           0.9996      0.001   1677.740      0.000       0.998       1.001
ma.L1         -0.9999      0.111     -8.979      0.000      -1.218      -0.782
sigma2        69.0660      7.722      8.944      0.0

### Run an Autoregressive Integrated Moving Average (ARIMA) model on the series and add the results to a column in the dataframe.

In [155]:
model = ARIMA(data.Observed, order=(0,1,1)).fit()
data["ARIMA"] = model.predict(typ="levels")

### Print the model summary and generate a multi-line chart that compares the ARIMA model's results to the observed values.

In [163]:
print(model.summary())

melted = pd.melt(
    data[2:],
    id_vars="Date",
    value_vars=["Observed", "AR", "MA", "ARMA", "ARIMA"],
    var_name="Variable",
    value_name="Value"
)

ilinechart(melted, "Date", "Value", groups="Variable", title="Observed v. Models")

                           Statespace Model Results                           
Dep. Variable:               Observed   No. Observations:                 1259
Model:               SARIMAX(0, 1, 1)   Log Likelihood               -4452.504
Date:                Thu, 17 Dec 2020   AIC                           8911.009
Time:                        17:25:37   BIC                           8926.420
Sample:                             0   HQIC                          8916.801
                               - 1259                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Open           0.9996      0.001   1677.740      0.000       0.998       1.001
ma.L1         -0.9999      0.111     -8.979      0.000      -1.218      -0.782
sigma2        69.0660      7.722      8.944      0.0

### Run an Seasonal Autoregressive Integrated Moving Average (SARIMA) model on the series and add the results to a column in the dataframe.

In [157]:
model = SARIMAX(data.Observed, order=(0,1,1), seasonal_order=(1,1,1,1)).fit()
data["SARIMA"] = model.predict()

### Print the model summary and generate a multi-line chart that compares the SARMA model's results with the observed values.

In [164]:
print(model.summary())

melted = pd.melt(
    data[2:],
    id_vars="Date",
    value_vars=["Observed", "AR", "MA", "ARMA", "ARIMA", "SARIMA"],
    var_name="Variable",
    value_name="Value"
)

ilinechart(melted, "Date", "Value", groups="Variable", title="Observed v. Models")

                           Statespace Model Results                           
Dep. Variable:               Observed   No. Observations:                 1259
Model:               SARIMAX(0, 1, 1)   Log Likelihood               -4452.504
Date:                Thu, 17 Dec 2020   AIC                           8911.009
Time:                        17:25:41   BIC                           8926.420
Sample:                             0   HQIC                          8916.801
                               - 1259                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Open           0.9996      0.001   1677.740      0.000       0.998       1.001
ma.L1         -0.9999      0.111     -8.979      0.000      -1.218      -0.782
sigma2        69.0660      7.722      8.944      0.0

### Run an Seasonal Autoregressive Integrated Moving Average with Exogenous Factors (SARIMAX) model on the series and add the results to a column in the dataframe.

Use the daily Open prices as the exogenous factors.

In [161]:
model = SARIMAX(data.Observed, exog=data.Open, order=(0,1,1), seasonal_order=(0,0,0,0)).fit()
data["SARIMAX"] = model.predict()


Maximum Likelihood optimization failed to converge. Check mle_retvals



### Print the model summary and generate a multi-line chart that compares the SARMAX model's results with the SARIMA model's values.

In [165]:
print(model.summary())

melted = pd.melt(
    data[2:],
    id_vars="Date",
    value_vars=["Observed", "AR", "MA", "ARMA", "ARIMA", "SARIMA", "SARIMAX"],
    var_name="Variable",
    value_name="Value"
)

ilinechart(melted, "Date", "Value", groups="Variable", title="Observed v. Models")

                           Statespace Model Results                           
Dep. Variable:               Observed   No. Observations:                 1259
Model:               SARIMAX(0, 1, 1)   Log Likelihood               -4452.504
Date:                Thu, 17 Dec 2020   AIC                           8911.009
Time:                        17:26:04   BIC                           8926.420
Sample:                             0   HQIC                          8916.801
                               - 1259                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Open           0.9996      0.001   1677.740      0.000       0.998       1.001
ma.L1         -0.9999      0.111     -8.979      0.000      -1.218      -0.782
sigma2        69.0660      7.722      8.944      0.0

### Evaluate the performance of all the models and compare to each other, using Mean Absolute Error and Root Mean Squared Error as your evaluation metrics.

In [170]:
import numpy as np
models = ['AR', 'MA', 'ARMA', 'ARIMA', 'SARIMA', 'SARIMAX']

for model in models:
    diff = data.Observed - data[model]
    mae = diff.abs().mean()
    rmse = np.sqrt(np.mean(diff**2))
    print(f'Model {model} - MAE: {mae}   RMSE: {rmse}')

Model AR - MAE: 6.636754781948539   RMSE: 10.39904085160622
Model MA - MAE: 124.57766603584766   RMSE: 144.6307251244663
Model ARMA - MAE: 124.57766603584766   RMSE: 144.6307251244663
Model ARIMA - MAE: 6.58412908722416   RMSE: 10.434248186807165
Model SARIMA - MAE: 6.88767938451345   RMSE: 13.339356956942417
Model SARIMAX - MAE: 5.552508091187803   RMSE: 8.317313038820974


Best model is SARIMAX due to having the lower mean absolute error and root mean square error.