# Time Series Forecasting Assignment

In [1]:
import pandas as pd
import plotly.express as px
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima_model import ARMA
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from fbprophet import Prophet


pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.



### Import the Microsoft stock price data set (MSFT_data.csv) into a Pandas dataframe.

In [14]:
data = pd.read_csv("https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/Data%20Sets%20Time%20Series%20Analysis/Time%20Series%20-%20Day%204/MSFT_data.csv")

In [15]:
data.head()

Unnamed: 0,date,open,high,low,close,volume,Name
0,2013-02-08,27.35,27.71,27.31,27.55,33318306,MSFT
1,2013-02-11,27.65,27.92,27.5,27.86,32247549,MSFT
2,2013-02-12,27.88,28.0,27.75,27.88,35990829,MSFT
3,2013-02-13,27.93,28.11,27.88,28.03,41715530,MSFT
4,2013-02-14,27.92,28.06,27.87,28.04,32663174,MSFT


In [16]:
data.date = pd.to_datetime(data.date)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    1259 non-null   datetime64[ns]
 1   open    1259 non-null   float64       
 2   high    1259 non-null   float64       
 3   low     1259 non-null   float64       
 4   close   1259 non-null   float64       
 5   volume  1259 non-null   int64         
 6   Name    1259 non-null   object        
dtypes: datetime64[ns](1), float64(4), int64(1), object(1)
memory usage: 69.0+ KB


### Generate a line chart showing the observed values (closing prices).

In [17]:
data = data[["date", "close"]]

In [18]:
data.columns = ["Date", "Close"]

In [19]:
def ilinechart(df, x, y, groups=None, title=''):
    fig = px.line(df, x=x, y=y, color=groups, title=title,
                  template="none").update(layout=dict(title=dict(x=0.5)))

    fig.show()

In [20]:
ilinechart(data, "Date", "Close", title="Microsoft Corporation Daily Closing Prices")

### Decompose the time series and check it for stationarity. If the data is not stationary, difference the observations and store the results in a new Diff column.

In [107]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.2, shuffle=False)

In [108]:
series = train.set_index("Date")
dec = sm.tsa.seasonal_decompose(series, model="additive", freq=252)

trend = dec.trend.reset_index()
seas = dec.seasonal.reset_index()
resid = dec.resid.reset_index()

merged = train.merge(trend, on="Date")
merged = merged.merge(seas, on="Date")
merged = merged.merge(resid, on="Date")

merged.columns = ["Date", "Close", "Trend", "Seasonality", "Residuals"]
merged["T+S"] = merged.Trend + merged.Seasonality
merged.head()

Unnamed: 0,Date,Close,Trend,Seasonality,Residuals,T+S
0,2013-02-08,27.55,,-1.74112,,
1,2013-02-11,27.86,,-1.729685,,
2,2013-02-12,27.88,,-1.158508,,
3,2013-02-13,28.03,,-0.689222,,
4,2013-02-14,28.04,,-0.37323,,


In [109]:
merged["Diff"] = train.Close - train.Close.shift(1)
merged.Diff.fillna(method="bfill", inplace=True)

In [110]:
melted = pd.melt(
    merged,
    id_vars="Date",
    var_name="Variable",
    value_name="Value",
    value_vars=["Close", "Trend", "Diff", "Seasonality", "Residuals", "T+S"]
)

ilinechart(melted, "Date", "Value", groups="Variable", title="Observed v. Components")

### Forecast the time series 60 days into the future using double and triple exponential smoothing models.

In [111]:
model = sm.tsa.ExponentialSmoothing(merged.Close, trend="additive").fit()
DExp = model.forecast(60)

In [112]:
model = sm.tsa.ExponentialSmoothing(
    merged.Close, trend="additive", 
    seasonal="additive", seasonal_periods=4).fit()
TExp = model.forecast(60)


Optimization failed to converge. Check mle_retvals.



### Forecast the time series 60 days into the future using ARMA, ARIMA, and SARIMA models.

In [113]:
def forecast(data, field, model, periods):
    model_results = model.forecast(periods)[0]
    current_value = data[field].iloc[-1]
    forecasts = []
    
    for result in model_results:
        forecast = current_value + result
        forecasts.append(forecast)
        current_value = forecast
    
    forecasts = pd.Series(forecasts)
    forecasts.index = [x + periods 
                       for x in list(data[field].tail(periods).index)]
    
    return forecasts

In [114]:
model = ARMA(merged['Diff'], order=(2, 1)).fit()
arma_forecasts = forecast(merged, 'Close', model, 60)
arma_forecasts.head()

1007    63.477285
1008    63.534361
1009    63.588353
1010    63.639193
1011    63.687405
dtype: float64

In [115]:
model = ARIMA(merged['Diff'], order=(2, 1, 1)).fit()
arima_forecasts = forecast(merged, 'Close', model, 60)
arima_forecasts.head()

1007    63.465669
1008    63.505787
1009    63.543302
1010    63.580744
1011    63.618215
dtype: float64

In [116]:
model = SARIMAX(merged.Close, order=(1,1,1), seasonal_order=(1,1,1,1)).fit()
sarima_forecasts = model.forecast(60)

### Forecast the time series 60 days into the future using the Facebook Prophet model.

In [117]:
series = merged[["Date", "Close"]]
series.columns = ["ds", "y"]

proph = Prophet()
proph.fit(series)

future = proph.make_future_dataframe(60)

proph_results = proph.predict(future)
proph_results

INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,weekly,weekly_lower,weekly_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2013-02-08,27.385937,24.893682,29.088396,27.385937,27.385937,-0.342450,-0.342450,-0.342450,0.016252,0.016252,0.016252,-0.358702,-0.358702,-0.358702,0.0,0.0,0.0,27.043487
1,2013-02-11,27.555372,25.000230,29.277228,27.555372,27.555372,-0.365390,-0.365390,-0.365390,0.010777,0.010777,0.010777,-0.376167,-0.376167,-0.376167,0.0,0.0,0.0,27.189982
2,2013-02-12,27.611850,25.195909,29.183023,27.611850,27.611850,-0.425238,-0.425238,-0.425238,-0.046902,-0.046902,-0.046902,-0.378336,-0.378336,-0.378336,0.0,0.0,0.0,27.186612
3,2013-02-13,27.668328,25.186094,29.330456,27.668328,27.668328,-0.394591,-0.394591,-0.394591,-0.014927,-0.014927,-0.014927,-0.379665,-0.379665,-0.379665,0.0,0.0,0.0,27.273736
4,2013-02-14,27.724806,25.239417,29.341234,27.724806,27.724806,-0.414436,-0.414436,-0.414436,-0.033711,-0.033711,-0.033711,-0.380724,-0.380724,-0.380724,0.0,0.0,0.0,27.310370
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1062,2017-04-04,63.816311,61.105919,65.282544,63.397164,64.180640,-0.538231,-0.538231,-0.538231,-0.046902,-0.046902,-0.046902,-0.491329,-0.491329,-0.491329,0.0,0.0,0.0,63.278080
1063,2017-04-05,63.847842,61.159803,65.464514,63.418762,64.224697,-0.515010,-0.515010,-0.515010,-0.014927,-0.014927,-0.014927,-0.500084,-0.500084,-0.500084,0.0,0.0,0.0,63.332832
1064,2017-04-06,63.879374,61.298677,65.471808,63.440406,64.266768,-0.540893,-0.540893,-0.540893,-0.033711,-0.033711,-0.033711,-0.507182,-0.507182,-0.507182,0.0,0.0,0.0,63.338480
1065,2017-04-07,63.910905,61.139170,65.408127,63.454278,64.307236,-0.495785,-0.495785,-0.495785,0.016252,0.016252,0.016252,-0.512037,-0.512037,-0.512037,0.0,0.0,0.0,63.415120


In [122]:
proph_forecast = proph_results.iloc[-60:]["yhat"]
proph_forecast.tail()

1062    63.278080
1063    63.332832
1064    63.338480
1065    63.415120
1066    63.462584
Name: yhat, dtype: float64

### Combine the observed values and all the forecasts into a single data frame and generate a line chart to visually compare the different models.

In [133]:
future_dates = pd.DataFrame(
    {
        "Date": pd.date_range(
            start=train.Date.iloc[-1], periods=61, freq="D", closed="right"
        )
    }
)

forecast_df = pd.concat([pd.DataFrame(test.Date.iloc[:60]), future_dates], ignore_index=True)
forecast_df = forecast_df.merge(test[["Date", "Close"]], on="Date", how="left")
forecast_df.dropna(inplace=True)

In [134]:
forecast_df

Unnamed: 0,Date,Close
0,2017-02-08,63.34
1,2017-02-09,64.06
2,2017-02-10,64.00
3,2017-02-13,64.72
4,2017-02-14,64.57
...,...,...
114,2017-04-03,65.55
115,2017-04-04,65.73
116,2017-04-05,65.56
117,2017-04-06,65.73


In [140]:
DExp.reset_index().drop("index", axis=1)

Unnamed: 0,0
0,63.465666
1,63.501332
2,63.536998
3,63.572664
4,63.60833
5,63.643996
6,63.679662
7,63.715328
8,63.750994
9,63.786661


In [142]:
forecast_df["Double_Exp"] = DExp.reset_index().drop("index", axis=1)
forecast_df["Triple_Exp"] = TExp.reset_index().drop("index", axis=1)
forecast_df["ARMA"] = arma_forecasts.reset_index().drop("index", axis=1)
forecast_df["ARIMA"] = arima_forecasts.reset_index().drop("index", axis=1)
forecast_df["SARIMA"] = sarima_forecasts.reset_index().drop("index", axis=1)
forecast_df["Prophet"] = proph_forecast.reset_index().drop("index", axis=1)
forecast_df.head()

Unnamed: 0,Date,Close,Double_Exp,Triple_Exp,ARMA,ARIMA,SARIMA,Prophet
0,2017-02-08,63.34,63.465666,63.52849,63.477285,63.465669,63.522679,61.708451
1,2017-02-09,64.06,63.501332,63.549689,63.534361,63.505787,63.647906,61.71296
2,2017-02-10,64.0,63.536998,63.541979,63.588353,63.543302,63.747093,61.788827
3,2017-02-13,64.72,63.572664,63.563368,63.639193,63.580744,63.818668,61.834762
4,2017-02-14,64.57,63.60833,63.649058,63.687405,63.618215,63.871644,61.864125


In [144]:
melted = pd.melt(
    forecast_df,
    id_vars="Date",
    value_vars=list(forecast_df.columns)[1:],
    var_name="Variable",
    value_name="Value"
)
ilinechart(melted, "Date", "Value", groups="Variable", title="60-Day Forecast from Feb. 8, 2017 v. Actual Observations")

In [150]:
import numpy as np
models = list(forecast_df.columns)[2:]

for model in models:
    diff = forecast_df.Close - forecast_df[model]
    mae = diff.abs().mean()
    rmse = np.sqrt(np.mean(diff**2))
    print(f'Model {model} - MAE: {mae}   RMSE: {rmse}')

Model Double_Exp - MAE: 0.9346272707275513   RMSE: 1.3476414950248188
Model Triple_Exp - MAE: 1.0530052235859586   RMSE: 1.478449323300785
Model ARMA - MAE: 0.8245947588973603   RMSE: 1.2564071447991079
Model ARIMA - MAE: 0.8859050451974255   RMSE: 1.2962093756210493
Model SARIMA - MAE: 0.8133056807194119   RMSE: 1.2844774829884298
Model Prophet - MAE: 2.906048435864997   RMSE: 3.0853430116736043


The best model for this forecasting is SARIMA with the lowest MAE and RMSE's.