# Time Series Forecasting Assignment

In [1]:
import pandas as pd
import plotly.express as px
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima_model import ARMA
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from fbprophet import Prophet


pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.



### Import the Microsoft stock price data set (MSFT_data.csv) into a Pandas dataframe.

In [2]:
df = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/Data%20Sets%20Time%20Series%20Analysis/Time%20Series%20-%20Day%204/MSFT_data.csv')

In [3]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    1259 non-null   object 
 1   open    1259 non-null   float64
 2   high    1259 non-null   float64
 3   low     1259 non-null   float64
 4   close   1259 non-null   float64
 5   volume  1259 non-null   int64  
 6   Name    1259 non-null   object 
dtypes: float64(4), int64(1), object(2)
memory usage: 69.0+ KB


Unnamed: 0,date,open,high,low,close,volume,Name
0,2013-02-08,27.35,27.71,27.31,27.55,33318306,MSFT
1,2013-02-11,27.65,27.92,27.5,27.86,32247549,MSFT
2,2013-02-12,27.88,28.0,27.75,27.88,35990829,MSFT
3,2013-02-13,27.93,28.11,27.88,28.03,41715530,MSFT
4,2013-02-14,27.92,28.06,27.87,28.04,32663174,MSFT


### Generate a line chart showing the observed values (closing prices).

In [4]:
df['date'] = pd.to_datetime(df['date'])
df.columns = ['Date', 'Open', 'High', 'Low', 'Observed', 'Volume', 'Ticker']
df.head()

Unnamed: 0,Date,Open,High,Low,Observed,Volume,Ticker
0,2013-02-08,27.35,27.71,27.31,27.55,33318306,MSFT
1,2013-02-11,27.65,27.92,27.5,27.86,32247549,MSFT
2,2013-02-12,27.88,28.0,27.75,27.88,35990829,MSFT
3,2013-02-13,27.93,28.11,27.88,28.03,41715530,MSFT
4,2013-02-14,27.92,28.06,27.87,28.04,32663174,MSFT


In [5]:
def ilinechart(df, x, y, groups=None, title=''):
    fig = px.line(df, x=x, y=y, color=groups, title=title, 
                  template='none').update(layout=dict(title=dict(x=0.5)))
    
    fig.show()

In [6]:
ilinechart(df, 'Date', 'Observed', title='Observations Over Time')

### Decompose the time series and check it for stationarity. If the data is not stationary, difference the observations and store the results in a new Diff column.

In [7]:
df = df[['Date', 'Observed']]
df.head()

Unnamed: 0,Date,Observed
0,2013-02-08,27.55
1,2013-02-11,27.86
2,2013-02-12,27.88
3,2013-02-13,28.03
4,2013-02-14,28.04


In [9]:
series = df.set_index('Date')
decomposition = sm.tsa.seasonal_decompose(series, model='additive', freq=252)

trend = decomposition.trend.reset_index()
seasonality = decomposition.seasonal.reset_index()
residuals = decomposition.resid.reset_index()

merged = df.merge(trend, on='Date') # center moving average
merged = merged.merge(seasonality, on='Date')
merged = merged.merge(residuals, on='Date')

merged.columns = ['Date', 'Observed', 'Trend', 'Seasonality', 'Residuals']
merged['T+S'] = merged['Trend'] + merged['Seasonality']
merged.head()

Unnamed: 0,Date,Observed,Trend,Seasonality,Residuals,T+S
0,2013-02-08,27.55,,-1.26776,,
1,2013-02-11,27.86,,-1.287869,,
2,2013-02-12,27.88,,-0.694293,,
3,2013-02-13,28.03,,-0.395081,,
4,2013-02-14,28.04,,-0.183618,,


In [10]:
adf_test = sm.tsa.stattools.adfuller(df['Observed'])
results = pd.Series(adf_test[0:4], 
                    index=['ADF Test Statistic',
                            'P-Value', 
                            '# Lags Used',
                            '# Observations Used'])
for key, value in adf_test[4].items():
  results[f'Critical Value {key}'] = value

print(results)

ADF Test Statistic        0.415655
P-Value                   0.982071
# Lags Used               0.000000
# Observations Used    1258.000000
Critical Value 1%        -3.435559
Critical Value 5%        -2.863840
Critical Value 10%       -2.567995
dtype: float64


In [11]:
df['Shift'] = df['Observed'].shift(1).fillna(method='bfill')
df['Diff'] = df['Observed'] - df['Shift']

In [12]:
adf_test = sm.tsa.stattools.adfuller(df['Diff'])
results = pd.Series(adf_test[0:4], 
                    index=['ADF Test Statistic',
                            'P-Value', 
                            '# Lags Used',
                            '# Observations Used'])
for key, value in adf_test[4].items():
  results[f'Critical Value {key}'] = value

print(results)

ADF Test Statistic      -36.493649
P-Value                   0.000000
# Lags Used               0.000000
# Observations Used    1258.000000
Critical Value 1%        -3.435559
Critical Value 5%        -2.863840
Critical Value 10%       -2.567995
dtype: float64


### Forecast the time series 60 days into the future using double and triple exponential smoothing models.

In [18]:
model = sm.tsa.ExponentialSmoothing(df['Observed'], trend='additive').fit()
double_exp = model.forecast(30)

model = sm.tsa.ExponentialSmoothing(df['Observed'], trend='additive', 
                                    seasonal='additive', 
                                    seasonal_periods=4).fit()

triple_exp = model.forecast(30)

In [19]:
double_exp

1259    89.715378
1260    89.764747
1261    89.814116
1262    89.863484
1263    89.912853
1264    89.962222
1265    90.011590
1266    90.060959
1267    90.110328
1268    90.159696
1269    90.209065
1270    90.258434
1271    90.307803
1272    90.357171
1273    90.406540
1274    90.455909
1275    90.505277
1276    90.554646
1277    90.604015
1278    90.653384
1279    90.702752
1280    90.752121
1281    90.801490
1282    90.850858
1283    90.900227
1284    90.949596
1285    90.998965
1286    91.048333
1287    91.097702
1288    91.147071
dtype: float64

### Forecast the time series 60 days into the future using ARMA, ARIMA, and SARIMA models.

In [20]:
def forecast(data, field, model, periods):
    model_results = model.forecast(periods)[0]
    current_value = data[field].iloc[-1]
    forecasts = []
    
    for result in model_results:
        forecast = current_value + result
        forecasts.append(forecast)
        current_value = forecast
    
    forecasts = pd.Series(forecasts)
    forecasts.index = [x + periods 
                       for x in list(data[field].tail(periods).index)]
    
    return forecasts

In [27]:
model = ARMA(df['Diff'], order=(2, 1)).fit()
arma_forecasts = forecast(df, 'Observed', model, 60)
arma_forecasts.tail(10)

1309    92.560752
1310    92.610538
1311    92.660311
1312    92.710072
1313    92.759822
1314    92.809563
1315    92.859296
1316    92.909021
1317    92.958740
1318    93.008453
dtype: float64

In [28]:
model = ARIMA(df['Diff'], order=(1,1,1)).fit()
arima_forecasts = forecast(df, 'Observed', model, 60)
arima_forecasts.tail(10)

1309    94.160895
1310    94.250522
1311    94.340205
1312    94.429946
1313    94.519744
1314    94.609600
1315    94.699513
1316    94.789483
1317    94.879510
1318    94.969595
dtype: float64

In [29]:
model = SARIMAX(df['Observed'], order=(2, 1, 1), 
                seasonal_order=(1, 1, 1, 1)).fit()

sarima_forecasts = model.forecast(60)
sarima_forecasts.tail(10)

1309    94.412079
1310    94.513463
1311    94.614964
1312    94.716574
1313    94.818290
1314    94.920107
1315    95.022020
1316    95.124026
1317    95.226119
1318    95.328297
dtype: float64

### Forecast the time series 60 days into the future using the Facebook Prophet model.

In [31]:
series = df[['Date', 'Observed']]
series.columns = ['ds', 'y']
series

Unnamed: 0,ds,y
0,2013-02-08,27.55
1,2013-02-11,27.86
2,2013-02-12,27.88
3,2013-02-13,28.03
4,2013-02-14,28.04
...,...,...
1254,2018-02-01,94.26
1255,2018-02-02,91.78
1256,2018-02-05,88.00
1257,2018-02-06,91.33


In [32]:
model = Prophet()
model.fit(series)

INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


<fbprophet.forecaster.Prophet at 0x7fa829235a58>

In [33]:
future = model.make_future_dataframe(60)
future.tail(10)

Unnamed: 0,ds
1309,2018-03-30
1310,2018-03-31
1311,2018-04-01
1312,2018-04-02
1313,2018-04-03
1314,2018-04-04
1315,2018-04-05
1316,2018-04-06
1317,2018-04-07
1318,2018-04-08


In [34]:
results = model.predict(future)
results.tail(10)

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,weekly,weekly_lower,weekly_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
1309,2018-03-30,91.595056,89.400284,93.623581,91.051676,92.180366,-0.029415,-0.029415,-0.029415,-0.210331,-0.210331,-0.210331,0.180917,0.180917,0.180917,0.0,0.0,0.0,91.565641
1310,2018-03-31,91.665818,90.479113,94.379422,91.102469,92.271723,0.837431,0.837431,0.837431,0.687495,0.687495,0.687495,0.149935,0.149935,0.149935,0.0,0.0,0.0,92.503249
1311,2018-04-01,91.73658,90.529376,94.703881,91.140728,92.35871,0.804736,0.804736,0.804736,0.687495,0.687495,0.687495,0.117241,0.117241,0.117241,0.0,0.0,0.0,92.541316
1312,2018-04-02,91.807342,89.759224,93.671563,91.188671,92.461043,-0.177754,-0.177754,-0.177754,-0.261213,-0.261213,-0.261213,0.083458,0.083458,0.083458,0.0,0.0,0.0,91.629588
1313,2018-04-03,91.878104,89.623748,93.731805,91.242305,92.551737,-0.253467,-0.253467,-0.253467,-0.302759,-0.302759,-0.302759,0.049292,0.049292,0.049292,0.0,0.0,0.0,91.624638
1314,2018-04-04,91.948867,89.684794,93.825131,91.278422,92.648362,-0.26595,-0.26595,-0.26595,-0.281461,-0.281461,-0.281461,0.015511,0.015511,0.015511,0.0,0.0,0.0,91.682917
1315,2018-04-05,92.019629,89.720164,93.745043,91.314125,92.746629,-0.336303,-0.336303,-0.336303,-0.319226,-0.319226,-0.319226,-0.017076,-0.017076,-0.017076,0.0,0.0,0.0,91.683326
1316,2018-04-06,92.090391,89.720757,93.8438,91.350764,92.839939,-0.257973,-0.257973,-0.257973,-0.210331,-0.210331,-0.210331,-0.047642,-0.047642,-0.047642,0.0,0.0,0.0,91.832418
1317,2018-04-07,92.161153,90.712942,95.018014,91.388881,92.934758,0.612132,0.612132,0.612132,0.687495,0.687495,0.687495,-0.075363,-0.075363,-0.075363,0.0,0.0,0.0,92.773285
1318,2018-04-08,92.231915,90.684505,94.884999,91.435554,93.024733,0.588047,0.588047,0.588047,0.687495,0.687495,0.687495,-0.099448,-0.099448,-0.099448,0.0,0.0,0.0,92.819962


In [36]:
# take the last 60 rows
prophet_forecasts = results.iloc[-60:]['yhat']
prophet_forecasts.tail(10)

1309    91.565641
1310    92.503249
1311    92.541316
1312    91.629588
1313    91.624638
1314    91.682917
1315    91.683326
1316    91.832418
1317    92.773285
1318    92.819962
Name: yhat, dtype: float64

### Combine the observed values and all the forecasts into a single data frame and generate a line chart to visually compare the different models.

In [38]:
future_dates = pd.DataFrame({'Date': pd.date_range(start=df.Date.iloc[-1], periods=31, 
                                                   freq='D', closed='right')})

fcast_df = pd.concat([pd.DataFrame(df['Date']), future_dates], ignore_index=True)
fcast_df = fcast_df.merge(df[['Date', 'Observed']], on='Date', how='left')
fcast_df

Unnamed: 0,Date,Observed
0,2013-02-08,27.55
1,2013-02-11,27.86
2,2013-02-12,27.88
3,2013-02-13,28.03
4,2013-02-14,28.04
...,...,...
1284,2018-03-05,
1285,2018-03-06,
1286,2018-03-07,
1287,2018-03-08,


In [39]:
fcast_df['Double_Exp_Smooth'] = double_exp
fcast_df['Triple_Exp_Smooth'] = triple_exp
fcast_df['ARMA'] = arma_forecasts
fcast_df['ARIMA'] = arima_forecasts
fcast_df['SARIMA'] = sarima_forecasts
fcast_df['Prophet'] = prophet_forecasts

In [41]:
fcast_df.tail()

Unnamed: 0,Date,Observed,Double_Exp_Smooth,Triple_Exp_Smooth,ARMA,ARIMA,SARIMA,Prophet
1284,2018-03-05,,90.949596,90.954938,91.300094,91.938863,91.935632,89.992211
1285,2018-03-06,,90.998965,91.064534,91.351966,92.027056,92.031591,90.004341
1286,2018-03-07,,91.048333,91.044091,91.403597,92.115307,92.127904,90.080928
1287,2018-03-08,,91.097702,91.081165,91.455012,92.203616,92.224553,90.100264
1288,2018-03-09,,91.147071,91.152412,91.506235,92.291981,92.321524,90.268218


In [42]:
melted = pd.melt(fcast_df, id_vars='Date', var_name='Variable', value_name='Value', 
                 value_vars=list(fcast_df.columns)[1:])

ilinechart(melted, 'Date', 'Value', groups='Variable', 
                title='Forecast Comparison')