# Time Series Forecasting Assignment

In [155]:
import pandas as pd
import plotly.express as px
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima_model import ARMA
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from fbprophet import Prophet

### Import the Microsoft stock price data set (MSFT_data.csv) into a Pandas dataframe.

In [156]:
# Source CSV of daily MSFT prices, kept in one named constant so the
# location is easy to spot and change.
MSFT_DATA_URL = 'https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/Data%20Sets%20Time%20Series%20Analysis/Time%20Series%20-%20Day%204/MSFT_data.csv'

df = pd.read_csv(MSFT_DATA_URL)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    1259 non-null   object 
 1   open    1259 non-null   float64
 2   high    1259 non-null   float64
 3   low     1259 non-null   float64
 4   close   1259 non-null   float64
 5   volume  1259 non-null   int64  
 6   Name    1259 non-null   object 
dtypes: float64(4), int64(1), object(2)
memory usage: 69.0+ KB


In [157]:
# Work on an independent copy holding only the date and the closing price,
# so later column additions never touch the raw frame.
ms = df.loc[:, ['date', 'close']].copy()
ms.head()

Unnamed: 0,date,close
0,2013-02-08,27.55
1,2013-02-11,27.86
2,2013-02-12,27.88
3,2013-02-13,28.03
4,2013-02-14,28.04


In [158]:
# Parse the date strings into datetime64 values (bracket access avoids
# confusing the column with a DataFrame attribute).
ms['date'] = pd.to_datetime(ms['date'])

In [159]:
# Confirm the conversion: `date` should now be datetime64[ns].
ms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    1259 non-null   datetime64[ns]
 1   close   1259 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 19.8 KB


### Generate a line chart showing the observed values (closing prices).

In [160]:
def iline(df, x, y, groups=None, title=''):
    """Display an interactive Plotly line chart.

    Parameters
    ----------
    df : DataFrame holding the data to plot (long format when grouping).
    x, y : names of the columns used for the horizontal and vertical axes.
    groups : optional column name; each distinct value gets its own line colour.
    title : text shown above the chart.

    The figure is shown immediately; nothing is returned.
    """
    plot_options = dict(x=x, y=y, color=groups, title=title, template='none')
    px.line(df, **plot_options).show()

In [161]:
def chart(df, vars, title='Model Comparison'):
    """Plot the `close` column of ``df`` together with extra columns over `date`.

    Parameters
    ----------
    df : DataFrame with a `date` column, a `close` column and every column
        named in ``vars``.
    vars : column name(s) to plot next to `close`. May be any iterable of
        names, a single name given as a string, or an empty string / empty
        list to plot `close` alone.
    title : chart title.

    The frame is melted to long format (one row per date/variable pair) and
    handed to ``iline``; nothing is returned.
    """
    # A bare string is one column name (or none when empty). Passing it to
    # list() would split it into single characters.
    if isinstance(vars, str):
        columns = [vars] if vars else []
    else:
        columns = list(vars)

    # Always include the observed series, but never list it twice: duplicate
    # value_vars would duplicate its rows in the melted frame.
    if 'close' not in columns:
        columns.append('close')

    long_df = pd.melt(df, id_vars='date', value_vars=columns,
                      var_name='Variable', value_name='Value')

    iline(long_df, 'date', 'Value', groups='Variable', title=title)

In [162]:
# No extra columns: chart() adds `close` itself, so an empty list plots the
# observed series alone.
chart(ms, [], title='Closing Prices over Time')

### Decompose the time series and check it for stationarity. If the data is not stationary, difference the observations and store the results in a new Diff column.

In [163]:
# Decompose only the closing price, indexed by date. Selecting `close`
# explicitly keeps this cell re-runnable: after the first run `ms` also holds
# the derived columns below (some containing NaN), which must not be fed back
# into the decomposition.
series = ms.set_index('date')[['close']]

# Period of 252 observations, the author's choice for financial data
# (presumably the number of trading days in a year).
decomposition = sm.tsa.seasonal_decompose(series, model='additive', freq=252)

# Drop the date index again so each component lines up row-for-row with the
# integer-indexed `ms`.
ms['trend'] = decomposition.trend.reset_index().close
ms['seasonality'] = decomposition.seasonal.reset_index().close
ms['residuals'] = decomposition.resid.reset_index().close

In [164]:
# Summary statistics for the observed series and its decomposition components.
# `trend` and `residuals` report fewer non-null rows than `close`.
ms.describe()

Unnamed: 0,close,trend,seasonality,residuals
count,1259.0,1007.0,1259.0,1007.0
mean,51.063081,49.919506,0.0013,-0.200148
std,14.852117,10.170233,1.329931,1.789639
min,27.37,33.443516,-2.576283,-6.586172
25%,40.31,43.3163,-0.917716,-1.343388
50%,47.52,47.470635,-0.365645,-0.125781
75%,59.73,56.347579,0.549321,1.035761
max,95.01,74.683016,3.33635,4.118625


In [168]:
# Overlay the three decomposition components on the observed closing price.
chart(ms, ['trend', 'seasonality', 'residuals'], title='Decomposition Analysis')

In [166]:
# Augmented Dickey-Fuller test on the closing price. A p-value above the
# usual 0.05 threshold means a unit root cannot be rejected, i.e. the series
# is treated as non-stationary.
adf_test = adfuller(ms['close'])

summary = dict(zip(
    ['ADF Test Statistic', 'P-Value', '# Lags Used', '# Observations Used'],
    adf_test[:4],
))
# The fifth element maps significance levels ('1%', '5%', '10%') to critical values.
summary.update({f'Critical Value {level}': cutoff
                for level, cutoff in adf_test[4].items()})

results = pd.Series(summary)
results

ADF Test Statistic        0.415655
P-Value                   0.982071
# Lags Used               0.000000
# Observations Used    1258.000000
Critical Value 1%        -3.435559
Critical Value 5%        -2.863840
Critical Value 10%       -2.567995
dtype: float64

In [167]:
# First difference of the closing price (today's close minus the previous
# row's). The first row has no predecessor, so its NaN is replaced with 0.
# Done in a single assignment: calling fillna(inplace=True) on a selected
# column can act on a temporary object and leave the frame unchanged.
ms['diff'] = ms['close'].diff().fillna(0)

### Forecast the time series 60 days into the future using double and triple exponential smoothing models.

In [169]:
# Double exponential smoothing: level plus additive trend, no seasonal term.
# `model` is a scratch name that later cells overwrite with other fitted models.
model = sm.tsa.ExponentialSmoothing(ms['close'], trend='additive').fit()
# Forecast 60 steps beyond the last observation.
double_exp = model.forecast(60)

In [170]:
# Triple exponential smoothing: additive trend plus an additive seasonal term.
# NOTE(review): seasonal_periods=4 differs from the 252-observation period
# used for the decomposition earlier in this notebook; confirm which
# seasonal cycle is intended.
model = sm.tsa.ExponentialSmoothing(ms['close'], trend='additive', 
                                    seasonal='additive', 
                                    seasonal_periods=4).fit()

# Forecast 60 steps beyond the last observation.
triple_exp = model.forecast(60)

### Forecast the time series 60 days into the future using ARMA, ARIMA, and SARIMA models.

In [171]:
def forecast(data, field, model, periods):
    """Turn a model's forecast of step-to-step changes into forecast levels.

    Parameters
    ----------
    data : DataFrame containing the observed series, with an integer index.
    field : name of the column in ``data`` holding the observed levels.
    model : fitted model whose ``forecast(periods)`` returns a sequence whose
        first element holds the predicted changes, one per future step.
    periods : number of future steps to forecast.

    Returns
    -------
    pandas.Series of forecast levels. Each level is the previous one plus the
    next predicted change, starting from the last observed value of
    ``data[field]``. The index continues directly after the last index label
    of ``data``.
    """
    step_changes = model.forecast(periods)[0]

    # Accumulate the predicted changes onto the last observed level.
    level = data[field].iloc[-1]
    levels = []
    for change in step_changes:
        level = level + change
        levels.append(level)

    # Continue the index right after the last observation. Deriving it from
    # the last label (rather than from tail(periods)) also works when the
    # data has fewer rows than the forecast horizon.
    first_label = data[field].index[-1] + 1
    return pd.Series(levels,
                     index=pd.RangeIndex(first_label, first_label + len(levels)))

In [173]:
# Previous row's closing price, kept as a helper column for later modelling.
# The first row has no predecessor, so it is back-filled from the next row.
# .bfill() replaces the deprecated fillna(method='bfill') with identical results.
ms['shift'] = ms['close'].shift(1).bfill()

In [175]:
# ARMA with order (2, 1) fitted on the differenced series.
model = ARMA(ms['diff'], order=(2, 1)).fit()
# forecast() accumulates the predicted changes onto the last observed close,
# giving 60 forecast price levels.
arma_forecasts = forecast(ms, 'close', model, 60)
arma_forecasts.head(20)

1259    89.711841
1260    89.801678
1261    89.887137
1262    89.968672
1263    90.046715
1264    90.121648
1265    90.193812
1266    90.263509
1267    90.331012
1268    90.396559
1269    90.460366
1270    90.522623
1271    90.583500
1272    90.643148
1273    90.701703
1274    90.759282
1275    90.815995
1276    90.871935
1277    90.927187
1278    90.981827
dtype: float64

In [177]:
# ARIMA with order (1, 1, 1) fitted on the differenced series.
# NOTE(review): ms['diff'] is already the first difference of `close`, so
# d=1 here appears to difference the data a second time. Confirm whether the
# model should instead be fitted on ms['close'].
model = ARIMA(ms['diff'], order=(1, 1, 1)).fit()
# forecast() accumulates the model's output onto the last observed close.
arima_forecasts = forecast(ms, 'close', model, 60)
arima_forecasts.head(20)

1259    89.754431
1260    89.839348
1261    89.926225
1262    90.013099
1263    90.100032
1264    90.187022
1265    90.274070
1266    90.361175
1267    90.448337
1268    90.535557
1269    90.622834
1270    90.710168
1271    90.797559
1272    90.885008
1273    90.972514
1274    91.060077
1275    91.147698
1276    91.235376
1277    91.323111
1278    91.410904
dtype: float64

In [179]:
# SARIMA fitted directly on the closing price, so its forecasts are already
# price levels and do not go through the forecast() helper.
# NOTE(review): the last element of seasonal_order is 1. If that is the
# seasonal period, the "seasonal" terms act on consecutive observations
# rather than on a real seasonal cycle. Confirm the intended period.
model = SARIMAX(ms['close'], order=(2, 1, 1), 
                seasonal_order=(1, 1, 1, 1)).fit()

# Forecast 60 steps beyond the last observation.
sarima_forecasts = model.forecast(60)
sarima_forecasts.head(20)

1259    89.714712
1260    89.793799
1261    89.874259
1262    89.955747
1263    90.038219
1264    90.121633
1265    90.205948
1266    90.291124
1267    90.377124
1268    90.463911
1269    90.551452
1270    90.639713
1271    90.728663
1272    90.818271
1273    90.908510
1274    90.999350
1275    91.090767
1276    91.182735
1277    91.275230
1278    91.368228
dtype: float64

### Forecast the time series 60 days into the future using the Facebook Prophet model.

In [180]:
# Prophet is given a frame with the date in `ds` and the target in `y`.
# Note that `series` replaces the date-indexed frame used for decomposition.
series = ms[['date', 'close']].rename(columns={'date': 'ds', 'close': 'y'})
series

Unnamed: 0,ds,y
0,2013-02-08,27.55
1,2013-02-11,27.86
2,2013-02-12,27.88
3,2013-02-13,28.03
4,2013-02-14,28.04
...,...,...
1254,2018-02-01,94.26
1255,2018-02-02,91.78
1256,2018-02-05,88.00
1257,2018-02-06,91.33


In [181]:
# Fit Prophet with its default settings on the ds/y frame.
# `model` again overwrites the previously fitted model.
model = Prophet()
model.fit(series)

INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


<fbprophet.forecaster.Prophet at 0x7f0d023ba160>

In [184]:
# Extend the training dates by 60 further rows for prediction.
# NOTE(review): the added rows are consecutive calendar days (2018-02-10 and
# 2018-02-11 appear in the output), whereas the observed data skips some dates
# (e.g. 2013-02-08 -> 2013-02-11, presumably weekends). Confirm whether
# non-trading days should be filtered out.
future = model.make_future_dataframe(60)
# Show the last training date followed by the 60 new dates.
future.tail(61)

Unnamed: 0,ds
1258,2018-02-07
1259,2018-02-08
1260,2018-02-09
1261,2018-02-10
1262,2018-02-11
...,...
1314,2018-04-04
1315,2018-04-05
1316,2018-04-06
1317,2018-04-07


In [185]:
# Predict over the historical and future dates. `yhat` is the point forecast.
# `results` reuses (and overwrites) the name of the earlier ADF summary.
results = model.predict(future)
results.tail()

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,weekly,weekly_lower,weekly_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
1314,2018-04-04,91.948867,89.553511,93.659337,91.286799,92.618983,-0.26595,-0.26595,-0.26595,-0.281461,-0.281461,-0.281461,0.015511,0.015511,0.015511,0.0,0.0,0.0,91.682917
1315,2018-04-05,92.019629,89.606631,93.541665,91.330374,92.717709,-0.336303,-0.336303,-0.336303,-0.319226,-0.319226,-0.319226,-0.017076,-0.017076,-0.017076,0.0,0.0,0.0,91.683326
1316,2018-04-06,92.090391,89.696377,94.05976,91.379068,92.811634,-0.257973,-0.257973,-0.257973,-0.210331,-0.210331,-0.210331,-0.047642,-0.047642,-0.047642,0.0,0.0,0.0,91.832418
1317,2018-04-07,92.161153,90.765306,94.909486,91.433631,92.905995,0.612132,0.612132,0.612132,0.687495,0.687495,0.687495,-0.075363,-0.075363,-0.075363,0.0,0.0,0.0,92.773285
1318,2018-04-08,92.231915,90.714967,94.922135,91.477662,92.988956,0.588047,0.588047,0.588047,0.687495,0.687495,0.687495,-0.099448,-0.099448,-0.099448,0.0,0.0,0.0,92.819962


In [186]:
# Take the last 60 rows: the predictions for the 60 future dates added above.
prophet_forecasts = results.iloc[-60:]['yhat']
prophet_forecasts.head()

1259    88.695185
1260    88.837582
1261    89.772343
1262    89.812578
1263    88.907186
Name: yhat, dtype: float64

### Combine the observed values and all the forecasts into a single data frame and generate a line chart to visually compare the different models.

In [187]:
# The 60 calendar days after the last observed date. Starting one day later
# replaces date_range(..., periods=61, closed='right'): the `closed` keyword
# was removed from newer pandas, and this form yields the same dates on every
# version.
future_dates = pd.DataFrame({
    'date': pd.date_range(start=ms.date.iloc[-1] + pd.Timedelta(days=1),
                          periods=60, freq='D')
})

# Stack observed and future dates, then attach the observed closing prices.
# Future rows have no match and keep NaN in `close`.
fcast_df = pd.concat([ms[['date']], future_dates], ignore_index=True)
fcast_df = fcast_df.merge(ms[['date', 'close']], on='date', how='left')
fcast_df

Unnamed: 0,date,close
0,2013-02-08,27.55
1,2013-02-11,27.86
2,2013-02-12,27.88
3,2013-02-13,28.03
4,2013-02-14,28.04
...,...,...
1314,2018-04-04,
1315,2018-04-05,
1316,2018-04-06,
1317,2018-04-07,


In [188]:
# One column per model. Each forecast series is aligned on its integer index
# labels, so values land on the future rows and historical rows stay NaN.
forecast_columns = {
    'Double_Exp_Smooth': double_exp,
    'Triple_Exp_Smooth': triple_exp,
    'ARMA': arma_forecasts,
    'ARIMA': arima_forecasts,
    'SARIMA': sarima_forecasts,
    'Prophet': prophet_forecasts,
}
for column_name, forecast_values in forecast_columns.items():
    fcast_df[column_name] = forecast_values

In [189]:
# Inspect the combined frame: observed closes first, model forecasts on the future rows.
fcast_df

Unnamed: 0,date,close,Double_Exp_Smooth,Triple_Exp_Smooth,ARMA,ARIMA,SARIMA,Prophet
0,2013-02-08,27.55,,,,,,
1,2013-02-11,27.86,,,,,,
2,2013-02-12,27.88,,,,,,
3,2013-02-13,28.03,,,,,,
4,2013-02-14,28.04,,,,,,
...,...,...,...,...,...,...,...,...
1314,2018-04-04,,92.430657,92.426407,92.809563,94.609600,94.920107,91.682917
1315,2018-04-05,,92.480026,92.463481,92.859296,94.699513,95.022020,91.683326
1316,2018-04-06,,92.529395,92.534728,92.909021,94.789483,95.124026,91.832418
1317,2018-04-07,,92.578763,92.644324,92.958740,94.879510,95.226119,92.773285


In [190]:
# Every column after `date` and `close` is a model forecast.
fcast_df.columns[2:]

Index(['Double_Exp_Smooth', 'Triple_Exp_Smooth', 'ARMA', 'ARIMA', 'SARIMA',
       'Prophet'],
      dtype='object')

In [191]:
# Compare every model's forecast with the observed closing price, using the
# default 'Model Comparison' title.
model_columns = [
    'Double_Exp_Smooth',
    'Triple_Exp_Smooth',
    'ARMA',
    'ARIMA',
    'SARIMA',
    'Prophet',
]
chart(fcast_df, model_columns)