# Time Series Modeling (Smoothing Methods) Assignment

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import plotly.express as px

  import pandas.util.testing as tm


### Import the Walmart stock prices data set.

In [2]:
# Location of the daily stock price CSV used throughout this notebook.
DATA_URL = "https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/Data%20Sets%20Time%20Series%20Analysis/Time%20Series%20-%20Day%203/walmart_stock_prices.csv"

df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,date,open,high,low,close,volume,Name
0,2013-02-08,71.2,71.64,71.07,71.48,5906823,WMT
1,2013-02-11,71.25,71.51,70.53,71.4,6202534,WMT
2,2013-02-12,71.49,71.66,71.1,71.4,4761910,WMT
3,2013-02-13,71.29,71.7,71.21,71.39,3969807,WMT
4,2013-02-14,71.1,71.23,70.755,70.82,6820952,WMT


In [4]:
# Convert the date strings to pandas datetimes, then inspect column dtypes.
df["date"] = pd.to_datetime(df["date"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    1259 non-null   datetime64[ns]
 1   open    1259 non-null   float64       
 2   high    1259 non-null   float64       
 3   low     1259 non-null   float64       
 4   close   1259 non-null   float64       
 5   volume  1259 non-null   int64         
 6   Name    1259 non-null   object        
dtypes: datetime64[ns](1), float64(4), int64(1), object(1)
memory usage: 69.0+ KB


### Plot a line chart with the observed values (the daily closing prices).

In [5]:
def ilinechart(df, x, y, groups=None, title=''):
    """Draw and display an interactive Plotly Express line chart.

    Parameters
    ----------
    df : data passed straight to ``px.line``.
    x, y : column names plotted on the horizontal and vertical axes.
    groups : optional column name used to colour one line per group.
    title : chart title; its x-position is set to 0.5.

    The figure is shown immediately via ``fig.show()``; nothing is returned.
    """
    title_layout = dict(title=dict(x=0.5))

    figure = px.line(df, x=x, y=y, color=groups, title=title,
                     template='none')
    figure = figure.update(layout=title_layout)

    figure.show()

In [7]:
# Observed series: one line of daily closing prices over time.
ilinechart(df, x="date", y="close", title="Daily Closing Prices")

### Choose 4 moving average time windows and add columns to the data set containing the values for each of them.

In [8]:
# Simple moving averages of the closing price, one column per window length.
# Rows before a full window is available are NaN.
for span in (20, 50, 100, 200):
    df[f"{span}_Day"] = df["close"].rolling(window=span).mean()

df.head()

Unnamed: 0,date,open,high,low,close,volume,Name,20_Day,50_Day,100_Day,200_Day
0,2013-02-08,71.2,71.64,71.07,71.48,5906823,WMT,,,,
1,2013-02-11,71.25,71.51,70.53,71.4,6202534,WMT,,,,
2,2013-02-12,71.49,71.66,71.1,71.4,4761910,WMT,,,,
3,2013-02-13,71.29,71.7,71.21,71.39,3969807,WMT,,,,
4,2013-02-14,71.1,71.23,70.755,70.82,6820952,WMT,,,,


### Plot a multi-line chart that compares the 4 different simple moving averages.

In [11]:
# Reshape to long format (one row per date/series pair) so each series
# can be drawn as its own coloured line.
sma_series = ["close", "20_Day", "50_Day", "100_Day", "200_Day"]
melted = df.melt(
    id_vars="date",
    value_vars=sma_series,
    var_name="Variable",
    value_name="Value",
)

ilinechart(melted, x="date", y="Value", groups="Variable", title="MA Comparison")

### Write a function that calculates a weighted moving average. Take the weighted moving average of the observed values and store it in a new column in the dataframe.

In [12]:
def wma(df, field, window):
    """Linearly weighted moving average of one column.

    Within each rolling window the oldest observation gets weight 1 and the
    most recent gets weight ``window``; the weighted sum is divided by the
    total of the weights.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to smooth.
    field : str
        Name of the column to average.
    window : int
        Number of consecutive observations in each window.

    Returns
    -------
    pandas.Series
        Same index as ``df``; the first ``window - 1`` entries are NaN
        because a full window is not yet available.
    """
    weights = np.arange(1, window + 1)
    # The weight total is the same for every window, so compute it once
    # instead of inside the per-window callback.
    weight_total = weights.sum()

    def _weighted_mean(values):
        return np.dot(values, weights) / weight_total

    # raw=True hands each window to the callback as a plain ndarray.
    # The result is named `weighted` so it does not shadow the function name.
    weighted = df[field].rolling(window).apply(_weighted_mean, raw=True)
    return weighted

# Weighted moving averages for the same window lengths as the simple ones.
for span in (20, 50, 100, 200):
    df[f"{span}_Day_WMA"] = wma(df, "close", span)

### Plot the weighted moving average in a multi-line plot with its simple moving average of the same length and evaluate the differences.

In [13]:
# Long-format frame pairing each simple moving average with the weighted
# moving average of the same window length.
sma_wma_series = [
    "close",
    "20_Day", "20_Day_WMA",
    "50_Day", "50_Day_WMA",
    "100_Day", "100_Day_WMA",
    "200_Day", "200_Day_WMA",
]
melted = df.melt(
    id_vars="date",
    value_vars=sma_wma_series,
    var_name="Variable",
    value_name="Value",
)

melted

Unnamed: 0,date,Variable,Value
0,2013-02-08,close,71.480000
1,2013-02-11,close,71.400000
2,2013-02-12,close,71.400000
3,2013-02-13,close,71.390000
4,2013-02-14,close,70.820000
...,...,...,...
11326,2018-02-01,200_Day_WMA,90.171723
11327,2018-02-02,200_Day_WMA,90.364910
11328,2018-02-05,200_Day_WMA,90.512902
11329,2018-02-06,200_Day_WMA,90.667696


In [14]:
# One coloured line per series: closing price plus each SMA/WMA pair.
ilinechart(
    melted,
    x="date",
    y="Value",
    groups="Variable",
    title="Simple vs. Weighted Moving Averages",
)

### Apply a simple exponential smoothing model to the observed values and store the results in a new column.

In [15]:
# Fit an exponential smoothing model to the closing prices with no trend or
# seasonal component requested (i.e. simple exponential smoothing).
model = sm.tsa.ExponentialSmoothing(df["close"]).fit()
# NOTE(review): the positional 200 passed to predict() is presumably the
# observation index at which predictions start, not a 200-day smoothing
# window -- exponential smoothing has no window length. If so, rows before
# index 200 are left as NaN after index alignment and the "200_Day" column
# prefix is misleading. Confirm against the statsmodels documentation.
df["200_Day_ExpS"] = model.predict(200)

### Add the simple exponential smoothing values to your multi-line plot containing the simple and weighted moving average values and evaluate how they differ.

In [16]:
# Long-format frame comparing the closing price with the 200-day simple and
# weighted moving averages and the simple exponential smoothing column.
smoothing_series = ["close", "200_Day", "200_Day_WMA", "200_Day_ExpS"]
melted = df.melt(
    id_vars="date",
    value_vars=smoothing_series,
    var_name="Variable",
    value_name="Value",
)

ilinechart(melted, x="date", y="Value", groups="Variable", title="Smoothing Method Comparison")

The simple exponential smoothing line looks nearly identical to the observed closing prices.

### Perform double and triple exponential smoothing and store their respective values in new columns.

In [17]:
# Double exponential smoothing: the same model class with an additive trend
# component and no seasonal component.
model = sm.tsa.ExponentialSmoothing(df.close, trend="add").fit()
# NOTE(review): 200 is presumably the start observation index for
# predict(), not a window length -- confirm against the statsmodels docs.
df["200_Day_DExpS"] = model.predict(200)

In [18]:
# Triple exponential smoothing: additive trend plus an additive seasonal
# component with a seasonal period of 4 observations.
# NOTE(review): the choice of seasonal_periods=4 for daily prices is not
# justified anywhere in this notebook -- confirm it is intentional.
model = sm.tsa.ExponentialSmoothing(df.close, 
                                    trend="add", 
                                    seasonal="add",
                                    seasonal_periods=4).fit()
# NOTE(review): 200 is presumably the start observation index for
# predict(), not a window length -- confirm against the statsmodels docs.
df["200_Day_TExpS"] = model.predict(200)

### Create a new multi-line plot showing the results of the three exponential smoothing methods.

In [19]:
# Long-format frame holding the closing price and the three exponential
# smoothing columns, one coloured line each.
exp_smoothing_series = ["close", "200_Day_ExpS", "200_Day_DExpS", "200_Day_TExpS"]
melted = df.melt(
    id_vars="date",
    value_vars=exp_smoothing_series,
    var_name="Variable",
    value_name="Value",
)

ilinechart(melted, x="date", y="Value", groups="Variable", title="Smoothing Methods")

### Evaluate the performances of the three exponential smoothing methods by calculating their mean absolute error and their root mean squared error. Which one modeled the data best?

In [24]:
# For each exponential smoothing variant, store the residual
# (observed close minus smoothed value) and report its mean absolute error
# and root mean squared error.
for label, suffix in (("Simple", "ExpS"), ("Double", "DExpS"), ("Triple", "TExpS")):
    diff_column = f"{suffix}_diff"
    df[diff_column] = df["close"] - df[f"200_Day_{suffix}"]

    mean_abs_error = df[diff_column].abs().mean()
    root_mean_sq_error = np.sqrt(np.mean(df[diff_column]**2))

    print(f"{label} MAE:", mean_abs_error,
          'RMSE: ', root_mean_sq_error)

Simple MAE: 0.5621068297747482 RMSE:  0.8577642387344923
Double MAE: 0.5610482929207913 RMSE:  0.8574485942323363
Triple MAE: 0.5606008228074769 RMSE:  0.8565630113160846


Triple exponential smoothing performed best, with the lowest MAE and RMSE, although the differences between the three methods are very small.