# 02-Time Series Forecasting: Problem 1 

### - 구간평균법 
### - 단순지수평활법 
### - 이중지수평활법
### - 홀트-윈터지수평활법 

### 1. 모듈 불러오기

In [1]:
import os
import itertools

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Both inputs are flattened to 1-D before the element-wise comparison so
    that a column-shaped ``(n, 1)`` input (for example a one-column
    DataFrame) and a flat ``(n,)`` input are paired element by element.
    Without the flattening, NumPy would broadcast the pair to an ``(n, n)``
    matrix and silently return a wrong score.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values holding the same number of
            elements as ``y_true``.

    Returns:
        ``mean(|y_true - y_pred| / |y_true|) * 100``.  The result is ``inf``
        or ``nan`` when ``y_true`` contains zeros, because every error is
        expressed relative to the observed value.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

from sklearn.datasets import fetch_openml
import statsmodels.api as sm

import matplotlib.pyplot as plt
import matplotlib
# The bare 'seaborn-whitegrid' style name is deprecated in newer matplotlib
# releases, where the same style is shipped as 'seaborn-v0_8-whitegrid'.
# Use the new name when this matplotlib provides it; otherwise fall back to
# the legacy name so older installations keep working.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
import seaborn as sns

!git clone https://github.com/hansam95/LG-Elec-Day3.git

  plt.style.use('seaborn-whitegrid')


### 2. 데이터 불러오기

In [3]:
# Google Trends interest over time for the keyword "데이터분석" (data analysis)
# in South Korea, covering 2012-01 through 2022-01.
# Source query:
# https://trends.google.com/trends/explore?date=2012-01-01%202022-01-01&geo=KR&q=%EB%8D%B0%EC%9D%B4%ED%84%B0%EB%B6%84%EC%84%9D
csv_path = '/content/LG-Elec-Day3/data/googletrend_keyword.csv'
data = pd.read_csv(csv_path)
# Display the raw frame; the first data row is still the CSV's own
# sub-header and is removed in the preprocessing step.
data

Unnamed: 0,카테고리: 모든 카테고리
월,데이터분석: (대한민국)
2012-01,3
2012-02,15
2012-03,9
2012-04,0
...,...
2021-09,93
2021-10,78
2021-11,69
2021-12,71


#### 2.1 Data Preprocessing

In [66]:
# Give the value column an ASCII name and drop the first row, which holds the
# CSV's sub-header text rather than an observation.
column_names = {'카테고리: 모든 카테고리': 'data_analysis(korea)'}
data = data.rename(columns=column_names).iloc[1:]

In [68]:
# Convert the value column to 64-bit integers.
# `astype` expects a dtype; the previous code passed an array of converted
# *values* (np.int64(<column>)), which only worked because pandas falls back
# to the dtype of an ndarray argument. Passing the dtype directly states the
# intent and does not depend on that fallback.
data = data.astype(np.int64)

In [None]:
# Parse the index labels into timestamps so the frame can be sliced by date,
# then echo the resulting index for inspection.
parsed_index = pd.to_datetime(data.index)
data.index = parsed_index
data.index

In [None]:
'''
데이터 시각화를 통해 전체 개요 확인
'''
# Line plot of the whole series for a first visual overview.
# Optional styling that can be passed to plot(): color='green', linestyle='--', linewidth=1
data.plot(figsize=(12,4)) # color='green', linestyle='--', linewidth=1

# Tick label sizes for both axes.
plt.xticks(fontsize=11)
plt.yticks(fontsize=11)
# NOTE(review): an empty string is passed as the label list, presumably to
# blank out the automatic legend; `data.plot(legend=False)` would state that
# intent directly. Confirm the rendering before changing.
plt.legend('')

# The '\n' characters embedded in the title and axis labels add vertical padding.
plt.title("'Data analysis' keyword search amount from Google trend \n", fontsize=15)
plt.xlabel('\n Year', fontsize=13)
plt.ylabel('keyword search amount \n', fontsize=13)
plt.tight_layout()
plt.show()

In [None]:
# Seasonal decomposition using moving averages.
# https://www.statsmodels.org/stable/generated/statsmodels.tsa.seasonal.seasonal_decompose.html
#
# The returned object exposes four components, drawn as stacked panels below:
#   observed - the input series
#   trend    - the estimated trend component
#   seasonal - the estimated seasonal component
#   resid    - the estimated residuals
# model='additive' requests the additive decomposition.
# No `period` is passed, so the seasonal period has to be derivable from the
# datetime index - TODO confirm the index carries or implies a frequency.

decompostion = sm.tsa.seasonal_decompose(data['data_analysis(korea)'],  model='additive')

# Draw the component panels and enlarge the figure to 10 x 10 inches.
fig = decompostion.plot()
fig.set_size_inches(10,10)
plt.show()

### Q. 학습 및 평가 데이터셋 분리
 - 학습 데이터는 2019년 12월까지, 평가 데이터는 2020년 1월부터 구성

In [75]:
'''
Train Test Split
'''
# Exercise blanks: each '''Answer''' placeholder must be replaced with a date
# label before this cell can run. Per the task statement, the training split
# runs through December 2019 and the evaluation split starts in January 2020.
# Both slices are label-based slices on the datetime index.
train = data[:'''Answer''']
test  = data['''Answer''':]

### 3. Moving Average (구간평균법)

In [None]:
'''
Version 1
'''
def Moving_Average(x, N):
    """Return the trailing N-point moving average of ``x``.

    The output has the same length as the flattened input.  Position ``i``
    holds the mean of the ``N`` observations ending at ``i``; the first
    ``N - 1`` positions, where a full window is not yet available, are NaN.

    Args:
        x: Object exposing ``.values`` (e.g. a pandas Series or one-column
            DataFrame); the values are flattened to 1-D.
        N: Window length, a positive integer.

    Returns:
        1-D float ``numpy.ndarray`` with one entry per observation.  If ``N``
        exceeds the number of observations no full window exists and every
        entry is NaN.

    Raises:
        ValueError: If ``N`` is smaller than 1.
    """
    if N < 1:
        raise ValueError(f'window length N must be >= 1, got {N}')

    series = np.asarray(x.values, dtype=float).ravel()

    # With a kernel longer than the signal, 'valid' mode swaps the operands
    # and returns an array of the wrong length, so handle that case here.
    if N > series.size:
        return np.full(series.size, np.nan)

    window_means = np.convolve(series, np.ones(N) / float(N), 'valid')
    # Left-pad with NaN so the result lines up with the input index.
    return np.concatenate((np.full(N - 1, np.nan), window_means), axis=0)

# In-sample 5-point moving average on the training split.
MA_train_pred = pd.DataFrame(Moving_Average(train, 5), index=train.index, columns=['MA_5'])

# Flat forecast: repeat the last training-window average for every test period.
last_ma_row = MA_train_pred.iloc[-1]
MA_test_pred = pd.DataFrame(np.array([last_ma_row] * len(test)), index=test.index, columns=['MA_5'])

print(
    'Moving Average Train results',
    MA_train_pred,
    '-'*30,
    'Moving Average Test results',
    MA_test_pred,
    sep='\n',
)

In [None]:
'''
Version 2
'''
# Same 5-point moving average, computed with pandas' rolling window.
# Option: pass min_periods=1 to rolling() to avoid the leading NaN rows.
MA_train_pred = train.rolling(5).mean()
MA_train_pred.columns = ['MA_5']

# Flat forecast: repeat the last training-window average for every test period.
last_ma_row = MA_train_pred.iloc[-1]
MA_test_pred = pd.DataFrame(np.array([last_ma_row] * len(test)), index=test.index, columns=['MA_5'])
# Train fit followed by test forecast, as one frame for plotting.
prediction = pd.concat([MA_train_pred, MA_test_pred], axis=0)

print(
    'Moving Average Train Results',
    MA_train_pred,
    '-'*30,
    'Moving Average Test Results',
    MA_test_pred,
    sep='\n',
)

In [None]:
'''
Visualization 
'''
# Figure 1: raw series with the moving-average fit (train) and forecast (test).
fig, ax = plt.subplots(figsize=(12,4))
data.plot(ax=ax)
prediction.plot(ax=ax, label = 'Prediction (N=5)') 
# Dashed red vertical line at the first test timestamp, spanning y = 0..100.
ax.vlines(test.index[0], 0, 100, linestyle='--', color='r')
# Legend labels are given positionally, in the order the artists were drawn.
ax.legend(['Raw Dataset', 'Prediction (N=5)', 'Start of Forecast'], loc='upper left')
plt.title('Moving Average Results (Train and Test)')
plt.tight_layout()
plt.show()

# Figure 2: same layout, but only the test-period forecast is overlaid.
fig, ax = plt.subplots(figsize=(12,4))
data.plot(ax=ax)
MA_test_pred.plot(ax=ax, label = 'Prediction (N=5)')
ax.vlines(test.index[0], 0, 100, linestyle='--', color='r')
ax.legend(['Raw Dataset', 'Prediction (N=5)', 'Start of Forecast'], loc='upper left')
plt.title('Moving Average Results (Only Test)')
plt.tight_layout()
plt.show()

##### 정량적 지표를 통한 모델 검증

###### Mean Squared Error (평균 제곱 오차) <br>
$\frac{1}{n} \sum_{i=1}^{n} (y_{i} - \hat{y}_{i})^{2}$   

In [None]:
# Mean squared error of the moving-average forecast on the test split.
ma_mse = mean_squared_error(test, MA_test_pred)
print(f'MSE: {np.round(ma_mse, 2)}')

### Q. RMSE(제곱근 평균 제곱 오차)
 - Root Mean Squared Error (제곱근 평균 제곱 오차) <br>
$\sqrt{\frac{1}{n} \sum_{i=1}^{n} (y_{i} - \hat{y}_{i})^{2}}$

In [None]:
# Exercise blank: replace '''Answer''' with an expression that evaluates the
# RMSE of the moving-average forecast on the test split (the square root of
# the MSE, per the formula stated above this cell).
print(f'RMSE: {'''Answer'''}') 

##### Mean Absolute Error (평균 절대 오차) <br>
$\frac{1}{n} \sum_{i=1}^{n} |y_{i} - \hat{y}_{i}|$

In [None]:
# Mean absolute error of the moving-average forecast on the test split.
ma_mae = mean_absolute_error(test, MA_test_pred)
print(f'MAE: {np.round(ma_mae, 2)}')

#####  R squared (결정계수 $({r})^{2}$) <br>
$R^{2} = \frac{SSR}{SST} = 1 - \frac{SSE}{SST}$

In [None]:
# Coefficient of determination of the moving-average forecast on the test split.
ma_r2 = r2_score(test, MA_test_pred)
print(f'R2 score: {np.round(ma_r2, 2)}')

### 4. Exponential Smoothing (지수평활법)

#### 4.1 Simple Exponential Smoothing (단순지수평활법)

### Q. 지수 평활법
 - 초기값 L_0을 train data의 평균으로 지정
 - numpy 내장 함수 활용

In [None]:
# Exercise blank: replace '''Answer''' with the initial level. Per the task
# statement it is the mean of the training data; the result must expose
# `.values`, because that attribute is read when seeding the table below.
L_0 = '''Answer'''  # initial value

# Working table: the training data preceded by one extra row (index label 0,
# observation 0) that carries only the initial level.
SES_train = train.copy()
SES_train = pd.concat([pd.DataFrame(np.zeros(1), columns=['data_analysis(korea)']), SES_train])
# Empty result columns, filled in by the smoothing step.
SES_train['Level'] = np.nan
SES_train['Forecast'] = np.nan
# Seed the level of the extra first row with the initial value.
SES_train.loc[0,'Level'] = L_0.values

# Display the prepared table.
SES_train

In [91]:
'''
Version 1
'''
def Simple_Exponential_Smoothing(table, alpha, forecast_index=None):
    """Fit simple exponential smoothing and build a flat forecast.

    Runs the level recursion ``L_t = alpha * x_t + (1 - alpha) * L_{t-1}``
    over every row after the first.  Row 0 is the initialization row: its
    ``'Level'`` must already hold the initial level and its observation is
    not used.

    Args:
        table: DataFrame with a ``'data_analysis(korea)'`` column of
            observations and a ``'Level'`` column seeded in its first row.
            The ``'Level'`` column is overwritten in place.  Other columns
            (such as ``'Forecast'``) are left untouched.
        alpha: Smoothing factor applied to the newest observation.
        forecast_index: Index (or any sized sequence of labels) for the
            forecast frame.  Defaults to the index of the notebook-level
            ``test`` split, which is what this function always used before
            the parameter existed.

    Returns:
        ``(table, SES_test_pred)``.  ``table`` is the same object that was
        passed in, with ``'Level'`` filled.  ``SES_test_pred`` is a
        one-column DataFrame named ``f'SES_{alpha}'`` that repeats the final
        level once per entry of ``forecast_index``.
    """
    if forecast_index is None:
        forecast_index = test.index

    # Work on plain arrays: positional access avoids rebuilding the index
    # list on every step and is unaffected by the mixed index labels.
    observed = np.asarray(table['data_analysis(korea)'], dtype=float)
    level = np.array(table['Level'], dtype=float)
    for i in range(1, len(level)):
        level[i] = alpha * observed[i] + (1 - alpha) * level[i - 1]
    table['Level'] = level

    # Built once, after the recursion has finished, so it is also defined
    # when the table holds only the initialization row.
    SES_test_pred = pd.DataFrame(
        [level[-1]] * len(forecast_index),
        index=forecast_index,
        columns=[f'SES_{alpha}'],
    )
    return table, SES_test_pred

In [92]:
# Fit on the prepared table with alpha = 0.9 and get the flat test forecast.
SES_train_pred, SES_test_pred = Simple_Exponential_Smoothing(table=SES_train, alpha=0.9)

In [None]:
# Show the fitted training table followed by the test forecast.
print(
    'Simple Exponential Smoothing Train Results',
    SES_train_pred,
    '-'*30,
    'Simple Exponential Smoothing Test results',
    SES_test_pred,
    sep='\n',
)

In [94]:
'''
Version 2
'''

def _ses_with_ewm(alpha, column):
    """Simple exponential smoothing via pandas ``ewm`` for one alpha.

    Reads the notebook-level ``L_0``, ``train`` and ``test``.  The initial
    level is prepended as an extra first row so the non-adjusted EWM
    recursion starts from it; that row is dropped from the fitted output.

    Returns:
        ``(fitted, forecast, combined)``: the fitted training frame, the flat
        test forecast repeating the last fitted value, and both stacked
        row-wise.  All three use ``column`` as their single column name.
    """
    seeded = pd.concat([pd.DataFrame([L_0], columns=['data_analysis(korea)']), train])
    fitted = seeded.ewm(alpha=alpha, adjust=False).mean().iloc[1:]
    fitted.columns = [column]
    forecast = pd.DataFrame(np.array([fitted.iloc[-1]]*len(test)),
                            index=test.index, columns=[column])
    return fitted, forecast, pd.concat([fitted, forecast], axis=0)


SES_train_pred_09, SES_test_pred_09, prediction_09 = _ses_with_ewm(0.9, 'SES_09')
SES_train_pred_05, SES_test_pred_05, prediction_05 = _ses_with_ewm(0.5, 'SES_05')
SES_train_pred_01, SES_test_pred_01, prediction_01 = _ses_with_ewm(0.1, 'SES_01')

In [None]:
'''
Visualization 
'''
# Figure 1: raw series with the three SES curves (alpha = 0.9, 0.5, 0.1),
# each covering the train fit followed by the flat test forecast.
fig, ax = plt.subplots(figsize=(12,4))
data.plot(ax=ax)
prediction_09.plot(ax=ax, label = 'Prediction (alpha=0.9)')
prediction_05.plot(ax=ax, label = 'Prediction (alpha=0.5)')
prediction_01.plot(ax=ax, label = 'Prediction (alpha=0.1)')

# Dashed red vertical line at the first test timestamp, spanning y = 0..100.
ax.vlines(test.index[0], 0, 100, linestyle='--', color='r')
# Legend labels are given positionally, in the order the artists were drawn.
ax.legend(['Raw Dataset', 'Prediction (alpha=0.9)', 'Prediction (alpha=0.5)', 'Prediction (alpha=0.1)', 'Start of Forecast'], loc='upper left')
plt.suptitle('Exponential Moving Average Results')
plt.tight_layout()
plt.show()

# Figure 2: the same curves restricted to the 2020-01 .. 2022-01 window.
fig, ax = plt.subplots(figsize=(12,4))
data['2020-01':'2022-01'].plot(ax=ax)
prediction_09['2020-01':'2022-01'].plot(ax=ax, label = 'Prediction (alpha=0.9)')
prediction_05['2020-01':'2022-01'].plot(ax=ax, label = 'Prediction (alpha=0.5)')
prediction_01['2020-01':'2022-01'].plot(ax=ax, label = 'Prediction (alpha=0.1)')

# Here the marker line spans y = 50..100 instead of 0..100.
ax.vlines(test.index[0], 50, 100, linestyle='--', color='r')
ax.legend(['Raw Dataset', 'Prediction (alpha=0.9)', 'Prediction (alpha=0.5)', 'Prediction (alpha=0.1)', 'Start of Forecast'], loc='upper left')
plt.suptitle('Exponential Moving Average Results')
plt.tight_layout()
plt.show()

#### 4.2 Double Exponential Smoothing (이중지수평활법)

In [None]:
'''
Searching Initialization Points
'''
# Estimate the starting level and trend for double exponential smoothing by
# regressing the training values on their row position (0, 1, 2, ...).
train_reg = train.reset_index()
x = np.array(train_reg.index)
y = train_reg['data_analysis(korea)']

# Add the intercept column, then fit ordinary least squares.
x = sm.add_constant(x)
model = sm.OLS(y,x)
results = model.fit()

print(results.summary())
# Unpack the two fitted coefficients as initial level and initial trend
# (presumably intercept first, slope second - verify against the summary).
# NOTE: this rebinds L_0, which the simple-exponential-smoothing cells also
# use; re-running those cells after this one picks up the new value.
L_0, B_0 = results.params

In [None]:
# Working table: one extra leading row (index label 0, observation 0) that
# carries only the initial level and trend, followed by the training data.
seed_row = pd.DataFrame(np.zeros(1), columns=['data_analysis(korea)'])
DES_train = pd.concat([seed_row, train.copy()])

# Empty result columns, in the order Level, Trend, Forecast.
for result_column in ('Level', 'Trend', 'Forecast'):
    DES_train[result_column] = np.nan

# Seed the extra first row with the regression-based starting values.
DES_train.loc[0, 'Level'] = L_0
DES_train.loc[0, 'Trend'] = B_0

DES_train

### Q. 지수평활법
 - Double_Exponential_Smoothing 함수 내 Level 과 Trend 변수 생성
 - 총 4개의 Answer는 'x', 'L_prev', 'T_prev' 3개 값으로 구성

In [100]:
def Double_Exponential_Smoothing(table, alpha, beta):
    """Fill the Level and Trend columns of ``table`` and forecast the test split.

    Row 0 of ``table`` is the initialization row and is skipped; every later
    row gets a new ``'Level'`` and ``'Trend'`` computed from the current
    observation and the previous row's level and trend.

    The four ``'''Answer'''`` placeholders are exercise blanks.  Per the task
    statement they are to be built from ``x``, ``L_prev`` and ``T_prev``; the
    function cannot run until they are replaced.

    Args:
        table: DataFrame with ``'data_analysis(korea)'``, ``'Level'`` and
            ``'Trend'`` columns; modified in place.  The final level and
            trend are read by column position (1 and 2).
        alpha: Weight used in the level update.
        beta: Weight used in the trend update.

    Returns:
        ``(table, DES_test_pred)`` where ``DES_test_pred`` is a one-column
        DataFrame named ``'DES'`` indexed like the notebook-level ``test``.
    """
    # Example arguments for interactive debugging:
    # table = DES_train.copy()
    # alpha = 0.16
    # beta = 0.1
    for i in range(len(table)):
        if i == 0: # Skip initialization point
            continue

        # Current observation and the previous row's level / trend.
        x = table.loc[list(table.index)[i],'data_analysis(korea)']
        L_prev = table.loc[list(table.index)[i-1],'Level']
        T_prev = table.loc[list(table.index)[i-1],'Trend']
        
        # Exercise blanks: level update, then trend update using the new level.
        table.loc[list(table.index)[i],'Level'] = alpha*'''Answer''' + (1-alpha)*'''Answer'''
        table.loc[list(table.index)[i],'Trend'] = beta*(table.loc[list(table.index)[i],'Level']-'''Answer''') + (1-beta)*'''Answer'''

        # NOTE(review): the forecast below sits inside the loop, so it is
        # rebuilt on every iteration and is undefined at `return` when the
        # table holds only the initialization row. It reads the last row,
        # so only the value from the final iteration is based on a fully
        # filled table.
        L_Pred = table.iloc[-1,1]
        T_Pred = table.iloc[-1,2]
        
        # Linear extrapolation over the notebook-level `test` split.
        # NOTE(review): the step offsets run 0 .. len(test)-1, so the first
        # forecast equals the last level with no trend step added - confirm
        # whether offsets starting at 1 were intended.
        DES_test_pred = L_Pred + range(len(test))*T_Pred
        DES_test_pred = pd.DataFrame(DES_test_pred, index = test.index, columns=['DES'])
        
    return table, DES_test_pred

In [101]:
# Fit on the prepared table with alpha = 0.16, beta = 0.1 and get the test forecast.
DES_train_pred, DES_test_pred = Double_Exponential_Smoothing(table=DES_train, alpha=0.16, beta=0.1)

In [None]:
# Show the fitted training table followed by the test forecast.
print(
    'Double Exponential Smoothing Train Results',
    DES_train_pred,
    '-'*30,
    'Double Exponential Smoothing Test results',
    DES_test_pred,
    sep='\n',
)

In [None]:
'''
Visualization 
'''
# Train and test curve in one frame: take the column at position 1 of the
# fitted table (skipping the initialization row), rename it to 'DES', and
# append the test forecast.
train_pred = pd.DataFrame(DES_train_pred.iloc[1:,1])
train_pred.columns = ['DES']
DES_pred = pd.concat([train_pred, DES_test_pred])

# Raw series with the DES curve overlaid.
fig, ax = plt.subplots(figsize=(12,4))
data.plot(ax=ax)
DES_pred.plot(ax=ax, label = 'Prediction (alpha=0.16, beta=0.1)')
# Dashed red vertical line at the first test timestamp, spanning y = 0..100.
ax.vlines(test.index[0], 0, 100, linestyle='--', color='r')
# Legend labels are given positionally, in the order the artists were drawn.
ax.legend(['Raw Dataset', 'DEMA (alpha=0.16, beta=0.1)', 'Start of Forecast'], loc='upper left')
plt.suptitle('Double Exponential Moving Average Results')
plt.tight_layout()
plt.show()

In [None]:
'''
Quantitative Evaluation
'''
# Test-split error metrics for the double exponential smoothing forecast.
des_mse = mean_squared_error(test, DES_test_pred)
des_mae = mean_absolute_error(test, DES_test_pred)
des_r2 = r2_score(test, DES_test_pred)

print('-'*30)
print('alpha  = 0.16, beta = 0.1')
print(f'MSE: {np.round(des_mse, 2)}')
print(f'RMSE: {np.round(np.sqrt(des_mse), 2)}')
print(f'MAE: {np.round(des_mae, 2)}')
print(f'R2 score: {np.round(des_r2, 2)}')

In [None]:
'''
Comparing
'''
# Test-period comparison (2020-01 onward): raw data against the MA, SES
# (alpha = 0.9) and DES forecasts.
fig, ax = plt.subplots(figsize=(12,4))
data['2020-01':].plot(ax=ax)
MA_test_pred['2020-01':].plot(ax=ax, label = 'Prediction (N=5)')
SES_test_pred_09['2020-01':].plot(ax=ax, label = 'Prediction (alpha=0.9)')
DES_test_pred['2020-01':].plot(ax=ax, label = 'Prediction (alpha=0.16, beta=0.1)')
# Legend labels are given positionally, in the order the lines were drawn.
ax.legend(['Raw Dataset', 'MA (N=5)', 'SES (alpha=0.9)', 'DES (alpha=0.16, beta=0.1)'], loc='upper left')
# NOTE(review): the title names only the double-exponential method although
# three methods are plotted - confirm whether that is intended.
plt.suptitle('Double Exponential Moving Average Results')
plt.tight_layout()
plt.show()

#### 4.3 Holt-Winter's Exponential Smoothing (홀트-윈터 지수평활법)

##### 4.3.1 Additive/Multiplicative Winter's method

### Q. 홀트-윈터 지수평활법
 - Additive 홀트-윈터 지수 평활법을 활용한 분석 과정일 때, Answer 값 작성
 - Answer는 'add', 'mul' 값 중 하나로 구성

In [None]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing
# Exercise blanks: replace each '''Answer''' with 'add' or 'mul'. Per the task
# statement, this analysis uses the additive Holt-Winters method.
# fit(optimized=True) lets statsmodels estimate the smoothing parameters.
HW_model = ExponentialSmoothing(train, trend='''Answer''', seasonal='''Answer''').fit(optimized=True)
# Display the fitted-model summary.
HW_model.summary()

In [None]:
# In-sample fitted values on the training split.
HW_train_pred = HW_model.fittedvalues

# Forecast as many steps ahead as there are test rows, then re-label the
# result as a one-column frame on the test index.
n_test_steps = len(test)
HW_test_pred = HW_model.forecast(n_test_steps)
HW_test_pred = pd.DataFrame(HW_test_pred, index=test.index, columns=['HW_add'])

print(
    'Holt-Winter Exponential Smoothing Train Results',
    HW_train_pred,
    '-'*30,
    'Holt-Winter Smoothing Test results',
    HW_test_pred,
    sep='\n',
)

In [None]:
'''
Visualization 
'''
# Train and test curve in one frame: fitted values as column 'HW_add',
# followed by the test forecast.
train_pred = pd.DataFrame(HW_train_pred, columns=['HW_add'])
HW_pred = pd.concat([train_pred, HW_test_pred])

# Raw series with the Holt-Winters curve overlaid.
fig, ax = plt.subplots(figsize=(12,4))
data.plot(ax=ax)
HW_pred.plot(ax=ax, label = 'Prediction')
# Dashed red vertical line at the first test timestamp, spanning y = 0..100.
ax.vlines(test.index[0], 0, 100, linestyle='--', color='r')
# Legend labels are given positionally, in the order the artists were drawn.
ax.legend(['Raw Dataset', 'HW', 'Start of Forecast'], loc='upper left')
plt.title('Holt-Winter Exponential Moving Average Results')
plt.tight_layout()
plt.show()

In [None]:
'''
Comparing
'''
# Overlay the test-period forecasts of all four methods (moving average,
# simple, double and Holt-Winter exponential smoothing) on the full raw series.
fig, ax = plt.subplots(figsize=(12,4))
data.plot(ax=ax)
MA_test_pred['2020-01':].plot(ax=ax, label = 'Prediction (N=5)')
SES_test_pred_09['2020-01':].plot(ax=ax, label = 'Prediction (alpha=0.9)')
DES_test_pred['2020-01':].plot(ax=ax, label = 'Prediction (alpha=0.16, beta=0.1)')
HW_test_pred['2020-01':].plot(ax=ax, label = 'Prediction (additive)')
# Dashed red vertical line at the first test timestamp, spanning y = 0..100.
ax.vlines(test.index[0], 0, 100, linestyle='--', color='r')
# Legend labels are given positionally, in the order the lines were drawn.
ax.legend(['Raw Dataset', 'MA (N=5)', 'SES (alpha=0.9)', 'DES (alpha=0.16, beta=0.1)', 'HW (Additive)'], loc='upper left')
# The title previously read 'Double Exponential Moving Average Results',
# carried over from the DES section; this figure compares all four methods.
plt.title('Comparison of MA, SES, DES and Holt-Winter Forecasts')
plt.tight_layout()
plt.show()

In [None]:
'''
Quantitative Evaluation
'''
# Test-split MSE / RMSE / MAE for each method, printed in a fixed order.
evaluated_forecasts = [
    ('Moving Average (N = 5)', MA_test_pred),
    ('Simple Exponential Smoothing (alpha = 0.9)', SES_test_pred_09),
    ('Double Exponential Smoothing (alpha = 0.16, beta = 0.1)', DES_test_pred),
    ('Holt-Winter Exponential Smoothing', HW_test_pred),
]

for heading, forecast in evaluated_forecasts:
    forecast_mse = mean_squared_error(test, forecast)
    forecast_mae = mean_absolute_error(test, forecast)
    print('-'*55)
    print(heading)
    print(f'MSE: {np.round(forecast_mse, 2)}')
    print(f'RMSE: {np.round(np.sqrt(forecast_mse), 2)}')
    print(f'MAE: {np.round(forecast_mae, 2)}')

In [None]:
# Side-by-side table: observed test values next to each method's forecast.
forecast_frames = [test, MA_test_pred, SES_test_pred_09, DES_test_pred, HW_test_pred]
test_predict_results = pd.concat(forecast_frames, axis='columns')
test_predict_results

### Q. 결론
 - 상기 4가지 시계열 분석 모델의 성능 비교를 통해 본 데이터에 가장 적합한 모델을 고른 후,
 - Quantitative Evaluation를 참고하여 선정 이유 서술

# EOD