[辻真吾・矢吹太朗『ゼロからはじめるデータサイエンス入門』（講談社, 2021）](https://github.com/taroyabuki/fromzero)

In [None]:
# Environment setup for Google Colaboratory
import os
if 'COLAB_GPU' in os.environ:  # only run the install when this variable is set
  # IPython shell escape; `| tail -n 1` keeps only the last line of pip's output.
  !python -m pip install h2o pandarallel pca pmdarima | tail -n 1

In [None]:
# Environment setup for Google Colaboratory
import os
if 'COLAB_GPU' in os.environ:  # only run the install when this variable is set
  # IPython shell escape; `| tail -n 1` keeps only the last line of pip's output.
  !python -m pip install pmdarima | tail -n 1

## 12.1 日時と日時の列

In [None]:
import pandas as pd
# Parse a date string with pandas.
pd.to_datetime('2020-01-01')

In [None]:
# The frequencies below are passed as offset objects instead of alias strings.
# The string aliases 'A', 'AS', 'M' and 'H' were deprecated in pandas 2.2
# (replaced by 'YE', 'YS', 'ME' and 'h'), whereas the offset classes give the
# same result on both older and newer pandas releases.

# Every year end (alias 'YE'; 'A' before pandas 2.2)
pd.date_range(start='2021-01-01', end='2023-01-01', freq=pd.offsets.YearEnd(1))

# Every year start (alias 'YS'; 'AS' before pandas 2.2)
pd.date_range(start='2021-01-01', end='2023-01-01', freq=pd.offsets.YearBegin(1))

# Every second month end (alias '2ME'; '2M' before pandas 2.2)
pd.date_range(start='2021-01-01', end='2021-03-01', freq=pd.offsets.MonthEnd(2))

# Every second month start (alias '2MS')
pd.date_range(start='2021-01-01', end='2021-03-01', freq=pd.offsets.MonthBegin(2))

# Every day (the 'D' alias is unchanged)
pd.date_range(start='2021-01-01', end='2021-01-03', freq='1D')

# Every second hour (alias '2h'; '2H' before pandas 2.2)
pd.date_range(start='2021-01-01 00:00:00', end='2021-01-01 03:00:00', freq=pd.offsets.Hour(2))

## 12.2 時系列データの予測

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from pmdarima.datasets import airpassengers
from sklearn.metrics import mean_squared_error

# Load the air-passengers dataset that ships with pmdarima.
my_data = airpassengers.load_airpassengers()

In [None]:
n = len(my_data) # number of observations (144)
k = 108          # number of training observations

In [None]:
# Month-start time stamps beginning in January 1949, one per observation.
# periods=n ties the index length to the data length, so the frame is built
# consistently without relying on a hard-coded end date matching n.
my_ds = pd.date_range(start='1949/01/01', periods=n, freq='MS')

# Columns: 'ds' = time stamp, 'x' = 0-based time step, 'y' = observed value.
# The same time stamps also serve as the row index.
my_df = pd.DataFrame(
    {'ds': my_ds, 'x': range(n), 'y': my_data},
    index=my_ds)
my_df.head()

In [None]:
# Chronological split: the first k rows are for training, the rest for testing.
# Slicing from position k (instead of the negative offset -(n - k)) stays
# correct when k == n: -(n - k) would then be -0 and select every row.
my_train = my_df.iloc[:k]
my_test = my_df.iloc[k:]
y = my_test.y  # observed test values

In [None]:
# Draw the y column of the training and test parts as separately labelled lines.
for part, name in ((my_train, 'train'), (my_test, 'test')):
    plt.plot(part.y, label=name)
plt.legend()

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a straight line y ~ x on the training part.
my_lm_model = LinearRegression().fit(my_train[['x']], my_train.y)

# Predict for the test part.
X = my_test[['x']]
y_ = my_lm_model.predict(X)

# RMSE (test): square root of the mean squared error.
mean_squared_error(y, y_) ** 0.5

In [None]:
# Predictions of the linear model for every row of my_df, indexed by date.
y_ = my_lm_model.predict(my_df[['x']])
tmp = pd.DataFrame(y_, index=my_df.index)

# Training data, test data and the fitted line on one set of axes.
for series, name in ((my_train.y, 'train'),
                     (my_test.y, 'test'),
                     (tmp, 'model')):
    plt.plot(series, label=name)
plt.legend()

In [None]:
import pmdarima as pm
# Fit an ARIMA model to the training series with pmdarima's automatic search.
# m=12: seasonal period of 12 observations; trace=True: print the search progress.
my_arima_model = pm.auto_arima(my_train.y, m=12, trace=True)

In [None]:
# Forecast as many periods as there are test rows and also request the
# interval bounds (alpha=0.05 is the significance level, the default).
horizon = len(my_test)
y_, my_ci = my_arima_model.predict(horizon, alpha=0.05, return_conf_int=True)

# Collect the forecast and the two interval columns, indexed like the test data.
tmp = pd.DataFrame(
    {'y': y_, 'Lo': my_ci[:, 0], 'Hi': my_ci[:, 1]},
    index=my_test.index)
tmp.head()

In [None]:
# RMSE (test): square root of the mean squared error of the current predictions y_.
mean_squared_error(y, y_)**0.5

In [None]:
# Training data, test data and the forecast as labelled lines.
for series, name in ((my_train.y, 'train'),
                     (my_test.y, 'test'),
                     (tmp.y, 'model')):
    plt.plot(series, label=name)

# Shade the band between the Lo and Hi columns; alpha sets the opacity.
plt.fill_between(tmp.index, tmp.Lo, tmp.Hi, alpha=0.25)
plt.legend(loc='upper left')

In [None]:
# Import Prophet from whichever package name is installed:
# 'fbprophet' is tried first, 'prophet' is the fallback.
try:
    from fbprophet import Prophet
except ImportError:
    from prophet import Prophet

# Fit a model with multiplicative seasonality to the training data.
my_prophet_model = Prophet(seasonality_mode='multiplicative')
my_prophet_model.fit(my_train)

In [None]:
# Predict for the test rows and show the first rows of the date, the forecast
# (yhat) and its lower/upper columns.
tmp = my_prophet_model.predict(my_test)
tmp[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].head()

In [None]:
# RMSE (test) of the Prophet forecast.
y_ = tmp.yhat
mean_squared_error(y, y_)**0.5

In [None]:
# my_prophet_model.plot(tmp) # sufficient when only the forecast is needed

# Start from Prophet's own forecast figure, then overlay the observed values
# on its first axes: training data in the default colour, test data in red.
fig = my_prophet_model.plot(tmp)
ax = fig.axes[0]
ax.plot(my_train.ds, my_train.y)
ax.plot(my_test.ds, my_test.y, color='red')