In [1]:
import statsmodels as sm
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

In [2]:
def load_data():
    df_read = sm.datasets.nile.load_pandas().data
    s_date = pd.Series(
        [pd.to_datetime(str(int(y_str))) for y_str in df_read['year']]
    )
    df = df_read.set_index(s_date)
    df = df.drop('year', axis=1)
    return df

In [3]:
def step_forward(rf_fitted, X, in_out_sep=67, pred_len=33):
    nlags = 3
    idx = in_out_sep - nlags - 1
    lags123 = np.asarray([X[idx, 0],
                          X[idx, 1],
                          X[idx, 2]])
    x_pred_hist = []
    for i in range(nlags + pred_len):
        x_pred = rf_fitted.predict([lags123])
        if i > nlags:
            x_pred_hist.append(x_pred)
        lags123[0] = lags123[1]
        lags123[1] = lags123[2]
        lags123[2] = x_pred
    x_pred_np = np.asarray(x_pred_hist).squeeze()
    return x_pred_np

In [4]:
# データロード
df_nile = load_data()
df_nile['lag1'] = df_nile['volume'].shift(1) 
df_nile['lag2'] = df_nile['volume'].shift(2)
df_nile['lag3'] = df_nile['volume'].shift(3)
# lag の欠損を除去
df_nile = df_nile.dropna()
df_nile.shape

(97, 4)

In [5]:
# データセット分割
X_train = df_nile[['lag1', 'lag2', 'lag3']][:67].values
X_test = df_nile[['lag1', 'lag2', 'lag3']][67:].values
y_train = df_nile['volume'][:67].values
y_test = df_nile['volume'][67:].values

In [6]:
# 学習
r_forest = RandomForestRegressor(
    n_estimators=100,
    criterion='mse',
    random_state=1,
    n_jobs=-1
)
r_forest.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [7]:
# 予測
x_pred_np = step_forward(r_forest, np.vstack([X_train, X_test]),pred_len=31)

In [8]:
# 精度
acc = np.sqrt(np.sum((x_pred_np - y_test)**2))
print(acc)

681.1883936914955
