# モデル運用 - ローリング予測の比較

### Refit: 
- 予測時点で新しく観測されたデータを学習データに加え、モデルを再学習し、予測を実施

### Test Time Augmentation(TTA): 
- モデルはそのままで、データをアップデートして（予測時点で新しく観測されたデータからLag変数を作成）予測を実施

[参照元](https://github.com/h2oai/driverlessai-tutorials/blob/master/driverlessai_experiments/timeseries/walmart_timeseries_experiment/timeseries_model_rollingwindow.ipynb)

In [1]:
from collections import OrderedDict
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import driverlessai

データはDriverlessAIサーバの`/data/TimeSeries/walmart/`配下

In [2]:
sales_data = pd.read_csv("../tmp/walmart_train.csv")
print(sales_data.shape)
sales_data.head()

(421570, 15)


Unnamed: 0,Store,Dept,Date,Weekly_Sales,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,sample_weight
0,1,1,2010-02-05,24924.5,42.31,2.572,-1.0,-1.0,-1.0,-1.0,-1.0,211.096358,8.106,0,1
1,1,2,2010-02-05,50605.27,42.31,2.572,-1.0,-1.0,-1.0,-1.0,-1.0,211.096358,8.106,0,1
2,1,3,2010-02-05,13740.12,42.31,2.572,-1.0,-1.0,-1.0,-1.0,-1.0,211.096358,8.106,0,1
3,1,4,2010-02-05,39954.04,42.31,2.572,-1.0,-1.0,-1.0,-1.0,-1.0,211.096358,8.106,0,1
4,1,5,2010-02-05,32229.38,42.31,2.572,-1.0,-1.0,-1.0,-1.0,-1.0,211.096358,8.106,0,1


In [3]:
# Preprocessing: parse the dates, then swap each exogenous weekly variable for
# its previous-row value within the same (Store, Dept) series.
sales_data["Date"] = pd.to_datetime(sales_data["Date"], format="%Y-%m-%d")

lag_variables = ["Temperature", "Fuel_Price", "CPI", "Unemployment"]
dai_data = sales_data.set_index(["Date", "Store", "Dept"])

# shift(1) is positional, so this is a one-week lag only if the rows of each
# group are already in ascending Date order -- TODO confirm for new data.
lagged_data = dai_data[lag_variables].groupby(level=["Store", "Dept"]).shift(1)

# Attach the "<name>_lag" columns and discard the unlagged originals.
lag_names = {name: name + "_lag" for name in lag_variables}
dai_data = (
    dai_data
    .join(lagged_data.rename(columns=lag_names))
    .drop(lag_variables, axis=1)
    .reset_index()
)

dai_data.head()

Unnamed: 0,Date,Store,Dept,Weekly_Sales,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,IsHoliday,sample_weight,Temperature_lag,Fuel_Price_lag,CPI_lag,Unemployment_lag
0,2010-02-05,1,1,24924.5,-1.0,-1.0,-1.0,-1.0,-1.0,0,1,,,,
1,2010-02-05,1,2,50605.27,-1.0,-1.0,-1.0,-1.0,-1.0,0,1,,,,
2,2010-02-05,1,3,13740.12,-1.0,-1.0,-1.0,-1.0,-1.0,0,1,,,,
3,2010-02-05,1,4,39954.04,-1.0,-1.0,-1.0,-1.0,-1.0,0,1,,,,
4,2010-02-05,1,5,32229.38,-1.0,-1.0,-1.0,-1.0,-1.0,0,1,,,,


前処理は[時系列予測実行例](./DAI_PyClient_TS_example.ipynb)を参考にしている

In [4]:
dai_data['Date'].nunique()

143

In [5]:
print( dai_data['Date'].min() )
print( dai_data['Date'].max() )

2010-02-05 00:00:00
2012-10-26 00:00:00


## Refitの実施

In [6]:
def get_moving_windows(dataset, train_len, test_len, date_col):
    '''
    Build the train/test date ranges for rolling (multi-window) validation.

    One cycle spans ``train_len + test_len`` consecutive unique dates. Each
    successive window starts ``test_len`` dates after the previous one, so the
    test periods of consecutive windows never overlap. The number of windows
    is how many such steps fit before the dates run out.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data containing ``date_col``.
    train_len : int
        Number of unique dates in each training period.
    test_len : int
        Number of unique dates in each test period; also the step between
        consecutive windows.
    date_col : str
        Name of the date column.

    Returns
    -------
    list of dict
        One dict per window with the keys ``train_start_date``,
        ``train_end_date``, ``test_start_date`` and ``test_end_date``
        (all bounds inclusive). Empty when no complete window fits.
    '''
    unique_dates = dataset[date_col].unique()    # every distinct date in the data
    unique_dates.sort()
    num_dates = len(unique_dates)
    num_windows = (num_dates - train_len) // test_len   # number of rolling splits
    print("Number of Training Windows: ", num_windows)

    windows = []
    for i in range(num_windows):
        # Advance by test_len (not by 1) so the positions used here agree with
        # the window count computed above when test_len > 1.
        offset = i * test_len
        windows.append({
            'train_start_date': unique_dates[offset],
            'train_end_date': unique_dates[offset + train_len - 1],
            'test_start_date': unique_dates[offset + train_len],
            'test_end_date': unique_dates[offset + train_len + test_len - 1],
        })

    return windows

In [7]:
#get_moving_windows(dai_data, 139, 1, "Date")

In [8]:
# Multi Window検証の学習と検証データの日付をDataFrameで表示
pd.DataFrame([OrderedDict(x) for x in get_moving_windows(dai_data, 139, 1, "Date")])    # テスト期間は１週のみとする

Number of Training Windows:  4


Unnamed: 0,train_start_date,train_end_date,test_start_date,test_end_date
0,2010-02-05,2012-09-28,2012-10-05,2012-10-05
1,2010-02-12,2012-10-05,2012-10-12,2012-10-12
2,2010-02-19,2012-10-12,2012-10-19,2012-10-19
3,2010-02-26,2012-10-19,2012-10-26,2012-10-26


#### TrainとTest期間がローリング(4 windows)されている。Refitでは毎回、Train期間のデータでモデルが一から学習され、そのモデルでTest期間の予測が実施される。

In [14]:
def dai_get_forecast(train_data, test_data, predictors, target, date_col, time_group_cols):
    '''
    Train a Driverless AI experiment on train_data, score test_data with it,
    and return the predictions as a pandas.DataFrame.

    train_data and test_data are local pandas.DataFrames. Both are written to
    fixed CSV paths under ../tmp and uploaded through the module-level
    Driverless AI client ``h2oai``, which must already be connected.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Training and test rows; each must contain every column named below.
    predictors : list of str
        Feature columns to upload.
    target : str
        Target column.
    date_col : str
        Time column.
    time_group_cols : list of str
        Columns identifying the individual time series.

    Returns
    -------
    pandas.DataFrame
        Contents of the downloaded prediction CSV.
    '''
    # Save the training and test data locally
    train_path = "../tmp/tta_refit__train_data.csv"
    test_path = "../tmp/tta_refit__test_data.csv"
    # Drop duplicate names but keep first-seen order: list(set(...)) would
    # order the CSV columns differently from one interpreter run to the next.
    upload_cols = list(dict.fromkeys(predictors + [target, date_col] + time_group_cols))
    train_subset = train_data[upload_cols]
    test_subset = test_data[upload_cols]
    print('Train data shape: {}'.format(train_subset.shape))
    print('Test data shape: {}'.format(test_subset.shape))
    train_subset.to_csv(train_path, index=False)
    test_subset.to_csv(test_path, index=False)

    # Upload the data to Driverless AI
    train_dai = h2oai.datasets.create(train_path, force=True)
    test_dai = h2oai.datasets.create(test_path, force=True)

    # Run the experiment
    settings = {
        'test_dataset': test_dai,    # optional; the test data need not be specified
        'time_column': date_col,
        'time_groups_columns': time_group_cols,
        # forecast horizon = number of distinct dates in the test period
        'num_prediction_periods': test_data[date_col].nunique(),
        'num_gap_periods': 0,
        'accuracy': 1,
        'time': 1,
        'interpretability': 10,
        'scorer': "RMSE",
    }
    experiment = h2oai.experiments.create(train_dataset=train_dai,
                                          target_column=target,
                                          task='regression',
                                          force=True,
                                          **settings)
    print('Is this experiment completed? : {}'.format(experiment.is_complete()))

    # Predict on the test data, echoing back only the columns needed to
    # evaluate the forecast.
    echo_cols = [target, date_col] + time_group_cols
    prediction = experiment.predict(dataset=test_dai, include_columns=echo_cols)
    result_data_name = '../tmp/wm_test_data_RES.csv'
    prediction.download(dst_dir='.', dst_file=result_data_name, overwrite=True)

    return pd.read_csv(result_data_name)

In [15]:
# Driverless AIのuser nameとpasswordの読み込み
import json
with open('../idpass.json') as f:
    idpass = json.load(f)

In [16]:
# Driverless AIサーバーへの接続
h2oai = driverlessai.Client(address='http://34.203.218.123:12345', username=idpass['id'], password=idpass['pass1921'])
h2oai

<class 'driverlessai._core.Client'> http://34.203.218.123:12345

***
#### dai_get_forecast関数のテスト実施  
1Windowだけ実施

In [17]:
date_dict = get_moving_windows(dai_data, 139, 1, "Date")[0]
date_dict   # テスト実施する最初のWindow

Number of Training Windows:  4


{'train_start_date': numpy.datetime64('2010-02-05T00:00:00.000000000'),
 'train_end_date': numpy.datetime64('2012-09-28T00:00:00.000000000'),
 'test_start_date': numpy.datetime64('2012-10-05T00:00:00.000000000'),
 'test_end_date': numpy.datetime64('2012-10-05T00:00:00.000000000')}

In [18]:
# Slice the first window out of the full data; both period bounds are inclusive.
date_values = dai_data['Date']
train_mask = (date_values >= date_dict['train_start_date']) & (date_values <= date_dict['train_end_date'])
test_mask = (date_values >= date_dict['test_start_date']) & (date_values <= date_dict['test_end_date'])
train_data_local = dai_data[train_mask]
test_data_local = dai_data[test_mask]

predictors = [
    "MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5", "IsHoliday",
    "Temperature_lag", "Fuel_Price_lag", "CPI_lag", "Unemployment_lag",
]

test_pred = dai_get_forecast(
    train_data=train_data_local,
    test_data=test_data_local,
    predictors=predictors,
    target='Weekly_Sales',
    date_col='Date',
    time_group_cols=["Store", "Dept"],
)
print(test_pred.shape)
test_pred.head()    # one week of test predictions, returned as a local pandas.DataFrame

Train data shape: (409695, 14)
Test data shape: (2976, 14)
Complete 100.00% - [4/4] Computed stats for column CPI_lag
Complete 100.00% - [4/4] Computed stats for column MarkDown3
Experiment launched at: http://34.203.218.123:12345/#/experiment?key=805cb456-b449-11eb-a33c-0242ac110002
Complete 100.00% - Status: Complete                                                
Is this experiment completed? : True
Complete
Downloaded '../tmp/wm_test_data_RES.csv'
(2976, 7)


Unnamed: 0,Weekly_Sales,Dept,Store,Date,Weekly_Sales.predicted,Weekly_Sales.predicted.lower,Weekly_Sales.predicted.upper
0,21904.47,1,1,2012-10-05,19068.201,1274.277656,33210.344137
1,48577.08,2,1,2012-10-05,46046.13,42492.829406,50226.264695
2,11676.98,3,1,2012-10-05,14077.924,10552.320883,22630.977262
3,39311.93,4,1,2012-10-05,37381.54,33228.004227,41720.183117
4,25508.81,5,1,2012-10-05,22094.934,8953.028156,34228.29002


***

#### ローリング予測（４回）の実施

In [19]:
windows = get_moving_windows(dai_data, train_len = 139, test_len = 1, date_col = "Date")

Number of Training Windows:  4


In [20]:
# Rolling Refit forecast: retrain on every window and score its test period.
predictors = ["MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5", "IsHoliday",
              "Temperature_lag", "Fuel_Price_lag", "CPI_lag", "Unemployment_lag"]

# Collect each window's predictions and concatenate once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
window_predictions = []

for window in windows:   # one Driverless AI experiment per rolling window
    train_data_local = dai_data[(dai_data["Date"] >= window["train_start_date"]) &
                                (dai_data["Date"] <= window["train_end_date"])]
    test_data_local = dai_data[(dai_data["Date"] >= window["test_start_date"]) &
                               (dai_data["Date"] <= window["test_end_date"])]
    print('Test Data date: {}'.format(test_data_local['Date'].unique()))

    # Run the Driverless AI forecast for this window
    preds = dai_get_forecast(train_data=train_data_local,
                             test_data=test_data_local,
                             predictors=predictors, target='Weekly_Sales',
                             date_col='Date',
                             time_group_cols=["Store", "Dept"])
    window_predictions.append(preds)

# Row labels are kept as downloaded (each window restarts at 0), exactly as the
# append-based version did. pd.concat rejects an empty list, so fall back to
# an empty frame when there are no windows.
forecast_predictions = pd.concat(window_predictions) if window_predictions else pd.DataFrame([])

Test Data date: ['2012-10-05T00:00:00.000000000']
Train data shape: (409695, 14)
Test data shape: (2976, 14)
Complete 100.00% - [4/4] Computed stats for column CPI_lag
Complete 100.00% - [4/4] Computed stats for column MarkDown3
Experiment launched at: http://34.203.218.123:12345/#/experiment?key=a860c00e-b44a-11eb-a33c-0242ac110002
Running 100.00% - Status: Complete                                                 
Is this experiment completed? : True
Complete
Downloaded '../tmp/wm_test_data_RES.csv'
Test Data date: ['2012-10-12T00:00:00.000000000']
Train data shape: (409716, 14)
Test data shape: (2990, 14)
Running 100.00% - [4/4] Computed stats for column CPI_lag
Complete 100.00% - [4/4] Computed stats for column MarkDown3
Experiment launched at: http://34.203.218.123:12345/#/experiment?key=8b05fd34-b44b-11eb-a33c-0242ac110002
Complete 100.00% - Status: Complete                                                
Is this experiment completed? : True
Complete
Downloaded '../tmp/wm_test_dat

In [21]:
forecast_predictions.shape

(11875, 7)

In [22]:
# テストデータは各1日のみ。合計4つのテストデータWindow
forecast_predictions['Date'].unique()

array(['2012-10-05', '2012-10-12', '2012-10-19', '2012-10-26'],
      dtype=object)

In [23]:
forecast_predictions.head()

Unnamed: 0,Weekly_Sales,Dept,Store,Date,Weekly_Sales.predicted,Weekly_Sales.predicted.lower,Weekly_Sales.predicted.upper
0,21904.47,1,1,2012-10-05,19164.223,1087.738789,33253.242086
1,48577.08,2,1,2012-10-05,46317.516,42595.535586,50645.047117
2,11676.98,3,1,2012-10-05,14119.062,10600.086898,22705.786588
3,39311.93,4,1,2012-10-05,37001.637,32999.175672,41430.120812
4,25508.81,5,1,2012-10-05,22046.75,8944.144172,34217.192461


In [24]:
# Store=1, Dept=1のみ表示
forecast_predictions.loc[(forecast_predictions["Store"] == 1) & (forecast_predictions["Dept"] == 1),:]

Unnamed: 0,Weekly_Sales,Dept,Store,Date,Weekly_Sales.predicted,Weekly_Sales.predicted.lower,Weekly_Sales.predicted.upper
0,21904.47,1,1,2012-10-05,19164.223,1087.738789,33253.242086
0,22764.01,1,1,2012-10-12,21454.732,3563.688594,36387.560445
0,24185.27,1,1,2012-10-19,22815.848,4026.443086,37206.846285
0,27390.81,1,1,2012-10-26,23584.477,6803.611633,37111.713668


## Test Time Augmentation(TTA)の実施

#### TTAでは、最初の139週でモデル学習（予測期間は１週先）を行い、残りの4週はその同じモデルを利用する

In [25]:
# 最後の4週
date_show = dai_data['Date'].unique()
date_show.sort()
print(date_show[-4:])

['2012-10-05T00:00:00.000000000' '2012-10-12T00:00:00.000000000'
 '2012-10-19T00:00:00.000000000' '2012-10-26T00:00:00.000000000']


In [26]:
# 学習期間の最終週
date_show[-5]

numpy.datetime64('2012-09-28T00:00:00.000000000')

In [27]:
# Split for TTA: train on every week up to 2012-09-28 and hold out the rest.
train_data = dai_data[dai_data["Date"] <= "2012-09-28"]
print(train_data.shape)
test_data = dai_data[dai_data["Date"] >= "2012-10-05"]     # the final 4 weeks
print(test_data.shape)

(409695, 15)
(11875, 15)


In [29]:
# Save the training data locally
train_path = "../tmp/tta_refit__train_data.csv"

predictors = ["MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5", "IsHoliday",
              "Temperature_lag", "Fuel_Price_lag", "CPI_lag", "Unemployment_lag"]
# De-duplicate while keeping first-seen order: list(set(...)) would order the
# CSV columns differently from one interpreter run to the next.
keep_cols = list(dict.fromkeys(predictors + ["Weekly_Sales", "Date", "Dept", "Store"]))

train_data[keep_cols].to_csv(train_path, index=False)

In [31]:
# Driverless AIへの学習データのアップロード
train_dai = h2oai.datasets.create(train_path, force=True)

Complete 100.00% - [4/4] Computed stats for column CPI_lag


In [32]:
# Run the experiment that the TTA predictions below are scored with.
# Same setting values as in dai_get_forecast, except that no test dataset is
# passed and the prediction horizon is fixed at 1 period.
settings = {
        'time_column': 'Date', 
        'time_groups_columns': ["Store", "Dept"],
        'num_prediction_periods': 1,
        'num_gap_periods': 0,
        'accuracy': 1,
        'time': 1,
        'interpretability': 10,
        'scorer': "RMSE",
}
experiment = h2oai.experiments.create(train_dataset=train_dai,
                                      target_column='Weekly_Sales',
                                      task='regression',
                                      force=True,
                                      **settings)

Experiment launched at: http://34.203.218.123:12345/#/experiment?key=8ae0a5e0-b44e-11eb-a33c-0242ac110002
Complete 100.00% - Status: Complete                                                


In [35]:
experiment.is_complete()

True

#### TTAを実施するにあたっての予測用データセットの作成に関して
- 予測を実施する週のターゲット変数の値はNAとする
- Lag変数の元になる値（ターゲット変数など）は、予測時点で実測値が分かっているものを含めておく

***
#### 運用開始２週目（テスト期間２週目）の予測を実施する場合（１週目の実績は手に入っていることを想定）

In [36]:
# Dataset for forecasting the 2nd test week (2012-10-12), assuming the actuals
# for 2012-10-05 are already available.

tta_test_data_wk2 = test_data[test_data["Date"] <= "2012-10-12"].copy()
# Blank out the target for the week being forecast; earlier weeks keep their actuals.
tta_test_data_wk2.loc[tta_test_data_wk2["Date"] == "2012-10-12", "Weekly_Sales"] = None
print(tta_test_data_wk2.shape)

# Show Store=1, Dept=1 only
tta_test_data_wk2.loc[(tta_test_data_wk2["Store"] == 1) & (tta_test_data_wk2["Dept"] == 1), ["Date", "Store", "Dept", "Weekly_Sales"]]

(5966, 15)


Unnamed: 0,Date,Store,Dept,Weekly_Sales
409695,2012-10-05,1,1,21904.47
412671,2012-10-12,1,1,


TTAによる予測を実施する場合、内部で自動的に、2012-10-05の実績値を用いたLag変数作成の処理が行われ、学習済みのモデルを用いた予測が実施される

In [38]:
# Write the dataset built above to CSV and upload it to Driverless AI
test_path = "../tmp/tta_test_data.csv"
tta_test_data_wk2.to_csv(test_path, index = False)
tta_test_data_wk2_dai = h2oai.datasets.create(test_path, force=True)

Complete 100.00% - [4/4] Computed stats for column Unemployment_lag


In [39]:
# Score the uploaded dataset with the already-trained experiment
pred_job = experiment.predict(dataset=tta_test_data_wk2_dai, include_columns=keep_cols)
result_data_name = '../tmp/tta_test_data_RES.csv'
# Download the result
pred_job.download(dst_dir='.', dst_file=result_data_name, overwrite=True)
# Load it into a pandas.DataFrame
preds = pd.read_csv(result_data_name)
print(preds.shape)

Complete
Downloaded '../tmp/tta_test_data_RES.csv'
(5966, 17)


In [40]:
preds['Date'].unique()

array(['2012-10-05', '2012-10-12'], dtype=object)

In [41]:
preds.head()

Unnamed: 0,Date,Store,Dept,Weekly_Sales,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,IsHoliday,Temperature_lag,Fuel_Price_lag,CPI_lag,Unemployment_lag,Weekly_Sales.predicted,Weekly_Sales.predicted.lower,Weekly_Sales.predicted.upper
0,2012-10-05,1,1,21904.47,8077.89,-1.0,18.22,3617.43,3626.14,0,76.08,3.666,222.981658,6.908,19164.223,1087.738789,33253.242086
1,2012-10-05,1,2,48577.08,8077.89,-1.0,18.22,3617.43,3626.14,0,76.08,3.666,222.981658,6.908,46317.516,42595.535586,50645.047117
2,2012-10-05,1,3,11676.98,8077.89,-1.0,18.22,3617.43,3626.14,0,76.08,3.666,222.981658,6.908,14119.062,10600.086898,22705.786588
3,2012-10-05,1,4,39311.93,8077.89,-1.0,18.22,3617.43,3626.14,0,76.08,3.666,222.981658,6.908,37001.637,32999.175672,41430.120812
4,2012-10-05,1,5,25508.81,8077.89,-1.0,18.22,3617.43,3626.14,0,76.08,3.666,222.981658,6.908,22046.75,8944.144172,34217.192461


In [42]:
# Store=1, Dept=1のみ表示
preds.loc[(preds["Store"] == 1) & (preds["Dept"] == 1), ["Date", "Store", "Dept", "Weekly_Sales", "Weekly_Sales.predicted"]]

Unnamed: 0,Date,Store,Dept,Weekly_Sales,Weekly_Sales.predicted
0,2012-10-05,1,1,21904.47,19164.223
2976,2012-10-12,1,1,,21469.748


***

#### 各テスト期間（2012-10-05, 2012-10-12, 2012-10-19, 2012-10-26）に対してTTAによるローリング予測を実施

In [83]:
def get_tta_predictions(test_data, time_col, time_group_cols, target_col):
    '''
    Run rolling one-step-ahead forecasts with Test Time Augmentation (TTA).

    For each distinct date in ``test_data``, taken in ascending order, a
    scoring dataset is built from all rows up to and including that date, with
    the target blanked out on that date only. The dataset is uploaded and
    scored with the already-trained experiment, and the downloaded result is
    tagged with the window number.

    Relies on the module-level names ``h2oai`` (connected Driverless AI
    client) and ``experiment`` (trained experiment), and on ``display`` being
    available (IPython/Jupyter).

    Parameters
    ----------
    test_data : pandas.DataFrame
        Hold-out rows, including the actual target values.
    time_col : str
        Time column.
    time_group_cols : list of str
        Columns identifying the individual time series.
    target_col : str
        Target column.

    Returns
    -------
    pandas.DataFrame
        All windows stacked, with the columns ``Window_no``, the target, the
        time column, the group columns and the three prediction columns.
        Every window contains all rows up to its forecast date, so earlier
        dates repeat across windows. Empty (columns only) when ``test_data``
        has no dates.
    '''
    dates = test_data[time_col].unique()   # implementation assumes a prediction horizon of 1
    dates.sort()   # Window_no must follow chronological order, not row order

    group_cols = list(time_group_cols)
    echo_cols = [target_col, time_col] + group_cols   # columns echoed back with the predictions
    # Prediction column names follow the "<target>.predicted[.lower|.upper]"
    # pattern seen in the downloaded results -- TODO confirm for other targets.
    out_cols = (['Window_no'] + echo_cols
                + [target_col + suffix for suffix in ('.predicted', '.predicted.lower', '.predicted.upper')])

    window_frames = []
    for i, day in enumerate(dates):
        print('~~~~~~~~~~~~~~~~~ ローリングWindow.{} ({}) ~~~~~~~~~~~~~~~~~'.format(i, day))

        # Build the scoring data: history up to `day`, target hidden on `day` itself.
        tta_test_data = test_data[test_data[time_col] <= day].copy()
        # Build the mask from the frame being modified, not from test_data.
        tta_test_data.loc[tta_test_data[time_col] == day, target_col] = None
        # NOTE: this preview is specific to the Walmart data (Store/Dept columns).
        print('----- テストデータ：Store=1, Dept=1のみ表示 -----')
        display(tta_test_data.loc[(tta_test_data["Store"] == 1) & (tta_test_data["Dept"] == 1), ["Date", "Store", "Dept", "Weekly_Sales"]])
        print('--------------------------------------------')

        # Upload the scoring data to Driverless AI
        test_path = "../tmp/tta_test_data.csv"
        tta_test_data.to_csv(test_path, index=False)
        test_dai = h2oai.datasets.create(test_path, force=True)

        # Predict; request only the columns that are returned to the caller
        pred_job = experiment.predict(dataset=test_dai, include_columns=echo_cols)

        # Download the result and tag it with the window number
        result_data_name = 'tta_test_data_RES.csv'
        pred_job.download(dst_dir='.', dst_file=result_data_name, overwrite=True)
        preds = pd.read_csv(result_data_name)
        preds['Window_no'] = i

        window_frames.append(preds)

    if not window_frames:
        return pd.DataFrame(columns=out_cols)

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(window_frames, axis=0)[out_cols]

In [84]:
# TTAによるローリング予測（Window数=４）の実施
forecast_predictions_tta = get_tta_predictions(test_data=test_data, time_col="Date", time_group_cols=["Store", "Dept"], target_col="Weekly_Sales")

~~~~~~~~~~~~~~~~~ ローリングWindow.0 (2012-10-05T00:00:00.000000000) ~~~~~~~~~~~~~~~~~
----- テストデータ：Store=1, Dept=1のみ表示 -----


Unnamed: 0,Date,Store,Dept,Weekly_Sales
409695,2012-10-05,1,1,


--------------------------------------------
Complete 100.00% - [4/4] Computed stats for column Unemployment_lag
Complete
Downloaded 'tta_test_data_RES.csv'
~~~~~~~~~~~~~~~~~ ローリングWindow.1 (2012-10-12T00:00:00.000000000) ~~~~~~~~~~~~~~~~~
----- テストデータ：Store=1, Dept=1のみ表示 -----


Unnamed: 0,Date,Store,Dept,Weekly_Sales
409695,2012-10-05,1,1,21904.47
412671,2012-10-12,1,1,


--------------------------------------------
Complete 100.00% - [4/4] Computed stats for column Unemployment_lag
Complete
Downloaded 'tta_test_data_RES.csv'
~~~~~~~~~~~~~~~~~ ローリングWindow.2 (2012-10-19T00:00:00.000000000) ~~~~~~~~~~~~~~~~~
----- テストデータ：Store=1, Dept=1のみ表示 -----


Unnamed: 0,Date,Store,Dept,Weekly_Sales
409695,2012-10-05,1,1,21904.47
412671,2012-10-12,1,1,22764.01
415661,2012-10-19,1,1,


--------------------------------------------
Complete 100.00% - [4/4] Computed stats for column Unemployment_lag
Complete
Downloaded 'tta_test_data_RES.csv'
~~~~~~~~~~~~~~~~~ ローリングWindow.3 (2012-10-26T00:00:00.000000000) ~~~~~~~~~~~~~~~~~
----- テストデータ：Store=1, Dept=1のみ表示 -----


Unnamed: 0,Date,Store,Dept,Weekly_Sales
409695,2012-10-05,1,1,21904.47
412671,2012-10-12,1,1,22764.01
415661,2012-10-19,1,1,24185.27
418611,2012-10-26,1,1,


--------------------------------------------
Complete 100.00% - [4/4] Computed stats for column Unemployment_lag
Complete
Downloaded 'tta_test_data_RES.csv'


In [85]:
forecast_predictions_tta.shape

(29733, 8)

In [86]:
forecast_predictions_tta.head()

Unnamed: 0,Window_no,Weekly_Sales,Date,Store,Dept,Weekly_Sales.predicted,Weekly_Sales.predicted.lower,Weekly_Sales.predicted.upper
0,0,,2012-10-05,1,1,19164.223,1087.738789,33253.242086
1,0,,2012-10-05,1,2,46317.516,42595.535586,50645.047117
2,0,,2012-10-05,1,3,14119.062,10600.086898,22705.786588
3,0,,2012-10-05,1,4,37001.637,32999.175672,41430.120812
4,0,,2012-10-05,1,5,22046.75,8944.144172,34217.192461


In [87]:
forecast_predictions_tta['Date'].value_counts()

2012-10-05    11904
2012-10-12     8970
2012-10-19     5900
2012-10-26     2959
Name: Date, dtype: int64

In [88]:
# Store=1, Dept=1のみ表示（Window順に表示）
forecast_predictions_tta.loc[(forecast_predictions_tta["Store"] == 1) & (forecast_predictions_tta["Dept"] == 1),:]

# Window_noが各ローリング予測を示す

Unnamed: 0,Window_no,Weekly_Sales,Date,Store,Dept,Weekly_Sales.predicted,Weekly_Sales.predicted.lower,Weekly_Sales.predicted.upper
0,0,,2012-10-05,1,1,19164.223,1087.738789,33253.242086
0,1,21904.47,2012-10-05,1,1,19164.223,1087.738789,33253.242086
2976,1,,2012-10-12,1,1,21469.748,3393.26418,35558.767477
0,2,21904.47,2012-10-05,1,1,19164.223,1087.738789,33253.242086
2976,2,22764.01,2012-10-12,1,1,21469.748,3393.26418,35558.767477
5966,2,,2012-10-19,1,1,22349.893,4273.408711,36438.912008
0,3,21904.47,2012-10-05,1,1,19164.223,1087.738789,33253.242086
2976,3,22764.01,2012-10-12,1,1,21469.748,3393.26418,35558.767477
5966,3,24185.27,2012-10-19,1,1,22349.893,4273.408711,36438.912008
8916,3,,2012-10-26,1,1,23651.75,5575.266133,37740.76943


TTAローリング予測を実施した週のみ取得

In [89]:
# Keep the rows whose Weekly_Sales is NA, i.e. the week each TTA window forecast.
# NOTE(review): assumes the source data has no missing Weekly_Sales of its own -- confirm.
forecast_predictions_tta_eval = forecast_predictions_tta[forecast_predictions_tta['Weekly_Sales'].isna()]
forecast_predictions_tta_eval.shape   # same number of rows as the Refit results

(11875, 8)

In [90]:
# Weekly_Sales is NaN on these rows, so pull the actuals back in from the
# original data (dai_data) to allow the accuracy comparison.
dai_data_merge = dai_data[['Date','Store','Dept','Weekly_Sales']]
forecast_predictions_tta_eval = forecast_predictions_tta_eval.drop('Weekly_Sales', axis=1)
# Parse Date so that it can be joined against the datetime column in dai_data.
forecast_predictions_tta_eval["Date"] = pd.to_datetime(forecast_predictions_tta_eval["Date"], format="%Y-%m-%d")
forecast_predictions_tta_eval = pd.merge(forecast_predictions_tta_eval, dai_data_merge, on=['Date','Store','Dept'], how='left')
print(forecast_predictions_tta_eval.shape)
forecast_predictions_tta_eval.head()

(11875, 8)


Unnamed: 0,Window_no,Date,Store,Dept,Weekly_Sales.predicted,Weekly_Sales.predicted.lower,Weekly_Sales.predicted.upper,Weekly_Sales
0,0,2012-10-05,1,1,19164.223,1087.738789,33253.242086,21904.47
1,0,2012-10-05,1,2,46317.516,42595.535586,50645.047117,48577.08
2,0,2012-10-05,1,3,14119.062,10600.086898,22705.786588,11676.98
3,0,2012-10-05,1,4,37001.637,32999.175672,41430.120812,39311.93
4,0,2012-10-05,1,5,22046.75,8944.144172,34217.192461,25508.81


In [91]:
# Store=1, Dept=1のみ表示
forecast_predictions_tta_eval.loc[(forecast_predictions_tta_eval["Store"] == 1) & (forecast_predictions_tta_eval["Dept"] == 1),:]

Unnamed: 0,Window_no,Date,Store,Dept,Weekly_Sales.predicted,Weekly_Sales.predicted.lower,Weekly_Sales.predicted.upper,Weekly_Sales
0,0,2012-10-05,1,1,19164.223,1087.738789,33253.242086,21904.47
2976,1,2012-10-12,1,1,21469.748,3393.26418,35558.767477,22764.01
5966,2,2012-10-19,1,1,22349.893,4273.408711,36438.912008,24185.27
8916,3,2012-10-26,1,1,23651.75,5575.266133,37740.76943,27390.81


## RefitとTTAの比較（テストデータ）

#### Refit

In [92]:
forecast_predictions.shape

(11875, 7)

In [93]:
# RMSE: take the square root of the MSE explicitly. The `squared=False` flag
# was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_refit = mean_squared_error(y_true=forecast_predictions['Weekly_Sales'], y_pred=forecast_predictions['Weekly_Sales.predicted']) ** 0.5
# R2
r2_refit = r2_score(y_true=forecast_predictions['Weekly_Sales'], y_pred=forecast_predictions['Weekly_Sales.predicted'])

#### TTA

In [94]:
forecast_predictions_tta_eval.shape

(11875, 8)

In [95]:
# RMSE: take the square root of the MSE explicitly. The `squared=False` flag
# was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_tta = mean_squared_error(y_true=forecast_predictions_tta_eval['Weekly_Sales'], y_pred=forecast_predictions_tta_eval['Weekly_Sales.predicted']) ** 0.5
# R2
r2_tta = r2_score(y_true=forecast_predictions_tta_eval['Weekly_Sales'], y_pred=forecast_predictions_tta_eval['Weekly_Sales.predicted'])

#### 結果比較

In [96]:
pd.DataFrame({'RMSE':[rmse_refit, rmse_tta], 'R2':[r2_refit, r2_tta]}, index=['Refit', 'TTA'])

Unnamed: 0,RMSE,R2
Refit,2772.501925,0.983839
TTA,2745.080253,0.984157
