In [None]:
import gc
import os
import pickle

import numpy as np
import pandas as pd
import pandas.tseries.offsets as offsets
import statsmodels as sm
from matplotlib import pyplot as plt
from statsmodels.tsa.api import VAR
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
from statsmodels.tsa.arima_model import ARIMA, ARMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from tqdm import tqdm, tqdm_notebook

import utils
from settings import in_size, out_size, train_start, train_end, valid_start, valid_end, test_start, test_end, interval

In [None]:
# Number of most recent observations used to fit the ARIMA / SARIMA models.
train_num = 2000 # Set to 0 to train on all of the training data.

In [None]:
# Re-import the pyplot module object in place.
# NOTE(review): presumably a leftover from interactive development (e.g. to reset
# pyplot state); nothing else in this notebook appears to require it — confirm
# before removing.
from importlib import reload
reload(plt)

In [None]:
# Load the preprocessed dataset bundle.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The file is opened in a context manager so the handle is closed deterministically,
# and the bundle gets a real name instead of `_` (which IPython reserves for the
# previous cell output).
with open("./data/data.pickle", "rb") as pickle_file:
    dataset = pickle.load(pickle_file)

cluster_sizes = dataset["cluster_sizes"]
cluster2id = dataset["cluster2id"]
locations = dataset["locations"]
timestamps = dataset["timestamps"]
raw_data = dataset["raw_data"]
cluster_data = dataset["cluster_data"]
onehot_data = dataset["onehot_data"]

In [None]:
# One column per cluster, named cluster_0, cluster_1, ...
cluster_columns = ["cluster_{}".format(k) for k in range(len(cluster_sizes))]

# Multiply each cluster's series by its size.
# NOTE(review): presumably cluster_data holds occupancy ratios, making this a
# conversion to vehicle counts — confirm against the preprocessing step.
data = pd.DataFrame(
    cluster_data * cluster_sizes,
    index=timestamps,
    columns=cluster_columns,
)

In [None]:
# Re-sample both splits onto a regular 15-minute grid and fill the gaps that
# asfreq introduces by interpolation.
# The training slice runs from train_start up to valid_end, i.e. it also
# covers the validation range; no separate validation frame is built here.
train_data = (
    data.iloc[train_start:valid_end]
    .asfreq("15T")
    .interpolate()
)
test_data = (
    data.iloc[test_start:test_end]
    .asfreq("15T")
    .interpolate()
)

In [None]:
# Total over all parking lots: sum across the cluster columns.
df_sum = train_data.sum(axis="columns")

# Plot only the last 2000 samples of the total.
recent_total = df_sum.iloc[-2000:]
recent_total.plot(figsize=(16, 4))
plt.show()

In [None]:
# Seasonal decomposition assuming a one-day cycle:
# 96 samples = 24 h of 15-minute steps.
# NOTE(review): newer statsmodels releases name this argument `period`
# instead of `freq` — confirm against the installed version before changing.
daily_decomposition = sm.tsa.seasonal.seasonal_decompose(df_sum[-2000:], freq=96)
daily_decomposition.plot()
plt.show()

In [None]:
def create_arima(train_data):
    """Fit a non-seasonal ARMA(p, q) model (ARIMA with d=0) to one series.

    Only the last ``train_num`` observations are used; ``train_num == 0``
    keeps the whole series because ``x[-0:]`` is ``x``. The orders are the
    BIC-minimising pair returned by ``arma_order_select_ic`` with
    ``max_ar=3`` and ``max_ma=3``.

    Parameters
    ----------
    train_data : sliceable series passed on to statsmodels
        Training observations for a single cluster.

    Returns
    -------
    The fitted results object returned by ``ARIMA(...).fit``.
    """
    # The series does not look like an integrated process, so it is
    # modelled as ARMA, i.e. without differencing (d=0).
    recent = train_data[-train_num:]
    order_search = sm.tsa.stattools.arma_order_select_ic(recent, max_ar=3, max_ma=3)
    ar_order, ma_order = order_search["bic_min_order"]

    arma_model = ARIMA(recent, freq="15T", order=(ar_order, 0, ma_order))
    return arma_model.fit(maxiter=500, disp=False)

In [None]:
def create_sarimax(train_data):
    """Fit a seasonal ARIMA model with a daily seasonal difference to one series.

    Only the last ``train_num`` observations are used; ``train_num == 0``
    keeps the whole series because ``x[-0:]`` is ``x``. The non-seasonal
    orders (p, q) are the BIC-minimising pair from ``arma_order_select_ic``
    (``max_ar=3``, ``max_ma=3``). The model is
    ``order=(p, 0, q)`` with ``seasonal_order=(0, 1, 0, 96)``: no regular
    differencing, one seasonal difference at lag 96 (one day of 15-minute
    steps), and no seasonal AR/MA terms.

    Parameters
    ----------
    train_data : sliceable series passed on to statsmodels
        Training observations for a single cluster.

    Returns
    -------
    The fitted results object returned by ``SARIMAX(...).fit``.
    """
    recent = train_data[-train_num:]

    # NOTE(review): (p, q) are selected on the raw series, not on the
    # seasonally differenced one that the SARIMAX model actually fits.
    result_tuning = sm.tsa.stattools.arma_order_select_ic(recent, max_ar=3, max_ma=3)
    p, q = result_tuning["bic_min_order"]

    sarima_results = SARIMAX(
        recent, freq="15T", order=(p, 0, q), seasonal_order=(0, 1, 0, 96),
        enforce_invertibility=False, enforce_stationarity=False,
    ).fit(maxiter=500, disp=False)  # disp=False: match create_arima, keep optimizer logs out of the notebook
    return sarima_results

In [None]:
def eval_arima(model, test_data):
    """Score a fitted model on consecutive, non-overlapping test windows.

    Predictions come from a single ``model.predict`` call spanning from the
    last fitted timestamp to the last test timestamp; the model is neither
    re-fitted nor updated with test observations between windows. The test
    series is then walked in steps of ``out_size``; for each start ``i`` the
    reference window is ``test_data.iloc[i + in_size : i + in_size + out_size]``
    and the predictions with the same index labels are compared against it.

    Parameters
    ----------
    model : fitted results object exposing ``fittedvalues`` and ``predict``
    test_data : pandas.Series
        Test observations for one cluster.

    Returns
    -------
    mae_cluster : numpy.ndarray, shape (out_size,)
        Mean absolute error per position inside the window.
    y_pred_matrix : numpy.ndarray, shape (n_windows, out_size)
    y_ref_matrix : numpy.ndarray, shape (n_windows, out_size)

    Raises
    ------
    ValueError
        If ``test_data`` is too short to contain a single evaluation window
        (previously this produced a silent 0/0 = NaN result).
    """
    window_starts = range(0, len(test_data) - in_size - out_size, out_size)
    if len(window_starts) == 0:
        raise ValueError(
            "test_data has {} rows, which is too short for in_size={} and out_size={}".format(
                len(test_data), in_size, out_size
            )
        )

    y_preds = model.predict(model.fittedvalues.index[-1], test_data.index[-1])

    y_pred_rows = []
    y_ref_rows = []
    for i in tqdm_notebook(window_starts):
        y_ref = test_data.iloc[i + in_size:i + in_size + out_size]
        # Align predictions to the reference window by timestamp label.
        y_pred = y_preds.loc[y_ref.index]
        y_pred_rows.append(y_pred.values)
        y_ref_rows.append(y_ref.values)

    y_pred_matrix = np.array(y_pred_rows)
    y_ref_matrix = np.array(y_ref_rows)
    # Average over windows (axis 0) to get one error per window position.
    mae_cluster = np.mean(np.abs(y_ref_matrix - y_pred_matrix), axis=0)
    return mae_cluster, y_pred_matrix, y_ref_matrix

In [None]:
def run_arima(train_data,test_data,seasonal=False):
    """Train and evaluate one (S)ARIMA model per cluster column.

    For every column of ``train_data`` a model is fitted with
    ``create_arima`` (or ``create_sarimax`` when ``seasonal`` is true) and
    scored with ``eval_arima`` on the same-named column of ``test_data``.
    Per-cluster errors are printed as they are produced. The "total" row is
    the MAE of the element-wise sum of all clusters' predictions against the
    element-wise sum of all clusters' references.

    Parameters
    ----------
    train_data : pandas.DataFrame
        One column per cluster; column order must match ``cluster_sizes``.
    test_data : pandas.DataFrame
        Must contain the same column names as ``train_data``.
    seasonal : bool, default False
        Use the seasonal model instead of the plain ARMA model.

    Returns
    -------
    pandas.DataFrame
        One row per cluster plus a final "total" row, restricted to the
        columns "15分後", "30分後", "60分後" and "120分後". Those labels only
        exist when ``interval`` is 15 and ``out_size`` is at least 8;
        otherwise the column selection raises ``KeyError``.
    """
    results = []
    y_pred_sum = []
    y_ref_sum = []

    # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
    for i, (c_num, cluster_train_data) in enumerate(train_data.items()):
        print("garvage collection,",gc.collect())
        print("------cluster id",c_num,"最大駐車台数",cluster_sizes[i],"台------")
        print("平均絶対値誤差 (正解 - 予測) [台]")
        print("training...",end="")
        cluster_test_data = test_data[c_num]

        # Train phase.
        if seasonal:
            model = create_sarimax(cluster_train_data)
        else:
            model = create_arima(cluster_train_data)
        print("done.")

        # Test phase.
        print("test_"+c_num)
        mae_cluster, y_pred_matrix, y_ref_matrix = eval_arima(model, cluster_test_data)
        results.append(mae_cluster)
        for j, mae in enumerate(mae_cluster):
            print(_horizon_label(j)+": ",mae)
        y_pred_sum.append(y_pred_matrix)
        y_ref_sum.append(y_ref_matrix)

    print("--------total       最大駐車台数",sum(cluster_sizes),"台--------")
    # Element-wise sum over clusters, then MAE over windows for the whole system.
    y_pred_sum = sum(y_pred_sum)
    y_ref_sum = sum(y_ref_sum)
    mae_total = np.mean(np.abs(y_ref_sum - y_pred_sum),axis=0)
    results.append(mae_total)
    for j, mae in enumerate(mae_total):
        print(_horizon_label(j)+": ",mae)

    row_labels = [str(idx)+" ("+str(cluster_sizes[idx])+"台)" for idx in range(len(cluster_sizes))] + ["total"]
    df_results = pd.DataFrame(
        results,
        columns=[_horizon_label(step) for step in range(out_size)],
        index=row_labels,
    )[["15分後","30分後","60分後","120分後"]]
    return df_results


def _horizon_label(step):
    """Return the label of zero-based forecast step ``step`` (step 0 is one interval ahead)."""
    # Shared by the printed log and the DataFrame columns so both use the
    # same (step + 1) * interval convention; the log previously started at 0.
    return str((step + 1) * interval) + "分後"

In [None]:
# Make sure the output directory exists up front: to_csv does not create
# missing directories, and failing after the long training run would lose the results.
os.makedirs("./results/csv", exist_ok=True)

# Fit and evaluate the non-seasonal models for every cluster, then persist the MAE table.
df_results_arima = run_arima(train_data,test_data)
df_results_arima.to_csv("./results/csv/arima_results.csv")

In [None]:
# Display the per-cluster and total MAE table of the non-seasonal models.
df_results_arima

In [None]:
# Make sure the output directory exists up front: to_csv does not create
# missing directories, and failing after the long training run would lose the results.
os.makedirs("./results/csv", exist_ok=True)

# Fit and evaluate the seasonal models for every cluster, then persist the MAE table.
df_results_sarima = run_arima(train_data,test_data,seasonal=True)
df_results_sarima.to_csv("./results/csv/sarima_results.csv")

In [None]:
# Display the per-cluster and total MAE table of the seasonal models.
df_results_sarima