In [1]:
import os
os.chdir("../../")

import pandas as pd
import numpy as np
from statsmodels.tsa.vector_ar.vecm import VECM
from statsmodels.tsa.vector_ar.vecm import select_order
from scripts.python.tsa.mtsmodel import *
from scripts.python.tsa.ts_eval import *

import warnings
warnings.filterwarnings("ignore")

In [2]:
test = VARPipeline(country="palau", data=None, 
                   var_name=["total", "seats_arrivals_intl"],
                   exog=["covid", "stringency_index", "covid"])
test.read_and_merge()
test.test_stationarity()
test.transform()
t_search = test.varma_search()

Unnamed: 0,date,seats_arrivals_total,seats_arrivals_intl,number_of_flights_total,number_of_flights_intl,japan,south_korea,taiwan,china,usa/canada,europe,others,total,stringency_index,covid,palau_flights,palau_hotel,palau_travel
0,2019-01-01,13048,13048,75,75,2055.0,1035.0,1092.0,4059.0,702.0,438.0,371.0,9752.0,0.0,0.0,0.493228,2.559978,0.194222
1,2019-02-01,13281,13281,74,74,2434.0,1090.0,1190.0,2549.0,826.0,483.0,461.0,9033.0,0.0,0.0,0.310671,2.841414,0.282245
2,2019-03-01,12870,12870,73,73,1756.0,808.0,1099.0,3182.0,777.0,368.0,415.0,8405.0,0.0,0.0,0.386389,2.921431,0.286529
3,2019-04-01,10806,10806,64,64,1288.0,1095.0,1393.0,2489.0,607.0,185.0,421.0,7478.0,0.0,0.0,0.293529,2.981907,0.338945
4,2019-05-01,11472,11472,67,67,650.0,655.0,1399.0,2321.0,622.0,119.0,305.0,6071.0,0.0,0.0,0.306351,3.537987,0.168664


order = 0: no stationarity obtained.


Unnamed: 0,Test Statistic,p-value,# Lags Used,Number of Observations Used,Critical Value (1%),Critical Value (5%),Critical Value (10%)
total,-2.752038,0.065458,5.0,37.0,-3.620918,-2.943539,-2.6104
seats_arrivals_intl,-5.394519,3e-06,0.0,42.0,-3.596636,-2.933297,-2.604991


order = 1: stationarity obtained.
 Running for {'p': 1, 'q': 1, 'tr': 'n'}
 Running for {'p': 1, 'q': 1, 'tr': 'c'}
 Running for {'p': 1, 'q': 1, 'tr': 't'}
 Running for {'p': 1, 'q': 1, 'tr': 'ct'}
 Running for {'p': 1, 'q': 2, 'tr': 'n'}
 Running for {'p': 1, 'q': 2, 'tr': 'c'}
 Running for {'p': 1, 'q': 2, 'tr': 't'}
 Running for {'p': 1, 'q': 2, 'tr': 'ct'}
 Running for {'p': 1, 'q': 3, 'tr': 'n'}
 Running for {'p': 1, 'q': 3, 'tr': 'c'}
 Running for {'p': 1, 'q': 3, 'tr': 't'}
 Running for {'p': 1, 'q': 3, 'tr': 'ct'}
 Running for {'p': 2, 'q': 1, 'tr': 'n'}
 Running for {'p': 2, 'q': 1, 'tr': 'c'}
 Running for {'p': 2, 'q': 1, 'tr': 't'}
 Running for {'p': 2, 'q': 1, 'tr': 'ct'}
 Running for {'p': 2, 'q': 2, 'tr': 'n'}
 Running for {'p': 2, 'q': 2, 'tr': 'c'}
 Running for {'p': 2, 'q': 2, 'tr': 't'}
 Running for {'p': 2, 'q': 2, 'tr': 'ct'}
 Running for {'p': 2, 'q': 3, 'tr': 'n'}
 Running for {'p': 2, 'q': 3, 'tr': 'c'}
 Running for {'p': 2, 'q': 3, 'tr': 't'}
 Running for {'p':

In [47]:
from statsmodels.tsa.api import VARMAX
mod = VARMAX(test.data[["total", "seats_arrivals_intl"]],
             exog=test.data[test.exog],
             order=(1, 3),
             trend="c").fit(disp=False)

print(calculate_evaluation(test.data["total"], naive_method(test.data["total"])))
print(calculate_evaluation(test.data["total"], mod.fittedvalues["total"]))

{'MSE': 1192288.1395348837, 'RMSE': 1091.9194748400103, 'MAE': 636.8837209302326, 'SMAPE': 54.46121058732388}
{'MSE': 730352.211616988, 'RMSE': 854.6064659344604, 'MAE': 694.8881523051377, 'SMAPE': 86.16022388069882}


In [22]:
test.transform()

In [32]:
scaled = pd.DataFrame([test.x1_trans, test.x2_trans]).T
display(get_adf_df(scaled, ["total", "seats_arrivals_intl"]))

from sklearn.model_selection import ParameterGrid
param_grid = {'p': [1, 2, 3],
              'q': [1, 2, 3],
              'tr': ['n', 'c', 't', 'ct']}
pg = list(ParameterGrid(param_grid))

model_res = {
    "model": [],
    "result": []
}

for idx, params in enumerate(pg):
    print(f' Running for {params}')
    p = params.get('p')
    q = params.get('q')
    tr = params.get('tr')
    model = VARMAX(scaled,
                   exog=test.data[test.exog],
                   order=(p, q),
                   trend=tr).fit(disp=False)
    model_res["model"].append((p, q, tr))
    model_res["result"].append(model.aic)

Unnamed: 0,Test Statistic,p-value,# Lags Used,Number of Observations Used,Critical Value (1%),Critical Value (5%),Critical Value (10%)
total,-2.497021,0.116201,2.0,41.0,-3.600983,-2.935135,-2.605963
seats_arrivals_intl,-2.83702,0.053177,2.0,41.0,-3.600983,-2.935135,-2.605963


 Running for {'p': 1, 'q': 1, 'tr': 'n'}
 Running for {'p': 1, 'q': 1, 'tr': 'c'}
 Running for {'p': 1, 'q': 1, 'tr': 't'}
 Running for {'p': 1, 'q': 1, 'tr': 'ct'}
 Running for {'p': 1, 'q': 2, 'tr': 'n'}
 Running for {'p': 1, 'q': 2, 'tr': 'c'}
 Running for {'p': 1, 'q': 2, 'tr': 't'}
 Running for {'p': 1, 'q': 2, 'tr': 'ct'}
 Running for {'p': 1, 'q': 3, 'tr': 'n'}
 Running for {'p': 1, 'q': 3, 'tr': 'c'}
 Running for {'p': 1, 'q': 3, 'tr': 't'}
 Running for {'p': 1, 'q': 3, 'tr': 'ct'}
 Running for {'p': 2, 'q': 1, 'tr': 'n'}
 Running for {'p': 2, 'q': 1, 'tr': 'c'}
 Running for {'p': 2, 'q': 1, 'tr': 't'}
 Running for {'p': 2, 'q': 1, 'tr': 'ct'}
 Running for {'p': 2, 'q': 2, 'tr': 'n'}
 Running for {'p': 2, 'q': 2, 'tr': 'c'}
 Running for {'p': 2, 'q': 2, 'tr': 't'}
 Running for {'p': 2, 'q': 2, 'tr': 'ct'}
 Running for {'p': 2, 'q': 3, 'tr': 'n'}
 Running for {'p': 2, 'q': 3, 'tr': 'c'}
 Running for {'p': 2, 'q': 3, 'tr': 't'}
 Running for {'p': 2, 'q': 3, 'tr': 'ct'}
 Running f

In [43]:
mod_scaled = VARMAX(scaled,
             exog=test.data[test.exog],
             order=(1, 2),
             trend="t").fit(disp=False)

fit_revs = []
for val in mod.fittedvalues["total"]:
    rev_val = inverse_scaledlogit(val, test.data["total"].max()+1, test.data["total"].min()-1)
    fit_revs.append(rev_val)

In [45]:
calculate_evaluation(test.data["total"], fit_revs)

{'MSE': 1865480.8635396196,
 'RMSE': 1365.8260736783507,
 'MAE': 774.9586152686217,
 'SMAPE': 55.81115325036844}

In [17]:
endog = test.data[["seats_arrivals_intl", "total"]]
exog = test.data[["covid", "stringency_index", "palau_travel"]]

# Determine the optimal lag order using the information criterion
lag_order = select_order(endog, maxlags=10, exog=exog)

# Create a VECM model with the selected lag order
model = VECM(endog, k_ar_diff=lag_order.selected_orders['aic'], exog=exog)

# Fit the VECM model
results = model.fit()

# Print the estimated parameters
print(results.summary())

# Get the long-run relationships (cointegrating vectors)
print(results.coint_rank)

# Perform Granger causality tests
granger_test = results.test_granger_causality(caused=1, causing=0)
print(granger_test.summary())

Det. terms outside the coint. relation & lagged endog. parameters for equation seats_arrivals_intl
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
exog1                       3.5720   1118.633      0.003      0.997   -2188.909    2196.053
exog2                      -9.8641     17.907     -0.551      0.582     -44.961      25.233
exog3                    2098.2777   1361.566      1.541      0.123    -570.342    4766.898
L1.seats_arrivals_intl     -0.5823      0.220     -2.652      0.008      -1.013      -0.152
L1.total                    0.9613      0.173      5.567      0.000       0.623       1.300
L2.seats_arrivals_intl     -0.3897      0.318     -1.225      0.220      -1.013       0.234
L2.total                    1.0455      0.436      2.398      0.016       0.191       1.900
L3.seats_arrivals_intl     -0.1287      0.246     -0.524      0.600      

In [18]:
calculate_evaluation(endog["total"], pd.DataFrame(results.fittedvalues)[1])

{'MSE': 19006337.115832955,
 'RMSE': 4359.625799977901,
 'MAE': 2870.2403678881547,
 'SMAPE': 143.58928425671743}