# The University of Hong Kong
## DASC7600 Data Science Project 2024
## VAR Model - Hong Kong Vaccination

# Import Modules and Settings

In [1]:
import pandas as pd
import statsmodels.api as sm
import warnings

import tree_module
import var_module

# Settings
# Install a blanket 'ignore' filter: with no category/module argument this
# silences every Python warning raised for the rest of the session.
# NOTE(review): presumably done to keep cell outputs tidy, but it also hides
# any diagnostics emitted by pandas / statsmodels — consider narrowing it.
warnings.filterwarnings('ignore')

# Load Data

In [2]:
# Read csv files
# Two tables are loaded from ./data/std_data/hk: the case-count table and the
# daily vaccination-count table. Both are expected to carry a 'report_date'
# column, which the following cells parse and use as the join key.
covid_hk_case_cnt_std = pd.read_csv('./data/std_data/hk/covid_hk_case_count_std.csv')
covid_hk_vacc_daily_cnt_std = pd.read_csv('./data/std_data/hk/covid_hk_vacc_daily_count_std.csv')

# Drop Columns and Modify Data Type of Date Column

In [3]:
# Calendar helper columns that are not needed for modelling; a table that
# lacks some of them is left as-is thanks to errors='ignore'.
calendar_helper_cols = ['report_year', 'report_month', 'report_year_month']

# Both tables are cleaned in place so the names bound in the loading cell
# keep pointing at the cleaned frames.
for df in (covid_hk_case_cnt_std, covid_hk_vacc_daily_cnt_std):
    df.drop(columns=calendar_helper_cols, inplace=True, errors='ignore')

    # report_date is stored in YYYYMMDD form; convert it to datetime64.
    df['report_date'] = pd.to_datetime(df['report_date'], format='%Y%m%d')

# Combine Datasets

In [4]:
# covid_hk_case_cnt_std: from 2020-01-08 to 2023-01-29
# covid_hk_vacc_daily_cnt_std: from 2021-02-22 to 2024-07-07
# An inner join on report_date keeps only the dates present in both tables;
# the date then becomes the index of the combined frame.
var_data = (
    covid_hk_case_cnt_std
    .merge(covid_hk_vacc_daily_cnt_std, how='inner', on='report_date')
    .set_index('report_date')
)

# Handle Non-Stationary Columns

In [5]:
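# NOTE(review): the stationarity check / differencing call below is disabled, so this cell
# leaves `var_data` unchanged — confirm the series are stationary or re-enable the call.
# var_module.stationary_and_difference_loop(var_data, print_ind=True)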

# Set Independent Variable, Dependent Variable and Threshold

In [6]:
# First 3 doses (SinoVac and BioNTech) -> new case count
# Explanatory series: 1st/2nd/3rd dose counts of each vaccine.
X_col_list = [
    'sinov_1st_dose',
    'sinov_2nd_dose',
    'sinov_3rd_dose',
    'biont_1st_dose',
    'biont_2nd_dose',
    'biont_3rd_dose',
]
# Target series.
y_col_list = ['new_case_cnt']
# Significance level used when selecting lagged variables.
p_val_thrhld = 0.05

# New Case Count to BioNTech 1st Dose
# X_col_list, y_col_list, p_val_thrhld = ['new_case_cnt'], ['biont_1st_dose'], 0.05

# New Case Count and BioNTech 1st Dose to BioNTech 2nd Dose
# X_col_list, y_col_list, p_val_thrhld = ['new_case_cnt', 'biont_1st_dose'], ['biont_2nd_dose'], 0.05

# New Case Count to SinoVac 1st Dose
# X_col_list, y_col_list, p_val_thrhld = ['new_case_cnt'], ['sinov_1st_dose'], 0.05

# New Case Count and SinoVac 1st Dose to SinoVac 2nd Dose
# X_col_list, y_col_list, p_val_thrhld = ['new_case_cnt', 'sinov_1st_dose'], ['sinov_2nd_dose'], 0.05

# Fit Vector Autoregression Model and Obtain Significant Variables

In [7]:
# Fit the VAR model on the chosen explanatory and target columns via the
# project helper. Judging by the recorded output, the helper uses a maximum
# lag of 20 and, with print_result_ind=True, prints the coefficient table of
# the target equation. The returned object is used later for its
# `max_lag_order` attribute.
var_model = var_module.fit_var_model(var_data, X_col_list, y_col_list, print_result_ind=True)

Fitting the VAR model with maximum lag 20 ...
Done.

Results for equation new_case_cnt
                        coefficient       std. error           t-stat            prob
-------------------------------------------------------------------------------------
const                   -255.772739       120.821784           -2.117           0.034
L1.sinov_1st_dose         -0.046584         0.054966           -0.848           0.397
L1.sinov_2nd_dose         -0.036649         0.053534           -0.685           0.494
L1.sinov_3rd_dose          0.050014         0.064022            0.781           0.435
L1.biont_1st_dose         -0.117526         0.044823           -2.622           0.009
L1.biont_2nd_dose          0.021559         0.045884            0.470           0.638
L1.biont_3rd_dose          0.035938         0.052325            0.687           0.492
L1.new_case_cnt            0.832130         0.042010           19.808           0.000
L2.sinov_1st_dose          0.141542         0.072870 

In [8]:
# Collect the lagged terms of the fitted VAR that are significant for the
# target(s) at the p_val_thrhld level; per the recorded output the helper
# also prints each term with its p-value. The result is used downstream as a
# list of column labels (presumably names such as 'L1.biont_1st_dose').
var_signf_X_col_list = var_module.get_significant_variable(var_model, y_col_list, p_val_thrhld)

For new_case_cnt,
The following variables are significant (p-value < 0.05):
(variable name: p-value)
L1.biont_1st_dose: 0.00874
L1.new_case_cnt: 2.56e-87
L2.new_case_cnt: 5.63e-05
L3.new_case_cnt: 6.24e-08
L4.new_case_cnt: 1.65e-18
L5.biont_3rd_dose: 0.0274
L5.new_case_cnt: 0.0224
L6.sinov_1st_dose: 0.00179
L6.biont_2nd_dose: 0.0114
L7.sinov_2nd_dose: 0.019
L7.new_case_cnt: 7.27e-06
L8.sinov_1st_dose: 0.013
L10.sinov_1st_dose: 0.0463
L11.sinov_1st_dose: 0.043
L11.new_case_cnt: 0.0295
L12.sinov_1st_dose: 2.34e-06
L12.sinov_2nd_dose: 0.00764
L12.biont_2nd_dose: 0.0245
L13.sinov_1st_dose: 1.09e-08
L13.biont_2nd_dose: 0.0337
L14.sinov_1st_dose: 2.72e-05
L14.sinov_2nd_dose: 0.0154
L14.sinov_3rd_dose: 0.0469
L14.new_case_cnt: 0.000538
L15.sinov_1st_dose: 0.0197
L17.sinov_3rd_dose: 0.00576
L17.biont_3rd_dose: 0.00536
L18.sinov_1st_dose: 3.53e-09
L18.sinov_2nd_dose: 0.0175
L18.new_case_cnt: 0.0218
L19.new_case_cnt: 0.000596
L20.sinov_1st_dose: 0.00218
L20.new_case_cnt: 0.000134



# Fit Ordinary Least Squares Regression Model

In [9]:
# Build the design matrix from the significant lagged terms.
lagged_frame = var_module.add_lagged_column(var_data, var_signf_X_col_list)

# The first `max_lag_order` rows are dropped from both X and y so the two
# stay aligned on the same dates.
lag_rows = var_model.max_lag_order

# Keep only the significant columns, then order them by label
# (plain string sort, so e.g. 'L10...' comes before 'L2...').
X = lagged_frame.iloc[lag_rows:][var_signf_X_col_list]
X = X[sorted(X.columns)]

y = var_data[y_col_list].iloc[lag_rows:]

In [10]:
# Cross-check the VAR-selected lagged terms with a random forest: per the
# recorded output the helper prints the model's MSE and a ranked list of
# feature importances. random_state=2024 presumably seeds the forest so the
# ranking is reproducible — confirm in tree_module.
tree_module.print_random_forest_importance(X, y, random_state=2024)

Random Forest
MSE:  9.85774e+05
Feature Importances:
1. L1.new_case_cnt: 0.3393
2. L2.new_case_cnt: 0.2563
3. L15.sinov_1st_dose: 0.0935
4. L3.new_case_cnt: 0.0625
5. L12.sinov_1st_dose: 0.0478
6. L14.sinov_1st_dose: 0.0410
7. L8.sinov_1st_dose: 0.0267
8. L11.sinov_1st_dose: 0.0239
9. L10.sinov_1st_dose: 0.0190
10. L18.sinov_1st_dose: 0.0184
11. L13.sinov_1st_dose: 0.0098
12. L11.new_case_cnt: 0.0088
13. L19.new_case_cnt: 0.0060
14. L12.sinov_2nd_dose: 0.0060
15. L7.new_case_cnt: 0.0041
16. L14.new_case_cnt: 0.0040
17. L1.biont_1st_dose: 0.0038
18. L6.sinov_1st_dose: 0.0035
19. L18.new_case_cnt: 0.0035
20. L18.sinov_2nd_dose: 0.0032
21. L14.sinov_3rd_dose: 0.0032
22. L5.biont_3rd_dose: 0.0030
23. L4.new_case_cnt: 0.0025
24. L13.biont_2nd_dose: 0.0019
25. L5.new_case_cnt: 0.0016
26. L20.sinov_1st_dose: 0.0015
27. L6.biont_2nd_dose: 0.0012
28. L17.biont_3rd_dose: 0.0011
29. L12.biont_2nd_dose: 0.0010
30. L14.sinov_2nd_dose: 0.0007
31. L17.sinov_3rd_dose: 0.0006
32. L7.sinov_2nd_dose: 0.0

In [11]:
# Design matrix for OLS: the selected lagged terms plus an intercept column
OLS_reg_X = sm.add_constant(X)

# Build and fit the Ordinary Least Squares model in one step, so that
# OLS_reg_model only ever refers to the fitted results
OLS_reg_model = sm.OLS(endog=y, exog=OLS_reg_X).fit()

# Print the summary of the fitted model
print(OLS_reg_model.summary())

                            OLS Regression Results                            
Dep. Variable:           new_case_cnt   R-squared:                       0.937
Model:                            OLS   Adj. R-squared:                  0.933
Method:                 Least Squares   F-statistic:                     292.3
Date:                Sun, 20 Oct 2024   Prob (F-statistic):               0.00
Time:                        12:18:56   Log-Likelihood:                -5942.7
No. Observations:                 687   AIC:                         1.195e+04
Df Residuals:                     653   BIC:                         1.211e+04
Df Model:                          33                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const               -124.9636    113