# The University of Hong Kong
## DASC7600 Data Science Project 2024
## VAR Model - Hong Kong Vaccination

# Import Modules and Settings

In [1]:
import pandas as pd
import statsmodels.api as sm
import warnings

import tree_module
import var_module

# Settings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by any
# library used below; consider narrowing it to specific categories or modules.
warnings.filterwarnings('ignore')

# Load Data

In [2]:
# Load the three standardised Hong Kong CSV extracts (case counts, daily vaccination
# counts, and daily vaccination counts by age group) in a single pass.
(covid_hk_case_cnt_std,
 covid_hk_vacc_daily_cnt_std,
 covid_hk_vacc_age_grp_daily_count_std) = map(
    pd.read_csv,
    (
        './data/std_data/hk/covid_hk_case_count_std.csv',
        './data/std_data/hk/covid_hk_vacc_daily_count_std.csv',
        './data/std_data/hk/covid_hk_vacc_age_grp_daily_count_std.csv',
    ),
)

# Drop Columns and Modify Data Type of Date Column

In [3]:
# Calendar breakdown columns to discard; only report_date is kept as the time key.
date_part_cols = ['report_year', 'report_month', 'report_year_month', 'report_day']

# The frames are modified in place so the names bound in the loading cell see the changes.
for df in (covid_hk_case_cnt_std,
           covid_hk_vacc_daily_cnt_std,
           covid_hk_vacc_age_grp_daily_count_std):
    # errors='ignore' tolerates frames that lack some of these columns.
    df.drop(columns=date_part_cols, inplace=True, errors='ignore')

    # report_date is parsed with the %Y%m%d format (e.g. 20210222).
    df['report_date'] = pd.to_datetime(df['report_date'], format='%Y%m%d')

# Combine Datasets

In [4]:
# Date coverage of the inputs:
#   covid_hk_case_cnt_std:       2020-01-08 to 2023-01-29
#   covid_hk_vacc_daily_cnt_std: 2021-02-22 to 2024-07-07
# Inner joins on report_date keep only the dates present in all three frames;
# report_date then becomes the index of the combined frame.
var_data = (
    covid_hk_case_cnt_std
    .merge(covid_hk_vacc_daily_cnt_std, how='inner', on='report_date')
    .merge(covid_hk_vacc_age_grp_daily_count_std, how='inner', on='report_date')
    .set_index('report_date')
)

# Handle Non-Stationary Columns

In [5]:
# Run var_module's stationarity / differencing loop on var_data with per-step printing disabled.
# The return value is not captured. NOTE(review): presumably the helper modifies var_data in
# place (a later cell looks for '_diff_' columns in var_data) — confirm in var_module.
var_module.stationary_and_difference_loop(var_data, print_ind=False)

Running the function stationary_and_difference using for loop ...

Running Loop #1 ...

Running Loop #2 ...

All target columns are now stationary.
Loop Ended.


# Set Independent Variable, Dependent Variable and Threshold

In [6]:
# Alternative variable sets explored for this analysis are kept below, commented out.
# Each one defines the same three names: X_col_list, y_col_list and p_val_thrhld.

# First 3 Dose to New Case Count
# X_col_list, y_col_list, p_val_thrhld = \
#     ['sinov_1st_dose','sinov_2nd_dose','sinov_3rd_dose','biont_1st_dose','biont_2nd_dose','biont_3rd_dose'], \
#     ['new_case_cnt'], \
#     0.05

# New Case Count to BioNTech 1st Dose
# X_col_list, y_col_list, p_val_thrhld = ['new_case_cnt'], ['biont_1st_dose'], 0.05

# New Case Count and BioNTech 1st Dose to BioNTech 2nd Dose
# X_col_list, y_col_list, p_val_thrhld = ['new_case_cnt', 'biont_1st_dose'], ['biont_2nd_dose'], 0.05

# New Case Count to SinoVac 1st Dose
# X_col_list, y_col_list, p_val_thrhld = ['new_case_cnt'], ['sinov_1st_dose'], 0.05

# New Case Count and SinoVac 1st Dose to SinoVac 2nd Dose
# X_col_list, y_col_list, p_val_thrhld = ['new_case_cnt', 'sinov_1st_dose'], ['sinov_2nd_dose'], 0.05

# Active configuration: Age Groups to New Case Count
# Independent (X) columns
X_col_list = [
    'sinov_1st_age_20_to_69',
    'sinov_2nd_age_20_to_69',
    'sinov_2nd_age_over_69',
    'biont_1st_age_20_to_69',
    'biont_1st_age_over_69',
    'biont_2nd_age_20_to_69',
    'biont_2nd_age_below_20',
    'biont_2nd_age_over_69',
]
# Dependent (y) column
y_col_list = ['new_case_cnt']
# p-value threshold below which a variable is treated as significant
p_val_thrhld = 0.05

# Fit Vector Autoregression Model and Obtain Significant Variables

In [7]:
# Differenced columns replaced the originals under names of the form '<base>_diff_<n>', so the
# requested names are mapped onto whatever var_data currently holds.
# Both sides are reduced to their base name before comparing. This keeps the cell idempotent:
# re-running it after the lists already contain '_diff_' names yields the same lists instead of
# silently emptying them (the previous comparison only stripped the suffix on one side).
X_base_names = {col.split('_diff_')[0] for col in X_col_list}
y_base_names = {col.split('_diff_')[0] for col in y_col_list}

X_col_list = [col for col in var_data.columns if col.split('_diff_')[0] in X_base_names]
y_col_list = [col for col in var_data.columns if col.split('_diff_')[0] in y_base_names]

In [8]:
# Rebind var_data to the result of var_module.remove_first_last_zero_rows, passing the
# dependent column list. NOTE(review): presumably this trims leading/trailing rows where the
# y columns are zero (the recorded output shows the index range unchanged) — confirm in var_module.
var_data = var_module.remove_first_last_zero_rows(var_data, y_col_list)

Originally:
First Index: 2021-02-22 00:00:00
Last Index: 2023-01-29 00:00:00

After removing first and last zero rows:
First Index: 2021-02-22 00:00:00
Last Index: 2023-01-29 00:00:00


In [9]:
# Fit the VAR model through var_module on the selected X and y columns, with result printing enabled.
# NOTE(review): the maximum lag (20 in the recorded output) is not set here; it is presumably
# chosen inside var_module.fit_var_model — confirm there.
var_model = var_module.fit_var_model(var_data, X_col_list, y_col_list, print_result_ind=True)

Fitting the VAR model with maximum lag 20 ...
Done.

Results for equation new_case_cnt
                                       coefficient       std. error           t-stat            prob
----------------------------------------------------------------------------------------------------
const                                    75.202233        55.144571            1.364           0.173
L1.sinov_1st_age_20_to_69_diff_1         -0.115839         0.091668           -1.264           0.206
L1.sinov_2nd_age_20_to_69_diff_1         -0.143367         0.107099           -1.339           0.181
L1.sinov_2nd_age_over_69_diff_1           0.730958         0.272405            2.683           0.007
L1.biont_1st_age_20_to_69_diff_1         -0.027754         0.063515           -0.437           0.662
L1.biont_1st_age_over_69_diff_1          -1.178457         0.713501           -1.652           0.099
L1.biont_2nd_age_20_to_69_diff_1          0.029867         0.070718            0.422           0.673
L1.b

In [10]:
# Collect the terms of the fitted VAR model that var_module reports as significant for the
# dependent column(s) at the p_val_thrhld threshold.
# NOTE(review): the recorded output lists lagged names such as 'L1.new_case_cnt'; the returned
# value is presumably a list of those names — confirm in var_module.
var_signf_X_col_list = var_module.get_significant_variable(var_model, y_col_list, p_val_thrhld)

For new_case_cnt,
The following variables are significant (p-value < 0.05):
(variable name: p-value)
L1.sinov_2nd_age_over_69_diff_1: 0.00729
L1.new_case_cnt: 1.08e-72
L2.sinov_2nd_age_over_69_diff_1: 0.0489
L2.biont_1st_age_20_to_69_diff_1: 0.000152
L2.new_case_cnt: 2.29e-06
L3.sinov_2nd_age_20_to_69_diff_1: 0.0434
L3.biont_1st_age_20_to_69_diff_1: 0.000194
L3.biont_1st_age_over_69_diff_1: 7.51e-06
L3.biont_2nd_age_over_69_diff_1: 0.00532
L3.new_case_cnt: 0.000381
L4.biont_2nd_age_over_69_diff_1: 7.96e-05
L4.new_case_cnt: 1.63e-23
L5.biont_2nd_age_over_69_diff_1: 0.0181
L5.new_case_cnt: 0.000717
L6.sinov_1st_age_20_to_69_diff_1: 0.0328
L6.sinov_2nd_age_over_69_diff_1: 0.0147
L6.biont_2nd_age_20_to_69_diff_1: 0.0147
L7.sinov_1st_age_20_to_69_diff_1: 0.0486
L7.sinov_2nd_age_over_69_diff_1: 0.0148
L7.new_case_cnt: 2.34e-05
L8.sinov_1st_age_20_to_69_diff_1: 0.00888
L8.biont_1st_age_20_to_69_diff_1: 0.0177
L8.new_case_cnt: 0.000528
L9.biont_1st_age_20_to_69_diff_1: 0.00231
L9.biont_1st_age

# Fit Ordinary Least Squares Regression Model

In [11]:
# Rows before the VAR model's maximum lag order are dropped from both X and y.
lag_order = var_model.max_lag_order

# Regressors: the lagged columns produced by var_module, restricted to the significant terms.
X = (
    var_module.add_lagged_column(var_data, var_signf_X_col_list)
    .iloc[lag_order:]
    [var_signf_X_col_list]
)

# Response: the dependent column(s), trimmed by the same number of leading rows as X.
y = var_data[y_col_list].iloc[lag_order:]

In [12]:
# Fit a random forest on X and y through tree_module and print its feature importances.
# NOTE(review): random_state=2024 is presumably forwarded to the estimator as its seed so the
# ranking is reproducible — confirm in tree_module.
tree_module.fit_and_print_random_forest_feature_importance(X, y, random_state=2024)

Random Forest
MSE:  3.61868e+05
Feature Importances:
1. L1.new_case_cnt: 0.4214
2. L2.new_case_cnt: 0.3552
3. L3.new_case_cnt: 0.0750
4. L6.sinov_1st_age_20_to_69_diff_1: 0.0512
5. L12.new_case_cnt: 0.0165
6. L14.new_case_cnt: 0.0128
7. L11.new_case_cnt: 0.0089
8. L7.new_case_cnt: 0.0073
9. L8.new_case_cnt: 0.0038
10. L11.sinov_1st_age_20_to_69_diff_1: 0.0036
11. L9.biont_1st_age_over_69_diff_1: 0.0033
12. L9.biont_1st_age_20_to_69_diff_1: 0.0029
13. L13.sinov_1st_age_20_to_69_diff_1: 0.0028
14. L3.biont_1st_age_over_69_diff_1: 0.0025
15. L6.biont_2nd_age_20_to_69_diff_1: 0.0023
16. L10.new_case_cnt: 0.0023
17. L9.biont_2nd_age_over_69_diff_1: 0.0023
18. L17.biont_2nd_age_over_69_diff_1: 0.0022
19. L5.new_case_cnt: 0.0021
20. L7.sinov_1st_age_20_to_69_diff_1: 0.0016
21. L8.biont_1st_age_20_to_69_diff_1: 0.0015
22. L11.biont_2nd_age_over_69_diff_1: 0.0013
23. L11.biont_2nd_age_20_to_69_diff_1: 0.0013
24. L17.sinov_1st_age_20_to_69_diff_1: 0.0012
25. L4.biont_2nd_age_over_69_diff_1: 0.00

In [13]:
# Design matrix: the selected regressors plus an intercept column.
OLS_reg_X = sm.add_constant(X)

# Build and fit the ordinary least squares regression of y on the design matrix.
OLS_reg_model = sm.OLS(y, OLS_reg_X).fit()

# Show the fitted model's summary table.
print(OLS_reg_model.summary())

                            OLS Regression Results                            
Dep. Variable:           new_case_cnt   R-squared:                       0.937
Model:                            OLS   Adj. R-squared:                  0.933
Method:                 Least Squares   F-statistic:                     208.2
Date:                Wed, 30 Oct 2024   Prob (F-statistic):               0.00
Time:                        22:15:05   Log-Likelihood:                -5938.6
No. Observations:                 687   AIC:                         1.197e+04
Df Residuals:                     640   BIC:                         1.218e+04
Df Model:                          46                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 