# The University of Hong Kong
## DASC7600 Data Science Project 2024
## VAR Model - Hong Kong Vaccination

# Import Modules and Settings

In [1]:
import pandas as pd
import statsmodels.api as sm
import warnings

import tree_module
import var_module

# Settings
# Install an "ignore" filter with no category/message/module restriction,
# i.e. every Python warning raised after this point is hidden.
# NOTE(review): this also hides deprecation and numerical warnings from the
# libraries used below — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Load Data

In [2]:
# Locations of the two Hong Kong input files (relative to the notebook's
# working directory).
case_cnt_csv = './data/std_data/hk/covid_hk_case_count_std.csv'
vacc_daily_cnt_csv = './data/std_data/hk/covid_hk_vacc_daily_count_std.csv'

# Load each file into its own DataFrame with pandas' default parsing.
covid_hk_case_cnt_std = pd.read_csv(case_cnt_csv)
covid_hk_vacc_daily_cnt_std = pd.read_csv(vacc_daily_cnt_csv)

# Drop Columns and Modify Data Type of Date Column

In [3]:
# Calendar helper columns that are not needed for the VAR analysis; a frame
# that lacks any of them is left as is (errors='ignore').
unused_period_cols = ['report_year', 'report_month', 'report_year_month']

# Both frames are modified in place so the later cells see the cleaned data.
for hk_std_df in (covid_hk_case_cnt_std, covid_hk_vacc_daily_cnt_std):
    hk_std_df.drop(columns=unused_period_cols, inplace=True, errors='ignore')

    # report_date is parsed with the yyyymmdd pattern into a datetime column
    hk_std_df['report_date'] = pd.to_datetime(hk_std_df['report_date'], format='%Y%m%d')

# Combine Datasets

In [4]:
# Date coverage of the inputs:
#   covid_hk_case_cnt_std:       2020-01-08 to 2023-01-29
#   covid_hk_vacc_daily_cnt_std: 2021-02-22 to 2024-07-07
# The inner join on report_date keeps only the dates present in both frames,
# and report_date then becomes the index of the combined frame.
var_data = pd.merge(
    covid_hk_case_cnt_std,
    covid_hk_vacc_daily_cnt_std,
    how='inner',
    on='report_date',
).set_index('report_date')

# Handle Non-Stationary Columns

In [5]:
# Run the project helper that checks every column of var_data for stationarity.
# According to the log it prints (print_ind=True), each column gets an ADF
# p-value and a non-stationary column is replaced by its differenced column
# (e.g. cuml_case_cnt -> cuml_case_cnt_diff_1), repeated over numbered loops.
# NOTE(review): the result is not assigned, so var_data is presumably updated
# in place — confirm against var_module.
var_module.stationary_and_difference_loop(var_data, print_ind=True)

Running the function stationary_and_difference using for loop ...

Running Loop #1 ...
The column cuml_case_cnt has ADF p-value 0.96047 which is non-stationary.
--> Replacing the column cuml_case_cnt with its difference column cuml_case_cnt_diff_1 ...
The column cuml_dischg_cnt has ADF p-value 0.71004 which is non-stationary.
--> Replacing the column cuml_dischg_cnt with its difference column cuml_dischg_cnt_diff_1 ...
The column cuml_death_cnt has ADF p-value 0.94034 which is non-stationary.
--> Replacing the column cuml_death_cnt with its difference column cuml_death_cnt_diff_1 ...
The column new_case_cnt has ADF p-value 0.00004 which is stationary.
The column new_dischg_cnt has ADF p-value 0.00000 which is stationary.
The column sinov_1st_dose has ADF p-value 0.03913 which is stationary.
The column sinov_2nd_dose has ADF p-value 0.11530 which is non-stationary.
--> Replacing the column sinov_2nd_dose with its difference column sinov_2nd_dose_diff_1 ...
The column sinov_3rd_dose has 

# Set Independent Variable, Dependent Variable and Threshold

In [6]:
# Candidate analyses for the VAR model, keyed by a short name.
# Each value is a pair: (independent variable columns, dependent variable columns).
# Switch analyses by changing SELECTED_SCENARIO below instead of editing code.
VAR_SCENARIOS = {
    # First 3 Doses (SinoVac and BioNTech) to New Case Count
    'doses_to_new_case': (
        ['sinov_1st_dose', 'sinov_2nd_dose', 'sinov_3rd_dose',
         'biont_1st_dose', 'biont_2nd_dose', 'biont_3rd_dose'],
        ['new_case_cnt'],
    ),
    # New Case Count to BioNTech 1st Dose
    'new_case_to_biont_1st': (
        ['new_case_cnt'],
        ['biont_1st_dose'],
    ),
    # New Case Count and BioNTech 1st Dose to BioNTech 2nd Dose
    'new_case_biont_1st_to_biont_2nd': (
        ['new_case_cnt', 'biont_1st_dose'],
        ['biont_2nd_dose'],
    ),
    # New Case Count to SinoVac 1st Dose
    'new_case_to_sinov_1st': (
        ['new_case_cnt'],
        ['sinov_1st_dose'],
    ),
    # New Case Count and SinoVac 1st Dose to SinoVac 2nd Dose
    'new_case_sinov_1st_to_sinov_2nd': (
        ['new_case_cnt', 'sinov_1st_dose'],
        ['sinov_2nd_dose'],
    ),
}

# Analysis used by the cells that follow.
SELECTED_SCENARIO = 'new_case_to_biont_1st'

# Copy the lists so later edits to X_col_list / y_col_list cannot alter the table.
X_col_list, y_col_list = (list(cols) for cols in VAR_SCENARIOS[SELECTED_SCENARIO])

# Significance threshold applied to the VAR coefficient p-values.
p_val_thrhld = 0.05

# Fit Vector Autoregression Model and Obtain Significant Variables

In [7]:
# Fit the VAR model through the project helper and keep the returned object
# for the following cells (they read var_model.max_lag_order).
# With print_result_ind=True the helper prints the per-equation coefficient
# table; its log reports a maximum lag of 14.
# NOTE(review): the printed equation is named biont_1st_dose_diff_1, so the
# helper presumably maps the names in y_col_list to their differenced
# columns — confirm against var_module.
var_model = var_module.fit_var_model(var_data, X_col_list, y_col_list, print_result_ind=True)

Fitting the VAR model with maximum lag 14 ...
Done.

Results for equation biont_1st_dose_diff_1
                               coefficient       std. error           t-stat            prob
--------------------------------------------------------------------------------------------
const                            33.997336        58.054097            0.586           0.558
L1.new_case_cnt                  -0.009341         0.033187           -0.281           0.778
L1.biont_1st_dose_diff_1         -0.043078         0.038711           -1.113           0.266
L2.new_case_cnt                   0.022695         0.044254            0.513           0.608
L2.biont_1st_dose_diff_1         -0.119765         0.038585           -3.104           0.002
L3.new_case_cnt                  -0.034532         0.044261           -0.780           0.435
L3.biont_1st_dose_diff_1         -0.132576         0.038689           -3.427           0.001
L4.new_case_cnt                  -0.007134         0.044794        

In [8]:
# Ask the project helper for the regressors of the y_col_list equation(s)
# whose p-value is below p_val_thrhld; it also prints them as
# "variable name: p-value". The result is used as column labels when the
# regression inputs are built.
# NOTE(review): assumed to be a flat list of lagged column names such as
# 'L2.biont_1st_dose_diff_1' — confirm against var_module.
var_signf_X_col_list = var_module.get_significant_variable(var_model, y_col_list, p_val_thrhld)

For biont_1st_dose_diff_1,
The following variables are significant (p-value < 0.05):
(variable name: p-value)
L2.biont_1st_dose_diff_1: 0.00191
L3.biont_1st_dose_diff_1: 0.000611
L4.biont_1st_dose_diff_1: 9.11e-05
L6.biont_1st_dose_diff_1: 0.00188
L7.biont_1st_dose_diff_1: 2.23e-12
L9.biont_1st_dose_diff_1: 0.00202
L12.biont_1st_dose_diff_1: 0.00745
L13.biont_1st_dose_diff_1: 0.00124



# Fit Ordinary Least Squares Regression Model

In [9]:
# Build the lagged design matrix from the significant VAR terms.
lagged_data = var_module.add_lagged_column(var_data, var_signf_X_col_list)

# The first max_lag_order rows are skipped in both X and y so the two stay
# aligned row for row.
n_skipped_rows = var_model.max_lag_order

# Keep only the significant lagged columns, then order them by name.
X = lagged_data.iloc[n_skipped_rows:, :][var_signf_X_col_list]
X = X[sorted(X.columns)]

# Target columns: those whose name, with any '_diff_<n>' suffix removed,
# appears in y_col_list.
y_cols = [col for col in var_data.columns if col.split('_diff_')[0] in y_col_list]
y = var_data[y_cols].iloc[n_skipped_rows:, :]

In [10]:
# Fit a random forest of y on the selected lagged regressors X through the
# project helper, which prints the MSE and the ranked feature importances.
# NOTE(review): random_state=2024 is presumably forwarded to the forest to
# make the run repeatable — confirm against tree_module.
tree_module.fit_and_print_random_forest_feature_importance(X, y, random_state=2024)

Random Forest
MSE:  1.28178e+06
Feature Importances:
1. L7.biont_1st_dose_diff_1: 0.2908
2. L6.biont_1st_dose_diff_1: 0.1553
3. L3.biont_1st_dose_diff_1: 0.1026
4. L13.biont_1st_dose_diff_1: 0.1014
5. L2.biont_1st_dose_diff_1: 0.0941
6. L9.biont_1st_dose_diff_1: 0.0919
7. L12.biont_1st_dose_diff_1: 0.0832
8. L4.biont_1st_dose_diff_1: 0.0808


In [11]:
# statsmodels' OLS does not add an intercept by itself, so prepend a constant
# column to the regressors.
OLS_reg_X = sm.add_constant(X)

# Build the OLS model and fit it in one step; OLS_reg_model holds the fitted
# results.
OLS_reg_model = sm.OLS(y, OLS_reg_X).fit()

# Show the regression summary table
print(OLS_reg_model.summary())

                              OLS Regression Results                             
Dep. Variable:     biont_1st_dose_diff_1   R-squared:                       0.243
Model:                               OLS   Adj. R-squared:                  0.234
Method:                    Least Squares   F-statistic:                     27.39
Date:                   Sun, 20 Oct 2024   Prob (F-statistic):           5.30e-37
Time:                           18:37:15   Log-Likelihood:                -6008.1
No. Observations:                    693   AIC:                         1.203e+04
Df Residuals:                        684   BIC:                         1.208e+04
Df Model:                              8                                         
Covariance Type:               nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------