# The University of Hong Kong
## DASC7600 Data Science Project 2024
## VAR Model - Hong Kong (Except Vaccination)

# Import Modules and Settings

In [1]:
import pandas as pd
import statsmodels.api as sm
import warnings

import tree_module
import var_module

# Settings
# Install a global filter that ignores every warning raised for the rest of the session.
# NOTE(review): this blanket filter also hides warnings emitted by pandas/statsmodels that may
# matter for the analysis below — consider narrowing it to specific warning categories.
warnings.filterwarnings('ignore')

# Load Data

In [2]:
# Load the four standardised Hong Kong tables in a fixed order:
# case counts, policy, daily average temperature, daily average humidity.
(covid_hk_case_cnt_std,
 covid_hk_policy_std,
 hk_daily_avg_temp_std,
 hk_daily_avg_humid_std) = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/std_data/hk/covid_hk_case_count_std.csv',
        './data/std_data/hk/covid_hk_policy_std.csv',
        './data/std_data/hk/hk_daily_avg_temp_std.csv',
        './data/std_data/hk/hk_daily_avg_humid_std.csv',
    )
)

# Drop Columns and Modify Data Type of Date Column

In [3]:
# Helper date-part columns to remove from every table where they exist
cols_to_drop = ['report_year', 'report_month', 'report_year_month']

# The frames are modified in place so the names bound in the loading cell keep pointing
# at the cleaned data.
for df in (covid_hk_case_cnt_std,
           covid_hk_policy_std,
           hk_daily_avg_temp_std,
           hk_daily_avg_humid_std):
    # errors='ignore' lets the same call run on frames that lack some of these columns
    df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

    # Parse report_date from its YYYYMMDD representation into a datetime column
    df['report_date'] = pd.to_datetime(df['report_date'], format='%Y%m%d')

# Combine Datasets

In [4]:
# Date coverage of each source table:
#   covid_hk_case_cnt_std : 2020-01-08 to 2023-01-29
#   covid_hk_policy_std   : 2020-01-01 to 2023-02-28
#   hk_daily_avg_temp_std : 2020-01-01 to 2024-06-30
#   hk_daily_avg_humid_std: 2020-01-01 to 2024-09-30
# The case-count table is the left side of every left join, so its dates determine
# which rows appear in var_data; the result is then indexed by report_date.
var_data = (
    covid_hk_case_cnt_std
    .merge(covid_hk_policy_std, how='left', on='report_date')
    .merge(hk_daily_avg_temp_std, how='left', on='report_date')
    .merge(hk_daily_avg_humid_std, how='left', on='report_date')
    .set_index('report_date')
)

# Handle Non-Stationary Columns

In [5]:
# Run the project helper that checks the columns of var_data for stationarity and differences them.
# The return value is discarded, so the changes are presumably applied to var_data in place
# (later output refers to a column named avg_temp_diff_1) — TODO confirm in var_module.
# print_ind=False presumably suppresses the detailed per-column test output — verify in var_module.
var_module.stationary_and_difference_loop(var_data, print_ind=False)

Running the function stationary_and_difference using for loop ...

Running Loop #1 ...

Running Loop #2 ...

All target columns are now stationary.
Loop Ended.


# Set Independent Variable, Dependent Variable and Threshold

In [6]:
# Active experiment: average temperature as the independent variable,
# new case count as the dependent variable, 5% p-value threshold.
X_col_list = ['avg_temp']
y_col_list = ['new_case_cnt']
p_val_thrhld = 0.05

# Alternative experiment (humidity) — uncomment to replace the settings above:
# X_col_list, y_col_list, p_val_thrhld = ['avg_humid'], ['new_case_cnt'], 0.1

# Alternative experiment (policy indicators) — uncomment to replace the settings above:
# X_col_list, y_col_list, p_val_thrhld = \
#     ['school', 'no_hong_kong_p', 'hong_kong_group', 'hong_kong_all', 'home_other_14',
#      # 'hotel_21', 'hotel_14', 'hotel_7', 'hotel_3',
#      'type_1_close',
#      'type_2_close', 'type_3_close', 'people2', 'people4', 'people8',
#      '0500_1800','0500_2200','0500_0200'], \
#     ['new_case_cnt'], \
#     0.05

# Fit Vector Autoregression Model and Obtain Significant Variables

In [7]:
# Fit the VAR model on the chosen X and y columns through the project helper and keep the
# returned model object; print_result_ind=True prints the per-equation coefficient table.
# The logged output reports a maximum lag of 14 — presumably the helper's default; verify in var_module.
var_model = var_module.fit_var_model(var_data, X_col_list, y_col_list, print_result_ind=True)

Fitting the VAR model with maximum lag 14 ...
Done.

Results for equation new_case_cnt
                         coefficient       std. error           t-stat            prob
--------------------------------------------------------------------------------------
const                      61.225614        41.104695            1.490           0.136
L1.avg_temp_diff_1         41.010394        30.554174            1.342           0.180
L1.new_case_cnt             0.868453         0.030318           28.645           0.000
L2.avg_temp_diff_1          5.138682        30.690881            0.167           0.867
L2.new_case_cnt             0.132836         0.040190            3.305           0.001
L3.avg_temp_diff_1         -2.880826        32.401154           -0.089           0.929
L3.new_case_cnt            -0.236367         0.040215           -5.878           0.000
L4.avg_temp_diff_1        -28.260121        32.809589           -0.861           0.389
L4.new_case_cnt             0.500504       

In [8]:
# Ask the project helper for the regressors of the y_col_list equation(s) whose p-value is
# below p_val_thrhld. Judging by the printed list, the entries are lagged variable names
# (e.g. L1.new_case_cnt); they are used later as the columns of the OLS design matrix.
var_signf_X_col_list = var_module.get_significant_variable(var_model, y_col_list, p_val_thrhld)

For new_case_cnt,
The following variables are significant (p-value < 0.05):
(variable name: p-value)
L1.new_case_cnt: 1.86e-180
L2.new_case_cnt: 0.000949
L3.new_case_cnt: 4.16e-09
L4.new_case_cnt: 1.35e-34
L5.new_case_cnt: 0.00458
L6.avg_temp_diff_1: 0.00462
L6.new_case_cnt: 0.00291
L7.avg_temp_diff_1: 0.0132
L8.new_case_cnt: 1.55e-06
L9.avg_temp_diff_1: 0.0193
L10.avg_temp_diff_1: 0.0115
L10.new_case_cnt: 0.000445
L11.new_case_cnt: 0.045
L12.new_case_cnt: 0.00237
L13.avg_temp_diff_1: 0.0455
L14.new_case_cnt: 0.000391



# Fit Ordinary Least Squares Regression Model

In [9]:
# Number of leading rows to skip so that X and y start at the same observation,
# after the maximum lag order of the fitted VAR model.
n_lags = var_model.max_lag_order

# Regressors: add the lagged columns, drop the leading rows, keep only the significant ones
X = var_module.add_lagged_column(var_data, var_signf_X_col_list)
X = X.iloc[n_lags:][var_signf_X_col_list]
# X = X[sorted(X.columns)]

# Response: the dependent column(s), trimmed to the same rows as X
y = var_data[y_col_list].iloc[n_lags:]

In [10]:
# tree_module.fit_and_print_random_forest_feature_importance(X, y, random_state=2024)

In [11]:
# Design matrix for OLS: the selected regressors plus a constant (intercept) column
OLS_reg_X = sm.add_constant(X)

# Build and fit the Ordinary Least Squares model in one step;
# OLS_reg_model holds the fitted results object
OLS_reg_model = sm.OLS(y, OLS_reg_X).fit()

# Show the regression summary table
print(OLS_reg_model.summary())

                            OLS Regression Results                            
Dep. Variable:           new_case_cnt   R-squared:                       0.914
Model:                            OLS   Adj. R-squared:                  0.913
Method:                 Least Squares   F-statistic:                     721.9
Date:                Sun, 20 Oct 2024   Prob (F-statistic):               0.00
Time:                        18:11:17   Log-Likelihood:                -9476.6
No. Observations:                1104   AIC:                         1.899e+04
Df Residuals:                    1087   BIC:                         1.907e+04
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  63.7165    