# The University of Hong Kong
## DASC7600 Data Science Project 2024
## VAR Model - Global

# Import Modules and Settings

In [1]:
import pandas as pd
import statsmodels.api as sm
import warnings

import tree_module
import var_module

# Settings
# Install a blanket 'ignore' filter: no category/module is given, so every warning
# raised after this point is silenced for the rest of the session.
# NOTE(review): this also hides any deprecation or model-fitting warnings emitted by
# the libraries used below — confirm that is intended.
warnings.filterwarnings('ignore')

# Load Data

In [2]:
# Load the two standardised case-count tables from the local data directory.
# Hong Kong case counts
covid_hk_case_cnt_std = pd.read_csv(
    './data/std_data/hk/covid_hk_case_count_std.csv'
)
# Global new-case counts
covid_global_new_case_cnt_std = pd.read_csv(
    './data/std_data/global/covid_global_new_case_count_std.csv'
)

# Drop Columns and Modify Data Type of Date Column

In [3]:
# Apply the same clean-up to both frames. The frames are modified in place on
# purpose: rebinding `df` inside the loop would not update the outer variables.
for df in (covid_hk_case_cnt_std, covid_global_new_case_cnt_std):
    # Remove the calendar helper columns; errors='ignore' skips any that are absent.
    df.drop(
        columns=['report_year', 'report_month', 'report_year_month'],
        inplace=True,
        errors='ignore',
    )
    # Parse report_date (YYYYMMDD) into a proper datetime column.
    df['report_date'] = pd.to_datetime(df['report_date'], format='%Y%m%d')

# Rename Column and Obtain Weekly New Case Count (Hong Kong Case Count Dataset)

In [4]:
# Build the weekly Hong Kong series in one chain:
#   1. rename the daily count column to 'HK';
#   2. move every report date to the Sunday that closes its Monday-to-Sunday week
#      (dayofweek is 0 for Monday ... 6 for Sunday, so a Sunday stays where it is);
#   3. sum the daily counts that share the same week-ending Sunday.
covid_hk_case_cnt_std = (
    covid_hk_case_cnt_std
    .rename(columns={'new_case_cnt': 'HK'})
    .assign(
        report_date=lambda daily: daily['report_date']
        - pd.to_timedelta(daily['report_date'].dt.dayofweek, unit='D')
        + pd.Timedelta(days=6)
    )
    .groupby('report_date')['HK']
    .sum()
    .reset_index()
)

# Combine Datasets

In [5]:
# covid_hk_case_cnt_std: from 2020-01-12 to 2023-01-29
# covid_global_new_case_cnt_std: from 2020-01-05 to 2024-08-11
# Left join on report_date so that every Hong Kong week is kept and picks up the
# global columns recorded for the same date.
var_data = covid_hk_case_cnt_std.merge(
    covid_global_new_case_cnt_std,
    how='left',
    on='report_date',
)

# Exclude HK counts from CN
var_data['CN'] = var_data['CN'] - var_data['HK']

# Set report date as index
var_data = var_data.set_index('report_date')

# Preview the combined frame. This must be the last expression of the cell:
# a bare `.head()` placed mid-cell is evaluated and silently discarded.
var_data.head()

# Handle Non-Stationary Columns

In [6]:
# Run the project's stationarity check over every non-HK country column.
# The return value is not captured; judging by the printed log, columns that fail
# the ADF test are replaced inside var_data by a '<col>_diff_1' column
# (NOTE(review): in-place behaviour inferred from the output — confirm in var_module).
var_module.stationary_and_difference_loop(
    var_data,
    col_list=[
        'CN', 'US', 'GB', 'AU', 'JP', 'SG', 'MY', 'TH', 'IN', 'DE',
        'FR', 'KR', 'PH', 'CA', 'ID', 'VN', 'SA', 'AE', 'BN', 'QA',
        'BD', 'LK', 'EG', 'PT', 'OM', 'NP', 'IL', 'AT', 'RU', 'NZ',
    ],
    max_iter=1,       # a single check-and-difference pass
    print_ind=True,   # print the per-column log
)

Running stationary_and_difference using for loop ...

Running Loop #1 ...
The column CN has ADF p-value 0.99838 which is non-stationary.
--> Replacing the column CN with its difference column CN_diff_1 ...
The column US has ADF p-value 0.00500 which is stationary.
The column GB has ADF p-value 0.05379 which is non-stationary.
--> Replacing the column GB with its difference column GB_diff_1 ...
The column AU has ADF p-value 0.03969 which is stationary.
The column JP has ADF p-value 0.27687 which is non-stationary.
--> Replacing the column JP with its difference column JP_diff_1 ...
The column SG has ADF p-value 0.00244 which is stationary.
The column MY has ADF p-value 0.00091 which is stationary.
The column TH has ADF p-value 0.04273 which is stationary.
The column IN has ADF p-value 0.02087 which is stationary.
The column DE has ADF p-value 0.31202 which is non-stationary.
--> Replacing the column DE with its difference column DE_diff_1 ...
The column FR has ADF p-value 0.08601 which 

# Set Independent Variables, Dependent Variable and p-value Threshold

In [7]:
# Explanatory series: the 30 country/region columns other than Hong Kong
X_col_list = [
    'CN', 'US', 'GB', 'AU', 'JP', 'SG', 'MY', 'TH', 'IN', 'DE',
    'FR', 'KR', 'PH', 'CA', 'ID', 'VN', 'SA', 'AE', 'BN', 'QA',
    'BD', 'LK', 'EG', 'PT', 'OM', 'NP', 'IL', 'AT', 'RU', 'NZ',
]

# Response series: Hong Kong weekly new case count
y_col_list = ['HK']

# p-value cut-off below which a lagged variable is treated as significant
p_val_thrhld = 0.05

# Fit Vector Autoregression Model and Obtain Significant Variables

In [8]:
# Fit the VAR model through the project helper and keep the fitted object;
# print_result_ind=True asks the helper to print the fitted results.
var_model = var_module.fit_var_model(
    var_data,
    X_col_list,
    y_col_list,
    print_result_ind=True,
)

Fitting the VAR model with maximum lag 4 ...
Done.

Results for equation HK
                  coefficient       std. error           t-stat            prob
-------------------------------------------------------------------------------
const             -415.325758       607.483310           -0.684           0.494
L1.AE                0.036205         0.520538            0.070           0.945
L1.AU               -0.135984         0.047542           -2.860           0.004
L1.BD                0.288882         0.160756            1.797           0.072
L1.BN                4.306120         1.156037            3.725           0.000
L1.CA                0.071895         0.108696            0.661           0.508
L1.EG               -1.474204         0.780198           -1.890           0.059
L1.ID               -0.129598         0.052598           -2.464           0.014
L1.IL                0.542419         0.117914            4.600           0.000
L1.IN                0.003064         0.0107

In [9]:
# Collect the lagged variables of the fitted VAR model that the project helper
# reports as significant for y_col_list at the p_val_thrhld cut-off.
var_signf_X_col_list = var_module.get_significant_variable(
    var_model,
    y_col_list,
    p_val_thrhld,
)

For HK,
The following variables are significant (p-value < 0.05):
(variable name: p-value)
L1.AU: 0.00423
L1.BN: 0.000195
L1.ID: 0.0137
L1.IL: 4.22e-06
L1.KR: 3.57e-14
L1.RU: 0.00632
L1.SA: 0.0404
L1.SG: 1.55e-05
L1.US: 0.0247
L1.VN: 1.12e-09
L1.JP_diff_1: 0.0231
L1.HK: 1.72e-09
L2.AE: 0.00546
L2.BD: 0.00289
L2.IL: 0.00185
L2.KR: 3.79e-06
L2.PT: 4.55e-05
L2.QA: 0.0151
L2.RU: 0.000672
L2.SG: 0.00491
L2.VN: 0.000395
L2.JP_diff_1: 0.0135
L2.LK_diff_1: 0.00539
L2.AT_diff_1: 1.08e-05
L2.HK: 1.1e-11
L3.KR: 7.49e-05
L3.MY: 0.0344
L3.PH: 0.0151
L3.PT: 0.00333
L3.RU: 0.00357
L3.VN: 0.00205
L3.CN_diff_1: 0.02
L3.GB_diff_1: 0.0174
L3.JP_diff_1: 0.05
L3.DE_diff_1: 0.0003
L3.FR_diff_1: 0.0108
L3.LK_diff_1: 0.0283
L4.BD: 0.000381
L4.BN: 0.0104
L4.ID: 0.000322
L4.MY: 0.000339
L4.PT: 1.4e-05
L4.QA: 0.00937
L4.RU: 0.00302
L4.SG: 0.00167
L4.JP_diff_1: 3.23e-12
L4.DE_diff_1: 1.44e-08
L4.FR_diff_1: 0.00478
L4.AT_diff_1: 1.23e-08
L4.HK: 0.0331



# Fit Ordinary Least Squares Regression Model

In [10]:
# Design matrix: add the lagged columns, skip the first max_lag_order rows
# (presumably the rows whose lags are not available — confirm in var_module),
# keep only the significant lagged variables and order the columns by name.
X = (
    var_module.add_lagged_column(var_data, var_signf_X_col_list)
    .iloc[var_model.max_lag_order:]
    .loc[:, var_signf_X_col_list]
    .pipe(lambda lagged: lagged[sorted(lagged.columns)])
)

# Target: the dependent column(s), trimmed by the same number of leading rows
# so that X and y cover the same dates.
y = var_data.loc[:, y_col_list].iloc[var_model.max_lag_order:]

In [11]:
# Print the random forest report for the selected lagged features via the project
# helper; the random_state is fixed so that reruns give the same result.
tree_module.print_random_forest_importance(
    X,
    y,
    random_state=2024,
)

Random Forest
MSE:  1.1243e+09
Feature Importances:
1. L1.SG: 0.1983
2. L3.RU: 0.1846
3. L1.HK: 0.1637
4. L3.FR_diff_1: 0.1031
5. L1.BN: 0.0582
6. L2.SG: 0.0527
7. L2.RU: 0.0482
8. L4.FR_diff_1: 0.0303
9. L1.RU: 0.0277
10. L4.DE_diff_1: 0.0174
11. L2.HK: 0.0118
12. L4.AT_diff_1: 0.0101
13. L4.BD: 0.0099
14. L3.GB_diff_1: 0.0095
15. L4.RU: 0.0091
16. L3.PT: 0.0083
17. L3.JP_diff_1: 0.0052
18. L3.PH: 0.0042
19. L4.PT: 0.0041
20. L1.ID: 0.0037
21. L1.KR: 0.0037
22. L2.AT_diff_1: 0.0029
23. L1.VN: 0.0027
24. L2.VN: 0.0026
25. L2.LK_diff_1: 0.0026
26. L4.SG: 0.0026
27. L4.ID: 0.0025
28. L1.AU: 0.0024
29. L1.SA: 0.0020
30. L2.PT: 0.0017
31. L4.MY: 0.0014
32. L2.BD: 0.0013
33. L2.AE: 0.0013
34. L3.LK_diff_1: 0.0012
35. L2.QA: 0.0012
36. L3.CN_diff_1: 0.0012
37. L4.JP_diff_1: 0.0011
38. L1.IL: 0.0009
39. L4.QA: 0.0009
40. L1.JP_diff_1: 0.0007
41. L2.JP_diff_1: 0.0006
42. L2.KR: 0.0005
43. L2.IL: 0.0004
44. L4.HK: 0.0004
45. L3.MY: 0.0004
46. L3.KR: 0.0003
47. L1.US: 0.0003
48. L4.BN: 0.0002
49

In [12]:
# Design matrix with the constant (intercept) column added for the OLS regression
OLS_reg_X = sm.add_constant(X)

# Build and fit the ordinary least squares regression in a single expression,
# so that OLS_reg_model only ever refers to the fitted results
OLS_reg_model = sm.OLS(y, OLS_reg_X).fit()

# Show the summary table of the fitted model
print(OLS_reg_model.summary())

                            OLS Regression Results                            
Dep. Variable:                     HK   R-squared:                       0.979
Model:                            OLS   Adj. R-squared:                  0.969
Method:                 Least Squares   F-statistic:                     98.53
Date:                Sun, 20 Oct 2024   Prob (F-statistic):           1.64e-69
Time:                        12:18:28   Log-Likelihood:                -1523.4
No. Observations:                 156   AIC:                             3149.
Df Residuals:                     105   BIC:                             3304.
Df Model:                          50                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const         -604.3423   1255.578     -0.481   