# The University of Hong Kong
## DASC7600 Data Science Project 2024
## VAR Model - Global

# Import Modules and Settings

In [1]:
import pandas as pd
import statsmodels.api as sm
import warnings

import tree_module
import var_module

# Settings
warnings.filterwarnings('ignore')

# Load Data

In [2]:
# Locations of the input csv files (relative to the notebook's working directory)
HK_CASE_COUNT_CSV = './data/std_data/hk/covid_hk_case_count_std.csv'
GLOBAL_NEW_CASE_COUNT_CSV = './data/std_data/global/covid_global_new_case_count_std.csv'

# Read csv files
covid_hk_case_cnt_std = pd.read_csv(HK_CASE_COUNT_CSV)
covid_global_new_case_cnt_std = pd.read_csv(GLOBAL_NEW_CASE_COUNT_CSV)

# Drop Columns and Modify Data Type of Date Column

In [3]:
# Date-part columns removed from both datasets (only report_date is kept)
date_part_cols = ['report_year', 'report_month', 'report_year_month', 'report_day']

for df in (covid_hk_case_cnt_std, covid_global_new_case_cnt_std):
    # Drop in place so the module-level names keep referring to the cleaned frames;
    # errors='ignore' tolerates a dataset that lacks some of these columns
    df.drop(columns=date_part_cols, inplace=True, errors='ignore')

    # Parse report_date using the YYYYMMDD format
    df['report_date'] = pd.to_datetime(df['report_date'], format='%Y%m%d')

# Rename Column and Obtain Weekly New Case Count (Hong Kong Case Count Dataset)

In [4]:
# Rename columns
# (rename silently skips a missing label, so re-running this cell is harmless)
covid_hk_case_cnt_std = covid_hk_case_cnt_std.rename(columns={'new_case_cnt': 'HK'})

# Weekly new case count
# Move every report date forward to the Sunday that closes its Monday-Sunday week.
# dt.dayofweek numbers Monday as 0 and Sunday as 6, so a Sunday stays unchanged.
# The unit is written as 'D': the lowercase 'd' alias is deprecated in recent pandas.
days_until_sunday = 6 - covid_hk_case_cnt_std['report_date'].dt.dayofweek
week_ending_date = covid_hk_case_cnt_std['report_date'] \
    + pd.to_timedelta(days_until_sunday, unit='D')

# Obtain the weekly counts via GroupBy
# (assign works on a copy, so no temporary weekday column is left on the data)
covid_hk_case_cnt_std = covid_hk_case_cnt_std \
    .assign(report_date=week_ending_date) \
    .groupby('report_date') \
    ['HK'] \
    .sum() \
    .reset_index()

# Combine Datasets

In [5]:
# Date coverage of the two inputs:
#   covid_hk_case_cnt_std:         from 2020-01-12 to 2023-01-29
#   covid_global_new_case_cnt_std: from 2020-01-05 to 2024-08-11
# The left join keeps exactly the report dates present in the Hong Kong dataset
var_data = covid_hk_case_cnt_std.merge(covid_global_new_case_cnt_std,
                                       how='left',
                                       on='report_date')

# Exclude HK counts from CN
var_data['CN'] = var_data['CN'].sub(var_data['HK'])

# Set report date as index
var_data = var_data.set_index('report_date')

# Handle Non-Stationary Columns

In [6]:
# Columns to check for stationarity. Per the log printed below, a column whose
# ADF p-value marks it as non-stationary is replaced by its difference column
# (e.g. CN -> CN_diff_1).
stationarity_check_cols = [
    'CN', 'US', 'GB', 'AU', 'JP',
    'SG', 'MY', 'TH', 'IN', 'DE',
    'FR', 'KR', 'PH', 'CA', 'ID',
    'VN', 'SA', 'AE', 'BN', 'QA',
    'BD', 'LK', 'EG', 'PT', 'OM',
    'NP', 'IL', 'AT', 'RU', 'NZ',
]

var_module.stationary_and_difference_loop(var_data,
                                          col_list=stationarity_check_cols,
                                          print_ind=True)

Running the function stationary_and_difference using for loop ...

Running Loop #1 ...
The column CN has ADF p-value 0.99838 which is non-stationary.
--> Replacing the column CN with its difference column CN_diff_1 ...
The column US has ADF p-value 0.00500 which is stationary.
The column GB has ADF p-value 0.05379 which is non-stationary.
--> Replacing the column GB with its difference column GB_diff_1 ...
The column AU has ADF p-value 0.03969 which is stationary.
The column JP has ADF p-value 0.27687 which is non-stationary.
--> Replacing the column JP with its difference column JP_diff_1 ...
The column SG has ADF p-value 0.00244 which is stationary.
The column MY has ADF p-value 0.00091 which is stationary.
The column TH has ADF p-value 0.04273 which is stationary.
The column IN has ADF p-value 0.02087 which is stationary.
The column DE has ADF p-value 0.31202 which is non-stationary.
--> Replacing the column DE with its difference column DE_diff_1 ...
The column FR has ADF p-value 0

# Set Independent Variable, Dependent Variable and Threshold

In [7]:
# Independent variables: the 30 columns other than HK
X_col_list = [
    'CN', 'US', 'GB', 'AU', 'JP',
    'SG', 'MY', 'TH', 'IN', 'DE',
    'FR', 'KR', 'PH', 'CA', 'ID',
    'VN', 'SA', 'AE', 'BN', 'QA',
    'BD', 'LK', 'EG', 'PT', 'OM',
    'NP', 'IL', 'AT', 'RU', 'NZ',
]

# Dependent variable: the Hong Kong weekly new case count
y_col_list = ['HK']

# p-value threshold used when selecting significant variables
p_val_thrhld = 0.05

# Fit Vector Autoregression Model and Obtain Significant Variables

In [8]:
# To update the column list since difference columns were created and replaced the original columns.
# A differenced column is named '<base>_diff_<n>' (e.g. 'JP_diff_1'), so both the current
# dataframe columns and the requested names are reduced to their base name before matching.
# Reducing the requested names as well keeps this cell idempotent: on a re-run X_col_list
# already holds names such as 'CN_diff_1', which would otherwise no longer match 'CN' and
# every differenced column would be silently dropped.
X_base_names = {col.split('_diff_')[0] for col in X_col_list}
y_base_names = {col.split('_diff_')[0] for col in y_col_list}

# The resulting lists follow the column order of var_data
X_col_list = [col for col in var_data.columns if col.split('_diff_')[0] in X_base_names]
y_col_list = [col for col in var_data.columns if col.split('_diff_')[0] in y_base_names]

In [9]:
# Fit the VAR model on var_data using the refreshed X and y column lists.
# print_result_ind=True prints the regression table of the HK equation (see the output below).
# NOTE(review): the log below reports a maximum lag of 4, which is not set here -
# presumably a default inside var_module; confirm before changing the lag order.
var_model = var_module.fit_var_model(var_data, X_col_list, y_col_list, print_result_ind=True)

Fitting the VAR model with maximum lag 4 ...
Done.

Results for equation HK
                  coefficient       std. error           t-stat            prob
-------------------------------------------------------------------------------
const             -463.254633       615.432299           -0.753           0.452
L1.AE               -0.012283         0.519953           -0.024           0.981
L1.AU               -0.136975         0.048024           -2.852           0.004
L1.BD                0.307769         0.160654            1.916           0.055
L1.BN                4.104300         1.148431            3.574           0.000
L1.CA                0.063537         0.109840            0.578           0.563
L1.EG               -1.292473         0.784874           -1.647           0.100
L1.ID               -0.135057         0.053837           -2.509           0.012
L1.IL                0.553835         0.119919            4.618           0.000
L1.IN                0.002411         0.0107

In [10]:
# Select the terms of the fitted VAR model whose p-value for the dependent variable is
# below p_val_thrhld. The printed names use the 'L<lag>.<column>' pattern (e.g. 'L1.AU').
# NOTE(review): the return value is used as a list of column labels in the next section -
# presumably a list of those 'L<lag>.<column>' strings; confirm in var_module.
var_signf_X_col_list = var_module.get_significant_variable(var_model, y_col_list, p_val_thrhld)

For HK,
The following variables are significant (p-value < 0.05):
(variable name: p-value)
L1.AU: 0.00434
L1.BN: 0.000352
L1.ID: 0.0121
L1.IL: 3.87e-06
L1.KR: 5.74e-14
L1.RU: 0.00679
L1.SA: 0.0446
L1.SG: 6.05e-05
L1.US: 0.0272
L1.VN: 3.78e-09
L1.JP_diff_1: 0.0313
L1.HK: 4.69e-10
L2.AE: 0.00583
L2.BD: 0.00404
L2.IL: 0.00161
L2.KR: 7.87e-06
L2.PT: 5.65e-05
L2.QA: 0.0168
L2.RU: 0.000659
L2.SG: 0.0056
L2.VN: 0.000581
L2.JP_diff_1: 0.0208
L2.LK_diff_1: 0.00694
L2.AT_diff_1: 1.34e-05
L2.HK: 5.76e-14
L3.KR: 0.000217
L3.MY: 0.024
L3.PH: 0.0166
L3.PT: 0.00395
L3.RU: 0.00271
L3.VN: 0.00193
L3.GB_diff_1: 0.0255
L3.DE_diff_1: 0.00025
L3.FR_diff_1: 0.00734
L3.LK_diff_1: 0.0338
L3.HK: 0.0369
L4.BD: 0.000593
L4.BN: 0.0296
L4.ID: 0.000415
L4.MY: 0.000464
L4.OM: 0.0473
L4.PT: 3.02e-05
L4.QA: 0.00804
L4.RU: 0.00292
L4.SG: 0.00204
L4.JP_diff_1: 1.25e-11
L4.DE_diff_1: 2.66e-08
L4.FR_diff_1: 0.0047
L4.AT_diff_1: 2.49e-09



# Fit Random Forest and Ordinary Least Squares Regression Models

In [11]:
# The first max_lag_order rows are dropped from both X and y so the two stay row-aligned
# (presumably because those rows have no complete lag history - confirm in var_module)
n_lags = var_model.max_lag_order

# Build the lagged columns, trim the leading rows, then keep only the significant terms
X = var_module.add_lagged_column(var_data, var_signf_X_col_list) \
    .iloc[n_lags:, :] \
    [var_signf_X_col_list]

# Dependent variable over the same rows
y = var_data[y_col_list].iloc[n_lags:, :]

In [12]:
# Fit a random forest on the significant lagged terms and print its MSE together with
# the ranked feature importances (see the output below); random_state=2024 pins the seed.
# NOTE(review): whether the reported MSE is in-sample or from a hold-out split is not
# visible from this cell - check tree_module before interpreting it.
tree_module.fit_and_print_random_forest_feature_importance(X, y, random_state=2024)

Random Forest
MSE:  1.12224e+09
Feature Importances:
1. L1.SG: 0.2066
2. L3.RU: 0.1727
3. L1.HK: 0.1611
4. L4.OM: 0.0939
5. L1.BN: 0.0694
6. L2.RU: 0.0683
7. L3.FR_diff_1: 0.0468
8. L4.AT_diff_1: 0.0193
9. L1.RU: 0.0185
10. L2.HK: 0.0142
11. L4.DE_diff_1: 0.0131
12. L1.ID: 0.0111
13. L3.PT: 0.0097
14. L4.QA: 0.0097
15. L4.FR_diff_1: 0.0093
16. L4.RU: 0.0087
17. L4.BD: 0.0086
18. L2.SG: 0.0071
19. L1.KR: 0.0051
20. L3.HK: 0.0049
21. L4.MY: 0.0039
22. L2.AT_diff_1: 0.0039
23. L4.BN: 0.0030
24. L4.ID: 0.0029
25. L2.PT: 0.0025
26. L3.PH: 0.0024
27. L2.KR: 0.0022
28. L2.LK_diff_1: 0.0021
29. L2.IL: 0.0020
30. L4.JP_diff_1: 0.0019
31. L2.BD: 0.0017
32. L1.SA: 0.0017
33. L3.DE_diff_1: 0.0016
34. L1.JP_diff_1: 0.0015
35. L1.AU: 0.0014
36. L3.KR: 0.0012
37. L3.LK_diff_1: 0.0011
38. L4.SG: 0.0011
39. L3.GB_diff_1: 0.0010
40. L3.MY: 0.0010
41. L1.IL: 0.0005
42. L2.QA: 0.0004
43. L4.PT: 0.0002
44. L3.VN: 0.0002
45. L1.US: 0.0002
46. L2.VN: 0.0002
47. L2.AE: 0.0001
48. L1.VN: 0.0001
49. L2.JP_diff_

In [13]:
# Add the constant term for OLS Regression Model
# (sm.OLS does not include an intercept unless one is supplied in the design matrix)
OLS_reg_X = sm.add_constant(X)

# Ordinary Least Squares Regression Model: build and fit in one step, so the name
# OLS_reg_model only ever refers to the fitted results
OLS_reg_model = sm.OLS(y, OLS_reg_X).fit()

# Print the summary of the fitted model
print(OLS_reg_model.summary())

                            OLS Regression Results                            
Dep. Variable:                     HK   R-squared:                       0.979
Model:                            OLS   Adj. R-squared:                  0.969
Method:                 Least Squares   F-statistic:                     100.7
Date:                Tue, 12 Nov 2024   Prob (F-statistic):           2.44e-70
Time:                        18:00:07   Log-Likelihood:                -1524.0
No. Observations:                 156   AIC:                             3148.
Df Residuals:                     106   BIC:                             3300.
Df Model:                          49                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const          905.1527   1237.949      0.731   