# The University of Hong Kong
## DASC7600 Data Science Project 2024
## Vector Autogression Model

# Import Modules and Settings

In [1]:
import pandas as pd
import warnings
from statsmodels.tsa.vector_ar.var_model import VAR

import covid_module

# Settings
warnings.filterwarnings('ignore')

# Load Data

In [2]:
# Read csv files
covid_hk_case_cnt_std = pd.read_csv('./data/std_data/hk/covid_hk_case_count_std.csv')
hk_daily_avg_temp_std = pd.read_csv('./data/std_data/hk/hk_daily_avg_temp_std.csv')

# Modify Data Type

In [3]:
for df in [covid_hk_case_cnt_std,
           hk_daily_avg_temp_std]:
    df['report_date'] = pd.to_datetime(df['report_date'], format='%Y%m%d')

# Obtain Case Count Column

In [4]:
covid_hk_case_cnt_std['new_case_cnt'] = covid_hk_case_cnt_std['cuml_case_cnt'].diff().fillna(0)

# Combine Case Count and Temperature

In [5]:
# covid_hk_case_cnt_std: from 2020-01-08 to 2023-01-29
# hk_daily_avg_temp_std: from 2020-04-01 to 2024-06-30
var_data = covid_hk_case_cnt_std[['report_date', 'new_case_cnt']] \
    .merge(hk_daily_avg_temp_std[['report_date', 'avg_temp']],
           'left',
           'report_date')

var_data.set_index('report_date', inplace=True)

# Train Model

## Vector Autoregressions (VAR) Model

###  Data for VAR Model

In [6]:
covid_module.stationary_and_difference(var_data)

The column new_case_cnt has ADF p-value 0.00002 which is stationary.
The column avg_temp has ADF p-value 0.41152 which is non-stationary
Creating a difference column avg_temp_diff_1 ...
Droping the original column avg_temp ...


### Training Data and Test Data for VAR Model

In [7]:
# Use the last 5 days as test data
var_train = var_data[:-5]
var_test = var_data[-5:]

### Fit VAR Model

In [8]:
# VAR model
var_model = VAR(var_train)

# Select lag order based on AIC
var_lag_order = var_model.select_order().aic

# Fit the VAR model
var_model = var_model.fit(maxlags=var_lag_order)

# Summary of the fitted VAR model
print(var_model.summary())

  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Thu, 26, Sep, 2024
Time:                     15:05:28
--------------------------------------------------------------------
No. of Equations:         2.00000    BIC:                    15.1909
Nobs:                     1099.00    HQIC:                   15.0268
Log likelihood:          -11263.2    FPE:                3.03879e+06
AIC:                      14.9269    Det(Omega_mle):     2.88455e+06
--------------------------------------------------------------------
Results for equation new_case_cnt
                         coefficient       std. error           t-stat            prob
--------------------------------------------------------------------------------------
const                      61.562195        41.272107            1.492           0.136
L1.new_case_cnt             0.868408         0.030389           28.576           0.000
L1.avg_temp_diff_1         40

### Prediction of the Fitted VAR Model

In [9]:
# Predict case counts and average temperatures for the next 5 days
print('Predictions of new case counts for the next 5 days using the trained VAR model:')
pd.DataFrame(var_model.forecast(var_train.values[-var_lag_order:], steps=5), columns=var_train.columns)['new_case_cnt'].astype('int')

Predictions of new case counts for the next 5 days using the trained VAR model:


0    702
1    708
2    870
3    959
4    860
Name: new_case_cnt, dtype: int32

In [10]:
print('The true new case counts for the next 5 days:')
var_test[['new_case_cnt']].astype('int').reset_index()

The true new case counts for the next 5 days:


Unnamed: 0,report_date,new_case_cnt
0,2023-01-25,485
1,2023-01-26,371
2,2023-01-27,455
3,2023-01-28,498
4,2023-01-29,456
