# The University of Hong Kong
## DASC7600 Data Science Project 2024
## Time Series Regression Model

# Import Modules and Settings

In [1]:
import graphviz
import os
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from statsmodels.tsa.vector_ar.var_model import VAR

import covid_module

# Settings
warnings.filterwarnings('ignore')

os.environ["PATH"] += os.pathsep + "C:/Program Files/Graphviz/bin" # To fix a graphviz bug

# Functions

In [2]:
def add_lag_columns(df: pd.DataFrame,
                       col: str,
                       lag_nbr: int) -> pd.DataFrame:
    for nbr in range(1, lag_nbr+1):
        df[f'{col}_lag{nbr}'] = df[col].shift(nbr)
    return df

# Load Data

In [3]:
# Read csv files
covid_hk_std = pd.read_csv('./data/std_data/hk/covid_hk_case_std.csv')
hk_daily_temp_std = pd.read_csv('./data/std_data/hk/hk_daily_temp_std.csv')

# Modify Data Type

In [4]:
covid_hk_std['report_date'] = pd.to_datetime(covid_hk_std['report_date'], format='%Y%m%d')
hk_daily_temp_std['report_date'] = pd.to_datetime(hk_daily_temp_std['report_date'], format='%Y%m%d')

# New Case Counts

In [5]:
covid_hk_new_case_cnt_df = covid_module.get_date_count(covid_hk_std, 'report_date', '%Y%m%d')

# Combine Case Count and Temperature

In [6]:
covid_hk_case_cnt_temp = covid_hk_new_case_cnt_df \
    .merge(hk_daily_temp_std,
           'left',
           'report_date')

# Train Model

## Regression Tree Model

### Add Lag Columns for Regression Tree Model

In [7]:
# Lagged number
lag_nbr = 7

covid_hk_case_cnt_temp_lag = covid_hk_case_cnt_temp.copy()
for col in ['count', 'avg_temp']:
    add_lag_columns(covid_hk_case_cnt_temp_lag, col, lag_nbr)

# Drop first 7 rows
# They have missing values due to the lagging
covid_hk_case_cnt_temp_lag = covid_hk_case_cnt_temp_lag.iloc[7:]

In [8]:
# To confirm there is no missing values due to join or lagging
covid_module.print_missing_val_count(covid_hk_case_cnt_temp_lag)

The dataframe does not have missing values.


### Training Data and Test Data for Tree Model

In [9]:
tree_X_col_list = [col for col in covid_hk_case_cnt_temp_lag.columns if ('count_lag' in col) or ('avg_temp' in col)]

tree_X = covid_hk_case_cnt_temp_lag[tree_X_col_list]
tree_y = covid_hk_case_cnt_temp_lag['count']

# Split into train set and test set with ratio 0.8 : 0.2
tree_X_train, tree_X_test, tree_y_train, tree_y_test = train_test_split(tree_X, tree_y, test_size=0.2, random_state=2024)

### Fit Regression Tree Model

In [10]:
# Regression Tree model
reg_tree_model = DecisionTreeRegressor(min_samples_leaf=5, max_depth=3, random_state=2024)

# Fit the Regression Tree model
reg_tree_model.fit(tree_X_train, tree_y_train)

print(f"Test score of Regression Tree Model is {reg_tree_model.score(tree_X_test, tree_y_test):.3f}.")

Test score of Regression Tree Model is 0.771.


### Graph and Feature Importance of the Tree Model

In [11]:
# # Export a decision tree in DOT format with Graphviz
# dot_graph = export_graphviz(reg_tree_model,
#                             feature_names=reg_tree_model.feature_names_in_,
#                             filled=True,
#                             rounded=True,
#                             special_characters=True)

# tree_graph = graphviz.Source(dot_graph, format="png")

# # Print the structure of regression tree
# tree_graph

In [12]:
# # Feature importances
# feature_importance_df = pd.DataFrame({'Variable Name': reg_tree_model.feature_names_in_, 'Importance': reg_tree_model.feature_importances_})

# feature_importance_df.sort_values(by='Importance', ascending=False)

## Vector Autoregressions (VAR) Model

###  Data for VAR Model

In [13]:
var_data = covid_hk_case_cnt_temp[['report_date', 'count', 'avg_temp']]

var_data.set_index('report_date', inplace=True)

### Training Data and Test Data for VAR Model

In [14]:
# Use the last 5 days as test data
var_train = var_data[:-5]
var_test = var_data[-5:]

### Fit VAR Model

In [15]:
# VAR model
var_model = VAR(var_train)

# Select lag order based on AIC
var_lag_order = var_model.select_order().aic

# Fit the VAR model
var_model = var_model.fit(maxlags=var_lag_order)

# Summary of the fitted VAR model
print(var_model.summary())

  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Sat, 24, Aug, 2024
Time:                     09:35:36
--------------------------------------------------------------------
No. of Equations:         2.00000    BIC:                    5.56131
Nobs:                     726.000    HQIC:                   5.32072
Log likelihood:          -3874.84    FPE:                    175.851
AIC:                      5.16953    Det(Omega_mle):         161.743
--------------------------------------------------------------------
Results for equation count
                  coefficient       std. error           t-stat            prob
-------------------------------------------------------------------------------
const                3.564321         2.252002            1.583           0.113
L1.count             0.568997         0.037971           14.985           0.000
L1.avg_temp         -0.193394         0.310520           -0.623 

### Prediction of the Fitted VAR Model

In [16]:
# Predict case counts and average temperatures for the next 5 days
print('Predictions of new case counts for the next 5 days using the trained VAR model:')
pd.DataFrame(var_model.forecast(var_train.values, steps=5), columns=var_train.columns)

Predictions of new case counts for the next 5 days using the trained VAR model:


Unnamed: 0,count,avg_temp
0,103.023872,13.069949
1,110.453119,11.82001
2,85.837418,12.757006
3,98.225874,12.631705
4,100.766712,13.622823


In [17]:
print('The true new case counts for the next 5 days:')
var_test

The true new case counts for the next 5 days:


Unnamed: 0_level_0,count,avg_temp
report_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-02-02,116,15.6
2022-02-03,142,13.4
2022-02-04,131,14.4
2022-02-05,356,15.2
2022-02-06,355,16.0
