# The University of Hong Kong
## DASC7600 Data Science Project 2024
## Time Series Regression Model

# Import Modules and Settings

In [1]:
import graphviz
import os
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from statsmodels.tsa.vector_ar.var_model import VAR

import covid_module

# Settings
warnings.filterwarnings('ignore')

os.environ["PATH"] += os.pathsep + "C:/Program Files/Graphviz/bin" # To fix a graphviz bug

# Functions

In [2]:
def get_cnt_from_cuml_col(df: pd.DataFrame,
                          cuml_cnt_col: str,
                          new_cnt_col: str) -> pd.DataFrame:
    df[new_cnt_col] = df[cuml_cnt_col] - df[cuml_cnt_col].shift(1).fillna(0)
    return df

def add_lag_columns(df: pd.DataFrame,
                    col: str,
                    lag_nbr: int) -> pd.DataFrame:
    for nbr in range(1, lag_nbr+1):
        df[f'{col}_lag{nbr}'] = df[col].shift(nbr).fillna(0)
    return df

# Load Data

In [3]:
# Read csv files
covid_hk_case_cnt_std = pd.read_csv('./data/std_data/hk/covid_hk_case_count_std.csv')
hk_daily_avg_temp_std = pd.read_csv('./data/std_data/hk/hk_daily_avg_temp_std.csv')
covid_hk_policy_std = pd.read_csv('./data/std_data/hk/covid_hk_policy_std.csv')

# Modify Data Type

In [4]:
for df in [covid_hk_case_cnt_std,
           hk_daily_avg_temp_std,
           covid_hk_policy_std]:
    df['report_date'] = pd.to_datetime(df['report_date'], format='%Y%m%d')

# Obtain Case Count Column

In [5]:
covid_hk_case_cnt_std = get_cnt_from_cuml_col(covid_hk_case_cnt_std, 'cuml_case_cnt', 'case_cnt')

# Combine Case Count and Temperature

In [6]:
# covid_hk_case_cnt_std: from 2020-01-08 to 2023-01-29
# hk_daily_avg_temp_std: from 2020-04-01 to 2024-06-30
# covid_hk_policy_std: from 2020-01-01 to 2023-02-28
time_series_data = covid_hk_case_cnt_std[['report_date', 'case_cnt']] \
    .merge(hk_daily_avg_temp_std[['report_date', 'avg_temp']],
           'left',
           'report_date') \
    .merge(covid_hk_policy_std,
           'left',
           'report_date')

# Train Model

## Regression Tree Model

### Add Lag Columns for Regression Tree Model

In [7]:
# Lagged number
lag_nbr = 7

reg_tree_data = time_series_data.copy()
for col in reg_tree_data.columns:
    if col != 'report_date':
        add_lag_columns(reg_tree_data, col, lag_nbr)

In [8]:
# To confirm there is no missing values due to join or lagging
covid_module.print_missing_val_count(reg_tree_data)

This dataframe does not have missing values.


### Training Data and Test Data for Tree Model

In [9]:
reg_tree_X_col_list = [col for col in reg_tree_data.columns if (col != 'report_date') and (col != 'case_cnt')]

reg_tree_X = reg_tree_data[reg_tree_X_col_list]
reg_tree_y = reg_tree_data['case_cnt']

# Split into train set and test set with ratio 0.8 : 0.2
reg_tree_X_train, reg_tree_X_test, reg_tree_y_train, reg_tree_y_test = train_test_split(reg_tree_X, reg_tree_y, test_size=0.2, random_state=2024)

### Fit Regression Tree Model

In [10]:
# Regression Tree model
reg_tree_model = DecisionTreeRegressor(min_samples_leaf=5, max_depth=3, random_state=2024)

# Fit Regression Tree model
reg_tree_model.fit(reg_tree_X_train, reg_tree_y_train)

# Training score and test score
print(f"Training score of Regression Tree Classifier is {reg_tree_model.score(reg_tree_X_train, reg_tree_y_train):.3f}.")
print(f"Test score of Regression Tree Model is {reg_tree_model.score(reg_tree_X_test, reg_tree_y_test):.3f}.")

Training score of Regression Tree Classifier is 0.860.
Test score of Regression Tree Model is 0.898.


### Graph and Feature Importance of the Tree Model

In [11]:
# # Export Regression Tree in DOT format with Graphviz
# reg_dot_graph = export_graphviz(reg_tree_model,
#                                 feature_names=reg_tree_model.feature_names_in_,
#                                 filled=True,
#                                 rounded=True,
#                                 special_characters=True)

# reg_tree_graph = graphviz.Source(reg_dot_graph, format="png")

# # Print the structure of Regression Tree model
# reg_tree_graph

In [12]:
# # Feature importances
# reg_tree_feat_import = pd.DataFrame({'Variable Name': reg_tree_model.feature_names_in_,
#                                      'Importance': reg_tree_model.feature_importances_})

# reg_tree_feat_import.sort_values(by='Importance', ascending=False)[:10]

## Vector Autoregressions (VAR) Model

###  Data for VAR Model

In [13]:
var_data = time_series_data.copy()

var_data.set_index('report_date', inplace=True)

### Training Data and Test Data for VAR Model

In [14]:
# Use the last 5 days as test data
var_train = var_data[:-5]
var_test = var_data[-5:]

### Fit VAR Model

In [15]:
# VAR model
var_model = VAR(var_train)

# Select lag order based on AIC
var_lag_order = var_model.select_order().aic

# Fit the VAR model
var_model = var_model.fit(maxlags=var_lag_order)

# Summary of the fitted VAR model
# print(var_model.summary())

### Prediction of the Fitted VAR Model

In [16]:
# Predict case counts and average temperatures for the next 5 days
print('Predictions of new case counts for the next 5 days using the trained VAR model:')
pd.DataFrame(var_model.forecast(var_train.values, steps=5), columns=var_train.columns)['case_cnt']

Predictions of new case counts for the next 5 days using the trained VAR model:


0     501.614917
1     690.152841
2     833.081522
3     964.704494
4    1074.189370
Name: case_cnt, dtype: float64

In [17]:
print('The true new case counts for the next 5 days:')
var_test[['case_cnt']].reset_index()

The true new case counts for the next 5 days:


Unnamed: 0,report_date,case_cnt
0,2023-01-25,485.0
1,2023-01-26,371.0
2,2023-01-27,455.0
3,2023-01-28,498.0
4,2023-01-29,456.0
