# The University of Hong Kong
## DASC7600 Data Science Project 2024
## Classification Model

# Import Modules and Settings

In [1]:
import graphviz
import os
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from imblearn.under_sampling import RandomUnderSampler

# Settings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides library deprecation / data-conversion
# warnings raised by later cells — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Make the Graphviz executables discoverable when they live at this Windows
# install location (works around graphviz not finding its binaries).
# The directory is appended only if it exists and is not already on PATH, so
# re-running this cell does not add duplicate entries and other platforms are
# left untouched.
GRAPHVIZ_BIN_DIR = "C:/Program Files/Graphviz/bin"
_path_entries = os.environ.get("PATH", "").split(os.pathsep)
if os.path.isdir(GRAPHVIZ_BIN_DIR) and GRAPHVIZ_BIN_DIR not in _path_entries:
    os.environ["PATH"] = os.pathsep.join(_path_entries + [GRAPHVIZ_BIN_DIR])

# Load Data

In [2]:
# Load the Hong Kong COVID case table (path is relative to the notebook's working directory)
covid_hk_case_csv = './data/std_data/hk/covid_hk_case_std.csv'
covid_hk_std = pd.read_csv(covid_hk_case_csv)

# Classification Tree Model

In [3]:
# Restrict to cases with a final outcome (Discharged / Deceased) and a known residency status
final_outcomes = ['Discharged', 'Deceased']
has_final_outcome = covid_hk_std["case_outcome"].isin(final_outcomes)
has_resident_info = covid_hk_std['resident'].notna()
covid_hk_std = covid_hk_std[has_final_outcome & has_resident_info]

# Independent variables and target variable (the target is kept as a one-column DataFrame)
feature_columns = ['gender', 'age_group', 'resident', 'import_local', 'report_year_month']
tree_y = covid_hk_std[['case_outcome']]

# The two indicator columns get names that state which category is coded as 1
indicator_names = {'gender': 'male_ind',
                   'import_local': 'local_ind'}

# Numeric codes for the Decision Tree Classifier; each age band is coded by its lower bound.
# 'report_year_month' has no mapping here and is passed through unchanged
# (presumably already numeric — TODO confirm).
numeric_codes = {
    'male_ind': {'F': 0, 'M': 1},
    'age_group': {'0-11': 0,'12-19': 12, '20-29': 20, '30-39': 30, '40-49': 40,
                  '50-59': 50, '60-69': 60, '70-79': 70, '80 and above': 80},
    'resident': {'Non-HK resident': 0, 'HK resident': 1},
    'local_ind': {'import': 0, 'local': 1},
}
tree_X = covid_hk_std[feature_columns].rename(columns=indicator_names).replace(numeric_codes)

# The dataset is imbalanced, so randomly under-sample it (drawing with replacement)
under_sampler = RandomUnderSampler(random_state=2024, replacement=True)
tree_X_sample, tree_y_sample = under_sampler.fit_resample(tree_X, tree_y)

# Hold out 20% of the resampled data as the test set (train : test = 0.8 : 0.2)
tree_X_train, tree_X_test, tree_y_train, tree_y_test = train_test_split(
    tree_X_sample,
    tree_y_sample,
    test_size=0.2,
    random_state=2024,
)

In [4]:
# Fit a shallow Decision Tree Classifier (Gini criterion, depth capped at 3)
# and report its score on the held-out test set
tree_params = {
    "criterion": "gini",
    "min_samples_leaf": 2,
    "max_depth": 3,
    "random_state": 2024,
}
covid_tree = DecisionTreeClassifier(**tree_params)
covid_tree.fit(tree_X_train, tree_y_train)

test_score = covid_tree.score(tree_X_test, tree_y_test)
print(f"Test score of Decision Tree Classifier is {test_score:.3f}.")

Test score of Decision Tree Classifier is 0.826.


In [5]:
# NOTE(review): this whole cell is disabled. The findings listed below it appear to
# refer to the tree plot it would render — confirm whether it should be re-enabled.
# # Export a decision tree in DOT format with Graphviz
# dot_graph = export_graphviz(covid_tree,
#                             feature_names=covid_tree.feature_names_in_,
#                             class_names=covid_tree.classes_,
#                             filled=True,
#                             rounded=True,
#                             special_characters=True)

# tree_graph = graphviz.Source(dot_graph, format="png")

# # Display the structure of the classification tree
# tree_graph

1. Age group is a key feature: both the 1st and 2nd splits of the tree use it.
2. The risk of being "Deceased" is higher for the elderly.
3. The report date also affects the outcome: cases reported in 2022 are more likely to be "Discharged".