# The University of Hong Kong
## DASC7600 Data Science Project 2024

# Import modules

In [1]:
import os
import numpy as np
import pandas as pd
import graphviz
import warnings
from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides warnings raised by the libraries
# used below (pandas, scikit-learn, imbalanced-learn) — confirm that is intended.
warnings.filterwarnings("ignore")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Make the Graphviz executables discoverable so `graphviz.Source` can render
# (fixes the "executables not found" error on a default Windows install).
# The directory is appended only when it actually exists and is not already
# listed, so re-running this cell does not keep growing PATH and non-Windows
# machines are left untouched.
GRAPHVIZ_BIN_DIR = "C:/Program Files/Graphviz/bin"
_current_path = os.environ.get("PATH", "")
if os.path.isdir(GRAPHVIZ_BIN_DIR) and GRAPHVIZ_BIN_DIR not in _current_path.split(os.pathsep):
    os.environ["PATH"] = (
        _current_path + os.pathsep + GRAPHVIZ_BIN_DIR if _current_path else GRAPHVIZ_BIN_DIR
    )

# Load Data

In [2]:
# Load the covid_hk_std CSV (path is relative to the notebook's working directory)
covid_hk_std = pd.read_csv(filepath_or_buffer='./data/std_data/hk/covid_hk_std.csv')

# Decision Tree Classifier

In [3]:
# Drop cases with incomplete information: keep only reports up to 2021
# whose outcome is either 'Discharged' or 'Deceased'
reported_by_2021 = covid_hk_std["report_year"] <= 2021
has_known_outcome = covid_hk_std["case_outcome"].isin(['Discharged', 'Deceased'])
covid_hk_std = covid_hk_std[reported_by_2021 & has_known_outcome]

# Numeric codes for the categorical features (the Decision Tree Classifier needs numbers)
age_group_codes = {"0-19": 0, "20-39": 2, "40-59": 4, "60-79": 6, "80-100": 8}
resident_codes = {"Non-HK resident": 0, "HK resident": 1}
local_codes = {"import": 0, "local": 1}

# Independent variables: select the feature columns, then encode them;
# 'import_local' is renamed to 'local' so that 1 means a local case
feature_columns = ['gender_M', 'age_group', 'resident', 'import_local', 'report_year_month']
X = (
    covid_hk_std[feature_columns]
    .replace({"age_group": age_group_codes})
    .replace({"resident": resident_codes})
    .rename(columns={'import_local': 'local'})
    .replace({"local": local_codes})
)

# Target variable (kept as a one-column DataFrame)
y = covid_hk_std[['case_outcome']]

# Under-sample because the dataset is imbalanced
# NOTE(review): resampling (with replacement) happens before the train/test split,
# so the test set is balanced as well — confirm this evaluation setup is intended.
under_sampler = RandomUnderSampler(random_state=2024, replacement=True)
X_sample, y_sample = under_sampler.fit_resample(X, y)

# Split into train set and test set with ratio 0.8 : 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X_sample,
    y_sample,
    test_size=0.2,
    random_state=2024,
)

In [4]:
# Fit the Decision Tree Classifier on the training set
# (Gini impurity, depth capped at 3, at least 2 samples per leaf, fixed seed)
tree_params = {
    "criterion": "gini",
    "min_samples_leaf": 2,
    "max_depth": 3,
    "random_state": 2024,
}
covid_tree = DecisionTreeClassifier(**tree_params)
covid_tree.fit(X_train, y_train)

# Evaluate on the held-out test set and report the score
covid_tree_score = covid_tree.score(X_test, y_test)
print(f"Test score of Decision Tree Classifier is {covid_tree_score: .4f}.")

Test score of Decision Tree Classifier is  0.8256.


In [5]:
# Describe the fitted tree in Graphviz DOT format, labelling nodes with the
# feature names and class labels the classifier learned during fitting
export_options = {
    "feature_names": covid_tree.feature_names_in_,
    "class_names": covid_tree.classes_,
    "filled": True,
    "rounded": True,
    "special_characters": True,
}
dot_graph = export_graphviz(covid_tree, **export_options)

# Wrap the DOT source so it can be rendered as a PNG
tree_graph = graphviz.Source(dot_graph, format="png")

# Uncomment the next line to display the structure of the classification tree
# tree_graph