# The University of Hong Kong
## DASC7600 Data Science Project 2024
## Classification Model

# Import Modules and Settings

In [1]:
import graphviz
import math
import os
import pandas as pd
import warnings
from bisect import bisect
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Settings
# NOTE(review): this silences every warning for the whole kernel session,
# including deprecation and data-conversion warnings raised by the libraries
# used below. Consider narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")

# Add Graphviz path
# Presumably needed so the `graphviz` package can locate the Graphviz
# executables on Windows (the original note only said "to fix a bug").
# The directory is appended only when it actually exists and is not already
# listed, so re-running this cell does not keep growing PATH, and hosts
# without this directory (where the "C:" would also collide with a ":" path
# separator) are left untouched.
graphviz_bin_dir = "C:/Program Files/Graphviz/bin"
current_path_entries = os.environ.get("PATH", "").split(os.pathsep)
if os.path.isdir(graphviz_bin_dir) and graphviz_bin_dir not in current_path_entries:
    os.environ["PATH"] = os.environ.get("PATH", "") + os.pathsep + graphviz_bin_dir

# Load Data

In [2]:
# Load the standardised Hong Kong case-detail table from its CSV file.
# The path is relative to the directory the notebook is run from.
covid_hk_case_detail_csv = './data/std_data/hk/covid_hk_case_detail_std.csv'
covid_hk_case_detail_std = pd.read_csv(covid_hk_case_detail_csv)

# Classification Tree Model

In [3]:
# Build the modelling table in three steps:
#   1. keep only records whose outcome is 'No admission', 'Discharged' or 'Deceased';
#   2. relabel the two non-fatal outcomes as 'Recovered';
#   3. remove cases with incomplete information (missing 'resident').
kept_outcomes = ['No admission', 'Discharged', 'Deceased']
recovered_relabel = {'No admission': 'Recovered',
                     'Discharged': 'Recovered'}

classf_tree_data = (
    covid_hk_case_detail_std
    .loc[covid_hk_case_detail_std["case_outcome"].isin(kept_outcomes)]
    .replace({'case_outcome': recovered_relabel})
    .loc[lambda cases: cases['resident'].notna()]
)

# Target variable, kept as a one-column DataFrame.
classf_tree_y = classf_tree_data[['case_outcome']]

# Independent variables. The binary columns are renamed with an `_ind` suffix
# and encoded as 0/1; each age group is encoded by the lower bound of its range.
predictor_columns = ['gender', 'age_group', 'resident', 'import_local', 'report_year_month']
indicator_column_names = {'gender': 'male_ind',
                          'resident': 'hk_resident_ind',
                          'import_local': 'local_case_ind'}
numeric_encoding = {
    'male_ind': {'F': 0, 'M': 1},
    'age_group': {'0-11': 0,'12-19': 12, '20-29': 20, '30-39': 30, '40-49': 40,
                  '50-59': 50, '60-69': 60, '70-79': 70, '80 and above': 80},
    'hk_resident_ind': {'Non-HK resident': 0, 'HK resident': 1},
    'local_case_ind': {'import': 0, 'local': 1},
}

classf_tree_X = (
    classf_tree_data[predictor_columns]
    .rename(columns=indicator_column_names)
    .replace(numeric_encoding)
)

# Under-sample because the dataset is imbalanced.
# NOTE(review): the resampling happens before the train/test split and draws
# with replacement, so the same record could land in both sets - confirm this
# is intended.
under_sampler = RandomUnderSampler(random_state=2024, replacement=True)
classf_tree_X_sample, classf_tree_y_sample = under_sampler.fit_resample(classf_tree_X, classf_tree_y)

# Split into train set and test set with ratio 0.8 : 0.2
(classf_tree_X_train,
 classf_tree_X_test,
 classf_tree_y_train,
 classf_tree_y_test) = train_test_split(
    classf_tree_X_sample,
    classf_tree_y_sample,
    test_size=0.2,
    random_state=2024,
)

In [4]:
# Fit a Gini-criterion classification tree, limited to depth 3 with at least
# 2 samples per leaf, on the training split.
tree_hyperparameters = dict(criterion="gini", min_samples_leaf=2, max_depth=3, random_state=2024)
classf_tree_model = DecisionTreeClassifier(**tree_hyperparameters)
classf_tree_model.fit(classf_tree_X_train, classf_tree_y_train)

# Score the fitted tree on the training split and on the held-out test split.
classf_tree_train_score = classf_tree_model.score(classf_tree_X_train, classf_tree_y_train)
classf_tree_test_score = classf_tree_model.score(classf_tree_X_test, classf_tree_y_test)

print(f"Training score of Classification Tree model is {classf_tree_train_score:.3f}.")
print(f"Test score of Classification Tree model is {classf_tree_test_score:.3f}.")

Training score of Classification Tree model is 0.866.
Test score of Classification Tree model is 0.837.


In [5]:
# Confusion matrix of the classification tree on the test split, with the
# classes ordered as 'Recovered' then 'Deceased'.
classf_tree_test_pred = classf_tree_model.predict(classf_tree_X_test)
outcome_label_order = ['Recovered', 'Deceased']
classf_tree_confusion = confusion_matrix(classf_tree_y_test, classf_tree_test_pred, labels=outcome_label_order)

print('Confusion matrix of the classification tree model:')
print(classf_tree_confusion)

Confusion matrix of the classification tree model:
[[40  3]
 [11 32]]


In [6]:
# Rewrite the split thresholds stored in the fitted tree so that the tree
# diagram shows values that exist in the data rather than in-between cut
# points. The fitted model is modified in place.
fitted_tree = classf_tree_model.tree_
classf_tree_node_nbr = len(fitted_tree.feature)

# Sorted distinct encoded values of the two multi-valued predictors.
age_group_numeric_values = sorted(classf_tree_X['age_group'].unique().tolist())
year_month_numeric_values = sorted(classf_tree_X['report_year_month'].unique().tolist())

for node_id in range(classf_tree_node_nbr):
    feature_id = fitted_tree.feature[node_id]
    if feature_id == -2:
        # -2 marks a node without a split feature (a leaf); nothing to adjust.
        continue

    split_feature = classf_tree_model.feature_names_in_[feature_id]
    cut_point = fitted_tree.threshold[node_id]

    if 'ind' in split_feature:
        # 0/1 indicator: show the split as "<= 0" instead of "<= 0.5".
        if cut_point == 0.5:
            fitted_tree.threshold[node_id] = 0
        continue

    if split_feature == 'report_year_month':
        # Snap down to the largest observed value not exceeding the cut point.
        upper_pos = bisect(year_month_numeric_values, cut_point)
        fitted_tree.threshold[node_id] = year_month_numeric_values[upper_pos - 1]
        continue

    if split_feature == 'age_group':
        # Age groups are encoded by their lower bound, so use one less than the
        # next group's lower bound (e.g. a cut between 50 and 60 becomes 59).
        upper_pos = bisect(age_group_numeric_values, cut_point)
        fitted_tree.threshold[node_id] = age_group_numeric_values[upper_pos] - 1

In [7]:
# # Tree diagram
# classf_dot_graph = export_graphviz(classf_tree_model,
#                                    feature_names=classf_tree_model.feature_names_in_,
#                                    class_names=classf_tree_model.classes_,
#                                    filled=True,
#                                    rounded=True,
#                                    special_characters=True)

# classf_tree_graph = graphviz.Source(classf_dot_graph, format="png")

# # Print the tree diagram
# classf_tree_graph