# The University of Hong Kong
## DASC7600 Data Science Project 2024
## EDA - HK - Cases

# Import Modules and Settings

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import warnings

import covid_module

# Settings
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

%matplotlib inline

# Load Data

In [2]:
# Read csv file
# Dataset name: Data in Coronavirus Disease (COVID-19)
# URL: https://data.gov.hk/en-data/dataset/hk-dh-chpsebcddr-novel-infectious-agent
# Data resource name: Latest local situation of COVID-19 (English)
covid_hk_case_df = pd.read_csv('./data/raw_data/hk/enhanced_sur_covid_19_eng.csv')

# Basic Information of Data Set

In [3]:
# Print first 10 records
covid_hk_case_df.head(10)

Unnamed: 0,Case no.,Report date,Date of onset,Gender,Age,Name of hospital admitted,Hospitalised/Discharged/Deceased,HK/Non-HK resident,Classification*,Case status*
0,1,23/01/2020,21/01/2020,M,39,,Discharged,Non-HK resident,Imported case,Confirmed
1,2,23/01/2020,18/01/2020,M,56,,Discharged,HK resident,Imported case,Confirmed
2,3,24/01/2020,20/01/2020,F,62,,Discharged,Non-HK resident,Imported case,Confirmed
3,4,24/01/2020,23/01/2020,F,62,,Discharged,Non-HK resident,Imported case,Confirmed
4,5,24/01/2020,23/01/2020,M,63,,Discharged,Non-HK resident,Imported case,Confirmed
5,6,26/01/2020,21/01/2020,M,47,,Discharged,HK resident,Imported case,Confirmed
6,7,26/01/2020,21/01/2020,F,68,,Discharged,HK resident,Imported case,Confirmed
7,8,26/01/2020,25/01/2020,M,64,,Discharged,Non-HK resident,Imported case,Confirmed
8,9,29/01/2020,25/01/2020,F,73,,Discharged,Non-HK resident,Imported case,Confirmed
9,10,29/01/2020,25/01/2020,M,72,,Discharged,Non-HK resident,Imported case,Confirmed


In [4]:
# Basic information of dataframe
covid_hk_case_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15441 entries, 0 to 15440
Data columns (total 10 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Case no.                          15441 non-null  int64  
 1   Report date                       15441 non-null  object 
 2   Date of onset                     15421 non-null  object 
 3   Gender                            15435 non-null  object 
 4   Age                               15435 non-null  object 
 5   Name of hospital admitted         0 non-null      float64
 6   Hospitalised/Discharged/Deceased  15435 non-null  object 
 7   HK/Non-HK resident                15435 non-null  object 
 8   Classification*                   15435 non-null  object 
 9   Case status*                      15441 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 1.2+ MB


# Modify Data Types

In [5]:
# Modify data type for datatime column
covid_hk_case_df['Report date'] = pd.to_datetime(covid_hk_case_df['Report date'])

# Exploratory Data Analysis (EDA)

## Univariate Analysis

## Field - Case no.

In [6]:
print('All values are distinct:', covid_hk_case_df['Case no.'].nunique() == covid_hk_case_df['Case no.'].shape[0])

All values are distinct: True


'Case no.' is a key column, there is no duplicate values.

## Field - Name of hospital admitted

In [7]:
# Distinct values in hospital name column
print('Distinct values in hospital name column are:',
      str(covid_hk_case_df['Name of hospital admitted'].drop_duplicates().to_list())[1:-1])

Distinct values in hospital name column are: nan


## Field - Report date

In [8]:
# Report date range of data set
min_date, max_date = covid_hk_case_df['Report date'].agg(['min', 'max']).astype('str').to_list()
print('Date range of report date:')
print(f'Min date: {min_date}')
print(f'Max date: {max_date}')

Date range of report date:
Min date: 2020-01-02
Max date: 2022-12-01


## Field - Gender

In [9]:
# Distinct values in gender column
print('Distinct values in gender column are:',
      str(covid_hk_case_df['Gender'].drop_duplicates().to_list())[1:-1])

Distinct values in gender column are: 'M', 'F', nan, 'Pending'


## Field - Age

In [10]:
# Distinct values in age column
print('Distinct values in age column are:',
      str(sorted(covid_hk_case_df['Age'].astype('str').drop_duplicates().to_list()))[1:-1])

Distinct values in age column are: '1', '10', '100', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '3', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '4', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '5', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '7', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '8', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '9', '90', '91', '92', '93', '94', '95', '96', '97', '98', '<1', 'Pending', 'nan'


## Field - Classification*

In [11]:
# Distinct values in classification column
print('Distinct values in classification column are:',
      str(covid_hk_case_df['Classification*'].drop_duplicates().to_list())[1:-1])

Distinct values in classification column are: 'Imported case', 'Epidemiologically linked with imported case', 'Possibly local case', 'Local case', 'Epidemiologically linked with local case', 'Epidemiologically linked with possibly local case', nan, 'Locally acquired case'


## Field - Hospitalised/Discharged/Deceased

In [12]:
# Distinct values in Hospitalised/Discharged/Deceased column
print('Distinct values in Hospitalised/Discharged/Deceased column are:',
      str(covid_hk_case_df['Hospitalised/Discharged/Deceased'].drop_duplicates().to_list())[1:-1])

Distinct values in Hospitalised/Discharged/Deceased column are: 'Discharged', 'Deceased', 'No admission', nan, 'Hospitalised', 'Pending admission'


## Field - Case status*

In [13]:
# Distinct values in Case status column
print('Distinct values in case status column are:',
      str(covid_hk_case_df['Case status*'].drop_duplicates().to_list())[1:-1])

Distinct values in case status column are: 'Confirmed', 'Deleted', 'Asymptomatic', 'Re-positive', 'Pending', 'Unknown'


In [14]:
# Print records with case status "Deleted"
covid_hk_case_df[covid_hk_case_df['Case status*'] == 'Deleted']

Unnamed: 0,Case no.,Report date,Date of onset,Gender,Age,Name of hospital admitted,Hospitalised/Discharged/Deceased,HK/Non-HK resident,Classification*,Case status*
11251,11252,2021-03-13,,,,,,,,Deleted
11264,11265,2021-03-14,,,,,,,,Deleted
11351,11352,2021-03-19,,,,,,,,Deleted
11702,11703,2021-04-20,,,,,,,,Deleted
11725,11726,2021-04-23,,,,,,,,Deleted
13143,13144,2022-01-22,,,,,,,,Deleted


In [15]:
# Print records with case status "Unknown"
covid_hk_case_df[covid_hk_case_df['Case status*'] == 'Unknown']

Unnamed: 0,Case no.,Report date,Date of onset,Gender,Age,Name of hospital admitted,Hospitalised/Discharged/Deceased,HK/Non-HK resident,Classification*,Case status*
14531,14532,2022-03-02,Unknown,M,41,,Pending admission,Non-HK resident,Imported case,Unknown


## Field - Report date

In [16]:
# # Line chart - Number of new Covid-19 cases (Hong Kong)
# plt.subplots(figsize=(15, 8))
# plt.plot(covid_hk_case_df['Report date'].dt.strftime('%Y-%m').value_counts().sort_index())
# plt.title('Number of new Covid-19 cases (Hong Kong)')
# plt.xlabel('Year-Month (%Y-%m)')
# plt.ylabel('Number of newly reported cases')
# plt.xticks(rotation=90)
# plt.show()

## Field - Age

In [17]:
# # Bar plot - Number of Covid-19 cases by Age (Hong Kong)
# plt.subplots(figsize=(15, 8))
# plt.hist(covid_hk_case_df[~covid_hk_case_df['Age'].isin(['<1', 'Pending'])]['Age'].dropna().astype('int'))
# plt.title('Number of Covid-19 cases vs age (Hong Kong)')
# plt.xlabel('Age')
# plt.ylabel('Number of cases')
# plt.show()

## Fields - Gender and HK/Non-HK resident

In [18]:
# # Pie Charts
# # Value counts of Gender and HK/Non-HK resident
# gender_count = covid_hk_case_df['Gender'].dropna().astype('str').value_counts()
# resident_count = covid_hk_case_df['HK/Non-HK resident'].dropna().astype('str').value_counts()

# # Create a figure and add the axes for subgraphs
# fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
# fig.suptitle('Distribution of Covid-19 cases (Hong Kong)', fontsize=16)

# # Pie Chart - Percentage of Covid-19 cases by Gender (Hong Kong)
# axes[0].pie(gender_count.values, labels = gender_count.index, autopct='%1.2f%%', pctdistance=0.66, labeldistance=1.05)
# axes[0].set_xlabel('Gender')

# # Pie Chart - Percentage of Covid-19 cases by HK/Non-HK resident (Hong Kong)
# axes[1].pie(resident_count.values, labels = resident_count.index, autopct='%1.2f%%', pctdistance=0.66, labeldistance=1.05)
# axes[1].set_xlabel('Resident / Non-Resident')

# plt.tight_layout()
# plt.show()

## Fields - Hospitalised/Discharged/Deceased and Classification

In [19]:
# # Bar plots
# # Value counts of Hospitalised/Discharged/Deceased and Classification
# case_outcome_count = covid_hk_case_df['Hospitalised/Discharged/Deceased'].dropna().astype('str').value_counts()
# classification_count = covid_hk_case_df['Classification*'].dropna().astype('str').value_counts()

# # Create a figure and add the axes for subgraphs
# fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 8))
# fig.suptitle('Number of Covid-19 cases (Hong Kong)', fontsize=16)

# # Bar plot - Number of Covid-19 cases by Hospitalised/Discharged/Deceased (Hong Kong)
# axes[0].bar(case_outcome_count.index, case_outcome_count.values)
# axes[0].set_xlabel('Hospitalised/Discharged/Deceased')
# axes[0].set_ylabel('Number of cases')
# axes[0].set_xticklabels(case_outcome_count.index, rotation=45)

# # Bar plot - Number of Covid-19 cases by Classification (Hong Kong)
# axes[1].bar(classification_count.index, classification_count.values)
# axes[1].set_xlabel('Classification*')
# axes[1].set_ylabel('Number of cases')
# axes[1].set_xticklabels(classification_count.index, rotation=90)

# plt.tight_layout()
# plt.show()

## Multivariate Analysis

## Fields - Age and Gender

In [20]:
# # Stack Bar Plot of Age Group and Gender
# age_gender_df = covid_hk_case_df[
#     (covid_hk_case_df['Age'] != 'Pending') & \
#     (~covid_hk_case_df['Age'].isna()) &  \
#     (covid_hk_case_df['Gender'] != 'Pending')]

# age_gender_df = age_gender_df[['Age', 'Gender']].replace({'Age': {'<1':'0'}})

# age_gender_df['Age Group'] = pd.cut(age_gender_df['Age'].astype('int'),
#                                     bins=[0, 12, 20, 30, 40, 50, 60, 70, 80, 200],
#                                     labels=['0-11', '12-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80 and above'],
#                                     right=False)

# age_gender_cnt = age_gender_df.groupby(['Age Group', 'Gender']).size().reset_index(name='count')
# age_gender_cnt = age_gender_cnt.pivot(index='Age Group', columns='Gender', values='count')
# age_gender_cnt.reset_index(inplace=True)
# age_gender_cnt['Age Group'] = pd.Categorical(age_gender_cnt['Age Group'])
# age_gender_cnt = age_gender_cnt.sort_values('Age Group')

# age_gender_cnt.plot(
#     x = 'Age Group', 
#     kind = 'barh', 
#     stacked = True, 
#     title = 'Stacked Bar Chart of Age Group and Gender for Covid-19 cases (Hong Kong)', 
#     mark_right = True) 

# plt.xlabel('Count')
# plt.ylabel('Age Group')
# plt.show()

## Missing Value Analysis

In [21]:
# Number of missing values in each column
covid_module.print_missing_val_count(covid_hk_case_df)

The following columns have missing values:
Date of onset: 20 (0.1%)
Gender: 6 (0.0%)
Age: 6 (0.0%)
Name of hospital admitted: 15441 (100.0%)
Hospitalised/Discharged/Deceased: 6 (0.0%)
HK/Non-HK resident: 6 (0.0%)
Classification*: 6 (0.0%)


The field "Name of hospital admitted" does not have non-NULL values.

As shown in univariate analysis on field "Case status*", all missing values in fields "Gender", "Age", "Hospitalised/Discharged/Deceased", "HK/Non-HK resident" and "Classification*" belong to records with "Case status*" == "Deleted".