# The University of Hong Kong
## DASC7600 Data Science Project 2024

# Import modules and configuration

In [1]:
import pandas as pd
import warnings
import matplotlib.pyplot as plt

# Show every column when a wide dataframe is displayed (no column truncation)
pd.set_option('display.max_columns', None)
# NOTE(review): with no category/module filter this silences ALL warnings for the
# whole session, including pandas deprecation and chained-assignment warnings —
# consider narrowing it to the specific warning that was noisy.
warnings.filterwarnings("ignore")

%matplotlib inline

# Functions

In [2]:
def print_missing_val_count(df: pd.DataFrame) -> None:
    """Print the number and percentage of missing values for each column.

    Only columns that contain at least one missing value are listed, in the
    dataframe's column order. When no column has missing values, a single
    message saying so is printed instead.

    Args:
        df: Dataframe to inspect.

    Returns:
        None. The report is written to standard output.
    """
    total_rows = len(df)
    na_per_col = df.isnull().sum()

    # Guard clause: nothing to report when no value is missing
    if not na_per_col.sum() > 0:
        print("The dataframe does not have missing values.")
        return

    print("The following columns have missing values:")
    # Iterate only over the columns that actually contain missing values
    for col, na_cnt in na_per_col[na_per_col > 0].items():
        print(f"{col}: {na_cnt} ({100*na_cnt/total_rows:0.1f}%)")

# Load Data

In [3]:
# Read the csv file into a dataframe (the path is relative to the working directory)
covid_hk_df = pd.read_csv('./data/hk/enhanced_sur_covid_19_eng.csv')

# Exploratory Data Analysis (EDA)

In [4]:
# Display the first 10 records of the dataframe
covid_hk_df.head(10)

Unnamed: 0,Case no.,Report date,Date of onset,Gender,Age,Name of hospital admitted,Hospitalised/Discharged/Deceased,HK/Non-HK resident,Classification*,Case status*
0,1,23/01/2020,21/01/2020,M,39,,Discharged,Non-HK resident,Imported case,Confirmed
1,2,23/01/2020,18/01/2020,M,56,,Discharged,HK resident,Imported case,Confirmed
2,3,24/01/2020,20/01/2020,F,62,,Discharged,Non-HK resident,Imported case,Confirmed
3,4,24/01/2020,23/01/2020,F,62,,Discharged,Non-HK resident,Imported case,Confirmed
4,5,24/01/2020,23/01/2020,M,63,,Discharged,Non-HK resident,Imported case,Confirmed
5,6,26/01/2020,21/01/2020,M,47,,Discharged,HK resident,Imported case,Confirmed
6,7,26/01/2020,21/01/2020,F,68,,Discharged,HK resident,Imported case,Confirmed
7,8,26/01/2020,25/01/2020,M,64,,Discharged,Non-HK resident,Imported case,Confirmed
8,9,29/01/2020,25/01/2020,F,73,,Discharged,Non-HK resident,Imported case,Confirmed
9,10,29/01/2020,25/01/2020,M,72,,Discharged,Non-HK resident,Imported case,Confirmed


In [5]:
# Basic information of dataframe: column names, non-null counts, dtypes and memory usage
covid_hk_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15441 entries, 0 to 15440
Data columns (total 10 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Case no.                          15441 non-null  int64  
 1   Report date                       15441 non-null  object 
 2   Date of onset                     15421 non-null  object 
 3   Gender                            15435 non-null  object 
 4   Age                               15435 non-null  object 
 5   Name of hospital admitted         0 non-null      float64
 6   Hospitalised/Discharged/Deceased  15435 non-null  object 
 7   HK/Non-HK resident                15435 non-null  object 
 8   Classification*                   15435 non-null  object 
 9   Case status*                      15441 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 1.2+ MB


In [6]:
# Rename columns to short snake_case identifiers
col_name_map = {
    'Case no.': 'case_id',
    'Report date': 'report_date',
    'Date of onset': 'onset_date',
    'Gender': 'gender',
    'Age': 'age',
    'Name of hospital admitted': 'hospital_name',
    'Hospitalised/Discharged/Deceased': 'case_outcome',
    'HK/Non-HK resident': 'resident',
    'Classification*': 'case_type',
    'Case status*': 'case_status',
}

# Rebind the renamed frame instead of mutating with inplace=True; labels missing
# from the frame are ignored by rename, so re-running this cell is harmless.
covid_hk_df = covid_hk_df.rename(columns=col_name_map)

In [7]:
# Number of missing values in each column
print_missing_val_count(covid_hk_df)

The following columns have missing values:
onset_date: 20 (0.1%)
gender: 6 (0.0%)
age: 6 (0.0%)
hospital_name: 15441 (100.0%)
case_outcome: 6 (0.0%)
resident: 6 (0.0%)
case_type: 6 (0.0%)


In [8]:
# Distinct values in hospital name column (NaN is kept as a value by drop_duplicates,
# so a column with only missing values shows a single NaN row)
covid_hk_df["hospital_name"].drop_duplicates()

0   NaN
Name: hospital_name, dtype: float64

In [9]:
# Show the rows whose case status is "Deleted"
covid_hk_df.loc[covid_hk_df["case_status"].eq("Deleted")]

Unnamed: 0,case_id,report_date,onset_date,gender,age,hospital_name,case_outcome,resident,case_type,case_status
11251,11252,13/03/2021,,,,,,,,Deleted
11264,11265,14/03/2021,,,,,,,,Deleted
11351,11352,19/03/2021,,,,,,,,Deleted
11702,11703,20/04/2021,,,,,,,,Deleted
11725,11726,23/04/2021,,,,,,,,Deleted
13143,13144,22/01/2022,,,,,,,,Deleted


In [10]:
# Show the rows whose case status is "Unknown"
covid_hk_df.loc[covid_hk_df["case_status"].eq("Unknown")]

Unnamed: 0,case_id,report_date,onset_date,gender,age,hospital_name,case_outcome,resident,case_type,case_status
14531,14532,03/02/2022,Unknown,M,41,,Pending admission,Non-HK resident,Imported case,Unknown


In [11]:
# Remove the hospital name column from the dataframe
covid_hk_df = covid_hk_df.drop(columns=["hospital_name"])

In [12]:
# Keep only the records whose status is neither "Deleted" nor "Unknown"
covid_hk_df = covid_hk_df.loc[~covid_hk_df["case_status"].isin(["Deleted", "Unknown"])]