# The University of Hong Kong
## DASC7600 Data Science Project 2024
## STD - HK - Case Details

# Import Modules and Settings

In [1]:
import os
import sys

# Put the parent of the current working directory on the module search path
# so that modules located there can be imported
project_dir = os.path.split(os.getcwd())[0]
sys.path.append(project_dir)

In [2]:
import pandas as pd
import warnings

import covid_module

# Settings
# Install a blanket "ignore" filter so that no warnings are shown from here on
warnings.filterwarnings(action='ignore')

# Functions

In [3]:
def get_import_local_col(value: str) -> str:
    """Classify a case-type description as imported or local.

    The match is a case-insensitive substring search. 'IMPORT' is checked
    before 'LOCAL', so a value containing both keywords yields 'import'.

    Args:
        value: Case classification text.

    Returns:
        'import', 'local', or 'N/A' when neither keyword is present.
    """
    upper_value = value.upper()
    # Order matters: the first keyword found decides the label
    for keyword, label in (('IMPORT', 'import'), ('LOCAL', 'local')):
        if keyword in upper_value:
            return label
    return 'N/A'

# Load Data

In [4]:
# Read the raw HK csv file from the project's raw data folder
raw_case_csv_path = project_dir + '/data/raw_data/hk/enhanced_sur_covid_19_eng.csv'
covid_hk_case_std = pd.read_csv(raw_case_csv_path)

# Drop Non-Informative Columns

In [5]:
# Drop the case number and hospital name columns
covid_hk_case_std = covid_hk_case_std.drop(columns=['Case no.', 'Name of hospital admitted'])

# Modify Column Names

In [6]:
# Map the raw column headers to snake_case column names
col_name_map = {
    'Report date': 'report_date',
    'Date of onset': 'onset_date',
    'Gender': 'gender',
    'Age': 'age',
    'Hospitalised/Discharged/Deceased': 'case_outcome',
    'HK/Non-HK resident': 'resident',
    'Classification*': 'case_type',
    'Case status*': 'case_status',
}

# Apply the mapping to the data frame
covid_hk_case_std = covid_hk_case_std.rename(columns=col_name_map)

# Drop Deleted Records

In [7]:
# Keep only the records whose status is not "Deleted"
is_not_deleted = covid_hk_case_std['case_status'].ne('Deleted')
covid_hk_case_std = covid_hk_case_std.loc[is_not_deleted]

# Replace Values

In [8]:
# Replace the free-text values found in "onset_date" and "age"
onset_date_fixes = {
    'Asymptomatic': 'N/A',
    'January': '01/01/2020',    # Report date for this record is 27/3/2020
    'Mid-March': '15/03/2020',  # Report date for this record is 6/6/2020
    'Mid-July': '15/07/2020',   # Report date for this record is 14/8/2020
    'October': '01/10/2020',    # Report date for this record is 11/11/2020
}
age_fixes = {'<1': '0'}

# Each mapping is applied to its own column only
covid_hk_case_std = covid_hk_case_std.replace({'onset_date': onset_date_fixes,
                                               'age': age_fixes})

# Fill Missing Values as N/A

In [9]:
# Fill NA with "N/A", then replace "Pending" and "Unknown" with "N/A" as well
covid_hk_case_std = (
    covid_hk_case_std
    .fillna('N/A')
    .replace({'Pending': 'N/A', 'Unknown': 'N/A'})
)

# Update Date Format

In [10]:
# Update date format from DD/MM/YYYY to YYYYMMDD for report_date, then onset_date
# (the same column name is passed twice, presumably as source and target column)
for date_col in ('report_date', 'onset_date'):
    covid_module.change_date_format(covid_hk_case_std,
                                    date_col, '%d/%m/%Y',
                                    date_col, '%Y%m%d')

# Feature Creation

In [11]:
# Create age group column from the records that have a known age
known_age = covid_hk_case_std.loc[covid_hk_case_std['age'] != 'N/A', 'age'].astype('int')

age_bin_edges = [0, 12, 20, 30, 40, 50, 60, 70, 80, 200]
age_bin_labels = ['0-11', '12-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80 and above']

# right=False makes every bin left-closed / right-open, e.g. [0, 12) -> '0-11'.
# Records with age "N/A" are not part of known_age, so they receive no age group.
covid_hk_case_std['age_group'] = pd.cut(known_age,
                                        bins=age_bin_edges,
                                        labels=age_bin_labels,
                                        right=False)

In [12]:
# Create import/local column by classifying each case type value
covid_hk_case_std['import_local'] = covid_hk_case_std['case_type'].apply(get_import_local_col)

In [13]:
# Create year and year-month columns for report date
# NOTE(review): get_year_month_part is defined in covid_module, outside this notebook.
# Its return value is not used here, so it presumably adds the new columns to
# covid_hk_case_std in place, with '%Y%m%d' describing the current format of
# report_date - confirm against covid_module.
covid_module.get_year_month_part(covid_hk_case_std, 'report_date', '%Y%m%d')

# Export Data Pre-processing Result to csv File

In [14]:
# Write the pre-processed data to csv, leaving out the index column
std_case_csv_path = project_dir + '/data/std_data/hk/covid_hk_case_detail_std.csv'
covid_hk_case_std.to_csv(std_case_csv_path, index=False)