# The University of Hong Kong
## DASC7600 Data Science Project 2024
## STD - HK - Case Counts

# Import Modules and Settings

In [1]:
import os
import sys

# Add project directory to system path
# The project directory is taken to be the parent of the current working
# directory; modules located there become importable
# (presumably this is where covid_module lives - confirm).
project_dir = os.path.dirname(os.getcwd())
# Guard against piling up duplicate entries when this cell is re-run
# in the same kernel session.
if project_dir not in sys.path:
    sys.path.append(project_dir)

In [2]:
import datetime
import pandas as pd
import warnings

import covid_module

# Settings
# Install a catch-all filter so that no warning of any category is displayed
warnings.simplefilter('ignore')

# Functions

In [3]:
def correct_value_for_monotonic(df: pd.DataFrame,
                                date_col: str,
                                target_col: str,
                                target_date_str: str) -> None:
    """Replace one day's value with the integer mean of its two neighbours.

    The value of ``target_col`` in the row(s) whose ``date_col`` equals
    ``target_date_str`` is overwritten, in place, with the average of the
    values found on the previous and on the next calendar day. The average
    is floored to a whole number so that a count column stays integral.

    Args:
        df: Frame to modify in place.
        date_col: Name of the column holding dates as 'YYYYMMDD' strings.
        target_col: Name of the numeric column to correct.
        target_date_str: Date to correct, formatted as 'YYYYMMDD'.

    Raises:
        ValueError: If ``target_date_str`` is not in 'YYYYMMDD' format.
        IndexError: If the previous or the next day has no row in ``df``.
    """
    date_fmt = '%Y%m%d'
    one_day = datetime.timedelta(days=1)
    target_dt = datetime.datetime.strptime(target_date_str, date_fmt)

    def _value_on(day: datetime.datetime):
        """Return the first ``target_col`` value recorded on ``day``."""
        day_str = day.strftime(date_fmt)
        matches = df.loc[df[date_col] == day_str, target_col].values
        if len(matches) == 0:
            raise IndexError(
                f"no row with {date_col} == {day_str!r}; "
                f"cannot correct {target_col} on {target_date_str}"
            )
        return matches[0]

    neighbour_sum = int(_value_on(target_dt - one_day) + _value_on(target_dt + one_day))

    # Integer division: the previous `int(sum) / 2` produced a float (possibly
    # x.5) and silently upcast an integer count column to float.
    df.loc[df[date_col] == target_date_str, target_col] = neighbour_sum // 2

# Load Data

In [4]:
# Load the raw HK case-count csv from the project's raw_data folder
covid_hk_case_count_std = pd.read_csv(
    f'{project_dir}/data/raw_data/hk/latest_situation_of_reported_cases_covid_19_eng.csv'
)

# Modify Column Names

In [5]:
# Map the raw csv headers onto the standardised column names
col_name_map = {
    'As of date':
        'report_date',
    'Number of confirmed cases':
        'cuml_confirm_case_cnt',
    'Number of cases tested positive for SARS-CoV-2 virus by nucleic acid tests':
        'cuml_nucl_acid_case_cnt',
    'Number of discharge cases':
        'cuml_dischg_cnt',
    'Number of death cases':
        'cuml_death_cnt',
}

# Apply the mapping to the column axis of the existing frame
covid_hk_case_count_std.rename(mapper=col_name_map, axis='columns', inplace=True)

# Update Date Format

In [6]:
# Update date format of report_date from DD/MM/YYYY to YYYYMMDD
# NOTE(review): the return value is discarded, so change_date_format presumably
# rewrites the column in place - confirm against covid_module.
covid_module.change_date_format(covid_hk_case_count_std,
                                'report_date', '%d/%m/%Y',
                                'report_date', '%Y%m%d')

# Drop Rows After the Cut-off Date

In [7]:
# Only keep data on or before 2023-01-29 (the comparison is inclusive)
## The dataset has data till 2023-05-29
## Starting from 2023-01-30, the only 2 columns with values are 'Number of positive nucleic acid test laboratory detections' and 'Number of death cases related to COVID-19'
## report_date is compared as a YYYYMMDD string, which orders the same way as the dates themselves
## .copy() makes the filtered frame independent of the original, so the column assignments
## that follow do not trigger SettingWithCopyWarning
covid_hk_case_count_std = covid_hk_case_count_std[covid_hk_case_count_std['report_date'] <= '20230129'].copy()

# Replace Values

In [8]:
# From 2022-02-13 onwards, overwrite the discharge count with -1 to mark it as missing
dischg_is_kept = ~(covid_hk_case_count_std['report_date'] >= '20220213')
covid_hk_case_count_std['cuml_dischg_cnt'] = \
    covid_hk_case_count_std['cuml_dischg_cnt'].where(dischg_is_kept, -1)

In [9]:
# On 2021-09-03 and 2021-10-09, replace the confirmed case count and the discharge count
# with the value derived from the neighbouring days
for fix_col in ('cuml_confirm_case_cnt', 'cuml_dischg_cnt'):
    for fix_date in ('20210903', '20211009'):
        correct_value_for_monotonic(covid_hk_case_count_std, 'report_date', fix_col, fix_date)

# Feature Creation

In [10]:
# Obtain cumulative case count from 2 columns
## Before 2022-01-01 the confirmed case count is used;
## from 2022-01-01 onwards the nucleic acid positive count is used
is_before_2022 = covid_hk_case_count_std['report_date'] < '20220101'

covid_hk_case_count_std['cuml_case_cnt'] = covid_hk_case_count_std['cuml_confirm_case_cnt'] \
    .where(is_before_2022, covid_hk_case_count_std['cuml_nucl_acid_case_cnt'])

# Obtain case count and recover count column
## diff() is the change from the previous row; the first row has no previous row, so it is set to 0
covid_hk_case_count_std['new_case_cnt'] = covid_hk_case_count_std['cuml_case_cnt'].diff().fillna(0)

## cuml_dischg_cnt uses -1 as a missing-value marker. Differencing across that marker
## gives a large bogus negative on the first marked day and a misleading 0 afterwards,
## so the marker is carried over to new_dischg_cnt instead.
covid_hk_case_count_std['new_dischg_cnt'] = covid_hk_case_count_std['cuml_dischg_cnt'].diff().fillna(0) \
    .mask(covid_hk_case_count_std['cuml_dischg_cnt'] == -1, -1)

In [11]:
# Create year and year-month columns for report date
# NOTE(review): presumably adds 'report_year' and 'report_year_month' to the frame in place
# (both columns are selected later, and the return value is discarded) - confirm against covid_module.
covid_module.get_year_month_part(covid_hk_case_count_std, 'report_date', '%Y%m%d')

# Only Keep Required Columns

In [12]:
# Columns retained in the standardised output, in their final order
required_cols = [
    'report_date',
    'report_year',
    'report_year_month',
    'cuml_case_cnt',
    'cuml_dischg_cnt',
    'cuml_death_cnt',
    'new_case_cnt',
    'new_dischg_cnt',
]
covid_hk_case_count_std = covid_hk_case_count_std[required_cols]

# Export Data Pre-processing Result to csv File

In [13]:
# Write the standardised table to the std_data folder, leaving out the index column
covid_hk_case_count_std.to_csv(
    f'{project_dir}/data/std_data/hk/covid_hk_case_count_std.csv',
    index=False,
)