# The University of Hong Kong
## DASC7600 Data Science Project 2024
## STD - HK - Case Count

# Import Modules and Settings

In [1]:
import pandas as pd
import warnings

import covid_module

# Settings
# NOTE: this suppresses every warning category for the whole session,
# including pandas warnings raised by the cells below.
warnings.filterwarnings('ignore')

# Load Data

In [2]:
# Load the raw Hong Kong case-count csv into a DataFrame
raw_csv_path = './data/raw_data/hk/latest_situation_of_reported_cases_covid_19_eng.csv'
covid_hk_case_count_std = pd.read_csv(raw_csv_path)

# Modify Column Names

In [3]:
# Map the raw csv headers onto the standardized column names
col_name_map = {
    'As of date': 'report_date',
    'Number of discharge cases': 'cuml_dischg_cnt',
    'Number of death cases': 'cuml_death_cnt',
}

# Rebind the renamed frame instead of mutating it in place
covid_hk_case_count_std = covid_hk_case_count_std.rename(columns=col_name_map)

# Update Date Format

In [4]:
# Update date format from DD/MM/YYYY to YYYYMMDD for report_date
# NOTE(review): the return value is discarded, so the helper presumably
# rewrites the column in place - confirm against covid_module.
covid_module.change_date_format(covid_hk_case_count_std,
                                'report_date', '%d/%m/%Y',
                                'report_date', '%Y%m%d')

# Drop Records Reported After 2023-01-29

In [5]:
# Keep only records reported on or before 2023-01-29. The comparison is
# lexicographic on YYYYMMDD strings, which matches chronological order.
# .copy() makes the result an independent frame so that later column
# assignments do not operate on a slice of the original DataFrame.
covid_hk_case_count_std = covid_hk_case_count_std[
    covid_hk_case_count_std['report_date'] <= '20230129'
].copy()

# Replace Values

In [6]:
# Overwrite the cumulative discharge count with -1 for every record
# dated 2022-02-13 or later; earlier records keep their value.
dischg_overwrite_rows = covid_hk_case_count_std['report_date'] >= '20220213'
covid_hk_case_count_std['cuml_dischg_cnt'] = (
    covid_hk_case_count_std['cuml_dischg_cnt'].mask(dischg_overwrite_rows, -1)
)

In [None]:
# Correct the 2 typos

# Feature Creation

In [7]:
# Build the cumulative case count from two source columns, switching at
# 2022-01-01: records dated before it take 'Number of confirmed cases',
# records dated on or after it take the nucleic-acid positive count.
# Rows matching neither condition keep the initial value 0.
report_dates = covid_hk_case_count_std['report_date']

covid_hk_case_count_std['cuml_case_cnt'] = (
    pd.Series(0, index=covid_hk_case_count_std.index)
    .mask(report_dates < '20220101',
          covid_hk_case_count_std['Number of confirmed cases'])
    .mask(report_dates >= '20220101',
          covid_hk_case_count_std['Number of cases tested positive for SARS-CoV-2 virus by nucleic acid tests'])
)

In [8]:
# Create year and year-month columns for report date
# NOTE(review): the return value is discarded, so the helper presumably adds
# the 'report_year' and 'report_year_month' columns (selected in a later
# cell) to the DataFrame in place - confirm against covid_module.
covid_module.get_year_month_part(covid_hk_case_count_std, 'report_date', '%Y%m%d')

# Only Keep Required Columns

In [9]:
# Restrict the frame to the standardized output columns, in export order
required_columns = [
    'report_date',
    'cuml_case_cnt',
    'cuml_dischg_cnt',
    'cuml_death_cnt',
    'report_year',
    'report_year_month',
]
covid_hk_case_count_std = covid_hk_case_count_std[required_columns]

# Export Data Pre-processing Result to csv File

In [10]:
# Write the standardized result to csv, omitting the DataFrame index
std_csv_path = './data/std_data/hk/covid_hk_case_count_std.csv'
covid_hk_case_count_std.to_csv(std_csv_path, index=False)