# The University of Hong Kong
## DASC7600 Data Science Project 2024
## STD - HK - Vaccination

# Import Modules and Settings

In [1]:
import os
import sys

# Project root is taken to be the parent of the current working directory
# (i.e. the notebook is expected to be launched from a sub-folder of the project).
project_dir = os.path.dirname(os.getcwd())

# Make project-level modules importable. Guard against duplicates so that
# re-running this cell does not keep growing sys.path.
if project_dir not in sys.path:
    sys.path.append(project_dir)

In [2]:
import pandas as pd
import warnings

import covid_module

# Settings
# Suppress every warning category from this point on. Note that this also
# hides pandas deprecation / future warnings raised by the cells below.
warnings.filterwarnings('ignore')

# Load Data

In [3]:
# Read csv file
# The path is assembled with os.path.join instead of string concatenation so
# that the separators are handled by the standard library.
raw_csv_path = os.path.join(project_dir, 'data', 'raw_data', 'hk',
                            'vaccination-rates-over-time-by-age.csv')
covid_hk_vacc_std = pd.read_csv(raw_csv_path)

# Modify Column Names

In [4]:
# Rename columns
# Step 1: the key columns (date, age group, sex) get explicit new names.
col_name_map = {
    'Date': 'report_date',
    'Age Group': 'age_group',
    'Sex': 'gender',
}
covid_hk_vacc_std.rename(columns=col_name_map, inplace=True)

# Step 2: normalise every header - spaces become underscores and the vaccine
# brand names (Sinovac / BioNTech) are shortened to 'sinov' / 'biont'.
col_name_list = list(
    covid_hk_vacc_std.columns
    .str.replace(' ', '_', regex=False)
    .str.replace('Sinovac', 'sinov', regex=False)
    .str.replace('BioNTech', 'biont', regex=False)
)
covid_hk_vacc_std.columns = col_name_list

# Add Rows for 0 Dose

In [5]:
demo_columns = ['age_group', 'gender']
key_columns = ['report_date'] + demo_columns

# Every calendar day between the first and last report date, converted back
# to strings so the values are comparable with the report_date column.
all_report_dates = pd.date_range(
    covid_hk_vacc_std['report_date'].min(),
    covid_hk_vacc_std['report_date'].max(),
).astype('str')

# Full grid of (date, age group, gender) combinations.
full_key_index = pd.MultiIndex.from_product(
    [
        all_report_dates,
        covid_hk_vacc_std['age_group'].unique(),
        covid_hk_vacc_std['gender'].unique(),
    ],
    names=key_columns,
)

# Align the data to the full grid; combinations that were absent from the
# raw file are inserted with 0 in every value column.
covid_hk_vacc_std = (
    covid_hk_vacc_std
    .set_index(key_columns)
    .reindex(full_key_index, fill_value=0)
    .reset_index()
)

# Update Date Format

In [6]:
# Update date format to YYYYMMDD for report_date
# The same column name is passed as both the source and the target and the
# return value is not used, so the helper presumably rewrites the column on
# the DataFrame in place - confirm against covid_module.change_date_format.
covid_module.change_date_format(covid_hk_vacc_std,
                                'report_date', '%Y-%m-%d',
                                'report_date', '%Y%m%d')

# Feature Creation

In [7]:
# Dose columns
# Each dose column is named '<vaccine>_<ordinal>_dose', e.g. 'sinov_1st_dose'.
vacc_list = ['sinov', 'biont']
ord_list = ['1st', '2nd', '3rd', '4th', '5th', '6th', '7th']

# NOTE: the cumulative sums below accumulate in row order within each
# (age_group, gender) group, so they rely on the rows being in ascending
# report_date order (which the reindex in the "Add Rows for 0 Dose" step
# produces).
covid_hk_vacc_std['dose_all'] = 0
for vacc_type in vacc_list:
    # 'dose_ord' rather than 'ord' so the builtin ord() is not shadowed
    # in the notebook namespace.
    for dose_ord in ord_list:
        col_name = f'{vacc_type}_{dose_ord}_dose'
        # Dose (all) column: total doses across both vaccines and all ordinals
        covid_hk_vacc_std['dose_all'] += covid_hk_vacc_std[col_name]
        # Cumulative sum per demographic group.
        # The 'axis' argument of GroupBy.cumsum is deprecated, so it is
        # omitted; the default already accumulates down the rows.
        covid_hk_vacc_std[col_name + '_cum'] = covid_hk_vacc_std \
            .groupby(demo_columns)[col_name] \
            .cumsum()

# Cumulative sum of dose (all) column
covid_hk_vacc_std['dose_all_cum'] = covid_hk_vacc_std \
    .groupby(demo_columns)['dose_all'] \
    .cumsum()

In [8]:
# Create year and year-month columns for report date
# NOTE(review): report_date was converted to '%Y%m%d' in the "Update Date
# Format" step, yet '%Y-%m-%d' is passed here. Confirm what this argument
# means in covid_module.get_year_month_part (format of the existing column
# vs. something else) - if it is the input format, it looks stale.
covid_module.get_year_month_part(covid_hk_vacc_std, 'report_date', '%Y-%m-%d')

# Feature Transformation

In [9]:
# TBD

# Categorical Features Handling

In [10]:
# covid_module.one_hot_encoding(covid_hk_vacc_std, 'gender')

# Export Data Pre-processing Result to csv File

In [11]:
# Export to csv file
# Build the target path with os.path.join instead of string concatenation.
std_csv_path = os.path.join(project_dir, 'data', 'std_data', 'hk',
                            'covid_hk_vacc_std.csv')
# Create the output folder when it is missing so the export does not fail
# on a fresh checkout.
os.makedirs(os.path.dirname(std_csv_path), exist_ok=True)
# index=False: the row index carries no information after reset_index()
covid_hk_vacc_std.to_csv(std_csv_path, index=False)