# The University of Hong Kong
## DASC7600 Data Science Project 2024
## STD - HK - Vaccination

# Import Modules and Settings

In [1]:
import os
import sys

# Add project directory to system path
# The project directory is the parent of the current working directory;
# appending it to sys.path lets modules stored there be imported.
project_dir = os.path.split(os.getcwd())[0]
sys.path.append(project_dir)

In [2]:
import pandas as pd
import warnings

import covid_module

# Settings
# Install a global filter that ignores warnings of every category
warnings.simplefilter('ignore')

# Load Data

In [3]:
# Read csv file
# Raw vaccination data, located under the project directory
raw_vacc_csv_path = project_dir + '/data/raw_data/hk/vaccination-rates-over-time-by-age.csv'
covid_hk_vacc_std = pd.read_csv(raw_vacc_csv_path)

# Modify Column Names

In [4]:
# Rename columns
## Date, age, sex columns
col_name_map = {
    'Date': 'report_date',
    'Age Group': 'age_group',
    'Sex': 'gender',
}
covid_hk_vacc_std.rename(columns=col_name_map, inplace=True)

## Sinovac and BioNTech columns
# Text substitutions applied, in this order, to every column name:
# spaces become underscores and the vaccine names are shortened.
name_substitutions = ((' ', '_'), ('Sinovac', 'sinov'), ('BioNTech', 'biont'))

col_name_list = []
for original_name in covid_hk_vacc_std.columns:
    tidy_name = original_name
    for old_text, new_text in name_substitutions:
        tidy_name = tidy_name.replace(old_text, new_text)
    col_name_list.append(tidy_name)

covid_hk_vacc_std.columns = col_name_list

# Add Rows for 0 Dose

In [5]:
demo_columns = ['age_group', 'gender']
key_columns = ['report_date'] + demo_columns

# Every calendar day from the earliest to the latest report date, as strings
first_report_date = covid_hk_vacc_std['report_date'].min()
last_report_date = covid_hk_vacc_std['report_date'].max()
all_report_dates = pd.date_range(first_report_date, last_report_date).astype('str')

# Full grid of (report_date, age_group, gender) combinations
full_key_index = pd.MultiIndex.from_product(
    [all_report_dates,
     covid_hk_vacc_std['age_group'].unique(),
     covid_hk_vacc_std['gender'].unique()],
    names=key_columns)

# Key combinations absent from the data are added with 0 in every other column
covid_hk_vacc_std = covid_hk_vacc_std \
    .set_index(key_columns) \
    .reindex(full_key_index, fill_value=0) \
    .reset_index()

# Update Date Format

In [6]:
# Update date format to YYYYMMDD for report_date
# The arguments after the DataFrame are presumably (source column, current
# format, target column, new format) -- confirm the parameter order against
# covid_module.change_date_format.
# NOTE(review): the return value is discarded, so the helper is assumed to
# modify covid_hk_vacc_std in place -- confirm.
covid_module.change_date_format(covid_hk_vacc_std,
                                'report_date', '%Y-%m-%d',
                                'report_date', '%Y%m%d')

# Feature Creation

In [7]:
# # Dose columns
# vacc_list = ['sinov', 'biont']
# ord_list = ['1st', '2nd', '3rd', '4th', '5th', '6th', '7th']

# covid_hk_vacc_std['dose_all'] = 0
# for vacc_type in vacc_list:
#     for ord in ord_list:
#         col_name =  f'{vacc_type}_{ord}_dose'
#         # Dose (all) column
#         covid_hk_vacc_std['dose_all'] += covid_hk_vacc_std[col_name]
#         # Cumulative sum
#         covid_hk_vacc_std[col_name + '_cum'] = covid_hk_vacc_std[demo_columns + [col_name]] \
#             .groupby(demo_columns) \
#             .cumsum(axis=0)

# # Cumulative sum of dose (all) column
# covid_hk_vacc_std['dose_all_cum'] = covid_hk_vacc_std[demo_columns + ['dose_all']] \
#     .groupby(demo_columns) \
#     .cumsum(axis=0)

# Export Data Pre-processing Result to csv Files

## Daily Counts

In [8]:
# Aggregation
# Total doses per report date, summed over all age groups and genders.
# The dose columns are selected with a list: selecting several columns from
# a groupby with a bare tuple of names (the previous form) was deprecated and
# raises a ValueError in pandas >= 2.0.
daily_dose_columns = ['sinov_1st_dose', 'sinov_2nd_dose', 'sinov_3rd_dose',
                      'biont_1st_dose', 'biont_2nd_dose', 'biont_3rd_dose']

covid_hk_vacc_daily_count_std = covid_hk_vacc_std \
    .groupby('report_date') \
    [daily_dose_columns] \
    .sum() \
    .reset_index()

# Export to csv file
# The DataFrame index is not written to the file
daily_count_csv_path = project_dir + '/data/std_data/hk/covid_hk_vacc_daily_count_std.csv'
covid_hk_vacc_daily_count_std.to_csv(daily_count_csv_path, index=False)

## Age Group Daily Counts

In [9]:
# New Age Group
# Coarser age bands, each listing the original age groups it absorbs
age_band_members = {
    'below_20': ['0-11', '12-19'],
    '20_to_69': ['20-29', '30-39', '40-49', '50-59', '60-69'],
    'over_69': ['70-79', '80 and above'],
}

# Invert the table into an {original age group: new age band} lookup
new_age_group_mapping = {
    original_group: age_band
    for age_band, original_groups in age_band_members.items()
    for original_group in original_groups
}

# Overwrite age_group with the coarser band; values that are not keys of
# the mapping are left unchanged
covid_hk_vacc_std['age_group'] = covid_hk_vacc_std['age_group'].replace(new_age_group_mapping)

# Aggregation
# Sum the first- and second-dose counts per report date and age group
age_grp_dose_columns = ['sinov_1st_dose', 'sinov_2nd_dose',
                        'biont_1st_dose', 'biont_2nd_dose']

covid_hk_vacc_agg = covid_hk_vacc_std \
    .groupby(['report_date', 'age_group']) \
    [age_grp_dose_columns] \
    .sum() \
    .reset_index()


# Use Pivot table structure
# For every dose column, build a report_date x age_group table and rename its
# columns to '<vaccine>_<ordinal>_age_<age_group>', e.g. 'sinov_1st_age_below_20'.
# One loop replaces four copy-pasted pivot/rename stanzas.
dose_columns_to_pivot = ['sinov_1st_dose', 'sinov_2nd_dose',
                         'biont_1st_dose', 'biont_2nd_dose']

dose_pivots = []
for dose_column in dose_columns_to_pivot:
    dose_pivot = covid_hk_vacc_agg.pivot(index='report_date', columns='age_group', values=dose_column)
    column_prefix = dose_column[:-len('_dose')]  # 'sinov_1st_dose' -> 'sinov_1st'
    dose_pivot.columns = [f'{column_prefix}_age_{age_group}' for age_group in dose_pivot.columns]
    dose_pivots.append(dose_pivot)

# Keep the individual pivots available under their original names
sinov_1st_dose_pivot, sinov_2nd_dose_pivot, biont_1st_dose_pivot, biont_2nd_dose_pivot = dose_pivots

# Combine the pivoted results
# Every pivot is indexed by report_date, so the tables are aligned directly on
# that index (outer join) instead of chaining joins with positional on/how
# arguments. sort_index keeps the rows ordered by report_date.
covid_hk_vacc_age_grp_daily_count_std = pd.concat(dose_pivots, axis=1, join='outer') \
    .sort_index() \
    .fillna(0) \
    .reset_index()

# Export to csv file
# The DataFrame index is not written to the file
age_grp_daily_count_csv_path = project_dir + '/data/std_data/hk/covid_hk_vacc_age_grp_daily_count_std.csv'
covid_hk_vacc_age_grp_daily_count_std.to_csv(age_grp_daily_count_csv_path, index=False)