# The University of Hong Kong
## DASC7600 Data Science Project 2024
## EDA - HK - Vaccination

# Import Modules and Settings

In [1]:
import os
import sys

# Add the project directory (the parent of the current working directory) to the
# module search path so that modules stored there can be imported.
# NOTE(review): this assumes the kernel's working directory is the notebook's own folder.
project_dir = os.path.dirname(os.getcwd())
# Guard the append so that re-running this cell does not stack duplicate entries
if project_dir not in sys.path:
    sys.path.append(project_dir)

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import warnings

import covid_module

# Notebook-wide settings
# - show every column when a dataframe is displayed (no column truncation)
pd.options.display.max_columns = None
# - silence all warnings for the rest of the session
warnings.filterwarnings(action='ignore')

%matplotlib inline

# Load Data

In [3]:
# Load the raw vaccination csv file into a dataframe
## Dataset name: Daily count of vaccination by age groups
## Data resource name: Daily count of vaccination by age groups (English)
## URL: https://data.gov.hk/en-data/dataset/hk-hhb-hhbcovid19-vaccination-rates-over-time-by-age/resource/932f3f2d-70e4-4b7c-b3f7-6d935e3ffc94
vacc_csv_path = f'{project_dir}/data/raw_data/hk/vaccination-rates-over-time-by-age.csv'
covid_hk_vacc_df = pd.read_csv(vacc_csv_path)

# Basic Information of Data Set

In [4]:
# Display the first 10 records to get a feel for the columns and their values
covid_hk_vacc_df.head(10)

Unnamed: 0,Date,Age Group,Sex,Sinovac 1st dose,Sinovac 2nd dose,Sinovac 3rd dose,Sinovac 4th dose,Sinovac 5th dose,Sinovac 6th dose,Sinovac 7th dose,BioNTech 1st dose,BioNTech 2nd dose,BioNTech 3rd dose,BioNTech 4th dose,BioNTech 5th dose,BioNTech 6th dose,BioNTech 7th dose
0,2021-02-22,30-39,M,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2021-02-22,40-49,F,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2021-02-22,40-49,M,11,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2021-02-22,50-59,F,2,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2021-02-22,50-59,M,10,0,0,0,0,0,0,0,0,0,0,0,0,0
5,2021-02-22,60-69,F,6,0,0,0,0,0,0,0,0,0,0,0,0,0
6,2021-02-22,60-69,M,7,0,0,0,0,0,0,0,0,0,0,0,0,0
7,2021-02-22,70-79,F,1,0,0,0,0,0,0,0,0,0,0,0,0,0
8,2021-02-22,70-79,M,8,0,0,0,0,0,0,0,0,0,0,0,0,0
9,2021-02-23,12-19,M,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
# Basic information of dataframe: column names, non-null counts and dtypes
# ('Date' is still an object column here; it is converted to datetime further down)
covid_hk_vacc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19893 entries, 0 to 19892
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Date               19893 non-null  object
 1   Age Group          19893 non-null  object
 2   Sex                19893 non-null  object
 3   Sinovac 1st dose   19893 non-null  int64 
 4   Sinovac 2nd dose   19893 non-null  int64 
 5   Sinovac 3rd dose   19893 non-null  int64 
 6   Sinovac 4th dose   19893 non-null  int64 
 7   Sinovac 5th dose   19893 non-null  int64 
 8   Sinovac 6th dose   19893 non-null  int64 
 9   Sinovac 7th dose   19893 non-null  int64 
 10  BioNTech 1st dose  19893 non-null  int64 
 11  BioNTech 2nd dose  19893 non-null  int64 
 12  BioNTech 3rd dose  19893 non-null  int64 
 13  BioNTech 4th dose  19893 non-null  int64 
 14  BioNTech 5th dose  19893 non-null  int64 
 15  BioNTech 6th dose  19893 non-null  int64 
 16  BioNTech 7th dose  19893 non-null  int64

# Modify Data Types

In [6]:
# Convert the 'Date' column from string (YYYY-MM-DD) to datetime
covid_hk_vacc_df['Date'] = pd.to_datetime(covid_hk_vacc_df['Date'],  format='%Y-%m-%d')

# Exploratory Data Analysis (EDA)

## Univariate Analysis

### Field - Date

In [7]:
# Earliest and latest dates covered by the data set (kept as strings for reuse below)
date_bounds = covid_hk_vacc_df['Date'].agg(['min', 'max']).astype('str')
min_date, max_date = date_bounds.to_list()

print('Date range of the dataset:',
      f'Min date: {min_date}',
      f'Max date: {max_date}',
      sep='\n')

Date range of the dataset:
Min date: 2021-02-22
Max date: 2024-07-07


In [8]:
# Calendar dates between min_date and max_date that have no record in the dataset
date_df = covid_hk_vacc_df[['Date']].set_index('Date')

# Every calendar day in the covered range, minus the days actually present
full_date_range = pd.date_range(start=min_date, end=max_date)
missing_dates = full_date_range.difference(date_df.index).astype('str')

print('The following dates are not included in the dataset:',
      ', '.join(repr(missing_date) for missing_date in missing_dates))

The following dates are not included in the dataset: '2021-02-24', '2021-02-25', '2023-09-01', '2024-02-10'


### Field - Sex

In [9]:
# Unique values of the sex column, listed in sorted order
sex_values = covid_hk_vacc_df['Sex'].drop_duplicates().sort_values()
print('Distinct values in sex column are:',
      ', '.join(repr(sex_value) for sex_value in sex_values))

Distinct values in sex column are: 'F', 'M'


### Field - Age Group

In [10]:
# Unique values of the age group column, listed in sorted order
age_group_values = covid_hk_vacc_df['Age Group'].drop_duplicates().sort_values()
print('Distinct values in age group column are:',
      ', '.join(repr(age_group) for age_group in age_group_values))

Distinct values in age group column are: '0-11', '12-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80 and above'


### Fields - Sinovac and BioNTech columns

In [11]:
# Total doses administered, per vaccine brand and dose ordinal
vacc_list = ['BioNTech', 'Sinovac']
ord_list = ['1st', '2nd', '3rd', '4th', '5th', '6th', '7th']

print('Total Doses Administered:')
for vacc_type in vacc_list:
    # The loop variable is not named `ord`: that would shadow the built-in ord()
    # for every cell executed afterwards in this kernel.
    for dose_ord in ord_list:
        col_name = f'{vacc_type} {dose_ord} dose'
        print('{}: {:,}'.format(col_name, covid_hk_vacc_df[col_name].sum()))
    # Blank line between the two vaccine brands
    print('')

Total Doses Administered:
BioNTech 1st dose: 3,825,705
BioNTech 2nd dose: 3,770,715
BioNTech 3rd dose: 3,519,646
BioNTech 4th dose: 775,580
BioNTech 5th dose: 134,681
BioNTech 6th dose: 24,339
BioNTech 7th dose: 1,714

Sinovac 1st dose: 3,094,264
Sinovac 2nd dose: 3,032,513
Sinovac 3rd dose: 2,342,060
Sinovac 4th dose: 425,794
Sinovac 5th dose: 56,095
Sinovac 6th dose: 8,056
Sinovac 7th dose: 568



## Multivariate Analysis

### Fields - Date, Age Group and Sex

In [12]:
# Count how many records each date has
rows_per_date = covid_hk_vacc_df.groupby('Date')['Date'].count()
print('Number of rows for each date:', rows_per_date, sep='\n')

Number of rows for each date:
Date
2021-02-22     9
2021-02-23    15
2021-02-26    16
2021-02-27    16
2021-02-28    16
              ..
2024-07-03    14
2024-07-04    15
2024-07-05    16
2024-07-06    13
2024-07-07     6
Name: Date, Length: 1228, dtype: int64


There are 9 age groups and 2 sexes, but not all dates have all 18 combinations.

In [13]:
# Show an example date that has fewer than 18 rows
is_sample_date = covid_hk_vacc_df['Date'] == '2023-07-30'
covid_hk_vacc_df.loc[is_sample_date]

Unnamed: 0,Date,Age Group,Sex,Sinovac 1st dose,Sinovac 2nd dose,Sinovac 3rd dose,Sinovac 4th dose,Sinovac 5th dose,Sinovac 6th dose,Sinovac 7th dose,BioNTech 1st dose,BioNTech 2nd dose,BioNTech 3rd dose,BioNTech 4th dose,BioNTech 5th dose,BioNTech 6th dose,BioNTech 7th dose
15193,2023-07-30,0-11,F,0,1,0,0,0,0,0,1,1,1,0,0,0,0
15194,2023-07-30,0-11,M,0,1,0,0,0,0,0,1,3,2,0,0,0,0
15195,2023-07-30,50-59,F,0,0,0,1,0,0,0,0,0,0,0,0,0,0
15196,2023-07-30,50-59,M,0,0,0,0,1,0,0,0,0,0,0,1,0,0
15197,2023-07-30,60-69,F,0,0,0,0,1,0,0,0,0,0,0,0,0,0
15198,2023-07-30,60-69,M,0,0,0,0,1,0,0,0,0,0,0,0,1,0
15199,2023-07-30,70-79,F,0,0,0,1,1,0,0,0,0,0,0,0,0,0
15200,2023-07-30,70-79,M,0,0,0,1,0,0,0,0,0,0,0,0,0,0


Assumption: A missing row means that no vaccination was recorded for that age group and sex on that date.

In [14]:
# Show the rows whose total dose count (all brands, all ordinals) is 0
vacc_list = ['Sinovac', 'BioNTech']
ord_list = ['1st', '2nd', '3rd', '4th', '5th', '6th', '7th']

# Names of all per-brand, per-ordinal dose columns.
# Built in a comprehension so that no loop variable (in particular one named `ord`,
# which would shadow the built-in) is left behind in the kernel namespace.
dose_col_list = [f'{vacc_type} {dose_ord} dose'
                 for vacc_type in vacc_list
                 for dose_ord in ord_list]

# Row-wise total over the dose columns, computed in one vectorised call
covid_hk_vacc_df['Dose (All)'] = covid_hk_vacc_df[dose_col_list].sum(axis=1)

covid_hk_vacc_df[covid_hk_vacc_df['Dose (All)'] == 0]

Unnamed: 0,Date,Age Group,Sex,Sinovac 1st dose,Sinovac 2nd dose,Sinovac 3rd dose,Sinovac 4th dose,Sinovac 5th dose,Sinovac 6th dose,Sinovac 7th dose,BioNTech 1st dose,BioNTech 2nd dose,BioNTech 3rd dose,BioNTech 4th dose,BioNTech 5th dose,BioNTech 6th dose,BioNTech 7th dose,Dose (All)
19701,2024-06-21,40-49,M,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


However, there is 1 row in which no dose of any type was recorded. <br>
Assumption: Typo / manual data-entry error.

### Fields - Date, Sinovac and BioNTech columns

In [15]:
# Create a year-month column with datetime data type:
# format each date as 'YYYYMM', then parse that text back into a datetime
year_month_text = covid_hk_vacc_df['Date'].dt.strftime('%Y%m')
covid_hk_vacc_df['Date_YYYYMM'] = pd.to_datetime(year_month_text, format='%Y%m')

In [16]:
# NOTE: this cell is disabled - every line below is commented out, so no chart is rendered.
# It would plot the monthly totals of the first 3 Sinovac doses as a line chart.
# NOTE(review): `ax.plot_date` is deprecated in recent Matplotlib releases; if this cell is
# re-enabled, consider `ax.plot` instead - confirm against the installed version.

# # Sinovac first 3 dose column names
# sinov_first3_dose_col_list = ['Sinovac 1st dose', 'Sinovac 2nd dose', 'Sinovac 3rd dose']

# # Sinovac aggregated counts
# vacc_sinov_agg_cnt = covid_hk_vacc_df \
#     .groupby('Date_YYYYMM') \
#     [sinov_first3_dose_col_list] \
#     .sum() \
#     .reset_index()

# # Figure and axis
# fig, ax = plt.subplots(figsize=(16, 8))

# # Plot line charts for counts of the first 3 dose
# for col in sinov_first3_dose_col_list:
#     ax.plot_date(vacc_sinov_agg_cnt['Date_YYYYMM'],
#                  vacc_sinov_agg_cnt[col],
#                  marker='', linestyle='-', label=col)
  
# # Title 
# plt.title('Line Chart of Sinovac Vaccination Counts') 

# # x-axis label
# plt.xlabel('Year-Month') 
# fig.autofmt_xdate()

# # Legend
# plt.legend(loc="upper right")

# # Show the plot
# plt.show()

In [17]:
# NOTE: this cell is disabled - every line below is commented out, so no chart is rendered.
# It would plot the monthly totals of the first 3 BioNTech doses as a line chart.
# NOTE(review): `ax.plot_date` is deprecated in recent Matplotlib releases; if this cell is
# re-enabled, consider `ax.plot` instead - confirm against the installed version.

# # BioNTech first 3 dose column names
# biont_first3_dose_col_list = ['BioNTech 1st dose', 'BioNTech 2nd dose', 'BioNTech 3rd dose']

# # BioNTech aggregated counts
# vacc_biont_agg_cnt = covid_hk_vacc_df \
#     .groupby('Date_YYYYMM') \
#     [biont_first3_dose_col_list] \
#     .sum() \
#     .reset_index()

# # Figure and axis
# fig, ax = plt.subplots(figsize=(16, 8))

# # Plot line charts for counts of the first 3 dose
# for col in biont_first3_dose_col_list:
#     ax.plot_date(vacc_biont_agg_cnt['Date_YYYYMM'],
#                  vacc_biont_agg_cnt[col],
#                  marker='', linestyle='-', label=col)
  
# # Title 
# plt.title('Line Chart of BioNTech Vaccination Counts') 

# # x-axis label
# plt.xlabel('Year-Month') 
# fig.autofmt_xdate()

# # Legend
# plt.legend(loc="upper right")

# # Show the plot
# plt.show()

## Missing Value Analysis

In [18]:
# Number of missing values in each column
# (print_missing_val_count comes from covid_module, which is defined outside this notebook;
#  by this point the dataframe also holds the derived 'Dose (All)' and 'Date_YYYYMM' columns)
covid_module.print_missing_val_count(covid_hk_vacc_df)

This dataframe does not have missing values.
