# The University of Hong Kong
## DASC7600 Data Science Project 2024
## STD - HK - Policy

# Import Modules and Settings

In [1]:
import os
import sys

# Add project directory to system path
# NOTE: the project directory is taken as the parent of the current working
# directory, so this only resolves correctly when the notebook is run from a
# direct sub-folder of the project root.
project_dir = os.path.dirname(os.getcwd())
# Guard the append so that re-running this cell does not add duplicate entries.
if project_dir not in sys.path:
    sys.path.append(project_dir)

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import warnings

import covid_module

# Settings
# Show every column when a dataframe is displayed (no column truncation)
pd.options.display.max_columns = None
# Silence all warnings for the rest of the notebook
warnings.simplefilter('ignore')

%matplotlib inline

# Load Data

In [3]:
# Read csv file
## The dataset was prepared manually from the website below.
## Therefore, there is no EDA notebook; this STD notebook is used instead to display the dataset structure and the values in its columns.
## Website name: news.gov.hk
## URL: https://www.news.gov.hk/eng/categories/covid19/index.html
policy_csv_path = project_dir + '/data/std_data/hk/covid_hk_policy_std.csv'
covid_hk_policy_std = pd.read_csv(policy_csv_path)

# Basic Information of Data Set

In [4]:
# Preview the first 5 records (5 is the default row count of head())
covid_hk_policy_std.head()

Unnamed: 0,report_date,school,no_hong_kong_p,hong_kong_group,hong_kong_all,home_other_14,home_21,home_14,home_7,home_3,type_1_close,type_2_close,type_3_close,people2,people4,people8,0500_1800,0500_2200,0500_0200
0,20200101,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,20200102,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,20200103,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,20200104,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,20200105,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
# Basic information of dataframe: index range, column names, non-null counts and dtypes
covid_hk_policy_std.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1155 entries, 0 to 1154
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   report_date      1155 non-null   int64  
 1   school           1155 non-null   float64
 2   no_hong_kong_p   1155 non-null   int64  
 3   hong_kong_group  1155 non-null   int64  
 4   hong_kong_all    1155 non-null   int64  
 5   home_other_14    1155 non-null   int64  
 6   home_21          1155 non-null   int64  
 7   home_14          1155 non-null   int64  
 8   home_7           1155 non-null   int64  
 9   home_3           1155 non-null   int64  
 10  type_1_close     1155 non-null   int64  
 11  type_2_close     1155 non-null   int64  
 12  type_3_close     1155 non-null   int64  
 13  people2          1155 non-null   int64  
 14  people4          1155 non-null   int64  
 15  people8          1155 non-null   int64  
 16  0500_1800        1155 non-null   int64  
 17  0500_2200     

# Modify Data Types

In [6]:
# Modify data type for datetime column
# report_date is loaded as an integer in yyyymmdd form (e.g. 20200101), hence the explicit format
covid_hk_policy_std['report_date'] = pd.to_datetime(covid_hk_policy_std['report_date'], format='%Y%m%d')

## Field - report_date

In [7]:
# Earliest and latest report dates of the data set, converted to strings
date_bounds = covid_hk_policy_std['report_date'].agg(['min', 'max']).astype('str')
min_date, max_date = date_bounds.to_list()

# One print call with a newline separator emits the same three lines
print('Date range of dateset:', f'Min date: {min_date}', f'Max date: {max_date}', sep='\n')

Date range of dateset:
Min date: 2020-01-01
Max date: 2023-02-28


In [8]:
# Find calendar dates between min_date and max_date that have no row in the dataset
date_df = covid_hk_policy_std[['report_date']].set_index('report_date')
full_date_index = pd.date_range(start=min_date, end=max_date)
missing_date_index = full_date_index.difference(date_df.index).astype('str')

if not missing_date_index.empty:
    # Report only the first and last missing dates
    first_missing = missing_date_index.min()
    last_missing = missing_date_index.max()
    print(f'There are missing dates in the dataset, between {first_missing} and {last_missing}.')
else:
    print('There is no missing date in the dataset.')

There is no missing date in the dataset.


## Fields - All columns (except Date)

In [9]:
# Distinct values in every column other than report_date
print('Please find below the distinct values in each column.')
for col in covid_hk_policy_std.columns:
    if col == 'report_date':
        continue
    distinct_values = covid_hk_policy_std[col].drop_duplicates().to_list()
    # Joining the reprs gives the list's text form without its enclosing brackets
    distinct_text = ', '.join(map(repr, distinct_values))
    print(f'{col}: {distinct_text}')

Please find below the distinct values in each column.
school: 0.0, 1.0, 0.5
no_hong_kong_p: 0, 1
hong_kong_group: 0, 1
hong_kong_all: 0, 1
home_other_14: 0, 1
home_21: 0, 1
home_14: 0, 1
home_7: 0, 1
home_3: 0, 1
type_1_close: 0, 1
type_2_close: 0, 1
type_3_close: 0, 1
people2: 0, 1
people4: 0, 1
people8: 0, 1
0500_1800: 0, 1
0500_2200: 0, 1
0500_0200: 0, 1


The descriptions of the columns and the meanings of their values are listed below: <br>
“school” is an indicator column for school closure: 1 means face-to-face classes were suspended; 0.5 means only some face-to-face classes were allowed; 0 means no restriction. <br>

“no_hong_kong_p”, “hong_kong_group” and “hong_kong_all” are indicator columns of travel restrictions. <br>
“no_hong_kong_p”: Travel restrictions on non-Hong Kong residents <br>
“hong_kong_group”: Travel restrictions on Hong Kong residents for some countries/regions only <br>
“hong_kong_all”: Travel restrictions on Hong Kong residents in all regions. <br>

“home_other_14”, “home_21”, “home_14”, “home_7” and “home_3” are indicator columns for mandatory quarantine or medical surveillance: <br>
“home_other_14”: 14 days in designated locations <br>
“home_21”, “home_14”, “home_7” and “home_3”: 21, 14, 7 and 3 days in designated hotels, respectively. <br>

“type_1_close”, “type_2_close” and “type_3_close” are indicator columns for the closure of the following public entertainment venues. <br>
“type_1_close”: karaoke, bathhouse, party room, nightclubs and dance venues. <br>
“type_2_close”: ice skating rinks, fitness centers and gyms, and massage parlors. <br>
“type_3_close”: cinemas and theaters, sports venues and beauty salons. <br>

“people2”, “people4” and “people8” are indicator columns for maximum number of people allowed to stay together. “people2”, “people4” and “people8”: At most 2, 4 and 8 people allowed respectively. <br>

“0500_1800”, “0500_2200” and “0500_0200” are indicator columns for the restriction of restaurant opening hours. “0500_1800”, “0500_2200” and “0500_0200”: restaurants may open from 05:00 to 18:00, from 05:00 to 22:00, and from 05:00 to 02:00 the following day, respectively. <br>

## Missing Value Analysis

In [10]:
# Number of missing values in each column
# (print_missing_val_count comes from the project's covid_module; it prints its report directly)
covid_module.print_missing_val_count(covid_hk_policy_std)

This dataframe does not have missing values.
