# The University of Hong Kong
## DASC7600 Data Science Project 2024

# Import modules

In [1]:
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

# Functions

In [2]:
def get_import_local_col(value: str) -> str:
    """Classify a case-classification label as imported or local.

    The match is a case-insensitive substring test, and 'IMPORT' is checked
    before 'LOCAL', so a label containing both words is reported as 'import'.

    Args:
        value: Raw classification text. Non-string input (for example the
            NaN/None that pandas uses for missing cells) is tolerated.

    Returns:
        'import', 'local', or 'NA' when neither keyword is present or the
        input is not a string.
    """
    # Missing cells arrive as float NaN / None and have no .upper()
    if not isinstance(value, str):
        return 'NA'

    # Normalise once instead of once per comparison
    label = value.upper()
    if 'IMPORT' in label:
        return 'import'
    if 'LOCAL' in label:
        return 'local'
    return 'NA'

def get_year_month_part(df: pd.DataFrame,
                        column_name: str,
                        date_format: str) -> None:
    """Add year and year-month string columns derived from a date column.

    For a source column named ``<prefix>_date`` the new columns are
    ``<prefix>_year`` (formatted '%Y') and ``<prefix>_year_month``
    (formatted '%Y%m'). A source column without the ``_date`` suffix keeps
    its full name as the prefix.

    Args:
        df: DataFrame to modify in place.
        column_name: Name of the column holding the date strings.
        date_format: ``strftime``-style format used to parse the column.

    Returns:
        None. The new columns are written directly into ``df``.
    """
    prefix = column_name.removesuffix('_date')

    # Parse once; values that do not match date_format become NaT
    parsed_dates = pd.to_datetime(df[column_name], format=date_format, errors='coerce')

    for date_part, date_part_format in (('year', '%Y'), ('year_month', '%Y%m')):
        # Unparseable dates format to a missing value, replaced with 'NA'
        df[f'{prefix}_{date_part}'] = parsed_dates.dt.strftime(date_part_format).fillna('NA')

def one_hot_encoding(df: pd.DataFrame,
                     column_name: str) -> pd.DataFrame:
    """Replace a categorical column with one 0/1 indicator column per value.

    Indicator columns are named ``<column_name>_<value>`` and are appended
    in order of each value's first appearance, so the output layout is the
    same on every run. Missing values (NaN/None) get no indicator column;
    such rows are 0 in every indicator.

    Args:
        df: Input DataFrame. It is not modified.
        column_name: Name of the categorical column to encode.

    Returns:
        A new DataFrame without ``column_name`` and with the indicator
        columns appended.
    """
    source = df[column_name]

    # Vectorised comparison instead of a row-wise apply; the f-string also
    # lets non-string categories be used in the new column name
    indicator_columns = {
        f'{column_name}_{value}': (source == value).astype('int64')
        for value in source.dropna().unique()
    }

    # drop() and assign() both return new frames, leaving the caller's df intact
    return df.drop(columns=column_name).assign(**indicator_columns)

# Load Data

In [3]:
# Load the raw Hong Kong COVID-19 case file into a DataFrame
covid_hk_std = pd.read_csv(
    './data/raw_data/hk/enhanced_sur_covid_19_eng.csv',
)

# Modify Column Names

In [4]:
# Map the raw CSV headers to the snake_case names used in the rest of the notebook
col_name_map = {
    'Case no.': 'case_id',
    'Report date': 'report_date',
    'Date of onset': 'onset_date',
    'Gender': 'gender',
    'Age': 'age',
    'Name of hospital admitted': 'hospital_name',
    'Hospitalised/Discharged/Deceased': 'case_outcome',
    'HK/Non-HK resident': 'resident',
    'Classification*': 'case_type',
    'Case status*': 'case_status',
}

# Headers that are not keys of the map keep their original name
covid_hk_std = covid_hk_std.rename(columns=col_name_map)

# Drop non-informative columns

In [5]:
# Drop the case identifier and the hospital name columns
covid_hk_std = covid_hk_std.drop(columns=['case_id', 'hospital_name'])

# Drop deleted records

In [6]:
# Keep only the records whose case status is not "Deleted"
covid_hk_std = covid_hk_std.loc[covid_hk_std['case_status'] != 'Deleted']

# Fill NA and replace values in columns

In [7]:
# Replace every missing value with the string "NA"
covid_hk_std = covid_hk_std.fillna(value='NA')

In [8]:
# Treat "Pending" and "Unknown" as missing: map both to "NA" in every column
placeholder_to_na = dict.fromkeys(('Pending', 'Unknown'), 'NA')
covid_hk_std = covid_hk_std.replace(placeholder_to_na)

In [9]:
# Column-specific clean-up for "onset_date" and "age"

# "Asymptomatic" becomes "NA"; the month-only descriptions are mapped to the
# 15th of that month in 2020
onset_date_replacements = {
    'Asymptomatic': 'NA',
    'January': '15/01/2020',
    'Mid-March': '15/03/2020',
    'Mid-July': '15/07/2020',
    'October': '15/10/2020',
}

# An age recorded as "<1" is stored as "0"
age_replacements = {'<1': '0'}

covid_hk_std = covid_hk_std.replace({
    'onset_date': onset_date_replacements,
    'age': age_replacements,
})

# Feature Creation

In [10]:
# Create the age group column from the rows whose age is known
known_age = covid_hk_std['age'] != 'NA'
age_as_int = covid_hk_std.loc[known_age, 'age'].astype('int')

# Bins are closed on the left (right=False): [0, 20), [20, 40), ..., [80, 101).
# NOTE(review): rows with age "NA", and any age outside 0-100, end up with a
# missing age_group rather than the "NA" string used in other columns - confirm
# this is intended.
covid_hk_std['age_group'] = pd.cut(
    age_as_int,
    bins=[0, 20, 40, 60, 80, 101],
    labels=['0-19', '20-39', '40-59', '60-79', '80-100'],
    right=False,
)

In [11]:
# Flag each case as "import", "local" or "NA" based on its classification text
covid_hk_std['import_local'] = covid_hk_std['case_type'].apply(get_import_local_col)

In [12]:
# Create the "report_year" and "report_year_month" columns from the report date
get_year_month_part(covid_hk_std,
                    column_name='report_date',
                    date_format='%d/%m/%Y')
# The same derivation for the onset date is currently disabled:
# get_year_month_part(covid_hk_std, 'onset_date', '%d/%m/%Y')

# Feature Transformation

In [13]:
# TBD

# Categorical Features Handling

In [14]:
# One-hot encode the gender column; the original column is replaced by indicators
covid_hk_std = one_hot_encoding(covid_hk_std, column_name='gender')
# Encoding of the other categorical columns is currently disabled:
# covid_hk_std = one_hot_encoding(covid_hk_std, 'resident')
# covid_hk_std = one_hot_encoding(covid_hk_std, 'case_type')

# Export Data Pre-processing Result to csv File

In [15]:
# Write the pre-processed data set to the standardised-data folder.
# NOTE(review): to_csv writes the row index as the first, unnamed column by
# default - confirm downstream readers expect it before changing this.
covid_hk_std.to_csv(path_or_buf='./data/std_data/hk/covid_hk_std.csv')