# The University of Hong Kong
## DASC7600 Data Science Project 2024
## STD - HK - Average Humidity

# Import Modules and Settings

In [1]:
import os
import sys

# Use the parent of the current working directory as the project directory
# and append it to the module search path
# (presumably so that project-local modules such as covid_module can be imported)
project_dir = os.path.split(os.getcwd())[0]
sys.path.append(project_dir)

In [2]:
import pandas as pd
import warnings

import covid_module

# Settings
# Install a catch-all 'ignore' filter so that no warnings are shown from here on
warnings.filterwarnings(action='ignore')

# Load Data

In [3]:
# Read the raw csv file from the project data folder
# (skiprows=2: the first two lines of the file are not parsed)
raw_humid_csv_path = f'{project_dir}/data/raw_data/hk/daily_HKO_RH_ALL.csv'
hk_daily_avg_humid_std = pd.read_csv(raw_humid_csv_path, skiprows=2)

# Drop Non-Informative Column

In [4]:
# Remove the data completeness indicator; it is not used in the standardized output
hk_daily_avg_humid_std = hk_daily_avg_humid_std.drop(columns=['數據完整性/data Completeness'])

# Modify Column Names

In [5]:
# Map the bilingual source headers to the standardized column names
col_name_map = {
    '年/Year': 'report_year',
    '月/Month': 'report_month',
    '日/Day': 'report_day',
    '數值/Value': 'avg_humid',
}

# Apply the mapping; columns not listed in the map keep their names
hk_daily_avg_humid_std = hk_daily_avg_humid_std.rename(columns=col_name_map)

# Drop Comment Rows and Old Records

In [6]:
# Drop comment rows
# Rows whose 'report_year' holds one of the legend/comment strings below are
# not data records, so they are removed (in place) before any numeric handling.
comment_row_mask = hk_daily_avg_humid_std['report_year'] \
    .isin(['*** 沒有數據/unavailable',
           '# 數據不完整/data incomplete',
           'C 數據完整/data Complete'])

hk_daily_avg_humid_std \
    .drop(hk_daily_avg_humid_std[comment_row_mask].index,
          inplace=True)

# Drop old records
# Compare the year as a number rather than as text: a string comparison against
# '2019' is only correct while every value is a 4-digit string and raises a
# TypeError if the column contains numeric values. The stored column itself is
# left untouched; only this temporary numeric view is used for the cutoff.
# Values that cannot be parsed become NaN, compare as False, and are kept.
report_year_num = pd.to_numeric(hk_daily_avg_humid_std['report_year'], errors='coerce')

hk_daily_avg_humid_std \
    .drop(hk_daily_avg_humid_std[report_year_num <= 2019].index,
          inplace=True)

# Feature Creation

In [7]:
# Build a 'year/month/day' string from the three date component columns
# (each component is cast to int and then to str, so there is no zero padding)
year_str, month_str, day_str = (
    hk_daily_avg_humid_std[part_col].astype('int').astype('str')
    for part_col in ('report_year', 'report_month', 'report_day')
)
hk_daily_avg_humid_std['report_date'] = year_str + '/' + month_str + '/' + day_str

# Pass the new column to the project helper together with the source and target formats
# NOTE(review): presumably rewrites 'report_date' in place from '%Y/%m/%d' to '%Y%m%d' - confirm in covid_module
covid_module.change_date_format(
    hk_daily_avg_humid_std,
    'report_date', '%Y/%m/%d',
    'report_date', '%Y%m%d',
)

# Export Data Pre-processing Result to CSV File

In [8]:
# Write the standardized table to the project's std_data folder, without the index column
std_humid_csv_path = f'{project_dir}/data/std_data/hk/hk_daily_avg_humid_std.csv'
hk_daily_avg_humid_std.to_csv(std_humid_csv_path, index=False)