# The University of Hong Kong
## DASC7600 Data Science Project 2024
## STD - HK - Temperature

# Import Modules and Settings

In [1]:
import pandas as pd
import warnings

import covid_module

# Settings: install a blanket "ignore" filter so no warnings are printed
# by the cells that follow.
warnings.filterwarnings(action='ignore')

# Load Data

In [2]:
# Read the daily mean / minimum / maximum temperature csv files.
# The first two lines of every file are skipped before the header row is read
# (presumably title lines above the column names -- confirm against the raw files).
hk_daily_avg_temp_df, hk_daily_min_temp_df, hk_daily_max_temp_df = (
    pd.read_csv(raw_csv_path, skiprows=2)
    for raw_csv_path in (
        './data/raw_data/hk/CLMTEMP_HKO_.csv',
        './data/raw_data/hk/CLMMINT_HKO_.csv',
        './data/raw_data/hk/CLMMAXT_HKO_.csv',
    )
)

In [3]:
# Gather the three temperature dataframes (average, minimum, maximum) in one
# list so that the same cleaning step can be applied to each of them in a loop.
df_list = [
    hk_daily_avg_temp_df,
    hk_daily_min_temp_df,
    hk_daily_max_temp_df,
]

# Modify Column Names

In [4]:
# Map the bilingual source headers to short English column names
col_name_map = {
    '年/Year': 'report_year',
    '月/Month': 'report_month',
    '日/Day': 'report_day',
    '數值/Value': 'daily_temp',
    '數據完整性/data Completeness': 'comple',
}

# Apply the same renaming, in place, to every dataframe
for df in df_list:
    df.rename(columns=col_name_map, inplace=True)

# Copy the generic 'daily_temp' column into a column named after the statistic
# each dataframe holds, so the three can be told apart once they are merged.
for temp_df, temp_col in (
    (hk_daily_avg_temp_df, 'avg_temp'),
    (hk_daily_min_temp_df, 'min_temp'),
    (hk_daily_max_temp_df, 'max_temp'),
):
    temp_df[temp_col] = temp_df['daily_temp']

# Drop Comment Rows and Old Records

In [5]:
for df in df_list:
    # Parse the year numerically for filtering only; the 'report_year' column
    # itself is left untouched. Rows whose year is not a number (the legend /
    # comment rows such as '*** 沒有數據/unavailable',
    # '# 數據不完整/data incomplete' and 'C 數據完整/data Complete')
    # become NaN here.
    report_year_num = pd.to_numeric(df['report_year'], errors='coerce')

    # Drop comment rows (non-numeric year) and old records (2019 or earlier).
    # Comparing numbers instead of strings avoids relying on lexicographic
    # ordering and also works when the column was parsed as a numeric dtype.
    rows_to_drop = df.index[report_year_num.isna() | (report_year_num <= 2019)]

    # Drop in place so the dataframes referenced elsewhere are updated too
    df.drop(rows_to_drop, inplace=True)

# Feature Creation

In [6]:
# Build a 'report_date' column from the year, month and day columns
for df in df_list:
    # Cast each part to int and then to str before joining
    year_txt, month_txt, day_txt = (
        df[part_col].astype('int').astype('str')
        for part_col in ('report_year', 'report_month', 'report_day')
    )
    df['report_date'] = year_txt + '/' + month_txt + '/' + day_txt

    # Project helper: called with 'report_date' as both source and target column
    # and the formats '%Y/%m/%d' -> '%Y%m%d' (presumably rewrites the column in
    # place -- confirm in covid_module).
    covid_module.change_date_format(
        df,
        'report_date', '%Y/%m/%d',
        'report_date', '%Y%m%d',
    )

# Feature Transformation

In [7]:
# TBD

# Combine 3 Dataframes

In [8]:
# Start from the average-temperature dataframe (date parts + avg_temp) ...
hk_daily_temp_std = hk_daily_avg_temp_df[
    ['report_date', 'report_year', 'report_month', 'report_day', 'avg_temp']
]

# ... then left-join the minimum and maximum temperatures on the report date,
# in that order, so every row of the average dataframe is kept.
for other_temp_df, other_temp_col in (
    (hk_daily_min_temp_df, 'min_temp'),
    (hk_daily_max_temp_df, 'max_temp'),
):
    hk_daily_temp_std = hk_daily_temp_std.merge(
        other_temp_df[['report_date', other_temp_col]],
        how='left',
        on='report_date',
    )

# Export Data Pre-processing Result to csv File

In [9]:
# Write the combined daily temperature table to csv, without the index column
std_csv_path = './data/std_data/hk/hk_daily_temp_std.csv'
hk_daily_temp_std.to_csv(std_csv_path, index=False)