In [1]:
%reload_ext autoreload
%autoreload 2

# Link external variables to health data

We need to link the processed climate and socioeconomic/demographic features to the sample dengue dataset. List of external variables are enumerated in the file imports section. 

In [2]:
import pandas as pd
import geopandas as gpd
import numpy as np

from pathlib import Path
import os
from loguru import logger
from tqdm import tqdm

import sys

sys.path.append("../../")
# import directories
from src.settings import (
    DATA_DIR,
    RAW_DIR,
    PROCESSED_DIR,
    OUTPUT_DIR,
    CLIMATE_VARIABLES_LIST,
    PROJ_CRS,
    METRIC_CRS,
)

# import utils
from src.model_data_prep import *



In [3]:
# file directories

CLIMATE_DIR = RAW_DIR / "climate"

OSM_DIR = OUTPUT_DIR / "osm"

POP_COUNT_DIR = OUTPUT_DIR / "worldpop" / "population_count"
POP_D_DIR = OUTPUT_DIR / "worldpop" / "population_density"

NIGHTLIGHTS_DIR = OUTPUT_DIR / "nightlights"
OOKLA_DIR = OUTPUT_DIR / "ookla"

GEOPORTAL_DOH = OUTPUT_DIR / "doh_health_geoportal.csv"
RWI = OUTPUT_DIR / "rwi" / "RWI_stats_reshaped.csv"
HAZARDS = OUTPUT_DIR / "noah" / "hz_proportion.csv"
LANDCOVER = OUTPUT_DIR / "landcover_features_ESA_2021.csv"
BLDGS = OUTPUT_DIR / "google_bldgs_v3_features.csv"

# accessibility of health facilities
HOSPITAL_ISO = PROCESSED_DIR / "revised_hospitals_pop_reached_citylevel.csv"
HEALTHCENTER_ISO = PROCESSED_DIR / "revised_brgy_healthcenter_pop_reached_citylevel.csv"
RHU_ISO = PROCESSED_DIR / "revised_rhu_pop_reached_citylevel.csv"

In [4]:
LABELED_CASES = PROCESSED_DIR / "health" / "pidsr_city_weekly_zamboanga_labeled_v2.csv"

## Load Aggregated Zamboanga Dengue Data

In [5]:
dengue_df = pd.read_csv(LABELED_CASES)
dengue_df.head()

Unnamed: 0,Source,Year,Month,Week,Date,Region,PSGC_Region,Municipality,PSGC_Municipality,ICD,Disease,Cases,Claims,Deaths,Case_Type,Date_Type,outbreak,outbreak_group
0,PIDSR-DOH,2008.0,1.0,1,2008-01-07,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,"FEVER, DENGUE",0.0,,,Morbidity,Weekly,0,0
1,PIDSR-DOH,2008.0,1.0,2,2008-01-07,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,"FEVER, DENGUE",0.0,,,Morbidity,Weekly,0,0
2,PIDSR-DOH,2008.0,1.0,3,2008-01-14,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,"FEVER, DENGUE",0.0,,,Morbidity,Weekly,0,0
3,PIDSR-DOH,2008.0,1.0,4,2008-01-21,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,"FEVER, DENGUE",0.0,,,Morbidity,Weekly,0,0
4,PIDSR-DOH,2008.0,1.0,5,2008-01-28,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,"FEVER, DENGUE",0.0,,,Morbidity,Weekly,0,0


# Load admin boundaries

In [6]:
admin_bounds = gpd.read_file("../../data/01-admin-bounds/target_admin_bounds.shp")
admin_bounds.head(2)

Unnamed: 0,ADM1_EN,ADM1_PCODE,ADM2_EN,ADM2_PCODE,ADM3_EN,ADM3_PCODE,ADM4_EN,ADM4_PCODE,geometry
0,Region I,PH010000000,Pangasinan,PH015500000,Dagupan City,PH015518000,Lomboy,PH015518016,"POLYGON ((120.32742 16.05423, 120.32719 16.053..."
1,Region I,PH010000000,Pangasinan,PH015500000,Dagupan City,PH015518000,Tapuac,PH015518031,"POLYGON ((120.33380 16.03974, 120.33389 16.039..."


In [7]:
admin_bounds = admin_bounds.to_crs(METRIC_CRS)
admin_bounds["brgy_total_area"] = admin_bounds.area
admin_bounds = admin_bounds.to_crs(PROJ_CRS)

In [8]:
# get adm3_pcode
zambo_adm4_pcodes = admin_bounds[admin_bounds["ADM3_EN"] == "Zamboanga City"][
    "ADM4_PCODE"
].tolist()
print(zambo_adm4_pcodes)

['PH097332062', 'PH097332028', 'PH097332067', 'PH097332095', 'PH097332060', 'PH097332034', 'PH097332902', 'PH097332027', 'PH097332056', 'PH097332044', 'PH097332020', 'PH097332093', 'PH097332047', 'PH097332051', 'PH097332061', 'PH097332090', 'PH097332901', 'PH097332080', 'PH097332038', 'PH097332100', 'PH097332092', 'PH097332097', 'PH097332096', 'PH097332023', 'PH097332010', 'PH097332070', 'PH097332098', 'PH097332050', 'PH097332039', 'PH097332064', 'PH097332073', 'PH097332076', 'PH097332032', 'PH097332002', 'PH097332099', 'PH097332089', 'PH097332101', 'PH097332053', 'PH097332071', 'PH097332037', 'PH097332085', 'PH097332904', 'PH097332001', 'PH097332083', 'PH097332087', 'PH097332043', 'PH097332021', 'PH097332084', 'PH097332035', 'PH097332031', 'PH097332017', 'PH097332046', 'PH097332019', 'PH097332004', 'PH097332052', 'PH097332058', 'PH097332045', 'PH097332048', 'PH097332033', 'PH097332016', 'PH097332063', 'PH097332026', 'PH097332069', 'PH097332059', 'PH097332091', 'PH097332072', 'PH097332

In [9]:
len(zambo_adm4_pcodes)

101

# Aggregate all climate variables into one dataframe

In [10]:
# climate_df_basic_agg = []
# climate_df_weighted_avg = []
result_list = []

for var in tqdm(CLIMATE_VARIABLES_LIST):
    prepped_df = prep_climate_var_df(var)
    basic_agg = align_climate_var(prepped_df, var)
    weighted_avg = climate_weighted_avg(var, prepped_df, admin_bounds)
    # climate_df_basic_agg.append(basic_agg)
    # climate_df_weighted_avg.append(weighted_avg)
    result = basic_agg.merge(
        weighted_avg, on=["start_of_week", "ADM4_PCODE"], how="left"
    )
    result_list.append(result)

  0%|          | 0/19 [00:00<?, ?it/s]

100%|██████████| 19/19 [52:10<00:00, 164.78s/it] 


In [11]:
result_list[0]

Unnamed: 0,start_of_week,ADM4_PCODE,CO_AVG,CO_MIN,CO_MAX,CO_STD,WEIGHTED_AVG_CO
0,2013-01-01,PH015518001,0.097867,0.0878,0.1159,0.010291,0.097867
1,2013-01-01,PH015518002,0.097867,0.0878,0.1159,0.010291,0.097867
2,2013-01-01,PH015518003,0.097867,0.0878,0.1159,0.010291,0.097867
3,2013-01-01,PH015518004,0.097867,0.0878,0.1159,0.010291,0.097867
4,2013-01-01,PH015518006,0.097867,0.0878,0.1159,0.010291,0.097867
...,...,...,...,...,...,...,...
465865,2022-12-26,PH137603005,0.122800,0.1203,0.1285,0.003062,0.122800
465866,2022-12-26,PH137603006,0.122800,0.1203,0.1285,0.003062,0.122800
465867,2022-12-26,PH137603007,0.122800,0.1203,0.1285,0.003062,0.122800
465868,2022-12-26,PH137603008,0.122800,0.1203,0.1285,0.003062,0.122800


In [12]:
# merge into one dataframe

# Initialize an empty dataframe for merged data
climate_merged_df = pd.DataFrame()

# Merge dataframes one by one
for df in result_list:
    if climate_merged_df.empty:
        climate_merged_df = df
    else:
        # Merge on 'date' and 'adm4_pcode' columns
        climate_merged_df = pd.merge(
            climate_merged_df, df, on=["start_of_week", "ADM4_PCODE"], how="outer"
        )

climate_merged_df.head()

Unnamed: 0,start_of_week,ADM4_PCODE,CO_AVG,CO_MIN,CO_MAX,CO_STD,WEIGHTED_AVG_CO,HI_AVG,HI_MIN,HI_MAX,...,UVR_AVG,UVR_MIN,UVR_MAX,UVR_STD,WEIGHTED_AVG_UVR,WS_AVG,WS_MIN,WS_MAX,WS_STD,WEIGHTED_AVG_WS
0,2013-01-01,PH015518001,0.097867,0.0878,0.1159,0.010291,0.097867,29.063333,27.35,30.04,...,22.53,20.02,24.58,1.550019,22.53,1.521667,0.59,2.88,0.807228,1.521667
1,2013-01-01,PH015518002,0.097867,0.0878,0.1159,0.010291,0.097867,29.063333,27.35,30.04,...,22.53,20.02,24.58,1.550019,22.53,1.521667,0.59,2.88,0.807228,1.521667
2,2013-01-01,PH015518003,0.097867,0.0878,0.1159,0.010291,0.097867,29.063333,27.35,30.04,...,22.53,20.02,24.58,1.550019,22.53,1.521667,0.59,2.88,0.807228,1.521667
3,2013-01-01,PH015518004,0.097867,0.0878,0.1159,0.010291,0.097867,29.063333,27.35,30.04,...,22.53,20.02,24.58,1.550019,22.53,1.521667,0.59,2.88,0.807228,1.521667
4,2013-01-01,PH015518006,0.097867,0.0878,0.1159,0.010291,0.097867,29.063333,27.35,30.04,...,22.53,20.02,24.58,1.550019,22.53,1.521667,0.59,2.88,0.807228,1.521667


In [14]:
climate_merged_df.to_csv(
    PROCESSED_DIR / "climate_aggregated_weekly_brgy.csv", index=False
)

In [100]:
climate_merged_df = pd.read_csv(PROCESSED_DIR / "climate_aggregated_weekly_brgy.csv")

In [101]:
# filter to zamboanga barangays
climate_merged_df_zambo = climate_merged_df.copy()
climate_merged_df_zambo = climate_merged_df_zambo[
    climate_merged_df_zambo["ADM4_PCODE"].isin(zambo_adm4_pcodes)
]
climate_merged_df_zambo = climate_merged_df_zambo.merge(
    admin_bounds[["ADM3_PCODE", "ADM4_PCODE"]], on="ADM4_PCODE", how="left"
)
climate_merged_df_zambo.insert(
    1, "ADM3_PCODE", climate_merged_df_zambo.pop("ADM3_PCODE")
)
climate_merged_df_zambo.head(3)

Unnamed: 0,start_of_week,ADM3_PCODE,ADM4_PCODE,CO_AVG,CO_MIN,CO_MAX,CO_STD,WEIGHTED_AVG_CO,HI_AVG,HI_MIN,...,UVR_AVG,UVR_MIN,UVR_MAX,UVR_STD,WEIGHTED_AVG_UVR,WS_AVG,WS_MIN,WS_MAX,WS_STD,WEIGHTED_AVG_WS
0,2013-01-01,PH097332000,PH097332001,0.065233,0.0577,0.0716,0.005224,0.065233,28.933333,27.91,...,23.128333,12.07,28.49,6.118733,23.128333,1.868333,0.25,3.7,1.153159,1.868333
1,2013-01-01,PH097332000,PH097332002,0.065233,0.0577,0.0716,0.005224,0.065233,28.303333,26.98,...,22.923333,10.66,28.19,6.683384,22.923333,1.823333,0.46,3.36,1.026522,1.823333
2,2013-01-01,PH097332000,PH097332004,0.065233,0.0577,0.0716,0.005224,0.065233,28.303333,26.98,...,22.923333,10.66,28.19,6.683384,22.923333,1.823333,0.46,3.36,1.026522,1.823333


# Aggregate geopatial exposure variables

## OSM features

In [102]:
osm_files = os.listdir(OSM_DIR)
print(osm_files)

['osm_features_water_2016.csv', 'osm_features_water_2018.csv', 'osm-poi-updated-feat-2014.csv', 'osm_features_water_2020.csv', 'osm_features_waterways_2020.csv', 'osm_features_waterways_2021.csv', 'osm-poi-updated-feat-2022.csv', 'osm-poi-updated-feat-2018.csv', 'osm_features_waterways_2016.csv', 'osm_features_waterways_2014.csv', 'osm-poi-updated-feat-2016.csv', 'osm_features_waterways_2019.csv', 'osm_features_water_2015.csv', 'osm_features_waterways_2017.csv', 'osm_features_waterways_2015.csv', 'osm_features_waterways_2018.csv', 'osm_features_water_2019.csv', 'osm_features_water_2017.csv', 'osm-poi-updated-feat-2021.csv', 'osm_features_water_2022.csv', 'osm-poi-updated-feat-2017.csv', 'osm_features_waterways_2022.csv', 'osm_features_water_2014.csv', 'osm_features_water_2021.csv', 'osm-poi-updated-feat-2019.csv', 'osm-poi-updated-feat-2020.csv', 'osm-poi-updated-feat-2015.csv']


In [103]:
# separate filenames by osm type
osm_pois_files = [filename for filename in osm_files if "poi" in filename]
osm_waterway_files = [filename for filename in osm_files if "waterway" in filename]
osm_water_files = [filename for filename in osm_files if "water_" in filename]

# combine each osm type to one dataframe
osm_pois = combine_indiv_files(OSM_DIR, osm_pois_files)
osm_waterway = combine_indiv_files(OSM_DIR, osm_waterway_files)
osm_water = combine_indiv_files(OSM_DIR, osm_water_files)

In [104]:
# merge into one dataframe
merged_osm = osm_pois.merge(osm_waterway, on=["date", "ADM4_PCODE", "freq"])
merged_osm = merged_osm.merge(osm_water, on=["date", "ADM4_PCODE", "freq"])

# add year column and remove freq col
merged_osm = add_year(merged_osm)

# filter to zamboanga barangays only
merged_osm = merged_osm[merged_osm["ADM4_PCODE"].isin(zambo_adm4_pcodes)]

merged_osm = merged_osm.merge(
    admin_bounds[["ADM3_PCODE", "ADM4_PCODE", "brgy_total_area"]],
    on="ADM4_PCODE",
    how="left",
)
merged_osm.insert(0, "ADM3_PCODE", merged_osm.pop("ADM3_PCODE"))
merged_osm.insert(4, "brgy_total_area", merged_osm.pop("brgy_total_area"))
merged_osm.head(2)

Unnamed: 0,ADM3_PCODE,ADM4_PCODE,date,year,brgy_total_area,poi_count,clinic_count,clinic_nearest,dentist_count,dentist_nearest,...,waste_transfer_station_nearest,osm_river_nearest,osm_stream_nearest,osm_canal_nearest,osm_drain_nearest,osm_wetland_nearest,osm_reservoir_nearest,osm_water_nearest,osm_riverbank_nearest,osm_dock_nearest
0,PH097332000,PH097332001,2014-01-01,2014,961262.7,0.0,0.0,10000.0,0.0,10000.0,...,10000.0,0.0,2491.364593,4406.041222,8392.320981,0.0,10000.0,1463.991302,0.0,10000.0
1,PH097332000,PH097332002,2014-01-01,2014,4139359.0,0.0,0.0,10000.0,0.0,10000.0,...,10000.0,0.0,1175.647907,10000.0,10000.0,10000.0,10000.0,7248.859907,1846.055233,10000.0


In [105]:
exclude_osm = [
    # unneeded osm features
    "atm_count",
    "atm_nearest",
    "bank_count",
    "bank_nearest",
    "college_count",
    "college_nearest",
    "community_centre_count",
    "community_centre_nearest",
    "comms_tower_count",
    "comms_tower_nearest",
    "convenience_count",
    "convenience_nearest",
    "fire_station_count",
    "fire_station_nearest",
    "kindergarten_count",
    "kindergarten_nearest",
    "lighthouse_count",
    "lighthouse_nearest",
    "market_place_count",
    "market_place_nearest",
    "park_count",
    "park_nearest",
    "public_building_count",
    "public_building_nearest",
    "police_count",
    "police_nearest",
    "school_count",
    "school_nearest",
    "shelter_count",
    "shelter_nearest",
    "supermarket_count",
    "supermarket_nearest",
    "telephone_count",
    "telephone_nearest",
    "tower_count",
    "tower_nearest",
    "town_hall_count",
    "town_hall_nearest",
    "university_count",
    "university_nearest",
    "cable_count",
    "cable_nearest",
    "compensator_count",
    "compensator_nearest",
    "connection_count",
    "connection_nearest",
    "converter_count",
    "converter_nearest",
    "generator_count",
    "generator_nearest",
    "insulator_count",
    "insulator_nearest",
    "line_count",
    "line_nearest",
    "busbar_count",
    "busbar_nearest",
    "bay_count",
    "bay_nearest",
    "minor_line_count",
    "minor_line_nearest",
    "plant_count",
    "plant_nearest",
    "pole_count",
    "pole_nearest",
    "portal_count",
    "portal_nearest",
    "substation_count",
    "substation_nearest",
    "tower_count_y",
    "transformer_count",
    "transformer_nearest",
    "exchange_count",
    "exchange_nearest",
    "connection_point_count",
    "connection_point_nearest",
    "distribution_point_count",
    "distribution_point_nearest",
    "service_device_count",
    "service_device_nearest",
    "data_center_count",
    "data_center_nearest",
]

# Remove columns based on the list of strings or prefixes
columns_to_keep = [
    col
    for col in merged_osm.columns
    if not any(col.startswith(prefix) for prefix in exclude_osm)
]
filtered_osm_df = merged_osm[columns_to_keep]

In [106]:
# subset columns to count and nearest
# for count just get the sum and mean
poi_count_cols = [col for col in filtered_osm_df.columns if "_count" in col]
nearest_dist_cols = [col for col in filtered_osm_df.columns if "_nearest" in col]
info_cols = ["ADM3_PCODE", "ADM4_PCODE", "date", "year", "brgy_total_area"]

# subset to dataframes for count and nearest features
poi_count_df = filtered_osm_df[info_cols + poi_count_cols]
nearest_df = filtered_osm_df[info_cols + nearest_dist_cols]

# convert poi counts to city level
city_poi_df = convert_to_city(
    poi_count_df,
    key_columns=["ADM3_PCODE", "ADM4_PCODE", "date", "year"],
    agg_list=[("sum", "sum"), ("mean", "mean")],
)
city_poi_df = city_poi_df.drop(
    columns=["date", "brgy_total_area_sum", "brgy_total_area_mean"]
)
# get weighted avg
nearest_df_list = []
for dist_col in nearest_dist_cols:
    df = (
        nearest_df.groupby(["year", "ADM4_PCODE"])
        .apply(weighted_average, dist_col)
        .rename(f"weighted_avg_{dist_col}")
    )
    df = df.reset_index()
    nearest_df_list.append(df)

# merge to one dataframe
nearest_df_merged = merge_multi_dfs(
    nearest_df_list, merge_on_cols=["year", "ADM4_PCODE"]
)
nearest_df_merged = nearest_df_merged.merge(
    admin_bounds[["ADM3_PCODE", "ADM4_PCODE"]], on="ADM4_PCODE", how="left"
)
nearest_df_merged.insert(0, "ADM3_PCODE", nearest_df_merged.pop("ADM3_PCODE"))

city_nearest_df = convert_to_city(
    nearest_df_merged,
    key_columns=["ADM3_PCODE", "ADM4_PCODE", "year"],
    agg_list=[("mean", "mean")],
)

In [107]:
# merge into one
city_osm = city_poi_df.merge(city_nearest_df, on=["ADM3_PCODE", "year"])
city_osm.head(2)

Unnamed: 0,ADM3_PCODE,year,poi_count_sum,poi_count_mean,clinic_count_sum,clinic_count_mean,dentist_count_sum,dentist_count_mean,doctors_count_sum,doctors_count_mean,...,weighted_avg_waste_transfer_station_nearest_mean,weighted_avg_osm_river_nearest_mean,weighted_avg_osm_stream_nearest_mean,weighted_avg_osm_canal_nearest_mean,weighted_avg_osm_drain_nearest_mean,weighted_avg_osm_wetland_nearest_mean,weighted_avg_osm_reservoir_nearest_mean,weighted_avg_osm_water_nearest_mean,weighted_avg_osm_riverbank_nearest_mean,weighted_avg_osm_dock_nearest_mean
0,PH097332000,2014,64.0,0.633663,0.0,0.0,0.0,0.0,0.0,0.0,...,10000.0,606.745114,1433.584489,4793.248204,6481.345338,5457.234797,10000.0,2994.32823,4127.148433,10000.0
1,PH097332000,2015,851.0,8.425743,0.0,0.0,2.0,0.019802,0.0,0.0,...,10000.0,606.745114,1433.584489,4793.248204,6481.345338,5457.234797,10000.0,2994.32823,4127.148433,10000.0


## Population features

In [108]:
pop_count_files = os.listdir(POP_COUNT_DIR)
pop_density_files = os.listdir(POP_D_DIR)

In [109]:
pop_count_df = combine_indiv_files(POP_COUNT_DIR, pop_count_files)
pop_density_df = combine_indiv_files(POP_D_DIR, pop_density_files)

In [110]:
merged_population = pop_count_df.merge(
    pop_density_df, on=["date", "ADM4_PCODE", "freq"]
)
merged_population = add_year(merged_population)
merged_population = merged_population[
    merged_population["ADM4_PCODE"].isin(zambo_adm4_pcodes)
]  # filter to zamboanga barangays only
merged_population = merged_population.merge(
    admin_bounds[["ADM3_PCODE", "ADM4_PCODE", "brgy_total_area"]],
    on="ADM4_PCODE",
    how="left",
)
merged_population.insert(0, "ADM3_PCODE", merged_population.pop("ADM3_PCODE"))
merged_population.insert(4, "brgy_total_area", merged_population.pop("brgy_total_area"))
merged_population.head()

Unnamed: 0,ADM3_PCODE,ADM4_PCODE,date,year,brgy_total_area,pop_count_total,pop_count_mean,pop_count_median,pop_count_stdev,pop_count_min,pop_count_max,pop_density_mean,pop_density_median,pop_density_stdev,pop_density_min,pop_density_max
0,PH097332000,PH097332001,2000-01-01,2000,961262.7,5733.472168,59.723668,41.130318,45.626944,17.22714,218.46463,8621.525391,8621.525391,0.0,8621.525391,8621.525391
1,PH097332000,PH097332002,2000-01-01,2000,4139359.0,9514.5625,20.157971,6.06998,29.318064,0.728731,212.672974,2379.857422,2388.723389,1833.508759,437.172852,4304.810059
2,PH097332000,PH097332004,2000-01-01,2000,1300887.0,16094.892578,107.299284,98.10009,63.451642,20.900679,319.702301,19084.578125,19084.578125,4136.836914,14947.742188,23221.416016
3,PH097332000,PH097332005,2000-01-01,2000,34470930.0,6577.566406,1.657235,1.319477,1.180323,0.08547,8.077068,184.606885,152.145767,108.799295,23.923004,382.564636
4,PH097332000,PH097332010,2000-01-01,2000,8235039.0,5116.272461,5.665861,4.24681,4.959968,1.071326,69.962112,713.516211,698.731445,354.95202,221.705765,1480.763916


In [111]:
merged_population = merged_population[
    ["ADM3_PCODE", "ADM4_PCODE", "date", "year", "brgy_total_area", "pop_count_total"]
]
merged_population.head(2)

Unnamed: 0,ADM3_PCODE,ADM4_PCODE,date,year,brgy_total_area,pop_count_total
0,PH097332000,PH097332001,2000-01-01,2000,961262.7,5733.472168
1,PH097332000,PH097332002,2000-01-01,2000,4139359.0,9514.5625


In [112]:
# sum total population
pop_count_total_sum = (
    merged_population.groupby(["ADM3_PCODE", "year"])["pop_count_total"]
    .sum()
    .reset_index()
)

# process barangay area (group by adm4_en) and sum into adm3_en
brgy_total_area_mean = (
    merged_population.dropna()
    .groupby(["ADM3_PCODE", "ADM4_PCODE"])["brgy_total_area"]
    .mean()
    .reset_index()
)
brgy_total_area_sum = (
    brgy_total_area_mean.dropna().groupby("ADM3_PCODE").sum().reset_index()
)

# merge into a single dataframe
merged_brgy_sum = pd.merge(
    pop_count_total_sum, brgy_total_area_sum, on="ADM3_PCODE", how="left"
)

# rename columns
merged_brgy_sum.rename(
    columns={
        "pop_count_total": "pop_count_total",
        "brgy_total_area": "city_total_area",
    },
    inplace=True,
)

# convert to km2
merged_brgy_sum["city_total_area"] = merged_brgy_sum["city_total_area"] / 1_000_000

# create new column for pop_density
merged_brgy_sum["pop_density_per_m2"] = merged_brgy_sum["pop_count_total"] / (
    merged_brgy_sum["city_total_area"]
)

# drop city area column
merged_brgy_sum = merged_brgy_sum.drop(columns=["city_total_area"])
merged_brgy_sum = merged_brgy_sum.replace(0, np.nan)
merged_brgy_sum.sort_values("year")

  brgy_total_area_mean.dropna().groupby("ADM3_PCODE").sum().reset_index()


Unnamed: 0,ADM3_PCODE,year,pop_count_total,pop_density_per_m2
0,PH097332000,2000,636441.101075,415.785339
1,PH097332000,2001,640249.55453,418.273392
2,PH097332000,2002,654950.973494,427.877791
3,PH097332000,2003,675238.506535,441.131584
4,PH097332000,2004,677129.094577,442.366701
5,PH097332000,2005,690686.697508,451.223848
6,PH097332000,2006,705817.73222,461.108914
7,PH097332000,2007,701936.852028,458.573545
8,PH097332000,2008,724155.315517,473.08881
9,PH097332000,2009,734271.027293,479.697379


In [113]:
city_population = convert_to_city(
    merged_population,
    key_columns=["ADM3_PCODE", "ADM4_PCODE", "date", "year"],
    agg_list=[("mean", "mean")],
)
city_population = city_population.drop(columns=["date", "brgy_total_area_mean"])
city_population = city_population.rename(
    columns={"pop_count_total_mean": "brgy_pop_count_mean"}
)

In [114]:
city_population_merge = merged_brgy_sum.merge(
    city_population, on=["ADM3_PCODE", "year"], how="left"
)
city_population_merge

Unnamed: 0,ADM3_PCODE,year,pop_count_total,pop_density_per_m2,brgy_pop_count_mean
0,PH097332000,2000,636441.101075,415.785339,6301.39704
1,PH097332000,2001,640249.55453,418.273392,6339.1045
2,PH097332000,2002,654950.973494,427.877791,6484.663104
3,PH097332000,2003,675238.506535,441.131584,6685.529768
4,PH097332000,2004,677129.094577,442.366701,6704.248461
5,PH097332000,2005,690686.697508,451.223848,6838.482154
6,PH097332000,2006,705817.73222,461.108914,6988.294378
7,PH097332000,2007,701936.852028,458.573545,6949.869822
8,PH097332000,2008,724155.315517,473.08881,7169.854609
9,PH097332000,2009,734271.027293,479.697379,7270.010171


## Connectivity Features (Ookla)

In [115]:
ookla_files = os.listdir(OOKLA_DIR)
print(ookla_files)

['ookla_features_2021.csv', 'ookla_features_2019.csv', 'ookla_features_2022.csv', 'ookla_features_2020.csv']


In [116]:
ookla_df = combine_indiv_files(OOKLA_DIR, ookla_files)
ookla_df = ookla_df.drop(columns=["Unnamed: 0", "freq"])
ookla_df["date"] = pd.to_datetime(ookla_df["date"])
ookla_df.insert(2, "year", ookla_df["date"].dt.year)
ookla_df.head(2)

Unnamed: 0,ADM4_PCODE,date,year,fixed_mean_avg_d_kbps_mean,fixed_mean_avg_u_kbps_mean,fixed_mean_avg_lat_ms_mean,fixed_mean_num_tests_mean,fixed_mean_num_devices_mean,mobile_mean_avg_d_kbps_mean,mobile_mean_avg_u_kbps_mean,mobile_mean_avg_lat_ms_mean,mobile_mean_num_tests_mean,mobile_mean_num_devices_mean
735,PH015518001,2019-10-01,2019,2741.940307,2211.481662,5.627849,11.043301,2.538388,1703.69706,782.911132,8.600433,0.626745,0.473824
733,PH015518002,2019-10-01,2019,1932.898273,1446.769077,3.475531,3.405412,1.318271,1433.374015,986.309648,5.5757,0.384723,0.293926


In [117]:
ookla_df = ookla_df[
    ookla_df["ADM4_PCODE"].isin(zambo_adm4_pcodes)
]  # filter to zamboanga barangays only
ookla_df = ookla_df.merge(
    admin_bounds[["ADM3_PCODE", "ADM4_PCODE", "brgy_total_area"]],
    on="ADM4_PCODE",
    how="left",
)
ookla_df.insert(0, "ADM3_PCODE", ookla_df.pop("ADM3_PCODE"))
ookla_df.insert(4, "brgy_total_area", ookla_df.pop("brgy_total_area"))
ookla_df = convert_to_city(
    ookla_df,
    key_columns=["ADM3_PCODE", "ADM4_PCODE", "date", "year"],
    agg_list=[("mean", "mean")],
)
ookla_df

Unnamed: 0,ADM3_PCODE,date,year,brgy_total_area_mean,fixed_mean_avg_d_kbps_mean_mean,fixed_mean_avg_u_kbps_mean_mean,fixed_mean_avg_lat_ms_mean_mean,fixed_mean_num_tests_mean_mean,fixed_mean_num_devices_mean_mean,mobile_mean_avg_d_kbps_mean_mean,mobile_mean_avg_u_kbps_mean_mean,mobile_mean_avg_lat_ms_mean_mean,mobile_mean_num_tests_mean_mean,mobile_mean_num_devices_mean_mean
0,PH097332000,2019-10-01,2019,15155410.0,1102.588328,1077.213193,4.727306,8.142313,1.951016,711.849265,368.961823,4.316471,0.871976,0.483579
1,PH097332000,2020-10-01,2020,15155410.0,1038.854877,1051.777101,2.25923,10.502841,2.294825,686.93784,375.813284,2.718934,1.43344,0.529142
2,PH097332000,2021-10-01,2021,15155410.0,2479.527676,2249.181705,2.039032,12.2753,2.867956,1055.005818,406.759671,2.782771,2.488941,1.069597
3,PH097332000,2022-10-01,2022,15155410.0,3634.927677,3249.635544,1.657208,9.54462,2.624948,1588.900661,451.654118,3.02335,4.385754,1.531314


## Nightlights

In [118]:
# aggregate into one table as well
ntl_files = os.listdir(NIGHTLIGHTS_DIR)
print(ntl_files)

['nightlights_2016.csv', 'nightlights_2019.csv', 'nightlights_2018.csv', 'nightlights_2021.csv', 'nightlights_2015.csv', 'nightlights_2022.csv', 'nightlights_2012.csv', 'nightlights_2013.csv', 'nightlights_2017.csv', 'nightlights_2014.csv', 'nightlights_2020.csv']


In [119]:
ntl_df = combine_indiv_files(NIGHTLIGHTS_DIR, ntl_files)
ntl_df = add_year(ntl_df)
ntl_df = ntl_df[
    ntl_df["ADM4_PCODE"].isin(zambo_adm4_pcodes)
]  # filter to zamboanga barangays only
ntl_df = ntl_df.merge(
    admin_bounds[["ADM3_PCODE", "ADM4_PCODE", "brgy_total_area"]],
    on="ADM4_PCODE",
    how="left",
)
ntl_df.insert(0, "ADM3_PCODE", ntl_df.pop("ADM3_PCODE"))
ntl_df.insert(4, "brgy_total_area", ntl_df.pop("brgy_total_area"))

ntl_df = convert_to_city(
    ntl_df,
    key_columns=["ADM3_PCODE", "ADM4_PCODE", "date", "year"],
    agg_list=[("mean", "mean")],
)
ntl_df

Unnamed: 0,ADM3_PCODE,date,year,brgy_total_area_mean,avg_rad_min_mean,avg_rad_max_mean,avg_rad_mean_mean,avg_rad_std_mean,avg_rad_median_mean
0,PH097332000,2012-01-01,2012,15155410.0,1.639831,5.460402,3.030957,1.149283,2.856192
1,PH097332000,2013-01-01,2013,15155410.0,1.678676,5.612633,3.081904,1.173999,2.907438
2,PH097332000,2014-01-01,2014,15155410.0,1.738257,5.266028,2.946094,1.036254,2.805252
3,PH097332000,2015-01-01,2015,15155410.0,1.76007,5.257482,2.979118,1.031192,2.8479
4,PH097332000,2016-01-01,2016,15155410.0,1.589075,4.353375,2.613751,0.832275,2.500951
5,PH097332000,2017-01-01,2017,15155410.0,2.164469,5.65707,3.425482,1.0548,3.280053
6,PH097332000,2018-01-01,2018,15155410.0,2.244963,5.713613,3.444397,1.040448,3.262335
7,PH097332000,2019-01-01,2019,15155410.0,2.246399,5.687465,3.460818,1.017337,3.300401
8,PH097332000,2020-01-01,2020,15155410.0,2.27833,5.602449,3.458333,0.987384,3.328969
9,PH097332000,2021-01-01,2021,15155410.0,2.239734,7.009234,3.974466,1.448885,3.714241


## Load Static features

### Geoportal health facilities

In [120]:
health_facilities_doh_df = pd.read_csv(GEOPORTAL_DOH)
health_facilities_doh_df = add_year(health_facilities_doh_df)
health_facilities_doh_df = health_facilities_doh_df[
    health_facilities_doh_df["ADM4_PCODE"].isin(zambo_adm4_pcodes)
]  # filter to zamboanga barangays only
health_facilities_doh_df = health_facilities_doh_df.merge(
    admin_bounds[["ADM3_PCODE", "ADM4_PCODE", "brgy_total_area"]],
    on="ADM4_PCODE",
    how="left",
)
health_facilities_doh_df.insert(
    0, "ADM3_PCODE", health_facilities_doh_df.pop("ADM3_PCODE")
)
health_facilities_doh_df.insert(
    4, "brgy_total_area", health_facilities_doh_df.pop("brgy_total_area")
)

In [121]:
# do same thing as osm part
poi_count_cols = [col for col in health_facilities_doh_df.columns if "_count" in col]
nearest_dist_cols = [
    col for col in health_facilities_doh_df.columns if "_nearest" in col
]
info_cols = ["ADM3_PCODE", "ADM4_PCODE", "date", "year", "brgy_total_area"]

# subset to dataframes for count and nearest features
poi_count_df = health_facilities_doh_df[info_cols + poi_count_cols]
nearest_df = health_facilities_doh_df[info_cols + nearest_dist_cols]

city_doh_poi_df = convert_to_city(
    poi_count_df,
    key_columns=["ADM3_PCODE", "ADM4_PCODE", "date", "year"],
    agg_list=[("sum", "sum"), ("mean", "mean")],
)
city_doh_poi_df = city_doh_poi_df.drop(
    columns=["date", "brgy_total_area_sum", "brgy_total_area_mean"]
)

# get weighted avg
nearest_df_list = []
for dist_col in nearest_dist_cols:
    df = (
        nearest_df.groupby(["year", "ADM4_PCODE"])
        .apply(weighted_average, dist_col)
        .rename(f"weighted_avg_{dist_col}")
    )
    df = df.reset_index()
    nearest_df_list.append(df)

# merge to one dataframe
nearest_df_merged = merge_multi_dfs(
    nearest_df_list, merge_on_cols=["year", "ADM4_PCODE"]
)
nearest_df_merged = nearest_df_merged.merge(
    admin_bounds[["ADM3_PCODE", "ADM4_PCODE"]], on="ADM4_PCODE", how="left"
)
nearest_df_merged.insert(0, "ADM3_PCODE", nearest_df_merged.pop("ADM3_PCODE"))

city_doh_nearest_df = convert_to_city(
    nearest_df_merged,
    key_columns=["ADM3_PCODE", "ADM4_PCODE", "year"],
    agg_list=[("mean", "mean")],
)

# merge into one
city_doh = city_doh_poi_df.merge(city_doh_nearest_df, on=["ADM3_PCODE", "year"])
city_doh.head(2)

Unnamed: 0,ADM3_PCODE,year,doh_pois_count_sum,doh_pois_count_mean,doh_brgy_health_station_count_sum,doh_brgy_health_station_count_mean,doh_rural_health_unit_count_sum,doh_rural_health_unit_count_mean,doh_hospital_count_sum,doh_hospital_count_mean,...,doh_medical_clinic_count_sum,doh_medical_clinic_count_mean,weighted_avg_doh_brgy_health_station_nearest_mean,weighted_avg_doh_rural_health_unit_nearest_mean,weighted_avg_doh_hospital_nearest_mean,weighted_avg_doh_birthing_home_lying_in_clinic_nearest_mean,weighted_avg_doh_infirmary_nearest_mean,weighted_avg_doh_drug_abuse_treatment_rehabilitation_center_nearest_mean,weighted_avg_doh_social_hygiene_clinic_nearest_mean,weighted_avg_doh_medical_clinic_nearest_mean
0,PH097332000,2022,128.0,1.267327,82.0,0.811881,16.0,0.158416,11.0,0.108911,...,0.0,0.0,294.157561,2255.243275,5378.512598,2259.555427,8847.226434,10000.0,10000.0,10000.0


In [122]:
rwi_df = pd.read_csv(RWI)
rwi_df = rwi_df.drop(columns=["Unnamed: 0", "Year"])
rwi_df = add_year(rwi_df)

# filter to zamboanga barangays only
rwi_df = rwi_df[rwi_df["ADM4_PCODE"].isin(zambo_adm4_pcodes)]

# add city/adm3_pcode
rwi_df = rwi_df.merge(
    admin_bounds[["ADM3_PCODE", "ADM4_PCODE"]], on="ADM4_PCODE", how="left"
)

# aggregate to city level
rwi_df = convert_to_city(
    rwi_df,
    key_columns=["ADM3_PCODE", "ADM4_PCODE", "date", "year"],
    agg_list=[("mean", "mean")],
)
rwi_df = rwi_df.drop(columns=["date"])
rwi_df.head(2)

Unnamed: 0,ADM3_PCODE,year,RWI_max_mean,RWI_mean_mean,RWI_median_mean,RWI_min_mean,RWI_std_mean
0,PH097332000,2016,0.495014,0.412015,0.410069,0.343276,0.04266
1,PH097332000,2017,0.509977,0.432567,0.432628,0.35538,0.041584


In [123]:
hazards_df = pd.read_csv(HAZARDS)
hazards_df = add_year(hazards_df)

# filter to zamboanga barangays only
hazards_df = hazards_df[hazards_df["ADM4_PCODE"].isin(zambo_adm4_pcodes)]

# add city/adm3_pcode
hazards_df = hazards_df.merge(
    admin_bounds[["ADM3_PCODE", "ADM4_PCODE"]], on="ADM4_PCODE", how="left"
)

# aggregate to city level
hazards_df = convert_to_city(
    hazards_df,
    key_columns=["ADM3_PCODE", "ADM4_PCODE", "date", "year"],
    agg_list=[("mean", "mean")],
)
hazards_df = hazards_df.drop(columns=["date"])
hazards_df.head(2)

Unnamed: 0,ADM3_PCODE,year,pct_area_flood_hazard_100yr_low_mean,pct_area_flood_hazard_100yr_med_mean,pct_area_flood_hazard_100yr_high_mean,pct_area_flood_hazard_25yr_low_mean,pct_area_flood_hazard_25yr_med_mean,pct_area_flood_hazard_25yr_high_mean,pct_area_flood_hazard_5yr_low_mean,pct_area_flood_hazard_5yr_med_mean,pct_area_flood_hazard_5yr_high_mean,pct_area_landslide_hazard_low_mean,pct_area_landslide_hazard_med_mean,pct_area_landslide_hazard_high_mean
0,PH097332000,2021,11.963866,10.18882,3.436566,7.516484,5.431349,1.395336,5.267785,2.931399,0.813011,6.161432,16.813583,6.15745


In [124]:
landcover_df = pd.read_csv(LANDCOVER)
landcover_df = add_year(landcover_df)

# filter to zamboanga barangays only
landcover_df = landcover_df[landcover_df["ADM4_PCODE"].isin(zambo_adm4_pcodes)]


# add city/adm3_pcode
landcover_df = landcover_df.merge(
    admin_bounds[["ADM3_PCODE", "ADM4_PCODE"]], on="ADM4_PCODE", how="left"
)

landcover_df = convert_to_city(
    landcover_df,
    key_columns=["ADM3_PCODE", "ADM4_PCODE", "date", "year"],
    agg_list=[("mean", "mean")],
)
landcover_df = landcover_df.drop(columns=["date"])
landcover_df.head(2)

Unnamed: 0,ADM3_PCODE,year,pct_area_bare_sparse_vegetation_mean,pct_area_builtup_mean,pct_area_cropland_mean,pct_area_grassland_mean,pct_area_herbaceous_wetland_mean,pct_area_mangroves_mean,pct_area_permanent_water_bodies_mean,pct_area_shrubland_mean,pct_area_tree_cover_mean
0,PH097332000,2021,0.574257,21.051881,2.781881,6.282871,0.067921,7.54604,13.458812,0.011287,64.952475


In [125]:
infra_df = pd.read_csv(BLDGS)
infra_df = infra_df.drop(columns=["Unnamed: 0"])
infra_df = add_year(infra_df)

# filter to zamboanga barangays only
infra_df = infra_df[infra_df["ADM4_PCODE"].isin(zambo_adm4_pcodes)]

# add city/adm3_pcode
infra_df = infra_df.merge(
    admin_bounds[["ADM3_PCODE", "ADM4_PCODE"]], on="ADM4_PCODE", how="left"
)

infra_df = convert_to_city(
    infra_df,
    key_columns=["ADM3_PCODE", "ADM4_PCODE", "date", "year"],
    agg_list=[("sum", "sum"), ("mean", "mean")],
)
infra_df = infra_df.drop(columns=["date", "year"])
infra_df.head(2)

Unnamed: 0,ADM3_PCODE,google_bldgs_count_sum,google_bldgs_count_mean,google_bldgs_area_total_sum,google_bldgs_area_total_mean,google_bldgs_area_mean_sum,google_bldgs_area_mean_mean,google_bldgs_count_lt100_sqm_sum,google_bldgs_count_lt100_sqm_mean,google_bldgs_count_100_200_sqm_sum,google_bldgs_count_100_200_sqm_mean,google_bldgs_count_gt_200_sqm_sum,google_bldgs_count_gt_200_sqm_mean,google_bldgs_density_sum,google_bldgs_density_mean,google_bldgs_pct_built_up_area_sum,google_bldgs_pct_built_up_area_mean
0,PH097332000,307045,3040.049505,20637380.0,204330.512491,6560.34809,64.953941,258800,2562.376238,37023,366.564356,11222,111.108911,0.112676,0.001116,908.037478,8.99047


## Aggregate the city level isochrones into one table

In [126]:
cols_to_drop = [
    "ADM1_EN",
    "ADM1_PCODE",
    "ADM2_EN",
    "ADM2_PCODE",
    "ADM3_EN",
    "total_population_count",
]

hospital_reach_df = pd.read_csv(HOSPITAL_ISO)
hospital_reach_df = hospital_reach_df.drop(columns=cols_to_drop)
hospital_reach_df.head(2)

Unnamed: 0,ADM3_PCODE,hospital_travel_time,pop_count_cumsum,pct_population_reached
0,PH015518000,5,121978.07,58.55
1,PH015518000,6,140660.24,67.52


In [127]:
hc_reach_df = pd.read_csv(HEALTHCENTER_ISO)
hc_reach_df = hc_reach_df.drop(columns=cols_to_drop)
hc_reach_df.head(2)

Unnamed: 0,ADM3_PCODE,brgy_healthcenters_travel_time,pop_count_cumsum,pct_population_reached
0,PH015518000,5,148946.05,71.5
1,PH015518000,6,182512.52,87.61


In [128]:
rhu_reach_df = pd.read_csv(RHU_ISO)
rhu_reach_df = rhu_reach_df.drop(columns=cols_to_drop)
rhu_reach_df.head(2)

Unnamed: 0,ADM3_PCODE,rhu_travel_time,pop_count_cumsum,pct_population_reached
0,PH015518000,5,27039.46,12.98
1,PH015518000,6,40661.21,19.52


In [129]:
# rename the accesibility to health care features
# to not overlap in the merging
hospital_reach_df = hospital_reach_df.rename(
    columns={
        "hospital_travel_time": "travel_time",
        "pop_count_cumsum": "hospital_pop_reached_total",
        "pct_population_reached": "hospital_pct_population_reached",
    }
)
hc_reach_df = hc_reach_df.rename(
    columns={
        "brgy_healthcenters_travel_time": "travel_time",
        "pop_count_cumsum": "healthcenter_pop_reached_total",
        "pct_population_reached": "healthcenter_pct_population_reached",
    }
)
rhu_reach_df = rhu_reach_df.rename(
    columns={
        "rhu_travel_time": "travel_time",
        "pop_count_cumsum": "rhu_pop_reached_total",
        "pct_population_reached": "rhu_pct_population_reached",
    }
)

merged_health_access = hospital_reach_df.merge(
    hc_reach_df, on=["ADM3_PCODE", "travel_time"]
)
merged_health_access = merged_health_access.merge(
    rhu_reach_df, on=["ADM3_PCODE", "travel_time"]
)
# merged_health_access = add_year(merged_health_access)
merged_health_access.head(3)

Unnamed: 0,ADM3_PCODE,travel_time,hospital_pop_reached_total,hospital_pct_population_reached,healthcenter_pop_reached_total,healthcenter_pct_population_reached,rhu_pop_reached_total,rhu_pct_population_reached
0,PH015518000,5,121978.07,58.55,148946.05,71.5,27039.46,12.98
1,PH015518000,6,140660.24,67.52,182512.52,87.61,40661.21,19.52
2,PH015518000,7,151668.58,72.8,198100.99,95.09,62411.94,29.96


In [130]:
# merged_health_access = merged_health_access[
#     merged_health_access["ADM4_PCODE"].isin(zambo_adm4_pcodes)
# ]  # filter to zamboanga barangays only
# merged_health_access.insert(0, "ADM3_PCODE", "PH097332000")
# merged_health_access = convert_to_city(
#     merged_health_access,
#     key_columns=["ADM3_PCODE", "ADM4_PCODE", "date", "year", "travel_time"],
#     agg_list=[("mean", "mean")],
# )

# Link to aggregated Zamboanga Dengue LGU dataset

In [131]:
dengue_df["Date"] = pd.to_datetime(dengue_df["Date"])

In [132]:
climate_merged_df_zambo

Unnamed: 0,start_of_week,ADM3_PCODE,ADM4_PCODE,CO_AVG,CO_MIN,CO_MAX,CO_STD,WEIGHTED_AVG_CO,HI_AVG,HI_MIN,...,UVR_AVG,UVR_MIN,UVR_MAX,UVR_STD,WEIGHTED_AVG_UVR,WS_AVG,WS_MIN,WS_MAX,WS_STD,WEIGHTED_AVG_WS
0,2013-01-01,PH097332000,PH097332001,0.065233,0.0577,0.0716,0.005224,0.065233,28.933333,27.91,...,23.128333,12.07,28.49,6.118733,23.128333,1.868333,0.25,3.70,1.153159,1.868333
1,2013-01-01,PH097332000,PH097332002,0.065233,0.0577,0.0716,0.005224,0.065233,28.303333,26.98,...,22.923333,10.66,28.19,6.683384,22.923333,1.823333,0.46,3.36,1.026522,1.823333
2,2013-01-01,PH097332000,PH097332004,0.065233,0.0577,0.0716,0.005224,0.065233,28.303333,26.98,...,22.923333,10.66,28.19,6.683384,22.923333,1.823333,0.46,3.36,1.026522,1.823333
3,2013-01-01,PH097332000,PH097332005,0.065233,0.0577,0.0716,0.005224,0.065233,28.303333,26.98,...,22.923333,10.66,28.19,6.683384,22.923333,1.823333,0.46,3.36,1.026522,1.823333
4,2013-01-01,PH097332000,PH097332010,0.065233,0.0577,0.0716,0.005224,0.065233,28.616667,27.56,...,23.025000,11.36,28.34,6.399865,23.025000,1.836667,0.32,3.53,1.095019,1.836667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53525,2022-12-26,PH097332000,PH097332103,0.074800,0.0666,0.0801,0.004791,0.074800,28.510000,27.83,...,22.236667,18.00,26.02,3.373714,22.236667,1.793333,0.78,2.93,0.864145,1.793333
53526,2022-12-26,PH097332000,PH097332104,0.074600,0.0670,0.0800,0.004569,0.074600,28.205000,27.49,...,22.291667,17.39,25.89,3.487322,22.291667,1.598333,0.55,2.62,0.734450,1.598333
53527,2022-12-26,PH097332000,PH097332901,0.074600,0.0670,0.0800,0.004569,0.074600,27.856667,26.93,...,22.370000,18.17,25.75,3.305674,22.370000,1.713333,0.74,2.65,0.643542,1.713333
53528,2022-12-26,PH097332000,PH097332902,0.076000,0.0672,0.0802,0.004988,0.076000,28.238333,27.38,...,22.121667,17.00,25.03,3.124602,22.121667,1.941667,0.53,3.19,0.972490,1.941667


In [133]:
climate_merged_df_zambo = convert_to_city(
    climate_merged_df_zambo,
    key_columns=["start_of_week", "ADM3_PCODE", "ADM4_PCODE"],
    agg_list=[("mean", "mean")],
)
climate_merged_df_zambo.head(3)

Unnamed: 0,start_of_week,ADM3_PCODE,CO_AVG_mean,CO_MIN_mean,CO_MAX_mean,CO_STD_mean,WEIGHTED_AVG_CO_mean,HI_AVG_mean,HI_MIN_mean,HI_MAX_mean,...,UVR_AVG_mean,UVR_MIN_mean,UVR_MAX_mean,UVR_STD_mean,WEIGHTED_AVG_UVR_mean,WS_AVG_mean,WS_MIN_mean,WS_MAX_mean,WS_STD_mean,WEIGHTED_AVG_WS_mean
0,2013-01-01,PH097332000,0.065419,0.057866,0.072049,0.005295,0.065419,28.585627,27.438317,29.559703,...,22.979191,11.405446,28.207525,6.32249,22.979191,1.873069,0.474257,3.638713,1.12453,1.873069
1,2013-01-07,PH097332000,0.066417,0.063019,0.070317,0.00277,0.066417,27.809293,26.113366,28.632772,...,20.151301,11.324752,27.758317,7.771338,20.151301,2.447284,1.63505,3.159604,0.506528,2.447284
2,2013-01-14,PH097332000,0.084637,0.064092,0.095125,0.010181,0.084637,28.109491,27.63198,28.725644,...,25.650651,19.950891,29.248713,3.054714,25.650651,3.43116,0.624653,5.090891,1.603288,3.43116


In [134]:
climate_merged_df_zambo["start_of_week"] = pd.to_datetime(
    climate_merged_df_zambo["start_of_week"]
)
climate_merged_df_zambo = climate_merged_df_zambo.drop(columns=["ADM3_PCODE"])

In [135]:
# join with climate variables first
health_climate_weekly_df = dengue_df.merge(
    climate_merged_df_zambo, left_on=["Date"], right_on=["start_of_week"], how="left"
)
health_climate_weekly_df.head()

Unnamed: 0,Source,Year,Month,Week,Date,Region,PSGC_Region,Municipality,PSGC_Municipality,ICD,...,UVR_AVG_mean,UVR_MIN_mean,UVR_MAX_mean,UVR_STD_mean,WEIGHTED_AVG_UVR_mean,WS_AVG_mean,WS_MIN_mean,WS_MAX_mean,WS_STD_mean,WEIGHTED_AVG_WS_mean
0,PIDSR-DOH,2008.0,1.0,1,2008-01-07,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,...,,,,,,,,,,
1,PIDSR-DOH,2008.0,1.0,2,2008-01-07,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,...,,,,,,,,,,
2,PIDSR-DOH,2008.0,1.0,3,2008-01-14,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,...,,,,,,,,,,
3,PIDSR-DOH,2008.0,1.0,4,2008-01-21,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,...,,,,,,,,,,
4,PIDSR-DOH,2008.0,1.0,5,2008-01-28,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,...,,,,,,,,,,


In [136]:
health_climate_weekly_df = health_climate_weekly_df.rename(
    columns={"PSGC_Municipality": "ADM3_PCODE", "Year": "year"}
)

In [137]:
health_climate_weekly_df.shape

(792, 114)

In [138]:
# Dataframes that will be linked based on year + pcode
yearly_dfs_to_link = [
    city_osm,
    city_population_merge,
    # ookla_df,
    ntl_df,
    # rwi_df,
]

linked_df = health_climate_weekly_df.copy()

# Merge dataframes one by one
for df in yearly_dfs_to_link:
    try:
        df = df.drop(columns=["date"])
        linked_df = pd.merge(linked_df, df, on=["ADM3_PCODE", "year"], how="left")
    except:
        linked_df = pd.merge(linked_df, df, on=["ADM3_PCODE", "year"], how="left")

In [139]:
# for static variables just join by pcode
# will repeat throughout the dataset
static_dfs_to_link = [
    city_doh,
    hazards_df,
    landcover_df,
    infra_df,
    merged_health_access,
]

# Merge dataframes one by one
for df in static_dfs_to_link:
    try:
        df = df.drop(columns=["year"])
        linked_df = pd.merge(linked_df, df, on=["ADM3_PCODE"], how="left")
    except:
        linked_df = pd.merge(linked_df, df, on=["ADM3_PCODE"], how="left")

linked_df.head()

Unnamed: 0,Source,year,Month,Week,Date,Region,PSGC_Region,Municipality,ADM3_PCODE,ICD,...,google_bldgs_density_mean,google_bldgs_pct_built_up_area_sum,google_bldgs_pct_built_up_area_mean,travel_time,hospital_pop_reached_total,hospital_pct_population_reached,healthcenter_pop_reached_total,healthcenter_pct_population_reached,rhu_pop_reached_total,rhu_pct_population_reached
0,PIDSR-DOH,2008.0,1.0,1,2008-01-07,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,...,0.001116,908.037478,8.99047,5,124434.91,14.46,456862.05,53.09,301674.11,35.06
1,PIDSR-DOH,2008.0,1.0,1,2008-01-07,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,...,0.001116,908.037478,8.99047,6,164549.21,19.12,530385.69,61.64,367163.5,42.67
2,PIDSR-DOH,2008.0,1.0,1,2008-01-07,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,...,0.001116,908.037478,8.99047,7,210125.58,24.42,595930.36,69.25,415132.29,48.24
3,PIDSR-DOH,2008.0,1.0,1,2008-01-07,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,...,0.001116,908.037478,8.99047,8,242981.97,28.24,645428.73,75.01,463787.51,53.9
4,PIDSR-DOH,2008.0,1.0,1,2008-01-07,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,...,0.001116,908.037478,8.99047,9,275581.06,32.03,680570.09,79.09,505799.9,58.78


In [140]:
# check row count for duplicates
linked_df.shape

(20592, 255)

In [141]:
# drop duplicates
linked_df = linked_df.drop_duplicates(subset=["year", "Month", "Week", "Date"])
linked_df

Unnamed: 0,Source,year,Month,Week,Date,Region,PSGC_Region,Municipality,ADM3_PCODE,ICD,...,google_bldgs_density_mean,google_bldgs_pct_built_up_area_sum,google_bldgs_pct_built_up_area_mean,travel_time,hospital_pop_reached_total,hospital_pct_population_reached,healthcenter_pop_reached_total,healthcenter_pct_population_reached,rhu_pop_reached_total,rhu_pct_population_reached
0,PIDSR-DOH,2008.0,1.0,1,2008-01-07,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,...,0.001116,908.037478,8.99047,5,124434.91,14.46,456862.05,53.09,301674.11,35.06
26,PIDSR-DOH,2008.0,1.0,2,2008-01-07,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,...,0.001116,908.037478,8.99047,5,124434.91,14.46,456862.05,53.09,301674.11,35.06
52,PIDSR-DOH,2008.0,1.0,3,2008-01-14,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,...,0.001116,908.037478,8.99047,5,124434.91,14.46,456862.05,53.09,301674.11,35.06
78,PIDSR-DOH,2008.0,1.0,4,2008-01-21,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,...,0.001116,908.037478,8.99047,5,124434.91,14.46,456862.05,53.09,301674.11,35.06
104,PIDSR-DOH,2008.0,1.0,5,2008-01-28,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,...,0.001116,908.037478,8.99047,5,124434.91,14.46,456862.05,53.09,301674.11,35.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20462,PIDSR-DOH,2022.0,11.0,48,2022-11-21,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,...,0.001116,908.037478,8.99047,5,124434.91,14.46,456862.05,53.09,301674.11,35.06
20488,PIDSR-DOH,2022.0,11.0,49,2022-11-28,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,...,0.001116,908.037478,8.99047,5,124434.91,14.46,456862.05,53.09,301674.11,35.06
20514,PIDSR-DOH,2022.0,12.0,50,2022-12-05,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,...,0.001116,908.037478,8.99047,5,124434.91,14.46,456862.05,53.09,301674.11,35.06
20540,PIDSR-DOH,2022.0,12.0,51,2022-12-12,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,...,0.001116,908.037478,8.99047,5,124434.91,14.46,456862.05,53.09,301674.11,35.06


In [142]:
linked_df.columns.tolist()

['Source',
 'year',
 'Month',
 'Week',
 'Date',
 'Region',
 'PSGC_Region',
 'Municipality',
 'ADM3_PCODE',
 'ICD',
 'Disease',
 'Cases',
 'Claims',
 'Deaths',
 'Case_Type',
 'Date_Type',
 'outbreak',
 'outbreak_group',
 'start_of_week',
 'CO_AVG_mean',
 'CO_MIN_mean',
 'CO_MAX_mean',
 'CO_STD_mean',
 'WEIGHTED_AVG_CO_mean',
 'HI_AVG_mean',
 'HI_MIN_mean',
 'HI_MAX_mean',
 'HI_STD_mean',
 'WEIGHTED_AVG_HI_mean',
 'NDVI_AVG_mean',
 'NDVI_MIN_mean',
 'NDVI_MAX_mean',
 'NDVI_STD_mean',
 'WEIGHTED_AVG_NDVI_mean',
 'NO2_AVG_mean',
 'NO2_MIN_mean',
 'NO2_MAX_mean',
 'NO2_STD_mean',
 'WEIGHTED_AVG_NO2_mean',
 'O3_AVG_mean',
 'O3_MIN_mean',
 'O3_MAX_mean',
 'O3_STD_mean',
 'WEIGHTED_AVG_O3_mean',
 'PM10_AVG_mean',
 'PM10_MIN_mean',
 'PM10_MAX_mean',
 'PM10_STD_mean',
 'WEIGHTED_AVG_PM10_mean',
 'PM25_AVG_mean',
 'PM25_MIN_mean',
 'PM25_MAX_mean',
 'PM25_STD_mean',
 'WEIGHTED_AVG_PM25_mean',
 'PNP_AVG_mean',
 'PNP_MIN_mean',
 'PNP_MAX_mean',
 'PNP_STD_mean',
 'WEIGHTED_AVG_PNP_mean',
 'PR_AVG_me

In [143]:
linked_df.to_csv(
    PROCESSED_DIR / "linked_training_data/linked_df_pidsr_city_weekly_dengue.csv",
    index=False,
)