In [1]:
import pandas as pd
import geopandas as gpd

from pathlib import Path
import os
from loguru import logger
from tqdm import tqdm



# Link external variables to health data

We need to link the processed climate and socioeconomic datasets to the labeled LGU health data. 

1. Climate Data: 20 climate variables which are 'CO', 'HI', 'NDVI', 'NO2', 'O3', 'PM10', 'PM25', 'PNP', 'PR', 'RH', 'SO2', 'SPI3', 'SPI6', 'SR', 'Tave', 'Tmax', 'Tmin', 'UVR', 'WS'
  - These datsets are in an hourly format and must be aggregated to weekly to match the LGU health data aggregation.
2. Exposure Data: Collected from geospatial and satellite-derived datasets.
  - Also referred to as exposure variables.  

In [2]:
# file directories

CLIMATE_DIR = Path("../../data/02-raw/climate/")
VARIABLES_LIST = [
    "CO",
    "HI",
    "NDVI",
    "NO2",
    "O3",
    "PM10",
    "PM25",
    "PNP",
    "PR",
    "RH",
    "SO2",
    "SPI3",
    "SPI6",
    "SR",
    "Tave",
    "Tmax",
    "Tmin",
    "UVR",
    "WS",
]

# exposure features
OUTPUT_DIR = Path("../../data/04-output/")

OSM_DIR = OUTPUT_DIR / "osm"

POP_COUNT_DIR = OUTPUT_DIR / "worldpop" / "population_count"
POP_D_DIR = OUTPUT_DIR / "worldpop" / "population_density"

NIGHTLIGHTS_DIR = OUTPUT_DIR / "nightlights"
OOKLA_DIR = OUTPUT_DIR / "ookla"

GEOPORTAL_DOH = OUTPUT_DIR / "doh_health_geoportal.csv"
RWI = OUTPUT_DIR / "rwi" / "RWI_stats_reshaped.csv"
HAZARDS = OUTPUT_DIR / "noah" / "hz_proportion.csv"
LANDCOVER = OUTPUT_DIR / "landcover_features_ESA_2021.csv"
BLDGS = OUTPUT_DIR / "google_bldgs_v3_features.csv"

# accessibility of health facilities
HOSPITAL_ISO = OUTPUT_DIR / "hospitals_brgy_population_reached.csv"
HEALTHCENTER_ISO = OUTPUT_DIR / "brgy_healthcenter_brgy_population_reached.csv"
RHU_ISO = OUTPUT_DIR / "rhu_brgy_population_reached.csv"

In [3]:
PROCESSED_DIR = Path("../../data/03-processed/")
LABELED_CASES = (
    PROCESSED_DIR / "health" / "Zamboanga" / "brgy_weekly_zamboanga_labeled_v3.csv"
)

## Load Labeled Dengue Health Data

In [4]:
dengue_df = pd.read_csv(LABELED_CASES)
dengue_df.head()

Unnamed: 0,start_of_week,ADM4_PCODE,NumCases,Age_min,Age_max,Age_mean,Age_median,Age_std,Female,Male,outbreak
0,2013-01-07,PH097332001,0.0,,,,,,,,0
1,2013-01-07,PH097332002,3.0,5.333333,22.333333,15.027778,17.416667,8.748148,0.0,3.0,0
2,2013-01-07,PH097332004,2.0,5.666667,10.333333,8.0,8.0,3.299832,2.0,0.0,0
3,2013-01-07,PH097332005,0.0,,,,,,,,0
4,2013-01-07,PH097332010,0.0,,,,,,,,0


## Aggregate climate data

In [5]:
# view climate files
CLIMATE_FILES = os.listdir(CLIMATE_DIR)
print(CLIMATE_FILES)

['Tmax_2003-2022.csv', 'PNP_2003-2022_CHIRPS.csv', 'NO2_2003-2022.csv', 'PM25_2003-2022.csv', 'RH_2003-2022.csv', 'CO_2003-2022.csv', 'SO2_2003-2022.csv', 'SPI6_2003-2022_CHIRPS.csv', 'NDVI+NDVI_gapfill_2003-2022.csv', 'WS_2003-2022.csv', 'HI_2003-2022.csv', 'Tmin_2003-2022.csv', 'O3_2003-2022.csv', 'PR_2003-2022_CHIRPS.csv', 'UVR_2003-2022.csv', 'PM10_2003-2022.csv', 'NDVI_gapfill_2003-2022.csv', 'SPI3_2003-2022_CHIRPS.csv', 'SR_2003-2022.csv', 'Tave_2003-2022.csv']


In [6]:
def align_climate_var(climate_var, min_year=2013):
    """
    climate_var: Climate variable.
    min_year: minimum year to filter the climate dataset to match health data to be used.
    """
    raw_df = [
        filename for filename in CLIMATE_FILES if filename.startswith(climate_var)
    ][0]
    var_df = pd.read_csv(CLIMATE_DIR / raw_df)
    var_df["DATE"] = pd.to_datetime(var_df["DATE"])
    # filter date
    var_df = var_df[var_df["DATE"].dt.year >= min_year]
    # add weekly timestamp
    var_df["start_of_week"] = var_df["DATE"] - pd.to_timedelta(
        var_df["DATE"].dt.dayofweek, unit="D"
    )

    # Check if the Monday is from the previous year (December)
    previous_year_mask = var_df["start_of_week"].dt.year < var_df["DATE"].dt.year
    # # Adjust the start of the week to the current year
    var_df.loc[previous_year_mask, "start_of_week"] = pd.to_datetime(
        var_df[previous_year_mask]["DATE"].dt.year, format="%Y"
    )

    agg_dict = {
        "AVG": ("mean", "AVG"),
        "MIN": ("min", "MIN"),
        "MAX": ("max", "MAX"),
        "STD": ("std", "STD"),
    }

    result_dict = {}

    for agg_function, column_name in agg_dict.values():
        new_column_name = f"{climate_var}_{column_name}"
        result = var_df.groupby(["start_of_week", "ADM4_PCODE"])[climate_var].agg(
            **{new_column_name: agg_function}
        )
        result_dict[new_column_name] = result

    # Combine the results into a single DataFrame
    var_weekly = pd.concat(result_dict.values(), axis=1).reset_index()

    return var_weekly

In [7]:
climate_df_list = []

for var in tqdm(VARIABLES_LIST):
    df = align_climate_var(var)
    climate_df_list.append(df)

  0%|          | 0/19 [00:00<?, ?it/s]

100%|██████████| 19/19 [05:19<00:00, 16.80s/it]


In [8]:
# merge into one dataframe

# Initialize an empty dataframe for merged data
climate_merged_df = pd.DataFrame()

# Merge dataframes one by one
for df in climate_df_list:
    if climate_merged_df.empty:
        climate_merged_df = df
    else:
        # Merge on 'date' and 'adm4_pcode' columns
        climate_merged_df = pd.merge(
            climate_merged_df, df, on=["start_of_week", "ADM4_PCODE"], how="outer"
        )

climate_merged_df.head()

Unnamed: 0,start_of_week,ADM4_PCODE,CO_AVG,CO_MIN,CO_MAX,CO_STD,HI_AVG,HI_MIN,HI_MAX,HI_STD,...,Tmin_MAX,Tmin_STD,UVR_AVG,UVR_MIN,UVR_MAX,UVR_STD,WS_AVG,WS_MIN,WS_MAX,WS_STD
0,2013-01-01,PH015518001,0.097867,0.0878,0.1159,0.010291,29.063333,27.35,30.04,0.945995,...,25.35,0.57587,22.53,20.02,24.58,1.550019,1.521667,0.59,2.88,0.807228
1,2013-01-01,PH015518002,0.097867,0.0878,0.1159,0.010291,29.063333,27.35,30.04,0.945995,...,25.35,0.57587,22.53,20.02,24.58,1.550019,1.521667,0.59,2.88,0.807228
2,2013-01-01,PH015518003,0.097867,0.0878,0.1159,0.010291,29.063333,27.35,30.04,0.945995,...,25.35,0.57587,22.53,20.02,24.58,1.550019,1.521667,0.59,2.88,0.807228
3,2013-01-01,PH015518004,0.097867,0.0878,0.1159,0.010291,29.063333,27.35,30.04,0.945995,...,25.35,0.57587,22.53,20.02,24.58,1.550019,1.521667,0.59,2.88,0.807228
4,2013-01-01,PH015518006,0.097867,0.0878,0.1159,0.010291,29.063333,27.35,30.04,0.945995,...,25.35,0.57587,22.53,20.02,24.58,1.550019,1.521667,0.59,2.88,0.807228


## Aggregate geopatial exposure variables

In [9]:
# concat first
def combine_indiv_files(directory, list_of_filenames):
    dfs = []
    for file in list_of_filenames:
        df = pd.read_csv(directory / file)
        dfs.append(df)

    result_df = pd.concat(dfs)
    result_df = result_df.sort_values(by=["date", "ADM4_PCODE"])
    return result_df

### OSM features

In [10]:
osm_files = os.listdir(OSM_DIR)
print(osm_files)

['osm_features_water_2016.csv', 'osm_features_water_2018.csv', 'osm-poi-updated-feat-2014.csv', 'osm_features_water_2020.csv', 'osm_features_waterways_2020.csv', 'osm_features_waterways_2021.csv', 'osm-poi-updated-feat-2022.csv', 'osm-poi-updated-feat-2018.csv', 'osm_features_waterways_2016.csv', 'osm_features_waterways_2014.csv', 'osm-poi-updated-feat-2016.csv', 'osm_features_waterways_2019.csv', 'osm_features_water_2015.csv', 'osm_features_waterways_2017.csv', 'osm_features_waterways_2015.csv', 'osm_features_waterways_2018.csv', 'osm_features_water_2019.csv', 'osm_features_water_2017.csv', 'osm-poi-updated-feat-2021.csv', 'osm_features_water_2022.csv', 'osm-poi-updated-feat-2017.csv', 'osm_features_waterways_2022.csv', 'osm_features_water_2014.csv', 'osm_features_water_2021.csv', 'osm-poi-updated-feat-2019.csv', 'osm-poi-updated-feat-2020.csv', 'osm-poi-updated-feat-2015.csv']


In [11]:
osm_pois_files = [filename for filename in osm_files if "poi" in filename]
osm_waterway_files = [filename for filename in osm_files if "waterway" in filename]
osm_water_files = [filename for filename in osm_files if "water_" in filename]

In [12]:
osm_pois = combine_indiv_files(OSM_DIR, osm_pois_files)
osm_waterway = combine_indiv_files(OSM_DIR, osm_waterway_files)
osm_water = combine_indiv_files(OSM_DIR, osm_water_files)

In [13]:
osm_pois.head(3)

Unnamed: 0,ADM4_PCODE,date,freq,poi_count,clinic_count,clinic_nearest,dentist_count,dentist_nearest,doctors_count,doctors_nearest,...,toilet_count,toilet_nearest,recycling_count,recycling_nearest,waste_basket_count,waste_basket_nearest,wastewater_plant_count,wastewater_plant_nearest,waste_transfer_station_count,waste_transfer_station_nearest
735,PH015518001,2014-01-01,Y,0.0,0.0,10000.0,0.0,10000.0,0.0,10000.0,...,0.0,10000.0,0.0,10000.0,0.0,10000.0,0.0,10000.0,0.0,10000.0
733,PH015518002,2014-01-01,Y,0.0,0.0,10000.0,0.0,10000.0,0.0,10000.0,...,0.0,10000.0,0.0,10000.0,0.0,10000.0,0.0,10000.0,0.0,10000.0
270,PH015518003,2014-01-01,Y,0.0,0.0,10000.0,0.0,10000.0,0.0,10000.0,...,0.0,10000.0,0.0,10000.0,0.0,10000.0,0.0,10000.0,0.0,10000.0


In [14]:
merged_osm = osm_pois.merge(osm_waterway, on=["date", "ADM4_PCODE", "freq"])
merged_osm = merged_osm.merge(osm_water, on=["date", "ADM4_PCODE", "freq"])
# clean dataframe
merged_osm = merged_osm.drop(columns=["freq"])
merged_osm["date"] = pd.to_datetime(merged_osm["date"])
merged_osm.insert(2, "year", merged_osm["date"].dt.year)
merged_osm.head()

Unnamed: 0,ADM4_PCODE,date,year,poi_count,clinic_count,clinic_nearest,dentist_count,dentist_nearest,doctors_count,doctors_nearest,...,waste_transfer_station_nearest,osm_river_nearest,osm_stream_nearest,osm_canal_nearest,osm_drain_nearest,osm_wetland_nearest,osm_reservoir_nearest,osm_water_nearest,osm_riverbank_nearest,osm_dock_nearest
0,PH015518001,2014-01-01,2014,0.0,0.0,10000.0,0.0,10000.0,0.0,10000.0,...,10000.0,0.0,4.922703,0.0,0.0,1020.416284,10000.0,0.0,0.0,10000.0
1,PH015518002,2014-01-01,2014,0.0,0.0,10000.0,0.0,10000.0,0.0,10000.0,...,10000.0,0.0,249.176968,0.0,620.087175,1531.683204,10000.0,0.0,212.308744,10000.0
2,PH015518003,2014-01-01,2014,0.0,0.0,10000.0,0.0,10000.0,0.0,10000.0,...,10000.0,0.0,721.132035,0.0,269.858043,899.314986,10000.0,534.239025,0.0,10000.0
3,PH015518004,2014-01-01,2014,4.0,0.0,10000.0,0.0,10000.0,0.0,10000.0,...,10000.0,209.840371,474.344146,0.0,474.597641,411.695568,10000.0,389.128041,184.261138,10000.0
4,PH015518006,2014-01-01,2014,0.0,0.0,10000.0,0.0,10000.0,0.0,10000.0,...,10000.0,0.0,2.120251,0.0,1258.00355,3128.918318,10000.0,26.190204,0.0,10000.0


### Population features

In [15]:
pop_count_files = os.listdir(POP_COUNT_DIR)
pop_density_files = os.listdir(POP_D_DIR)

In [16]:
pop_count_df = combine_indiv_files(POP_COUNT_DIR, pop_count_files)
pop_density_df = combine_indiv_files(POP_D_DIR, pop_density_files)

In [17]:
merged_population = pop_count_df.merge(
    pop_density_df, on=["date", "ADM4_PCODE", "freq"]
)

merged_population = merged_population.drop(columns=["freq"])
merged_population["date"] = pd.to_datetime(merged_population["date"])
merged_population.insert(2, "year", merged_population["date"].dt.year)

merged_population.head()

Unnamed: 0,ADM4_PCODE,date,year,pop_count_total,pop_count_mean,pop_count_median,pop_count_stdev,pop_count_min,pop_count_max,pop_density_mean,pop_density_median,pop_density_stdev,pop_density_min,pop_density_max
0,PH015518001,2000-01-01,2000,3301.73999,62.296981,58.635139,21.72617,33.255768,119.275719,,,,,
1,PH015518002,2000-01-01,2000,1381.896606,20.027487,16.779068,10.793548,5.425909,48.875465,,,,,
2,PH015518003,2000-01-01,2000,779.192749,59.937904,58.291641,10.277208,42.930283,81.198486,,,,,
3,PH015518004,2000-01-01,2000,739.494385,56.884183,58.748554,13.211522,38.855377,85.561195,,,,,
4,PH015518006,2000-01-01,2000,2448.997559,19.283445,17.903036,9.382027,2.864172,43.499416,2310.895752,2310.895752,0.0,2310.895752,2310.895752


In [34]:
merged_population.to_csv(OUTPUT_DIR / "merged_population.csv")

### Connectivity Features (Ookla)

In [18]:
ookla_files = os.listdir(OOKLA_DIR)
print(ookla_files)

['ookla_features_2021.csv', 'ookla_features_2019.csv', 'ookla_features_2022.csv', 'ookla_features_2020.csv']


In [19]:
ookla_df = combine_indiv_files(OOKLA_DIR, ookla_files)
ookla_df = ookla_df.drop(columns=["Unnamed: 0", "freq"])
ookla_df["date"] = pd.to_datetime(ookla_df["date"])
ookla_df.insert(2, "year", ookla_df["date"].dt.year)
ookla_df

Unnamed: 0,ADM4_PCODE,date,year,fixed_mean_avg_d_kbps_mean,fixed_mean_avg_u_kbps_mean,fixed_mean_avg_lat_ms_mean,fixed_mean_num_tests_mean,fixed_mean_num_devices_mean,mobile_mean_avg_d_kbps_mean,mobile_mean_avg_u_kbps_mean,mobile_mean_avg_lat_ms_mean,mobile_mean_num_tests_mean,mobile_mean_num_devices_mean
735,PH015518001,2019-10-01,2019,2741.940307,2211.481662,5.627849,11.043301,2.538388,1703.697060,782.911132,8.600433,0.626745,0.473824
733,PH015518002,2019-10-01,2019,1932.898273,1446.769077,3.475531,3.405412,1.318271,1433.374015,986.309648,5.575700,0.384723,0.293926
270,PH015518003,2019-10-01,2019,8492.355434,6551.555566,16.250000,135.162336,45.585137,4900.746098,3434.823103,24.940739,23.161838,14.612044
140,PH015518004,2019-10-01,2019,6452.543581,4352.653337,10.998537,89.894809,24.677308,3727.233918,2569.344428,16.086301,10.357107,6.922026
448,PH015518006,2019-10-01,2019,1502.544977,1157.231393,3.714599,2.234149,0.681859,1696.486047,1016.854194,5.752340,0.278719,0.184696
...,...,...,...,...,...,...,...,...,...,...,...,...,...
587,PH137603005,2022-10-01,2022,2501.618755,2335.673245,0.252759,13.263507,2.728825,1189.265592,265.935755,0.636062,1.207914,0.585039
138,PH137603006,2022-10-01,2022,4045.956334,3541.999146,0.329133,46.630160,9.137081,2750.009162,436.460016,0.847968,3.503998,1.763122
442,PH137603007,2022-10-01,2022,4486.549238,4168.059852,0.448570,43.797804,8.654693,2449.997885,447.225930,1.129326,4.286305,1.901264
588,PH137603008,2022-10-01,2022,2249.795085,1969.770666,0.189419,8.993121,1.889387,1126.938859,201.927134,0.500257,1.020367,0.465397


### Nightlights

In [35]:
# aggregate into one table as well
ntl_files = os.listdir(NIGHTLIGHTS_DIR)
print(ntl_files)

['nightlights_2021.csv', 'nightlights_2022.csv', 'nightlights_2020.csv']


In [40]:
for file in ntl_files:
    if "_2022" in file:
        continue
    df = pd.read_csv(NIGHTLIGHTS_DIR / file)
    df = df.drop(
        columns=[
            "ADM1_EN",
            "ADM1_PCODE",
            "ADM2_EN",
            "ADM2_PCODE",
            "ADM3_EN",
            "ADM3_PCODE",
            "ADM4_EN",
        ]
    )
    df = df[
        [
            "ADM4_PCODE",
            "date",
            "freq",
            "avg_rad_min",
            "avg_rad_max",
            "avg_rad_mean",
            "avg_rad_std",
            "avg_rad_median",
        ]
    ]
    df.to_csv(OUTPUT_DIR / file, index=False)

In [42]:
ntl_df = combine_indiv_files(NIGHTLIGHTS_DIR, ntl_files)
ntl_df = add_year(ntl_df)
ntl_df.head()

Unnamed: 0,ADM4_PCODE,date,year,avg_rad_min,avg_rad_max,avg_rad_mean,avg_rad_std,avg_rad_median
735,PH015518001,2020-01-01,2020,6.906312,16.759203,10.556267,4.071839,9.279776
733,PH015518002,2020-01-01,2020,5.44319,7.279209,6.318113,0.752024,6.23194
270,PH015518003,2020-01-01,2020,,,,,
140,PH015518004,2020-01-01,2020,22.77886,22.77886,22.77886,0.0,22.77886
448,PH015518006,2020-01-01,2020,2.03729,6.170698,4.2439,1.276685,4.108033


### Load static features to dataframes

In [21]:
def add_year(df):
    df["date"] = pd.to_datetime(df["date"])
    df = df.drop(columns=["freq"])
    df.insert(2, "year", df["date"].dt.year)
    return df

In [22]:
health_facilities_doh_df = pd.read_csv(GEOPORTAL_DOH)
health_facilities_doh_df = add_year(health_facilities_doh_df)
health_facilities_doh_df.head(2)

Unnamed: 0,ADM4_PCODE,date,year,doh_pois_count,doh_brgy_health_station_count,doh_brgy_health_station_nearest,doh_rural_health_unit_count,doh_rural_health_unit_nearest,doh_hospital_count,doh_hospital_nearest,doh_birthing_home_lying_in_clinic_count,doh_birthing_home_lying_in_clinic_nearest,doh_infirmary_count,doh_infirmary_nearest,doh_drug_abuse_treatment_rehabilitation_center_count,doh_drug_abuse_treatment_rehabilitation_center_nearest,doh_social_hygiene_clinic_count,doh_social_hygiene_clinic_nearest,doh_medical_clinic_count,doh_medical_clinic_nearest
0,PH015518016,2022-01-01,2022,0.0,0.0,57.207979,0.0,2223.053002,0.0,1284.283872,0.0,1374.68323,0.0,10000.0,0.0,10000.0,0.0,10000.0,0.0,10000.0
1,PH015518031,2022-01-01,2022,3.0,0.0,315.326271,0.0,956.852487,3.0,0.0,0.0,360.111127,0.0,10000.0,0.0,10000.0,0.0,10000.0,0.0,10000.0


In [23]:
rwi_df = pd.read_csv(RWI)
rwi_df = rwi_df.drop(columns=["Unnamed: 0"])
rwi_df = add_year(rwi_df)
rwi_df.head(2)

Unnamed: 0,ADM4_PCODE,date,year,Year,RWI_max,RWI_mean,RWI_median,RWI_min,RWI_std
0,PH015518001,2016-01-01,2016,2016,0.729052,0.6686,0.670609,0.63657,0.03064
1,PH112402081,2016-01-01,2016,2016,0.65761,0.531397,0.530826,0.453885,0.035633


In [24]:
hazards_df = pd.read_csv(HAZARDS)
hazards_df = add_year(hazards_df)
hazards_df.head(2)

Unnamed: 0,ADM4_PCODE,date,year,pct_area_flood_hazard_100yr_low,pct_area_flood_hazard_100yr_med,pct_area_flood_hazard_100yr_high,pct_area_flood_hazard_25yr_low,pct_area_flood_hazard_25yr_med,pct_area_flood_hazard_25yr_high,pct_area_flood_hazard_5yr_low,pct_area_flood_hazard_5yr_med,pct_area_flood_hazard_5yr_high,pct_area_landslide_hazard_low,pct_area_landslide_hazard_med,pct_area_landslide_hazard_high
0,PH050506053,2021-11-01,2021,18.7703,0.6884,0.0907,2.0354,0.5726,0.0593,0.0,0.0,0.0,0.0,0.0,0.0
1,PH050506056,2021-11-01,2021,3.3062,5.8337,6.5147,0.0,0.0,0.0,0.0,0.0,0.0,10.1345,7.9272,1.9193


In [25]:
landcover_df = pd.read_csv(LANDCOVER)
landcover_df = add_year(landcover_df)
landcover_df.head(2)

Unnamed: 0,ADM4_PCODE,date,year,pct_area_bare_sparse_vegetation,pct_area_builtup,pct_area_cropland,pct_area_grassland,pct_area_herbaceous_wetland,pct_area_mangroves,pct_area_permanent_water_bodies,pct_area_shrubland,pct_area_tree_cover
0,PH015518016,2021-01-01,2021,0.28,2.3,0.01,0.24,0.08,0.0,100.0,0.0,0.63
1,PH015518031,2021-01-01,2021,2.83,48.06,6.56,10.68,0.02,0.0,30.68,0.0,10.61


In [26]:
infra_df = pd.read_csv(BLDGS)
infra_df = infra_df.drop(columns=["Unnamed: 0"])
infra_df = add_year(infra_df)
infra_df.head(2)

Unnamed: 0,ADM4_PCODE,date,year,google_bldgs_count,google_bldgs_area_total,google_bldgs_area_mean,google_bldgs_count_lt100_sqm,google_bldgs_count_100_200_sqm,google_bldgs_count_gt_200_sqm,google_bldgs_density,google_bldgs_pct_built_up_area
0,PH015518016,2023-01-01,2023,469,18878.581,40.252838,442,20,7,0.00046,1.850054
1,PH015518031,2023-01-01,2023,2209,234899.6797,106.337564,1535,449,225,0.002118,22.526707


**Aggregate the isochrones into one table**

In [27]:
hospital_reach_df = pd.read_csv(HOSPITAL_ISO)
hospital_reach_df.head(2)

Unnamed: 0,ADM4_PCODE,date,freq,travel_time,pop_reached_total,pop_reached_pct
0,PH015518016,2023-10-25,S,5,0.0,0.0
1,PH015518031,2023-10-25,S,5,8139.9,101.73


In [28]:
hc_reach_df = pd.read_csv(HEALTHCENTER_ISO)
hc_reach_df.head(2)

Unnamed: 0,ADM4_PCODE,date,freq,travel_time,pop_reached_total,pop_reached_pct
0,PH015518016,2023-10-25,S,5,112.67,10.75
1,PH015518031,2023-10-25,S,5,7959.48,99.48


In [29]:
rhu_reach_df = pd.read_csv(RHU_ISO)
rhu_reach_df.head(2)

Unnamed: 0,ADM4_PCODE,date,freq,travel_time,pop_reached_total,pop_reached_pct
0,PH015518016,2023-10-25,S,5,0.0,0.0
1,PH015518031,2023-10-25,S,5,751.56,9.39


In [30]:
# rename the accesibility to health care features
# to not overlap in the merging
hospital_reach_df = hospital_reach_df.rename(
    columns={
        "pop_reached_total": "hospital_pop_reached_total",
        "pop_reached_pct": "hospital_pop_reached_pct",
    }
)
hc_reach_df = hc_reach_df.rename(
    columns={
        "pop_reached_total": "healthcenter_pop_reached_total",
        "pop_reached_pct": "healthcenter_pop_reached_pct",
    }
)
rhu_reach_df = rhu_reach_df.rename(
    columns={
        "pop_reached_total": "rhu_pop_reached_total",
        "pop_reached_pct": "rhu_pop_reached_pct",
    }
)

merged_health_access = hospital_reach_df.merge(
    hc_reach_df, on=["ADM4_PCODE", "date", "freq", "travel_time"]
)
merged_health_access = merged_health_access.merge(
    rhu_reach_df, on=["ADM4_PCODE", "date", "freq", "travel_time"]
)
merged_health_access = add_year(merged_health_access)
merged_health_access.head(3)

Unnamed: 0,ADM4_PCODE,date,year,travel_time,hospital_pop_reached_total,hospital_pop_reached_pct,healthcenter_pop_reached_total,healthcenter_pop_reached_pct,rhu_pop_reached_total,rhu_pop_reached_pct
0,PH015518016,2023-10-25,2023,5,0.0,0.0,112.67,10.75,0.0,0.0
1,PH015518031,2023-10-25,2023,5,8139.9,101.73,7959.48,99.48,751.56,9.39
2,PH015518022,2023-10-25,2023,5,29539.28,94.61,10105.87,32.37,3544.0,11.35


## Link to aggregated Zamboanga Dengue LGU

In [31]:
dengue_df["start_of_week"] = pd.to_datetime(dengue_df["start_of_week"])

In [46]:
# join to climate
health_climate_weekly_df = dengue_df.merge(
    climate_merged_df, on=["start_of_week", "ADM4_PCODE"], how="left"
)
# add year column
health_climate_weekly_df.insert(
    1, "year", health_climate_weekly_df["start_of_week"].dt.year
)
health_climate_weekly_df.head()

Unnamed: 0,start_of_week,year,ADM4_PCODE,NumCases,Age_min,Age_max,Age_mean,Age_median,Age_std,Female,...,Tmin_MAX,Tmin_STD,UVR_AVG,UVR_MIN,UVR_MAX,UVR_STD,WS_AVG,WS_MIN,WS_MAX,WS_STD
0,2013-01-07,2013,PH097332001,0.0,,,,,,,...,26.1,0.352359,20.001429,11.5,27.89,7.917562,2.431429,1.48,2.98,0.499781
1,2013-01-07,2013,PH097332002,3.0,5.333333,22.333333,15.027778,17.416667,8.748148,0.0,...,25.3,0.358349,20.365714,10.95,27.98,7.943607,2.425714,1.99,3.29,0.425983
2,2013-01-07,2013,PH097332004,2.0,5.666667,10.333333,8.0,8.0,3.299832,2.0,...,25.3,0.358349,20.365714,10.95,27.98,7.943607,2.425714,1.99,3.29,0.425983
3,2013-01-07,2013,PH097332005,0.0,,,,,,,...,25.3,0.358349,20.365714,10.95,27.98,7.943607,2.425714,1.99,3.29,0.425983
4,2013-01-07,2013,PH097332010,0.0,,,,,,,...,25.57,0.33953,20.184286,11.24,27.94,7.924569,2.418571,1.7,3.14,0.448941


In [47]:
yearly_dfs_to_link = [
    merged_osm,
    merged_population,
    ookla_df,
    ntl_df,
    rwi_df,
]

linked_df = health_climate_weekly_df

# Merge dataframes one by one
for df in yearly_dfs_to_link:
    # Merge on 'date' and 'adm4_pcode' columns
    df = df.drop(columns=["date"])
    linked_df = pd.merge(linked_df, df, on=["year", "ADM4_PCODE"], how="left")

# for static variables just join by barangay
# will repeat throughout the dataset
# watch out in modeling
static_dfs_to_link = [
    health_facilities_doh_df,
    hazards_df,
    landcover_df,
    infra_df,
    merged_health_access,
]

# Merge dataframes one by one
for df in static_dfs_to_link:
    # Merge on 'date' and 'adm4_pcode' columns
    df = df.drop(columns=["date", "year"])
    linked_df = pd.merge(linked_df, df, on=["ADM4_PCODE"], how="left")

linked_df.head()

Unnamed: 0,start_of_week,year,ADM4_PCODE,NumCases,Age_min,Age_max,Age_mean,Age_median,Age_std,Female,...,google_bldgs_count_gt_200_sqm,google_bldgs_density,google_bldgs_pct_built_up_area,travel_time,hospital_pop_reached_total,hospital_pop_reached_pct,healthcenter_pop_reached_total,healthcenter_pop_reached_pct,rhu_pop_reached_total,rhu_pop_reached_pct
0,2013-01-07,2013,PH097332001,0.0,,,,,,,...,35,0.003804,15.039279,5,0.0,0.0,3611.67,76.28,0.0,0.0
1,2013-01-07,2013,PH097332001,0.0,,,,,,,...,35,0.003804,15.039279,6,0.0,0.0,446.57,9.43,0.0,0.0
2,2013-01-07,2013,PH097332001,0.0,,,,,,,...,35,0.003804,15.039279,7,0.0,0.0,1.36,0.03,0.0,0.0
3,2013-01-07,2013,PH097332001,0.0,,,,,,,...,35,0.003804,15.039279,8,0.0,0.0,155.34,3.28,0.0,0.0
4,2013-01-07,2013,PH097332001,0.0,,,,,,,...,35,0.003804,15.039279,9,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
linked_df.to_csv(PROCESSED_DIR / "linked_df_v1.csv", index=False)