In [1]:
%reload_ext autoreload
%autoreload 2

# Link external variables to health data

We need to link the processed climate and socioeconomic/demographic features to the sample dengue dataset. List of external variables are enumerated in the file imports section. 

In [2]:
import pandas as pd
import geopandas as gpd

from pathlib import Path
import os
from loguru import logger
from tqdm import tqdm

import sys

sys.path.append("../../")
# import directories
from src.settings import (
    DATA_DIR,
    RAW_DIR,
    PROCESSED_DIR,
    OUTPUT_DIR,
    CLIMATE_VARIABLES_LIST,
)

# import utils
from src.model_data_prep import align_climate_var



In [3]:
# file directories

CLIMATE_DIR = RAW_DIR / "climate"

OSM_DIR = OUTPUT_DIR / "osm"

POP_COUNT_DIR = OUTPUT_DIR / "worldpop" / "population_count"
POP_D_DIR = OUTPUT_DIR / "worldpop" / "population_density"

NIGHTLIGHTS_DIR = OUTPUT_DIR / "nightlights"
OOKLA_DIR = OUTPUT_DIR / "ookla"

GEOPORTAL_DOH = OUTPUT_DIR / "doh_health_geoportal.csv"
RWI = OUTPUT_DIR / "rwi" / "RWI_stats_reshaped.csv"
HAZARDS = OUTPUT_DIR / "noah" / "hz_proportion.csv"
LANDCOVER = OUTPUT_DIR / "landcover_features_ESA_2021.csv"
BLDGS = OUTPUT_DIR / "google_bldgs_v3_features.csv"

# accessibility of health facilities
HOSPITAL_ISO = OUTPUT_DIR / "hospitals_brgy_population_reached.csv"
HEALTHCENTER_ISO = OUTPUT_DIR / "brgy_healthcenter_brgy_population_reached.csv"
RHU_ISO = OUTPUT_DIR / "rhu_brgy_population_reached.csv"

In [4]:
LABELED_CASES = (
    PROCESSED_DIR / "health" / "Zamboanga" / "city_weekly_zamboanga_labeled_v1.csv"
)

# Load Aggregated Zamboanga Dengue Data

In [5]:
dengue_df = pd.read_csv(LABELED_CASES)
dengue_df.head()

Unnamed: 0,start_of_week,NumCases,Age_min,Age_max,Age_mean,Age_median,Age_std,Female,Male,outbreak
0,2013-01-01,66,0.5,68.083333,17.587121,14.75,14.592766,28,38,0
1,2013-01-07,77,0.416667,72.083333,15.054113,10.416667,15.442763,34,43,1
2,2013-01-14,83,0.666667,45.0,13.447791,11.333333,10.454911,38,45,1
3,2013-01-21,75,0.416667,49.583333,12.897778,10.25,9.506395,37,38,1
4,2013-01-28,85,0.25,49.583333,12.851961,11.333333,9.896491,47,38,1


# Load admin boundaries

In [6]:
admin_bounds = gpd.read_file("../../data/01-admin-bounds/target_admin_bounds.shp")
admin_bounds

Unnamed: 0,ADM1_EN,ADM1_PCODE,ADM2_EN,ADM2_PCODE,ADM3_EN,ADM3_PCODE,ADM4_EN,ADM4_PCODE,geometry
0,Region I,PH010000000,Pangasinan,PH015500000,Dagupan City,PH015518000,Lomboy,PH015518016,"POLYGON ((120.32742 16.05423, 120.32719 16.053..."
1,Region I,PH010000000,Pangasinan,PH015500000,Dagupan City,PH015518000,Tapuac,PH015518031,"POLYGON ((120.33380 16.03974, 120.33389 16.039..."
2,Region I,PH010000000,Pangasinan,PH015500000,Dagupan City,PH015518000,Pantal,PH015518022,"POLYGON ((120.34737 16.06009, 120.34761 16.060..."
3,Region I,PH010000000,Pangasinan,PH015500000,Dagupan City,PH015518000,Barangay I (T. Bugallon),PH015518024,"POLYGON ((120.34054 16.04489, 120.34054 16.044..."
4,Region III,PH030000000,Nueva Ecija,PH034900000,Palayan City,PH034919000,Imelda Valley,PH034919017,"POLYGON ((121.12250 15.58028, 121.12687 15.579..."
...,...,...,...,...,...,...,...,...,...
874,National Capital Region,PH130000000,"NCR, Second District",PH137400000,City of Mandaluyong,PH137401000,Namayan,PH137401018,"POLYGON ((121.02328 14.58135, 121.02309 14.581..."
875,National Capital Region,PH130000000,"NCR, Second District",PH137400000,City of Mandaluyong,PH137401000,Plainview,PH137401022,"POLYGON ((121.03657 14.57933, 121.03734 14.579..."
876,National Capital Region,PH130000000,"NCR, Third District",PH137500000,City of Navotas,PH137503000,Navotas West,PH137503007,"MULTIPOLYGON (((120.94767 14.65256, 120.94768 ..."
877,National Capital Region,PH130000000,"NCR, Third District",PH137500000,City of Navotas,PH137503000,Tanza,PH137503014,"POLYGON ((120.91841 14.71296, 120.92356 14.708..."


In [7]:
# get adm3_pcode
zambo_adm4_pcodes = admin_bounds[admin_bounds["ADM3_EN"] == "Zamboanga City"][
    "ADM4_PCODE"
].tolist()
print(zambo_adm4_pcodes)

['PH097332062', 'PH097332028', 'PH097332067', 'PH097332095', 'PH097332060', 'PH097332034', 'PH097332902', 'PH097332027', 'PH097332056', 'PH097332044', 'PH097332020', 'PH097332093', 'PH097332047', 'PH097332051', 'PH097332061', 'PH097332090', 'PH097332901', 'PH097332080', 'PH097332038', 'PH097332100', 'PH097332092', 'PH097332097', 'PH097332096', 'PH097332023', 'PH097332010', 'PH097332070', 'PH097332098', 'PH097332050', 'PH097332039', 'PH097332064', 'PH097332073', 'PH097332076', 'PH097332032', 'PH097332002', 'PH097332099', 'PH097332089', 'PH097332101', 'PH097332053', 'PH097332071', 'PH097332037', 'PH097332085', 'PH097332904', 'PH097332001', 'PH097332083', 'PH097332087', 'PH097332043', 'PH097332021', 'PH097332084', 'PH097332035', 'PH097332031', 'PH097332017', 'PH097332046', 'PH097332019', 'PH097332004', 'PH097332052', 'PH097332058', 'PH097332045', 'PH097332048', 'PH097332033', 'PH097332016', 'PH097332063', 'PH097332026', 'PH097332069', 'PH097332059', 'PH097332091', 'PH097332072', 'PH097332

In [8]:
len(zambo_adm4_pcodes)

101

# Aggregate all climate variables into one dataframe

In [9]:
climate_df_list = []

for var in tqdm(CLIMATE_VARIABLES_LIST):
    df = align_climate_var(var)
    climate_df_list.append(df)

  0%|          | 0/19 [00:00<?, ?it/s]

100%|██████████| 19/19 [13:44<00:00, 43.41s/it] 


In [10]:
# merge into one dataframe

# Initialize an empty dataframe for merged data
climate_merged_df = pd.DataFrame()

# Merge dataframes one by one
for df in climate_df_list:
    if climate_merged_df.empty:
        climate_merged_df = df
    else:
        # Merge on 'date' and 'adm4_pcode' columns
        climate_merged_df = pd.merge(
            climate_merged_df, df, on=["start_of_week", "ADM4_PCODE"], how="outer"
        )

climate_merged_df.head()

Unnamed: 0,start_of_week,ADM4_PCODE,CO_AVG,CO_MIN,CO_MAX,CO_STD,HI_AVG,HI_MIN,HI_MAX,HI_STD,...,Tmin_MAX,Tmin_STD,UVR_AVG,UVR_MIN,UVR_MAX,UVR_STD,WS_AVG,WS_MIN,WS_MAX,WS_STD
0,2013-01-01,PH015518001,0.097867,0.0878,0.1159,0.010291,29.063333,27.35,30.04,0.945995,...,25.35,0.57587,22.53,20.02,24.58,1.550019,1.521667,0.59,2.88,0.807228
1,2013-01-01,PH015518002,0.097867,0.0878,0.1159,0.010291,29.063333,27.35,30.04,0.945995,...,25.35,0.57587,22.53,20.02,24.58,1.550019,1.521667,0.59,2.88,0.807228
2,2013-01-01,PH015518003,0.097867,0.0878,0.1159,0.010291,29.063333,27.35,30.04,0.945995,...,25.35,0.57587,22.53,20.02,24.58,1.550019,1.521667,0.59,2.88,0.807228
3,2013-01-01,PH015518004,0.097867,0.0878,0.1159,0.010291,29.063333,27.35,30.04,0.945995,...,25.35,0.57587,22.53,20.02,24.58,1.550019,1.521667,0.59,2.88,0.807228
4,2013-01-01,PH015518006,0.097867,0.0878,0.1159,0.010291,29.063333,27.35,30.04,0.945995,...,25.35,0.57587,22.53,20.02,24.58,1.550019,1.521667,0.59,2.88,0.807228


In [11]:
climate_merged_df.to_csv(
    PROCESSED_DIR / "climate_aggregated_weekly_brgy.csv", index=False
)

In [60]:
climate_merged_df = pd.read_csv(PROCESSED_DIR / "climate_aggregated_weekly_brgy.csv")

In [61]:
# filter to zamboanga barangays
climate_merged_df_zambo = climate_merged_df.copy()
climate_merged_df_zambo = climate_merged_df_zambo[
    climate_merged_df_zambo["ADM4_PCODE"].isin(zambo_adm4_pcodes)
]

In [77]:
climate_merged_df_zambo["PNP_AVG_mean"].mean()

99.14530916666668

# Aggregate geopatial exposure variables

In [12]:
# Function for combining individual files
def combine_indiv_files(directory, list_of_filenames):
    dfs = []
    for file in list_of_filenames:
        df = pd.read_csv(directory / file)
        dfs.append(df)

    result_df = pd.concat(dfs)
    result_df = result_df.sort_values(by=["date", "ADM4_PCODE"])
    return result_df


# Insert "year" column for joining of annual datasets
def add_year(df):
    df["date"] = pd.to_datetime(df["date"])
    df = df.drop(columns=["freq"])
    df.insert(2, "year", df["date"].dt.year)
    return df


def convert_to_city(
    df,  # to aggregate
    key_columns=["ADM3_PCODE", "ADM4_PCODE", "date", "year", "freq", "Year"],
    agg_list=[
        ("sum", "sum"),
        ("mean", "mean"),
        ("min", "min"),
        ("max", "max"),
        ("std", "std"),
    ],
):

    # Define aggregation functions for each column
    aggregation_functions = {}
    for column in df.columns:
        if column not in key_columns:
            aggregation_functions[
                column
            ] = agg_list  # Include multiple aggregation functions
    key_columns.remove("ADM4_PCODE")
    # Group by key columns and aggregate other columns
    aggregated_df = df.groupby(key_columns).agg(aggregation_functions).reset_index()

    # Flatten MultiIndex column names
    aggregated_df.columns = [
        f"{col[0]}_{col[1]}" if col[1] else col[0] for col in aggregated_df.columns
    ]

    return aggregated_df

## OSM features

In [13]:
osm_files = os.listdir(OSM_DIR)
print(osm_files)

['osm_features_water_2016.csv', 'osm_features_water_2018.csv', 'osm-poi-updated-feat-2014.csv', 'osm_features_water_2020.csv', 'osm_features_waterways_2020.csv', 'osm_features_waterways_2021.csv', 'osm-poi-updated-feat-2022.csv', 'osm-poi-updated-feat-2018.csv', 'osm_features_waterways_2016.csv', 'osm_features_waterways_2014.csv', 'osm-poi-updated-feat-2016.csv', 'osm_features_waterways_2019.csv', 'osm_features_water_2015.csv', 'osm_features_waterways_2017.csv', 'osm_features_waterways_2015.csv', 'osm_features_waterways_2018.csv', 'osm_features_water_2019.csv', 'osm_features_water_2017.csv', 'osm-poi-updated-feat-2021.csv', 'osm_features_water_2022.csv', 'osm-poi-updated-feat-2017.csv', 'osm_features_waterways_2022.csv', 'osm_features_water_2014.csv', 'osm_features_water_2021.csv', 'osm-poi-updated-feat-2019.csv', 'osm-poi-updated-feat-2020.csv', 'osm-poi-updated-feat-2015.csv']


In [14]:
osm_pois_files = [filename for filename in osm_files if "poi" in filename]
osm_waterway_files = [filename for filename in osm_files if "waterway" in filename]
osm_water_files = [filename for filename in osm_files if "water_" in filename]

In [15]:
osm_pois = combine_indiv_files(OSM_DIR, osm_pois_files)
osm_waterway = combine_indiv_files(OSM_DIR, osm_waterway_files)
osm_water = combine_indiv_files(OSM_DIR, osm_water_files)

In [16]:
osm_pois.head(3)

Unnamed: 0,ADM4_PCODE,date,freq,poi_count,clinic_count,clinic_nearest,dentist_count,dentist_nearest,doctors_count,doctors_nearest,...,toilet_count,toilet_nearest,recycling_count,recycling_nearest,waste_basket_count,waste_basket_nearest,wastewater_plant_count,wastewater_plant_nearest,waste_transfer_station_count,waste_transfer_station_nearest
735,PH015518001,2014-01-01,Y,0.0,0.0,10000.0,0.0,10000.0,0.0,10000.0,...,0.0,10000.0,0.0,10000.0,0.0,10000.0,0.0,10000.0,0.0,10000.0
733,PH015518002,2014-01-01,Y,0.0,0.0,10000.0,0.0,10000.0,0.0,10000.0,...,0.0,10000.0,0.0,10000.0,0.0,10000.0,0.0,10000.0,0.0,10000.0
270,PH015518003,2014-01-01,Y,0.0,0.0,10000.0,0.0,10000.0,0.0,10000.0,...,0.0,10000.0,0.0,10000.0,0.0,10000.0,0.0,10000.0,0.0,10000.0


In [17]:
merged_osm = osm_pois.merge(osm_waterway, on=["date", "ADM4_PCODE", "freq"])
merged_osm = merged_osm.merge(osm_water, on=["date", "ADM4_PCODE", "freq"])
# clean dataframe
merged_osm = add_year(merged_osm)
merged_osm = merged_osm[
    merged_osm["ADM4_PCODE"].isin(zambo_adm4_pcodes)
]  # filter to zamboanga barangays only
merged_osm.insert(0, "ADM3_PCODE", "PH097332000")  # add city code
merged_osm.head()

Unnamed: 0,ADM3_PCODE,ADM4_PCODE,date,year,poi_count,clinic_count,clinic_nearest,dentist_count,dentist_nearest,doctors_count,...,waste_transfer_station_nearest,osm_river_nearest,osm_stream_nearest,osm_canal_nearest,osm_drain_nearest,osm_wetland_nearest,osm_reservoir_nearest,osm_water_nearest,osm_riverbank_nearest,osm_dock_nearest
465,PH097332000,PH097332001,2014-01-01,2014,0.0,0.0,10000.0,0.0,10000.0,0.0,...,10000.0,0.0,2491.364593,4406.041222,8392.320981,0.0,10000.0,1463.991302,0.0,10000.0
466,PH097332000,PH097332002,2014-01-01,2014,0.0,0.0,10000.0,0.0,10000.0,0.0,...,10000.0,0.0,1175.647907,10000.0,10000.0,10000.0,10000.0,7248.859907,1846.055233,10000.0
467,PH097332000,PH097332004,2014-01-01,2014,0.0,0.0,10000.0,0.0,10000.0,0.0,...,10000.0,0.0,252.038675,5331.332439,1031.746742,3108.973587,10000.0,283.519999,219.120882,10000.0
468,PH097332000,PH097332005,2014-01-01,2014,0.0,0.0,10000.0,0.0,10000.0,0.0,...,10000.0,0.0,0.0,10000.0,5213.716862,10000.0,10000.0,0.0,4219.114815,10000.0
469,PH097332000,PH097332010,2014-01-01,2014,0.0,0.0,10000.0,0.0,10000.0,0.0,...,10000.0,0.0,0.0,562.158279,4580.785113,1799.515949,10000.0,0.0,572.847804,10000.0


In [18]:
city_osm = merged_osm.copy()
city_osm = convert_to_city(
    city_osm,
    key_columns=["ADM3_PCODE", "ADM4_PCODE", "date", "year"],
    agg_list=[("sum", "sum"), ("mean", "mean")],
)
city_osm

Unnamed: 0,ADM3_PCODE,date,year,poi_count_sum,poi_count_mean,clinic_count_sum,clinic_count_mean,clinic_nearest_sum,clinic_nearest_mean,dentist_count_sum,...,osm_wetland_nearest_sum,osm_wetland_nearest_mean,osm_reservoir_nearest_sum,osm_reservoir_nearest_mean,osm_water_nearest_sum,osm_water_nearest_mean,osm_riverbank_nearest_sum,osm_riverbank_nearest_mean,osm_dock_nearest_sum,osm_dock_nearest_mean
0,PH097332000,2014-01-01,2014,64.0,0.633663,0.0,0.0,1010000.0,10000.0,0.0,...,551180.714516,5457.234797,1010000.0,10000.0,302427.15125,2994.32823,416841.991751,4127.148433,1010000.0,10000.0
1,PH097332000,2015-01-01,2015,851.0,8.425743,0.0,0.0,1010000.0,10000.0,2.0,...,551180.714516,5457.234797,1010000.0,10000.0,302427.15125,2994.32823,416841.991751,4127.148433,1010000.0,10000.0
2,PH097332000,2016-01-01,2016,1129.0,11.178218,0.0,0.0,1010000.0,10000.0,2.0,...,551180.714516,5457.234797,1010000.0,10000.0,302427.15125,2994.32823,416841.991751,4127.148433,1010000.0,10000.0
3,PH097332000,2017-01-01,2017,1214.0,12.019802,0.0,0.0,1010000.0,10000.0,4.0,...,551180.714516,5457.234797,1010000.0,10000.0,302427.15125,2994.32823,416841.991751,4127.148433,1010000.0,10000.0
4,PH097332000,2018-01-01,2018,1288.0,12.752475,0.0,0.0,1010000.0,10000.0,4.0,...,551180.714516,5457.234797,1010000.0,10000.0,302427.15125,2994.32823,416841.991751,4127.148433,1010000.0,10000.0
5,PH097332000,2019-01-01,2019,1383.0,13.693069,0.0,0.0,1010000.0,10000.0,3.0,...,551180.714516,5457.234797,1010000.0,10000.0,302427.15125,2994.32823,416841.991751,4127.148433,1010000.0,10000.0
6,PH097332000,2020-01-01,2020,1566.0,15.50495,0.0,0.0,1010000.0,10000.0,3.0,...,551180.714516,5457.234797,1010000.0,10000.0,302427.15125,2994.32823,416841.991751,4127.148433,1010000.0,10000.0
7,PH097332000,2021-01-01,2021,1978.0,19.584158,0.0,0.0,1010000.0,10000.0,3.0,...,551180.714516,5457.234797,1010000.0,10000.0,302427.15125,2994.32823,416841.991751,4127.148433,1010000.0,10000.0
8,PH097332000,2022-01-01,2022,2082.0,20.613861,0.0,0.0,1010000.0,10000.0,3.0,...,551180.714516,5457.234797,1010000.0,10000.0,302427.15125,2994.32823,416841.991751,4127.148433,1010000.0,10000.0


In [45]:
# remove ookla for now
exclude_osm = [
    # unneeded osm features
    "atm_count",
    "atm_nearest",
    "bank_count",
    "bank_nearest",
    "college_count",
    "college_nearest",
    "community_centre_count",
    "community_centre_nearest",
    "comms_tower_count",
    "comms_tower_nearest",
    "convenience_count",
    "convenience_nearest",
    "fire_station_count",
    "fire_station_nearest",
    "kindergarten_count",
    "kindergarten_nearest",
    "lighthouse_count",
    "lighthouse_nearest",
    "market_place_count",
    "market_place_nearest",
    "park_count",
    "park_nearest",
    "public_building_count",
    "public_building_nearest",
    "police_count",
    "police_nearest",
    "school_count",
    "school_nearest",
    "shelter_count",
    "shelter_nearest",
    "supermarket_count",
    "supermarket_nearest",
    "telephone_count",
    "telephone_nearest",
    "tower_count",
    "tower_nearest",
    "town_hall_count",
    "town_hall_nearest",
    "university_count",
    "university_nearest",
    "cable_count",
    "cable_nearest",
    "compensator_count",
    "compensator_nearest",
    "connection_count",
    "connection_nearest",
    "converter_count",
    "converter_nearest",
    "generator_count",
    "generator_nearest",
    "insulator_count",
    "insulator_nearest",
    "line_count",
    "line_nearest",
    "busbar_count",
    "busbar_nearest",
    "bay_count",
    "bay_nearest",
    "minor_line_count",
    "minor_line_nearest",
    "plant_count",
    "plant_nearest",
    "pole_count",
    "pole_nearest",
    "portal_count",
    "portal_nearest",
    "substation_count",
    "substation_nearest",
    "tower_count_y",
    "transformer_count",
    "transformer_nearest",
    "exchange_count",
    "exchange_nearest",
    "connection_point_count",
    "connection_point_nearest",
    "distribution_point_count",
    "distribution_point_nearest",
    "service_device_count",
    "service_device_nearest",
    "data_center_count",
    "data_center_nearest",
]

# Remove columns based on the list of strings or prefixes
columns_to_keep = [
    col
    for col in city_osm.columns
    if not any(col.startswith(prefix) for prefix in exclude_osm)
]
city_osm = city_osm[columns_to_keep]

In [46]:
city_osm

Unnamed: 0,ADM3_PCODE,date,year,poi_count_sum,poi_count_mean,clinic_count_sum,clinic_count_mean,clinic_nearest_sum,clinic_nearest_mean,dentist_count_sum,...,osm_wetland_nearest_sum,osm_wetland_nearest_mean,osm_reservoir_nearest_sum,osm_reservoir_nearest_mean,osm_water_nearest_sum,osm_water_nearest_mean,osm_riverbank_nearest_sum,osm_riverbank_nearest_mean,osm_dock_nearest_sum,osm_dock_nearest_mean
0,PH097332000,2014-01-01,2014,64.0,0.633663,0.0,0.0,1010000.0,10000.0,0.0,...,551180.714516,5457.234797,1010000.0,10000.0,302427.15125,2994.32823,416841.991751,4127.148433,1010000.0,10000.0
1,PH097332000,2015-01-01,2015,851.0,8.425743,0.0,0.0,1010000.0,10000.0,2.0,...,551180.714516,5457.234797,1010000.0,10000.0,302427.15125,2994.32823,416841.991751,4127.148433,1010000.0,10000.0
2,PH097332000,2016-01-01,2016,1129.0,11.178218,0.0,0.0,1010000.0,10000.0,2.0,...,551180.714516,5457.234797,1010000.0,10000.0,302427.15125,2994.32823,416841.991751,4127.148433,1010000.0,10000.0
3,PH097332000,2017-01-01,2017,1214.0,12.019802,0.0,0.0,1010000.0,10000.0,4.0,...,551180.714516,5457.234797,1010000.0,10000.0,302427.15125,2994.32823,416841.991751,4127.148433,1010000.0,10000.0
4,PH097332000,2018-01-01,2018,1288.0,12.752475,0.0,0.0,1010000.0,10000.0,4.0,...,551180.714516,5457.234797,1010000.0,10000.0,302427.15125,2994.32823,416841.991751,4127.148433,1010000.0,10000.0
5,PH097332000,2019-01-01,2019,1383.0,13.693069,0.0,0.0,1010000.0,10000.0,3.0,...,551180.714516,5457.234797,1010000.0,10000.0,302427.15125,2994.32823,416841.991751,4127.148433,1010000.0,10000.0
6,PH097332000,2020-01-01,2020,1566.0,15.50495,0.0,0.0,1010000.0,10000.0,3.0,...,551180.714516,5457.234797,1010000.0,10000.0,302427.15125,2994.32823,416841.991751,4127.148433,1010000.0,10000.0
7,PH097332000,2021-01-01,2021,1978.0,19.584158,0.0,0.0,1010000.0,10000.0,3.0,...,551180.714516,5457.234797,1010000.0,10000.0,302427.15125,2994.32823,416841.991751,4127.148433,1010000.0,10000.0
8,PH097332000,2022-01-01,2022,2082.0,20.613861,0.0,0.0,1010000.0,10000.0,3.0,...,551180.714516,5457.234797,1010000.0,10000.0,302427.15125,2994.32823,416841.991751,4127.148433,1010000.0,10000.0


## Population features

In [19]:
pop_count_files = os.listdir(POP_COUNT_DIR)
pop_density_files = os.listdir(POP_D_DIR)

In [20]:
pop_count_df = combine_indiv_files(POP_COUNT_DIR, pop_count_files)
pop_density_df = combine_indiv_files(POP_D_DIR, pop_density_files)

In [21]:
merged_population = pop_count_df.merge(
    pop_density_df, on=["date", "ADM4_PCODE", "freq"]
)
merged_population = add_year(merged_population)
merged_population = merged_population[
    merged_population["ADM4_PCODE"].isin(zambo_adm4_pcodes)
]  # filter to zamboanga barangays only
merged_population.insert(0, "ADM3_PCODE", "PH097332000")
merged_population.head()

Unnamed: 0,ADM3_PCODE,ADM4_PCODE,date,year,pop_count_total,pop_count_mean,pop_count_median,pop_count_stdev,pop_count_min,pop_count_max,pop_density_mean,pop_density_median,pop_density_stdev,pop_density_min,pop_density_max
465,PH097332000,PH097332001,2000-01-01,2000,5733.472168,59.723668,41.130318,45.626944,17.22714,218.46463,8621.525391,8621.525391,0.0,8621.525391,8621.525391
466,PH097332000,PH097332002,2000-01-01,2000,9514.5625,20.157971,6.06998,29.318064,0.728731,212.672974,2379.857422,2388.723389,1833.508759,437.172852,4304.810059
467,PH097332000,PH097332004,2000-01-01,2000,16094.892578,107.299284,98.10009,63.451642,20.900679,319.702301,19084.578125,19084.578125,4136.836914,14947.742188,23221.416016
468,PH097332000,PH097332005,2000-01-01,2000,6577.566406,1.657235,1.319477,1.180323,0.08547,8.077068,184.606885,152.145767,108.799295,23.923004,382.564636
469,PH097332000,PH097332010,2000-01-01,2000,5116.272461,5.665861,4.24681,4.959968,1.071326,69.962112,713.516211,698.731445,354.95202,221.705765,1480.763916


In [22]:
city_population = merged_population.copy()
city_population = convert_to_city(
    city_population,
    key_columns=["ADM3_PCODE", "ADM4_PCODE", "date", "year"],
    agg_list=[("sum", "sum"), ("mean", "mean")],
)
city_population

Unnamed: 0,ADM3_PCODE,date,year,pop_count_total_sum,pop_count_total_mean,pop_count_mean_sum,pop_count_mean_mean,pop_count_median_sum,pop_count_median_mean,pop_count_stdev_sum,...,pop_density_mean_sum,pop_density_mean_mean,pop_density_median_sum,pop_density_median_mean,pop_density_stdev_sum,pop_density_stdev_mean,pop_density_min_sum,pop_density_min_mean,pop_density_max_sum,pop_density_max_mean
0,PH097332000,2000-01-01,2000,636441.101075,6301.39704,1876.188619,18.576125,1476.795989,14.621742,1438.496466,...,177860.674625,1892.134836,164874.549536,1753.98457,70408.30809,749.024554,107779.561766,1146.591083,295590.599701,3144.580848
1,PH097332000,2001-01-01,2001,640249.55453,6339.1045,1865.841727,18.47368,1438.129826,14.238909,1563.42333,...,180990.536412,1925.431238,166248.85486,1768.604839,72549.876131,771.807193,110201.885486,1172.360484,304865.984024,3243.255149
2,PH097332000,2002-01-01,2002,654950.973494,6484.663104,1908.643121,18.897457,1525.791206,15.106844,1434.767777,...,183480.764857,1951.92303,168515.408168,1792.717108,73330.385396,780.110483,111615.956692,1187.403795,307714.900063,3273.562767
3,PH097332000,2003-01-01,2003,675238.506535,6685.529768,2006.610359,19.867429,1670.22433,16.536875,1384.858594,...,187093.734915,1990.358882,171571.880258,1825.232769,73977.65589,786.996339,114437.72105,1217.422564,311672.969597,3315.669889
4,PH097332000,2004-01-01,2004,677129.094577,6704.248461,1972.672959,19.531415,1599.734411,15.838955,1416.626341,...,188688.997685,2007.329763,172955.234663,1839.949305,75412.076956,802.256138,115221.3982,1225.759555,317871.516857,3381.611881
5,PH097332000,2005-01-01,2005,690686.697508,6838.482154,2010.42098,19.905158,1641.735058,16.254803,1437.480594,...,192340.64125,2046.177035,176503.197485,1877.69359,74186.411775,789.217147,120142.425097,1278.110905,320649.717426,3411.167207
6,PH097332000,2006-01-01,2006,705817.73222,6988.294378,2093.954114,20.732219,1752.037189,17.346903,1418.377265,...,195211.630834,2076.719477,180013.931343,1915.041823,74886.009618,796.659677,121491.529755,1292.463082,322334.130348,3429.086493
7,PH097332000,2007-01-01,2007,701936.852028,6949.869822,2026.401569,20.063382,1598.063196,15.822408,1575.8832,...,195460.165829,2079.363466,179383.373907,1908.333765,75778.678087,806.15615,121037.821279,1287.636397,326936.986588,3478.053049
8,PH097332000,2008-01-01,2008,724155.315517,7169.854609,2134.538339,21.134043,1771.475523,17.539362,1464.803268,...,199350.544201,2120.75047,183851.12395,1955.863021,78644.208458,836.640516,121576.534505,1293.367388,332643.722244,3538.763003
9,PH097332000,2009-01-01,2009,734271.027293,7270.010171,2185.277514,21.636411,1805.773164,17.878942,1508.843795,...,201495.658275,2143.570833,185092.434843,1969.068456,81785.425625,870.057719,120085.478716,1277.505093,338631.438889,3602.462116


In [23]:
# merged_population.to_csv(OUTPUT_DIR / "merged_population.csv")

## Connectivity Features (Ookla)

In [23]:
ookla_files = os.listdir(OOKLA_DIR)
print(ookla_files)

['ookla_features_2021.csv', 'ookla_features_2019.csv', 'ookla_features_2022.csv', 'ookla_features_2020.csv']


In [24]:
ookla_df = combine_indiv_files(OOKLA_DIR, ookla_files)
ookla_df = ookla_df.drop(columns=["Unnamed: 0", "freq"])
ookla_df["date"] = pd.to_datetime(ookla_df["date"])
ookla_df.insert(2, "year", ookla_df["date"].dt.year)
ookla_df

Unnamed: 0,ADM4_PCODE,date,year,fixed_mean_avg_d_kbps_mean,fixed_mean_avg_u_kbps_mean,fixed_mean_avg_lat_ms_mean,fixed_mean_num_tests_mean,fixed_mean_num_devices_mean,mobile_mean_avg_d_kbps_mean,mobile_mean_avg_u_kbps_mean,mobile_mean_avg_lat_ms_mean,mobile_mean_num_tests_mean,mobile_mean_num_devices_mean
735,PH015518001,2019-10-01,2019,2741.940307,2211.481662,5.627849,11.043301,2.538388,1703.697060,782.911132,8.600433,0.626745,0.473824
733,PH015518002,2019-10-01,2019,1932.898273,1446.769077,3.475531,3.405412,1.318271,1433.374015,986.309648,5.575700,0.384723,0.293926
270,PH015518003,2019-10-01,2019,8492.355434,6551.555566,16.250000,135.162336,45.585137,4900.746098,3434.823103,24.940739,23.161838,14.612044
140,PH015518004,2019-10-01,2019,6452.543581,4352.653337,10.998537,89.894809,24.677308,3727.233918,2569.344428,16.086301,10.357107,6.922026
448,PH015518006,2019-10-01,2019,1502.544977,1157.231393,3.714599,2.234149,0.681859,1696.486047,1016.854194,5.752340,0.278719,0.184696
...,...,...,...,...,...,...,...,...,...,...,...,...,...
587,PH137603005,2022-10-01,2022,2501.618755,2335.673245,0.252759,13.263507,2.728825,1189.265592,265.935755,0.636062,1.207914,0.585039
138,PH137603006,2022-10-01,2022,4045.956334,3541.999146,0.329133,46.630160,9.137081,2750.009162,436.460016,0.847968,3.503998,1.763122
442,PH137603007,2022-10-01,2022,4486.549238,4168.059852,0.448570,43.797804,8.654693,2449.997885,447.225930,1.129326,4.286305,1.901264
588,PH137603008,2022-10-01,2022,2249.795085,1969.770666,0.189419,8.993121,1.889387,1126.938859,201.927134,0.500257,1.020367,0.465397


In [25]:
ookla_df = ookla_df[
    ookla_df["ADM4_PCODE"].isin(zambo_adm4_pcodes)
]  # filter to zamboanga barangays only
ookla_df.insert(0, "ADM3_PCODE", "PH097332000")
ookla_df = convert_to_city(
    ookla_df,
    key_columns=["ADM3_PCODE", "ADM4_PCODE", "date", "year"],
    agg_list=[("mean", "mean")],
)
ookla_df

Unnamed: 0,ADM3_PCODE,date,year,fixed_mean_avg_d_kbps_mean_mean,fixed_mean_avg_u_kbps_mean_mean,fixed_mean_avg_lat_ms_mean_mean,fixed_mean_num_tests_mean_mean,fixed_mean_num_devices_mean_mean,mobile_mean_avg_d_kbps_mean_mean,mobile_mean_avg_u_kbps_mean_mean,mobile_mean_avg_lat_ms_mean_mean,mobile_mean_num_tests_mean_mean,mobile_mean_num_devices_mean_mean
0,PH097332000,2019-10-01,2019,1102.588328,1077.213193,4.727306,8.142313,1.951016,711.849265,368.961823,4.316471,0.871976,0.483579
1,PH097332000,2020-10-01,2020,1038.854877,1051.777101,2.25923,10.502841,2.294825,686.93784,375.813284,2.718934,1.43344,0.529142
2,PH097332000,2021-10-01,2021,2479.527676,2249.181705,2.039032,12.2753,2.867956,1055.005818,406.759671,2.782771,2.488941,1.069597
3,PH097332000,2022-10-01,2022,3634.927677,3249.635544,1.657208,9.54462,2.624948,1588.900661,451.654118,3.02335,4.385754,1.531314


## Nightlights

In [26]:
# aggregate into one table as well
ntl_files = os.listdir(NIGHTLIGHTS_DIR)
print(ntl_files)

['nightlights_2016.csv', 'nightlights_2019.csv', 'nightlights_2018.csv', 'nightlights_2021.csv', 'nightlights_2015.csv', 'nightlights_2022.csv', 'nightlights_2012.csv', 'nightlights_2013.csv', 'nightlights_2017.csv', 'nightlights_2014.csv', 'nightlights_2020.csv']


In [27]:
ntl_df = combine_indiv_files(NIGHTLIGHTS_DIR, ntl_files)
ntl_df = add_year(ntl_df)
ntl_df = ntl_df[
    ntl_df["ADM4_PCODE"].isin(zambo_adm4_pcodes)
]  # filter to zamboanga barangays only
ntl_df.insert(0, "ADM3_PCODE", "PH097332000")
ntl_df = convert_to_city(
    ntl_df,
    key_columns=["ADM3_PCODE", "ADM4_PCODE", "date", "year"],
    agg_list=[("mean", "mean")],
)
ntl_df

Unnamed: 0,ADM3_PCODE,date,year,avg_rad_min_mean,avg_rad_max_mean,avg_rad_mean_mean,avg_rad_std_mean,avg_rad_median_mean
0,PH097332000,2012-01-01,2012,1.639831,5.460402,3.030957,1.149283,2.856192
1,PH097332000,2013-01-01,2013,1.678676,5.612633,3.081904,1.173999,2.907438
2,PH097332000,2014-01-01,2014,1.738257,5.266028,2.946094,1.036254,2.805252
3,PH097332000,2015-01-01,2015,1.76007,5.257482,2.979118,1.031192,2.8479
4,PH097332000,2016-01-01,2016,1.589075,4.353375,2.613751,0.832275,2.500951
5,PH097332000,2017-01-01,2017,2.164469,5.65707,3.425482,1.0548,3.280053
6,PH097332000,2018-01-01,2018,2.244963,5.713613,3.444397,1.040448,3.262335
7,PH097332000,2019-01-01,2019,2.246399,5.687465,3.460818,1.017337,3.300401
8,PH097332000,2020-01-01,2020,2.27833,5.602449,3.458333,0.987384,3.328969
9,PH097332000,2021-01-01,2021,2.239734,7.009234,3.974466,1.448885,3.714241


## Load Static features

In [28]:
health_facilities_doh_df = pd.read_csv(GEOPORTAL_DOH)
health_facilities_doh_df = add_year(health_facilities_doh_df)
health_facilities_doh_df = health_facilities_doh_df[
    health_facilities_doh_df["ADM4_PCODE"].isin(zambo_adm4_pcodes)
]  # filter to zamboanga barangays only
health_facilities_doh_df.insert(0, "ADM3_PCODE", "PH097332000")
health_facilities_doh_df = convert_to_city(
    health_facilities_doh_df,
    key_columns=["ADM3_PCODE", "ADM4_PCODE", "date", "year"],
    agg_list=[("sum", "sum"), ("mean", "mean")],
)
health_facilities_doh_df.head(2)

Unnamed: 0,ADM3_PCODE,date,year,doh_pois_count_sum,doh_pois_count_mean,doh_brgy_health_station_count_sum,doh_brgy_health_station_count_mean,doh_brgy_health_station_nearest_sum,doh_brgy_health_station_nearest_mean,doh_rural_health_unit_count_sum,...,doh_drug_abuse_treatment_rehabilitation_center_nearest_sum,doh_drug_abuse_treatment_rehabilitation_center_nearest_mean,doh_social_hygiene_clinic_count_sum,doh_social_hygiene_clinic_count_mean,doh_social_hygiene_clinic_nearest_sum,doh_social_hygiene_clinic_nearest_mean,doh_medical_clinic_count_sum,doh_medical_clinic_count_mean,doh_medical_clinic_nearest_sum,doh_medical_clinic_nearest_mean
0,PH097332000,2022-01-01,2022,128.0,1.267327,82.0,0.811881,29709.913621,294.157561,16.0,...,1010000.0,10000.0,0.0,0.0,1010000.0,10000.0,0.0,0.0,1010000.0,10000.0


In [29]:
rwi_df = pd.read_csv(RWI)
rwi_df = rwi_df.drop(columns=["Unnamed: 0", "Year"])
rwi_df = add_year(rwi_df)
rwi_df = rwi_df[
    rwi_df["ADM4_PCODE"].isin(zambo_adm4_pcodes)
]  # filter to zamboanga barangays only
rwi_df.insert(0, "ADM3_PCODE", "PH097332000")
rwi_df = convert_to_city(
    rwi_df,
    key_columns=["ADM3_PCODE", "ADM4_PCODE", "date", "year"],
    agg_list=[("mean", "mean")],
)
rwi_df.head(2)

Unnamed: 0,ADM3_PCODE,date,year,RWI_max_mean,RWI_mean_mean,RWI_median_mean,RWI_min_mean,RWI_std_mean
0,PH097332000,2016-01-01,2016,0.495014,0.412015,0.410069,0.343276,0.04266
1,PH097332000,2017-01-01,2017,0.509977,0.432567,0.432628,0.35538,0.041584


In [30]:
hazards_df = pd.read_csv(HAZARDS)
hazards_df = add_year(hazards_df)
hazards_df = hazards_df[
    hazards_df["ADM4_PCODE"].isin(zambo_adm4_pcodes)
]  # filter to zamboanga barangays only
hazards_df.insert(0, "ADM3_PCODE", "PH097332000")
hazards_df = convert_to_city(
    hazards_df,
    key_columns=["ADM3_PCODE", "ADM4_PCODE", "date", "year"],
    agg_list=[("mean", "mean")],
)
hazards_df.head(2)

Unnamed: 0,ADM3_PCODE,date,year,pct_area_flood_hazard_100yr_low_mean,pct_area_flood_hazard_100yr_med_mean,pct_area_flood_hazard_100yr_high_mean,pct_area_flood_hazard_25yr_low_mean,pct_area_flood_hazard_25yr_med_mean,pct_area_flood_hazard_25yr_high_mean,pct_area_flood_hazard_5yr_low_mean,pct_area_flood_hazard_5yr_med_mean,pct_area_flood_hazard_5yr_high_mean,pct_area_landslide_hazard_low_mean,pct_area_landslide_hazard_med_mean,pct_area_landslide_hazard_high_mean
0,PH097332000,2021-11-01,2021,11.963866,10.18882,3.436566,7.516484,5.431349,1.395336,5.267785,2.931399,0.813011,6.161432,16.813583,6.15745


In [31]:
landcover_df = pd.read_csv(LANDCOVER)
landcover_df = add_year(landcover_df)
landcover_df = landcover_df[
    landcover_df["ADM4_PCODE"].isin(zambo_adm4_pcodes)
]  # filter to zamboanga barangays only
landcover_df.insert(0, "ADM3_PCODE", "PH097332000")
landcover_df = convert_to_city(
    landcover_df,
    key_columns=["ADM3_PCODE", "ADM4_PCODE", "date", "year"],
    agg_list=[("mean", "mean")],
)
landcover_df.head(2)

Unnamed: 0,ADM3_PCODE,date,year,pct_area_bare_sparse_vegetation_mean,pct_area_builtup_mean,pct_area_cropland_mean,pct_area_grassland_mean,pct_area_herbaceous_wetland_mean,pct_area_mangroves_mean,pct_area_permanent_water_bodies_mean,pct_area_shrubland_mean,pct_area_tree_cover_mean
0,PH097332000,2021-01-01,2021,0.574257,21.051881,2.781881,6.282871,0.067921,7.54604,13.458812,0.011287,64.952475


In [32]:
infra_df = pd.read_csv(BLDGS)
infra_df = infra_df.drop(columns=["Unnamed: 0"])
infra_df = add_year(infra_df)
infra_df = infra_df[
    infra_df["ADM4_PCODE"].isin(zambo_adm4_pcodes)
]  # filter to zamboanga barangays only
infra_df.insert(0, "ADM3_PCODE", "PH097332000")
infra_df = convert_to_city(
    infra_df,
    key_columns=["ADM3_PCODE", "ADM4_PCODE", "date", "year"],
    agg_list=[("sum", "sum"), ("mean", "mean")],
)
infra_df.head(2)

Unnamed: 0,ADM3_PCODE,date,year,google_bldgs_count_sum,google_bldgs_count_mean,google_bldgs_area_total_sum,google_bldgs_area_total_mean,google_bldgs_area_mean_sum,google_bldgs_area_mean_mean,google_bldgs_count_lt100_sqm_sum,google_bldgs_count_lt100_sqm_mean,google_bldgs_count_100_200_sqm_sum,google_bldgs_count_100_200_sqm_mean,google_bldgs_count_gt_200_sqm_sum,google_bldgs_count_gt_200_sqm_mean,google_bldgs_density_sum,google_bldgs_density_mean,google_bldgs_pct_built_up_area_sum,google_bldgs_pct_built_up_area_mean
0,PH097332000,2023-01-01,2023,307045,3040.049505,20637380.0,204330.512491,6560.34809,64.953941,258800,2562.376238,37023,366.564356,11222,111.108911,0.112676,0.001116,908.037478,8.99047


In [33]:
infra_df.columns

Index(['ADM3_PCODE', 'date', 'year', 'google_bldgs_count_sum',
       'google_bldgs_count_mean', 'google_bldgs_area_total_sum',
       'google_bldgs_area_total_mean', 'google_bldgs_area_mean_sum',
       'google_bldgs_area_mean_mean', 'google_bldgs_count_lt100_sqm_sum',
       'google_bldgs_count_lt100_sqm_mean',
       'google_bldgs_count_100_200_sqm_sum',
       'google_bldgs_count_100_200_sqm_mean',
       'google_bldgs_count_gt_200_sqm_sum',
       'google_bldgs_count_gt_200_sqm_mean', 'google_bldgs_density_sum',
       'google_bldgs_density_mean', 'google_bldgs_pct_built_up_area_sum',
       'google_bldgs_pct_built_up_area_mean'],
      dtype='object')

### Aggregate the isochrones into one table

In [34]:
hospital_reach_df = pd.read_csv(HOSPITAL_ISO)
hospital_reach_df.head(2)

Unnamed: 0,ADM4_PCODE,date,freq,travel_time,pop_reached_total,pop_reached_pct
0,PH015518016,2023-10-25,S,5,0.0,0.0
1,PH015518031,2023-10-25,S,5,8139.9,101.73


In [35]:
hc_reach_df = pd.read_csv(HEALTHCENTER_ISO)
hc_reach_df.head(2)

Unnamed: 0,ADM4_PCODE,date,freq,travel_time,pop_reached_total,pop_reached_pct
0,PH015518016,2023-10-25,S,5,112.67,10.75
1,PH015518031,2023-10-25,S,5,7959.48,99.48


In [36]:
rhu_reach_df = pd.read_csv(RHU_ISO)
rhu_reach_df.head(2)

Unnamed: 0,ADM4_PCODE,date,freq,travel_time,pop_reached_total,pop_reached_pct
0,PH015518016,2023-10-25,S,5,0.0,0.0
1,PH015518031,2023-10-25,S,5,751.56,9.39


In [37]:
# rename the accesibility to health care features
# to not overlap in the merging
hospital_reach_df = hospital_reach_df.rename(
    columns={
        "pop_reached_total": "hospital_pop_reached_total",
        "pop_reached_pct": "hospital_pop_reached_pct",
    }
)
hc_reach_df = hc_reach_df.rename(
    columns={
        "pop_reached_total": "healthcenter_pop_reached_total",
        "pop_reached_pct": "healthcenter_pop_reached_pct",
    }
)
rhu_reach_df = rhu_reach_df.rename(
    columns={
        "pop_reached_total": "rhu_pop_reached_total",
        "pop_reached_pct": "rhu_pop_reached_pct",
    }
)

merged_health_access = hospital_reach_df.merge(
    hc_reach_df, on=["ADM4_PCODE", "date", "freq", "travel_time"]
)
merged_health_access = merged_health_access.merge(
    rhu_reach_df, on=["ADM4_PCODE", "date", "freq", "travel_time"]
)
merged_health_access = add_year(merged_health_access)
merged_health_access.head(3)

Unnamed: 0,ADM4_PCODE,date,year,travel_time,hospital_pop_reached_total,hospital_pop_reached_pct,healthcenter_pop_reached_total,healthcenter_pop_reached_pct,rhu_pop_reached_total,rhu_pop_reached_pct
0,PH015518016,2023-10-25,2023,5,0.0,0.0,112.67,10.75,0.0,0.0
1,PH015518031,2023-10-25,2023,5,8139.9,101.73,7959.48,99.48,751.56,9.39
2,PH015518022,2023-10-25,2023,5,29539.28,94.61,10105.87,32.37,3544.0,11.35


In [38]:
merged_health_access = merged_health_access[
    merged_health_access["ADM4_PCODE"].isin(zambo_adm4_pcodes)
]  # filter to zamboanga barangays only
merged_health_access.insert(0, "ADM3_PCODE", "PH097332000")
merged_health_access = convert_to_city(
    merged_health_access,
    key_columns=["ADM3_PCODE", "ADM4_PCODE", "date", "year", "travel_time"],
    agg_list=[("mean", "mean")],
)

In [39]:
merged_health_access

Unnamed: 0,ADM3_PCODE,date,year,travel_time,hospital_pop_reached_total_mean,hospital_pop_reached_pct_mean,healthcenter_pop_reached_total_mean,healthcenter_pop_reached_pct_mean,rhu_pop_reached_total_mean,rhu_pop_reached_pct_mean
0,PH097332000,2023-10-25,2023,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
1,PH097332000,2023-10-25,2023,6,389.484455,2.727129,713.509703,5.400396,594.931287,4.134158
2,PH097332000,2023-10-25,2023,7,438.327822,2.564455,642.164851,4.409901,456.764059,3.186139
3,PH097332000,2023-10-25,2023,8,312.853762,1.916832,481.750891,3.231089,450.803168,3.241683
4,PH097332000,2023-10-25,2023,9,304.710594,1.858515,328.927525,2.433564,402.472475,3.082673
5,PH097332000,2023-10-25,2023,10,288.640099,1.403267,195.164455,1.813168,349.437723,2.948119
6,PH097332000,2023-10-25,2023,11,271.904653,1.357327,107.197426,1.240198,336.927129,2.677525
7,PH097332000,2023-10-25,2023,12,270.109208,1.437723,114.283663,1.351089,322.284059,2.923861
8,PH097332000,2023-10-25,2023,13,277.985347,1.391287,65.391881,0.949802,244.616337,2.535149
9,PH097332000,2023-10-25,2023,14,277.761782,1.602079,44.066832,0.934455,222.077129,2.409505


# Link to aggregated Zamboanga Dengue LGU dataset

In [63]:
dengue_df["start_of_week"] = pd.to_datetime(dengue_df["start_of_week"])

In [64]:
climate_merged_df_zambo = convert_to_city(
    climate_merged_df_zambo,
    key_columns=["start_of_week", "ADM4_PCODE"],
    agg_list=[("mean", "mean")],
)
climate_merged_df_zambo

Unnamed: 0,start_of_week,CO_AVG_mean,CO_MIN_mean,CO_MAX_mean,CO_STD_mean,HI_AVG_mean,HI_MIN_mean,HI_MAX_mean,HI_STD_mean,NDVI_AVG_mean,...,Tmin_MAX_mean,Tmin_STD_mean,UVR_AVG_mean,UVR_MIN_mean,UVR_MAX_mean,UVR_STD_mean,WS_AVG_mean,WS_MIN_mean,WS_MAX_mean,WS_STD_mean
0,2013-01-01,0.065419,0.057866,0.072049,0.005295,28.585627,27.438317,29.559703,0.919976,0.659133,...,25.974356,0.517428,22.979191,11.405446,28.207525,6.322490,1.873069,0.474257,3.638713,1.124530
1,2013-01-07,0.066417,0.063019,0.070317,0.002770,27.809293,26.113366,28.632772,0.839268,0.658186,...,25.593267,0.365843,20.151301,11.324752,27.758317,7.771338,2.447284,1.635050,3.159604,0.506528
2,2013-01-14,0.084637,0.064092,0.095125,0.010181,28.109491,27.631980,28.725644,0.374584,0.664271,...,25.360198,0.181474,25.650651,19.950891,29.248713,3.054714,3.431160,0.624653,5.090891,1.603288
3,2013-01-21,0.075221,0.069819,0.081210,0.003478,27.776846,27.050693,28.515050,0.483635,0.650514,...,25.447822,0.246577,24.182249,16.243267,28.522772,4.390766,2.892942,1.187624,3.660792,0.939929
4,2013-01-28,0.077248,0.069280,0.085297,0.006678,28.338359,26.994059,29.366337,0.866676,0.669600,...,25.983564,0.648475,28.419279,23.328614,32.218812,3.741702,2.301273,0.753069,3.359109,0.914786
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
525,2022-11-28,0.061614,0.060193,0.062938,0.001073,29.490099,28.581881,30.299208,0.648064,0.635486,...,26.134950,0.449235,27.561117,25.590198,30.275941,1.851226,1.085177,0.505446,1.563366,0.350588
526,2022-12-05,0.065590,0.060628,0.071329,0.003942,29.130849,28.347624,29.980396,0.546792,0.641329,...,26.250000,0.476887,24.908699,21.352178,28.927129,2.699851,2.167610,0.556436,4.311980,1.456936
527,2022-12-12,0.067248,0.060009,0.075689,0.006016,29.350905,28.250099,29.995347,0.553446,0.650500,...,26.246337,0.436345,26.271612,20.374653,28.690792,2.787037,1.957270,0.750495,3.237228,0.818074
528,2022-12-19,0.065165,0.058409,0.084636,0.009184,28.340750,27.088713,29.285644,0.752213,0.655229,...,25.973663,0.450674,21.391188,13.089505,26.428416,4.725523,1.750410,0.471782,3.462772,0.943893


In [65]:
climate_merged_df_zambo["start_of_week"] = pd.to_datetime(
    climate_merged_df_zambo["start_of_week"]
)

In [66]:
# join with climate variables first
health_climate_weekly_df = dengue_df.merge(
    climate_merged_df_zambo, on=["start_of_week"], how="left"
)
# add year column
health_climate_weekly_df.insert(
    1, "year", health_climate_weekly_df["start_of_week"].dt.year
)
health_climate_weekly_df.insert(1, "ADM3_PCODE", "PH097332000")
health_climate_weekly_df.head()

Unnamed: 0,start_of_week,ADM3_PCODE,year,NumCases,Age_min,Age_max,Age_mean,Age_median,Age_std,Female,...,Tmin_MAX_mean,Tmin_STD_mean,UVR_AVG_mean,UVR_MIN_mean,UVR_MAX_mean,UVR_STD_mean,WS_AVG_mean,WS_MIN_mean,WS_MAX_mean,WS_STD_mean
0,2013-01-01,PH097332000,2013,66,0.5,68.083333,17.587121,14.75,14.592766,28,...,25.974356,0.517428,22.979191,11.405446,28.207525,6.32249,1.873069,0.474257,3.638713,1.12453
1,2013-01-07,PH097332000,2013,77,0.416667,72.083333,15.054113,10.416667,15.442763,34,...,25.593267,0.365843,20.151301,11.324752,27.758317,7.771338,2.447284,1.63505,3.159604,0.506528
2,2013-01-14,PH097332000,2013,83,0.666667,45.0,13.447791,11.333333,10.454911,38,...,25.360198,0.181474,25.650651,19.950891,29.248713,3.054714,3.43116,0.624653,5.090891,1.603288
3,2013-01-21,PH097332000,2013,75,0.416667,49.583333,12.897778,10.25,9.506395,37,...,25.447822,0.246577,24.182249,16.243267,28.522772,4.390766,2.892942,1.187624,3.660792,0.939929
4,2013-01-28,PH097332000,2013,85,0.25,49.583333,12.851961,11.333333,9.896491,47,...,25.983564,0.648475,28.419279,23.328614,32.218812,3.741702,2.301273,0.753069,3.359109,0.914786


In [67]:
health_climate_weekly_df.shape

(475, 88)

In [68]:
# Dataframes that will be linked based on year + pcode
yearly_dfs_to_link = [
    city_osm,
    city_population,
    # ookla_df,
    ntl_df,
    # rwi_df,
]

linked_df = health_climate_weekly_df.copy()

# Merge dataframes one by one
for df in yearly_dfs_to_link:
    df = df.drop(columns=["date"])
    linked_df = pd.merge(linked_df, df, on=["ADM3_PCODE", "year"], how="left")

In [69]:
linked_df

Unnamed: 0,start_of_week,ADM3_PCODE,year,NumCases,Age_min,Age_max,Age_mean,Age_median,Age_std,Female,...,pop_density_stdev_mean,pop_density_min_sum,pop_density_min_mean,pop_density_max_sum,pop_density_max_mean,avg_rad_min_mean,avg_rad_max_mean,avg_rad_mean_mean,avg_rad_std_mean,avg_rad_median_mean
0,2013-01-01,PH097332000,2013,66,0.500000,68.083333,17.587121,14.750000,14.592766,28,...,927.724028,127708.338617,1358.599347,358893.841148,3818.019587,1.678676,5.612633,3.081904,1.173999,2.907438
1,2013-01-07,PH097332000,2013,77,0.416667,72.083333,15.054113,10.416667,15.442763,34,...,927.724028,127708.338617,1358.599347,358893.841148,3818.019587,1.678676,5.612633,3.081904,1.173999,2.907438
2,2013-01-14,PH097332000,2013,83,0.666667,45.000000,13.447791,11.333333,10.454911,38,...,927.724028,127708.338617,1358.599347,358893.841148,3818.019587,1.678676,5.612633,3.081904,1.173999,2.907438
3,2013-01-21,PH097332000,2013,75,0.416667,49.583333,12.897778,10.250000,9.506395,37,...,927.724028,127708.338617,1358.599347,358893.841148,3818.019587,1.678676,5.612633,3.081904,1.173999,2.907438
4,2013-01-28,PH097332000,2013,85,0.250000,49.583333,12.851961,11.333333,9.896491,47,...,927.724028,127708.338617,1358.599347,358893.841148,3818.019587,1.678676,5.612633,3.081904,1.173999,2.907438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
470,2021-11-29,PH097332000,2021,27,0.580000,79.000000,16.784074,12.000000,16.000713,11,...,,,,,,2.239734,7.009234,3.974466,1.448885,3.714241
471,2021-12-06,PH097332000,2021,26,0.580000,66.000000,15.176154,10.000000,15.970134,12,...,,,,,,2.239734,7.009234,3.974466,1.448885,3.714241
472,2021-12-13,PH097332000,2021,35,3.000000,32.000000,14.200000,12.000000,8.407979,17,...,,,,,,2.239734,7.009234,3.974466,1.448885,3.714241
473,2021-12-20,PH097332000,2021,27,0.500000,32.000000,10.388889,10.000000,7.766215,13,...,,,,,,2.239734,7.009234,3.974466,1.448885,3.714241


In [52]:
landcover_df

Unnamed: 0,ADM3_PCODE,date,year,pct_area_bare_sparse_vegetation_mean,pct_area_builtup_mean,pct_area_cropland_mean,pct_area_grassland_mean,pct_area_herbaceous_wetland_mean,pct_area_mangroves_mean,pct_area_permanent_water_bodies_mean,pct_area_shrubland_mean,pct_area_tree_cover_mean
0,PH097332000,2021-01-01,2021,0.574257,21.051881,2.781881,6.282871,0.067921,7.54604,13.458812,0.011287,64.952475


In [70]:
# for static variables just join by pcode
# will repeat throughout the dataset
static_dfs_to_link = [
    health_facilities_doh_df,
    hazards_df,
    landcover_df,
    infra_df,
    merged_health_access,
]

# Merge dataframes one by one
for df in static_dfs_to_link:
    df = df.drop(columns=["date", "year"])
    linked_df = pd.merge(linked_df, df, on=["ADM3_PCODE"], how="left")

linked_df.head()

Unnamed: 0,start_of_week,ADM3_PCODE,year,NumCases,Age_min,Age_max,Age_mean,Age_median,Age_std,Female,...,google_bldgs_density_mean,google_bldgs_pct_built_up_area_sum,google_bldgs_pct_built_up_area_mean,travel_time,hospital_pop_reached_total_mean,hospital_pop_reached_pct_mean,healthcenter_pop_reached_total_mean,healthcenter_pop_reached_pct_mean,rhu_pop_reached_total_mean,rhu_pop_reached_pct_mean
0,2013-01-01,PH097332000,2013,66,0.5,68.083333,17.587121,14.75,14.592766,28,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
1,2013-01-01,PH097332000,2013,66,0.5,68.083333,17.587121,14.75,14.592766,28,...,0.001116,908.037478,8.99047,6,389.484455,2.727129,713.509703,5.400396,594.931287,4.134158
2,2013-01-01,PH097332000,2013,66,0.5,68.083333,17.587121,14.75,14.592766,28,...,0.001116,908.037478,8.99047,7,438.327822,2.564455,642.164851,4.409901,456.764059,3.186139
3,2013-01-01,PH097332000,2013,66,0.5,68.083333,17.587121,14.75,14.592766,28,...,0.001116,908.037478,8.99047,8,312.853762,1.916832,481.750891,3.231089,450.803168,3.241683
4,2013-01-01,PH097332000,2013,66,0.5,68.083333,17.587121,14.75,14.592766,28,...,0.001116,908.037478,8.99047,9,304.710594,1.858515,328.927525,2.433564,402.472475,3.082673


In [71]:
# check row count for duplicates
linked_df.shape

(12350, 281)

In [72]:
# drop duplicates
linked_df = linked_df.drop_duplicates(subset=["start_of_week"])
linked_df

Unnamed: 0,start_of_week,ADM3_PCODE,year,NumCases,Age_min,Age_max,Age_mean,Age_median,Age_std,Female,...,google_bldgs_density_mean,google_bldgs_pct_built_up_area_sum,google_bldgs_pct_built_up_area_mean,travel_time,hospital_pop_reached_total_mean,hospital_pop_reached_pct_mean,healthcenter_pop_reached_total_mean,healthcenter_pop_reached_pct_mean,rhu_pop_reached_total_mean,rhu_pop_reached_pct_mean
0,2013-01-01,PH097332000,2013,66,0.500000,68.083333,17.587121,14.750000,14.592766,28,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
26,2013-01-07,PH097332000,2013,77,0.416667,72.083333,15.054113,10.416667,15.442763,34,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
52,2013-01-14,PH097332000,2013,83,0.666667,45.000000,13.447791,11.333333,10.454911,38,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
78,2013-01-21,PH097332000,2013,75,0.416667,49.583333,12.897778,10.250000,9.506395,37,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
104,2013-01-28,PH097332000,2013,85,0.250000,49.583333,12.851961,11.333333,9.896491,47,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12220,2021-11-29,PH097332000,2021,27,0.580000,79.000000,16.784074,12.000000,16.000713,11,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
12246,2021-12-06,PH097332000,2021,26,0.580000,66.000000,15.176154,10.000000,15.970134,12,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
12272,2021-12-13,PH097332000,2021,35,3.000000,32.000000,14.200000,12.000000,8.407979,17,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
12298,2021-12-20,PH097332000,2021,27,0.500000,32.000000,10.388889,10.000000,7.766215,13,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584


In [73]:
linked_df.to_csv(PROCESSED_DIR / "linked_df_city_weekly_dengue.csv", index=False)