# EDA on linked dataset and baseline model results
This notebook generates charts that analyze key trends and patterns in the linked dataset, as well as the results of the baseline outbreak model.

## Imports and set-up

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
from pathlib import Path

import geopandas as gpd
import matplotlib.patheffects as path_effects
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import cm
from shapely import wkt

In [None]:
import sys

sys.path.append("../../")
from src.settings import DATA_DIR
from src.linked_data_viz import *

## Set directories

Access input files in [this folder](https://drive.google.com/drive/u/0/folders/1JRpfEZS17gBlh84RcCwVppsaNqCyV1bb)

- For informal settlement data, public users are asked to submit a request [here](https://forms.gle/vwgjJ32hqHCWMpTBA) and wait for approval. 
- For TM and other members of the project team, kindly inform the repo owner.

In [None]:
# Input file locations. Every path below is built from DATA_DIR (imported from
# src.settings) and sits under its "linked_data" folder.
LINKED_DATA_FPATH = DATA_DIR / "linked_data"

# city-level geography + health table (read in the "Top CDs cases" section)
GEOG_HEALTH = LINKED_DATA_FPATH / "processed/geog_health_city_merged.csv"

# cases with outbreak tags
LABELED_CASES_DIR = LINKED_DATA_FPATH / "tagged_outbreaks"
DAGUPAN_DENGUE = LABELED_CASES_DIR / "dagupan_dengue_city_merged_w_outbreak_tag.csv"
DAGUPAN_ROTAVIRUS = (
    LABELED_CASES_DIR / "dagupan_rotavirus_city_merged_w_outbreak_tag.csv"
)

# informal settlements (PACSII community surveys, two file versions)
PACSII_FPATH = LINKED_DATA_FPATH / "pacsii/informal_settlement_community_surveys_v1.csv"
PACSII_FPATH_V2 = (
    LINKED_DATA_FPATH / "pacsii/informal_settlement_community_surveys_v2.csv"
)

# muntinlupa consolidated table (climate + environment)
LINKED_MUNTINLUPA = LINKED_DATA_FPATH / "processed/linked_muntinlupa.csv"

# cleaned up muntinlupa lgu disease data (four files, concatenated later)
MUNTI1_FPATH = LINKED_DATA_FPATH / "processed/lgu-level/Muntinlupa_1.csv"
MUNTI2_FPATH = LINKED_DATA_FPATH / "processed/lgu-level/Muntinlupa_2.csv"
MUNTI3_FPATH = LINKED_DATA_FPATH / "processed/lgu-level/Muntinlupa_3.csv"
MUNTI4_FPATH = LINKED_DATA_FPATH / "processed/lgu-level/Muntinlupa_4.csv"

# location and geom tables, all keyed by barangay code (adm4_pcode) in the merges below
LINKED_DATA_TABLES_DIR = LINKED_DATA_FPATH / "tables"
LOC_FPATH = LINKED_DATA_TABLES_DIR / "location.csv"
GEOG_FPATH = LINKED_DATA_TABLES_DIR / "brgy_geography.csv"
HEALTH_ISOCHRONES = (
    LINKED_DATA_TABLES_DIR / "mapbox_health_facility_brgy_isochrones.csv"
)
POPULATION = LINKED_DATA_TABLES_DIR / "worldpop_population.csv"
RWI = LINKED_DATA_TABLES_DIR / "tm_relative_wealth_index.csv"

## Read tables

In [None]:
# PACSII informal-settlement community survey (v1)
pacsii_v1 = pd.read_csv(PACSII_FPATH)

# consolidated Muntinlupa table produced by the linked dataset notebook
muntinlupa_linked = pd.read_csv(LINKED_MUNTINLUPA)

# Dagupan dengue cases with outbreak tags, produced by the outbreak model notebook
dagupan_dengue = pd.read_csv(DAGUPAN_DENGUE)

# the four cleaned-up Muntinlupa LGU disease files, loaded in file order
muntinlupa1, muntinlupa2, muntinlupa3, muntinlupa4 = [
    pd.read_csv(lgu_fpath)
    for lgu_fpath in (MUNTI1_FPATH, MUNTI2_FPATH, MUNTI3_FPATH, MUNTI4_FPATH)
]

## Prepare location tables with geometries

In [None]:
# Load the barangay geography table and discard the columns not needed here
brgy_geography = pd.read_csv(GEOG_FPATH).drop(
    columns=["Unnamed: 0", "freq", "date", "uuid", "brgy_total_area"]
)

# the geometry column is stored as WKT text; parse each value into a shapely object
brgy_geography["geometry"] = brgy_geography["geometry"].map(wkt.loads)

# wrap as a GeoDataFrame with `geometry` as the active geometry column
brgy_gdf = gpd.GeoDataFrame(brgy_geography, geometry="geometry")
brgy_gdf.head(3)

In [None]:
# Load the full admin boundary table without its CSV index and uuid columns
loc_df = pd.read_csv(LOC_FPATH).drop(columns=["Unnamed: 0", "uuid"])
loc_df.head(3)

In [None]:
# Left-join the barangay geometries onto every row of the location table
# (key: adm4_pcode) and re-wrap the result as a GeoDataFrame.
# NOTE: brgy_gdf is overwritten here, so running this cell twice merges twice.
brgy_gdf = gpd.GeoDataFrame(
    pd.merge(loc_df, brgy_gdf, how="left", on="adm4_pcode")
)
brgy_gdf.head(2)

# General analysis for 12 lacuna cities

## Top CD cases / NCD deaths

In [None]:
# Load the city-level geography + health table, parse its dates, and keep
# only the rows dated 2020 or later
geog_health_df = (
    pd.read_csv(GEOG_HEALTH)
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .loc[lambda frame: frame["date"].dt.year >= 2020]
)
geog_health_df.head(3)

In [None]:
# area, date and coastal descriptor columns of the city-level table
AREA_DATE_COLS = [
    "adm3_en",
    "adm3_pcode",
    "date",
    "year",
    "week",
    "brgy_distance_to_coast",
    "brgy_is_coastal",
]

# disease columns are picked up by name prefix: "case..." and "death..."
CASE_COLS = [col for col in geog_health_df.columns if col.startswith("case")]
DEATH_COLS = [col for col in geog_health_df.columns if col.startswith("death")]

In [None]:
# Bar chart over all "case*" columns, labelled as category "CD".
# NOTE: geog_health_df was filtered to years >= 2020 when it was loaded, so this
# covers 2020 onwards rather than the full 2003-2022 history.
plot_diseases_bar(geog_health_df, cols_list=CASE_COLS, category="CD")

In [None]:
# Bar chart over all "death*" columns, labelled as category "NCD".
# NOTE: geog_health_df was filtered to years >= 2020 when it was loaded, so this
# covers 2020 onwards rather than the full 2003-2022 history.
plot_diseases_bar(geog_health_df, cols_list=DEATH_COLS, category="NCD")

In [None]:
# Draw the CD (case columns) and NCD (death columns) bar charts once per city
city_names = geog_health_df["adm3_en"].unique().tolist()
for city in city_names:
    print(f"Plot overall diseases count for {city}")
    city_df = geog_health_df.loc[geog_health_df["adm3_en"] == city]
    for cols_list, category in ((CASE_COLS, "CD"), (DEATH_COLS, "NCD")):
        plot_diseases_bar(city_df, cols_list=cols_list, category=category, city=city)

In [None]:
# Sum of every "case*" column per city (adm3_en), over the 2020+ rows kept at load time
geog_health_df.groupby("adm3_en")[CASE_COLS].sum()

In [None]:
# Sum of every "death*" column per city (adm3_en), over the 2020+ rows kept at load time
geog_health_df.groupby("adm3_en")[DEATH_COLS].sum()

## Health Access

In [None]:
# Load the barangay-level health facility isochrone table and drop the
# columns that are not used in this section
brgy_iso = pd.read_csv(HEALTH_ISOCHRONES)
brgy_iso = brgy_iso.drop(columns=["Unnamed: 0", "uuid", "date", "freq"])

# keep only the barangay code plus every column whose name contains "_pct"
# (uses the `in` operator instead of calling the __contains__ dunder directly)
pct_cols = [col for col in brgy_iso.columns if "_pct" in col]
brgy_iso = brgy_iso[["adm4_pcode"] + pct_cols]
brgy_iso.head(3)

In [None]:
# Attach city name and geometry to each barangay row (left join on adm4_pcode),
# then wrap as a GeoDataFrame so the table can be mapped.
# NOTE: brgy_iso is overwritten, so running this cell twice merges twice.
brgy_iso = gpd.GeoDataFrame(
    pd.merge(
        brgy_iso,
        brgy_gdf[["adm3_en", "adm4_pcode", "geometry"]],
        on="adm4_pcode",
        how="left",
    )
)

In [None]:
health_facility_type = ["brgy_healthcenter", "hospital", "rhu"]

# group the brgy_iso columns by their travel-time suffix
pct_5min, pct_15min, pct_30min = [
    [col for col in brgy_iso.columns if col.endswith(suffix)]
    for suffix in ("_5min", "_15min", "_30min")
]

In [None]:
cities = brgy_gdf["adm3_en"].unique().tolist()

# only Iloilo City and Davao City get a 5-minute health access plot
for city in cities:
    if city not in ("Iloilo City", "Davao City"):
        continue
    plot_health_access(brgy_iso, city=city, pct_cols=pct_5min, save_df=True)

In [None]:
# Choropleth of the `rhu_pop_reached_pct_5min` column of brgy_iso.
# The colour limits are pinned to 0-100, the natural range of a percentage.
plot_choropleth_all_cities(
    brgy_iso,
    "rhu_pop_reached_pct_5min",
    vmin=0,
    vmax=100,
    cmap="viridis",
    label="% population reached in 5 mins \n for RHUs",
)

## Population Density

In [None]:
# Load the population table, drop the CSV index / uuid columns, and keep
# only the barangay code, date and the two population count columns
population_df = pd.read_csv(POPULATION).drop(columns=["Unnamed: 0", "uuid"])[
    ["adm4_pcode", "date", "pop_count_total", "pop_count_mean"]
]
population_df.head()

In [None]:
# Parse the dates with .assign (returns a new frame instead of writing a
# column onto the column-subset frame created in the previous cell)
population_df = population_df.assign(date=pd.to_datetime(population_df["date"]))

# keep the 2020 rows only, then attach city name and geometry per barangay
population_df = population_df[population_df["date"].dt.year == 2020]
population_df = population_df.merge(
    brgy_gdf[["adm3_en", "adm4_pcode", "geometry"]], on="adm4_pcode", how="left"
)
population_gdf = gpd.GeoDataFrame(population_df)

# preview a few rows instead of displaying the whole GeoDataFrame,
# consistent with the other cells in this notebook
population_gdf.head()

In [None]:
# Choropleth of `pop_count_mean` (2020 rows only, per the filter applied above).
# NOTE(review): vmax=None presumably lets the upper colour limit follow the
# data - confirm in plot_choropleth_all_cities.
plot_choropleth_all_cities(population_gdf, "pop_count_mean", vmin=0, vmax=None)

## Wealth Index

In [None]:
# Load the relative wealth index table without its index / uuid / date / freq columns
rwi_df = pd.read_csv(RWI).drop(columns=["Unnamed: 0", "uuid", "date", "freq"])
rwi_df.head()

In [None]:
# Attach city name and geometry to each barangay row (left join on adm4_pcode).
# NOTE: rwi_df is overwritten, so running this cell twice merges twice.
rwi_df = pd.merge(
    rwi_df,
    brgy_gdf[["adm3_en", "adm4_pcode", "geometry"]],
    on="adm4_pcode",
    how="left",
)
rwi_gdf = gpd.GeoDataFrame(rwi_df)

In [None]:
# Choropleth of `rwi_mean` with the colour limits pinned to 0-1.
# NOTE(review): confirm rwi_mean is scaled to the 0-1 range; values outside it
# would all be drawn at the ends of the colour scale.
plot_choropleth_all_cities(
    rwi_gdf, "rwi_mean", vmin=0, vmax=1, cmap="viridis", label="RWI"
)

# PACSII Surveys

In [None]:
# informal settlements
# pacsii_v1 is read again here, so this section starts from a fresh copy of the v1 file
pacsii_v1 = pd.read_csv(PACSII_FPATH)
pacsii_v2 = pd.read_csv(PACSII_FPATH_V2)
# print the full column summary of the v2 file (verbose=True lists every column)
pacsii_v2.info(verbose=True)

In [None]:
# Drop the CSV index column. Reassigning (instead of inplace=True) together
# with errors="ignore" lets this cell be re-run without a KeyError once the
# column is already gone.
pacsii_v1 = pacsii_v1.drop(columns=["Unnamed: 0"], errors="ignore")
location = loc_df[["adm3_en", "adm4_pcode"]]

# inner-join on barangay code to bring in the city name (adm3_en);
# survey rows whose adm4_pcode is missing from the location table are dropped
merged_df = pd.merge(location, pacsii_v1, on="adm4_pcode")

# put the two identifier columns first, keep the rest in their existing order
id_cols = ["adm3_en", "adm4_pcode"]
merged_df = merged_df[
    id_cols + [col for col in merged_df.columns if col not in id_cols]
]
merged_df.head(3)

## Muntinlupa vs. Iloilo

Use `batch_df` so that dataframe is already filtered to `batches_of_interest`.

In [None]:
# survey batches compared in this section
batches_of_interest = [
    "muntinlupa_2015",
    "muntinlupa_2016",
    "iloilo_2020",
    "iloilo_2021",
    "iloilo_2022",
]

# keep only the survey rows that belong to one of those batches
batch_df = merged_df.loc[merged_df["batch"].isin(batches_of_interest)]

# print how many rows each (batch, barangay) pair contributes
rows_per_batch_and_adm4_pcode = batch_df.groupby(["batch", "adm4_pcode"]).size()
print(rows_per_batch_and_adm4_pcode.to_string())

### Water Supply

In [None]:
# Replace missing water supply types with an explicit "Unknown" category.
# The column is assigned directly: `.loc["water_supply_type_1"] = ...` would
# address a *row* labelled "water_supply_type_1" and leave the column's nulls
# untouched.
batch_df_edit = batch_df.copy()
batch_df_edit["water_supply_type_1"] = batch_df_edit["water_supply_type_1"].fillna(
    "Unknown"
)

plot_pacsii_batch_data(
    batch_df_edit,
    var_to_plot="water_supply_type_1",
    plot_title="Primary Water Supply Type",
    legend_title="Water Supply Type",
)

### Toilets

In [None]:
# Recode the toilet flag for plotting: nulls become "Unknown", 0 -> "No", 1 -> "Yes".
# .assign returns a modified copy, so batch_df itself is left untouched.
batch_df_edit = batch_df.assign(
    has_toilets_functioning=batch_df["has_toilets_functioning"]
    .fillna("Unknown")
    .replace({0: "No", 1: "Yes"})
)

plot_pacsii_batch_data(
    batch_df_edit,
    var_to_plot="has_toilets_functioning",
    plot_title="Functioning Toilets Inside the House",
    legend_title="Functioning Toilet",
    bbox_anchor=(0.60, -0.1),
)

### Income Bracket

In [None]:
# Raw bracket text -> display label. The numeric prefix makes the labels sort
# from lowest to highest income.
income_bracket_labels = {
    "Below 10,957": "00 Poor: Below PHP 10,957",
    "10,957-21,914": "01 Low income: PHP 10,957-21,914",
    "21,914-43,828": "02 Lower middle income: PHP 21,914-43,828",
    "43,828-76,699": "03 Middle middle income: PHP 43,828-76,699",
    "76,699-131,483": "04 Upper middle income: PHP 76,699-131,483",
    "131,483-219,140": "05 Upper income: PHP 131,483-219,140",
    "Above 219,140": "06 Rich: Above 219,140",
}

# relabel on a copy, then mark missing brackets as "Unknown"
batch_df_edit = batch_df.copy()
batch_df_edit["income_bracket"] = (
    batch_df_edit["income_bracket"].replace(income_bracket_labels).fillna("Unknown")
)

plot_pacsii_batch_data(
    batch_df_edit,
    var_to_plot="income_bracket",
    plot_title="Income Bracket",
    legend_title="Income Bracket",
    bbox_anchor=(0.8, -0.1),
)

## Davao – Matina Aplaya

### Toilets

In [None]:
# Work on a copy so the plotting helper cannot alter merged_df
dataframe = merged_df.copy()

# Stacked area comparison of the toilet flag for barangay PH112402074
# (presumably Matina Aplaya, per the section heading - confirm the pcode)
plot_pacsii_stacked_area_comparison(
    dataframe,
    variable_of_interest="has_toilets_functioning",
    barangays_of_interest=["PH112402074"],
    title_label="Functioning Toilets Inside the House",
    legend_label="Functioning toilet",
)

### Water Supply

In [None]:
# Work on a copy so the plotting helper cannot alter merged_df
dataframe = merged_df.copy()

# Stacked area comparison of the primary water supply type for barangay
# PH112402074 (presumably Matina Aplaya, per the section heading - confirm the pcode)
plot_pacsii_stacked_area_comparison(
    dataframe,
    variable_of_interest="water_supply_type_1",
    barangays_of_interest=["PH112402074"],
    title_label="Primary Water Supply Type Over Time",
    legend_label="Water Supply Type",
)

### Income bracket

In [None]:
# Work on a copy so the plotting helper cannot alter merged_df
dataframe = merged_df.copy()

# Stacked area comparison of the income bracket for barangay PH112402074
# (presumably Matina Aplaya, per the section heading - confirm the pcode)
plot_pacsii_stacked_area_comparison(
    dataframe,
    variable_of_interest="income_bracket",
    barangays_of_interest=["PH112402074"],
    title_label="Income Over Time",
    legend_label="Income bracket",
)

## All sites

### Family members

In [None]:
# family-size labels, one per bin used by pd.cut below
family_size_labels = [
    "0_solo",
    "1_small family (2-4 members)",
    "2_medium family (5-6 members)",
    "3_large family (7-10 members)",
    "4_very large family (>10 members)",
]

# ordered categorical type: the five size labels plus a slot for missing values
categories = pd.CategoricalDtype(
    categories=family_size_labels + ["5_unknown"],
    ordered=True,
)

# create dataframe copy
batch_df_edit = merged_df.copy()

# bin the household size (right-inclusive bins: <=1, 2-4, 5-6, 7-10, >10), cast
# to the categorical type above so that "5_unknown" is a valid fill value, then
# label the rows with no household size
batch_df_edit["family_type"] = (
    pd.cut(
        batch_df_edit["n_family_members"],
        bins=[-np.inf, 1, 4, 6, 10, np.inf],
        labels=family_size_labels,
    )
    .astype(categories)
    .fillna("5_unknown")
)

plot_all_pacsii_sites(
    batch_df_edit,
    variable_of_interest="family_type",
    title_label="Family Members",
    legend_label="# of family members",
    bbox_anchor=(1.43, 1),
)

### Occupation

In [None]:
# plot settings
variable_of_interest = "occupation1"
title_label = "Primary Occupation"
legend_label = "Occupation"
dataframe = merged_df

# raw occupation text -> zero-padded, lower-case label
occupation_mapping = {
    "0 Armed forces occupations": "00_armed forces occupations",
    "1 Managers": "01_managers",
    "2 Professionals": "02_professionals",
    "3 Technicians and associate professionals": "03_technicians and associate professionals",
    "4 Clerical support workers": "04_clerical support workers",
    "5 Service and sales workers": "05_service and sales workers",
    "6 Skilled agricultural, forestry and fishery workers": "06_skilled agricultural, forestry, and fishery workers",
    "7 Craft and related trades workers": "07_craft and related trades workers",
    "8 Plant and machine operators and assemblers": "08_plant and machine operators and assemblers",
    "9 Elementary occupations": "09_elementary occupations",
}

# relabel on a copy; values missing from the mapping (including nulls)
# end up as "10_unknown"
batch_df_edit = merged_df.copy()
batch_df_edit[variable_of_interest] = (
    batch_df_edit[variable_of_interest].map(occupation_mapping).fillna("10_unknown")
)

plot_all_pacsii_sites(
    batch_df_edit,
    variable_of_interest=variable_of_interest,
    title_label=title_label,
    legend_label=legend_label,
    bbox_anchor=(1.55, 1),
    fig_text_x=0.92,
    fig_text_y=0.5,
)

In [None]:
# plot settings
variable_of_interest = "occupation1"
city = ["City of Muntinlupa", "Iloilo City", "Davao City", "Mandaue City"]

# raw occupation text -> lower-case label without the numeric prefix
occupation_mapping = {
    "0 Armed forces occupations": "armed forces occupations",
    "1 Managers": "managers",
    "2 Professionals": "professionals",
    "3 Technicians and associate professionals": "technicians and associate professionals",
    "4 Clerical support workers": "clerical support workers",
    "5 Service and sales workers": "service and sales workers",
    "6 Skilled agricultural, forestry and fishery workers": "skilled agricultural, forestry, and fishery workers",
    "7 Craft and related trades workers": "craft and related trades workers",
    "8 Plant and machine operators and assemblers": "plant and machine operators and assemblers",
    "9 Elementary occupations": "elementary occupations",
}

# relabelled copy of merged_df; values missing from the mapping become NaN
# (no "unknown" placeholder is filled in here)
dataframe = merged_df.assign(
    **{variable_of_interest: merged_df[variable_of_interest].map(occupation_mapping)}
)

plot_employment_treemap(dataframe, variable_of_interest)

### Vulnerable subpopulations

In [None]:
# NOTE: `dataframe` is whatever an earlier cell last assigned to that name
df = dataframe

# males vs females
plot_avg_males_females(df, "viridis")

# (survey column, display label) for the other vulnerable groups
columns_labels = [
    ("n_family_members_student", "Students"),
    ("n_family_members_sc", "Senior Citizens"),
    ("n_family_members_pwd", "Persons with disabilities"),
]

# one result per group, keyed by display label, combined into a single table
avg_percentages = {
    label: calculate_and_plot_percentage(df, column, label)
    for column, label in columns_labels
}
avg_combined = pd.DataFrame(avg_percentages)

colors = get_colorscheme(len(avg_combined.columns))

# grouped bar chart of the combined table
fig, ax = plt.subplots(figsize=(12, 6))
avg_combined.plot(kind="bar", color=colors, ax=ax)
ax.set_title("Average % of Households with Vulnerable Populations")
ax.set_xlabel("")
ax.set_ylabel("Percentage")
ax.legend(title="Categories", labels=avg_combined.columns)

# write the value inside every bar taller than 0.99, with a white outline so
# the text stays readable on any bar colour
for bar in ax.patches:
    bar_height = bar.get_height()
    if not bar_height > 0.99:
        continue
    left, bottom = bar.get_xy()
    value_label = ax.annotate(
        f"{bar_height:.1f}%",
        (left + bar.get_width() / 2, bottom + bar_height / 2),
        ha="center",
        va="center",
        color="black",
        size=7,
    )
    value_label.set_path_effects(
        [
            path_effects.Stroke(linewidth=3, foreground="white"),
            path_effects.Normal(),
        ]
    )

plt.xticks(rotation=0)
plt.show()

### Income bracket

In [None]:
# plot settings
variable_of_interest = "income_bracket"
title_label = "Income Bracket"
legend_label = "Income Bracket"

# raw bracket text -> display label; the numeric prefix makes the labels sort
# from lowest to highest income
income_bracket_labels = {
    "Below 10,957": "00 Poor: Below PHP 10,957",
    "10,957-21,914": "01 Low income: PHP 10,957-21,914",
    "21,914-43,828": "02 Lower middle income: PHP 21,914-43,828",
    "43,828-76,699": "03 Middle middle income: PHP 43,828-76,699",
    "76,699-131,483": "04 Upper middle income: PHP 76,699-131,483",
    "131,483-219,140": "05 Upper income: PHP 131,483-219,140",
    "Above 219,140": "06 Rich: Above 219,140",
}

# relabel on a copy, then mark missing brackets as "Unknown"
batch_df_edit = merged_df.copy()
batch_df_edit[variable_of_interest] = (
    batch_df_edit[variable_of_interest]
    .replace(income_bracket_labels)
    .fillna("Unknown")
)

plot_all_pacsii_sites(
    batch_df_edit,
    variable_of_interest=variable_of_interest,
    title_label=title_label,
    legend_label=legend_label,
    bbox_anchor=(1.55, 1),
    fig_text_x=0.93,
    fig_text_y=0.6,
)

### Power source

In [None]:
# plot settings
variable_of_interest = "power_source_type"
title_label = "Power source"
legend_label = "Power source"
dataframe = merged_df

# copy of the survey table with missing power sources labelled "Unknown"
batch_df_edit = dataframe.assign(
    **{variable_of_interest: dataframe[variable_of_interest].fillna("Unknown")}
)

plot_all_pacsii_sites(
    batch_df_edit,
    variable_of_interest=variable_of_interest,
    title_label=title_label,
    legend_label=legend_label,
    bbox_anchor=(1.55, 1),
    fig_text_x=0.93,
    fig_text_y=0.65,
)

### Water supply

In [None]:
# plot settings
variable_of_interest = "water_supply_type_1"
title_label = "Primary Water Supply Type"
legend_label = "Water Supply Type"

# copy of the survey table with missing water supply types labelled "Unknown"
batch_df_edit = merged_df.assign(
    **{variable_of_interest: merged_df[variable_of_interest].fillna("Unknown")}
)

plot_all_pacsii_sites(
    batch_df_edit,
    variable_of_interest=variable_of_interest,
    title_label=title_label,
    legend_label=legend_label,
    bbox_anchor=(1.45, 1),
    fig_text_x=0.93,
    fig_text_y=0.6,
)

### Toilets

In [None]:
# plot settings
variable_of_interest = "has_toilets_functioning"
title_label = "Toilet Access"
legend_label = "Toilet Present in Structure"
dataframe = merged_df

# Recode the toilet flag on a copy: nulls become "Unknown", 0 -> "No", 1 -> "Yes".
# A single fillna is enough; the second fillna("Unknown") that used to follow
# was a no-op because no nulls remain after this step.
batch_df_edit = merged_df.copy()
batch_df_edit[variable_of_interest] = (
    batch_df_edit[variable_of_interest]
    .fillna("Unknown")
    .replace({0: "No", 1: "Yes"})
)

plot_all_pacsii_sites(
    batch_df_edit,
    variable_of_interest="has_toilets_functioning",
    title_label="Toilet Access",
    legend_label="Toilet Present in Structure",
    bbox_anchor=(1.33, 1),
    fig_text_x=0.92,
    fig_text_y=0.73,
)

# Outbreaks

In [None]:
# labels for the Dagupan outbreak charts
title = "Dagupan"
location = "Dagupan"

# copy of the tagged dengue cases with parsed dates, limited to 2014-2023
# (both years inclusive)
dengue_copy = dagupan_dengue.assign(date=pd.to_datetime(dagupan_dengue["date"]))
dengue_copy = dengue_copy[dengue_copy["date"].dt.year.between(2014, 2023)]

# detect outbreak periods from the "outbreak" column and keep the five with
# the largest actual_length_weeks
dengue_outbreak = detect_outbreak_periods(dengue_copy, "outbreak").nlargest(
    5, "actual_length_weeks"
)
dengue_outbreak

In [None]:
# viridis colormap; calling it with a number in [0, 1] returns a colour
colors = cm.viridis

# keyword arguments for the dengue-vs-precipitation chart
args = dict(
    dataframe=dengue_copy,
    outbreak_markers=dengue_outbreak,
    casetype="case_total_dengue",
    axis1_label="Dengue Cases",
    axis1_color=colors(0.2),
    variable_of_interest="pr",
    axis2_label="Precipitation (mm)",
    axis2_color="cadetblue",  # used in place of colors(0.5)
    outbreak_color=colors(0.9),
    major_outbreak_color=colors(0.7),
    title="Dagupan",
)

plot_outbreaks_precip(**args)

In [None]:
# Keyword arguments for the climate subplots of Dagupan dengue cases.
# NOTE: `colors`, `dengue_copy` and `dengue_outbreak` come from the cells above.
args = {
    "dataframe": dengue_copy,
    "outbreak_markers": dengue_outbreak,
    "disease": "Dengue",
    "casetype": "case_total_dengue",
    "case_color": colors(0.2),
    "major_outbreak_color": colors(0.7),
    "location": "Dagupan",
}
plot_all_clim_subplots(**args)

## Linking PACSII socioeconomic data to other variables in Muntinlupa

In [None]:
# Aggregate the PACSII survey rows in batch_df (the Muntinlupa/Iloilo batches
# selected earlier); the result is merged on adm4_pcode in the cells below
muntinlupa_socioecon = agg_munti_socioecon(batch_df)

In [None]:
# Stack the four cleaned LGU files into one table and parse the dates
muntinlupa_health = pd.concat(
    [muntinlupa1, muntinlupa2, muntinlupa3, muntinlupa4], ignore_index=True
)
muntinlupa_health["date"] = pd.to_datetime(muntinlupa_health["date"], format="%Y-%m-%d")

# filter settings
start_date = "2011-01-01"
end_date = "2017-12-31"
disease = "typhoid fever"
disease_simple_name = "typhoid"

# keep the rows inside the date window (both ends inclusive)
muntinlupa_health = muntinlupa_health[
    muntinlupa_health["date"].between(start_date, end_date)
]

# For the chosen disease: drop the descriptive columns, sum what remains per
# barangay, and rename the case total to "<disease_simple_name>_cases"
muntinlupa_disease = (
    muntinlupa_health[muntinlupa_health["disease_standard_name"] == disease]
    .drop(
        columns=[
            "freq",
            "source_name",
            "disease_standard_code",
            "sex",
            "age_group",
            "date",
        ]
    )
    .groupby(["adm4_pcode", "disease_standard_name"])
    .sum()
    .reset_index()
    .rename(columns={"case_total": disease_simple_name + "_cases"})
    .drop(columns=["disease_standard_name"])
)

In [None]:
# Left-join the socioeconomic aggregates and the disease counts onto the
# linked Muntinlupa table, one table at a time, keyed by barangay code
dfs = [muntinlupa_linked, muntinlupa_socioecon, muntinlupa_disease]
muntinlupa_combined = dfs[0]
for extra_table in dfs[1:]:
    muntinlupa_combined = pd.merge(
        muntinlupa_combined, extra_table, on="adm4_pcode", how="left"
    )

# Geometry lookup without the name columns. The later filter on "adm4_en"
# therefore relies on that column coming from the linked table.
geo_info_copy = brgy_gdf.drop(columns=["adm3_en", "adm4_en"])

# attach the geometry and drop the join key once it is no longer needed
muntinlupa = gpd.GeoDataFrame(
    pd.merge(muntinlupa_combined, geo_info_copy, on="adm4_pcode", how="left")
).drop(columns=["adm4_pcode"])

# Exclude New Alabang Village. The explicit .copy() makes the filtered frame
# independent, so the column assignment below does not write to a slice
# (avoids pandas' SettingWithCopyWarning).
muntinlupa = muntinlupa[muntinlupa["adm4_en"] != "New Alabang Village"].copy()

# typhoid cases as a percentage of the barangay population
muntinlupa["typhoid_cases_per_population"] = (
    muntinlupa["typhoid_cases"] / muntinlupa["pop_count_total"]
) * 100
muntinlupa.info()

## Generate choropleth maps for all other socioeconomic features

In [None]:
def _label_barangays(ax, gdf):
    """Write each row's `adm4_en` at its geometry centroid, black text with a white outline."""
    for _, row in gdf.iterrows():
        centroid = row.geometry.centroid
        label = ax.annotate(
            text=row["adm4_en"],
            xy=(centroid.x, centroid.y),
            color="black",
        )
        label.set_path_effects(
            [
                path_effects.Stroke(linewidth=3, foreground="white"),
                path_effects.Normal(),
            ]
        )


# columns that are identifiers, geometry, or otherwise not to be mapped
exc_columns = [
    "adm1_en",
    "adm1_pcode",
    "adm2_en",
    "adm2_pcode",
    "adm3_pcode",
    "brgy_total_area",
    "brgy_distance_to_coast",
    "brgy_is_coastal",
    "adm4_en",
    "geometry",
    "water_none_percentage",
    "typhoid_cases",
]
columns_to_plot = [column for column in muntinlupa.columns if column not in exc_columns]
columns_to_compare = [
    column for column in columns_to_plot if column != "typhoid_cases_per_population"
]

# One row per compared column, two maps per row. squeeze=False keeps `axs`
# two-dimensional even when there is a single row, so axs[i, j] always works
# (wrapping a 1-D axes array in a list does not support tuple indexing).
fig, axs = plt.subplots(
    len(columns_to_compare),
    2,
    figsize=(10, 5 * len(columns_to_compare)),
    squeeze=False,
)

for i, column in enumerate(columns_to_compare):
    typhoid_ax, feature_ax = axs[i, 0], axs[i, 1]

    # left map - typhoid cases as % of population (repeated on every row for
    # side-by-side comparison)
    muntinlupa.plot(
        column="typhoid_cases_per_population", cmap="viridis", ax=typhoid_ax, legend=True
    )
    typhoid_ax.set_title("typhoid_cases_%_population")
    typhoid_ax.set_axis_off()
    _label_barangays(typhoid_ax, muntinlupa)

    # right map - the variable being compared
    muntinlupa.plot(column=column, cmap="viridis", ax=feature_ax, legend=True)
    feature_ax.set_title(column)
    feature_ax.set_axis_off()
    _label_barangays(feature_ax, muntinlupa)

    # Pearson correlation between the two mapped variables, shown on the right
    # map. Series.corr skips rows where either value is missing, whereas
    # np.corrcoef returns nan as soon as one value is missing.
    correlation = muntinlupa["typhoid_cases_per_population"].corr(muntinlupa[column])
    feature_ax.text(
        0.5,
        0.95,
        f"Corr: {correlation:.2f}",
        ha="center",
        va="top",
        transform=feature_ax.transAxes,
        fontsize=12,
        bbox=dict(facecolor="white", alpha=0.8),
    )

plt.tight_layout()
plt.show()