In [14]:
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import pandas as pd

from pathlib import Path

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# WorldPop Population Count EDA

Compare WorldPop Population dataset sourced from [WorldPop](https://hub.worldpop.org/project/categories?id=3) against PSA census at a city-level. Perform this validation on two years to identify consistency of errors: 2020 and 2015.

### Input

- Extracted WorldPop Population Count 2020 and 2015 at lacuna cities
- PSA census 2020 and 2015

### Output

- Summary error metrics
- Visualizations (histograms and chloropleths over a city)

### Set-up directories and parameters

In [15]:
# data files
DATA_DIR = Path("../../../data/")
WP_DIR = DATA_DIR / "02-raw" / "worldpop"
OUTPUT_DIR = DATA_DIR / "04-output"

# population count
WP_COUNTS_OUTPUTS_DIR = OUTPUT_DIR / "worldpop" / "population_count"
POP_COUNT_2020 = WP_COUNTS_OUTPUTS_DIR / "worldpop-popcount-2020.csv"
POP_COUNT_2015 = WP_COUNTS_OUTPUTS_DIR / "worldpop-popcount-2015.csv"

# PSA population counts
PSA_2020 = WP_DIR / "PSA_BarangayLevel_2020.xlsx"
PSA_2015 = WP_DIR / "PSA_BarangayLevel_2015.xlsx"

# administrative bounds
ADMIN_BOUNDS = DATA_DIR / "01-admin-bounds" / "renamed_target_admin_bounds.gpkg"

### Load AOI

In [16]:
aoi = gpd.read_file(ADMIN_BOUNDS, driver="GPKG")

### Load WorldPop Population Counts

In [17]:
# worldpop 2020
wp_popcount_2020_df = pd.read_csv(POP_COUNT_2020)
wp_popcount_2020_df = wp_popcount_2020_df.iloc[:, 7::]

wp_popcount_2020_gdf = aoi.merge(
    wp_popcount_2020_df, on="barangay_psgc_code", how="left"
)

In [18]:
# worldpop 2015
wp_popcount_2015_df = pd.read_csv(POP_COUNT_2015)
wp_popcount_2015_df = wp_popcount_2015_df.iloc[:, 7::]

wp_popcount_2015_gdf = aoi.merge(
    wp_popcount_2015_df, on="barangay_psgc_code", how="left"
)

### Load PSA 2020

In [19]:
# prep per barangay population from PSA
psa_2020 = pd.read_excel(WP_DIR / "PSA_BarangayLevel_2020.xlsx")
psa_2020.head()

Unnamed: 0,REGION,CITY,BRGY_NAME,PSA_POPULATION
0,NCR,CITY OF MANDALUYONG\n,Addition Hills,108896
1,NCR,CITY OF MANDALUYONG\n,Bagong Silang,4939
2,NCR,CITY OF MANDALUYONG\n,Barangka Drive,15474
3,NCR,CITY OF MANDALUYONG\n,Barangka Ibaba,9040
4,NCR,CITY OF MANDALUYONG\n,Barangka Ilaya,22334


In [20]:
psa_2015 = pd.read_excel(WP_DIR / "PSA_BarangayLevel_2015.xlsx")
psa_2015.head()

Unnamed: 0,REGION,CITY,BRGY_NAME,PSA_POPULATION
0,NCR,CITY OF MANDALUYONG\n,Addition Hills,99058
1,NCR,CITY OF MANDALUYONG\n,Bagong Silang,5572
2,NCR,CITY OF MANDALUYONG\n,Barangka Drive,13310
3,NCR,CITY OF MANDALUYONG\n,Barangka Ibaba,9540
4,NCR,CITY OF MANDALUYONG\n,Barangka Ilaya,17896


### Utils

In [21]:
def get_error_metrics(df, city_name):
    """
    Calculate WorldPop errors against PSA census.
    Errors to calculate: MAPE, MAE, RMSE
    """
    y_true = df["PSA_POPULATION"].tolist()
    y_pred = df["wp_total"].tolist()

    mape = mean_absolute_percentage_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)

    return mape, mae, rmse


def get_summary_statistics(df):
    """
    Summarize statistics and calculated errors in a single dataframe.
    """
    psa_sum = df["PSA_POPULATION"].sum()
    psa_mean = df["PSA_POPULATION"].mean()
    psa_median = df["PSA_POPULATION"].median()
    psa_std = df["PSA_POPULATION"].std()

    wp_sum = df["wp_total"].sum().round(3)
    wp_mean = df["wp_total"].mean()
    wp_median = df["wp_total"].median()
    wp_std = df["wp_total"].std()

    diff_psa_wp = wp_sum - psa_sum

    return (
        psa_sum,
        psa_mean,
        psa_median,
        psa_std,
        wp_sum,
        wp_mean,
        wp_median,
        wp_std,
        diff_psa_wp,
    )

In [22]:
def join_psa_adminbounds(psa_df):
    psa_df["CITY"] = psa_df["CITY"].str.rstrip("\n")
    psa_df["CITY"] = psa_df["CITY"].str.lower()
    psa_df["BRGY_NAME"] = psa_df["BRGY_NAME"].str.lower()
    aoi["city_name"] = aoi["city_name"].str.lower()
    aoi["barangay_name"] = aoi["barangay_name"].str.lower()

    # try joining directly first
    # join with admin bounds
    psa_admin_joined = pd.merge(
        psa_df,
        aoi,
        left_on=["CITY", "BRGY_NAME"],
        right_on=["city_name", "barangay_name"],
        validate="one_to_one",
    )

    print("Num barangays from our Admin bounds: ", aoi.shape[0])
    print("Num barangays from PSA: ", psa_df.shape[0])
    print("Num barangays after merge: ", psa_admin_joined.shape[0])

    # drop some rows
    psa_admin_joined = psa_admin_joined.drop(columns=["REGION", "CITY", "BRGY_NAME"])

    return psa_admin_joined


def join_worldpop_psa(wp_popcount_gdf, psa_admin_joined):  # wp_popcount_2020_gdf
    # join with worldpop
    worldpop_psa_joined = wp_popcount_gdf.set_index("barangay_psgc_code").join(
        psa_admin_joined.set_index("barangay_psgc_code"), rsuffix="_psa", how="inner"
    )

    keep_cols = [
        "region_name",
        "region_code",
        "province_name",
        "province_code",
        "city_name",
        "city_code",
        "barangay_name",
        "geometry",
        "wp_total",
        "wp_min",
        "wp_max",
        "wp_mean",
        "wp_stdev",
        "wp_median",
        "PSA_POPULATION",
    ]
    worldpop_psa_joined = worldpop_psa_joined[keep_cols]

    # drop rows with NA population
    worldpop_psa_joined = worldpop_psa_joined.dropna(
        subset=["wp_total", "PSA_POPULATION"]
    )
    # Add differences
    worldpop_psa_joined["difference"] = (
        worldpop_psa_joined["wp_total"] - worldpop_psa_joined["PSA_POPULATION"]
    )

    return worldpop_psa_joined


def split_to_city_dfs(wp_psa_joined):
    # Using groupby and a custom aggfunc to condense the many individual city filter and MAPE computation lines for 2015 and 2020
    wp_psa_joined_group = wp_psa_joined.groupby(["city_name"])
    muntinlupa = wp_psa_joined[wp_psa_joined["city_name"] == "City of Muntinlupa"]
    mandaluyong = wp_psa_joined[wp_psa_joined["city_name"] == "City of Mandaluyong"]
    navotas = wp_psa_joined[wp_psa_joined["city_name"] == "City of Navotas"]
    dagupan = wp_psa_joined[wp_psa_joined["city_name"] == "Dagupan City"]
    palayan = wp_psa_joined[wp_psa_joined["city_name"] == "Palayan City"]
    zamboanga = wp_psa_joined[wp_psa_joined["city_name"] == "Zamboanga City"]
    legazpi = wp_psa_joined[wp_psa_joined["city_name"] == "Legazpi City"]
    iloilo = wp_psa_joined[wp_psa_joined["city_name"] == "Iloilo City"]
    mandaue = wp_psa_joined[wp_psa_joined["city_name"] == "Mandaue City"]
    tacloban = wp_psa_joined[wp_psa_joined["city_name"] == "Tacloban City"]
    cdo = wp_psa_joined[wp_psa_joined["city_name"] == "Cagayan de Oro City"]
    davao = wp_psa_joined[wp_psa_joined["city_name"] == "Davao City"]

    city_dfs = [
        dagupan,
        palayan,
        legazpi,
        iloilo,
        mandaue,
        tacloban,
        zamboanga,
        cdo,
        davao,
        mandaluyong,
        navotas,
        muntinlupa,
    ]
    return city_dfs

In [23]:
def validate_worldpop(psa_df, wp_df):
    """
    psa_df: PSA barangay census
    wp_df:  WorldPop 2020 or 2015 gdf

    return summary statistics
    """
    psa_admin_joined = join_psa_adminbounds(psa_df)
    wp_psa_joined = join_worldpop_psa(wp_df, psa_admin_joined)
    city_names = wp_psa_joined["city_name"].unique().tolist()
    grouped = wp_psa_joined.groupby("city_name")

    mape_list = []
    mae_list = []
    rmse_list = []
    stats_lists = {
        "psa_sum": [],
        "wp_sum": [],
        "psa_mean": [],
        "wp_mean": [],
        "psa_median": [],
        "wp_median": [],
        "psa_std": [],
        "wp_std": [],
        "diff_psa_wp": [],
    }

    for city_name, group_df in grouped:
        mape, mae, rmse = get_error_metrics(group_df, city_name)
        mape_list.append(mape)
        mae_list.append(mae)
        rmse_list.append(rmse)

        (
            psa_sum,
            psa_mean,
            psa_median,
            psa_std,
            wp_sum,
            wp_mean,
            wp_median,
            wp_std,
            diff_psa_wp,
        ) = get_summary_statistics(group_df)
        stats_lists["psa_sum"].append(psa_sum)
        stats_lists["psa_mean"].append(psa_mean)
        stats_lists["psa_median"].append(psa_median)
        stats_lists["psa_std"].append(psa_std)
        stats_lists["wp_sum"].append(wp_sum)
        stats_lists["wp_mean"].append(wp_mean)
        stats_lists["wp_median"].append(wp_median)
        stats_lists["wp_std"].append(wp_std)
        stats_lists["diff_psa_wp"].append(diff_psa_wp)
    mape_df = pd.DataFrame(
        {"city": city_names, "MAPE": mape_list, "MAE": mae_list, "RMSE": rmse_list}
    )
    stats_df = pd.DataFrame(stats_lists)

    summary_df = pd.concat([mape_df, stats_df], axis=1)

    return summary_df

In [24]:
def plot_choropleth(city_df, fig_title):
    fig, axs = plt.subplots(ncols=3, figsize=(10, 4))
    psa_min = city_df["PSA_POPULATION"].min()
    psa_max = city_df["PSA_POPULATION"].max()
    # plot map
    city_df.plot(
        column="PSA_POPULATION",
        legend=True,
        legend_kwds={"shrink": 0.3},
        ax=axs[0],
        cmap="YlOrBr",
        vmin=psa_min,
        vmax=psa_max,
    )
    axs[0].set_title("PSA")
    # plot histogram
    city_df.plot(
        column="wp_total",
        legend=True,
        legend_kwds={"shrink": 0.3},
        ax=axs[1],
        cmap="YlOrBr",
        vmin=psa_min,
        vmax=psa_max,
    )
    axs[1].set_title("WorldPop")
    city_df.plot(
        column="difference",
        legend=True,
        legend_kwds={"shrink": 0.3},
        ax=axs[2],
        cmap="coolwarm",
    )
    axs[2].set_title("Difference")

    # adjust layout
    fig.suptitle(fig_title)
    fig.tight_layout()


def plot_histograms(city_df, fig_title):
    fig, ax = plt.subplots()
    sns.histplot(
        data=city_df[["PSA_POPULATION", "wp_total"]],
        bins=20,
        ax=ax,
        kde=True,
        multiple="layer",
        legend=True,
    )
    fig.suptitle(fig_title)
    fig.tight_layout()


def plot_all_histograms(psa_df, wp_df):
    psa_admin_joined = join_psa_adminbounds(psa_df)
    wp_psa_joined = join_worldpop_psa(wp_df, psa_admin_joined)
    grouped = wp_psa_joined.groupby("city_name")

    for city_name, city_df in grouped:
        plot_choropleth(city_df, city_name)
        plot_histograms(city_df, f"Histograms for {city_name}")

# Validate WorldPop against PSA

In [25]:
wp_2020_validation_df = validate_worldpop(psa_df=psa_2020, wp_df=wp_popcount_2020_gdf)
wp_2020_validation_df.sort_values(by="MAPE", ascending=False)

Num barangays from our Admin bounds:  879
Num barangays from PSA:  875
Num barangays after merge:  873


Unnamed: 0,city,MAPE,MAE,RMSE,psa_sum,wp_sum,psa_mean,wp_mean,psa_median,wp_median,psa_std,wp_std,diff_psa_wp
10,City of Navotas,1.131664,1331.172382,2590.172714,251773,200382.196,1837.759124,1462.643765,691.0,614.279907,2442.219499,2629.172119,-51390.804
3,Iloilo City,1.070613,5347.190142,10661.49012,247543,287225.755,17681.642857,20516.125357,11755.5,11750.602539,19163.619602,20220.387015,39682.755
0,Dagupan City,0.945305,2689.315466,5169.926032,728402,752064.576,9105.025,9400.807197,1671.0,2586.737549,14257.779838,16115.445305,23662.576
9,City of Mandaluyong,0.865184,1708.859487,2110.875414,45383,55563.117,2388.578947,2924.374586,2131.0,2495.634766,1515.46591,2185.424365,10180.117
5,Tacloban City,0.826713,3900.631677,7445.312331,1776949,1690857.475,9763.456044,9290.425686,4942.0,4341.62915,13079.373556,15081.445228,-86091.525
6,Zamboanga City,0.633823,971.791997,1492.93455,457626,424837.209,2542.366667,2360.206719,1809.5,1818.104126,2559.883368,1978.110169,-32788.791
1,Palayan City,0.503849,6594.430103,10571.613933,425758,411208.226,15768.814815,15229.934299,8483.0,11407.729492,21131.004336,15075.351435,-14549.774
4,Mandaue City,0.413593,1914.904937,3143.799024,174302,208321.78,5622.645161,6720.057418,4079.0,4014.707031,5491.705992,6988.616862,34019.78
7,Cagayan de Oro City,0.410233,1253.644346,2316.069479,205933,195228.389,3028.426471,2871.005716,2211.5,1694.452393,2585.676604,3902.3737,-10704.611
8,Davao City,0.394946,4557.821958,5648.387312,364116,356164.897,13485.777778,13191.29248,13832.0,12413.217773,6660.141615,7896.955072,-7951.103


In [26]:
wp_2015_validation_df = validate_worldpop(psa_df=psa_2015, wp_df=wp_popcount_2015_gdf)
wp_2015_validation_df.sort_values(by="MAPE", ascending=False)

Num barangays from our Admin bounds:  879
Num barangays from PSA:  875
Num barangays after merge:  866


Unnamed: 0,city,MAPE,MAE,RMSE,psa_sum,wp_sum,psa_mean,wp_mean,psa_median,wp_median,psa_std,wp_std,diff_psa_wp
3,Iloilo City,0.912609,5252.254081,9444.880744,249463,282187.684,17818.785714,20156.263149,11384.5,11686.368164,18716.770519,19571.864279,32724.684
5,Tacloban City,0.755019,3494.385035,6411.637134,1632991,1523552.07,8972.478022,8371.165222,4313.0,4198.031738,12542.45547,13703.287333,-109438.93
10,City of Navotas,0.745698,1106.218773,2030.109327,232353,189607.296,1721.133333,1404.498489,1019.0,583.906311,1897.60512,2573.958413,-42745.704
9,City of Mandaluyong,0.736213,1408.614655,1706.63667,41041,45855.151,2160.052632,2413.429021,1985.0,1975.401123,1217.346042,1791.052753,4814.151
6,Zamboanga City,0.597415,991.599633,1493.79212,447992,412513.546,2488.844444,2291.741924,1885.5,1793.265808,2393.136837,1915.560502,-35478.454
0,Dagupan City,0.569621,2254.029181,4198.676533,639033,614081.453,8192.730769,7872.839136,1861.0,2202.691772,13029.279747,13982.705758,-24951.547
7,Cagayan de Oro City,0.421849,1196.704518,2391.884133,180174,177012.152,2771.907692,2723.263881,1820.0,1565.422241,2174.794758,3747.479421,-3161.848
1,Palayan City,0.401334,5064.62821,8995.882128,386276,365882.139,14306.518519,13551.19033,7628.0,9792.787109,18774.093405,13222.462186,-20393.861
11,City of Muntinlupa,0.400885,2729.922591,4007.033355,861799,786450.183,8793.867347,8025.001868,5328.5,4922.84668,8387.012016,9156.097386,-75348.817
8,Davao City,0.323829,3900.44373,4709.761546,362654,330401.847,13431.62963,12237.10546,13685.0,11137.533203,6221.881605,7465.358744,-32252.153


## Visualizations

For sanity checking

In [None]:
plot_all_histograms(psa_df=psa_2020, wp_df=wp_popcount_2020_gdf)

In [None]:
plot_all_histograms(psa_df=psa_2015, wp_df=wp_popcount_2015_gdf)