In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import rasterio
import geowrangler.raster_zonal_stats as rzs
import geopandas as gpd
import pandas as pd

from pathlib import Path



In [2]:
# Root of the data folder, relative to this notebook.
DATA_DIR = Path("../../../data/")
# Folder holding the raw WorldPop rasters.
WP_DIR = DATA_DIR.joinpath("02-raw", "worldpop")

# WorldPop population-count raster
POP_COUNT = WP_DIR.joinpath("phl_ppp_2015.tif")

# WorldPop population-density rasters (plain and "UNadj" variants)
POP_D = WP_DIR.joinpath("phl_pd_2020_1km.tif")
POP_D_UNADJ = WP_DIR.joinpath("phl_pd_2020_1km_UNadj.tif")

# Shapefile with the admin boundaries of the target areas
ADMIN_BOUNDS = DATA_DIR.joinpath("01-admin-bounds", "target_admin_bounds.shp")

# Warehouse identifiers for the admin boundaries.
# NOTE(review): presumably a BigQuery project/table; not referenced by the
# cells shown here — confirm they are still needed.
ADMIN_BOUNDS_PROJECT_ID = "tm-geodata-warehouse"
ADMIN_BOUNDS_BQ_TABLE = "ph_admin_boundary_dec2017.level2_province"

### Load AOI

In [3]:
# Load the target admin-boundary polygons (the area of interest) from the shapefile.
aoi = gpd.read_file(ADMIN_BOUNDS)

In [4]:
# Open the population-count raster and grab band 1 plus its CRS and bounds
# as a quick sanity check of what we are about to aggregate.
# NOTE(review): src.read(1) loads the whole band into memory, yet this cell
# only uses `data` for its shape — confirm the array is needed later.
with rasterio.open(POP_COUNT) as src:
    data = src.read(1)
    data_crs = src.crs
    data_bounds = src.bounds

print(data.shape, data_crs, data_bounds)

(19781, 11613) EPSG:4326 BoundingBox(left=116.927916214, bottom=4.586250183000001, right=126.605416175, top=21.070416784)


In [5]:
# Number of AOI rows per region / province / city, as a baseline before any merging.
aoi.groupby(["ADM1_EN", "ADM2_EN", "ADM3_EN"])["ADM3_EN"].count()

ADM1_EN                  ADM2_EN               ADM3_EN            
National Capital Region  NCR, Fourth District  City of Muntinlupa       9
                         NCR, Second District  City of Mandaluyong     27
                         NCR, Third District   City of Navotas         14
Region I                 Pangasinan            Dagupan City            31
Region III               Nueva Ecija           Palayan City            19
Region IX                Zamboanga del Sur     Zamboanga City         101
Region V                 Albay                 Legazpi City            70
Region VI                Iloilo                Iloilo City            180
Region VII               Cebu                  Mandaue City            27
Region VIII              Leyte                 Tacloban City          138
Region X                 Misamis Oriental      Cagayan de Oro City     80
Region XI                Davao del Sur         Davao City             183
Name: ADM3_EN, dtype: int64

## Population Count

## Generate raster zonal stats from worldpop `TIF`

In [6]:
%%time
pop_count_stats = rzs.create_raster_zonal_stats(
    aoi,
    POP_COUNT,
    aggregation=dict(
        func=["sum", "count", "mean", "median", "std", "min", "max"],
        column="population",
        output=[
            "wp_total",
            "samples",
            "wp_mean",
            "wp_median",
            "wp_stdev",
            "wp_min",
            "wp_max",
        ],
    ),
    extra_args=dict(nodata=-99999),
)

CPU times: user 11.4 s, sys: 452 ms, total: 11.8 s
Wall time: 11.8 s


In [7]:
# Peek at the result: the AOI columns plus the new wp_* / samples aggregates.
pop_count_stats.head()

Unnamed: 0,ADM1_EN,ADM1_PCODE,ADM2_EN,ADM2_PCODE,ADM3_EN,ADM3_PCODE,ADM4_EN,ADM4_PCODE,geometry,wp_min,wp_max,wp_mean,samples,wp_total,wp_stdev,wp_median
0,Region I,PH010000000,Pangasinan,PH015500000,Dagupan City,PH015518000,Lomboy,PH015518016,"POLYGON ((120.32742 16.05423, 120.32719 16.053...",11.345417,133.47525,17.145344,54,925.848572,16.212323,14.305421
1,Region I,PH010000000,Pangasinan,PH015500000,Dagupan City,PH015518000,Tapuac,PH015518031,"POLYGON ((120.33380 16.03974, 120.33389 16.039...",47.127029,126.68557,78.146438,85,6642.447266,15.008567,76.24646
2,Region I,PH010000000,Pangasinan,PH015500000,Dagupan City,PH015518000,Pantal,PH015518022,"POLYGON ((120.34737 16.06009, 120.34761 16.060...",15.144128,718.085327,141.062089,190,26801.796875,174.346696,52.950523
3,Region I,PH010000000,Pangasinan,PH015500000,Dagupan City,PH015518000,Barangay I (T. Bugallon),PH015518024,"POLYGON ((120.34054 16.04489, 120.34054 16.044...",56.019218,121.355377,87.616705,21,1839.950806,16.819277,88.875938
4,Region III,PH030000000,Nueva Ecija,PH034900000,Palayan City,PH034919000,Imelda Valley,PH034919017,"POLYGON ((121.12250 15.58028, 121.12687 15.579...",0.600749,17.6947,4.684663,773,3621.244873,3.612297,3.392569


### EDA

In [8]:
# Uncomment to display plain decimals instead of scientific notation:
# pd.set_option('display.float_format', lambda x: '%.5f' % x)
# WorldPop total per city (sum of the per-polygon totals).
pop_count_stats.groupby("ADM3_EN")["wp_total"].sum()

ADM3_EN
Cagayan de Oro City    6.472304e+05
City of Mandaluyong    3.658821e+05
City of Muntinlupa     4.660632e+05
City of Navotas        2.821877e+05
Dagupan City           1.799721e+05
Davao City             1.537308e+06
Iloilo City            4.125135e+05
Legazpi City           1.871071e+05
Mandaue City           3.304018e+05
Palayan City           4.585515e+04
Tacloban City          1.924982e+05
Zamboanga City         7.988494e+05
Name: wp_total, dtype: float64

### Calculate MAPE (mean absolute percentage error) of WorldPop totals against PSA 2015 barangay populations

In [9]:
# Load the PSA 2015 barangay-level population table; PSA_POPULATION is the
# reference value the WorldPop totals are compared against.
psa_2015 = pd.read_excel(WP_DIR / "PSA_BarangayLevel_2015.xlsx")
psa_2015.head()

Unnamed: 0,REGION,CITY,BRGY_NAME,PSA_POPULATION
0,NCR,CITY OF MANDALUYONG\n,Addition Hills,99058
1,NCR,CITY OF MANDALUYONG\n,Bagong Silang,5572
2,NCR,CITY OF MANDALUYONG\n,Barangka Drive,13310
3,NCR,CITY OF MANDALUYONG\n,Barangka Ibaba,9540
4,NCR,CITY OF MANDALUYONG\n,Barangka Ilaya,17896


In [10]:
# Row count = number of PSA barangay records available for matching.
psa_2015.shape

(875, 4)

In [11]:
# CITY values carry a trailing newline (visible in the head() output above);
# strip it so the names can be matched against the admin bounds.
psa_2015["CITY"] = psa_2015["CITY"].str.rstrip("\n")

In [12]:
# Check that the name columns are strings (object) and PSA_POPULATION is numeric.
psa_2015.dtypes

REGION            object
CITY              object
BRGY_NAME         object
PSA_POPULATION     int64
dtype: object

In [13]:
# Lower-case the city / barangay name columns of both tables so the
# name-based merge is case-insensitive. Note that `aoi` is modified in place.
for _frame, _name_cols in (
    (psa_2015, ["CITY", "BRGY_NAME"]),
    (aoi, ["ADM3_EN", "ADM4_EN"]),
):
    for _col in _name_cols:
        _frame[_col] = _frame[_col].str.lower()

In [14]:
# First attempt: match PSA rows to admin-boundary rows purely on the
# lower-cased city and barangay names. validate="one_to_one" makes pandas
# raise if a (city, barangay) pair is duplicated on either side.
psa_joined = psa_2015.merge(
    aoi,
    how="inner",
    left_on=["CITY", "BRGY_NAME"],
    right_on=["ADM3_EN", "ADM4_EN"],
    validate="one_to_one",
)

In [15]:
# Spot-check that PSA rows picked up the matching admin-boundary attributes.
psa_joined.head(2)

Unnamed: 0,REGION,CITY,BRGY_NAME,PSA_POPULATION,ADM1_EN,ADM1_PCODE,ADM2_EN,ADM2_PCODE,ADM3_EN,ADM3_PCODE,ADM4_EN,ADM4_PCODE,geometry
0,NCR,city of mandaluyong,addition hills,99058,National Capital Region,PH130000000,"NCR, Second District",PH137400000,city of mandaluyong,PH137401000,addition hills,PH137401001,"POLYGON ((121.04640 14.58746, 121.04636 14.587..."
1,NCR,city of mandaluyong,bagong silang,5572,National Capital Region,PH130000000,"NCR, Second District",PH137400000,city of mandaluyong,PH137401000,bagong silang,PH137401002,"POLYGON ((121.03209 14.59330, 121.03209 14.593..."


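The inner merge keeps 866 barangays: we lose 9 of the 875 PSA barangays, and 13 of the 879 admin-boundary barangays have no PSA match. I am guessing their names cannot be matched between the two sources. I will let this go for now.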

In [16]:
# Row counts before and after the name-based merge.
print("Num barangays from our Admin bounds: ", len(aoi))
print("Num barangays from PSA: ", len(psa_2015))
print("Num barangays after merge: ", len(psa_joined))

Num barangays from our Admin bounds:  879
Num barangays from PSA:  875
Num barangays after merge:  866


In [17]:
# Matched rows per city — compare with the pre-merge AOI counts to see which cities lost barangays.
psa_joined.groupby(["ADM1_EN", "ADM2_EN", "ADM3_EN"])["ADM3_EN"].count()

ADM1_EN                  ADM2_EN               ADM3_EN            
National Capital Region  NCR, Fourth District  city of muntinlupa       9
                         NCR, Second District  city of mandaluyong     27
                         NCR, Third District   city of navotas         14
Region I                 Pangasinan            dagupan city            31
Region III               Nueva Ecija           palayan city            19
Region IX                Zamboanga del Sur     zamboanga city          98
Region V                 Albay                 legazpi city            65
Region VI                Iloilo                iloilo city            180
Region VII               Cebu                  mandaue city            27
Region VIII              Leyte                 tacloban city          136
Region X                 Misamis Oriental      cagayan de oro city     78
Region XI                Davao del Sur         davao city             182
Name: ADM3_EN, dtype: int64

In [18]:
# Drop the PSA-side REGION / CITY / BRGY_NAME columns (columns, not rows):
# they are no longer needed once the name-based merge is done.
# errors="ignore" keeps this cell re-runnable; without it a second run
# raises KeyError because the columns are already gone.
psa_joined = psa_joined.drop(
    columns=["REGION", "CITY", "BRGY_NAME"], errors="ignore"
)
# Inner-join the WorldPop zonal stats with the PSA-matched barangays on the
# barangay PCODE; overlapping columns from the PSA side get a `_psa` suffix.
worldpop_psa_joined = pop_count_stats.set_index("ADM4_PCODE").join(
    psa_joined.set_index("ADM4_PCODE"), rsuffix="_psa", how="inner"
)

In [19]:
# Row count after joining the WorldPop stats with the PSA-matched barangays.
worldpop_psa_joined.shape

(866, 24)

In [20]:
# List the columns to see which ones the join duplicated with the `_psa` suffix.
worldpop_psa_joined.columns

Index(['ADM1_EN', 'ADM1_PCODE', 'ADM2_EN', 'ADM2_PCODE', 'ADM3_EN',
       'ADM3_PCODE', 'ADM4_EN', 'geometry', 'wp_min', 'wp_max', 'wp_mean',
       'samples', 'wp_total', 'wp_stdev', 'wp_median', 'PSA_POPULATION',
       'ADM1_EN_psa', 'ADM1_PCODE_psa', 'ADM2_EN_psa', 'ADM2_PCODE_psa',
       'ADM3_EN_psa', 'ADM3_PCODE_psa', 'ADM4_EN_psa', 'geometry_psa'],
      dtype='object')

In [21]:
# Keep the admin identifiers, the WorldPop aggregates and the PSA count;
# the `_psa` duplicates, the geometry and `samples` are left out.
admin_cols = [
    "ADM1_EN",
    "ADM1_PCODE",
    "ADM2_EN",
    "ADM2_PCODE",
    "ADM3_EN",
    "ADM3_PCODE",
    "ADM4_EN",
]
wp_cols = ["wp_total", "wp_min", "wp_max", "wp_mean", "wp_stdev", "wp_median"]
keep_cols = admin_cols + wp_cols + ["PSA_POPULATION"]

worldpop_psa_joined = worldpop_psa_joined[keep_cols]
worldpop_psa_joined.head(2)

Unnamed: 0_level_0,ADM1_EN,ADM1_PCODE,ADM2_EN,ADM2_PCODE,ADM3_EN,ADM3_PCODE,ADM4_EN,wp_total,wp_min,wp_max,wp_mean,wp_stdev,wp_median,PSA_POPULATION
ADM4_PCODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
PH015518016,Region I,PH010000000,Pangasinan,PH015500000,Dagupan City,PH015518000,Lomboy,925.848572,11.345417,133.47525,17.145344,16.212323,14.305421,1337
PH015518031,Region I,PH010000000,Pangasinan,PH015500000,Dagupan City,PH015518000,Tapuac,6642.447266,47.127029,126.68557,78.146438,15.008567,76.24646,4154


In [22]:
# Keep only barangays that have both a WorldPop total and a PSA population,
# so every row used for the error metrics is a complete pair.
worldpop_psa_joined = worldpop_psa_joined.dropna(subset=["wp_total", "PSA_POPULATION"])

In [23]:
# City names as stored in the joined table (original casing, taken from pop_count_stats).
worldpop_psa_joined["ADM3_EN"].unique()

array(['Dagupan City', 'Palayan City', 'Legazpi City', 'Iloilo City',
       'Mandaue City', 'Tacloban City', 'Zamboanga City',
       'Cagayan de Oro City', 'Davao City', 'City of Mandaluyong',
       'City of Navotas', 'City of Muntinlupa'], dtype=object)

In [24]:
# One frame per city, selected by its ADM3_EN name (original casing).
city_of = worldpop_psa_joined["ADM3_EN"]

muntinlupa = worldpop_psa_joined.loc[city_of == "City of Muntinlupa"]
mandaluyong = worldpop_psa_joined.loc[city_of == "City of Mandaluyong"]
navotas = worldpop_psa_joined.loc[city_of == "City of Navotas"]
dagupan = worldpop_psa_joined.loc[city_of == "Dagupan City"]
palayan = worldpop_psa_joined.loc[city_of == "Palayan City"]
zamboanga = worldpop_psa_joined.loc[city_of == "Zamboanga City"]
legazpi = worldpop_psa_joined.loc[city_of == "Legazpi City"]
iloilo = worldpop_psa_joined.loc[city_of == "Iloilo City"]
mandaue = worldpop_psa_joined.loc[city_of == "Mandaue City"]
tacloban = worldpop_psa_joined.loc[city_of == "Tacloban City"]
cdo = worldpop_psa_joined.loc[city_of == "Cagayan de Oro City"]
davao = worldpop_psa_joined.loc[city_of == "Davao City"]

In [25]:
# Sanity check: no missing WorldPop totals should remain for Tacloban after the dropna.
tacloban["wp_total"].isna().value_counts()

False    135
Name: wp_total, dtype: int64

In [26]:
from sklearn.metrics import mean_absolute_percentage_error


def get_mape(df, city_name):
    """Print and return the MAPE of WorldPop totals against PSA populations.

    Args:
        df: frame with a ``PSA_POPULATION`` column (treated as the true
            values) and a ``wp_total`` column (treated as the predictions).
        city_name: label used in the printed line.

    Returns:
        The value returned by scikit-learn's
        ``mean_absolute_percentage_error`` for the two columns.
    """
    mape = mean_absolute_percentage_error(
        y_true=df["PSA_POPULATION"].tolist(),
        y_pred=df["wp_total"].tolist(),
    )
    print(f"{city_name} MAPE: {mape}")
    return mape


def get_statistics(df):
    """Summarise the PSA and WorldPop population columns of ``df``.

    Args:
        df: frame with ``PSA_POPULATION`` and ``wp_total`` columns.

    Returns:
        A 9-tuple, in this order: PSA sum, mean, median, std; WorldPop sum
        (rounded to 3 decimals), mean, median, std; and the PSA sum minus
        the rounded WorldPop sum.
    """
    psa = df["PSA_POPULATION"]
    worldpop = df["wp_total"]

    psa_total = psa.sum()
    # Only the WorldPop sum is rounded; the difference uses the rounded value.
    worldpop_total = worldpop.sum().round(3)

    return (
        psa_total,
        psa.mean(),
        psa.median(),
        psa.std(),
        worldpop_total,
        worldpop.mean(),
        worldpop.median(),
        worldpop.std(),
        psa_total - worldpop_total,
    )

In [27]:
# Build the per-city frames and their names from the SAME groupby, so a
# frame can never be paired with the wrong city name. sort=False keeps the
# cities in order of first appearance, which is the order .unique() gives.
city_names = []
city_dfs = []
for city_name, city_df in worldpop_psa_joined.groupby("ADM3_EN", sort=False):
    city_names.append(city_name)
    city_dfs.append(city_df)

# Order of the values in the tuple returned by get_statistics().
stat_fields = (
    "psa_sum",
    "psa_mean",
    "psa_median",
    "psa_std",
    "wp_sum",
    "wp_mean",
    "wp_median",
    "wp_std",
    "diff_psa_wp",
)

mape_list = []
# Key order here determines the column order of the frame built from this dict.
stats_lists = {
    "psa_sum": [],
    "wp_sum": [],
    "psa_mean": [],
    "wp_mean": [],
    "psa_median": [],
    "wp_median": [],
    "psa_std": [],
    "wp_std": [],
    "diff_psa_wp": [],
}

for city_df, city_name in zip(city_dfs, city_names):
    # get_mape also prints one "<city> MAPE: <value>" line per city.
    mape_list.append(get_mape(city_df, city_name))

    for field, value in zip(stat_fields, get_statistics(city_df)):
        stats_lists[field].append(value)

Dagupan City MAPE: 0.2957186269200407
Palayan City MAPE: 0.7362131209452295
Legazpi City MAPE: 0.4218491712325348
Iloilo City MAPE: 0.5974146840110849
Mandaue City MAPE: 0.3238289669358913
Tacloban City MAPE: 0.7456983749780215
Zamboanga City MAPE: 0.40088531469112587
Cagayan de Oro City MAPE: 0.5696207826117766
Davao City MAPE: 0.7550190477275363
City of Mandaluyong MAPE: 0.40133357435664724
City of Navotas MAPE: 0.9126093047294116
City of Muntinlupa MAPE: 0.19293643592464418


In [28]:
# Assemble the per-city summary: MAPE side by side with the PSA / WorldPop statistics.
mape_df = pd.DataFrame({"city": city_names, "MAPE": mape_list})
stats_df = pd.DataFrame(stats_lists)

summary_df = pd.concat(objs=[mape_df, stats_df], axis="columns")

In [29]:
# Rank the cities from highest to lowest MAPE.
summary_df.sort_values(by="MAPE", ascending=False)

Unnamed: 0,city,MAPE,psa_sum,wp_sum,psa_mean,wp_mean,psa_median,wp_median,psa_std,wp_std,diff_psa_wp
10,City of Navotas,0.912609,249463,282187.684,17818.785714,20156.263149,11384.5,11686.368164,18716.770519,19571.864279,-32724.684
8,Davao City,0.755019,1632991,1523552.07,8972.478022,8371.165222,4313.0,4198.031738,12542.45547,13703.287333,109438.93
5,Tacloban City,0.745698,232353,189607.296,1721.133333,1404.498489,1019.0,583.906311,1897.60512,2573.958413,42745.704
1,Palayan City,0.736213,41041,45855.151,2160.052632,2413.429021,1985.0,1975.401123,1217.346042,1791.052753,-4814.151
3,Iloilo City,0.597415,447992,412513.546,2488.844444,2291.741924,1885.5,1793.265808,2393.136837,1915.560502,35478.454
7,Cagayan de Oro City,0.569621,639033,614081.453,8192.730769,7872.839136,1861.0,2202.691772,13029.279747,13982.705758,24951.547
2,Legazpi City,0.421849,180174,177012.152,2771.907692,2723.263881,1820.0,1565.422241,2174.794758,3747.479421,3161.848
9,City of Mandaluyong,0.401334,386276,365882.139,14306.518519,13551.19033,7628.0,9792.787109,18774.093405,13222.462186,20393.861
6,Zamboanga City,0.400885,861799,786450.183,8793.867347,8025.001868,5328.5,4922.84668,8387.012016,9156.097386,75348.817
4,Mandaue City,0.323829,362654,330401.847,13431.62963,12237.10546,13685.0,11137.533203,6221.881605,7465.358744,32252.153


In [None]:
# PSA population vs WorldPop total for Tacloban, with a fitted regression line.
tacloban_ax = sns.regplot(x="PSA_POPULATION", y="wp_total", data=tacloban)
tacloban_ax.set(title="Tacloban City")

In [None]:
# PSA population vs WorldPop total for Navotas, with a fitted regression line.
navotas_ax = sns.regplot(x="PSA_POPULATION", y="wp_total", data=navotas)
navotas_ax.set(title="City of Navotas")

In [None]:
# PSA population vs WorldPop total for Cagayan de Oro, with a fitted regression line.
cdo_ax = sns.regplot(x="PSA_POPULATION", y="wp_total", data=cdo)
cdo_ax.set(title="Cagayan de Oro City")

In [None]:
# PSA population vs WorldPop total for Davao, with a fitted regression line.
davao_ax = sns.regplot(x="PSA_POPULATION", y="wp_total", data=davao)
davao_ax.set(title="Davao City")

In [None]:
# PSA population vs WorldPop total for Mandaue, with a fitted regression line.
mandaue_ax = sns.regplot(x="PSA_POPULATION", y="wp_total", data=mandaue)
mandaue_ax.set(title="Mandaue City")

In [None]:
# PSA population vs WorldPop total for Muntinlupa, with a fitted regression line.
# The title uses the city's name as it appears in ADM3_EN ("City of
# Muntinlupa") so the figure label matches the tables.
sns.regplot(data=muntinlupa, x="PSA_POPULATION", y="wp_total").set(
    title="City of Muntinlupa"
)