In [1]:
# default_exp vector_zonal_stats

In [2]:
# hide
# no_test
! [ -e /content ] && pip install -Uqq git+https://github.com/thinkingmachines/geowrangler.git

In [3]:
# hide
# no_test
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [4]:
# hide
# no_test
# conditionally load the lab_black formatter extension (skipped on Google Colab)
import sys

if "google.colab" not in sys.modules:
    from IPython import get_ipython

    ipython = get_ipython()
    # get_ipython() returns None when not running inside an IPython session
    if ipython is not None:
        # run_line_magic is the supported replacement for the deprecated magic()
        ipython.run_line_magic("reload_ext", "lab_black")

# Vector Zonal Stats

> Generate vector zonal stats features

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/thinkingmachines/geowrangler/blob/master/notebooks/02_vector_zonal_stats.ipynb)

In [5]:
# exporti
import geopandas as gpd
import pandas as pd

  shapely_geos_version, geos_capi_version_string


In [32]:
# exports
def fix_agg(agg: dict) -> dict:  # A dict containing at the minimum a 'func' key
    """
    Validate and (possibly) fix an aggregation specification.

    The input dict is left untouched; a new dict is returned containing the following keys:
    'func': a list of aggregation functions (should be a valid 'agg' function)
    'column': a column to apply the aggregation functions (should be a valid numeric column in data)
    'output': the names of the new columns containing the application of the aggregation functions (default: concat column + '_' + func)
    'fillna': boolean list whether to replace new columns with 'NA' values  with 0 (default: True)

    A single string is accepted for 'func' and 'output', and a single bool for
    'fillna' (applied to every func); each is expanded to a list.

    Raises ValueError if 'func' is missing or if the 'output' / 'fillna' lists
    do not have the same length as 'func'.
    """
    if "func" not in agg:
        raise ValueError(f"Missing key 'func' for agg {agg}")

    # work on a shallow copy so the caller's specification is never mutated
    agg = dict(agg)

    if isinstance(agg["func"], str):
        agg["func"] = [agg["func"]]

    # optional column, default to index count
    if "column" not in agg:
        agg["column"] = "aoi_index"

    # check matching output
    if "output" not in agg:
        agg["output"] = [f'{agg["column"]}_{f}' for f in agg["func"]]
    if isinstance(agg["output"], str):
        agg["output"] = [agg["output"]]
    if len(agg["output"]) != len(agg["func"]):
        raise ValueError(
            f"output list {agg['output']} doesn't match func list {agg['func']}"
        )

    # check matching fillna
    if "fillna" not in agg:
        agg["fillna"] = [True] * len(agg["func"])
    elif isinstance(agg["fillna"], bool):
        # a single flag applies to every aggregation function
        agg["fillna"] = [agg["fillna"]] * len(agg["func"])

    if len(agg["fillna"]) != len(agg["func"]):
        raise ValueError(
            f"fillna list {agg['fillna']} doesn't match func list {agg['func']}"
        )

    return agg

In [7]:
# default output names are '<column>_<func>' and every func defaults to fillna=True
assert {
    "func": ["sum", "max", "min", "mean"],
    "column": "population",
    "output": ["population_sum", "population_max", "population_min", "population_mean"],
    "fillna": [True, True, True, True],
} == fix_agg({"func": ["sum", "max", "min", "mean"], "column": "population"})

In [8]:
# a lone string func is wrapped in a list and the column defaults to 'aoi_index'
assert {
    "func": ["count"],
    "column": "aoi_index",
    "output": ["aoi_index_count"],
    "fillna": [True],
} == fix_agg({"func": "count"})

In [9]:
# a fully specified aggregation comes back unchanged
assert {
    "func": ["sum", "max", "min", "mean", "std"],
    "column": "population",
    "output": ["pop_sum", "pop_max", "pop_min", "avg_pop", "pop_std_dev"],
    "fillna": [True, True, True, True, False],
} == fix_agg(
    {
        "func": ["sum", "max", "min", "mean", "std"],
        "column": "population",
        "output": ["pop_sum", "pop_max", "pop_min", "avg_pop", "pop_std_dev"],
        "fillna": [True, True, True, True, False],
    }
)

In [10]:
# a single string 'output' is normalized to a one-element list
assert fix_agg({"func": "count", "output": "attractions"}) == {
    "func": ["count"],
    "column": "aoi_index",
    "output": ["attractions"],
    "fillna": [True],
}

In [11]:
# an aggregation spec without a 'func' key must be rejected with a ValueError
threw_exception = False
try:
    fix_agg({})
except ValueError as err:
    threw_exception = True
    assert err.args[0] == "Missing key 'func' for agg {}"
assert threw_exception

In [12]:
# 'output' and 'func' lists of different lengths must be rejected with a ValueError
threw_exception = False
try:
    fix_agg({"func": ["mean", "sum"], "column": "population", "output": ["pop_mean"]})
except ValueError as err:
    threw_exception = True
    assert err.args[0] == (
        "output list ['pop_mean'] doesn't match func list ['mean', 'sum']"
    )
assert threw_exception

In [13]:
# TODO - more tests for validating agg

In [33]:
# export
def prep_aoi(
    aoi: gpd.GeoDataFrame,  # Area of interest
) -> tuple:  # (prepared aoi, saved 'index' column or None, saved 'aoi_index' column or None)
    """
    prepare aoi for spatial join
      - split off any existing columns named index and aoi_index and drop them from aoi
      - create a column 'aoi_index' from aoi index

    Works on a copy; the aoi passed in is not modified.
    Returns a 3-tuple: the prepared copy, the original 'index' column (or None)
    and the original 'aoi_index' column (or None).
    """
    # prep for spatial join
    aoi = aoi.copy()

    # handle existing col named 'index'
    aoi_index_data = None
    if "index" in aoi.columns:
        aoi_index_data = aoi["index"]
        aoi = aoi.drop(columns="index")

    # handle existing col named 'aoi_index'
    aoi_col_data = None
    if "aoi_index" in aoi.columns:
        aoi_col_data = aoi["aoi_index"]
        aoi = aoi.drop(columns="aoi_index")

    # create index col for broadcast to features.
    # reset_index inserts the former index level as the first column; it is
    # only called "index" when the index is unnamed, so rename it by position
    # to also support an aoi whose index has a name.
    aoi = aoi.reset_index(level=0)
    aoi = aoi.rename(columns={aoi.columns[0]: "aoi_index"})
    return aoi, aoi_index_data, aoi_col_data

In [15]:
# TODO - add tests
# - show addition of aoi_index
# - show split if existing column index
# - show split if existing column aoi_index

In [35]:
# export


def aggregate_stats(
    aoi: gpd.GeoDataFrame,  # Area of interest
    groups: pd.core.groupby.DataFrameGroupBy,  # data aggregated into groups by 'aoi_index'
    agg: dict,  # aggregation to be applied for a given column
) -> gpd.GeoDataFrame:
    """Aggregate groups and compute agg functions in agg['func'] for agg['column'], map them to output columns in agg['output']
    and merge them back to aoi.

    When agg has a 'fillna' entry, output columns whose flag is true get their
    NA values (aoi rows without a matching group) replaced with 0.
    """
    aggregates = groups[agg["column"]].agg(agg["func"])
    renames = dict(zip(agg["func"], agg["output"]))
    aggregates = aggregates.rename(columns=renames)
    results = aoi.merge(aggregates, how="left", on="aoi_index", suffixes=(None, "_y"))

    # apply the fillna flags; a spec without 'fillna' leaves NA values as they are
    fill_flags = agg.get("fillna", [])
    if isinstance(fill_flags, bool):
        fill_flags = [fill_flags] * len(agg["output"])
    for output, fill in zip(agg["output"], fill_flags):
        # an output name that clashes with an existing aoi column is suffixed by the merge
        merged_name = f"{output}_y" if output in aoi.columns else output
        if fill and merged_name in results.columns:
            results[merged_name] = results[merged_name].fillna(0)

    return results

In [17]:
# TODO:
# - setup aggregate stats inputs
# - show examples of aggregate stats

In [36]:
# export


def create_zonal_stats(
    aoi: gpd.GeoDataFrame,  # Area of interest for which zonal stats are to be computed for
    data: gpd.GeoDataFrame,  # Source gdf containing data to compute zonal stats from
    # data_type: str
    aggregations: list,  # a list of aggregation operations, with each agg applied to a column
    overlap_method: str = "intersects",  # spatial predicate to be used in spatial join of aoi and data
    # categorical_column_options: str = None,
) -> gpd.GeoDataFrame:
    """
    Create zonal stats for area of interest from data using aggregation operations on data columns.
    Returns a copy of aoi with additional columns containing the computed zonal features.

    Neither aoi, data nor the aggregation dicts passed in are modified.
    Existing aoi columns named 'index' or 'aoi_index' are set aside during the
    computation and re-attached (as the last columns) in the result.
    """
    # make data crs == aoi crs; skip reprojection when either CRS is undefined
    # instead of failing on a None crs
    if data.crs is not None and aoi.crs is not None and not data.crs.equals(aoi.crs):
        data = data.to_crs(aoi.crs)

    # prep for spatial join
    aoi, aoi_index_data, aoi_col_data = prep_aoi(aoi)

    # spatial join - broadcast aoi_index to data => features
    features = gpd.sjoin(
        aoi[["aoi_index", "geometry"]], data, how="inner", predicate=overlap_method
    )

    # group
    groups = features.groupby("aoi_index")
    results = aoi

    # apply aggregations one column at a time
    for agg in aggregations:
        # validate a copy so the caller's aggregation specs are left untouched
        agg = fix_agg(dict(agg))
        results = aggregate_stats(results, groups, agg)

    # cleanup results
    results = results.drop(columns="aoi_index")
    # The saved columns are Series: test against None (Series truthiness is
    # ambiguous and raises) and assign by position, because the index of
    # results was rebuilt and no longer matches the saved Series' labels.
    if aoi_index_data is not None:
        results["index"] = aoi_index_data.values
    if aoi_col_data is not None:
        results["aoi_index"] = aoi_col_data.values

    return results

#### Use case 1 - POIs count
* input:
    - aoi - geometries of regions 3, 4 and NCR (geom_type - polygon, multipolygon)
    - data - pois (geom_type - points)
    - data_type: 'individual_pois' (planned option; currently commented out in the `create_zonal_stats` signature)
    - overlap_method = 'intersects' (the default spatial predicate)
    - aggregations:
        * count
            - number of pois within aoi
            - output column name
        

In [19]:
%%time
# area multipolygons for regions 3,4,ncr of the philippines
aoi = gpd.read_file("../data/region34ncr_admin.geojson")

CPU times: user 3.03 s, sys: 509 ms, total: 3.54 s
Wall time: 3.53 s


In [20]:
%%time
# raw pois from osm data
raw_data = gpd.read_file("../data/ph_pois.geojson")

CPU times: user 486 ms, sys: 20.8 ms, total: 507 ms
Wall time: 505 ms


In [21]:
attractions = raw_data[raw_data.fclass == "attraction"]

In [22]:
aoi_attr = create_zonal_stats(aoi, attractions, aggregations=[{"func": "count"}])

Unnamed: 0,Reg_Code,Reg_Name,Reg_Alt_Name,geometry,aoi_index_count
0,130000000,National Capital Region,NCR,"MULTIPOLYGON (((121.03842 14.78525, 121.03815 ...",136
1,30000000,Region III,Central Luzon,"MULTIPOLYGON (((120.11687 14.76309, 120.11684 ...",205
2,40000000,Region IV-A,Calabarzon,"MULTIPOLYGON (((122.72165 13.36485, 122.72143 ...",312


In [23]:
# the result keeps every aoi column in order and appends the new count column
assert list(aoi_attr.columns.values) == [*aoi.columns.values, "aoi_index_count"]

In [24]:
# zonal stats must return exactly one row per aoi row
assert len(aoi_attr) == len(aoi)

In [25]:
# apart from the added column, the result is identical to the input aoi
assert aoi_attr.drop(columns="aoi_index_count").equals(aoi)

In [26]:
# expected number of attractions per region, in aoi row order
assert aoi_attr["aoi_index_count"].tolist() == [136, 205, 312]

In [30]:
%%time
aoi_attr = create_zonal_stats(
    aoi, attractions, aggregations=[{"func": "count", "output": "attractions"}]
)

CPU times: user 126 ms, sys: 0 ns, total: 126 ms
Wall time: 124 ms


In [29]:
aoi_attr

Unnamed: 0,Reg_Code,Reg_Name,Reg_Alt_Name,geometry,attractions
0,130000000,National Capital Region,NCR,"MULTIPOLYGON (((121.03842 14.78525, 121.03815 ...",136
1,30000000,Region III,Central Luzon,"MULTIPOLYGON (((120.11687 14.76309, 120.11684 ...",205
2,40000000,Region IV-A,Calabarzon,"MULTIPOLYGON (((122.72165 13.36485, 122.72143 ...",312


In [31]:
# a custom 'output' name is used for the appended column
assert list(aoi_attr.columns.values) == [*aoi.columns.values, "attractions"]