In [1]:
# default_exp vector_zonal_stats

In [2]:
# hide
# no_test
# Shell escape: only when the path /content exists (presumably a Google Colab
# runtime — confirm) quietly install/upgrade geowrangler from its GitHub repo.
# When /content is absent the `&&` short-circuits and nothing is installed.
! [ -e /content ] && pip install -Uqq git+https://github.com/thinkingmachines/geowrangler.git

In [3]:
# hide
# no_test
# Development conveniences: (re)load the autoreload extension and set it to
# mode 2 so edited modules are re-imported before each cell executes, and
# render matplotlib figures inline in the notebook.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [4]:
# hide
# no_test
# conditionally load the lab_black cell formatter (skipped on Google Colab)
import sys

if "google.colab" not in sys.modules:
    from IPython import get_ipython

    ipython = get_ipython()
    # get_ipython() returns None when no interactive shell is running
    if ipython is not None:
        # run_line_magic replaces the deprecated InteractiveShell.magic()
        ipython.run_line_magic("reload_ext", "lab_black")

In [5]:
# exporti
import warnings

# Silence every warning category process-wide.
# NOTE(review): this cell carries the `exporti` flag, so the filter presumably
# ends up in the generated library module and would then also hide warnings
# (e.g. pandas/geopandas deprecations) for anything importing it — confirm
# that this is intended, or narrow the filter to specific categories.
warnings.filterwarnings("ignore")

# Vector Zonal Stats

> generate vector zonal stat features

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/thinkingmachines/geowrangler/blob/master/notebooks/02_vector_zonal_stats.ipynb)

In [6]:
# hide
import geopandas

In [7]:
# exporti
from typing import Any, Dict, List, Optional, Tuple

import geopandas as gpd
import pandas as pd

### Aggregations

In order to generate zonal stats, we have come up with the concept of an **aggregation specification**, which is a way to specify what aggregation functions (such as `count`, `sum`, `mean`, `std` etc.) are to be applied to columns in the data.

The method `create_zonal_stats` can then take in a list of these specifications and apply them to create zonal stats for the area of interest.

An "aggregation specification" aka "agg spec" is a specification of how to generate a zonal statistic from a column. It consists of a `dict` with the following keys:

* `func`: (Required) a `str` or a list `[str]` of aggregation functions. See the pandas documentation for [`agg`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.agg.html)

* `column`: (Optional) an existing column to generate the zonal statistic from. If not specified, the grouping key `aoi_index` is used as default.

* `output`: (Optional) a `str` or a list `[str]` of the name(s) of the output zonal statistic column. If not specified it is concatenated from the column and func e.g. `{column}_{func}`.

* `fillna`: (Optional) a `bool` or a list `[bool]` of the flag(s) that indicate whether to do a `fillna(0)` step for the new zonal column: `True` sets any `NA` values in the resulting zonal stat to `0`, and `False` retains any `NA` values. The default value of the flag(s) is `True`.

**Examples**

* The simplest aggregation spec:
``` 
{"func":"count"}
```

* Compute the zonal stats `mean`,`sum`,`std` on the `population` column and rename them to `pop_avg`, `pop_total` and `pop_sdev` and retain the `NA` on `std` if no data:

```
{"func":["mean","sum","std"],
 "column": "population",
 "output": ["pop_avg","pop_total","pop_sdev"],
 "fillna": [True,True,False]
 }
 ```



### Internal API

In [8]:
# export
def fix_agg(
    agg: Dict[str, Any],  # A dict containing at the minimum a 'func' key
) -> Dict[str, Any]:
    """
    Validate and (possibly) fix an `agg spec`.

    The spec passed in is left untouched; a new dict is returned containing the following keys:
      - 'func': a list of aggregation functions (should be a valid 'agg' function)
      - 'column': a column to apply the aggregation functions (should be a valid numeric column in data)
      - 'output': the names of the new columns containing the application of the aggregation functions (default: concat column + '_' + func)
      - 'fillna': boolean list whether to replace new columns with 'NA' values  with 0 (default: True)

    A `str` given for 'func' or 'output', or a `bool` given for 'fillna', is
    wrapped into a one-element list.

    Raises `ValueError` if 'func' is missing, or if the 'output' or 'fillna'
    list does not have exactly one entry per aggregation function.
    """
    if "func" not in agg:
        raise ValueError(f"Missing key 'func' for agg {agg}")

    # work on a shallow copy so the caller's spec is never modified
    fixed = dict(agg)

    if isinstance(fixed["func"], str):
        fixed["func"] = [fixed["func"]]
    n_funcs = len(fixed["func"])

    # optional column, default to index count
    fixed.setdefault("column", "aoi_index")

    # check matching output
    if "output" not in fixed:
        fixed["output"] = [f'{fixed["column"]}_{f}' for f in fixed["func"]]
    elif isinstance(fixed["output"], str):
        fixed["output"] = [fixed["output"]]
    if len(fixed["output"]) != n_funcs:
        raise ValueError(
            f"output list {fixed['output']} doesn't match func list {fixed['func']}"
        )

    # check matching fillna
    if "fillna" not in fixed:
        fixed["fillna"] = [True] * n_funcs
    elif isinstance(fixed["fillna"], bool):
        # a bare bool has no len(); wrap it the same way a str func/output is wrapped
        fixed["fillna"] = [fixed["fillna"]]
    if len(fixed["fillna"]) != n_funcs:
        raise ValueError(
            f"fillna list {fixed['fillna']} doesn't match func list {fixed['func']}"
        )

    return fixed

In [9]:
# hide
# With only 'func' and 'column' given, the output names default to
# {column}_{func} and every fillna flag defaults to True.
spec = {"func": ["sum", "max", "min", "mean"], "column": "population"}
expected = {
    "func": ["sum", "max", "min", "mean"],
    "column": "population",
    "output": ["population_sum", "population_max", "population_min", "population_mean"],
    "fillna": [True, True, True, True],
}
assert fix_agg(spec) == expected

In [10]:
# hide
# The minimal spec: a single str func is wrapped in a list and the column
# falls back to the grouping key 'aoi_index'.
expected = dict(
    func=["count"],
    column="aoi_index",
    output=["aoi_index_count"],
    fillna=[True],
)
assert fix_agg(dict(func="count")) == expected

In [11]:
# hide
# A fully specified, consistent spec passes validation and comes back with
# the same content.
full_spec = {
    "func": ["sum", "max", "min", "mean", "std"],
    "column": "population",
    "output": ["pop_sum", "pop_max", "pop_min", "avg_pop", "pop_std_dev"],
    "fillna": [True, True, True, True, False],
}
assert fix_agg(dict(full_spec)) == full_spec

In [12]:
# hide
# A single 'output' name given as a plain str is wrapped into a one-element list.
assert fix_agg({"func": "sum", "column": "population", "output": "pop_total"}) == {
    "func": ["sum"],
    "column": "population",
    "output": ["pop_total"],
    "fillna": [True],
}

In [13]:
# hide
# A spec without the mandatory 'func' key must be rejected with a ValueError.
threw_exception = False
try:
    fix_agg({})
except ValueError as err:
    threw_exception = True
    assert err.args[0] == "Missing key 'func' for agg {}"
assert threw_exception

In [14]:
# hide
# Two funcs but only one output name: the length mismatch must be rejected.
threw_exception = False
try:
    fix_agg({"func": ["mean", "sum"], "column": "population", "output": ["pop_mean"]})
except ValueError as err:
    threw_exception = True
    expected_message = "output list ['pop_mean'] doesn't match func list ['mean', 'sum']"
    assert err.args[0] == expected_message
assert threw_exception

In [15]:
# hide
# TODO - more tests for validating agg

In [16]:
# export
def prep_aoi(
    aoi: gpd.GeoDataFrame,  # Area of interest
) -> Tuple[gpd.GeoDataFrame, Optional[pd.Series], Optional[pd.Series]]:
    """
    Prepare aoi for spatial join
      - split off any existing columns named index and aoi_index and drop them from aoi
      - create a column 'aoi_index' from aoi's index which will be used as grouping key

    Returns a tuple `(aoi, aoi_index_data, aoi_col_data)`:
      - aoi: a copy of the input with its first index level moved into a leading 'aoi_index' column
      - aoi_index_data: the removed 'index' column, or None if there was none
      - aoi_col_data: the removed 'aoi_index' column, or None if there was none
    """
    # work on a copy so the caller's frame is never modified
    aoi = aoi.copy()

    # set aside an existing col named 'index'
    aoi_index_data = None
    if "index" in aoi.columns:
        aoi_index_data = aoi["index"]
        aoi = aoi.drop(columns="index")

    # set aside an existing col named 'aoi_index'
    aoi_col_data = None
    if "aoi_index" in aoi.columns:
        aoi_col_data = aoi["aoi_index"]
        aoi = aoi.drop(columns="aoi_index")

    # create index col for broadcast to features.
    # The column is inserted under its final name directly: reset_index() names
    # the new column after the index (only an unnamed index yields "index"), so
    # renaming "index" -> "aoi_index" afterwards misses any named index.
    aoi.insert(0, "aoi_index", aoi.index.get_level_values(0))
    aoi = aoi.reset_index(level=0, drop=True)
    return aoi, aoi_index_data, aoi_col_data

In [17]:
# hide
# TODO - add tests
# - show addition of aoi_index
# - show split if existing column index
# - show split if existing column aoi_index

In [18]:
# export


def aggregate_stats(
    aoi: gpd.GeoDataFrame,  # Area of interest
    groups: pd.core.groupby.DataFrameGroupBy,  # Source data aggregated into groups by 'aoi_index'
    agg: Dict[str, Any],  # An agg spec to be applied
) -> gpd.GeoDataFrame:
    """Aggregate groups and compute agg functions in agg['func'] for agg['column'], map them to output columns in agg['output']
    and merge them back to aoi dataframe

    The merge is a left join on 'aoi_index', so every aoi row is kept; rows with
    no matching group get NA in the new columns, which is replaced by 0 for each
    output whose agg['fillna'] flag is true.
    """
    aggregates = groups[agg["column"]].agg(agg["func"])
    aggregates = aggregates.rename(columns=dict(zip(agg["func"], agg["output"])))
    results = aoi.merge(aggregates, how="left", on="aoi_index", suffixes=(None, "_y"))

    # set NAs to 0 if fillna
    existing_columns = set(aoi.columns)
    for colname, fill in zip(agg["output"], agg["fillna"]):
        if not fill:
            continue
        # an output name already present in aoi receives the "_y" merge suffix
        merged_name = f"{colname}_y" if colname in existing_columns else colname
        if merged_name in results.columns:
            # assign the result back: an in-place fillna on a column selection
            # does not update the parent frame under pandas Copy-on-Write
            results[merged_name] = results[merged_name].fillna(0)

    return results

In [19]:
# hide
# TODO:
# - setup aggregate stats inputs
# - show examples of aggregate stats

### External API

In [20]:
# export


def create_zonal_stats(
    aoi: gpd.GeoDataFrame,  # Area of interest for which zonal stats are to be computed for
    data: gpd.GeoDataFrame,  # Source gdf containing data to compute zonal stats from
    aggregations: List[Dict[str, Any]],  # a list of agg specs, with each agg spec applied to a data column
    overlap_method: str = "intersects",  # spatial predicate to be used in spatial join of aoi and data
    # categorical_column_options: str = None,
) -> gpd.GeoDataFrame:
    """
    Create zonal stats for area of interest from data using aggregation operations on data columns.
    Returns a copy of aoi with additional columns containing the computed zonal features.

    Neither `aoi`, `data` nor the dicts in `aggregations` are modified.
    Columns of aoi named 'index' or 'aoi_index' are set aside during processing
    and re-attached (as the last columns) to the result. The first index level
    of aoi is used as the internal grouping key and is not restored on the result.
    """
    # make data crs == aoi crs
    if not data.crs.equals(aoi.crs):
        data = data.to_crs(aoi.crs)

    # prep for spatial join
    aoi, aoi_index_data, aoi_col_data = prep_aoi(aoi)

    # spatial join - broadcast aoi_index to data => features
    features = gpd.sjoin(
        aoi[["aoi_index", "geometry"]], data, how="inner", predicate=overlap_method
    )

    # group
    groups = features.groupby("aoi_index")
    results = aoi

    # apply aggregations one column at a time
    for agg in aggregations:
        # validate a shallow copy so the caller's spec dicts stay untouched
        agg = fix_agg(dict(agg))
        results = aggregate_stats(results, groups, agg)

    # cleanup results
    results = results.drop(columns="aoi_index")
    # `is not None`: the truth value of a Series is ambiguous and raises.
    # `.values`: re-attach by position, since the left merges keep the row
    # order of aoi while the set-aside Series still carry aoi's original index
    # labels, which need not match the labels of `results`.
    if aoi_index_data is not None:
        results["index"] = aoi_index_data.values
    if aoi_col_data is not None:
        results["aoi_index"] = aoi_col_data.values

    return results

In [21]:
# hide
# no_test
# Run nbdev's exporter on this notebook.
# NOTE(review): presumably this writes the `# export` / `# exporti` cells to
# the module named by the `default_exp` directive in the first cell — confirm
# against the project's nbdev settings.
from nbdev.export import notebook2script

notebook2script("02_vector_zonal_stats.ipynb")

Converted 02_vector_zonal_stats.ipynb.
