# Post-process morphotopes

This ensures that a morphotope boundary does not split a connected component of buildings.

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from libpysal.graph import read_parquet

In [None]:
# Root data directory and the processed-data sub-directories used in this notebook.
regions_datadir = "/data/uscuni-ulce/"
chars_dir = "/data/uscuni-ulce/processed_data/chars/"
clusters_dir = "/data/uscuni-ulce/processed_data/clusters/"
graph_dir = "/data/uscuni-ulce/processed_data/neigh_graphs/"
morphotopes_dir = "/data/uscuni-ulce/processed_data/morphotopes/"

# Region hulls; the index of this frame supplies the region ids that are
# processed further down.
region_hulls = gpd.read_parquet(
    regions_datadir + "regions/" + "cadastre_regions_hull.parquet"
)

In [4]:
# File-name suffix of the morphotope label files that are read as input.
input_model_params = "_75_0_None_None_False"

In [5]:
# File-name suffix of the post-processed label and data files that are written.
output_model_params = "_post_processing_v1"

1. Change morphotope boundaries so that all buildings in a connected component of adjacent buildings share one morphotope: the most common non-noise label within that component.

In [6]:
def post_process_morphotope_labels(region_id, input_model_params, output_model_params):
    """Give every connected building component a single morphotope label.

    Loads the building graph and the morphotope labels of one region, picks
    one label per graph component and writes the relabelled series as a
    one-column parquet file into ``morphotopes_dir``.

    Parameters
    ----------
    region_id
        Identifier of the region; interpolated into the file names.
    input_model_params : str
        File-name suffix of the label file that is read.
    output_model_params : str
        File-name suffix of the label file that is written.

    Returns
    -------
    None
        The result is written to disk only.
    """

    def _component_label(labels):
        # Labels ending in "-1" are treated as noise. Prefer the most common
        # non-noise label; if the component is only noise, use the most
        # common noise label instead.
        candidates = labels[~labels.str.endswith("-1")]
        if candidates.shape[0] == 0:
            candidates = labels
        return candidates.mode()[0]

    ## read data
    graph_path = graph_dir + f"building_graph_{region_id}.parquet"
    labels_path = f"{morphotopes_dir}tessellation_labels_morphotopes_{region_id}{input_model_params}.pq"
    building_graph = read_parquet(graph_path)
    original_labels = pd.read_parquet(labels_path).morphotope_label
    # restrict to the graph's nodes, ordered like the graph
    original_labels = original_labels.loc[building_graph.unique_ids]

    components = building_graph.component_labels

    # one label per connected component
    label_per_component = original_labels.groupby(components).agg(_component_label)

    # broadcast the component's label back to each of its members
    relabelled = components.map(label_per_component.to_dict()).rename(
        "morphotope_label"
    )

    # the output must describe exactly the same rows, in the same order
    assert (relabelled.index == original_labels.index).all()

    output_path = f"{morphotopes_dir}tessellation_labels_morphotopes_{region_id}{output_model_params}.pq"
    relabelled.to_frame().to_parquet(output_path)

In [10]:
%%time
from joblib import Parallel, delayed

n_jobs = -1
new = Parallel(n_jobs=n_jobs)(
    delayed(post_process_morphotope_labels)(
        region_id, input_model_params, output_model_params
    )
    for region_id, _ in region_hulls.iterrows()
)

CPU times: user 962 ms, sys: 484 ms, total: 1.45 s
Wall time: 7min 20s


2. Generate new morphotope data (per-morphotope summary statistics and sizes) based on the new morphotope boundaries.

In [11]:
def percentile(n):
    """Build a named callable that computes the ``n``-th percentile.

    Parameters
    ----------
    n
        Percentile passed on to ``np.percentile``.

    Returns
    -------
    callable
        Function of one array-like argument whose ``__name__`` is
        ``"percentile_<n>"``, so that several percentiles can be told apart
        by name.
    """

    def percentile_(values):
        return np.percentile(values, n)

    # rename the closure; every instance would otherwise be called "percentile_"
    percentile_.__name__ = "percentile_%s" % n
    return percentile_


def post_process_morphotope_data(region_id, input_model_params, output_model_params):
    """Recompute per-morphotope summary statistics for one region.

    Reads the post-processed morphotope labels and the region's primary
    characters, aggregates the characters per morphotope label, adds the
    number of rows per morphotope and writes the table as parquet into
    ``morphotopes_dir``.

    Parameters
    ----------
    region_id
        Identifier of the region; interpolated into the file names.
    input_model_params : str
        Unused here; still accepted because callers pass it positionally.
    output_model_params : str
        File-name suffix of the label file that is read and of the data file
        that is written.

    Returns
    -------
    None
        The aggregated table is written to disk only.
    """
    ## read data
    new_morphotope_labels = pd.read_parquet(
        f"{morphotopes_dir}tessellation_labels_morphotopes_{region_id}{output_model_params}.pq"
    ).morphotope_label
    X_train = pd.read_parquet(chars_dir + f"primary_chars_{region_id}.parquet")

    # Select the labelled rows and group them by morphotope once; the same
    # grouping is reused for the statistics and for the sizes.
    by_morphotope = X_train.loc[new_morphotope_labels.index].groupby(
        new_morphotope_labels.values
    )

    # get morphotope stats
    component_data = by_morphotope.agg(
        [percentile(25), "median", percentile(75), "std", "mean"]
    )

    # save sizes for clustering
    component_data[("Size", "Size")] = by_morphotope.size()

    # store morphotopes data
    component_data.to_parquet(
        morphotopes_dir + f"data_morphotopes_{region_id}{output_model_params}.pq"
    )

In [12]:
%%time
from joblib import Parallel, delayed

n_jobs = -1
new = Parallel(n_jobs=n_jobs)(
    delayed(post_process_morphotope_data)(
        region_id, input_model_params, output_model_params
    )
    for region_id, _ in region_hulls.iterrows()
)

CPU times: user 878 ms, sys: 562 ms, total: 1.44 s
Wall time: 7min 10s
