In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
from scipy.spatial import KDTree
from sklearn.preprocessing import QuantileTransformer, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from libpysal.graph import read_parquet

In [2]:
# Country whose region hulls are loaded and post-processed in this notebook.
country = "belgium"

In [8]:
# Dataset root and the processed-data folders used by the cells below.
regions_datadir = "/data/uscuni-ulce/"
clusters_dir = '/data/uscuni-ulce/processed_data/clusters/'
chars_dir = '/data/uscuni-ulce/processed_data/chars/'
graph_dir = '/data/uscuni-ulce/processed_data/neigh_graphs/'
morphotopes_dir = '/data/uscuni-ulce/processed_data/morphotopes/'

# Region hulls of the selected country; the index values are used as region
# ids by the processing cells.
region_hulls = gpd.read_parquet(f"{regions_datadir}regions/{country}_regions_hull.parquet")

In [4]:
# File-name suffix of the morphotope label files that are read as input
# (tessellation_labels_morphotopes_{region_id}{input_model_params}.pq).
input_model_params = '_75_0_None_None_False'

In [5]:
# File-name suffix under which the post-processed labels and morphotope data
# are written.
output_model_params = '_post_processing_v1'

# 1. Change morphotope boundaries so that adjacent buildings are always in the majority morphotope.

In [6]:
def post_process_morphotope_labels(region_id, input_model_params, output_model_params):
    """Give all buildings of a connected graph component the same morphotope label.

    Reads the building graph and the morphotope labels of ``region_id``,
    replaces the label of every building by the representative label of its
    graph component and writes the result to
    ``tessellation_labels_morphotopes_{region_id}{output_model_params}.pq``
    inside ``morphotopes_dir``. Nothing is returned.

    Parameters
    ----------
    region_id
        Region identifier used in the input and output file names.
    input_model_params : str
        Suffix of the label file that is read.
    output_model_params : str
        Suffix of the label file that is written.
    """

    def non_noise_mode(x):
        # Most frequent label that does not end in '-1' (noise); if the
        # component holds nothing but noise, the most frequent noise label.
        candidates = x[~x.str.endswith('-1')]
        if candidates.shape[0] == 0:
            candidates = x
        return pd.Series.mode(candidates)[0]

    # load the building graph and the labels of its nodes, in graph order
    building_graph = read_parquet(graph_dir + f"building_graph_{region_id}.parquet")
    labels_path = f'{morphotopes_dir}tessellation_labels_morphotopes_{region_id}{input_model_params}.pq'
    original_labels = pd.read_parquet(labels_path).morphotope_label
    original_labels = original_labels.loc[building_graph.unique_ids]

    # one representative label per connected component ...
    component_ids = building_graph.component_labels
    label_per_component = original_labels.groupby(component_ids).agg(non_noise_mode)

    # ... broadcast back to every building of that component
    harmonised_labels = component_ids.map(label_per_component.to_dict())
    harmonised_labels.name = 'morphotope_label'
    assert (harmonised_labels.index == original_labels.index).all()

    output_path = f'{morphotopes_dir}tessellation_labels_morphotopes_{region_id}{output_model_params}.pq'
    harmonised_labels.to_frame().to_parquet(output_path)

In [9]:
# Show the region ids that the processing cells below iterate over.
region_hulls.index

Index([153512, 153650, 153752, 153804, 154059, 154947, 155610, 155668, 155971], dtype='int64', name='labels')

In [10]:
%%time
from joblib import Parallel, delayed
n_jobs = -1
new = Parallel(n_jobs=n_jobs)(
    delayed(post_process_morphotope_labels)(region_id, input_model_params, output_model_params) for region_id, _ in region_hulls.iterrows()
)

CPU times: user 962 ms, sys: 484 ms, total: 1.45 s
Wall time: 7min 20s


# 2. Generate new morphotope data based on the new morphotope boundaries

In [11]:
def percentile(n):
    """Build an aggregation function that returns the ``n``-th percentile.

    The returned callable wraps ``np.percentile`` and has its ``__name__`` set
    to ``percentile_<n>``, so that different percentiles carry distinct names
    (e.g. when several of them are passed to ``agg``).
    """
    label = 'percentile_%s' % n

    def percentile_(values):
        return np.percentile(values, n)

    percentile_.__name__ = label
    return percentile_

def post_process_morphotope_data(region_id, input_model_params, output_model_params):
    """Aggregate the primary characters of a region per post-processed morphotope.

    Reads the labels written under ``output_model_params`` and the primary
    characters of ``region_id``, computes per-morphotope statistics for every
    column (25th percentile, median, 75th percentile, standard deviation and
    mean), adds the number of rows per morphotope as the ``('Size', 'Size')``
    column and writes the table to
    ``data_morphotopes_{region_id}{output_model_params}.pq`` inside
    ``morphotopes_dir``. Nothing is returned.

    Parameters
    ----------
    region_id
        Region identifier used in the input and output file names.
    input_model_params : str
        Not used by this function; kept so the signature matches
        ``post_process_morphotope_labels``.
    output_model_params : str
        Suffix of the label file that is read and of the data file written.
    """
    ## read data
    new_morphotope_labels = pd.read_parquet(
        f'{morphotopes_dir}tessellation_labels_morphotopes_{region_id}{output_model_params}.pq'
    ).morphotope_label
    X_train = pd.read_parquet(chars_dir + f'primary_chars_{region_id}.parquet')

    # Select the labelled rows and build the grouping once; both the
    # statistics and the sizes are derived from the same grouped object.
    grouped = X_train.loc[new_morphotope_labels.index].groupby(new_morphotope_labels.values)

    # get morphotope stats
    component_data = grouped.agg([percentile(25), 'median', percentile(75), 'std', 'mean'])

    # save sizes for clustering
    component_data[('Size', 'Size')] = grouped.size()

    # store morphotopes data
    component_data.to_parquet(morphotopes_dir + f'data_morphotopes_{region_id}{output_model_params}.pq')

In [12]:
%%time
from joblib import Parallel, delayed
n_jobs = -1
new = Parallel(n_jobs=n_jobs)(
    delayed(post_process_morphotope_data)(region_id, input_model_params, output_model_params) for region_id, _ in region_hulls.iterrows()
)

CPU times: user 878 ms, sys: 562 ms, total: 1.44 s
Wall time: 7min 10s


### Plotting

In [13]:
# Single region used by the plotting and experiment cells that follow.
region_id = 153512

In [18]:

from lonboard import SolidPolygonLayer, Map
from lonboard.basemap import CartoBasemap
from lonboard.colormap import apply_categorical_cmap
from palettable.colorbrewer.qualitative import Set3_12
from core.cluster_validation import get_color

In [17]:
# buildings of the inspected region and its post-processed morphotope labels
buildings = gpd.read_parquet(
    f'/data/uscuni-ulce/processed_data/buildings/buildings_{region_id}.parquet'
)
region_morphotope_labels = pd.read_parquet(
    f'{morphotopes_dir}tessellation_labels_morphotopes_{region_id}{output_model_params}.pq'
)['morphotope_label']

In [21]:
# Attach the labels to the buildings; the assignment aligns on the index, so
# buildings without a matching label get NaN.
buildings['morph'] = region_morphotope_labels

In [23]:
# polygon layer holding the building geometries together with their labels
layer = SolidPolygonLayer.from_geopandas(
    gdf=buildings[["geometry", "morph"]],
    opacity=0.15,
)



In [24]:
# Put the building layer on a Positron basemap and display the map.
m = Map(layer, basemap_style=CartoBasemap.Positron)
m

Map(basemap_style=<CartoBasemap.Positron: 'https://basemaps.cartocdn.com/gl/positron-gl-style/style.json'>, la…

In [25]:
# Encode the labels as integers: `factors` holds one code per building and
# `idx` the distinct labels (missing labels are coded as -1 by factorize).
factors, idx = buildings['morph'].factorize()

In [26]:
# Colour the buildings by their label code.
# NOTE(review): get_color comes from core.cluster_validation; presumably it
# maps the integer codes to colours accepted by lonboard - confirm.
layer.get_fill_color = get_color(factors)

  """Computes the :math:`\Delta E` distance between pairs of colors.


# Experiments

In [108]:
from libpysal.graph import Graph

# Overrides the suffix set earlier: results of the experimental
# post-processing below are written under '_post_processing_v2'.
output_model_params = '_post_processing_v2'

In [109]:
def relabel_morphotopes_geography(group, max_distance=100, min_size=75):
    """Split one morphotope into spatially connected parts.

    Buildings of ``group`` that lie within ``max_distance`` of each other are
    linked, and every connected set of linked buildings becomes its own
    morphotope labelled ``{gc}_{morph_number}_{component}``. Connected sets
    with ``min_size`` buildings or fewer are relabelled as noise, ``{gc}_-1``.

    Parameters
    ----------
    group : geopandas.GeoDataFrame
        Buildings of a single morphotope, with a ``geometry`` and a ``morph``
        column. The ``morph`` value of the first row is read as
        ``{gc}_{morph_number}`` and used for the whole group.
    max_distance : float, default 100
        Linking distance passed to the ``dwithin`` spatial query, in the units
        of the geometries' coordinate reference system.
    min_size : int, default 75
        A connected set keeps a morphotope label only when it contains more
        than ``min_size`` buildings.

    Returns
    -------
    pandas.Series
        The new label of every building in the spatial graph, indexed by the
        index values of ``group``.
    """
    # extract morph info
    label_parts = group.morph.iloc[0].split('_')
    gc, morph_number = label_parts[0], label_parts[1]

    # group the buildings that are within `max_distance` of each other
    focal, neighbor = group.sindex.query(group.geometry, predicate='dwithin', distance=max_distance)
    geom_graph = Graph.from_adjacency(pd.DataFrame({'focal': group.index[focal],
                                                    'neighbor': group.index[neighbor], 'weight': 1}))
    component_labels = geom_graph.component_labels

    # decide the new label once per component rather than once per building
    new_labels = {
        component: f'{gc}_{morph_number}_{component}' if size > min_size else f'{gc}_-1'
        for component, size in component_labels.value_counts().items()
    }

    # return new morphotope labels
    return component_labels.map(new_labels)

In [110]:
def post_process_morphotope_labels(region_id, input_model_params, output_model_params):
    """Harmonise morphotope labels per building component, then split them spatially.

    Steps:

    1. Read the building graph and the morphotope labels of ``region_id`` and
       give every building of a connected graph component the same label (the
       most frequent non-noise label, or the most frequent noise label when
       the component holds only noise).
    2. Read the building geometries and pass every non-noise morphotope to
       ``relabel_morphotopes_geography``, which splits it into spatially
       connected parts. Noise morphotopes (label ending in ``_-1``) are left
       unchanged.
    3. Write the labels to
       ``tessellation_labels_morphotopes_{region_id}{output_model_params}.pq``
       inside ``morphotopes_dir``. Nothing is returned.

    Parameters
    ----------
    region_id
        Region identifier used in the input and output file names.
    input_model_params : str
        Suffix of the label file that is read.
    output_model_params : str
        Suffix of the label file that is written.
    """

    ## read data
    bq1 = read_parquet(graph_dir + f"building_graph_{region_id}.parquet")
    region_morphotope_labels = pd.read_parquet(f'{morphotopes_dir}tessellation_labels_morphotopes_{region_id}{input_model_params}.pq').morphotope_label
    region_morphotope_labels = region_morphotope_labels.loc[bq1.unique_ids]

    # assign mode of non-noise clusters to whole adjacent structure
    # if it's only noise, assign the most common noise cluster
    def non_noise_mode(x):
        non_noise = x[~x.str.endswith('-1')]
        if non_noise.shape[0]:
            return pd.Series.mode(non_noise)[0]
        else:
            return pd.Series.mode(x)[0]

    component_morphotopes = region_morphotope_labels.groupby(bq1.component_labels).agg(non_noise_mode)

    aggregated_morphotope_labels = bq1.component_labels.map(component_morphotopes.to_dict())
    aggregated_morphotope_labels.name = 'morphotope_label'

    # the graph is not needed any more; drop it before loading the geometries
    del bq1

    # read buildings geometries from the shared characters folder
    buildings = gpd.read_parquet(chars_dir + f'buildings_chars_{region_id}.parquet', columns=['geometry'])
    buildings['morph'] = aggregated_morphotope_labels

    # tighten morphotopes based on geographic distances; noise is left unchanged
    res = [
        group.morph if key.split('_')[-1] == '-1' else relabel_morphotopes_geography(group)
        for key, group in buildings.groupby('morph')
    ]

    aggregated_morphotope_labels = pd.concat(res).sort_index()
    aggregated_morphotope_labels.name = 'morphotope_label'

    # every building of the graph must still carry exactly one label
    assert (aggregated_morphotope_labels.index == region_morphotope_labels.index).all()
    aggregated_morphotope_labels.to_frame().to_parquet(f'{morphotopes_dir}tessellation_labels_morphotopes_{region_id}{output_model_params}.pq')

In [64]:
## read data
# Building graph of the inspected region and the input morphotope labels,
# reordered to follow the node order of the graph.
bq1 = read_parquet(graph_dir + f"building_graph_{region_id}.parquet")
region_morphotope_labels = pd.read_parquet(f'{morphotopes_dir}tessellation_labels_morphotopes_{region_id}{input_model_params}.pq').morphotope_label
region_morphotope_labels = region_morphotope_labels.loc[bq1.unique_ids]

def non_noise_mode(x):
    """Return the representative label of a group of morphotope labels.

    The result is the most frequent label that does not end in ``'-1'``
    (noise). When the group holds only noise labels, the most frequent noise
    label is returned instead. Ties are resolved by taking the first mode
    reported by pandas.
    """
    candidates = x[~x.str.endswith('-1')]
    if candidates.shape[0] == 0:
        # nothing but noise in this group: fall back to the noise labels
        candidates = x
    return pd.Series.mode(candidates)[0]
        
# one representative label per connected component of the building graph ...
component_morphotopes = (
    region_morphotope_labels
    .groupby(bq1.component_labels)
    .agg(non_noise_mode)
)

# ... mapped back onto every building of that component
aggregated_morphotope_labels = (
    bq1.component_labels
    .map(component_morphotopes.to_dict())
    .rename('morphotope_label')
)

In [99]:
%%time

buildings = gpd.read_parquet(
    f'/data/uscuni-ulce/processed_data/chars/buildings_chars_{region_id}.parquet',
    columns=['geometry'],
)
buildings['morph'] = aggregated_morphotope_labels

# Split every non-noise morphotope into spatially connected parts; noise
# morphotopes (label ending in '_-1') keep their label.
res = []
groups = buildings.groupby('morph')
for key, group in groups:
    if key.split('_')[-1] != '-1':
        res.append(relabel_morphotopes_geography(group))
    else:
        res.append(group.morph)

aggregated_morphotope_labels = pd.concat(res).sort_index()
aggregated_morphotope_labels.name = 'morphotope_label'

CPU times: user 22.4 s, sys: 15.6 ms, total: 22.4 s
Wall time: 22.4 s


In [103]:
# Inspect the labels produced by the geographic relabelling.
aggregated_morphotope_labels

focal
0            0_-1
1             1_0
2             1_0
3             1_0
4             1_0
           ...   
446822    849_640
446823     849_-1
446824    849_487
446825    849_508
446826    849_361
Name: morphotope_label, Length: 446827, dtype: object

In [95]:
def relabel_morphotopes_geography(group, max_distance=100, min_size=75):
    """Split one morphotope into spatially connected parts.

    Buildings of ``group`` that lie within ``max_distance`` of each other are
    linked, and every connected set of linked buildings becomes its own
    morphotope labelled ``{gc}_{morph_number}_{component}``. Connected sets
    with ``min_size`` buildings or fewer are relabelled as noise, ``{gc}_-1``.

    Parameters
    ----------
    group : geopandas.GeoDataFrame
        Buildings of a single morphotope, with a ``geometry`` and a ``morph``
        column. The ``morph`` value of the first row is read as
        ``{gc}_{morph_number}`` and used for the whole group.
    max_distance : float, default 100
        Linking distance passed to the ``dwithin`` spatial query, in the units
        of the geometries' coordinate reference system.
    min_size : int, default 75
        A connected set keeps a morphotope label only when it contains more
        than ``min_size`` buildings.

    Returns
    -------
    pandas.Series
        The new label of every building in the spatial graph, indexed by the
        index values of ``group``.
    """
    # extract morph info
    label_parts = group.morph.iloc[0].split('_')
    gc, morph_number = label_parts[0], label_parts[1]

    # group the buildings that are within `max_distance` of each other
    focal, neighbor = group.sindex.query(group.geometry, predicate='dwithin', distance=max_distance)
    geom_graph = Graph.from_adjacency(pd.DataFrame({'focal': group.index[focal],
                                                    'neighbor': group.index[neighbor], 'weight': 1}))
    component_labels = geom_graph.component_labels

    # decide the new label once per component rather than once per building
    new_labels = {
        component: f'{gc}_{morph_number}_{component}' if size > min_size else f'{gc}_-1'
        for component, size in component_labels.value_counts().items()
    }

    # return new morphotope labels
    return component_labels.map(new_labels)

In [72]:
%%time
res = buildings.groupby('morph').transform(relabel_morphotopes_geography)

0    POLYGON ((4616718.204 3026081.581, 4616720.974...
Name: geometry, dtype: geometry


AttributeError: 'GeoSeries' object has no attribute 'morph'