# 1. Fix Overlapping Buildings

Overlapping buildings should either be merged to a neighbouring polygon if smaller than a size ``merge_limit`` or if they share area larger than ``area * overlap_limit``, or trimmed if not.

In [131]:
import geopandas as gpd
from tqdm import tqdm
import numpy as np
import libpysal
import geoplanar
import momepy

In [132]:
path = '/Users/lisawinkler/Documents/Prague/BuildingFootprints/buildings_berlin_0.parquet'

In [133]:
buildings=gpd.read_parquet(path,columns=['geometry'])

In [134]:
buildings=buildings.reset_index(drop=True)

In [135]:
buildings

Unnamed: 0,geometry
0,"POLYGON ((4588533.734 5821521.726, 4588570.786..."
1,"POLYGON ((4591889.97 5823105.866, 4591898.467 ..."
2,"POLYGON ((4592002.064 5821961.774, 4592000.684..."
3,"POLYGON ((4592077.946 5822437.286, 4592070.997..."
4,"POLYGON ((4592403.073 5822915.869, 4592383.652..."
...,...
1057236,"POLYGON ((4599145.094 5819669.998, 4599129.491..."
1057237,"POLYGON ((4576136.11 5806949.507, 4576173.534 ..."
1057238,"POLYGON ((4586770.205 5823065.392, 4586753.742..."
1057239,"POLYGON ((4597976.059 5812604.354, 4597984.496..."


## Merge Overlapping Buildings

In [136]:
def merge_overlapping1(gdf, merge_limit, overlap_limit):
    """Merge overlapping polygons based on a set of conditions.
    
    Overlapping polygons smaller than ``merge_limit`` are merged to a neighboring polygon.
    If ``largest=None`` it picks one randomly, otherwise it picks the largest (True) or the
    smallest (False).
    
    Polygons larger than ``merge_limit`` are merged to neighboring if they share area larger
    than ``area * overlap_limit``.
    
    Parameters
    ----------
    gdf : GeoDataFrame
        GeoDataFrame with polygon or mutli polygon geometry
    merge_limit : float
        area of overlapping polygons that are to be merged with neighbors no matter the size
        of the overlap
    overlap_limit : float (0-1)
        ratio of area of an overlapping polygon that has to be shared with other polygon 
        to merge both into a one
    largest : bool (default None)
        Merge each overlapping polygons smaller than merge_limit with  the polygon with the largest intersection (True), or smallest (False) neighbor.
        If None, merge with any neighbor non-deterministically but performantly.
        
    Returns
    -------

    GeoDataFrame
    """
    neighbors = {}
    for i, poly in tqdm(gdf.geometry.items(), total=len(gdf)):
        
        hits_overlaps = gdf.sindex.query(poly, predicate='overlaps')
        hits_overlaps = hits_overlaps[hits_overlaps != i]

        hits_contains = gdf.sindex.query(poly, predicate='contains')
        hits_contains = hits_contains[hits_contains != i]

        if poly.area < merge_limit:
            neighbors[i] = np.unique(np.concatenate([hits_overlaps,hits_contains]))
        else:
            sub = gdf.geometry.iloc[np.unique(np.concatenate([hits_overlaps,hits_contains]))]
            inters = sub.intersection(poly)
            include = sub.index[inters.area > (sub.area * overlap_limit)]
            neighbors[i] = list(include)
    
    W = libpysal.weights.W(neighbors, silence_warnings=True)
    return gdf.dissolve(W.component_labels)

In [138]:
def merge_overlapping2(gdf, merge_limit, overlap_limit):
    """Merge overlapping polygons based on a set of conditions.
    
    Overlapping polygons smaller than ``merge_limit`` are merged to a neighboring polygon.
    If ``largest=None`` it picks one randomly, otherwise it picks the largest (True) or the
    smallest (False).
    
    Polygons larger than ``merge_limit`` are merged to neighboring if they share area larger
    than ``area * overlap_limit``.
    
    Parameters
    ----------
    gdf : GeoDataFrame
        GeoDataFrame with polygon or mutli polygon geometry
    merge_limit : float
        area of overlapping polygons that are to be merged with neighbors no matter the size
        of the overlap
    overlap_limit : float (0-1)
        ratio of area of an overlapping polygon that has to be shared with other polygon 
        to merge both into a one
    largest : bool (default None)
        Merge each overlapping polygons smaller than merge_limit with  the polygon with the largest intersection (True), or smallest (False) neighbor.
        If None, merge with any neighbor non-deterministically but performantly.
        
    Returns
    -------

    GeoDataFrame
    """

    overlap_a, overlap_b = gdf.sindex.query(
        gdf.geometry, predicate="overlaps"
    )
    contains_a, contains_b = gdf.sindex.query(
        gdf.geometry, predicate="contains"
    )

    self_mask = contains_a != contains_b
    contains_a = contains_a[self_mask]
    contains_b = contains_b[self_mask]

    self_mask = overlap_a != overlap_b
    overlap_a = overlap_a[self_mask]
    overlap_b = overlap_b[self_mask]

    source = np.concatenate([overlap_a, contains_a])
    target = np.concatenate([overlap_b, contains_b])

    neighbors = {}
    [neighbors.setdefault(key, []).append(value) for key, value in zip(source, target)]

    neighbors_final = {}

    for i, poly in tqdm(gdf.geometry.items(), total=len(gdf)):
        if i in neighbors:
            if poly.area < merge_limit:
                neighbors_final[i] = neighbors[i]
            else:
                sub = gdf.geometry.iloc[neighbors[i]]
                inters = sub.intersection(poly)
                include = sub.index[inters.area > (sub.area * overlap_limit)]
                neighbors_final[i] = list(include)
        else:
            neighbors_final[i]=[]
    
    W = libpysal.weights.W(neighbors_final, silence_warnings=True)
    return gdf.dissolve(W.component_labels)

In [32]:
# all overlapping buildings merged
merged=merge_overlapping(buildings,500,0.1)

100%|██████████| 1057251/1057251 [04:38<00:00, 3793.28it/s]


In [140]:
# some overlapping buildings not merged, need to be trimmed
merged=merge_overlapping2(buildings,300,0.2)

100%|██████████| 1057241/1057241 [00:04<00:00, 217284.81it/s]


In [143]:
check=momepy.CheckTessellationInput(merged)

Collapsed features  : 199
Split features      : 410
Overlapping features: 26


In [144]:
merged

Unnamed: 0,geometry
0,"POLYGON ((4588533.734 5821521.726, 4588570.786..."
1,"POLYGON ((4591889.97 5823105.866, 4591898.467 ..."
2,"POLYGON ((4592002.064 5821961.774, 4592000.684..."
3,"POLYGON ((4592077.946 5822437.286, 4592070.997..."
4,"POLYGON ((4592403.073 5822915.869, 4592383.652..."
...,...
1056580,"POLYGON ((4599145.094 5819669.998, 4599129.491..."
1056581,"POLYGON ((4576136.11 5806949.507, 4576173.534 ..."
1056582,"POLYGON ((4586770.205 5823065.392, 4586753.742..."
1056583,"POLYGON ((4597976.059 5812604.354, 4597984.496..."


## Trim Overlapping buildings

In [147]:
buildings_trimmed = geoplanar.trim_overlaps(merged)

In [152]:
check=momepy.CheckTessellationInput(buildings_trimmed)

Collapsed features  : 201
Split features      : 410
Overlapping features: 0


## Check: Validate and Geometry Types

In [151]:
buildings_trimmed['geometry'].geom_type.value_counts()

Polygon    1056589
Name: count, dtype: int64

In [146]:
merged['geometry'].geom_type.value_counts()

Polygon    1056585
Name: count, dtype: int64

In [154]:
buildings_trimmed.is_valid.value_counts()

True    1056589
Name: count, dtype: int64

### Save to parquet

In [155]:
buildings_trimmed.to_parquet('buildings_berlin_1.parquet')