In [2]:
import geopandas as gpd
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN

In [3]:
country = "fr_sp_nl_be"
regions_v = "3"

In [4]:
regions_datadir = "/data/uscuni-ulce/"
region_hulls = gpd.read_parquet(
    regions_datadir + "regions/" + "cadastre_regions_hull.parquet"
)  # delineation of existing regions 

Cleanup

In [18]:
import glob
from pathlib import Path
import os

files = glob.glob("/data/uscuni-ulce/buildings_in_regions/*")

In [15]:
to_rm = [f for f in files if int(Path(f).stem.split("_")[1]) > region_hulls.index.max()]

In [19]:
for f in to_rm:
    os.remove(f)

Read data

In [21]:
# read the new data
gdf = pd.concat([
    gpd.read_parquet(f"/data/uscuni-ulce/extension/france/clean_3035.parquet"),
    gpd.read_parquet(f"/data/uscuni-ulce/extension/spain/clean_3035.parquet"),
    gpd.read_parquet(f"/data/uscuni-ulce/extension/netherlands/clean_3035.parquet"),
    gpd.read_parquet(f"/data/uscuni-ulce/extension/belgium/buildings.parquet"),
], ignore_index=True)

## Split the buildings into regions

Assign building centroids to a grid, then cluster the grid points based on distances. This results in a split of the buildings into contiguous regions where all the 250m neighbours of a building are within the same region.

In [22]:
cents = gdf.centroid
gdf["x"], gdf["y"] = cents.x, cents.y
gdf["id"] = gdf.index.values
data = gdf[["x", "y", "id"]].copy()

In [23]:
data[["x_100", "y_100"]] = np.around(data[["x", "y"]], decimals=-2)
grid = data[["id", "x_100", "y_100"]].groupby(["x_100", "y_100"]).count().reset_index()

In [24]:
dbscan = DBSCAN(250, n_jobs=-1).fit(grid[["x_100", "y_100"]], sample_weight=grid["id"])
grid["labels"] = dbscan.labels_

Merge the results back into the dataframe. Then split the dataset into cores and non-cores. Cores are regions with more than 10_000 buildings.

In [25]:
# %%time
data = pd.merge(data, grid, "left", on=["x_100", "y_100"])

counts = data.labels.value_counts()

index = counts[counts > 10000].index
if -1 in index:
    index = index.drop(-1)

data["core"] = data.labels.isin(index)
grid["core"] = grid.labels.isin(index)


cores = data[data.core]
grid_cores = grid[grid.core]
grid_cores = gpd.GeoDataFrame(
    grid_cores["labels"],
    geometry=gpd.points_from_xy(grid_cores["x_100"], grid_cores["y_100"]),
    crs=3035,
)
grid_cores_dissolved = grid_cores.dissolve("labels")

grid_non_cores = grid[~grid.core]
grid_non_cores = gpd.GeoDataFrame(
    grid_non_cores["labels"],
    geometry=gpd.points_from_xy(grid_non_cores["x_100"], grid_non_cores["y_100"]),
    crs=3035,
)

grid_non_cores_clustered = grid_non_cores[grid_non_cores.labels != -1]
grid_non_cores_outliers = grid_non_cores[grid_non_cores.labels == -1]

grid_non_cores_clustered_dissolved = grid_non_cores_clustered.dissolve("labels")

Assign the non-core regions to the geographically nearest core region.

In [26]:
%%time
nearest = grid_cores.sindex.nearest(
    grid_non_cores_clustered_dissolved.geometry, return_all=False
)

CPU times: user 18.1 s, sys: 36.3 ms, total: 18.1 s
Wall time: 18.1 s


In [27]:
grid_non_cores_clustered_dissolved["nearest_core"] = grid_cores.labels.values[
    nearest[1]
]

In [28]:
nearest_outliers = grid_cores.sindex.nearest(
    grid_non_cores_outliers.geometry, return_all=False
)

In [29]:
grid_non_cores_outliers["nearest_core"] = grid_cores.labels.values[nearest_outliers[1]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [30]:
grid_non_cores = pd.concat(
    [
        grid_non_cores_clustered_dissolved.reset_index().explode(ignore_index=True),
        grid_non_cores_outliers,
    ],
    ignore_index=True,
)

Update the region labels based on the assignment

In [31]:
grid_non_cores["x_100"] = grid_non_cores.geometry.x
grid_non_cores["y_100"] = grid_non_cores.geometry.y

data = pd.merge(
    data,
    grid_non_cores[["x_100", "y_100", "nearest_core"]],
    "left",
    on=["x_100", "y_100"],
)

data["region"] = data.labels
data.loc[~data.core, "region"] = data.loc[~data.core, "nearest_core"]

data = data.rename(
    columns={
        "id_x": "id",
        "id_y": "weight",
        "labels": "dbscan_cluster",
    }
)

In [32]:
additional_region_hulls = (
    pd.concat(
        [
            grid_cores,
            grid_non_cores[["nearest_core", "geometry"]].rename(
                columns={"nearest_core": "labels"}
            ),
        ]
    )
    .dissolve("labels")
    .convex_hull.to_frame("convex_hull")
)

## Adjust the new region numbers and add to the existing cadastre regions_hull.parquet

In [33]:
regions_datadir = "/data/uscuni-ulce/"
region_hulls = gpd.read_parquet(
    regions_datadir + "regions/" + "cadastre_regions_hull.parquet"  # the latest version
)

In [34]:
buffer = region_hulls.index.max() + 1000

In [35]:
additional_region_hulls.index = additional_region_hulls.index + buffer
data["region"] = data["region"] + buffer

In [36]:
additional_region_hulls.to_parquet(
    regions_datadir + "regions/" + f"{country}_regions_hull.parquet"
)

In [37]:
region_hulls = pd.concat((region_hulls, additional_region_hulls))

In [38]:
region_hulls.to_parquet(
    regions_datadir + "regions/" + f"region_hulls_v{regions_v}.parquet"
)

Show the largest regions.

In [42]:
data["region"].value_counts().sort_values().tail(20)

region
478375     425616
235505     441988
259195     443551
328399     446078
505018     447752
235968     464181
245980     476222
245970     488338
504227     523888
370160     540892
450071     571987
523438     574674
478146     646453
487636     677986
344277     819538
470467     825754
489180     840671
485131    2493161
403958    2517665
436461    5808856
Name: count, dtype: int64

## Assign buildings to regions

In [40]:
region_mapping = data

In [41]:
%%time
for region_id, group in region_mapping.groupby("region"):
    region_id = int(region_id)

    buildings = gdf.iloc[group.id].copy()
    buildings["iid"] = buildings.index.values
    buildings.to_parquet(
        f"/data/uscuni-ulce/buildings_in_regions/buildings_{region_id}.pq"
    )

CPU times: user 1min 31s, sys: 13.8 s, total: 1min 45s
Wall time: 2min 32s


## Check existing region ids

In [28]:
region_name = 525377
regions_datadir = "/data/uscuni-ulce/"
region_hulls = gpd.read_parquet(
    regions_datadir + "regions/" + f"region_hulls_v{regions_v}.parquet"
)
region_id, region_hull = (
    region_hulls.loc[region_name].name,
    region_hulls.loc[region_name].convex_hull,
)

In [29]:
gpd.GeoSeries([region_hull], name="geometry", crs="epsg:3035").explore()