In [7]:
import dask.dataframe as dd
import geopandas as gpd
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN

In [32]:
country = "belgium"
regions_v = "2"

In [8]:
# read the new data
gdf = gpd.read_parquet(f'/data/uscuni-ulce/extension/{country}/buildings.parquet')

## Split the buildings into regions

Assign building centroids to a grid, then cluster the grid points based on distances. This results in a split of the buildings into contiguous regions where all the 400m neighbours of a building are within the same region.

In [16]:
cents = gdf.centroid
gdf['x'], gdf['y'] = cents.x, cents.y
gdf['id'] = gdf.index.values
data = gdf[["x", "y", 'id']]

In [17]:
data[["x_100", "y_100"]] = np.around(data[["x", "y"]], decimals=-2)
grid = data[["id", "x_100", "y_100"]].groupby(["x_100", "y_100"]).count().reset_index()

In [18]:
dbscan = DBSCAN(400, n_jobs=-1).fit(grid[["x_100", "y_100"]], sample_weight=grid["id"])
grid["labels"] = dbscan.labels_

Merge the results back into the dataframe. Then split the dataset into cores and non-cores. Cores are regions with more than 10_000 buildings.

In [19]:
# %%time
data = pd.merge(data, grid, "left", on=["x_100", "y_100"])

counts = data.labels.value_counts()

index = counts[counts > 10000].index
if -1 in index:
    index = index.drop(-1)
    
data["core"] = data.labels.isin(index)
grid["core"] = grid.labels.isin(index)


cores = data[data.core]
grid_cores = grid[grid.core]
grid_cores = gpd.GeoDataFrame(
    grid_cores["labels"],
    geometry=gpd.points_from_xy(grid_cores["x_100"], grid_cores["y_100"]),
    crs=3035,
)
grid_cores_dissolved = grid_cores.dissolve("labels")

grid_non_cores = grid[~grid.core]
grid_non_cores = gpd.GeoDataFrame(
    grid_non_cores["labels"],
    geometry=gpd.points_from_xy(grid_non_cores["x_100"], grid_non_cores["y_100"]),
    crs=3035,
)

grid_non_cores_clustered = grid_non_cores[grid_non_cores.labels != -1]
grid_non_cores_outliers = grid_non_cores[grid_non_cores.labels == -1]

grid_non_cores_clustered_dissolved = grid_non_cores_clustered.dissolve("labels")

Assign the non-core regions to the geographically nearest core region.

In [20]:
%%time
nearest = grid_cores.sindex.nearest(
    grid_non_cores_clustered_dissolved.geometry, return_all=False
)

CPU times: user 257 ms, sys: 995 μs, total: 258 ms
Wall time: 257 ms


In [21]:
grid_non_cores_clustered_dissolved["nearest_core"] = grid_cores.labels.values[
    nearest[1]
]

In [22]:
nearest_outliers = grid_cores.sindex.nearest(
    grid_non_cores_outliers.geometry, return_all=False
)

In [23]:
grid_non_cores_outliers["nearest_core"] = grid_cores.labels.values[nearest_outliers[1]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [24]:
grid_non_cores = pd.concat(
    [
        grid_non_cores_clustered_dissolved.reset_index().explode(ignore_index=True),
        grid_non_cores_outliers,
    ],
    ignore_index=True,
)

Update the region labels based on the assignment

In [25]:
grid_non_cores["x_100"] = grid_non_cores.geometry.x
grid_non_cores["y_100"] = grid_non_cores.geometry.y

data = pd.merge(
    data,
    grid_non_cores[["x_100", "y_100", "nearest_core"]],
    "left",
    on=["x_100", "y_100"],
)

data["region"] = data.labels
data.loc[~data.core, "region"] = data.loc[~data.core, "nearest_core"]

data = data.rename(
    columns={
        "id_x": "id",
        "id_y": "weight",
        "labels": "dbscan_cluster",
    }
)

In [26]:
additional_region_hulls = pd.concat(
    [
        grid_cores,
        grid_non_cores[["nearest_core", "geometry"]].rename(
            columns={"nearest_core": "labels"}
        ),
    ]
).dissolve("labels").convex_hull.to_frame("convex_hull")

In [27]:
data

Unnamed: 0,x,y,id,x_100,y_100,weight,dbscan_cluster,core,nearest_core,region
0,3.947173e+06,3.096840e+06,0,3947200.0,3096800.0,3,1,True,,1
1,3.883326e+06,3.124409e+06,1,3883300.0,3124400.0,1,1,True,,1
2,3.926935e+06,3.091201e+06,2,3926900.0,3091200.0,3,1,True,,1
3,3.926531e+06,3.096822e+06,3,3926500.0,3096800.0,1,1,True,,1
4,3.928839e+06,3.099783e+06,4,3928800.0,3099800.0,1,1,True,,1
...,...,...,...,...,...,...,...,...,...,...
7003881,3.924990e+06,3.126907e+06,7003881,3925000.0,3126900.0,16,1,True,,1
7003882,3.924993e+06,3.126915e+06,7003882,3925000.0,3126900.0,16,1,True,,1
7003883,3.924996e+06,3.126920e+06,7003883,3925000.0,3126900.0,16,1,True,,1
7003884,3.924999e+06,3.126927e+06,7003884,3925000.0,3126900.0,16,1,True,,1


## Adjust the new region numbers and add to the existing cadastre regions_hull.parquet

In [28]:
regions_datadir = '/data/uscuni-ulce/'
region_hulls = gpd.read_parquet(
        regions_datadir + "regions/" + "cadastre_regions_hull.parquet"
    )

In [29]:
region_hulls

Unnamed: 0_level_0,convex_hull
labels,Unnamed: 1_level_1
4,"POLYGON ((4122200 3045600, 4121500 3045900, 40..."
10,"POLYGON ((4068900 3057300, 4051000 3064300, 40..."
132,"POLYGON ((4053100 3049800, 4050900 3053000, 40..."
134,"POLYGON ((4052500 3093600, 4052200 3093700, 40..."
286,"POLYGON ((4060600 3103900, 4057000 3104200, 40..."
...,...
149997,"POLYGON ((5253200 3642300, 5245600 3647000, 52..."
150044,"POLYGON ((5265700 3578300, 5263400 3578400, 52..."
151676,"POLYGON ((5285400 3549200, 5284800 3549900, 52..."
152081,"POLYGON ((5269700 3664900, 5269300 3665300, 52..."


In [30]:
buffer = region_hulls.index.max() + 1000

In [31]:
additional_region_hulls.index = additional_region_hulls.index + buffer
data['region'] = data['region'] + buffer

In [33]:
additional_region_hulls.to_parquet(regions_datadir + "regions/" + f"{country}_regions_hull.parquet")

In [34]:
region_hulls = pd.concat((region_hulls, additional_region_hulls))

In [35]:
region_hulls.to_parquet(regions_datadir + "regions/" + f"region_hulls_v{regions_v}.parquet")

In [36]:
additional_region_hulls.index

Index([153512, 153650, 153752, 153804, 154059, 154947, 155610, 155668, 155971], dtype='int64', name='labels')

In [37]:
data['region'].value_counts().index.sort_values()

Index([153512, 153650, 153752, 153804, 154059, 154947, 155610, 155668, 155971], dtype='int64', name='region')

## Assign buildings to regions

In [38]:
region_mapping = data

In [40]:
%%time
for region_id, group in region_mapping.groupby('region'):

    region_id = int(region_id)
    
    buildings = gdf.iloc[group.id].copy()
    buildings['iid'] = buildings.index.values
    buildings.to_parquet(f'/data/uscuni-ulce/regions/buildings/buildings_{region_id}.pq')

CPU times: user 7.14 s, sys: 1.65 s, total: 8.79 s
Wall time: 8.91 s


## Check existing region ids

In [45]:
region_name = 153650
regions_datadir = "/data/uscuni-ulce/"
region_hulls = gpd.read_parquet(
        regions_datadir + "regions/" + f"region_hulls_v{regions_v}.parquet"
    )
region_id, region_hull = region_hulls.loc[region_name].name, region_hulls.loc[region_name].convex_hull

In [46]:
gpd.GeoSeries([region_hull], name='geometry', crs='epsg:3035').explore()