# Link the morphometric cluster labels to census geometries

In [None]:
import geopandas as gpd
import pandas as pd
from shapely import box

# Version tag of the cluster outputs; it is interpolated into the file names below.
v = "v3"

# Level of the cluster hierarchy to report. The loading loop leaves the stored
# labels untouched when this is 7 and remaps them through `cluster_mapping`
# otherwise.
selected_level = 7

# Census grid: only the geometry column is kept.
grid = gpd.read_parquet(
    "/data/uscuni-restricted/geometries/nadzsj_d.parquet"
)["geometry"]

# Rectangle spanning the full extent of the grid, expressed in EPSG:3035.
# NOTE(review): this presumes the region hulls are stored in EPSG:3035 - confirm.
german_boundary = box(*grid.to_crs(epsg=3035).total_bounds)

# Ids of all regions whose hull intersects that rectangle.
regions = gpd.read_parquet("/data/uscuni-ulce/regions/cadastre_regions_hull.parquet")
overlaps_grid = regions.intersects(german_boundary)
german_regions = regions.loc[overlaps_grid].index.values

# Table used to translate stored cluster labels to other hierarchy levels.
cluster_mapping = pd.read_parquet(
    f"/data/uscuni-ulce/processed_data/clusters/cluster_mapping_{v}.pq"
)

# Label lookup for the requested hierarchy level, built once rather than on
# every loop iteration. Level 7 means "keep the stored labels unchanged", so in
# that case no lookup is built and `cluster_mapping` is not accessed.
level_lookup = (
    cluster_mapping[selected_level].to_dict() if selected_level != 7 else None
)

# One frame per region; they are concatenated after the loop.
region_frames = []

for region_id in german_regions:
    print(region_id)
    buildings = gpd.read_parquet(
        f"/data/uscuni-ulce/processed_data/clusters/clusters_{region_id}_{v}.pq",
        columns=["geometry", "final_without_noise"],
    )

    # Bring every region into the CRS of the grid.
    buildings = buildings.to_crs(grid.crs)

    # Replace the stored label with its counterpart at the selected level.
    # Labels that are absent from the lookup become NaN.
    if level_lookup is not None:
        buildings["final_without_noise"] = buildings["final_without_noise"].map(
            level_lookup
        )

    region_frames.append(buildings)

# Single frame with a fresh 0..n-1 index, so positional and label indexing agree.
all_buildings = pd.concat(region_frames, ignore_index=True)

def _most_common_label(labels):
    """Return the most frequent label in ``labels``.

    Ties resolve to the smallest label because ``Series.mode`` returns its
    result sorted. NaN is returned when ``labels`` holds no non-missing value
    (``mode`` is then empty) instead of failing on the first-element lookup.
    """
    modes = labels.mode()
    return modes.iloc[0] if not modes.empty else float("nan")


# Match every building centroid against the grid cells. `inp` holds positions
# in `all_buildings`; `res` holds the positions of the matching cells in `grid`.
inp, res = grid.sindex.query(all_buildings.centroid, predicate="intersects")

# A centroid lying on a shared cell edge matches several cells; keep only its
# first match so every building is counted exactly once. The de-duplication must
# run on the building positions (`inp`): doing it on `res` would keep a single
# building per grid cell and reduce the mode below to "first building wins".
duplicated = pd.Series(inp).duplicated().to_numpy()
inp = inp[~duplicated]
res = res[~duplicated]

# Dominant cluster label among the buildings assigned to each grid cell.
region_res = (
    all_buildings["final_without_noise"]
    .iloc[inp]
    .groupby(res)
    .agg(_most_common_label)
)
# Translate positional grid indices back to the grid's own index labels.
region_res.index = grid.index[region_res.index.values]
grid_assignment = region_res

grid_assignment.to_csv(f"/data/uscuni-restricted/geometries/cluster_assignment_{v}.csv")