# Generate data product

This notebooks takes the internal representation of the data and converts them to a public data product format.

In [None]:
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd

In [2]:
v = "v10"

# Buildings geoms per NUTS1 region

In [3]:
nuts = gpd.read_file(
    "https://gisco-services.ec.europa.eu/distribution/v2/nuts/gpkg/NUTS_RG_01M_2024_3035.gpkg"
)

In [4]:
nuts_l1 = nuts[nuts.CNTR_CODE.isin(["ES", "FR", "BE", "NL"])].query(
    "LEVL_CODE == 1"
)

In [5]:
region_hulls = gpd.read_parquet(
    "/data/uscuni-ulce/regions/region_hulls_v3.parquet"
)

region_idxs, nuts_idxs = nuts_l1.sindex.query(
    region_hulls.geometry, predicate="intersects"
)
intersections = (
    region_hulls.iloc[region_idxs]
    .intersection(nuts_l1.iloc[nuts_idxs], align=False)
    .area
)
intersection_df = gpd.GeoDataFrame(
    {
        "region_id": region_hulls.index[region_idxs].values,
        "NUTS_ID": nuts_l1.iloc[nuts_idxs, 0].values,
        "intersection_area": intersections.values,
        "geometry": region_hulls.iloc[region_idxs, 0].values,
    },
    crs=region_hulls.crs,
)
cluster_mapping = pd.read_parquet(
    f"/data/uscuni-ulce/processed_data/clusters/cluster_mapping_{v}.pq"
)

In [7]:
folder = "/data/uscuni-ulce/data_product/extension_1"
Path(folder).mkdir(parents=True, exist_ok=True)

for nuts_region, region_polygon in zip(nuts_l1.NUTS_ID, nuts_l1.geometry):
    region_data = []
    print(nuts_region)

    for region in intersection_df[
        intersection_df.NUTS_ID == nuts_region
    ].region_id.values:
        clusters = gpd.read_parquet(
            f"/data/uscuni-ulce/processed_data/clusters/clusters_{region}_{v}.pq"
        )
        clusters = clusters[
            ["geometry", "unique_morph", "final", "final_without_noise"]
        ].rename(columns={"unique_morph": "morphotope_id"})
        clusters.index = str(region) + "_" + clusters.index.astype(str)

        clusters["initially_noise"] = clusters.final != clusters.final_without_noise

        for level in range(1, 7):
            labels = clusters.final_without_noise.map(cluster_mapping[level].to_dict())
            clusters[f"level_{level}_label"] = labels.astype(np.min_scalar_type(labels))
        clusters["level_7_label"] = clusters.final_without_noise.astype("uint8")

        clusters = clusters[
            [
                "geometry",
                "morphotope_id",
                "initially_noise",
                "level_1_label",
                "level_2_label",
                "level_3_label",
                "level_4_label",
                "level_5_label",
                "level_6_label",
                "level_7_label",
            ]
        ]

        clusters = clusters.iloc[
            clusters.centroid.sindex.query(region_polygon, predicate="contains")
        ]

        region_data.append(clusters)

    # save region data
    region_data = pd.concat(region_data)
    region_data.sort_values("geometry").to_parquet(
        f"{folder}/{nuts_region.lower()}.parquet",
        index=True,
        geometry_encoding="geoarrow",
        write_covering_bbox=True,
        schema_version="1.1.0",
        compression="zstd",
        compression_level=22,
    )

BE1
BE2
BE3
FRC
FRK
FRF
FRH
FRI
FRL
FRG
FRM
FR1
FRJ
FRD
FRB
FRE
ES4
ES5
ES1
ES2
ES3
ES6
ES7
NL1
NL2
NL4
NL3
FRY


ValueError: No objects to concatenate

# Data

In [6]:
folder = "/data/uscuni-ulce/data_product/extension_1"
Path(folder).mkdir(parents=True, exist_ok=True)

for nuts_region, region_polygon in zip(nuts_l1.NUTS_ID, nuts_l1.geometry):
    if Path(f"{folder}/{nuts_region.lower()}_data.parquet").exists():
        continue

    region_data = []
    print(nuts_region)

    for region in intersection_df[
        intersection_df.NUTS_ID == nuts_region
    ].region_id.values:
        clusters = gpd.read_parquet(
            f"/data/uscuni-ulce/processed_data/clusters/clusters_{region}_{v}.pq"
        )
        primary = pd.read_parquet(
            f"/data/uscuni-ulce/processed_data/chars/primary_chars_{region}.parquet"
        )
        data = primary.loc[clusters.index].set_geometry(clusters.geometry)
        data.index = str(region) + "_" + data.index.astype(str)

        region_data.append(data)

    if len(region_data):
        # save region data
        region_data = pd.concat(region_data)
        region_data.sort_values("geometry").drop(columns="geometry").astype(
            "float32"
        ).to_parquet(
            f"{folder}/{nuts_region.lower()}_data.parquet",
            index=True,
            compression="zstd",
            compression_level=22,
        )

NL2
NL4
NL3
FRY


# Morphotope geoms per NUTS1 region

In [None]:
folder = "/data/uscuni-ulce/data_product/extension_1"
Path(folder).mkdir(parents=True, exist_ok=True)

for nuts_region, region_polygon in zip(nuts_l1.NUTS_ID, nuts_l1.geometry):
    region_data = []
    print(nuts_region)

    for region in intersection_df[
        intersection_df.NUTS_ID == nuts_region
    ].region_id.values:
        clusters = gpd.read_parquet(
            f"/data/uscuni-ulce/processed_data/morphotope_clusters/{v}/{region}_clusters.pq"
        ).reset_index()
        clusters = clusters[
            ["geometry", "final", "final_without_noise", "unique_morph"]
        ].rename(columns={"unique_morph": "morphotope_id"})
        clusters["initially_noise"] = clusters.final != clusters.final_without_noise

        for level in range(1, 7):
            clusters[f"level_{level}_label"] = clusters.final_without_noise.map(
                cluster_mapping[level].to_dict()
            )
        clusters["level_7_label"] = clusters.final_without_noise
        clusters = clusters[
            [
                "geometry",
                "morphotope_id",
                "initially_noise",
                "level_1_label",
                "level_2_label",
                "level_3_label",
                "level_4_label",
                "level_5_label",
                "level_6_label",
                "level_7_label",
            ]
        ]

        clusters = clusters.iloc[
            clusters.centroid.sindex.query(region_polygon, predicate="contains")
        ]

        region_data.append(clusters)

    if len(region_data):
        region_data = pd.concat(region_data)
        region_data.sort_values("geometry").to_parquet(
            f"{folder}/{nuts_region.lower()}_morphotopes.parquet",
            index=True,
            geometry_encoding="geoarrow",
            write_covering_bbox=True,
            schema_version="1.1.0",
            compression="zstd",
            compression_level=22,
        )

BE1
BE2
FRF
FRH
FRI
FRE
ES1
ES2
ES3
ES6
ES7
NL1
NL2
NL4
NL3
FRY


## Morphotope data

In [6]:
existing = pd.read_parquet('/data/uscuni-ulce/data_product/morphotope_data.parquet')

In [17]:
region_id = 535987
chars_dir = "/data/uscuni-ulce/processed_data/chars/"
clusters_dir = "/data/uscuni-ulce/processed_data/clusters/"

country = "fr_sp_nl_be"

region_hulls = gpd.read_parquet(
    "/data/uscuni-ulce/" + "regions/" + f"{country}_regions_hull.parquet"
)

v_ext = "v10_ext1"

In [19]:
data = []

for region_id in region_hulls.index:

    clusters = pd.read_parquet(
        f"{clusters_dir}clusters_{region_id}_{v_ext}.pq", columns=["final", "morph"]
    )

    chars = pd.read_parquet(f"{chars_dir}primary_chars_{region_id}.parquet")
    chars = chars[chars.index >= 0]

    is_noise = clusters["final"].values == -1

    morphotope_groups = chars[~is_noise].groupby(clusters[~is_noise].morph).median()
    morphotope_chars = pd.read_parquet(f"/data/uscuni-ulce/processed_data/morphotopes/morph_chars_{region_id}.pq")
    morphotope_groups["limAre"] = morphotope_chars['limAre']
    morphotope_groups["limLPS"] = morphotope_chars['limLPS']
    data.append(morphotope_groups[existing.columns])

In [20]:
morphotope_data = pd.concat(data)

In [24]:
morphotope_data.to_parquet(f"{folder}/morphotope_data_{country}.parquet")

## Licenses

- France: Licence Ouverte / Open Licence 2.0 © IGN
- Belgium: Open Database License (ODbL) OpenStreetMap Contributors
- Netherlands: CC-BY-4.0 © 3DBAG by tudelft3d and 3DGI
- Spain: LICENCIA DE ACCESO Y USO DE LOS SERVICIOS Y CONJUNTOS DE DATOS INSPIRE DE LA DIRECCIÓN GENERAL DEL CATASTRO (c) DGC