# Generate data product

This notebook takes the internal representation of the data and converts it to a public data product format.

In [1]:
import json
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd

In [2]:
# Version tag of the clustering results; it is interpolated into the names of the
# cluster, cluster-mapping and linkage files read further down.
v = "v10"

# Buildings geoms per NUTS1 region

In [4]:
# Fetch the GISCO NUTS 2024 boundaries and keep only the level-1 units of the
# six listed countries.
nuts = gpd.read_file(
    "https://gisco-services.ec.europa.eu/distribution/v2/nuts/gpkg/NUTS_RG_01M_2024_3035.gpkg"
)
in_selected_country = nuts["CNTR_CODE"].isin(["DE", "AT", "CZ", "SK", "PL", "LT"])
is_level_one = nuts["LEVL_CODE"] == 1
nuts_l1 = nuts[in_selected_country & is_level_one]

In [5]:
# Hulls of the internal processing regions; the index holds the region ids.
region_hulls = gpd.read_parquet(
    "/data/uscuni-ulce/regions/cadastre_regions_hull.parquet"
)

# Pair every region hull with each NUTS1 unit it intersects. With an array input,
# `sindex.query` returns the positions of the input geometries (hulls) first and
# the positions of the indexed geometries (NUTS units) second.
region_idxs, nuts_idxs = nuts_l1.sindex.query(
    region_hulls.geometry, predicate="intersects"
)
# Row-wise (positional, not index-aligned) overlap area of each matched pair.
# NOTE(review): assumes both layers share a CRS — confirm `region_hulls.crs`
# matches the NUTS layer.
intersections = (
    region_hulls.iloc[region_idxs]
    .intersection(nuts_l1.iloc[nuts_idxs], align=False)
    .area
)
# Lookup table: one row per (processing region, NUTS1 unit) pair.
# Columns are addressed by name rather than by position so that a changed column
# order in either input cannot silently put the wrong values into the table.
intersection_df = gpd.GeoDataFrame(
    {
        "region_id": region_hulls.index[region_idxs].values,
        "NUTS_ID": nuts_l1["NUTS_ID"].iloc[nuts_idxs].values,
        "intersection_area": intersections.values,
        "geometry": region_hulls.geometry.iloc[region_idxs].values,
    },
    crs=region_hulls.crs,
)
# Table whose integer-named columns (1..6) are used below to translate the
# finest cluster label into the label of each coarser hierarchy level.
cluster_mapping = pd.read_parquet(
    f"/data/uscuni-ulce/processed_data/clusters/cluster_mapping_{v}.pq"
)

In [5]:
folder = "/data/uscuni-ulce/data_product"
Path(folder).mkdir(parents=True, exist_ok=True)

# Write one building-level GeoParquet file per NUTS1 unit. Each file combines the
# cluster files of every processing region whose hull intersects the unit and
# keeps only the buildings whose centroid lies inside the unit's polygon.
for nuts_region, region_polygon in zip(nuts_l1.NUTS_ID, nuts_l1.geometry):
    region_data = []
    print(nuts_region)

    for region in intersection_df[
        intersection_df.NUTS_ID == nuts_region
    ].region_id.values:
        clusters = gpd.read_parquet(
            f"/data/uscuni-ulce/processed_data/clusters/clusters_{region}_{v}.pq"
        )
        clusters = clusters[
            ["geometry", "unique_morph", "final", "final_without_noise"]
        ].rename(columns={"unique_morph": "morphotope_id"})
        # prefix the index with the region id (presumably to keep ids unique once
        # several regions are concatenated)
        clusters.index = str(region) + "_" + clusters.index.astype(str)

        # True where the label differs between `final` and `final_without_noise`
        clusters["initially_noise"] = clusters.final != clusters.final_without_noise

        for level in range(1, 7):
            labels = clusters.final_without_noise.map(cluster_mapping[level].to_dict())
            # Store the labels in the smallest unsigned integer dtype that fits.
            # `np.min_scalar_type` only shrinks scalars and returns an array's
            # dtype unchanged, so it cannot be used on the whole column.
            # `pd.to_numeric` leaves the column untouched if it holds NaN
            # (unmapped labels) or negative values.
            clusters[f"level_{level}_label"] = pd.to_numeric(
                labels, downcast="unsigned"
            )
        clusters["level_7_label"] = clusters.final_without_noise.astype("uint8")

        clusters = clusters[
            [
                "geometry",
                "morphotope_id",
                "initially_noise",
                "level_1_label",
                "level_2_label",
                "level_3_label",
                "level_4_label",
                "level_5_label",
                "level_6_label",
                "level_7_label",
            ]
        ]

        # keep the buildings whose centroid is contained in the NUTS1 polygon
        clusters = clusters.iloc[
            clusters.centroid.sindex.query(region_polygon, predicate="contains")
        ]

        region_data.append(clusters)

    # pd.concat raises on an empty list; skip units without any matching region
    if not region_data:
        print(f"{nuts_region}: no intersecting processing regions, nothing written")
        continue

    # save region data
    region_data = pd.concat(region_data)
    region_data.sort_values("geometry").to_parquet(
        f"{folder}/{nuts_region.lower()}.parquet",
        index=True,
        geometry_encoding="geoarrow",
        write_covering_bbox=True,
        schema_version="1.1.0",
        compression="zstd",
        compression_level=22,
    )

DEF
DE2
DE3
DEG
DE4
DE5
DE6
DEA
DE7
DEB
CZ0
DEC
DE8
DED
DE1
DE9
DEE
AT2
AT1
AT3
LT0
SK0
PL8
PL9
PL4
PL5
PL6
PL7
PL2


# Linkage

In [6]:
# Read the linkage matrix that belongs to clustering version `v`.
linkage_source = (
    Path("/data/uscuni-ulce/processed_data/clusters") / f"complete_linkage_10_{v}.npy"
)
linkage_matrix = np.load(linkage_source)

In [7]:
# Copy the linkage matrix into the data product folder.
# NOTE(review): the matrix is loaded from a file named `complete_linkage_10_*`
# but published as `ward_linkage_10.npy` — confirm which linkage method it is.
linkage_target = Path("/data/uscuni-ulce/data_product") / "ward_linkage_10.npy"
np.save(linkage_target, linkage_matrix)

# Data

In [6]:
folder = "/data/uscuni-ulce/data_product"
Path(folder).mkdir(parents=True, exist_ok=True)

# Write one attribute table (`<nuts>_data.parquet`) per NUTS1 unit holding the
# primary characters of the buildings, stored as float32 without geometry.
for nuts_region, region_polygon in zip(nuts_l1.NUTS_ID, nuts_l1.geometry):
    # units whose file already exists are skipped, so an interrupted run can resume
    if Path(f"{folder}/{nuts_region.lower()}_data.parquet").exists():
        continue

    region_data = []
    print(nuts_region)

    for region in intersection_df[
        intersection_df.NUTS_ID == nuts_region
    ].region_id.values:
        clusters = gpd.read_parquet(
            f"/data/uscuni-ulce/processed_data/clusters/clusters_{region}_{v}.pq"
        )
        primary = pd.read_parquet(
            f"/data/uscuni-ulce/processed_data/chars/primary_chars_{region}.parquet"
        )

        # Restrict to the buildings whose centroid lies inside the NUTS1 polygon,
        # exactly as the building geometry export does. `region_polygon` was
        # unpacked but never used, so every building of every intersecting
        # processing region ended up in the file, including buildings that
        # belong to neighbouring NUTS1 units.
        clusters = clusters.iloc[
            clusters.centroid.sindex.query(region_polygon, predicate="contains")
        ]

        # attach the geometry only so the rows can be sorted by it before saving
        data = primary.loc[clusters.index].set_geometry(clusters.geometry)
        data.index = str(region) + "_" + data.index.astype(str)

        region_data.append(data)

    # pd.concat raises on an empty list; skip units without any matching region
    if not region_data:
        print(f"{nuts_region}: no intersecting processing regions, nothing written")
        continue

    # save region data
    region_data = pd.concat(region_data)
    region_data.sort_values("geometry").drop(columns="geometry").astype(
        "float32"
    ).to_parquet(
        f"{folder}/{nuts_region.lower()}_data.parquet",
        index=True,
        compression="zstd",
        compression_level=22,
    )

DEA
DE7
DEB
CZ0
DEC
DE8
DED
DE1
DE9
DEE
AT2
AT1
AT3
LT0
SK0
PL8
PL9
PL4
PL5
PL6
PL7
PL2


### Morphotope labels

In [None]:
# Region hulls; only the index (region ids) is used below, to locate the
# per-region morphotope label files. This is the same file that is loaded as
# `region_hulls` near the top of the notebook.
regions = gpd.read_parquet("/data/uscuni-ulce/regions/cadastre_regions_hull.parquet")

In [10]:
# Suffix appended to the file name of each region's morphotope label file.
model_params = "_post_processing_v1"
# region_id = 69333  # NOTE(review): looks like a leftover single-region debug override — confirm and remove

In [19]:
%%time

all_morphs = []

for region_id in regions.index:
    print(region_id)
    morphotopes = pd.read_parquet(
        f"/data/uscuni-ulce/processed_data/morphotopes/tessellation_labels_morphotopes_{region_id}{model_params}.pq"
    ).morphotope_label
    morphotopes.index = str(region_id) + "_" + morphotopes.index.astype(str)
    all_morphs.append(morphotopes)

all_morphs = pd.concat(all_morphs)

4
10
132
134
286
313
400
523
765
801
832
913
960
1124
1154
1387
1478
1515
1605
1718
1736
1782
1970
1981
2096
2322
2350
2478
2514
2625
2728
2975
3039
3109
3150
3221
3250
3526
3610
3612
3701
3705
3752
3759
3981
4070
4214
4215
4235
4284
4356
4382
4723
4805
5096
5191
5246
5310
5408
5427
5662
5671
5766
5883
6254
6529
6560
6576
6741
6749
6811
6873
6996
7068
7094
7280
7485
7528
7534
7681
7688
7712
7727
7805
7914
7937
7963
8046
8216
8238
8256
8265
8345
8374
8396
8592
8707
8731
8757
8759
8813
9016
9064
9074
9150
9169
9194
9284
9824
9924
9954
9972
10019
10086
10095
10124
10179
10222
10263
10277
10455
10510
10511
10563
10579
10602
10666
10794
10847
10908
10926
10970
11002
11019
11057
11141
11210
11256
11261
11305
11309
11311
11318
11367
11444
11455
11471
11667
11678
11735
11757
11799
11877
11905
12027
12084
12100
12115
12154
12191
12381
12440
12483
12552
12667
12707
12755
12756
12844
12919
12965
13076
13137
13172
13191
13196
13229
13301
13395
13442
13482
13506
13553
13555
13614
13616
13655
13677


In [22]:
# Turn the label Series into a two-column table (former index + label) and save it.
# Note: this rebinds `all_morphs` from a Series to a DataFrame, so the cell
# cannot be re-run without re-running the cell that builds the Series.
all_morphs = (
    all_morphs.to_frame()
    .reset_index()
    .set_axis(["building_index", "morphotope_label"], axis=1)
)
all_morphs.to_parquet("/data/uscuni-ulce/data_product/morphotope_labels.parquet")

# Morphotope geoms per NUTS1 region

In [7]:
folder = "/data/uscuni-ulce/data_product"
Path(folder).mkdir(parents=True, exist_ok=True)

# Write one morphotope-level GeoParquet file per NUTS1 unit. Each file combines
# the morphotope cluster files of every processing region whose hull intersects
# the unit and keeps only the morphotopes whose centroid lies inside the unit's
# polygon.
for nuts_region, region_polygon in zip(nuts_l1.NUTS_ID, nuts_l1.geometry):
    region_data = []
    print(nuts_region)

    for region in intersection_df[
        intersection_df.NUTS_ID == nuts_region
    ].region_id.values:
        clusters = gpd.read_parquet(
            f"/data/uscuni-ulce/processed_data/morphotope_clusters/{v}/{region}_clusters.pq"
        )
        clusters = clusters[
            ["geometry", "final", "final_without_noise", "unique_morph"]
        ].rename(columns={"unique_morph": "morphotope_id"})
        # True where the label differs between `final` and `final_without_noise`
        clusters["initially_noise"] = clusters.final != clusters.final_without_noise

        # Store every label column in the smallest unsigned integer dtype that
        # fits, so the morphotope files use the same compact label dtypes as the
        # building files. `pd.to_numeric` leaves a column untouched if it holds
        # NaN (unmapped labels) or negative values.
        for level in range(1, 7):
            labels = clusters.final_without_noise.map(cluster_mapping[level].to_dict())
            clusters[f"level_{level}_label"] = pd.to_numeric(
                labels, downcast="unsigned"
            )
        clusters["level_7_label"] = pd.to_numeric(
            clusters.final_without_noise, downcast="unsigned"
        )
        clusters = clusters[
            [
                "geometry",
                "morphotope_id",
                "initially_noise",
                "level_1_label",
                "level_2_label",
                "level_3_label",
                "level_4_label",
                "level_5_label",
                "level_6_label",
                "level_7_label",
            ]
        ]

        # keep the morphotopes whose centroid is contained in the NUTS1 polygon
        clusters = clusters.iloc[
            clusters.centroid.sindex.query(region_polygon, predicate="contains")
        ]

        region_data.append(clusters)

    # pd.concat raises on an empty list; skip units without any matching region
    if not region_data:
        print(f"{nuts_region}: no intersecting processing regions, nothing written")
        continue

    region_data = pd.concat(region_data)
    region_data.sort_values("geometry").to_parquet(
        f"{folder}/{nuts_region.lower()}_morphotopes.parquet",
        index=True,
        geometry_encoding="geoarrow",
        write_covering_bbox=True,
        schema_version="1.1.0",
        compression="zstd",
        compression_level=22,
    )

DEF
DE2
DE3
DEG
DE4
DE5
DE6
DEA
DE7
DEB
CZ0
DEC
DE8
DED
DE1
DE9
DEE
AT2
AT1
AT3
LT0
SK0
PL8
PL9
PL4
PL5
PL6
PL7
PL2


## Licenses

- Brandenburg: "Data license Germany - Attribution - Version 2.0" (https://www.govdata.de/dl-de/by-2-0). Attribution: "GeoBasis-DE/LGB"
- Saxony: "Data license Germany - Attribution - Version 2.0" (https://www.govdata.de/dl-de/by-2-0). Attribution: State Office for Geobasis Information Saxony (GeoSN)
- Baden-Württemberg: Data source: LGL, www.lgl-bw.de, dl-de/by-2-0
- Mecklenburg-Vorpommern: GeoBasis-DE/MV/CC BY 4.0
- Rheinland-Pfalz: "Data license Germany - Attribution - Version 2.0" (https://www.govdata.de/dl-de/by-2-0). Attribution: "GeoBasis-DE/LGB"
- Saarland: Datenlizenz Deutschland Namensnennung 2.0 Quellenvermerk: © GeoBasis DE/LVGL-SL (2024)
- Nordrhein-Westfalen: Datenlizenz Deutschland Namensnennung 2.0. Geoinformationszentrum, Information und Technik Nordrhein-Westfalen
- Niedersachsen: CC-BY 4.0 © GeoBasis-DE/LGLN
- Hessen: Datenlizenz Deutschland – Zero – Version 2.0
- Thüringen: Datenlizenz Deutschland – Namensnennung – Version 2.0. © GDI-Th
- Saxony-Anhalt: Datenlizenz Deutschland – Namensnennung – Version 2.0 GeoBasis-DE / LVermGeo ST
- Schleswig-Holstein: © GeoBasis-DE/LVermGeo SH/CC BY-SA 4.0
- Bavaria: CC BY 4.0 Bayerischen Vermessungsverwaltung
- Berlin: Datenlizenz Deutschland - Zero - Version 2.0
- Hamburg: Datenlizenz Deutschland – Namensnennung – Version 2.0. © LGV Hamburg
- Bremen: CC-BY-4.0  Landesamt GeoInformation Bremen
- Poland: CC-BY-4.0  Główny Urząd Geodezji i Kartografii
- Czechia: CC-BY-4.0 ČÚZK
- Slovakia: CC-BY 4.0 Úrad geodézie, kartografie a katastra SR
- Austria: CC-BY 4.0 BEV
- Lithuania: CC-BY 4.0 Geoportal.lt

## Naming and portraits

In [None]:
# Names of the labels at the three coarsest hierarchy levels. Each list is
# ordered by label number, starting at 1, so position i holds the name of label i + 1.
_level_names = {
    1: [
        "Incoherent Fabric",
        "Coherent Fabric",
    ],
    2: [
        "Incoherent Large-Scale Fabric",
        "Incoherent Small-Scale Fabric",
        "Coherent Interconnected Fabric",
        "Coherent Dense Fabric",
    ],
    3: [
        "Incoherent Large-Scale Homogeneous Fabric",
        "Incoherent Large-Scale Heterogeneous Fabric",
        "Incoherent Small-Scale Linear Fabric",
        "Incoherent Small-Scale Sparse Fabric",
        "Incoherent Small-Scale Compact Fabric",
        "Coherent Interconnected Fabric",
        "Coherent Dense Disjoint Fabric",
        "Coherent Dense Adjacent Fabric",
    ],
}

# level -> {label number -> name}
mapping = {
    level: dict(enumerate(names, start=1)) for level, names in _level_names.items()
}

In [3]:
# Serialise the label names. JSON object keys are always strings, so the integer
# level and label keys of `mapping` are written as "1", "2", ...
label_name_path = Path("/data/uscuni-ulce/data_product/label_name.json")
with label_name_path.open("w") as handle:
    handle.write(json.dumps(mapping, indent=4))

In [None]:
# Pen portraits: a short prose description of every named label, keyed by
# hierarchy level (1-3) and then by label name.
# Two wording slips in the emitted text were corrected: a duplicated "less" in
# the "Incoherent Fabric" portrait and "an utilitarian" in the
# "Incoherent Large-Scale Fabric" portrait.
pens = {
    1: {
        "Incoherent Fabric": "Incoherent fabric covers a wide morphological variety, with a common theme of partial or complete breakage of the traditional structural roles of streets, plots, and buildings. Common for modernist period, post-modern, and industrial developments, this branch has typically less connected street networks and may showcase buildings facing open spaces and internal parts of blocks rather than streets. At the same time, it contains less defined village developments.",
        "Coherent Fabric": "In coherent fabric, all streets, plots, and buildings take their traditional structural roles in defining the spatial arrangement of the urban form. It is common for traditional European development with densely connected street networks and legible plot structure, facilitating direct relation between buildings and streets.",
    },
    2: {
        "Incoherent Large-Scale Fabric": "Incoherent large-scale fabric captures typically urban development composed of buildings larger than the average, that may or may not be far from each other, creating large open spaces. Streets tend to be of a utilitarian use, rather than a structural one, typical for modernist housing estates or industrial zones.",
        "Incoherent Small-Scale Fabric": "Incoherent small-scale fabric is mostly non-urban development capturing various kinds of villages and small towns, which show high variation of morphological properties. Buildings tend to be smaller, but distances between them vary, as well as the relations between buildings and streets.",
        "Coherent Interconnected Fabric": "Coherent interconnected fabric is typical for historical city and town centres, where buildings form intensive development. In this branch, the built-up density and local street connectivity are high, while inter-building distances remain relatively small. Buildings frequently share walls, forming larger structures with courtyards along relatively short and narrow streets.",
        "Coherent Dense Fabric": "Coherent dense fabric captures morphology typical for urban residential areas with lower density, where blocks are defined by streets more than buildings. The street networks are well defined and connected with buildings being either adjacent (e.g. row houses) or disjoint (e.g. urban villas).",
    },
    3: {
        "Incoherent Large-Scale Homogeneous Fabric": "Incoherent large-scale homogeneous fabric consists of the large buildings with moderate variations in size and shape, as well as low to moderate street connectivity and wide streets. The resulting environment is spacious, with significant open areas between structures, typical of modernist housing, with areas showing a relatively high degree of homogeneity caused by underlying planning principles.",
        "Incoherent Large-Scale Heterogeneous Fabric": "Incoherent large-scale heterogeneous fabric consists of the largest buildings with notable variations in size and shape, as well as low to moderate street connectivity and wide streets. The design does not emphasise sunlight exposure, creating broad but less refined configurations, typical of industrial and other service areas.",
        "Incoherent Small-Scale Linear Fabric": "Incoherent small-scale linear fabric has a moderate built-up area and low local street connectivity, typically forming long linear villages. Its streets are long, linear, wide, and there are minimal shared walls between structures.",
        "Incoherent Small-Scale Sparse Fabric": "Incoherent small-scale sparse fabric is characterised by low built-up density, low street connectivity, large distances between buildings, few shared walls, and large open spaces around buildings. The streets are few, open, and wide. The buildings are small to moderate in size, and their layout is more typical of rural areas.",
        "Incoherent Small-Scale Compact Fabric": "Incoherent small-scale compact fabric has low to moderate built-up area and street connectivity. Buildings exhibit a consistent alignment among themselves and also along streets of varying length, width, and linearity. There is also a significant number of shared walls between structures, typical for more traditional villages.",
        "Coherent Interconnected Fabric": "Coherent interconnected fabric is typical for historical city and town centres, where buildings form intensive development. In this branch, the built-up density and local street connectivity are high, while inter-building distances remain relatively small. Buildings frequently share walls, forming larger structures with courtyards along relatively short and narrow streets.",
        "Coherent Dense Disjoint Fabric": "The coherent dense disjoint fabric has moderate to high built-up density and local street connectivity, with longer and wider streets compared to other dense developments. Shared walls between buildings are less common, and distances within buildings are moderate, reflecting a pattern of standalone structures within a robust street network.",
        "Coherent Dense Adjacent Fabric": "In coherent dense adjacent fabric, the built-up density and local street connectivity are high, while inter-building distances remain relatively small. Buildings frequently share walls, forming larger structures along relatively short and narrow streets.",
    },
}

In [9]:
# Serialise the pen portraits. JSON object keys are always strings, so the
# integer level keys of `pens` are written as "1", "2", "3".
pen_portraits_path = Path("/data/uscuni-ulce/data_product/pen_portraits.json")
with pen_portraits_path.open("w") as handle:
    handle.write(json.dumps(pens, indent=4))