In [29]:
import gc
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd
from shapely import Polygon

In [30]:
# Version tag of the clustering outputs; it is interpolated into the input
# file names read further down in this notebook.
v = 'v10'

# Buildings

In [2]:
# Same version tag as the configuration cell at the top of the notebook,
# repeated here (presumably so this section can be run on its own).
v = 'v10'


In [3]:
# Inputs for the buildings export:
# - regions: cadastre region hulls; their index supplies the region ids.
# - cluster_mapping: table used below to map final cluster labels to the
#   labels of levels 1-6.
regions = gpd.read_parquet(
    '/data/uscuni-ulce/regions/cadastre_regions_hull.parquet'
)
cluster_mapping = pd.read_parquet(
    f'/data/uscuni-ulce/processed_data/clusters/cluster_mapping_{v}.pq'
)

In [None]:
%%time

# Build one GeoDataFrame holding the labelled buildings of every region.

# The label lookups and the output column names are identical for every
# region, so build them once here instead of once per region and level.
level_lookups = {level: cluster_mapping[level].to_dict() for level in range(1, 7)}
label_columns = [f'level_{level}_label' for level in range(1, 8)]

all_clusters = []

for region_id in regions.index:

    print(region_id)

    clusters = gpd.read_parquet(f'/data/uscuni-ulce/processed_data/clusters/clusters_{region_id}_{v}.pq')
    clusters = clusters[['geometry', 'morph', 'final', 'final_without_noise']]
    # Prefix the per-region index with the region id so the region of origin
    # is encoded in the index of the combined table.
    clusters.index = str(region_id) + '_' + clusters.index.astype(str)

    # True where the final label differs from the noise-free final label.
    clusters['initially_noise'] = clusters.final != clusters.final_without_noise

    # Levels 1-6 are looked up in the mapping table; level 7 is the
    # noise-free final label itself.
    for level, lookup in level_lookups.items():
        clusters[f'level_{level}_label'] = clusters.final_without_noise.map(lookup)
    clusters['level_7_label'] = clusters.final_without_noise

    clusters = clusters[['geometry', 'morph', 'initially_noise', *label_columns]]

    # Reproject to EPSG:4326 before collecting.
    all_clusters.append(clusters.to_crs(epsg=4326))

all_clusters = pd.concat(all_clusters)

4
10
132
134
286
313
400
523
765
801
832
913
960
1124
1154
1387
1478
1515
1605
1718
1736
1782
1970
1981
2096
2322
2350
2478
2514
2625
2728
2975
3039
3109
3150
3221
3250
3526
3610
3612
3701
3705
3752
3759
3981
4070
4214
4215
4235
4284
4356
4382
4723
4805
5096
5191
5246
5310
5408
5427
5662
5671
5766
5883
6254
6529
6560
6576
6741
6749
6811
6873
6996
7068
7094
7280
7485
7528
7534
7681
7688
7712
7727
7805
7914
7937
7963
8046
8216
8238
8256
8265
8345
8374
8396
8592
8707
8731
8757
8759
8813
9016
9064
9074
9150
9169
9194
9284
9824
9924
9954
9972
10019
10086
10095
10124
10179
10222
10263
10277
10455
10510
10511
10563
10579
10602
10666
10794
10847
10908
10926
10970
11002
11019
11057
11141
11210
11256
11261
11305
11309
11311
11318
11367
11444
11455
11471
11667
11678
11735
11757
11799
11877
11905
12027
12084
12100
12115
12154
12191
12381
12440
12483
12552
12667
12707
12755
12756
12844
12919
12965
13076
13137
13172
13191
13196
13229
13301
13395
13442
13482
13506
13553
13555
13614
13616
13655
13677


In [5]:
# Force a garbage collection pass after the large concatenation above.
# `gc` is imported in the import cell at the top of the notebook; the value
# shown as the cell output is the return value of gc.collect().
gc.collect()

0

In [6]:
# Preview the first rows of the combined buildings table.
all_clusters.head()

Unnamed: 0,geometry,morph,initially_noise,level_1_label,level_2_label,level_3_label,level_4_label,level_5_label,level_6_label,level_7_label
4_0,"POLYGON ((6.84318 51.04242, 6.84322 51.04245, ...",4_0_88,False,2,4,7,13,20,48,90
4_1,"POLYGON ((6.84996 51.04759, 6.85016 51.04784, ...",4_0_56,False,2,4,7,13,20,46,85
4_2,"POLYGON ((6.8481 51.05071, 6.84823 51.05079, 6...",4_0_97,False,2,4,7,13,20,48,88
4_3,"POLYGON ((6.84973 51.04307, 6.84981 51.04323, ...",4_0_-1,True,2,4,8,17,24,55,102
4_4,"POLYGON ((6.84778 51.04658, 6.84782 51.04662, ...",4_0_56,False,2,4,7,13,20,46,85


In [7]:
%%time
# Write all buildings to a single GeoParquet file, ordered by the geometry
# column, keeping the index, with GeoArrow geometry encoding and a covering
# bbox column.
(
    all_clusters
    .sort_values('geometry')
    .to_parquet(
        '/data/uscuni-ulce/data_product/buildings.parquet',
        index=True,
        geometry_encoding='geoarrow',
        write_covering_bbox=True,
        schema_version='1.1.0',
    )
)

CPU times: user 2min 39s, sys: 24.1 s, total: 3min 3s
Wall time: 3min 3s


In [None]:
# all_clusters.sort_values('geometry').to_file("/data/uscuni-ulce/data_product/buildings.fgb", engine="pyogrio")

# Building geometries per NUTS1 region

In [31]:
# NUTS region boundaries, read directly from the GISCO distribution service
# (per the file name: 2024 edition, 60M scale, EPSG:3035).
nuts = gpd.read_file(
    'https://gisco-services.ec.europa.eu/distribution/v2/nuts/gpkg/NUTS_RG_60M_2024_3035.gpkg'
)

In [32]:
# Keep only the level-1 NUTS units of the six listed countries.
nuts_l1 = nuts.loc[
    nuts.CNTR_CODE.isin(['DE', "AT", "CZ", "SK", "PL", "LT"])
    & (nuts.LEVL_CODE == 1)
]

In [33]:
# Match every cadastre region hull with the NUTS1 units it intersects.
region_hulls = gpd.read_parquet('/data/uscuni-ulce/regions/cadastre_regions_hull.parquet')

# sindex.query returns two aligned arrays of positions: first the positions of
# the queried geometries (region hulls), then those of the indexed frame (NUTS1).
region_idxs, nuts_idxs = nuts_l1.sindex.query(region_hulls.geometry, predicate='intersects')

matched_hulls = region_hulls.geometry.iloc[region_idxs]
matched_nuts = nuts_l1.iloc[nuts_idxs]

# Pairwise (positional, align=False) overlap area of each hull / NUTS1 match.
intersections = matched_hulls.intersection(matched_nuts.geometry, align=False).area

# Columns are selected by name / via the active geometry rather than by
# position, so a different column order in the inputs cannot silently pick
# the wrong column.
# NOTE(review): assumes the first column of region_hulls is its active
# geometry column — confirm against the hull file.
intersection_df = gpd.GeoDataFrame(
    {
        'region_id': matched_hulls.index.values,
        'NUTS_ID': matched_nuts['NUTS_ID'].values,
        'intersection_area': intersections.values,
        'geometry': matched_hulls.values,
    },
    crs=region_hulls.crs,
)

# Table used by the export loops to map final cluster labels to levels 1-6.
cluster_mapping = pd.read_parquet(f'/data/uscuni-ulce/processed_data/clusters/cluster_mapping_{v}.pq')

In [34]:
# Export the labelled buildings as one GeoParquet file per NUTS1 region.
folder = "/data/uscuni-ulce/data_product/buildings"
Path(folder).mkdir(parents=True, exist_ok=True)

# The label lookups and the output column names are identical for every
# region, so build them once here instead of once per region and level.
level_lookups = {level: cluster_mapping[level].to_dict() for level in range(1, 7)}
label_columns = [f'level_{level}_label' for level in range(1, 8)]

for nuts_region, region_polygon in zip(nuts_l1.NUTS_ID, nuts_l1.geometry):

    region_data = []
    print(nuts_region)

    # Cadastre regions whose hull intersects this NUTS1 unit.
    for region in intersection_df[intersection_df.NUTS_ID == nuts_region].region_id.values:

        clusters = gpd.read_parquet(f'/data/uscuni-ulce/processed_data/clusters/clusters_{region}_{v}.pq')
        clusters = clusters[['geometry', 'unique_morph', 'final', 'final_without_noise']].rename(columns={'unique_morph': 'morphotope_id'})
        # Prefix the per-region index with the region id.
        clusters.index = str(region) + '_' + clusters.index.astype(str)

        # True where the final label differs from the noise-free final label.
        clusters['initially_noise'] = clusters.final != clusters.final_without_noise

        # Levels 1-6 are looked up in the mapping table; level 7 is the
        # noise-free final label itself.
        for level, lookup in level_lookups.items():
            clusters[f'level_{level}_label'] = clusters.final_without_noise.map(lookup)
        clusters['level_7_label'] = clusters.final_without_noise

        clusters = clusters[['geometry', 'morphotope_id', 'initially_noise', *label_columns]]

        # Keep only the rows whose centroid lies inside this NUTS1 polygon.
        # Rows whose centroid is inside no NUTS1 polygon end up in no output
        # file — NOTE(review): confirm this is intended.
        clusters = clusters.iloc[clusters.centroid.sindex.query(region_polygon, predicate='contains')]

        region_data.append(clusters)

    # pd.concat raises ValueError on an empty list; skip a NUTS1 unit without
    # any matching cadastre region instead of aborting the whole export.
    if not region_data:
        print(f'{nuts_region}: no intersecting cadastre regions, nothing written')
        continue

    # save region data
    region_data = pd.concat(region_data)
    region_data.sort_values('geometry').to_parquet(f'{folder}/{nuts_region.lower()}.parquet', index=True, write_covering_bbox=True, schema_version='1.1.0')

DEA
DEB
DEC
DED
DEE
DEF
DEG
AT1
AT2
AT3
CZ0
DE1
DE2
DE3
DE4
DE5
DE6
DE7
DE8
DE9
LT0
PL2
PL4
PL5
PL6
PL7
PL8
PL9
SK0


# Linkage

In [6]:
# Load the stored linkage matrix for this version.
# NOTE(review): the input file is named "complete_linkage" while the next cell
# saves it as "ward_linkage" — confirm which linkage method this matrix holds.
linkage_matrix = np.load(
    f'/data/uscuni-ulce/processed_data/clusters/complete_linkage_10_{v}.npy'
)


In [7]:
# Store the linkage matrix in the data product folder.
np.save(
    '/data/uscuni-ulce/data_product/ward_linkage_10.npy',
    linkage_matrix,
)

# Data

This section covers the primary characters, the morphotope labels, the morphotope variables, and the linkage of morphotopes.

Copy cluster input data

In [4]:
# !cp /data/uscuni-ulce/processed_data/clusters/cluster_input_data_v10.pq /data/uscuni-ulce/data_product/
# !mv  /data/uscuni-ulce/data_product/cluster_input_data_v10.pq /data/uscuni-ulce/data_product/cluster_input_data.parquet

In [5]:
# !cp /data/uscuni-ulce/processed_data/clusters/cdata_v10.pq /data/uscuni-ulce/data_product/
# !mv  /data/uscuni-ulce/data_product/cdata_v10.pq /data/uscuni-ulce/data_product/cluster_median_chars_200.parquet

Copy primary characters

In [37]:
# Primary characters of a single region (69333); displayed in the next cell.
primary = pd.read_parquet(
    '/data/uscuni-ulce/processed_data/chars/primary_chars_69333.parquet'
)

In [38]:
# Display the primary characters table (the rendered output is truncated).
primary

Unnamed: 0,sdbAre,sdbPer,sdbCoA,ssbCCo,ssbCor,ssbSqu,ssbERI,ssbElo,ssbCCM,ssbCCD,...,mibElo,mibERI,mibCCo,mibLAL,mibFR,mibSCo,micBAD,licBAD,misBAD,midBAD
-4242,,,,,,,,,,,...,,,,,,,208.790015,209.215673,,
-4241,,,,,,,,,,,...,,,,,,,201.959933,349.549459,,
-4240,,,,,,,,,,,...,,,,,,,191.996473,343.889482,0.000000,
-4239,,,,,,,,,,,...,,,,,,,199.266220,405.939270,,77.972974
-4238,,,,,,,,,,,...,,,,,,,56.400397,334.250353,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446822,18.808939,18.500639,0.0,0.497522,4.0,0.281908,1.000161,0.483786,3.462293,0.006698,...,0.411406,1.002327,0.448064,32.928188,4.448825,4.448825,0.229979,375.277537,262.586086,90.207646
446823,20.175570,18.761721,0.0,0.538543,4.0,0.075115,0.999742,0.553808,3.451425,0.001870,...,0.188995,0.996211,0.229472,34.127736,2.638659,2.638659,508.502435,742.332821,231.441900,274.249781
446824,13.866140,14.948105,0.0,0.625960,4.0,0.156247,0.999969,0.845145,2.651838,0.003801,...,0.852422,1.010727,0.645340,5.510775,0.988196,0.988196,163.211969,405.444622,501.412662,594.756223
446825,33.743731,29.450456,0.0,0.284469,4.0,1.529599,0.972651,0.261972,6.110411,0.157801,...,0.274174,0.980486,0.299731,12.489400,1.220809,1.220809,116.037844,98.813961,163.218683,99.352656


In [8]:
# !cp /data/uscuni-ulce/processed_data/chars/primary_chars_*.parquet /data/uscuni-ulce/data_product/data/

### Morphotope labels

In [None]:
# Cadastre region hulls; their index supplies the region ids iterated over
# when collecting the morphotope labels.
regions = gpd.read_parquet(
    '/data/uscuni-ulce/regions/cadastre_regions_hull.parquet'
)

In [10]:
# Suffix placed after the region id in the morphotope label file names read
# in the loop below.
model_params = '_post_processing_v1'
# region_id = 69333

In [19]:
%%time

# Collect the morphotope label column of every region into one Series, with
# the region id prefixed to each index value.
all_morphs = []

for region_id in regions.index:
    print(region_id)
    labels_path = f'/data/uscuni-ulce/processed_data/morphotopes/tessellation_labels_morphotopes_{region_id}{model_params}.pq'
    morphotopes = pd.read_parquet(labels_path)['morphotope_label']
    morphotopes.index = f'{region_id}_' + morphotopes.index.astype(str)
    all_morphs.append(morphotopes)

all_morphs = pd.concat(all_morphs)

4
10
132
134
286
313
400
523
765
801
832
913
960
1124
1154
1387
1478
1515
1605
1718
1736
1782
1970
1981
2096
2322
2350
2478
2514
2625
2728
2975
3039
3109
3150
3221
3250
3526
3610
3612
3701
3705
3752
3759
3981
4070
4214
4215
4235
4284
4356
4382
4723
4805
5096
5191
5246
5310
5408
5427
5662
5671
5766
5883
6254
6529
6560
6576
6741
6749
6811
6873
6996
7068
7094
7280
7485
7528
7534
7681
7688
7712
7727
7805
7914
7937
7963
8046
8216
8238
8256
8265
8345
8374
8396
8592
8707
8731
8757
8759
8813
9016
9064
9074
9150
9169
9194
9284
9824
9924
9954
9972
10019
10086
10095
10124
10179
10222
10263
10277
10455
10510
10511
10563
10579
10602
10666
10794
10847
10908
10926
10970
11002
11019
11057
11141
11210
11256
11261
11305
11309
11311
11318
11367
11444
11455
11471
11667
11678
11735
11757
11799
11877
11905
12027
12084
12100
12115
12154
12191
12381
12440
12483
12552
12667
12707
12755
12756
12844
12919
12965
13076
13137
13172
13191
13196
13229
13301
13395
13442
13482
13506
13553
13555
13614
13616
13655
13677


In [22]:
# Turn the label Series into a two-column table (former index + label) and
# store it in the data product folder.
all_morphs = (
    all_morphs
    .rename('morphotope_label')
    .rename_axis('building_index')
    .reset_index()
)
all_morphs.to_parquet('/data/uscuni-ulce/data_product/morphotope_labels.parquet')

# Morphotope geometries per NUTS1 region

In [35]:
# Export the labelled morphotope geometries as one GeoParquet file per NUTS1 region.
folder = "/data/uscuni-ulce/data_product/morphotopes"
Path(folder).mkdir(parents=True, exist_ok=True)

# The label lookups and the output column names are identical for every
# region, so build them once here instead of once per region and level.
level_lookups = {level: cluster_mapping[level].to_dict() for level in range(1, 7)}
label_columns = [f'level_{level}_label' for level in range(1, 8)]

for nuts_region, region_polygon in zip(nuts_l1.NUTS_ID, nuts_l1.geometry):

    region_data = []
    print(nuts_region)

    # Cadastre regions whose hull intersects this NUTS1 unit.
    for region in intersection_df[intersection_df.NUTS_ID == nuts_region].region_id.values:

        clusters = gpd.read_parquet(f'/data/uscuni-ulce/processed_data/morphotope_clusters/{v}/{region}_clusters.pq')
        clusters = clusters[['geometry', 'final', 'final_without_noise', 'unique_morph']].rename(columns={'unique_morph': 'morphotope_id'})
        # True where the final label differs from the noise-free final label.
        clusters['initially_noise'] = clusters.final != clusters.final_without_noise

        # Levels 1-6 are looked up in the mapping table; level 7 is the
        # noise-free final label itself.
        for level, lookup in level_lookups.items():
            clusters[f'level_{level}_label'] = clusters.final_without_noise.map(lookup)
        clusters['level_7_label'] = clusters.final_without_noise
        clusters = clusters[['geometry', 'morphotope_id', 'initially_noise', *label_columns]]

        # Keep only the rows whose centroid lies inside this NUTS1 polygon.
        # Rows whose centroid is inside no NUTS1 polygon end up in no output
        # file — NOTE(review): confirm this is intended.
        clusters = clusters.iloc[clusters.centroid.sindex.query(region_polygon, predicate='contains')]

        region_data.append(clusters)

    # pd.concat raises ValueError on an empty list; skip a NUTS1 unit without
    # any matching cadastre region instead of aborting the whole export.
    if not region_data:
        print(f'{nuts_region}: no intersecting cadastre regions, nothing written')
        continue

    # save region data
    region_data = pd.concat(region_data)
    region_data.sort_values('geometry').to_parquet(f'{folder}/{nuts_region.lower()}.parquet', index=True, write_covering_bbox=True, schema_version='1.1.0')

DEA
DEB
DEC
DED
DEE
DEF
DEG
AT1
AT2
AT3
CZ0
DE1
DE2
DE3
DE4
DE5
DE6
DE7
DE8
DE9
LT0
PL2
PL4
PL5
PL6
PL7
PL8
PL9
SK0
