# Cluster characteristics

Generate summary characteristics of clusters at a selected resolution.

In [None]:
import geopandas as gpd
import numpy as np
import pandas as pd
import rioxarray
import dataframe_image as dfi
from core.utils import get_cluster_names

from matplotlib import rcParams

# Use the Lato sans-serif face for every matplotlib figure in this notebook.
rcParams.update({"font.family": "sans-serif", "font.sans-serif": ["Lato"]})

# Version tag embedded in the names of the processed cluster files read below.
v = "v10"

regions_path = "/data/uscuni-ulce/regions/cadastre_regions_hull.parquet"
regions = gpd.read_parquet(regions_path)

In [None]:
# Mapping table for version ``v``; column 3 is used later to remap the labels.
cluster_mapping_path = (
    f"/data/uscuni-ulce/processed_data/clusters/cluster_mapping_{v}.pq"
)
cluster_mapping = pd.read_parquet(cluster_mapping_path)

In [None]:
# Load, for every region, the primary characters of the clustered rows together
# with their cluster label, then stack all regions into a single frame.

results = []
# Only the region id is needed, so iterate over the index instead of calling
# ``iterrows()``, which builds a full (and here unused) row Series per region.
for region_id in regions.index:
    print(region_id)

    # Cluster labels of this region; only the noise-free assignment is read.
    # They are kept as stored here and remapped to the selected cut later,
    # once all regions have been concatenated.
    region_clusters = pd.read_parquet(
        f"/data/uscuni-ulce/processed_data/clusters/clusters_{region_id}_{v}.pq",
        columns=["final_without_noise"],
    )["final_without_noise"]

    region_data = pd.read_parquet(
        f"/data/uscuni-ulce/processed_data/chars/primary_chars_{region_id}.parquet"
    )
    # Restrict the characters to the rows that received a cluster label.
    region_data = region_data.loc[region_clusters.index]

    region_data["label"] = region_clusters

    results.append(region_data)

# The per-region indices are discarded; rows are renumbered 0..n-1.
all_data = pd.concat(results, ignore_index=True)

4
10
132
134
286
313
400
523
765
801
832
913
960
1124
1154
1387
1478
1515
1605
1718
1736
1782
1970
1981
2096
2322
2350
2478
2514
2625
2728
2975
3039
3109
3150
3221
3250
3526
3610
3612
3701
3705
3752
3759
3981
4070
4214
4215
4235
4284
4356
4382
4723
4805
5096
5191
5246
5310
5408
5427
5662
5671
5766
5883
6254
6529
6560
6576
6741
6749
6811
6873
6996
7068
7094
7280
7485
7528
7534
7681
7688
7712
7727
7805
7914
7937
7963
8046
8216
8238
8256
8265
8345
8374
8396
8592
8707
8731
8757
8759
8813
9016
9064
9074
9150
9169
9194
9284
9824
9924
9954
9972
10019
10086
10095
10124
10179
10222
10263
10277
10455
10510
10511
10563
10579
10602
10666
10794
10847
10908
10926
10970
11002
11019
11057
11141
11210
11256
11261
11305
11309
11311
11318
11367
11444
11455
11471
11667
11678
11735
11757
11799
11877
11905
12027
12084
12100
12115
12154
12191
12381
12440
12483
12552
12667
12707
12755
12756
12844
12919
12965
13076
13137
13172
13191
13196
13229
13301
13395
13442
13482
13506
13553
13555
13614
13616
13655
13677


In [None]:
from core.utils import used_keys

# Start from the shared code -> description lookup and add descriptions for the
# extra "lim*" variables.
new_keys = used_keys.copy()
new_keys.update(
    {
        "limLPS": "building structure optimised for sunlight",
        "limAre": "sum of the largest 75 building areas in the morphotope",
        "limPer": "sum of the largest 75 building perimeters in the morphotope",
        "limLAL": "sum of the largest 75 axis lengths",
    }
)

In [None]:
# Translate the stored labels into the clusters of the selected cut, using
# column 3 of the mapping table as the lookup.
cut_lookup = cluster_mapping[3].to_dict()
all_data["label"] = all_data["label"].map(cut_lookup)

In [None]:
# Number of rows assigned to each cluster label after the remapping.
all_data["label"].value_counts()

label
7    34554418
5    12840909
4    11733158
3     7383591
8     7186509
1     3277945
2     2388970
6     1430313
Name: count, dtype: int64

In [None]:
## Doesn't include the indicator variable, since that is a morphotope variable.
# Mean of every character per cluster label, with the column codes replaced by
# their descriptions from ``new_keys``.
label_means = all_data.groupby("label").mean()
label_means.columns = label_means.columns.map(new_keys)
cluster_centres = table_data = label_means

In [None]:
# Split the columns into two halves (the second half gets the extra column when
# the count is odd).
n_first_half = len(table_data.columns) // 2
cols1 = table_data.columns[:n_first_half]
cols2 = table_data.columns[n_first_half:]

#### Get a subset of differentiating variables

In [None]:
# Linkage matrix and the data it was computed from, both for version ``v``.
linkage_path = f"/data/uscuni-ulce/processed_data/clusters/kmeans_linkage_{v}.npy"
cluster_input_path = (
    f"/data/uscuni-ulce/processed_data/clusters/cluster_input_data_{v}.pq"
)

linkage_matrix = np.load(linkage_path)
regional_ward_morphotopes_data = pd.read_parquet(cluster_input_path)

In [None]:
from scipy.cluster.hierarchy import fcluster

# Cut the linkage tree just below a distance of 1/4 and attach the resulting
# flat cluster id to every row of the clustering input.
final_cutoff = 1 / 4 - 0.00001
clusters = fcluster(linkage_matrix, t=final_cutoff, criterion="distance")
final_clusters = pd.Series(clusters, regional_ward_morphotopes_data.index)

# Median profile of each flat cluster and of the input data as a whole.
total_median = regional_ward_morphotopes_data.median()
cinput_data = regional_ward_morphotopes_data.groupby(final_clusters).median()


def _top_deviating_columns(cluster_median, n_top=10):
    """Return the ``n_top`` columns whose median deviates most from the overall one.

    Columns are ranked by squared deviation; the selected names are returned
    ordered from the most positive to the most negative deviation.
    """
    deviation = cluster_median - total_median
    furthest = (deviation**2).sort_values().index[-n_top:]
    return deviation.loc[furthest].sort_values(ascending=False).index.values


results = [_top_deviating_columns(median_row) for _, median_row in cinput_data.iterrows()]

# Union of the per-cluster selections.
col_subset = np.unique(results)
col_subset.shape

(52,)

#### Plot tables

In [None]:
# Recompute the per-label means so that the columns carry the raw character
# codes again: the earlier ``cluster_centres`` had its column labels replaced by
# descriptions, whereas the selection further down matches columns by code.
cluster_centres = all_data.groupby("label").mean()

In [None]:
# Character codes that are left out of the summary tables.
to_drop = [
    "stcSAl",
    "stbOri",
    "stcOri",
    "stbCeA",
    "ldkAre",
    "ldkPer",
    "lskCCo",
    "lskERI",
    "lskCWA",
    "ltkOri",
    "ltkWNB",
    "likWBB",
    "likWCe",
    "licBAD",
    "misBAD",
    "ssbCCM",
    "ssbCCD",
]

# Code -> description lookup restricted to the characters that are kept; codes
# in ``to_drop`` that are not present in ``new_keys`` are simply ignored.
col_keys = {
    code: description
    for code, description in new_keys.items()
    if code not in to_drop
}

In [None]:
# cluster_centres['limLPS'] = [2.0] * 9 + [-4.8, 2, 2, 2, 2, -4.8, 2]
# Keep only the characters listed in ``col_keys`` ...
kept_codes = cluster_centres.columns[cluster_centres.columns.isin(list(col_keys))]
table_data = cluster_centres.loc[:, kept_codes]
# ... label the columns with their descriptions ...
table_data.columns = [col_keys[code] for code in kept_codes]
# ... and the rows with the cluster names returned by ``get_cluster_names(3)``.
cluster_names = get_cluster_names(3)
table_data.index = table_data.index.map(cluster_names).values

In [None]:
# Split the columns into two halves so that each half is rendered as its own
# table (the second half gets the extra column when the count is odd).
n_first_half = len(table_data.columns) // 2
cols1 = table_data.columns[:n_first_half]
cols2 = table_data.columns[n_first_half:]

In [None]:
# Two-decimal format for every column; the gradient is computed per column
# (axis=0), so clusters are compared within each character.
f = dict.fromkeys(table_data.columns.values, "{:.2f}")
first_half = table_data.loc[:, cols1]
table = first_half.style.format(f).background_gradient(axis=0, cmap="BuGn")
table

Unnamed: 0,area of building,perimeter of building,courtyard area of building,circular compactness of building,corners of building,squareness of building,equivalent rectangular index of building,elongation of building,shared walls ratio of buildings,number of courtyards within adjacent buildings,perimeter wall length of adjacent buildings,level of building adjacency,alignment of neighbouring buildings,mean distance between neighbouring buildings,mean inter-building distance,street alignment of building,length of street segment,linearity of street segment,mean segment length within 3 steps,reached ETCs by local street network,reached total ETC area by local street network,buildings per meter of street segment,width of street profile,openness of street profile,width deviation of street profile,node degree of junction,local meshedness of street network,local proportion of 3-way intersections of street network,local proportion of 4-way intersections of street network
Incoherent Large-Scale Homogeneous Fabric,368.78,76.38,1.63,0.47,7.49,2.71,0.95,0.58,0.17,0.05,164.8,0.75,3.82,22.04,24.05,8.76,381.39,0.93,233.6,322.16,605944.26,0.09,30.91,0.65,3.92,2.84,0.1,0.71,0.13
Incoherent Large-Scale Heterogeneous Fabric,561.25,74.23,1.75,0.5,6.49,3.54,0.96,0.61,0.18,0.08,217.87,0.75,3.84,23.15,25.63,9.92,623.46,0.92,342.0,335.16,870183.15,0.09,32.22,0.75,4.14,2.72,0.08,0.71,0.11
Incoherent Small-Scale Linear Fabric,133.27,46.05,0.08,0.52,5.22,1.51,0.97,0.62,0.08,0.01,67.78,0.87,3.53,24.7,26.69,11.73,1987.72,0.91,912.85,733.44,2813207.11,0.07,31.62,0.89,4.67,2.91,0.08,0.72,0.09
Incoherent Small-Scale Sparse Fabric,140.66,47.28,0.32,0.53,5.81,1.82,0.96,0.66,0.07,0.0,63.23,0.89,4.8,30.44,32.8,10.55,765.05,0.92,529.46,355.26,1372893.55,0.07,31.08,0.83,4.56,2.71,0.07,0.72,0.07
Incoherent Small-Scale Compact Fabric,117.9,42.87,0.15,0.53,6.1,2.03,0.96,0.69,0.11,0.01,66.69,0.84,4.42,20.0,21.87,9.72,436.8,0.93,330.02,345.31,678119.83,0.13,27.52,0.67,4.69,2.73,0.08,0.71,0.08
Coherent Interconnected Fabric,221.41,61.48,2.04,0.49,7.98,5.01,0.92,0.64,0.4,2.58,671.06,0.4,3.28,7.69,10.33,5.61,174.29,0.97,133.23,433.65,311712.83,0.18,19.78,0.28,3.23,3.21,0.16,0.69,0.21
Coherent Dense Disjoint Fabric,92.17,38.03,0.06,0.54,6.12,2.16,0.96,0.69,0.18,0.01,74.82,0.75,3.41,12.52,14.29,8.07,224.27,0.93,180.53,375.76,363542.71,0.17,24.61,0.48,4.5,2.86,0.1,0.72,0.11
Coherent Dense Adjacent Fabric,110.43,42.16,0.29,0.5,6.36,3.15,0.95,0.63,0.35,0.15,182.95,0.56,3.14,9.24,11.73,7.69,194.62,0.95,154.13,413.85,319197.72,0.22,22.26,0.39,4.38,2.92,0.11,0.72,0.12


In [None]:
# Write the styled table of the first half of the columns to a PNG file.
await dfi.export_async(table, "../data/features1.png")

In [None]:
# Same styling as the first table, applied to the second half of the columns.
# Note that ``table`` is left holding the plain (unstyled) frame here.
f = dict.fromkeys(table_data.columns.values, "{:.2f}")
table = table_data.loc[:, cols2]
table2 = table.style.format(f).background_gradient(axis=0, cmap="BuGn")
table2

Unnamed: 0,local proportion of cul-de-sacs of street network,local closeness of street network,local node density of street network,local degree weighted node density of street network,local cul-de-sac length of street network,square clustering of street network,mean distance to neighbouring nodes of street network,area covered by node-attached ETCs,number of tess cells in node neigbhorhood,total area of tess cells in node neigbhorhood,longest axis length of ETC,area of ETC,circular compactness of ETC,equivalent rectangular index of ETC,perimeter-weighted neighbours of ETC,area covered by neighbouring cells,number of unique enclosures in ETC neigbhourhood,covered area ratio of ETC,area covered by edge-attached ETCs,connected buildings count,connected buildings area,connected buildings perimeter,connected buildings elongation,connected buildings equivalent rectangle index,connected buildings circular compactness,connected buildings longest axis length,connected buildings facade ratio,connected buildings square compactness,deviation of building area in tess. neighbourhood,deviation of building area in node-attached buildings
Incoherent Large-Scale Homogeneous Fabric,0.15,0.0,0.0,0.01,400.86,0.02,272.08,58854.34,59.11,117764.3,76.84,3010.63,0.44,0.95,0.05,20800.78,0.0,0.19,62187.71,3.23,1082.84,168.39,0.54,0.85,0.4,53.18,4.49,4.49,527.63,430.59
Incoherent Large-Scale Heterogeneous Fabric,0.17,0.0,0.0,0.01,632.93,0.02,432.82,105917.74,62.73,181060.05,87.65,4014.27,0.43,0.95,0.05,28584.53,0.0,0.17,113373.58,3.28,2564.57,228.06,0.61,0.87,0.46,60.63,5.27,5.27,1314.05,1003.0
Incoherent Small-Scale Linear Fabric,0.19,0.0,0.0,0.0,1796.31,0.01,1349.36,344374.7,160.68,621634.5,97.25,4719.85,0.45,0.96,0.04,23617.53,0.0,0.09,366840.38,1.68,240.92,67.37,0.62,0.93,0.49,23.3,2.74,2.74,108.67,147.69
Incoherent Small-Scale Sparse Fabric,0.2,0.0,0.0,0.0,959.54,0.02,565.17,136043.59,70.82,271830.92,98.84,4627.89,0.44,0.95,0.04,25720.61,0.0,0.08,142247.37,1.49,223.9,62.72,0.66,0.93,0.51,21.81,2.73,2.73,129.41,142.58
Incoherent Small-Scale Compact Fabric,0.21,0.0,0.0,0.01,763.08,0.01,319.31,62406.02,72.69,130814.92,61.2,1814.11,0.45,0.96,0.06,12494.42,0.0,0.13,65609.74,1.86,229.85,66.53,0.67,0.91,0.5,22.26,2.63,2.63,120.94,129.15
Coherent Interconnected Fabric,0.09,0.0,0.01,0.01,150.49,0.04,134.79,18909.71,85.03,53686.22,35.82,618.34,0.46,0.95,0.09,5875.42,0.0,0.43,17116.34,25.64,5373.59,885.49,0.57,0.43,0.29,148.7,5.1,5.1,301.19,226.48
Coherent Dense Disjoint Fabric,0.17,0.0,0.01,0.01,302.04,0.03,169.87,25636.24,77.91,65095.77,40.42,781.85,0.44,0.96,0.08,6002.9,0.0,0.2,25920.58,2.7,245.38,74.89,0.67,0.87,0.48,24.42,2.67,2.67,98.57,96.12
Coherent Dense Adjacent Fabric,0.15,0.0,0.01,0.01,259.15,0.03,148.46,21494.13,84.26,55296.95,35.24,596.72,0.41,0.95,0.1,4776.62,0.0,0.3,21432.79,8.26,763.48,190.76,0.53,0.73,0.35,52.96,3.45,3.45,127.25,117.79


In [None]:
# Write the styled table of the second half of the columns to a PNG file.
await dfi.export_async(table2, "../data/features2.png")

In [None]:
# Compact table: a hand-picked subset of characters, with the clusters shown in
# a fixed order.
selected_cols = [
    "area of building",
    "courtyard area of building",
    "shared walls ratio of buildings",
    "street alignment of building",
    "mean distance between neighbouring buildings",
    "alignment of neighbouring buildings",
    "buildings per meter of street segment",
    "length of street segment",
    "width of street profile",
    "linearity of street segment",
    "local proportion of cul-de-sacs of street network",
    "node degree of junction",
    "square clustering of street network",
    "mean distance to neighbouring nodes of street network",
    "area of ETC",
    "perimeter-weighted neighbours of ETC",
    "connected buildings count",
    "connected buildings facade ratio",
    "deviation of building area in tess. neighbourhood",
]
table = table_data.loc[:, selected_cols]
f = {k: "{:.2f}" for k in table_data.columns.values}

# The rows of ``table_data`` are labelled with the names from
# ``get_cluster_names(3)``, so the order has to be spelled with those names;
# selecting labels that are not in the index raises a KeyError.
# NOTE(review): the order below (coherent/dense first, large-scale last) is an
# attempt to mirror the previous dense-to-sparse arrangement - confirm.
row_order = [
    "Coherent Dense Adjacent Fabric",
    "Coherent Interconnected Fabric",
    "Coherent Dense Disjoint Fabric",
    "Incoherent Small-Scale Compact Fabric",
    "Incoherent Small-Scale Sparse Fabric",
    "Incoherent Small-Scale Linear Fabric",
    "Incoherent Large-Scale Homogeneous Fabric",
    "Incoherent Large-Scale Heterogeneous Fabric",
]
table.loc[row_order].style.format(f).background_gradient(axis=0, cmap="BuGn")

Unnamed: 0,area of building,courtyard area of building,shared walls ratio of buildings,street alignment of building,mean distance between neighbouring buildings,alignment of neighbouring buildings,buildings per meter of street segment,length of street segment,width of street profile,linearity of street segment,local proportion of cul-de-sacs of street network,node degree of junction,square clustering of street network,mean distance to neighbouring nodes of street network,area of ETC,perimeter-weighted neighbours of ETC,connected buildings count,connected buildings facade ratio,deviation of building area in tess. neighbourhood
Dense Connected Developments,122.59,0.34,0.34,7.52,9.46,2.99,0.22,195.82,22.87,0.94,0.15,2.95,0.03,149.72,579.31,0.1,7.59,3.53,150.82
Large Interconnected Blocks,203.45,2.21,0.41,5.85,6.79,3.37,0.19,165.29,18.71,0.97,0.1,3.21,0.04,128.42,536.98,0.1,25.41,4.99,279.88
Dense Standalone Buildings,101.66,0.12,0.1,6.04,14.84,2.94,0.15,221.18,26.3,0.95,0.13,3.06,0.05,167.88,867.94,0.07,1.72,2.47,111.32
Compact Development,83.27,0.05,0.2,7.44,10.68,3.03,0.18,180.97,23.81,0.93,0.15,2.93,0.04,137.91,569.6,0.09,2.73,2.63,90.59
Cul-de-Sac Layout,91.69,0.07,0.14,9.19,15.16,3.97,0.17,233.34,25.37,0.92,0.25,2.48,0.01,178.2,1188.85,0.07,2.0,2.49,98.79
Aligned Winding Streets,122.45,0.17,0.18,10.04,15.08,4.31,0.14,399.0,25.8,0.92,0.19,2.81,0.02,284.72,1490.98,0.07,2.71,2.92,142.95
Sparse Rural Development,147.03,0.07,0.05,11.4,34.58,3.85,0.05,1812.78,33.84,0.92,0.18,2.86,0.02,1281.57,6435.64,0.03,1.35,2.71,127.48
Dispersed Linear Development,116.73,0.02,0.06,9.83,20.95,2.45,0.11,2102.63,31.48,0.94,0.19,3.1,0.01,1469.76,2265.6,0.05,1.54,2.59,73.27
Linear Development,120.35,0.02,0.07,10.18,22.6,3.16,0.1,1505.61,31.04,0.94,0.2,2.92,0.02,1054.11,2881.58,0.05,1.6,2.61,89.15
Sparse Open Layout,172.28,0.54,0.07,10.45,29.78,4.9,0.06,702.87,30.8,0.92,0.2,2.71,0.02,507.39,4468.9,0.03,1.5,2.92,177.03
