In [1]:
from collections import namedtuple

import geopandas as gpd
import momepy as mm
import numba
import numpy as np
import pandas as pd
import shapely
import umap
from fast_hdbscan.numba_kdtree import NumbaKDTree, kdtree_to_numba, rdist, point_to_node_lower_bound_rdist
from scipy.cluster.hierarchy import fcluster
from sklearn.preprocessing import StandardScaler

from core.cluster_validation import get_linkage_matrix

In [2]:
# Root folder of the data store.
regions_datadir = "/data/uscuni-ulce/"
# Folder with the per-region morphotope tables, located under the root folder.
morphotopes_dir = regions_datadir + "processed_data/morphotopes/"
# File-name suffix identifying the clustering run whose outputs are read.
model_params = "_100_0_None_None_False"

In [3]:
%%time

### read all morphotopes
# Region ids are taken from the index of the region hull table.
hulls_path = regions_datadir + "regions/" + "cadastre_regions_hull.parquet"
region_hulls = gpd.read_parquet(hulls_path)

# One table per region; row labels get the region id as a prefix so they
# stay distinguishable once all regions are stacked.
region_tables = []
for region_id in region_hulls.index:
    region_morphotope_data = pd.read_parquet(f'{morphotopes_dir}data_morphotopes_{region_id}{model_params}.pq')
    prefix = str(region_id) + '_'
    region_morphotope_data.index = prefix + region_morphotope_data.index.str[:]
    region_tables.append(region_morphotope_data)

data = pd.concat(region_tables)

CPU times: user 19.5 s, sys: 6.12 s, total: 25.6 s
Wall time: 11.3 s


In [4]:
# Size of the combined morphotope table as (rows, columns).
data.shape

(1046897, 361)

In [5]:
# ## read a single region


# # model_params = '_100_3__median_gaussian_False'

# region_id = 69333
# data = pd.read_parquet(f'{morphotopes_dir}data_morphotopes_{region_id}{model_params}.pq')
# data.index = str(region_id) + '_' + data.index

In [6]:
### drop noise
# Rows whose label ends in '-1' are the noise entries; keep everything else.
is_noise = data.index.str[-2:] == '-1'
data = data[~is_noise]

In [7]:
# data = data.drop(columns=['mibLAL', 'mibCCo'], level=0)

In [8]:
# The last column is split off as `sizes`; the remaining columns are the features.
# NOTE: not idempotent - re-running this cell strips one more column from `data`.
sizes, data = data.iloc[:, -1], data.iloc[:, :-1]

In [9]:
# Standardise every column, keeping the original row and column labels.
scaled = StandardScaler().fit_transform(data)
component_data = pd.DataFrame(scaled, index=data.index, columns=data.columns)

# Columns whose standard deviation is exactly zero are removed.
zero_spread = component_data.columns[component_data.std() == 0]
component_data = component_data.drop(columns=zero_spread)

# Replace NaN / infinite entries with finite numbers and rebuild the frame.
vals = np.nan_to_num(component_data)
component_data = pd.DataFrame(
    vals,
    index=component_data.index,
    columns=component_data.columns,
)


In [10]:
# Independent copy of the standardised table; a later cell derives a reduced feature set from it.
standardised_data = component_data.copy()

In [11]:
# Summary statistics of `sizes`, with the first row of describe() (the count) left out.
sizes.sort_values(ascending=False).describe().iloc[1:]

mean    177.033752
std      72.752014
min     100.000000
25%     122.000000
50%     156.000000
75%     211.000000
max     903.000000
Name: (Size, Size), dtype: float64

In [526]:
# Restart from the standardised table and discard the percentile, median and std
# statistics found on column level 1.
component_data = standardised_data.drop(columns=['percentile_25', 'percentile_75', 'median', 'std'], level=1)

In [527]:
# Distinct labels on column level 0 that are still present.
component_data.columns.get_level_values(0).unique()

Index(['sdbAre', 'sdbPer', 'sdbCoA', 'ssbCCo', 'ssbCor', 'ssbSqu', 'ssbERI',
       'ssbElo', 'ssbCCM', 'ssbCCD', 'stbOri', 'mtbSWR', 'libNCo', 'ldbPWL',
       'ltcBuA', 'mtbAli', 'mtbNDi', 'ltbIBD', 'stbCeA', 'stbSAl', 'sdsLen',
       'sssLin', 'ldsMSL', 'ldsRea', 'ldsAre', 'sisBpM', 'sdsSPW', 'sdsSPO',
       'sdsSWD', 'mtdDeg', 'lcdMes', 'linP3W', 'linP4W', 'linPDE', 'lcnClo',
       'lddNDe', 'linWID', 'ldsCDL', 'xcnSCl', 'mtdMDi', 'sddAre', 'midRea',
       'midAre', 'stcOri', 'sdcLAL', 'sdcAre', 'sscCCo', 'sscERI', 'mtcWNe',
       'mdcAre', 'ltcWRB', 'sicCAR', 'stcSAl', 'ldkAre', 'ldkPer', 'lskCCo',
       'lskERI', 'lskCWA', 'ltkOri', 'ltkWNB', 'likWBB', 'sdsAre', 'likWCe',
       'mibCou', 'mibAre', 'mibLen', 'mibElo', 'mibERI', 'mibCCo', 'mibLAL',
       'mibFR', 'mibSCo'],
      dtype='object')

In [528]:
# Level-0 column labels that are excluded from the feature set.
# Previously considered but currently kept: 'sdbAre', 'ssbCCM', 'sdcLAL', 'mibCCo', 'mibLAL'.
to_drop = (
    ['stcSAl', 'stbOri', 'stcOri', 'stbCeA']
    + ['ldkAre', 'ldkPer', 'lskCCo', 'lskERI', 'lskCWA']
    + ['ltkOri', 'ltkWNB', 'likWBB', 'likWCe']
)

In [529]:
# Remove every column whose level-0 label is listed in `to_drop`.
component_data = component_data.drop(
    columns=to_drop, level=0)

In [530]:
# Collapse the multi-level column labels into single "<level0>_<level1>" strings.
flat_labels = []
for levels in component_data.columns.to_flat_index():
    flat_labels.append('_'.join(levels).strip())
component_data.columns = flat_labels

In [93]:
# Three reference rows of the feature table, picked by their morphotope label.
x = component_data.loc['69333_849_236'] # karlin offices
y = component_data.loc['69333_849_182'] # zelena liska
z = component_data.loc['69333_849_486'] # chodov

# Euclidean distances x-y and y-z in the feature space.
np.sqrt(((x - y)**2).sum()), np.sqrt(((y - z)**2).sum())

(np.float64(8.84110958854128), np.float64(9.913293362707826))

In [94]:
# The 25 features on which x and y differ most, largest absolute difference first.
diff_xy = x - y
order_xy = diff_xy.abs().argsort().values[::-1]
diff_xy.iloc[order_xy].iloc[:25]

sdbCoA_mean    4.022492
sdcAre_mean    3.984369
ssbCor_mean    3.640017
mdcAre_mean    2.645649
sdcLAL_mean    1.792943
sdbAre_mean    1.690348
mtbNDi_mean    1.521084
ssbCCM_mean    1.482951
ssbERI_mean   -1.385401
linPDE_mean    1.247006
mtdDeg_mean   -1.203678
ssbCCD_mean    1.075466
mibElo_mean    1.054611
linP3W_mean   -0.899570
sdbPer_mean    0.887113
ssbElo_mean    0.859899
mibCCo_mean    0.837118
sscERI_mean   -0.773786
sicCAR_mean   -0.736595
sscCCo_mean    0.624066
mibSCo_mean   -0.615479
mibFR_mean    -0.615479
lcdMes_mean   -0.612787
mtbSWR_mean   -0.579919
ssbCCo_mean    0.529240
dtype: float64

In [95]:
# The 25 features on which y and z differ most, largest absolute difference first.
diff_yz = y - z
order_yz = diff_yz.abs().argsort().values[::-1]
diff_yz.iloc[order_yz].iloc[:25]

sdbCoA_mean    7.412880
ssbSqu_mean    2.069991
lcdMes_mean    1.998629
sscCCo_mean   -1.836046
linPDE_mean   -1.821915
mdcAre_mean   -1.720401
mtbAli_mean    1.668633
ssbERI_mean    1.611204
sssLin_mean    1.486647
linP3W_mean    1.422661
ssbCCM_mean    1.325559
sscERI_mean   -1.137118
mibCCo_mean   -1.120127
ssbCor_mean    1.044433
ltcBuA_mean   -1.014777
mibElo_mean   -0.955507
mtbSWR_mean    0.856398
ssbCCo_mean   -0.804218
sdbAre_mean    0.795624
sdcAre_mean   -0.788661
ltbIBD_mean   -0.759279
mibFR_mean     0.702737
mibSCo_mean    0.702737
mibAre_mean    0.627306
sdbPer_mean   -0.603346
dtype: float64

In [96]:
# The 25 features on which x and z differ most, largest absolute difference first.
diff_xz = x - z
order_xz = diff_xz.abs().argsort().values[::-1]
diff_xz.iloc[order_xz].iloc[:25]

sdbCoA_mean    11.435373
ssbCor_mean     4.684450
sdcAre_mean     3.195708
ssbCCM_mean     2.808510
sdbAre_mean     2.485971
sscERI_mean    -1.910905
ssbSqu_mean     1.868737
mtbAli_mean     1.846933
sssLin_mean     1.605119
lcdMes_mean     1.385842
sdcLAL_mean     1.305528
sscCCo_mean    -1.211980
mtbNDi_mean     1.130538
mdcAre_mean     0.925248
ssbCCD_mean     0.777969
linWID_mean     0.711534
mtdDeg_mean    -0.706660
ltcBuA_mean    -0.665788
linPDE_mean    -0.574909
linP3W_mean     0.523091
lddNDe_mean     0.488031
ssbElo_mean     0.372072
sdsSWD_mean     0.335930
mibERI_mean    -0.291230
stbSAl_mean    -0.287568
dtype: float64

In [289]:
from sklearn.neighbors import KDTree
# KD-tree built over all rows of the feature table, used for nearest-neighbour queries below.
sklearn_tree = KDTree(component_data)

In [293]:
### 182 and similar to be split
### 69333_849_104 and _12 and their similar to be kept
### 69333_849_235

In [531]:
# Distances to and positions of the 20 nearest rows to morphotope 69333_849_235.
# NOTE: assumes `sklearn_tree` was built from the current `component_data`; rebuild it if the features changed.
dists, indxs = sklearn_tree.query(component_data.loc[['69333_849_235']], k=20)

In [532]:
# Feature rows of the neighbours returned by the query.
component_data.iloc[indxs[0]]

Unnamed: 0,sdbAre_mean,sdbPer_mean,sdbCoA_mean,ssbCCo_mean,ssbCor_mean,ssbSqu_mean,ssbERI_mean,ssbElo_mean,ssbCCM_mean,ssbCCD_mean,...,sdsAre_mean,mibCou_mean,mibAre_mean,mibLen_mean,mibElo_mean,mibERI_mean,mibCCo_mean,mibLAL_mean,mibFR_mean,mibSCo_mean
69333_849_235,6.361066,6.559513,9.544403,-2.809671,6.581269,5.084161,-2.810064,-1.977407,5.241692,7.897009,...,-0.299567,-0.282339,0.276151,0.202139,-1.162951,-0.135145,-0.8843,0.527682,2.916929,2.916929
4_1015_576,6.657294,6.076534,9.647267,-2.583384,3.99008,2.9915,-2.443071,-1.605773,5.22925,5.296136,...,-0.168404,0.103491,0.523715,0.379225,-0.826229,-0.549497,-0.799787,0.73246,3.479966,3.479966
69333_849_507,8.212463,7.248257,7.517238,-2.587798,7.592602,1.973517,-3.184784,-1.614845,5.800661,6.634778,...,-0.101731,-0.284117,0.364007,0.235156,-1.608493,-0.010893,-1.171413,0.715923,3.479468,3.479468
69333_849_4,5.798241,7.583892,7.706752,-4.843136,7.915635,2.688038,-5.078881,-3.04107,6.120845,8.7315,...,-0.085397,-0.110257,0.446675,0.551771,-1.968133,-1.006411,-2.058667,1.142481,2.27548,2.27548
3221_1_13,5.828617,6.045937,9.808451,-1.278046,6.065556,3.577161,-3.587298,-0.727422,3.907439,5.577772,...,0.23901,-0.309612,0.530071,0.376623,-0.445551,-0.48544,-0.232432,0.512537,1.94153,1.94153
84962_217_105,6.887221,7.470889,4.662834,-3.253057,9.324043,3.67997,-3.521517,-2.15269,5.810552,8.17247,...,-0.207482,-0.29106,0.294038,0.245352,-1.235082,-0.1723,-0.805623,0.615343,2.873785,2.873785
90770_255_247,5.11762,6.097275,8.915212,-2.830032,7.949672,2.555351,-3.652639,-1.665348,4.901957,5.835443,...,-0.272343,0.069175,0.512959,0.553414,-0.874654,-1.380445,-1.33672,0.950898,3.02292,3.02292
86873_3_951,6.399646,6.098583,12.310126,-2.32087,4.562314,1.679318,-1.666646,-1.912052,4.905438,5.110082,...,-0.141969,-0.342037,0.353518,0.248814,-0.966019,0.2479,-0.327567,0.46475,1.779786,1.779786
86873_3_339,7.20571,8.073597,5.427752,-3.160946,6.598555,0.845943,-5.227485,-1.760072,5.412708,9.50577,...,-0.18332,-0.242708,0.480516,0.485906,-1.055669,-1.037844,-1.11127,0.79242,1.992134,1.992134
109491_31_685,9.11198,7.019472,12.195062,-1.198469,3.941199,4.917797,-2.08054,-1.0358,6.013253,5.674274,...,-0.065779,-0.288364,0.538585,0.328669,-0.287317,-0.159426,0.025317,0.678661,4.088997,4.088997


In [105]:
# Region inspected in detail in the following cells.
region_id = 69333


# Tessellation geometries of the region; every row starts with the placeholder label "-1".
etcs = gpd.read_parquet(f'/data/uscuni-ulce/processed_data/tessellations/tessellation_{region_id}.parquet')
etcs['morph'] = "-1"

# Morphotope labels of the same region for the run identified by `model_params`.
morphotopes = pd.read_parquet(f'/data/uscuni-ulce/processed_data/morphotopes/tessellation_labels_morphotopes_{region_id}{model_params}.pq')
# Copy the values of the first column into the column named 'morphotope_label'.
morphotopes.loc[:, 'morphotope_label'] =  morphotopes.values[:, 0]
# NOTE(review): the whole 2-D `.values` array is assigned to one column, which presumably
# relies on the label table having exactly one column - confirm.
etcs.loc[morphotopes.index, 'morph'] = morphotopes.values

In [297]:
# etcs[etcs.morph == '849_122'].explore()

In [62]:
# Overwrites the x / y reference rows used by the difference cells.
x = component_data.loc['69333_849_182'] # zelena liska (this label is called "zelena liska" where y is first defined)
y = component_data.loc['69333_849_333'] # NOTE(review): place name for this label not recorded - confirm

In [263]:
# Folder with the per-region primary character tables.
chars_dir = "/data/uscuni-ulce/processed_data/chars/"

# Columns excluded from the analysis.
to_drop = [
    'stcSAl', 'stbOri', 'stcOri', 'stbCeA',
    'ldkAre', 'ldkPer', 'lskCCo', 'lskERI',
    'lskCWA', 'ltkOri', 'ltkWNB', 'likWBB', 'likWCe',
]

# Primary characters of the selected region without the excluded columns.
primary = pd.read_parquet(chars_dir + f'primary_chars_{region_id}.parquet').drop(columns=to_drop)

In [264]:
# 849_182, 849_507, 849_436
# 849_104, 849_121, 849_46, 849_122, 849_101
### 849_235, 849_507, 849_4, 849_182

In [None]:
# Display the primary characters table.
primary

In [654]:
# Value of etcs.morph that selects the rows inspected in the following cells.
target_morphotope = '849_507'

In [655]:
# Primary characters of the tessellation rows labelled with the target morphotope.
chars_data = primary.loc[etcs[etcs.morph == target_morphotope].index]
# vals = StandardScaler().fit_transform(chars_data)
# component_data = pd.DataFrame(vals, columns=chars_data.columns, index=chars_data.index)
# component_data = component_data.drop(component_data.columns[component_data.std() == 0], axis=1)

# # component_data = component_data[component_data.index >= 0]
# vals = np.nan_to_num(component_data)
# chars_data = pd.DataFrame(vals, columns=component_data.columns, index=component_data.index)

In [656]:
from sklearn.cluster import AgglomerativeClustering
from core.cluster_validation import get_linkage_matrix
# Single-linkage agglomerative clustering on euclidean distance; the full tree and
# the merge distances are requested from the estimator.
clusterer = AgglomerativeClustering(linkage='single',
                                    metric='euclidean',
                                    compute_full_tree=True,
                                    compute_distances=True)
model = clusterer.fit(chars_data.values)
# presumably a SciPy-style linkage matrix, since it is handed to fcluster afterwards - confirm in core.cluster_validation
linkage_matrix = get_linkage_matrix(model)

In [657]:
# Cut the tree at distance 5 and count the members of every flat cluster.
# NOTE(review): `fcluster` lives in scipy.cluster.hierarchy and has to be imported before this cell runs.
chars_clusters = fcluster(linkage_matrix, t=5, criterion='distance')
pd.Series(chars_clusters).value_counts()

3     45
18    19
2      9
14     4
10     3
      ..
46     1
60     1
40     1
66     1
48     1
Name: count, Length: 72, dtype: int64

In [658]:
# Labels of the ten most populated flat clusters.
top_10_clusters = pd.Series(chars_clusters).value_counts().iloc[:10].index

In [659]:
# Relabel every row outside the ten largest clusters as -1 (modifies chars_clusters in place).
chars_clusters[~np.isin(chars_clusters, top_10_clusters)] = -1

In [660]:
# Interactive map of the target morphotope's rows, coloured by flat cluster label.
# presumably reset_index is used so the rows line up by position with the chars_clusters array - confirm
etcs[etcs.morph == target_morphotope].reset_index().explore(column=chars_clusters, categorical=True)

In [564]:
from core.generate_clusters import preprocess_clustering_data,get_clusters,get_tree

In [661]:
from libpysal.graph import read_parquet
# Neighbourhood graph of the region's tessellation and the region's primary characters.
graph_dir = "/data/uscuni-ulce/processed_data/neigh_graphs/"
graph = read_parquet(graph_dir + f"tessellation_graph_{region_id}.parquet")
chars_dir = "/data/uscuni-ulce/processed_data/chars/"
X_train = pd.read_parquet(chars_dir + f'primary_chars_{region_id}.parquet')


# Parameters handed to the clustering helpers in the component loop.
min_cluster_size = 100
clip=None
eom_clusters = False
linkage='ward'
metric='euclidean'

In [662]:
# label building input data, could work with empty tess as well
building_graph = graph.subgraph(graph.unique_ids[graph.unique_ids >= 0])
labels = building_graph.component_labels

In [None]:
def post_process_clusters(group, min_cluster_size, distance_threshold=5):
    """Mark the rows of one cluster as kept (``group.name``) or noise (``-1``).

    The rows of ``group`` are re-clustered with single-linkage agglomerative
    clustering on euclidean distance and the resulting tree is cut at
    ``distance_threshold``. Rows that end up in a sub-cluster with at least
    ``min_cluster_size`` members keep the label ``group.name``; every other
    row is labelled ``-1``.

    Parameters
    ----------
    group : pandas.DataFrame
        One group as passed by ``DataFrameGroupBy.apply``; its ``name``
        attribute is the label of the cluster being processed.
    min_cluster_size : int
        Minimum number of rows a sub-cluster needs in order to be kept.
    distance_threshold : float, default 5
        Distance at which the single-linkage tree is cut.

    Returns
    -------
    pandas.Series
        One label per row of ``group``, indexed like ``group``.
    """
    # The noise group is passed through unchanged.
    if group.name == -1:
        return pd.Series(np.full(group.shape[0], -1), group.index)

    if group.shape[0] < 2:
        # Agglomerative clustering needs at least two samples; a smaller
        # group is treated as one sub-cluster containing all of its rows.
        clusters = np.ones(group.shape[0], dtype=int)
    else:
        clusterer = AgglomerativeClustering(linkage='single',
                                            metric='euclidean',
                                            compute_full_tree=True,
                                            compute_distances=True)
        model = clusterer.fit(group.values)
        linkage_matrix = get_linkage_matrix(model)
        clusters = fcluster(linkage_matrix, t=distance_threshold, criterion='distance')

    # Decide once per sub-cluster, from its size, which label its rows receive.
    cluster_sizes = pd.Series(clusters).value_counts()
    label_of_cluster = cluster_sizes.map(
        lambda size: group.name if size >= min_cluster_size else -1
    )
    new_labels = pd.Series(clusters).map(label_of_cluster).values
    return pd.Series(new_labels, group.index)

In [738]:
# Re-run the clustering for one component of the building graph and post-process its clusters.
# Leaves the final labels of that component in `component_clusters` and its rows in `group`.
for label, group in labels.groupby(labels):

    # only the component labelled 849 is processed; all others are skipped
    if label != 849: continue
    
    # components with at most min_cluster_size members are labelled -1 as a whole
    if group.shape[0] <= min_cluster_size:
        component_clusters = np.full(group.shape[0], -1)

    else:
        component_buildings_data = preprocess_clustering_data(X_train.loc[group.index.values], clip=clip, to_drop=to_drop)
        component_graph = building_graph.subgraph(group.index.values)
        # presumably transform('B') yields binary weights that act as the connectivity constraint - confirm
        ward_tree = get_tree(component_buildings_data, component_graph.transform('B').sparse, linkage, metric)

        # sometimes ward linkage breaks the monotonic increase in the MST
        # if that happens, raise every offending distance (column 2) to the
        # preceding distance plus 0.01
        # need a loop because several connections might be problematic
        problem_idxs = np.where(ward_tree[1:, 2] < ward_tree[0:-1, 2])[0]
        while problem_idxs.shape[0]:
            ward_tree[problem_idxs + 1, 2] = ward_tree[problem_idxs, 2] + .01
            problem_idxs = np.where(ward_tree[1:, 2] < ward_tree[0:-1, 2])[0]
        # check if ward tree distances are always increasing
        assert (ward_tree[1:, 2] >= ward_tree[0:-1, 2]).all()
        
        component_clusters = get_clusters(ward_tree, min_cluster_size, component_buildings_data.shape[0], eom_clusters=eom_clusters)

        ## post process
        # each cluster is passed to post_process_clusters, which returns one label per row
        res = component_buildings_data.groupby(component_clusters).apply(post_process_clusters, min_cluster_size=min_cluster_size)
        # NOTE(review): get_level_values(1) assumes apply returns a Series with a two-level
        # (cluster, row) index - confirm this also holds when there is only one cluster.
        # The labels are then put back into the row order of component_buildings_data.
        component_clusters = pd.Series(res.values, res.index.get_level_values(1)).loc[component_buildings_data.index].values
        # the target component has been handled; stop iterating
        break

In [746]:
# etcs.loc[group.index.values][component_clusters == 101].explore()