In [1]:
import pandas as pd
import numpy as np

In [2]:
embedding = pd.read_parquet('../data/morphotope_embedding_no_isolates.pq')

In [3]:
MIN_SAMPLES = 15
data = embedding.values

In [4]:
from fast_hdbscan.numba_kdtree import kdtree_to_numba
from fast_hdbscan.boruvka import parallel_boruvka
from fast_hdbscan.cluster_trees import (
    mst_to_linkage_tree, condense_tree, 
cluster_tree_from_condensed_tree, extract_eom_clusters, extract_leaves,
get_cluster_label_vector, get_point_membership_strength_vector
)
from sklearn.neighbors import KDTree


def compute_minimum_spanning_tree(data, min_samples=10, sample_weights=None):
    """Build a KD-tree over ``data`` and run ``parallel_boruvka`` on it.

    Parameters
    ----------
    data : array-like
        Points handed to ``sklearn.neighbors.KDTree``.
    min_samples : int, default 10
        Forwarded unchanged to ``parallel_boruvka``.
    sample_weights : optional
        Forwarded unchanged to ``parallel_boruvka``.

    Returns
    -------
    tuple
        ``(edges, neighbors, core_distances)`` as unpacked from the
        ``parallel_boruvka`` result.
    """
    # Build the tree here so the result depends on the `data` argument rather
    # than on a notebook-level `numba_tree` that may not exist yet (or may have
    # been built from different data).
    numba_tree = kdtree_to_numba(KDTree(data))

    # NOTE(review): the MST cell of this notebook binds the result of the same
    # call to a single array (`mst.shape`), while this function unpacks three
    # values — confirm which form the installed fast_hdbscan version returns.
    edges, neighbors, core_distances = parallel_boruvka(
        numba_tree, min_samples=min_samples, sample_weights=sample_weights
    )
    return edges, neighbors, core_distances



In [5]:
%%time
sklearn_tree = KDTree(data)
numba_tree = kdtree_to_numba(sklearn_tree)
mst = parallel_boruvka(
        numba_tree, min_samples=MIN_SAMPLES, sample_weights=None
)
n_points = mst.shape[0] + 1
sorted_mst = mst[np.lexsort((mst.T[1], mst.T[0], mst.T[2]))]

linkage_tree = mst_to_linkage_tree(sorted_mst)


CPU times: user 8.21 s, sys: 42.6 ms, total: 8.25 s
Wall time: 876 ms


In [48]:
MIN_SAMPLES = 100

cluster_selection_method = 'eom'


In [49]:
condensed_tree = condense_tree(
    linkage_tree, min_cluster_size=MIN_SAMPLES, sample_weights=None
)
cluster_tree = cluster_tree_from_condensed_tree(condensed_tree)

if cluster_selection_method == "eom":
    selected_clusters = extract_eom_clusters(
                condensed_tree,
                cluster_tree,
                allow_single_cluster=False,
    )
else:
    selected_clusters = extract_leaves(condensed_tree, False)

In [50]:
# Per-point cluster labels for the selected clusters; n_points comes from the
# MST cell (number of MST rows + 1).
clusters = get_cluster_label_vector(
    condensed_tree,
    selected_clusters,
    0.0,  # presumably cluster_selection_epsilon — TODO confirm against fast_hdbscan
    n_samples=n_points
)
# Per-point membership strengths computed from the same tree, selection and labels.
membership_strengths = get_point_membership_strength_vector(
    condensed_tree, selected_clusters, clusters
)

In [51]:
pd.Series(clusters).value_counts()

 22    321941
 15     36302
 34     24750
 7      20449
 3      13714
 27     10918
 28      9338
 35      9128
 0       6329
 26      5744
 5       3297
 10      3037
-1       2839
 13      1999
 32      1856
 30      1830
 23      1573
 33      1537
 25      1016
 1        973
 31       799
 18       786
 4        543
 12       485
 9        479
 29       325
 11       321
 14       245
 21       238
 16       235
 6        206
 8        162
 17       156
 24       140
 2        126
 19       116
 20       105
Name: count, dtype: int64

In [33]:
final_clusters = pd.Series(clusters, index=embedding.index)

In [34]:
dists, nns = sklearn_tree.query(embedding.loc[['69333_849_74']].values, k=25)

In [35]:
clusters[nns]

array([[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4]])

In [36]:
clusters[clusters == 4]

array([4, 4, 4, ..., 4, 4, 4])

In [157]:
from core.cluster_validation import get_color

# One colour row per cluster label, indexed by the label itself.
cluster_label_values = final_clusters.values
final_colors = pd.DataFrame(get_color(cluster_label_values), index=cluster_label_values)

# De-duplicate on the label index, not on the colour values: drop_duplicates()
# compares row contents, so two labels that received the same colour would
# collapse into one row and the other label would be missing from later
# `final_colors.loc[...]` lookups.
final_colors = final_colors[~final_colors.index.duplicated(keep='first')]

# The -1 label is always drawn black (added if absent, overwritten if present).
final_colors.loc[-1] = [0, 0, 0]

NameError: name 'final_clusters' is not defined

In [148]:
region_id = 69333 

# region_id = 99886 # bratislava

# region_id = 151676 # vilnius

# region_id = 8707 # mainz/frankfurt
# region_id = 5883 #  freiburg
# region_id = 38679 #munich
# region_id = 55763 # berlin

# region_id = 86873 # vienna

# region_id = 107131 # krakow


# region_id= 66593

# region_id = 91011

In [7]:
# etcs=False to read buildings, etcs=True for tessellation cells.
model_params = '_post_processing_v1'
import geopandas as gpd

def morphotopes_to_etcs(region_id, etcs=True, model_params='_100_0_None_None_False'):
    """Attach morphotope labels to the tessellation cells or buildings of a region.

    Parameters
    ----------
    region_id : int or str
        Interpolated into the input file names and used as the prefix of the
        ``morph`` column.
    etcs : bool, default True
        If truthy the tessellation file is read, otherwise the buildings file.
    model_params : str
        Suffix of the morphotope label file name.

    Returns
    -------
    geopandas.GeoDataFrame
        The geometries read from disk with two added columns: ``label``, an
        integer code per morphotope (``-1`` for rows without one), and
        ``morph``, the string ``"{region_id}_{morphotope}"``
        (``"{region_id}_-1"`` for rows without one).
    """
    if etcs:
        geometry_path = f'/data/uscuni-ulce/processed_data/tessellations/tessellation_{region_id}.parquet'
    else:
        geometry_path = f'/data/uscuni-ulce/processed_data/buildings/buildings_{region_id}.parquet'
    # Separate name for the frame so the boolean `etcs` flag is not rebound.
    geometries = gpd.read_parquet(geometry_path)

    morphotopes = pd.read_parquet(f'/data/uscuni-ulce/processed_data/morphotopes/tessellation_labels_morphotopes_{region_id}{model_params}.pq')
    # Labels are taken from the first column explicitly, so the result does not
    # depend on how many columns the label file happens to contain.
    morphotope_labels = morphotopes.iloc[:, 0]

    # Integer codes follow the sorted order of the unique labels; this replaces
    # a per-element `.loc` lookup with a single vectorised call.
    label_codes, _ = pd.factorize(morphotope_labels, sort=True)

    geometries['label'] = -1
    geometries.loc[morphotope_labels.index, 'label'] = label_codes

    prefix = str(region_id) + '_'
    geometries['morph'] = prefix + '-1'
    geometries.loc[morphotope_labels.index, 'morph'] = (
        prefix + morphotope_labels.astype(str)
    ).to_numpy()
    return geometries
    


In [5]:
etcs = morphotopes_to_etcs(region_id, etcs=False, model_params=model_params)

NameError: name 'region_id' is not defined

In [6]:
# etcs1 = morphotopes_to_etcs(69333, etcs=False, model_params=model_params)
# etcs2 = morphotopes_to_etcs(74378, etcs=False, model_params=model_params)

# etcs = pd.concat((etcs1, etcs2), ignore_index=True)

In [193]:


# # direct kmeans
# etcs['final'] = etcs['morph'].map(lambda x: final_clusters.loc[x] if x in embedding.index else -1)
# etcs['regional'] = etcs['morph']


In [194]:

# Flag rows whose morphotope is listed in problem_morphs or whose id ends in '-1'.
_in_problem_list = etcs['morph'].isin(problem_morphs)
_ends_in_minus_one = etcs['morph'].str[-2:] == '-1'
etcs['problem'] = (_in_problem_list | _ends_in_minus_one).astype(int)

In [195]:
# ## can run this to change colors on an existing layer
# layer.get_fill_color = get_color(etcs.final)

In [196]:
# Simplify with tolerance 1 (in the units of the frame's current CRS), then
# reproject to EPSG:4326 and repair any invalid geometries.
# NOTE(review): this overwrites `etcs` in place, so re-running the cell would
# simplify the already-reprojected geometries again, and later cells that read
# `etcs` see the reprojected shapes — confirm it runs once per load.
etcs['geometry'] = etcs.simplify(1).to_crs(epsg=4326).make_valid()
# Keep only rows whose geometry type is exactly 'Polygon'.
etcs = etcs[etcs['geometry'].geom_type == 'Polygon']

In [7]:

%%time
import lonboard
# plotting = tessellation[tessellation.index.isin(X_train.index)].copy()
layer = lonboard.SolidPolygonLayer.from_geopandas(etcs, opacity=.7)

NameError: name 'etcs' is not defined

In [8]:
from sidecar import Sidecar

# Show the lonboard map inside a Sidecar titled "Final Clusters".
sc = Sidecar(title='Final Clusters')
dark_basemap = lonboard.basemap.CartoBasemap.DarkMatter
m = lonboard.Map(layer, basemap_style=dark_basemap)
with sc:
    display(m)

NameError: name 'layer' is not defined

In [None]:
from core.cluster_validation import get_color

# Palette for the `problem` flag. It lives under its own name so the
# per-cluster `final_colors` palette used by other cells is not overwritten.
problem_colors = pd.DataFrame(get_color(np.arange(3)))
problem_colors.loc[-1] = [0, 0, 0]

# Rows whose morphotope id ends in '-1' get the flag -1; of the rest, those
# listed in problem_morphs get 1 and all others 0.
is_noise = etcs.morph.str[-2:] == '-1'
etcs['problem'] = (etcs.morph.isin(problem_morphs) | is_noise).astype(int)
etcs.loc[is_noise, 'problem'] = -1

# One colour row per geometry, looked up by its flag value.
layer.get_fill_color = problem_colors.loc[etcs.problem].values.astype('uint8')

In [190]:
from core.cluster_validation import get_color
# Colour each geometry by its `final` cluster label via the final_colors lookup table.
# NOTE(review): the only assignment to etcs['final'] visible in this notebook is
# commented out, so this line fails unless that cell is restored and run first.
layer.get_fill_color = final_colors.loc[etcs.final].values.astype('uint8')

AttributeError: 'GeoDataFrame' object has no attribute 'final'

In [8]:
region_id = 69333

In [9]:
etcs = morphotopes_to_etcs(region_id, etcs=False, model_params=model_params)

In [10]:
modernist_morph = '69333_849_364'
industrial_morph = '69333_1354_5'

In [11]:
modernist_etcs = etcs[etcs.morph == modernist_morph]
industrial_etcs = etcs[etcs.morph == industrial_morph]


In [12]:
from libpysal.graph import read_parquet, Graph
graph_dir = '/data/uscuni-ulce/processed_data/neigh_graphs/'
buildings_q1 = read_parquet(graph_dir + f"building_graph_{region_id}.parquet")

In [13]:
def connect_buildings(group, graph=None):
    """Dissolve the rows of ``group`` by connected component of a graph.

    Parameters
    ----------
    group : GeoDataFrame-like
        Rows to merge; ``group.index`` selects the nodes of the subgraph.
    graph : optional
        Object providing ``subgraph(ids).component_labels``. Defaults to the
        notebook-level ``buildings_q1`` graph, so existing one-argument calls
        behave as before.

    Returns
    -------
    The result of ``group.dissolve(...)`` keyed by the component labels of the
    subgraph restricted to ``group.index``.
    """
    if graph is None:
        # Backward-compatible default: the region graph loaded in the notebook.
        graph = buildings_q1
    component_labels = graph.subgraph(group.index).component_labels
    return group.dissolve(component_labels)

In [14]:
connected_modernist = connect_buildings(modernist_etcs)
connected_industrial = connect_buildings(industrial_etcs)

In [16]:
# Classifies every morphotope in `etcs` as "problem" or "ok". Other cells read
# `problem_morphs`, so this cell has to run for the notebook to work on a fresh
# kernel. (Prefix the cell with the %%time magic to report its cost.)
problem_morphs = []
ok_morphs = []

# sort=False yields groups in order of first appearance; one pass over the
# frame replaces filtering the whole frame once per morphotope.
for morph, morph_etcs in etcs.groupby('morph', sort=False):

    # Ids ending in '-1' are the placeholder for rows without a morphotope.
    if morph[-2:] == '-1':
        continue

    cb = connect_buildings(morph_etcs)

    # Fewer than three connected building groups: flagged without triangulating.
    if cb.shape[0] < 3:
        problem_morphs.append(morph)
        continue

    tri = Graph.build_triangulation(cb.representative_point(), method='relative_neighborhood', kernel='identity')
    # Edge weights relative to the median of cb.length.
    res = tri._adjacency / cb.length.median()

    # Any edge longer than twice that median marks the morphotope as a problem.
    if res.max() > 2:
        problem_morphs.append(morph)
    else:
        ok_morphs.append(morph)

In [17]:
len(problem_morphs)

48

In [23]:
morph = '69333_849_204'

In [24]:
morph_etcs = etcs[etcs.morph == morph]
cb = connect_buildings(morph_etcs)
tri = Graph.build_triangulation(cb.representative_point(), method='relative_neighborhood', kernel='identity')

# cut = tri._adjacency.sort_values().values[int(.15*len(tri._adjacency)):int(.85*len(tri._adjacency))]
# pd.Series(cut).describe()

tri._adjacency.describe()

count     46.000000
mean      93.650099
std       33.771822
min       23.034998
25%       66.348799
50%       87.182840
75%      111.593623
max      183.617370
Name: weight, dtype: float64

In [25]:
cb.length.median()

np.float64(831.4940793153625)

In [26]:
m = tri.explore(cb)
cb.explore(m=m)

In [27]:
# Keep only the edges whose weight is below twice the median of cb.length,
# rebuild a graph from them and draw it over the dissolved buildings.
max_edge_weight = 2 * cb.length.median()
short_edges = tri._adjacency[(tri._adjacency < max_edge_weight).values]
tri2 = Graph.from_adjacency(short_edges.reset_index())
m = tri2.explore(cb)
cb.explore(m=m)

In [108]:
tri = Graph.build_triangulation(connected_industrial.representative_point(), method='relative_neighborhood', kernel='identity')

# cut = tri._adjacency.sort_values().values[int(.15*len(tri._adjacency)):int(.85*len(tri._adjacency))]
# pd.Series(cut).describe()

tri._adjacency.describe()

count    334.000000
mean      73.176054
std       39.900737
min        7.867718
25%       46.564948
50%       64.367869
75%       90.965714
max      221.032582
Name: weight, dtype: float64

In [109]:
tri._adjacency.describe()['std'] / connected_industrial.length.median()

np.float64(0.35862188590235045)

In [101]:
# m = tri.explore(connected_industrial)
# connected_industrial.explore(m=m)