In [1]:
import momepy as mm
import numpy as np
import numba
import geopandas as gpd
import pandas as pd
import shapely
from sklearn.preprocessing import StandardScaler
from collections import namedtuple
from core.cluster_validation import get_linkage_matrix
from libpysal.graph import read_parquet, Graph
from core.generate_context import spatially_weighted_partial_lag
from core.generate_clusters import preprocess_clustering_data
from sklearn.cluster import AgglomerativeClustering
from core.cluster_validation import get_linkage_matrix
from scipy.cluster.hierarchy import fcluster
from core.utils import used_keys



In [2]:
# --- paths -------------------------------------------------------------------
regions_datadir = "/data/uscuni-ulce/"
morphotopes_dir = '/data/uscuni-ulce/processed_data/morphotopes/'

# suffix that identifies the per-region morphotope files read by the loaders below
model_params = '_75_0_None_None_False'
clip = None

# characters (first column level) removed from the morphotope data before clustering
to_drop = [
    'stcSAl', 'stbOri', 'stcOri', 'stbCeA',
    'ldkAre', 'ldkPer', 'lskCCo', 'lskERI', 'lskCWA', 'ltkOri', 'ltkWNB', 'likWBB', 'likWCe',
    'licBAD', 'misBAD',
    'ssbCCM', 'ssbCCD',
]

# --- agglomerative clustering settings (read as globals by the helpers) ---------
linkage = 'complete'
metric = 'euclidean'

# summary statistics (second column level) removed from the morphotope data
morphotope_stat_columns = ['percentile_25', 'percentile_75', 'mean', 'std']

# region hulls; the index of this frame is iterated as the list of region ids
region_hulls = gpd.read_parquet(
    regions_datadir + "regions/" + "cadastre_regions_hull.parquet"
)

In [3]:
def preprocess_data(data, scalar, drop_columns=morphotope_stat_columns):
    """Select, scale and clean the morphotope data of one region.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with two column levels and a string index.
    scalar : object
        Scaler exposing ``fit_transform``; it is (re)fitted on the selected data.
    drop_columns : list
        Labels of the second column level to remove.

    Returns
    -------
    pandas.DataFrame
        Scaled data with single-level columns, without zero-variance columns
        and with NaNs replaced by zeros.
    """
    # remove the unwanted statistics (level 1) and the excluded characters (level 0)
    selected = data.drop(columns=drop_columns, level=1).drop(columns=to_drop, level=0)
    selected.columns = selected.columns.get_level_values(0)

    # discard rows whose index ends with '-1'
    ends_with_minus_one = selected.index.str[-2:] == '-1'
    selected = selected[~ends_with_minus_one]

    scaled = pd.DataFrame(
        scalar.fit_transform(selected),
        columns=selected.columns,
        index=selected.index,
    )

    # columns without any variance carry no information for the clustering
    constant_columns = scaled.columns[scaled.std() == 0]
    scaled = scaled.drop(constant_columns, axis=1)

    return pd.DataFrame(
        np.nan_to_num(scaled),
        columns=scaled.columns,
        index=scaled.index,
    )

def read_region_morphotope_data(region_id, scalar, read_extra=False):
    """Read and preprocess the morphotope data of a single region.

    Parameters
    ----------
    region_id : int or str
        Region identifier; it is prefixed to every index value as ``'<region_id>_'``.
    scalar : object
        Scaler exposing ``fit_transform``. It is refitted by ``preprocess_data`` and,
        when ``read_extra`` is True, refitted again on the extra characters, so after
        the call it holds the statistics of the last frame it was fitted on.
    read_extra : bool, default False
        When True, the scaled ``limLPS`` column of the ``morph_chars`` file is appended
        twice (as ``limLPS`` and ``limLPS2``).

    Returns
    -------
    pandas.DataFrame
        Preprocessed morphotope data, optionally with the two extra columns.

    Raises
    ------
    AssertionError
        If merging the extra characters changes the index of the morphotope data.
    """
    print('processing' , region_id)
    data = pd.read_parquet(f'{morphotopes_dir}data_morphotopes_{region_id}{model_params}.pq')

    data.index = str(region_id) + '_' + data.index
    # the last column is not part of the clustering input
    data = data.iloc[:, :-1]
    component_data = preprocess_data(data, scalar)

    if read_extra:
        extra_chars = pd.read_parquet(f'{morphotopes_dir}morph_chars_{region_id}.pq')
        vals = scalar.fit_transform(extra_chars)
        extra_chars = pd.DataFrame(vals, columns=extra_chars.columns, index=extra_chars.index).fillna(0)

        # Only `limLPS` is used. The previous version first tried to drop
        # `component_data`'s zero-variance columns from `extra_chars`, which either did
        # nothing or raised a KeyError because those columns belong to a different frame.
        # `.assign` builds a new frame instead of writing into a column slice.
        extra_chars = extra_chars[['limLPS']].assign(limLPS2=lambda df: df['limLPS'])

        merged_data = pd.merge(component_data, extra_chars, how='inner', left_index=True, right_index=True)
        # the inner join must not lose or reorder any morphotope
        pd.testing.assert_index_equal(merged_data.index, component_data.index)
        component_data = merged_data

    return component_data

def get_morphotope_linkage(region_data):
    """Fit an agglomerative clustering on ``region_data`` and return its linkage matrix.

    The linkage criterion and the metric are taken from the module-level
    ``linkage`` and ``metric`` variables. The full tree and the merge distances
    are computed and handed to ``get_linkage_matrix``.
    """
    model = AgglomerativeClustering(
        linkage=linkage,
        metric=metric,
        compute_full_tree=True,
        compute_distances=True,
    ).fit(region_data)
    return get_linkage_matrix(model)


def get_all_clusters(cutoff):
    """Cut every region's stored linkage matrix at ``cutoff`` and collect the labels.

    Returns a single string Series indexed by the morphotope ids of all regions,
    with values of the form ``'<region_id>_<flat cluster id>'``.
    """

    def _region_clusters(region_id):
        # the preprocessed data is only needed for its index (one entry per tree leaf)
        region_index = read_region_morphotope_data(region_id, scalar=StandardScaler()).index
        tree = np.load(f'/data/uscuni-ulce/processed_data/morphotope_linkage/{linkage}_{metric}_{region_id}.npy')
        flat_labels = fcluster(tree, t=cutoff, criterion='distance')
        return str(region_id) + '_' + pd.Series(flat_labels, region_index).astype(str)

    return pd.concat([_region_clusters(region_id) for region_id in region_hulls.index])


def read_morphotopes_data(model_params):
    """Load the morphotope data of every region into one frame.

    Each region's index is prefixed with ``'<region_id>_'`` before the frames
    are concatenated in the order of ``region_hulls``.
    """

    def _load(region_id):
        frame = pd.read_parquet(f'{morphotopes_dir}data_morphotopes_{region_id}{model_params}.pq')
        frame.index = str(region_id) + '_' + frame.index.str[:]
        return frame

    return pd.concat([_load(region_id) for region_id in region_hulls.index])

def read_morphotopes_chars():
    """Load the ``morph_chars`` file of every region and stack them into one frame."""
    return pd.concat(
        [
            pd.read_parquet(f'{morphotopes_dir}morph_chars_{region_id}.pq')
            for region_id in region_hulls.index
        ]
    )

def morphotopes_to_etcs(region_id, etcs=True, model_params='_100_0_None_None_False'):
    """Load a region's geometries and attach their morphotope labels.

    Parameters
    ----------
    region_id : int or str
        Region identifier used in the file names and as prefix of the ``morph`` ids.
    etcs : bool, default True
        If truthy the tessellation file is read, otherwise the buildings file.
        The name is then reused for the loaded GeoDataFrame.
    model_params : str
        Suffix of the ``tessellation_labels_morphotopes`` file to read.

    Returns
    -------
    geopandas.GeoDataFrame
        The loaded geometries with two added columns: ``label`` (an integer code per
        distinct morphotope label, -1 where no label is assigned) and ``morph``
        (``'<region_id>_<morphotope label>'``, ``'<region_id>_-1'`` where no label
        is assigned).
    """


    if etcs:
        etcs = gpd.read_parquet(f'/data/uscuni-ulce/processed_data/tessellations/tessellation_{region_id}.parquet')

    else:
        etcs = gpd.read_parquet(f'/data/uscuni-ulce/processed_data/buildings/buildings_{region_id}.parquet')
        
    # default for rows that do not appear in the morphotope label file
    etcs['label'] = -1
    
    morphotopes = pd.read_parquet(f'/data/uscuni-ulce/processed_data/morphotopes/tessellation_labels_morphotopes_{region_id}{model_params}.pq')
    # NOTE(review): presumably the file holds a single column already named
    # 'morphotope_label', making this a self-assignment; otherwise a second column is
    # added and the `.values` used below become two-dimensional - confirm.
    morphotopes.loc[:, 'morphotope_label'] =  morphotopes.values[:, 0]

    # map every distinct morphotope label to a consecutive integer code
    morph_dict = pd.Series(np.arange(np.unique(morphotopes.values).shape[0]),
                       np.unique(morphotopes.values))
    etcs.loc[morphotopes.index, 'label'] = morphotopes.map(lambda x: morph_dict.loc[x]).values
    # region-prefixed morphotope id; '<region_id>_-1' marks rows without a morphotope
    etcs['morph'] = str(region_id) + '_' + '-1'
    etcs.loc[morphotopes.index, 'morph'] = str(region_id) + '_' + morphotopes.values
    return etcs

In [4]:
# read morphotopes data, drop the rows whose id ends with '-1' and drop the last column
morphotopes_data = (
    read_morphotopes_data(model_params)
    .loc[lambda df: ~df.index.str.endswith('-1')]
    .iloc[:, :-1]
)

In [22]:
%%time



## drop unwanted columns: excluded statistics (level 1) and excluded characters (level 0)
component_data = (
    morphotopes_data
    .drop(columns=morphotope_stat_columns, level=1)
    .drop(columns=to_drop, level=0)
)
component_data.columns = component_data.columns.get_level_values(0)

# add morphotope variables (only `limLPS` is used)
morph_chars_data = read_morphotopes_chars()[['limLPS']]
merged_data = component_data.merge(
    morph_chars_data,
    how='inner',
    left_index=True,
    right_index=True,
)
# the inner join must keep every morphotope, in the same order
pd.testing.assert_index_equal(merged_data.index, component_data.index)
grouped_data = merged_data

CPU times: user 25.1 s, sys: 7.38 s, total: 32.5 s
Wall time: 14.9 s


In [4]:
# # drop industrial data
# grouped_data = grouped_data[grouped_data['limLPS'] == 1]

In [5]:
# clipped_data = grouped_data.copy()
# target = 100

# for col in grouped_data.columns:
#     sorted_col = grouped_data[col].sort_values()
#     top = sorted_col.iloc[target]
#     bottom = sorted_col.iloc[-target]
#     clipped_data[col] = grouped_data[col].clip(bottom, top)

In [77]:
# standardise
# The clipping cell above is commented out, so `clipped_data` does not exist on a fresh
# kernel (Restart & Run All raised a NameError here). Standardise the unclipped
# `grouped_data`; swap in `clipped_data` only if the clipping cell is re-enabled.
scalar = StandardScaler()
vals = scalar.fit_transform(grouped_data)
# `nan` has to be passed by keyword: the second positional argument of
# np.nan_to_num is `copy`, so `np.nan_to_num(vals, 0)` silently meant copy=False.
vals = np.nan_to_num(vals, nan=0.0)
regional_ward_morphotopes_data = pd.DataFrame(vals, index=grouped_data.index, columns=grouped_data.columns)

# regional_ward_morphotopes_data.loc[grouped_data['limLPS'] == 1, 'limLPS'] = 2

In [9]:
### Limit to prague
# regional_ward_morphotopes_data = regional_ward_morphotopes_data[regional_ward_morphotopes_data.index.str.startswith('69333')]

In [79]:
%%time
import umap

# 20-dimensional UMAP embedding of the standardised morphotope data (fixed seed)
reducer = umap.UMAP(
    n_neighbors=25,
    n_components=20,
    min_dist=0,
    metric='euclidean',
    verbose=True,
    random_state=1,
)
# keep the morphotope ids as index of the embedding
embedding = pd.DataFrame(
    reducer.fit_transform(regional_ward_morphotopes_data),
    index=regional_ward_morphotopes_data.index,
)

  warn(


UMAP(min_dist=0, n_components=20, n_jobs=1, n_neighbors=25, random_state=1, verbose=True)
Fri Apr 25 23:51:45 2025 Construct fuzzy simplicial set
Fri Apr 25 23:51:45 2025 Finding Nearest Neighbors
Fri Apr 25 23:51:45 2025 Building RP forest with 41 trees
Fri Apr 25 23:51:57 2025 NN descent for 19 iterations
	 1  /  19
	 2  /  19
	 3  /  19
	 4  /  19
	Stopping threshold met -- exiting after 4 iterations
Fri Apr 25 23:52:40 2025 Finished Nearest Neighbor Search
Fri Apr 25 23:52:42 2025 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Sat Apr 26 00:03:34 2025 Finished embedding
CPU times: user 55min 21s, sys: 5.56 s, total: 55min 26s
Wall time: 11min 49s


In [82]:
# Persist the UMAP embedding so it can be reloaded without recomputing it.
# NOTE(review): the "Divisive kmeans" section reads '../data/morphotope_embedding.pq',
# which is a different file - confirm which embedding the later steps should use.
embedding.to_parquet('../data/morphotope_embedding_no_industry.pq')

### Final agglomerative clustering (complete linkage)

In [9]:
# Module-level settings read by get_morphotope_linkage (same values as the configuration cell).
linkage = 'complete'
metric = 'euclidean'

In [12]:
%%time
# Agglomerative clustering of the UMAP embedding; the result is used below as a linkage matrix.
linkage_matrix = get_morphotope_linkage(embedding)

In [13]:
## final dendrogram
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram
fig,ax = plt.subplots(figsize=(10,10))
# leaf labels are switched off; the returned dict is assigned to `_` so it is not echoed as cell output
_ = dendrogram(linkage_matrix,ax=ax, no_labels=True)

In [None]:
# Store the linkage matrix of the agglomerative clustering on disk.
np.save('../data/umap_complete_linkage_morphotopes.npy', linkage_matrix)

### Divisive kmeans

In [6]:
# Reload a stored UMAP embedding; it replaces any `embedding` computed earlier in the session.
# NOTE(review): the embedding saved above goes to '../data/morphotope_embedding_no_industry.pq',
# not to this file - confirm this is the intended input.
embedding = pd.read_parquet('../data/morphotope_embedding.pq')

In [7]:
from sklearn.cluster import KMeans

In [8]:
class DivisiveClustering:
    """Top-down hierarchical clustering built from recursive 2-means splits.

    ``fit`` splits the data in two with KMeans, then splits each half again, until
    every sample is on its own. The tree is stored bottom-up in ``linkage_matrix``
    with one row ``[child_a, child_b, distance, size]`` per split, where
    ``distance = 1 / recursion_level`` (the first split has level 1). Rows are
    appended after both children were processed, so row ``k`` describes the cluster
    with id ``n_samples + k``.

    Parameters
    ----------
    n_init : int, default 500
        Number of KMeans initialisations per split.
    random_state : int, default 123
        Seed passed to every KMeans call.

    Attributes
    ----------
    linkage_matrix : numpy.ndarray of shape (n_samples - 1, 4)
        The tree; set by ``fit``.
    labels_ : numpy.ndarray
        Initialised to zeros by ``fit`` and not updated afterwards; flat labels are
        obtained by cutting ``linkage_matrix``.
    current_id : int
        Id that the next internal node will receive.
    """

    def __init__(self, n_init=500, random_state=123):
        self.n_init = n_init
        self.random_state = random_state
        self.labels_ = None
        self.linkage_matrix = None
        self.current_id = 0

    def fit(self, X):
        """Build the tree for the 2-D array ``X`` (one row per sample) and return ``self``.

        Raises
        ------
        ValueError
            If ``X`` has no rows.
        """
        n_samples = X.shape[0]
        if n_samples == 0:
            # previously this surfaced as an IndexError from inside the recursion
            raise ValueError("DivisiveClustering.fit requires at least one sample")

        self.labels_ = np.zeros(n_samples, dtype=int)
        self.linkage_matrix = []
        # ids 0 .. n_samples - 1 are the samples; internal nodes continue from n_samples
        self.current_id = n_samples

        # Start with all points in one cluster, initial recursion_level is 1
        self._recursive_split(X, np.arange(n_samples), 1)

        # reshape keeps the (k, 4) layout even when there are no splits (single sample)
        self.linkage_matrix = np.array(self.linkage_matrix, dtype=float).reshape(-1, 4)
        return self

    def _bisect(self, X, indices):
        """Return a 0/1 label per entry of ``indices`` from a 2-means fit on ``X[indices]``."""
        kmeans = KMeans(n_clusters=2, random_state=self.random_state, n_init=self.n_init)
        kmeans.fit(X[indices])
        return kmeans.labels_

    def _recursive_split(self, X, indices, recursion_level):
        """Split ``indices`` recursively and return ``(node_id, node_size)``.

        The recursion depth equals the depth of the tree, so extremely unbalanced
        splits on large inputs can reach Python's recursion limit.
        """
        if len(indices) <= 1:
            return indices[0], 1

        mask0 = np.asarray(self._bisect(X, indices)) == 0
        if mask0.all() or not mask0.any():
            # KMeans could not separate the points (e.g. duplicated rows) and left one
            # side empty. Halve the indices instead so the recursion terminates and
            # the tree stays binary.
            mask0 = np.arange(len(indices)) < len(indices) // 2

        id0, size0 = self._recursive_split(X, indices[mask0], recursion_level + 1)
        id1, size1 = self._recursive_split(X, indices[~mask0], recursion_level + 1)

        # Use 1/recursion_level as distance: deeper splits merge at smaller distances
        distance = 1.0 / recursion_level

        self.linkage_matrix.append([float(min(id0, id1)), float(max(id0, id1)),
                                    float(distance), float(size0 + size1)])

        current_cluster = self.current_id
        self.current_id += 1
        return current_cluster, size0 + size1

In [9]:
# Divisive clusterer; its tree is available as `clusterer.linkage_matrix` after `fit`.
clusterer = DivisiveClustering()

In [None]:
%%time
# Timestamps taken before and after the fit, in addition to the %%time report of this cell.
start = np.datetime64('now')
# fit on the raw array: the clusterer indexes rows positionally (X[indices])
clusterer.fit(embedding.values)
end = np.datetime64('now')

In [None]:
# Store the divisive k-means tree on disk so it can be reloaded without refitting.
np.save('../data/kmeans_linkage_umap_embedded_morphotopes.npy', clusterer.linkage_matrix)

In [None]:
# np.save('/data/uscuni-ulce/processed_data/clusters/kmeans_linkage.npy', clusterer.linkage_matrix)

In [5]:
# Load the stored divisive k-means tree; this replaces any `linkage_matrix` computed earlier.
linkage_matrix = np.load('../data/kmeans_linkage_umap_embedded_morphotopes.npy')

# Alternative: take the tree straight from a clusterer fitted in this session.
# linkage_matrix = clusterer.linkage_matrix

In [6]:
# Distance threshold used to cut the tree. DivisiveClustering stores 1 / recursion_level as
# merge distance, so 1/5 corresponds to the splits made at recursion level 5.
final_cutoff = 1/5

In [7]:
# Cut the tree at the chosen distance and label every morphotope with its flat cluster id.
# NOTE(review): assumes the tree was built from rows in the same order as
# morphotopes_data.index - confirm for the file loaded above.
clusters = fcluster(linkage_matrix, t=final_cutoff, criterion='distance')
final_clusters = pd.Series(clusters, morphotopes_data.index)
# clusters_description = regional_ward_morphotopes_data.groupby(final_clusters).mean()
# clusters_description = grouped_data.groupby(final_clusters).mean()
# number of morphotopes per final cluster
final_clusters.value_counts()

11    193779
12    121648
9     102341
10     70372
1       4849
8       4698
7       4333
3       3799
13      3111
16      3106
2       3052
15      2949
14      1998
5       1680
6       1592
4        288
Name: count, dtype: int64

In [8]:
from core.cluster_validation import get_color
# Colour table indexed by cluster id: get_color (project helper) is evaluated for every
# morphotope and duplicate colour rows are dropped.
final_colors = pd.DataFrame(get_color(final_clusters.values), final_clusters.values).drop_duplicates()
# colour for the -1 label, which marks rows without a final cluster (presumably RGB white)
final_colors.loc[-1] = [255,255,255]

In [9]:
# Region to visualise; 69333 is the prefix used for the Prague subset earlier in this notebook.
region_id = 69333 

# region_id = 99886 # bratislava

# region_id = 151676 # vilnius

# region_id = 8707 # mainz/frankfurt
# region_id = 5883 #  freiburg
# region_id = 38679 #munich
# region_id = 55763 # berlin

# region_id = 86873 # vienna

# region_id = 107131 # krakow


# region_id= 66593

# region_id = 91011

In [10]:
# Load the geometries of the selected region together with their morphotope labels:
# etcs=False reads the buildings, etcs=True reads the tessellation cells.


etcs = morphotopes_to_etcs(region_id, etcs=False, model_params=model_params)

In [11]:
# etcs = morphotopes_to_etcs(69333, etcs=False, model_params=model_params)
# etcs2 = morphotopes_to_etcs(91011, etcs=False, model_params=model_params)
# etcs3 = morphotopes_to_etcs(93167, etcs=False, model_params=model_params)

# etcs = pd.concat((etcs, etcs2, etcs3), ignore_index=True)

In [12]:


# direct kmeans
# Look up the final cluster of every row through its morphotope id. `final_clusters` is
# indexed by morphotopes_data.index, so ids missing from it (e.g. '<region>_-1') get -1.
# The vectorised map replaces a per-row Python lambda with two index lookups per row;
# it requires the morphotope ids in `final_clusters` to be unique.
etcs['final'] = etcs['morph'].map(final_clusters).fillna(-1).astype(int)
etcs['regional'] = etcs['morph']


In [13]:
# ## can run this to change colors on an existing layer
# layer.get_fill_color = get_color(etcs.final)

In [14]:
# Simplify the geometries (tolerance 1, in the units of the source CRS), reproject them to
# EPSG:4326 and repair invalid ones; then keep only the rows whose geometry is a Polygon.
# NOTE(review): this cell overwrites `etcs`, so running it a second time simplifies the
# already reprojected geometries again, with the tolerance then read in degrees.
# Reload `etcs` before re-running.
etcs['geometry'] = etcs.simplify(1).to_crs(epsg=4326).make_valid()
etcs = etcs[etcs['geometry'].geom_type == 'Polygon']

In [15]:
%%time
import lonboard
# plotting = tessellation[tessellation.index.isin(X_train.index)].copy()
# Polygon layer built from `etcs`; the fill colours are assigned in a later cell.
layer = lonboard.SolidPolygonLayer.from_geopandas(etcs, opacity=.7)

CPU times: user 1.07 s, sys: 115 ms, total: 1.18 s
Wall time: 1.18 s


In [16]:
from sidecar import Sidecar
# Show the map in a sidecar panel titled with the cutoff that produced the clusters.
sc = Sidecar(title=f'Final Clusters - {final_cutoff}')
m = lonboard.Map(layer, basemap_style=lonboard.basemap.CartoBasemap.DarkMatter)
with sc:
    display(m)

In [17]:
from core.cluster_validation import get_color
# Colour every polygon by its final cluster: one row of `final_colors` per value of
# etcs.final (including -1), cast to uint8 for the layer.
layer.get_fill_color = final_colors.loc[etcs.final].values.astype('uint8')