In [1]:
%%time

import glob

import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler

from sklearn.neighbors import KDTree

from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import fcluster
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import davies_bouldin_score
from core.cluster_validation import get_linkage_matrix

from fast_hdbscan.cluster_trees import (
    cluster_tree_from_condensed_tree,
    condense_tree,
    extract_eom_clusters,
    extract_leaves,
    get_cluster_label_vector,
    mst_to_linkage_tree,
)
from sklearn.neighbors import KDTree
import shapely
from core.cluster_validation import generate_detailed_clusters
from core.generate_context import spatially_weighted_partial_lag
from core.generate_clusters import cluster_data
# NOTE(review): the output recorded for this cell is an ImportError saying that
# `post_process_clusters` cannot be imported from core.generate_clusters, so the whole
# import cell fails. None of the four names on this line are used in the cells shown
# here — confirm, then rename or drop the stale name.
from core.generate_clusters import preprocess_clustering_data, get_clusters, post_process_clusters, get_tree
from core.generate_context import partial_weighted_percentile

import umap
import umap.plot


# Locations of the processed data. Each value ends with a slash because the file
# names are appended to it by plain string concatenation in the loading cell.
tessellations_dir = '/data/uscuni-ulce/processed_data/tessellations/'  # tessellation geometries
chars_dir = "/data/uscuni-ulce/processed_data/chars/"  # primary character tables
graph_dir = "/data/uscuni-ulce/processed_data/neigh_graphs/"  # tessellation neighbourhood graphs
cluster_dir = '/data/uscuni-ulce/processed_data/clusters/'  # not referenced by the cells shown here

ImportError: cannot import name 'post_process_clusters' from 'core.generate_clusters' (/home/krasen/urban_taxonomy/src/core/generate_clusters.py)

## Clustering parameters

In [None]:
### specify region id
# This identifier is interpolated into every input and output file name used below.
region_id = 69333

In [None]:
## specify clustering parameters

# forwarded to cluster_data(); presumably the minimum morphotope size — confirm in core.generate_clusters
min_cluster_size = 100
# passed as `k` to spatially_weighted_partial_lag()
spatial_lag = 3
# kernel name passed to spatially_weighted_partial_lag()
kernel='gaussian'

# least important 10 features
to_drop = ['sdsLen', 'sssLin', 'ltcBuA', 'lcnClo', 'mtbSWR', 'ssbCor', 'xcnSCl', 'mtdDeg', 'libNCo', 'sdbCoA']

# only lag columns whose name contains this substring are kept
lag_type = '_median'

# the remaining values are forwarded to cluster_data() unchanged
clip = None
linkage='ward'
metric='euclidean'

In [None]:
# some other available options


# spatial_lags = [1, 2, 3, 4, 5]
# kernels = ['gaussian', 'inverse', 'weights']
# lag_types = ['_median', '_iqr', '_']
# cluster_sizes = [50, 75, 100, 150, 250]
# chars_to_drop = [
#     [],
#     ['stcSAl', 'stcOri'],
#     [
#        'stcSAl',
#        'ltkOri',
#          'stbOri',
#          'stcOri',
#          'stbCeA'
#     ]
          
# ]
# clips = [None, (-5,5), (-10, 10)]


# linkage = 'ward' - any of the sklearn options
# metric = 'euclidean' - any of the sklearn options

## Read morph. characters data

In [None]:
# Primary characters of the selected region.
chars_path = chars_dir + f'primary_chars_{region_id}.parquet'
X_train = pd.read_parquet(chars_path)

# Neighbourhood graph of the tessellation, read with libpysal's graph reader.
graph_path = graph_dir + f"tessellation_graph_{region_id}_knn1.parquet"
graph = read_parquet(graph_path)

# Tessellation geometries of the same region.
tessellation_path = tessellations_dir + f"tessellation_{region_id}.parquet"
tessellation = gpd.read_parquet(tessellation_path)

In [None]:
%%time

# Coordinates of one representative point per tessellation cell,
# used for the distance calculations of the spatial lag.
cell_points = tessellation.representative_point()
centroids = shapely.get_coordinates(cell_points)

# Generate the spatial context of every cell.
lag = spatially_weighted_partial_lag(
    X_train,
    graph,
    centroids,
    kernel=kernel,
    k=spatial_lag,
    n_splits=10,
    bandwidth=-1,
)

# Drop the unwanted lag types, then join what is left with the tessellation cell data.
wanted_columns = [name for name in lag.columns if lag_type in name]
lag = lag[wanted_columns]
clustering_data = X_train.join(lag, how='inner')

In [None]:
# cluster data

In [None]:
%%time
# Cluster the combined cell + context data; all arguments are passed positionally.
# NOTE(review): later cells treat the result as a pandas Series of string labels that
# contain '_' — confirm against core.generate_clusters.cluster_data.
region_cluster_labels = cluster_data(clustering_data, graph, to_drop, clip, min_cluster_size, linkage, metric)

In [None]:
# Shape of the array of distinct labels, i.e. how many different labels were produced.
np.unique(region_cluster_labels).shape

## Visualise a subset of morphotopes

In [None]:
# Split the tessellation graph into connected components, based on the contiguity
# of tessellation cells with buildings (the nodes with a non-negative id).
node_ids = graph.unique_ids
building_graph = graph.subgraph(node_ids[node_ids >= 0])
labels = building_graph.component_labels

In [None]:
# Number of graph nodes in each connected component.
labels.value_counts()

In [None]:

### pick the connected component to visualise
label = 849
component_members = labels.groupby(labels).get_group(label)
component_members.shape  # should be the same as above

In [None]:
## setup plotting dataframe
# Ids of the cells that belong to the chosen connected component.
component_cells = labels.groupby(labels).get_group(label).index.values

plotting = tessellation.loc[component_cells].reset_index()
plotting['label'] = region_cluster_labels.loc[component_cells].values
# Keep only the integer that follows the first '_' of each label string.
plotting['label'] = plotting['label'].str.split('_').str[1].astype(int)
plotting.shape, plotting['label'].nunique()

In [None]:
%%time
import lonboard
from sidecar import Sidecar
from core.cluster_validation import get_color



# Polygon layer built from the cells of the chosen component.
layer = lonboard.SolidPolygonLayer.from_geopandas(plotting, opacity=.08)

# Show the map in a side panel titled 'Morphotopes'.
sc = Sidecar(title='Morphotopes')
m = lonboard.Map(layer, basemap_style=lonboard.basemap.CartoBasemap.Positron)
with sc:
    display(m)

# Colour the polygons from their integer labels, after the map has been displayed.
# `plotting['label']` is in the same row order as the layer, since both come from `plotting`.
layer.get_fill_color = get_color(plotting['label'].values)

In [None]:
### dissolve and save morphotopes

In [None]:
%%time

# Attach the cluster label to every clustered cell.
labelled_cells = tessellation.loc[region_cluster_labels.index]
labelled_cells['label'] = region_cluster_labels.values

# Merge the cells of each label into one geometry, simplify it with a tolerance of 1
# and keep the result as a single 'geometry' column indexed by label.
clrs_geometry = labelled_cells.dissolve('label').simplify(1).to_frame()
clrs_geometry.columns = ['geometry']

# Same geometries with the label moved back into a regular column.
morph_clrs_geometry = clrs_geometry.set_geometry('geometry').reset_index()

In [None]:
# Write the dissolved morphotope geometries; the file name encodes the run parameters.
# `lag_type` is interpolated right after a literal '_', so a value that itself starts
# with '_' (as set in the parameters cell) produces a double underscore in the name.
morph_clrs_geometry.to_parquet(f'../data/morphotopes_{region_id}_{min_cluster_size}_{spatial_lag}_{lag_type}_{kernel}.pq')

## Generate morphotope data for clustering

In [None]:
def percentile(n):
    """Build an aggregation function that returns the ``n``-th percentile.

    The returned callable takes one array-like argument and evaluates
    ``np.percentile`` on it. Its ``__name__`` is set to ``'percentile_<n>'``
    so that each percentile gets its own, readable name.
    """
    agg_name = 'percentile_%s' % n

    def percentile_(values):
        return np.percentile(values, n)

    percentile_.__name__ = agg_name
    return percentile_

In [None]:
%%time

# Characters of the clustered cells only.
clustered_chars = X_train.loc[region_cluster_labels.index]

# Per label, summarise every character by its 10th percentile, median and 90th percentile.
summaries = [percentile(10), 'median', percentile(90)]
component_data = clustered_chars.groupby(region_cluster_labels.values).agg(summaries)


In [None]:
# Inspect the aggregated table (one row per label).
component_data

In [None]:
## Standardise component data and assign 0 to nulls, in order to cluster it
scaled = StandardScaler().fit_transform(component_data)
component_data = pd.DataFrame(scaled, index=component_data.index, columns=component_data.columns)

# Discard the columns whose standard deviation is exactly zero after scaling.
constant_columns = component_data.columns[component_data.std() == 0]
component_data = component_data.drop(columns=constant_columns)

# Replace the remaining NaNs with 0 and rebuild the frame around the filled array.
filled = np.nan_to_num(component_data)
component_data = pd.DataFrame(filled, index=component_data.index, columns=component_data.columns)

### UMAP visualisation and dim. reduction

In [None]:
# Settings shared by both UMAP runs below; umap_metric is also written into the output file name.
umap_metric ='euclidean'
n_neighbors = 10

#### UMAP run for 2d visualisation only

In [None]:

# Two-component embedding, used only for the scatter plot drawn below.
reducer = umap.UMAP(
    n_neighbors=n_neighbors,
    n_components=2,
    negative_sample_rate=50,
    min_dist=0.1,
    metric=umap_metric,
    verbose=False,
    random_state=1,
)
emb2d = reducer.fit_transform(component_data)
umap.plot.points(reducer)

#### UMAP run for dim. reduction to 20 dim

In [None]:
# Twenty-component embedding; note that this replaces the `reducer` of the 2D run.
reducer = umap.UMAP(
    n_neighbors=n_neighbors,
    n_components=20,
    min_dist=0,
    negative_sample_rate=50,
    metric=umap_metric,
    verbose=True,
    random_state=1,
)
embedding = reducer.fit_transform(component_data)

## Hierarchy generation 

In [None]:
## hierarchy and input data type

# Linkage criterion and metric handed to AgglomerativeClustering; both also appear in the output file name.
final_linkage = 'ward'
final_metric = 'euclidean'
# Data the hierarchy is built on: the 20-component UMAP embedding.
tr_data = embedding

In [None]:
%%time
## cluster data
# Build the full tree and keep the merge distances on the fitted model.
hierarchy_options = dict(
    linkage=final_linkage,
    metric=final_metric,
    compute_full_tree=True,
    compute_distances=True,
)
clusterer = AgglomerativeClustering(**hierarchy_options)
model = clusterer.fit(tr_data)

In [None]:
## visualise dendrogram
# Convert the fitted model into a linkage matrix (project helper); the same matrix is
# reused by the fcluster cuts further down.
linkage_matrix = get_linkage_matrix(model)
fig,ax = plt.subplots(figsize=(20,10))
# The return value is discarded; only the drawing on `ax` is wanted.
_ = dendrogram(linkage_matrix, ax=ax)

In [None]:
# select cutoff and cut

In [None]:
# Cut the hierarchy at a fixed distance threshold; `cutoff` also ends up in the output file name.
cutoff = 25
clusters = fcluster(linkage_matrix, t=cutoff, criterion='distance')
# Distinct cluster ids produced by this cut.
np.unique(clusters)

### Visualise urban fabrics

In [None]:
# Assign the new cluster labels to the original tessellation cells for visualisation.
# On entry `clusters` holds one id per row of `component_data`; on exit it holds one id
# per entry of `region_cluster_labels`, so this cell needs a fresh cut before a re-run.
morh_clusters = region_cluster_labels.values
cluster_of_morphotope = pd.Series(clusters, index=component_data.index.values)
clusters = cluster_of_morphotope.loc[morh_clusters].values

In [None]:
%%time
import lonboard

# Build the layer from the same rows, in the same order, as `region_cluster_labels`.
# The colour array assigned to this layer is derived from `region_cluster_labels.values`,
# so selecting the cells through that index keeps polygon i and colour i on the same
# cell. The dissolve cells of this notebook select the tessellation rows the same way;
# filtering on `tessellation.index >= 0` only matched when both happened to agree.
layer = lonboard.SolidPolygonLayer.from_geopandas(
    tessellation.loc[region_cluster_labels.index], opacity=.08
)

In [None]:
from sidecar import Sidecar
# Show the map in a side panel titled 'Final Clusters'.
sc = Sidecar(title='Final Clusters')
m = lonboard.Map(layer, basemap_style=lonboard.basemap.CartoBasemap.Positron)
with sc:
    display(m)

In [None]:
from core.cluster_validation import get_color
# Colour the polygons by cluster id.
# NOTE(review): assumes `clusters` has one entry per layer row, in row order — confirm.
layer.get_fill_color = get_color(clusters)

In [None]:
## try other cutoffs and repaint the map
cutoff = 5

# Cluster id of every morphotope (one entry per row of `component_data`) for this cut.
# It gets its own name so that `clusters` only ever means the per-cell array below.
morphotope_clusters = fcluster(linkage_matrix, t=cutoff, criterion='distance')
# A bare expression in the middle of a cell is never shown, so display it explicitly.
display(np.unique(morphotope_clusters))

# assign the new cluster labels to the original tessellation cells for visualisation
morh_clusters = region_cluster_labels.values
cluster_of_morphotope = pd.Series(morphotope_clusters, index=component_data.index.values)
clusters = cluster_of_morphotope.loc[morh_clusters].values

layer.get_fill_color = get_color(clusters)

In [None]:
### Dissolve and save the final urban fabric cluster geometries and the morphotope component data

In [None]:
# Attach the final cluster id to every clustered cell.
labelled_cells = tessellation.loc[region_cluster_labels.index]
labelled_cells['label'] = clusters

# Merge the cells of each cluster into one geometry, simplify it with a tolerance of 1,
# and return a frame with the label as a regular column next to 'geometry'.
dissolved = labelled_cells.dissolve('label').simplify(1).to_frame()
dissolved.columns = ['geometry']
clrs_geometry = dissolved.set_geometry('geometry').reset_index()

In [None]:
# Write the dissolved cluster geometries; the file name encodes the run parameters and the cutoff.
# Unlike the morphotope files written by this notebook, the name does not include `lag_type`.
clrs_geometry.to_parquet(f'../data/clusters_umap_{region_id}_{min_cluster_size}_{spatial_lag}_{kernel}_{umap_metric}_{final_linkage}_{final_metric}_{cutoff}.pq')

In [None]:
# morphotope aggregated data
# NOTE(review): when the cells are run in order, `component_data` has by now been
# standardised, stripped of zero-variance columns and had its NaNs replaced — confirm
# that this is the table meant to be saved, rather than the raw aggregates.
component_data.to_parquet(f'../data/morphotopes_data_{region_id}_{min_cluster_size}_{spatial_lag}_{lag_type}_{kernel}.pq')