In [1]:
%%time

import glob

import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler

from sklearn.neighbors import KDTree

from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import fcluster
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import davies_bouldin_score
from core.cluster_validation import get_linkage_matrix

# try hdbscan extraction
from fast_hdbscan.boruvka import parallel_boruvka
from fast_hdbscan.cluster_trees import (
    cluster_tree_from_condensed_tree,
    condense_tree,
    extract_eom_clusters,
    get_cluster_label_vector,
    mst_to_linkage_tree,
)
from fast_hdbscan.numba_kdtree import kdtree_to_numba
from sklearn.neighbors import KDTree

CPU times: user 11.3 s, sys: 402 ms, total: 11.7 s
Wall time: 9.37 s


In [2]:
def preprocess_clustering_data(X_train):
    """Prepare a table of characters for clustering.

    Steps, in order:

    1. keep only rows whose index value is ``>= 0``;
    2. drop every column whose name contains one of the excluded character
       codes (literal substring match, so any column embedding a code in its
       name is removed as well);
    3. standardise the remaining columns with ``StandardScaler``;
    4. replace NaN values with finite numbers via ``np.nan_to_num``;
    5. drop columns whose standard deviation is exactly 0 after scaling.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Numeric columns with string labels; the index must support ``>= 0``.

    Returns
    -------
    pandas.DataFrame
        Scaled, NaN-free frame carrying the index of the filtered input.
    """
    X_train = X_train[X_train.index >= 0]

    # Character codes excluded from clustering.
    to_drop = [
        'stcSAl',
        'ltkOri',
        'stbOri',
        'stcOri',
        'stbCeA',
        # Candidates that were tried earlier and are currently NOT excluded:
        #   not in barcelona: 'ltcBuA', 'midRea', 'midAre', 'likWBB'
        #   'sdbPer', 'sdbCoA', 'ssbCCM', 'ltbIBD', 'sdcLAL', 'sdcAre',
        #   'sscERI', 'mtcWNe', 'mdcAre', 'ltcWRB', 'sicCAR', 'mtdDeg',
        #   'ssbCCo', 'ssbCor', 'ssbERI', 'ssbElo'
    ]

    # Build one boolean mask over the columns. regex=False makes the codes
    # plain substrings, so a code can never be misread as a pattern.
    columns = X_train.columns
    excluded = np.zeros(len(columns), dtype=bool)
    for code in to_drop:
        excluded |= np.asarray(columns.str.contains(code, regex=False), dtype=bool)
    X_train = X_train.drop(columns=columns[excluded])

    # Scale, then neutralise NaNs, and rebuild the labelled frame once.
    scaled = np.nan_to_num(StandardScaler().fit_transform(X_train))
    X_train = pd.DataFrame(scaled, columns=X_train.columns, index=X_train.index)

    # Columns without any variation carry no information for clustering.
    # `std()` is the same statistic `describe()` reports, without the quantiles.
    constant = X_train.std() == 0
    return X_train.drop(columns=constant[constant].index)

def get_tree(training_data, clustering_graph, linkage, metric):
    """Fit a connectivity-constrained agglomerative model and return its linkage matrix.

    Parameters
    ----------
    training_data : array-like
        Observations passed to ``AgglomerativeClustering.fit``.
    clustering_graph : sparse matrix or None
        Forwarded as ``connectivity``.
    linkage : str
        Forwarded as ``linkage``.
    metric : str
        Forwarded as ``metric``.

    Returns
    -------
    The value produced by ``get_linkage_matrix`` for the fitted model.
    """
    # The full tree and the merge distances are requested explicitly;
    # presumably `get_linkage_matrix` needs both - confirm in core.cluster_validation.
    fitted = AgglomerativeClustering(
        linkage=linkage,
        connectivity=clustering_graph,
        metric=metric,
        compute_full_tree=True,
        compute_distances=True,
    ).fit(training_data)
    return get_linkage_matrix(fitted)

def get_eom_clusters(linkage_matrix, min_cluster_size):
    """Extract flat cluster labels from a linkage tree with fast_hdbscan's EOM selection.

    The linkage tree is condensed with ``min_cluster_size``, clusters are
    selected with ``extract_eom_clusters`` (a single all-encompassing cluster
    is not allowed), and the selection is turned into a label vector.

    Parameters
    ----------
    linkage_matrix
        Linkage tree accepted by ``condense_tree``.
    min_cluster_size : int
        Forwarded to ``condense_tree``.

    Returns
    -------
    The label vector returned by ``get_cluster_label_vector``.
    """
    condensed = condense_tree(linkage_matrix, min_cluster_size=min_cluster_size)
    selected = extract_eom_clusters(
        condensed,
        cluster_tree_from_condensed_tree(condensed),
        allow_single_cluster=False,
    )
    # Third positional argument is 0 - presumably the cluster selection
    # epsilon; confirm against the installed fast_hdbscan version.
    return get_cluster_label_vector(condensed, selected, 0)

In [3]:
# Region to process; used to build every input file name below.
region_id = 69300

# Single root for the processed inputs, so relocating the data is a
# one-line change instead of an edit to every path.
processed_data_dir = '/data/uscuni-ulce/processed_data/'

tessellations_dir = processed_data_dir + 'tessellations/'
chars_dir = processed_data_dir + 'chars/'
graph_dir = processed_data_dir + 'neigh_graphs/'

In [4]:
# region_id = 'freiburg'
# buildings_dir = streets_dir = enclosures_dir = tessellations_dir = graph_dir = '../data/freiburg/'
# chars_dir = '../data/freiburg/chars/'

In [5]:
# Order of the spatial lag whose context characters are loaded.
spatial_lag = 3
lag_path = f'/data/uscuni-ulce/processed_data/context_data/unprocessed_context_chars_{region_id}_lag_{spatial_lag}.parquet'

# Primary characters of the region.
X_train = pd.read_parquet(chars_dir + f'primary_chars_{region_id}.parquet')

# Context characters: keep only the columns whose name contains '_median'.
lag = pd.read_parquet(lag_path)
median_columns = [name for name in lag.columns if '_median' in name]
lag = lag[median_columns]

# Inner join on the index: rows missing from either table are dropped.
X_train = X_train.join(lag, how='inner')

# X_train = lag

In [6]:
X_train.shape

(304554, 126)

In [7]:
graph = read_parquet(graph_dir + f"tessellation_graph_{region_id}_knn1.parquet")

In [8]:
# Passed to `get_eom_clusters`; the per-component loop also uses it as the
# size at or below which a component is not clustered at all.
min_cluster_size = 100
# Linkage criterion and distance metric forwarded to `get_tree`.
linkage = 'ward'
metric = 'euclidean'

In [9]:
# label building input data, could work with empty tess as well
# Restrict the graph to nodes with non-negative ids (negative ids presumably
# mark empty tessellation cells - confirm), then label connected components.
building_graph = graph.subgraph(graph.unique_ids[graph.unique_ids >= 0])
labels = building_graph.component_labels


In [10]:
%%time

# Cluster labels per connected component, keyed by component label.
results = {}

for label, group in labels.groupby(labels):
    n_members = group.shape[0]

    # Components no larger than `min_cluster_size` are not clustered:
    # every member receives the same constant label.
    if n_members <= min_cluster_size:
        results[label] = np.ones(n_members)
        continue

    member_ids = group.index.values

    # Scale the characters within the component, build its binary adjacency
    # matrix, grow the constrained tree and extract the EOM clusters.
    component_buildings_data = preprocess_clustering_data(X_train.loc[member_ids])
    component_graph = building_graph.subgraph(member_ids).transform('B').sparse
    ward_tree = get_tree(component_buildings_data, component_graph, linkage, metric)
    component_clusters = get_eom_clusters(ward_tree, min_cluster_size)
    results[label] = component_clusters

CPU times: user 22.2 s, sys: 95.6 ms, total: 22.3 s
Wall time: 22.3 s


In [11]:
tessellation = gpd.read_parquet(
        tessellations_dir + f"tessellation_{region_id}.parquet"
)

In [12]:
labels.groupby(labels).size().sort_values(ascending=False)

component labels
776    60555
452    25177
754    17243
99     15583
726    13382
       ...  
247        1
727        1
249        1
250        1
161        1
Name: component labels, Length: 919, dtype: int64

In [13]:
# Component inspected in the rest of the notebook; 776 is the largest
# component in the size table printed above.
label = 776
labels.groupby(labels).get_group(label).shape

(60555,)

In [14]:
# tessellation.loc[labels.groupby(labels).get_group(label).index.values].explore(column=results[label], categorical=True)

In [15]:
# Tessellation cells of the selected component with their cluster labels.
# `assign` returns a new frame, so the label column is never written onto a
# selection of `tessellation` (avoids pandas' SettingWithCopyWarning).
component_ids = labels.groupby(labels).get_group(label).index.values
plotting = tessellation.loc[component_ids].assign(label=results[label])

In [17]:
# layer.get_fill_color = get_color(plotting['label'].values)

In [18]:
plotting['label'].nunique()

286

In [19]:
%%time
# NOTE(review): import sits mid-notebook; consider moving it to the import cell.
import lonboard
# plotting = tessellation[tessellation.index.isin(X_train.index)].copy()
# Polygon layer for the selected component; colours are assigned later.
layer = lonboard.PolygonLayer.from_geopandas(plotting, opacity=.08)



CPU times: user 1.12 s, sys: 132 ms, total: 1.25 s
Wall time: 1.25 s


In [20]:
# NOTE(review): imports sit mid-notebook; consider moving them to the import cell.
from sidecar import Sidecar
# Render the map inside a sidecar panel titled 'Clusters'.
sc = Sidecar(title='Clusters')
m = lonboard.Map(layer, basemap_style=lonboard.basemap.CartoBasemap.Positron)
with sc:
    display(m)

from core.cluster_validation import get_color
# Colour the layer by cluster label after the map has been displayed.
layer.get_fill_color = get_color(plotting['label'].values)

In [21]:
def percentile(n):
    """Build a callable that returns the ``n``-th percentile of its argument.

    The callable is renamed to ``percentile_<n>`` so that several of them,
    built for different ``n``, carry distinct names (the notebook passes more
    than one to a single ``agg`` call).

    Parameters
    ----------
    n : number
        Percentile in the range accepted by ``np.percentile``.

    Returns
    -------
    callable
        ``f(x) -> np.percentile(x, n)`` with ``f.__name__ == 'percentile_<n>'``.
    """
    def compute(x):
        return np.percentile(x, n)

    compute.__name__ = 'percentile_%s' % n
    return compute

In [22]:
# Rows of X_train (not rescaled) for every member of the selected component.
component_data = X_train.loc[labels.groupby(labels).get_group(label).index.values]

In [23]:
# component_data = component_data.groupby(results[label]).median()

In [24]:
# Summarise every column per first-level cluster: 10th percentile, median and
# 90th percentile.
# NOTE(review): rebinds `component_data`, so this cell cannot be re-run
# without first re-running the cell that selects the component rows.
component_data = component_data.groupby(results[label]).agg([percentile(10), 'median', percentile(90)])

In [25]:
# Drop the summary rows of negative cluster labels (presumably the -1 noise
# label - confirm against get_cluster_label_vector).
component_data = component_data[component_data.index >= 0]

In [26]:
# Replace NaN summaries with finite numbers and rebuild the labelled frame.
vals = np.nan_to_num(component_data)
component_data = pd.DataFrame(vals, columns=component_data.columns, index=component_data.index)

In [None]:
# Second-level clustering: Ward linkage on the per-cluster summaries.
# Unlike `get_tree`, no connectivity graph or metric is passed here.
clusterer = AgglomerativeClustering(linkage='ward',
                                    compute_full_tree=True,
                                    compute_distances=True)
model = clusterer.fit(component_data)
linkage_matrix = get_linkage_matrix(model)

In [None]:
# Dendrogram of the second-level tree; the return value is bound to `_`
# only to keep the dendrogram dict out of the cell output.
fig,ax = plt.subplots(figsize=(20,20))
_ = dendrogram(linkage_matrix, ax=ax)

In [None]:
# Cut the second-level tree at a fixed distance.
# NOTE(review): 5e5 is an unexplained threshold, presumably read off the
# dendrogram above - confirm and move to a named constant.
clusters = fcluster(linkage_matrix, t=5e5, criterion='distance')

In [None]:
clusters

In [None]:
# Lookup from first-level cluster label to second-level cluster id.
cluster_lookup = pd.Series(clusters, index=component_data.index.values)
# Label -1 was removed before the second-level fit; map it to -1 again.
# `.loc` makes it explicit that -1 is an index label, not the last position.
cluster_lookup.loc[-1] = -1
# Translate every member's first-level label into its second-level cluster.
# NOTE(review): `clusters` changes meaning here (per summary row -> per
# member), so this cell is not re-runnable without re-running `fcluster`.
clusters = cluster_lookup.loc[results[label]].values

In [None]:
layer.get_fill_color = get_color(clusters)

In [146]:
from clustergram import Clustergram
from sklearn.mixture import GaussianMixture

In [150]:
# Start again from the X_train rows of the selected component.
# NOTE(review): the recorded output reports 63 columns while the join near
# the top of the notebook reported 126 - X_train was presumably rebound
# out of order; confirm with Restart & Run All.
component_data = X_train.loc[labels.groupby(labels).get_group(label).index.values]
component_data.shape

(60555, 63)

In [151]:
# Scale and clean the component's characters with the shared preprocessing.
# NOTE(review): rebinds `component_data`; re-running this cell alone
# preprocesses already-preprocessed data.
component_data = preprocess_clustering_data(component_data)
component_data.shape

(60555, 54)

In [None]:
%%time
# Gaussian mixture with 10 full-covariance components; 100 restarts, each
# limited to 200 iterations, with a fixed seed.
gmm = GaussianMixture(
    n_components=10,
    covariance_type="full",
    max_iter=200,
    n_init=100,
    random_state=42,
)
fitted = gmm.fit(component_data)
clusters = fitted.predict(component_data)

# Persist the labels so the fit does not have to be repeated.
gmm_labels = pd.Series(clusters, index=component_data.index)
gmm_labels.to_csv('../data/gmm_10_clusters.csv')

In [659]:
# Reload the cached GMM labels. `index_col=0` takes the first CSV column as
# the index directly, instead of relying on the 'Unnamed: 0' header that
# pandas invents for an unnamed index column; the index comes back unnamed.
clusters = pd.read_csv('../data/gmm_10_clusters.csv', index_col=0)