In [1]:
%%time

import glob

import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler

from sklearn.neighbors import KDTree

from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import fcluster
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import davies_bouldin_score
from core.cluster_validation import get_linkage_matrix

from fast_hdbscan.cluster_trees import (
    cluster_tree_from_condensed_tree,
    condense_tree,
    extract_eom_clusters,
    extract_leaves,
    get_cluster_label_vector,
    mst_to_linkage_tree,
)
from sklearn.neighbors import KDTree
import shapely
from core.cluster_validation import generate_detailed_clusters
from core.generate_context import spatially_weighted_partial_lag
from core.generate_clusters import cluster_data

CPU times: user 11.4 s, sys: 354 ms, total: 11.7 s
Wall time: 9.32 s


In [2]:
# Identifier of the region whose files are loaded further down.
region_id = 69300

# Locations of the processed inputs/outputs on the shared data volume.
chars_dir = "/data/uscuni-ulce/processed_data/chars/"
graph_dir = "/data/uscuni-ulce/processed_data/neigh_graphs/"
tessellations_dir = "/data/uscuni-ulce/processed_data/tessellations/"
cluster_dir = "/data/uscuni-ulce/processed_data/clusters/"

# Validation morphotopes used as the reference delineation.
val_path = "../data/prague_validation/morphotopes.pq"

In [3]:
# Configuration for the Freiburg sample. Running this cell re-binds region_id
# and the directory names, replacing any values assigned to them earlier.
region_id = "freiburg"

# Buildings, streets, enclosures, tessellations and the neighbourhood graph
# all point at the same folder for this sample.
graph_dir = "../data/freiburg/"
tessellations_dir = "../data/freiburg/"
enclosures_dir = "../data/freiburg/"
streets_dir = "../data/freiburg/"
buildings_dir = "../data/freiburg/"

chars_dir = "../data/freiburg/chars/"
cluster_dir = "/data/uscuni-ulce/processed_data/clusters/"

# Validation clusters used as the reference delineation.
val_path = "../data/fbg_cluster_validation.pq"

In [4]:
# Reference delineation the clusterings are compared against.
morphotopes = gpd.read_parquet(val_path)

# Tessellation geometries for the selected region.
tessellation = gpd.read_parquet(f"{tessellations_dir}tessellation_{region_id}.parquet")

# Neighbourhood graph over the tessellation (the "knn1" variant of the file).
graph = read_parquet(f"{graph_dir}tessellation_graph_{region_id}_knn1.parquet")

# Primary characters table; used as the clustering input below.
X_train = pd.read_parquet(f"{chars_dir}primary_chars_{region_id}.parquet")

In [5]:
def boundary_distance_metric(tessellation, clusters, morphotopes, segmentation_distance=10, max_distance=500):
    """Summarise how far morphotope outlines lie from the nearest cluster boundary.

    The tessellation is dissolved by ``clusters`` and the outline of every
    dissolved cluster is extracted. Each morphotope geometry is densified with
    ``segmentize`` and exploded into its vertices; for every vertex the distance
    to the nearest cluster outline is looked up through the spatial index. The
    per-vertex distances are then described per morphotope.

    Parameters
    ----------
    tessellation : geopandas.GeoDataFrame
        Geometries that carry the cluster labels.
    clusters : array-like or pandas.Series
        Grouping passed to ``tessellation.dissolve``.
    morphotopes : geopandas.GeoDataFrame or geopandas.GeoSeries
        Reference geometries whose outlines are compared with the cluster
        outlines.
    segmentation_distance : float, default 10
        Maximum spacing between consecutive vertices after densification
        (presumably in CRS units -- TODO confirm).
    max_distance : float, default 500
        Search radius handed to ``sindex.nearest``. Vertices with no cluster
        outline within this radius get a NaN distance and are therefore ignored
        by the summary statistics.

    Returns
    -------
    geopandas.GeoDataFrame
        ``describe()`` statistics of the vertex distances, one row per first
        index level of the vertex table, with ``morphotopes.geometry`` attached
        as geometry.
    """
    cluster_boundaries = tessellation.dissolve(clusters)
    # A tiny buffer is applied before taking the outline (kept from the original
    # code; presumably it cleans up dissolve artefacts -- TODO confirm).
    boundaries = cluster_boundaries.buffer(1e-6).boundary

    # Densify the morphotope geometries and turn every vertex into a point.
    coords = morphotopes.segmentize(segmentation_distance).get_coordinates(index_parts=True)
    morphotopes_points = coords.set_geometry(gpd.points_from_xy(*coords.values.T), crs=morphotopes.crs)

    indices, dist = boundaries.sindex.nearest(
        morphotopes_points.geometry,
        return_distance=True,
        max_distance=max_distance,
        return_all=False,
    )

    # The query only reports points that found an outline within max_distance,
    # so `dist` can be shorter than the point table. Scatter the distances back
    # by the positional input indices instead of assigning them directly, which
    # would raise a length mismatch as soon as a single point is out of range.
    distance = np.full(len(morphotopes_points), np.nan)
    distance[indices[0]] = dist
    morphotopes_points["distance"] = distance

    return morphotopes_points.groupby(level=0)["distance"].describe().set_geometry(morphotopes.geometry)

In [6]:


# Reference group labels for the tessellation, restricted to the cells that
# are present in the clustering input.
tess_groups = generate_detailed_clusters(
    tessellation, path=val_path, include_random_sample=False
)
tess_groups = tess_groups.loc[tess_groups.index.isin(X_train.index)]

# Positional offsets of those cells within X_train once rows with a negative
# index are filtered out; used later to pick the matching labels with .iloc.
tess_groups_ilocs = pd.Series(
    np.arange(int((X_train.index >= 0).sum())),
    index=X_train.index[X_train.index >= 0],
).loc[tess_groups.index].to_numpy()

In [7]:
# Parameter grid explored by the search loop below.

# Values passed as `k` to the spatial lag computation.
spatial_lags = list(range(1, 6))

# Kernel names passed to the spatial lag computation.
kernels = ["gaussian", "inverse", "weights"]

# Substrings used to select lag columns by name.
lag_types = ["_median", "_iqr", "_"]

# Candidate minimum cluster sizes.
cluster_sizes = [50, 75, 100, 150, 250]

# Alternative sets of characters to drop before clustering.
chars_to_drop = [
    [],
    ["stcSAl", "stcOri"],
    ["stcSAl", "ltkOri", "stbOri", "stcOri", "stbCeA"],
]

# Clipping options handed to the clustering routine: None or a (low, high) pair.
clips = [None, (-5, 5), (-10, 10)]


# Fixed clustering settings shared by every grid combination.
linkage, metric = "ward", "euclidean"

# Coordinates of one representative point per tessellation geometry,
# passed to the spatial lag computation.
centroids = shapely.get_coordinates(
    tessellation.representative_point()
)

In [None]:
%%capture cap

# Grid search: for every parameter combination, cluster the data and score the
# result against the reference groups (adjusted Rand score) and the reference
# morphotope outlines (boundary distance metric). One row per successful run.
results = []

# The tessellation subset handed to the boundary metric does not depend on any
# grid parameter, so filter it once instead of once per combination.
tessellation_valid = tessellation[tessellation.index > -1]

for spatial_lag in spatial_lags:

    for kernel in kernels:

        # The lag depends only on (spatial_lag, kernel); it is reused by every
        # inner combination.
        lag = spatially_weighted_partial_lag(X_train, graph, centroids, kernel=kernel, k=spatial_lag, n_splits=10)

        for lag_type in lag_types:

            # Substring match on the column name: the bare '_' entry selects
            # every lag column whose name contains an underscore.
            lag_columns = [c for c in lag.columns if lag_type in c]
            clustering_data = X_train.join(lag[lag_columns], how='inner')

            for min_cluster_size in cluster_sizes:

                for to_drop in chars_to_drop:

                    for clip in clips:

                        try:
                            region_cluster_labels = cluster_data(clustering_data, graph, to_drop, clip, min_cluster_size, linkage, metric)
                            ars = adjusted_rand_score(tess_groups.values, region_cluster_labels.iloc[tess_groups_ilocs].values)
                            bdm_df = boundary_distance_metric(tessellation_valid, region_cluster_labels, morphotopes)
                            results.append([spatial_lag, kernel, lag_type, clip, min_cluster_size, to_drop, ars, bdm_df['mean'].mean(), bdm_df['std'].mean()])
                        except Exception as e:
                            # Best-effort search: a failing combination is logged and skipped.
                            # The log names every grid parameter, including `clip`, so the
                            # failed run can be identified unambiguously.
                            print(f"spatial lag: {spatial_lag}, kernel: {kernel}, lag_type: {lag_type}, min_cluster_size: {min_cluster_size}, to_drop: {to_drop}, clip: {clip}")
                            print(f"{type(e).__name__}: {e}")

In [None]:
# Turn the collected rows into a table; the column order mirrors the order in
# which the values were appended inside the search loop.
results = pd.DataFrame(
    data=results,
    columns=[
        'spatial_lag',
        'kernel',
        'lag_type',
        'clip',
        'min_cluster_size',
        'dropped_cols',
        'ars',
        'bdm_df_mean',
        'bdm_df_std',
    ],
)


In [None]:
# Persist the grid-search log for the current region.
results.to_parquet(path=f'../data/cluster_log_{region_id}.pq')

In [None]:
# results = pd.read_parquet(f'../data/cluster_log_{region_id}.pq')

In [None]:
# results[results.min_cluster_size == 100].sort_values('ars', ascending=False).iloc[:30]