In [8]:
%%time

import glob

import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler

from fast_hdbscan.numba_kdtree import kdtree_to_numba
from fast_hdbscan.numba_kdtree import parallel_tree_query
from sklearn.neighbors import KDTree

CPU times: user 37 μs, sys: 2 μs, total: 39 μs
Wall time: 40.8 μs


In [9]:
# Identifier of the region whose tessellation, characters and graph files are read below.
region_id = 69300

# Root folders of the processed inputs. Later cells append file names with `+`,
# so every path has to keep its trailing slash.
tessellations_dir = '/data/uscuni-ulce/processed_data/tessellations/'
chars_dir = "/data/uscuni-ulce/processed_data/chars/"
graph_dir = "/data/uscuni-ulce/processed_data/neigh_graphs/"

In [10]:
graph = read_parquet(graph_dir + f"tessellation_graph_{region_id}_knn1.parquet")

In [11]:
graph.cardinalities.describe()

count    304554.000000
mean          6.751085
std           2.060782
min           1.000000
25%           6.000000
50%           7.000000
75%           8.000000
max          82.000000
Name: cardinalities, dtype: float64

In [12]:
# graph2 = graph.higher_order(lower_order=True, k=2, diagonal=True)
# graph2.cardinalities.describe()

In [13]:
# graph3 = graph.higher_order(lower_order=True, k=3, diagonal=True)
# graph3

In [14]:
graph.component_labels.value_counts()

component labels
25     176892
24      19069
28       3958
69       3867
99       3368
        ...  
766         1
798         1
799         1
800         1
814         1
Name: count, Length: 815, dtype: int64

In [15]:
from core.cluster_validation import print_distance, generate_neigbhourhood_groups

In [16]:
tessellation = gpd.read_parquet(
        tessellations_dir + f"tessellation_{region_id}.parquet"
)

In [17]:
# Lag order selecting which pre-computed context-character file is read.
spatial_lag = 3

# Primary characters of the region; only rows with a non-negative index are kept.
primary_chars = pd.read_parquet(chars_dir + f'primary_chars_{region_id}.parquet')
primary_chars = primary_chars.loc[primary_chars.index >= 0]

# Unprocessed context characters for the same region and lag order.
# Alternative (processed) input:
# lag = pd.read_parquet(f'/data/uscuni-ulce/processed_data/context_data/context_chars_{region_id}_lag_{spatial_lag}.parquet')
lag = pd.read_parquet(
    f'/data/uscuni-ulce/processed_data/context_data/unprocessed_context_chars_{region_id}_lag_{spatial_lag}.parquet'
)

# Inner join: keep only the rows present in both tables.
# Alternative (median columns only):
# X_train = X_train.join(lag[[c for c in lag.columns if '_median' in c]], how='inner')
X_train = primary_chars.join(lag, how='inner')


In [18]:
# Standardise every column, then replace NaNs left in the scaled values with 0
# (np.nan_to_num also maps infinities to large finite numbers). Done in one pass
# so no intermediate DataFrame is built, mirroring the Prague-data cell below.
vals = np.nan_to_num(StandardScaler().fit_transform(X_train))
X_train = pd.DataFrame(vals, columns=X_train.columns, index=X_train.index)


# X_train = X_train.clip(-10, 10)

In [19]:
X_train.shape

(299064, 248)

In [20]:
# NOTE: this reassigns `X_train`; the region data assembled in the earlier cells is
# discarded and everything below works on the old Prague dataset instead.
X_train = pd.read_parquet('../data/old_prague_data/old_prague_data.parquet')
# Index the rows by `uID` and sort them by that index.
X_train = X_train.set_index('uID').sort_index()


# Standardise every column, then replace NaNs left in the scaled values with 0.
vals = np.nan_to_num(StandardScaler().fit_transform(X_train))
X_train = pd.DataFrame(vals, columns=X_train.columns, index=X_train.index)
# X_train = X_train.clip(-10, 10)



In [21]:
from libpysal.graph import Graph

# Prague tessellation cells, indexed and sorted by `uID`, reprojected to EPSG:3035.
# This replaces the `tessellation` loaded for the region earlier in the notebook.
tessellation = (
    gpd.read_file('../data/old_prague_data/prg_geometry.gpkg', layer='tessellation')
    .set_index('uID')
    .sort_index()
    .to_crs(epsg=3035)
)

# Contiguity graph rebuilt from the geometries, using a small buffer tolerance;
# this replaces the `graph` read from parquet earlier.
graph = Graph.build_fuzzy_contiguity(tessellation, buffer=.001)

In [31]:
# Group label for the tessellation cells that also have a row in X_train.
in_training = tessellation.index.isin(X_train.index)
tess_groups = generate_neigbhourhood_groups(tessellation[in_training], buffer=200)
tess_groups = tess_groups[tess_groups.index.isin(X_train.index)]

# Positional (iloc) row of every labelled cell inside X_train, so that arrays
# aligned with X_train's row order can be sliced down to the labelled cells.
row_positions = pd.Series(np.arange(len(X_train)), index=X_train.index)
tess_groups_ilocs = row_positions.loc[tess_groups.index].values

In [32]:
neighbourhoods = X_train.loc[tess_groups.index].groupby(tess_groups.values).mean()
print_distance(neighbourhoods, metric='euclidean')

Unnamed: 0,holyne,housing estate,josefov,karlin,mala strana,malesice,nusle,stare mesto,vinohrady
holyne,0.0,15.763196,23.477808,21.249741,32.822463,39.467277,15.415362,30.016826,21.632481
housing estate,15.763196,0.0,18.441437,18.438411,30.078778,37.963489,11.492449,26.42099,18.316329
josefov,23.477808,18.441437,0.0,18.926597,26.823233,44.633596,16.793108,18.867431,14.566036
karlin,21.249741,18.438411,18.926597,0.0,26.741073,36.398416,17.541798,22.532061,16.099393
mala strana,32.822463,30.078778,26.823233,26.741073,0.0,46.67641,27.125837,14.9563,31.536093
malesice,39.467277,37.963489,44.633596,36.398416,46.67641,0.0,36.855085,46.884005,43.836244
nusle,15.415362,11.492449,16.793108,17.541798,27.125837,36.855085,0.0,23.659463,18.981047
stare mesto,30.016826,26.42099,18.867431,22.532061,14.9563,46.884005,23.659463,0.0,23.308281
vinohrady,21.632481,18.316329,14.566036,16.099393,31.536093,43.836244,18.981047,23.308281,0.0


In [33]:
from scipy.spatial.distance import pdist

In [34]:
# Median pairwise distance (pdist's default metric) between the rows of each group.
labelled_rows = X_train.loc[tess_groups.index]
for group_name, members in labelled_rows.groupby(tess_groups.values):
    print(group_name, np.median(pdist(members)))

holyne 16.06333030931265
housing estate 14.699064820261995
josefov 18.872078733066747
karlin 15.524820487991727
mala strana 27.194530697517518
malesice 17.874748385021295
nusle 15.404327013238294
stare mesto 23.32865195772304
vinohrady 11.416256476413336


In [36]:
# tessellation.loc[tess_groups.index].explore(column=tess_groups.values, categorical=True)

In [None]:
newgraph = graph.subgraph(X_train.index.values)

In [None]:
newgraph.component_labels.value_counts()

In [None]:
main_comp = 0

In [None]:
import lonboard
from sidecar import Sidecar
from libpysal.graph import Graph
from lonboard.colormap import apply_continuous_cmap
from core.cluster_validation import get_color

In [None]:
plotting = tessellation.loc[newgraph.component_labels[newgraph.component_labels == main_comp].index]

In [None]:
%%time
layer = lonboard.PolygonLayer.from_geopandas(plotting, opacity=.15)

Create a Sidecar view (assumes JupyterLab) for a more comfortable experience.

In [None]:


sc = Sidecar(title='buildings')

Create a Map object

In [None]:
m = lonboard.Map(layer, basemap_style=lonboard.basemap.CartoBasemap.Positron)

Display the map within the Sidecar plugin.

In [None]:
with sc:
    display(m)

In [None]:
# Ids of the cells that belong to the selected connected component.
main_members = newgraph.component_labels[newgraph.component_labels == main_comp].index
plot_data = X_train.loc[main_members]

# Order-3 neighbourhood graph (lower orders and the diagonal included),
# restricted to the cells of that component.
order3_graph = newgraph.higher_order(k=3, lower_order=True, diagonal=True)
plot_graph = order3_graph.subgraph(plot_data.index)

In [None]:
# Use the public `adjacency` property instead of the private `_adjacency` attribute.
# Its MultiIndex holds the two endpoints of each edge: level 0 is the focal id,
# level 1 the neighbour id.
adjacency_index = plot_graph.adjacency.index
focals = adjacency_index.get_level_values(0).values
neighbours = adjacency_index.get_level_values(1).values

# Feature vectors at both ends of every edge, row-aligned with focals / neighbours.
fvals = plot_data.loc[focals].values
nvals = plot_data.loc[neighbours].values

In [None]:
# Euclidean distance between the two endpoints of every edge.
# np.square is used instead of np.pow: the `np.pow` alias only exists in NumPy >= 2.0,
# so the original expression raises AttributeError on NumPy 1.x.
distances = np.sqrt(np.sum(np.square(fvals - nvals), axis=1))

In [None]:
# %%time
# from scipy.spatial.distance import cosine
# distances = [cosine(fvals[i], nvals[i]) for i in range(fvals.shape[0])]
# distances = np.array(distances)

In [None]:
pd.Series(distances).describe().iloc[1:]

In [None]:
threshold = 8


In [None]:
# Evaluate the threshold condition once and reuse the mask for all three arrays.
within_threshold = distances <= threshold
new_focals = focals[within_threshold]
new_neighbours = neighbours[within_threshold]
new_distances = distances[within_threshold]

# Graph made only of the edges whose endpoints are at most `threshold` apart.
subgraph = Graph.from_arrays(new_focals, new_neighbours, new_distances)

In [None]:
# tessellation.loc[subgraph.component_labels[subgraph.component_labels == 78865].index].explore()

In [None]:
# Number of largest connected components that keep their own code.
ntop = 100

# Component sizes, largest first.
top_components = subgraph.component_labels.value_counts()

# Explicit positional slice; yields fewer than `ntop` labels when the graph has
# fewer components.
top_labels = top_components.iloc[:ntop].index.values

# Component label -> rank code (0 for the largest component, 1 for the next, ...).
repl_dict = {label: code for code, label in enumerate(top_labels.tolist())}

# A single lookup recodes the top components; every other component becomes -1.
# Unlike masking followed by `replace`, this cannot be confused by component labels
# that coincide with the new codes.
labels = subgraph.component_labels.map(repl_dict).fillna(-1).astype(int)

In [None]:
layer.get_fill_color = get_color(labels)

### Hierarchical clustering

In [None]:
# embedding = X_train[[c for c in X_train.columns if '_median' in c]]
embedding = X_train

In [None]:
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import fcluster
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import davies_bouldin_score

In [None]:
clustering_graph = graph.subgraph(X_train.index.values).transform('B').sparse

In [None]:
# Ward-linkage agglomerative clustering, with `clustering_graph` passed as the
# connectivity constraint. `compute_distances=True` is required because
# `get_linkage_matrix` below reads `model.distances_`.
clusterer = AgglomerativeClustering(linkage='ward',
                                    connectivity=clustering_graph, 
                                    compute_full_tree=True, compute_distances=True)

In [None]:
%%time
model = clusterer.fit(embedding)

In [None]:
def get_linkage_matrix(model):
    """Build a SciPy-style linkage matrix from a fitted agglomerative model.

    Parameters
    ----------
    model
        Fitted estimator exposing ``children_`` (one row of two child node ids
        per merge), ``distances_`` (one value per merge) and ``labels_`` (one
        entry per sample).

    Returns
    -------
    numpy.ndarray
        Float array with one row per merge and four columns: the two child
        ids, the merge distance, and the number of samples under the new node.
    """
    n_leaves = len(model.labels_)
    n_merges = model.children_.shape[0]

    # subtree_sizes[k] = number of original samples below the node made by merge k.
    subtree_sizes = np.zeros(n_merges)
    for merge_no in range(n_merges):
        # Ids below n_leaves are single samples; larger ids refer to the node
        # created by merge (id - n_leaves), whose size is already known.
        subtree_sizes[merge_no] = sum(
            1 if node < n_leaves else subtree_sizes[node - n_leaves]
            for node in model.children_[merge_no]
        )

    stacked = np.column_stack((model.children_, model.distances_, subtree_sizes))
    return stacked.astype(float)

In [None]:
linkage_matrix = get_linkage_matrix(model)

In [None]:
# `fig` (previously misspelled `fix`) is the Figure, `ax` the Axes the tree is drawn on.
fig, ax = plt.subplots(figsize=(40, 40))
# Plot the dendrogram truncated to the top 5 levels of the merge hierarchy.
_ = dendrogram(linkage_matrix, truncate_mode="level", p=5, ax=ax)

In [None]:
%%time
import lonboard
plotting = tessellation[tessellation.index.isin(X_train.index)].copy()
layer = lonboard.PolygonLayer.from_geopandas(plotting, opacity=.15)

In [None]:
from sidecar import Sidecar
sc = Sidecar(title='Clusters')
m = lonboard.Map(layer, basemap_style=lonboard.basemap.CartoBasemap.Positron)
with sc:
    display(m)

In [None]:
from core.cluster_validation import get_color

In [None]:
# Sweep the cut height and print, for each value, the agreement with the labelled
# groups (adjusted Rand index) and the Davies-Bouldin score of the partition.
# Disabled extra metric, kept for reference:
#   r = pd.Series(r, index=X_train.index)
#   ssplits = graph.describe(r, statistics=['nunique'])['nunique']
#   (ssplits > 1).sum() / ssplits.shape[0]
for t in range(100, 1250, 100):
    r = fcluster(linkage_matrix, t=t, criterion='distance')
    ari = adjusted_rand_score(tess_groups.values, r[tess_groups_ilocs])

    # davies_bouldin_score raises ValueError unless 1 < n_clusters < n_samples,
    # e.g. when the cut merges everything into one cluster; report NaN
    # instead of aborting the sweep.
    n_clusters = np.unique(r).size
    dbi = davies_bouldin_score(X_train, r) if 1 < n_clusters < len(r) else np.nan

    print(t, ' - ', ari, dbi)

In [None]:
clusters = fcluster(linkage_matrix, t=350, criterion='distance')

In [None]:
np.unique(clusters, return_counts=True)

In [None]:
layer.get_fill_color = get_color(clusters)

In [None]:
sklearn_tree = KDTree(X_train.values)
numba_tree = kdtree_to_numba(sklearn_tree)

In [None]:
from core.cluster_validation import generate_neigbhourhood_groups

In [None]:
groups = generate_neigbhourhood_groups(tessellation)

In [None]:
karlins = groups[groups == 'karlin'].index.values

In [None]:
tess_id = 261793

In [None]:
# tessellation.loc[graph[tess_id].index.values].explore()

In [None]:
graph[tess_id].index.values

In [None]:
# Positional rows in X_train of the graph neighbours of `tess_id`.
is_graph_neighbour = X_train.index.isin(graph[tess_id].index.values)
xilocs = np.flatnonzero(is_graph_neighbour)

In [None]:
num_neighbours = 15

In [None]:
nn_dists, nn_inds = parallel_tree_query(numba_tree, np.ascontiguousarray(X_train.iloc[xilocs].values), k=num_neighbours)

In [None]:
# Take row 3 of the k-NN query result, presumably the nearest neighbours found for
# the 4th query point (`X_train.iloc[xilocs[3]]`).
# NOTE(review): the row index 3 is hard-coded; confirm that this is the intended focal
# cell, since later cells compare these neighbours with `graph[tess_id]`.
neigh_ids = nn_inds[3]
neigh_dists = nn_dists[3]

In [None]:
neighbours = tessellation.loc[X_train.iloc[neigh_ids].index].reset_index()
neighbours['nn_dists'] = neigh_dists
neighbours['nn_order'] = np.arange(num_neighbours)

In [None]:
# Ids returned by the feature-space k-NN query vs. ids adjacent to `tess_id` in the graph.
knn_ids = X_train.iloc[neigh_ids].index
graph_ids = graph[tess_id].index.values

intersection = np.intersect1d(knn_ids, graph_ids)
union = np.union1d(knn_ids, graph_ids)

# First ratio: share of the graph neighbours recovered by the k-NN query.
# Second ratio: shared ids over all ids in either set.
print('graph intersection: ', intersection.shape[0] / graph_ids.shape[0],
      'total intersection: ', intersection.shape[0]/union.shape[0])

In [None]:
from scipy.spatial.distance import pdist, cdist

In [None]:
pdist(X_train.iloc[neigh_ids].values)

In [None]:
# m = neighbours.explore()
# m = tessellation.loc[[tess_id]].explore(color='red', m=m)
# m