In [1]:
import momepy as mm
import numpy as np
import numba
import geopandas as gpd
import pandas as pd
import shapely
from fast_hdbscan.numba_kdtree import NumbaKDTree, kdtree_to_numba, rdist, point_to_node_lower_bound_rdist
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the per-morphotope feature table. Its last column is split off as `sizes`
# (used later as per-sample weights); every other column is kept as features.
data = pd.read_parquet('../data/morphotopes_data_69333_100_3__median_gaussian.pq')
# Tuple assignment: both slices are taken from the freshly loaded frame before
# `data` is rebound to the feature-only view.
sizes, data = data.iloc[:, -1], data.iloc[:, :-1]

In [3]:
# Standardise every feature column, keeping the original labels.
vals = StandardScaler().fit_transform(data)
component_data = pd.DataFrame(vals, index=data.index, columns=data.columns)

# Discard features whose standard deviation is exactly zero after scaling.
component_data = component_data.drop(
    columns=component_data.columns[component_data.std() == 0]
)

# Optional row filter, currently disabled:
# component_data = component_data[component_data.index >= 0]

# Replace NaN (and infinite) entries with finite numbers, then re-wrap the array
# so the labels survive.
vals = np.nan_to_num(component_data)
component_data = pd.DataFrame(
    vals, index=component_data.index, columns=component_data.columns
)

In [156]:
# Minimum cluster size; the same value is also passed as `min_samples` when the
# spanning tree is built further down.
min_cluster_size = 500
# Per-sample weights: the `sizes` column split off the input table, as float32.
sample_weights = sizes.values.astype(np.float32)
# Forwarded to `extract_eom_clusters`; False skips selection starting at the root.
allow_single_cluster = False

### Weighted HDBSCAN

In [200]:
from fast_hdbscan.boruvka import parallel_boruvka
from fast_hdbscan.cluster_trees import (mst_to_linkage_tree_w_sample_weights, 
condense_tree, cluster_tree_from_condensed_tree, extract_eom_clusters,
get_cluster_label_vector, bfs_from_hierarchy, eliminate_branch, CondensedTree, score_condensed_tree_nodes, eom_recursion)
from sklearn.neighbors import KDTree

In [201]:
from fast_hdbscan import fast_hdbscan

In [202]:
# res = fast_hdbscan(component_data.values, sample_weights=sample_weights.astype(np.float32), min_samples=min_cluster_size, min_cluster_size=min_cluster_size , return_trees=True)

In [203]:
# Build a scikit-learn KD-tree over the scaled features, then convert it to the
# numba-friendly structure that `parallel_boruvka` takes as input.
sklearn_tree = KDTree(component_data.values)
numba_tree = kdtree_to_numba(sklearn_tree)

In [204]:
# Weighted spanning-tree edges from Boruvka's algorithm, with `min_cluster_size`
# doubling as `min_samples`.
edges = parallel_boruvka(numba_tree, min_samples=min_cluster_size, sample_weights=sample_weights)
# Order the edges by their third column (presumably the edge distance -- confirm).
sorted_mst = edges[edges[:, 2].argsort()]

In [205]:
# Turn the sorted spanning-tree edges into a linkage hierarchy, passing the same
# per-sample weights so merged sizes can be weighted.
linkage_tree = mst_to_linkage_tree_w_sample_weights(sorted_mst, sample_weights)

In [218]:
@numba.njit(fastmath=True)
def condense_tree(hierarchy, min_cluster_size=10, bandwidth=1, sample_weights=None):
    """Condense a linkage hierarchy, scoring merges with a Gaussian-shaped lambda.

    Nodes are visited in the order returned by ``bfs_from_hierarchy``. At every
    internal node the two children are compared against ``min_cluster_size``:
    a child that is too small is handed to ``eliminate_branch`` under the current
    cluster label, a single large child inherits the parent's label, and two
    large children are recorded as a split into two fresh labels.

    This shadows the ``condense_tree`` imported from ``fast_hdbscan.cluster_trees``;
    the difference visible here is the lambda, which is a Gaussian function of the
    merge distance instead of ``1 / d``.

    Parameters
    ----------
    hierarchy : 2-D float array
        One row per merge, unpacked as ``(left, right, distance, size)``. Ids
        below ``hierarchy.shape[0] + 1`` are treated as individual points.
    min_cluster_size : number, default 10
        Children whose (weighted) size is below this are not kept as clusters.
    bandwidth : number, default 1
        Scale dividing the merge distance inside the Gaussian. Must be non-zero.
    sample_weights : 1-D array or None
        Per-point weight used as the size of a leaf child. ``None`` means every
        point weighs 1.

    Returns
    -------
    CondensedTree
        ``CondensedTree(parents, children, lambdas, sizes)`` truncated to the rows
        actually written. Rows written for a single point keep the initial size 1.
    """
    root = 2 * hierarchy.shape[0]
    num_points = hierarchy.shape[0] + 1
    next_label = num_points + 1

    node_list = bfs_from_hierarchy(hierarchy, root, num_points)

    # relabel maps a hierarchy node id to the condensed-tree cluster label it lives in.
    relabel = np.zeros(root + 1, dtype=np.int64)
    relabel[root] = num_points

    parents = np.ones(root, dtype=np.int64)
    children = np.empty(root, dtype=np.int64)
    lambdas = np.empty(root, dtype=np.float32)
    sizes = np.ones(root, dtype=np.int64)

    ignore = np.zeros(root + 1, dtype=np.bool_) # 'bool' is no longer an attribute of 'numpy'

    if sample_weights is None:
        sample_weights = np.ones(num_points, dtype=np.float32)

    # Loop-invariant normaliser of the Gaussian lambda, and the value at d == 0.
    # NOTE(review): a standard normal density would be exp(-u**2 / 2) / sqrt(2 * pi);
    # the expression is kept exactly as written because altering it would rescale
    # the effective bandwidth -- confirm the intended kernel.
    kernel_norm = np.sqrt(2.0) * np.pi
    kernel_peak = 1.0 / kernel_norm

    idx = 0

    for node in node_list:
        if ignore[node] or node < num_points:
            continue

        parent_node = relabel[node]
        l, r, d, _ = hierarchy[node - num_points]
        left = np.int64(l)
        right = np.int64(r)
        if d > 0.0:
            # gaussian lambda value
            lambda_value = np.exp(-(((d / bandwidth) / 2) ** 2)) / kernel_norm
            # lambda_value = 1.0 / d
        else:
            # At zero distance the Gaussian takes its finite maximum. The previous
            # np.inf was the limit of 1 / d, not of this kernel, and an infinite
            # lambda makes later stability sums inf or NaN.
            lambda_value = kernel_peak

        left_count = np.int64(hierarchy[left - num_points, 3]) if left >= num_points else sample_weights[left]
        right_count = np.int64(hierarchy[right - num_points, 3]) if right >= num_points else sample_weights[right]

        # The logic here is in a strange order, but it has non-trivial performance gains ...
        # The most common case by far is a singleton on the left; and cluster on the right take care of this separately
        if left < num_points and right_count >= min_cluster_size:
            relabel[right] = parent_node
            parents[idx] = parent_node
            children[idx] = left
            lambdas[idx] = lambda_value
            idx += 1
        # Next most common is a small left cluster and a large right cluster: relabel the right node; eliminate the left branch
        elif left_count < min_cluster_size and right_count >= min_cluster_size:
            relabel[right] = parent_node
            idx = eliminate_branch(left, parent_node, lambda_value, parents, children, lambdas, sizes, idx, ignore,
                                   hierarchy, num_points)
        # Then we have a large left cluster and a small right cluster: relabel the left node; eliminate the right branch
        elif left_count >= min_cluster_size and right_count < min_cluster_size:
            relabel[left] = parent_node
            idx = eliminate_branch(right, parent_node, lambda_value, parents, children, lambdas, sizes, idx, ignore,
                                   hierarchy, num_points)
        # If both clusters are small then eliminate all branches
        elif left_count < min_cluster_size and right_count < min_cluster_size:
            idx = eliminate_branch(left, parent_node, lambda_value, parents, children, lambdas, sizes, idx, ignore,
                                   hierarchy, num_points)
            idx = eliminate_branch(right, parent_node, lambda_value, parents, children, lambdas, sizes, idx, ignore,
                                   hierarchy, num_points)
        # and finally if we actually have a legitimate cluster split, handle that correctly
        else:
            relabel[left] = next_label

            parents[idx] = parent_node
            children[idx] = next_label
            lambdas[idx] = lambda_value
            sizes[idx] = left_count
            next_label += 1
            idx += 1

            relabel[right] = next_label

            parents[idx] = parent_node
            children[idx] = next_label
            lambdas[idx] = lambda_value
            sizes[idx] = right_count
            next_label += 1
            idx += 1

    return CondensedTree(parents[:idx], children[:idx], lambdas[:idx], sizes[:idx])

In [228]:
from scipy.stats import gaussian_kde
# Fit a Gaussian KDE to the third column of the sorted spanning-tree edges and
# derive a bandwidth as (KDE covariance factor) * (standard deviation of that column).
e = gaussian_kde(sorted_mst[:, 2])
f = e.covariance_factor()
# NOTE(review): `.std()` here uses NumPy's default ddof=0 -- confirm this matches
# the spread gaussian_kde uses internally if an exact KDE bandwidth is wanted.
bw = f * sorted_mst[:, 2].std()
# NOTE(review): `bw` is only displayed; no visible cell passes it on (the
# condense_tree call below uses a literal bandwidth).
bw

np.float64(0.13399727643293322)

In [238]:
# Condense the linkage hierarchy with the notebook's Gaussian-lambda condense_tree.
# NOTE(review): bandwidth is a hard-coded 2, not the `bw` estimated above -- confirm
# this is intentional.
condensed_tree = condense_tree(linkage_tree, 
                               min_cluster_size=min_cluster_size,
                               bandwidth=2,
                               sample_weights=sample_weights)

In [239]:
# Derive the cluster tree from the condensed tree; it is the second input to
# `extract_eom_clusters` below.
cluster_tree = cluster_tree_from_condensed_tree(condensed_tree)

In [240]:
@numba.njit()
def score_condensed_tree_nodes(condensed_tree):
    """Accumulate a score per condensed-tree node.

    Every row contributes ``lambda_val * child_size``: the amount is added to the
    score of the row's parent and, when the child holds more than one sample
    (``child_size > 1``), subtracted from the score of the child.

    Returns a dict mapping node id to its float64 score. This shadows the function
    of the same name imported from ``fast_hdbscan.cluster_trees``.
    """
    # Empty comprehension with a float64 prototype value; presumably there so numba
    # can infer the dict's key/value types.
    scores = {0: np.float64(0.0) for _ in range(0)}

    for row in range(condensed_tree.parent.shape[0]):
        size = condensed_tree.child_size[row]
        mass = condensed_tree.lambda_val[row] * size

        owner = condensed_tree.parent[row]
        if owner not in scores:
            scores[owner] = mass
        else:
            scores[owner] += mass

        # Only children that are themselves clusters get a (negative) entry.
        if size > 1:
            node = condensed_tree.child[row]
            if node not in scores:
                scores[node] = -mass
            else:
                scores[node] -= mass

    return scores

@numba.njit()
def extract_eom_clusters(condensed_tree, cluster_tree, allow_single_cluster=False):
    """Select clusters by running ``eom_recursion`` over the cluster tree.

    Scores come from ``score_condensed_tree_nodes``. With ``allow_single_cluster``
    the recursion starts at the root of ``cluster_tree``; otherwise, provided more
    than one node was scored, it starts at each direct child of the root.

    Returns an array with the ids of the nodes flagged as selected, or an empty
    int64 array when ``cluster_tree`` has no rows.
    """
    # Nothing to select from an empty cluster tree.
    if len(cluster_tree.parent) == 0:
        return np.zeros(0, dtype=np.int64)

    stability = score_condensed_tree_nodes(condensed_tree)
    is_selected = {node: False for node in stability}

    tree_root = cluster_tree.parent.min()

    if allow_single_cluster:
        eom_recursion(tree_root, cluster_tree, stability, is_selected)
    elif len(stability) > 1:
        for child_node in cluster_tree.child[cluster_tree.parent == tree_root]:
            eom_recursion(child_node, cluster_tree, stability, is_selected)

    return np.asarray([node for node, flag in is_selected.items() if flag])

In [241]:
# Pick the final clusters with the notebook's extract_eom_clusters, using the
# `allow_single_cluster` flag from the configuration cell.
selected_clusters = extract_eom_clusters(condensed_tree, cluster_tree, allow_single_cluster=allow_single_cluster)

In [242]:
# Produce one label per input row (n_samples is the row count of `data`).
# The positional 0 is presumably the cluster-selection epsilon -- confirm against
# the fast_hdbscan signature.
clusters = get_cluster_label_vector(
        condensed_tree,
        selected_clusters,
        0,
        n_samples=data.shape[0],
    )

In [243]:
# Load the GeoDataFrame that the cluster labels are attached to below.
morphotopes = gpd.read_parquet('../data/morphotopes_69333_100_3__median_gaussian.pq')

In [244]:
# Attach the labels as a new column.
# NOTE(review): assumes `morphotopes` has the same rows, in the same order, as the
# feature table the labels were computed from -- verify.
morphotopes['cluster'] = clusters

In [245]:
# Recolour the existing map layer by cluster label.
# NOTE(review): `layer` and `get_color` are only defined in cells further down
# (the lonboard layer cell and the core.cluster_validation import), so this cell
# fails on a fresh top-to-bottom run.
layer.get_fill_color = get_color(morphotopes.cluster)

In [82]:
%%time
import lonboard
# Build a polygon layer from the morphotope geometries at low opacity.
# plotting = tessellation[tessellation.index.isin(X_train.index)].copy()
layer = lonboard.SolidPolygonLayer.from_geopandas(morphotopes, opacity=.08)



CPU times: user 559 ms, sys: 49 ms, total: 608 ms
Wall time: 608 ms


In [83]:
from sidecar import Sidecar
# Open a side panel titled 'Final Clusters' and render the map inside it.
sc = Sidecar(title='Final Clusters')
# The map wraps the polygon layer created above on a Carto Positron basemap.
m = lonboard.Map(layer, basemap_style=lonboard.basemap.CartoBasemap.Positron)
with sc:
    display(m)

In [217]:
from core.cluster_validation import get_color
# Colour each polygon by its cluster label; assigning to the layer attribute
# updates the layer object that was handed to the map.
layer.get_fill_color = get_color(morphotopes.cluster)