In [3]:
import os

import geopandas as gpd
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN

In [4]:
import glob

In [5]:
!pip install fast_hdbscan



In [6]:
# Sort the matches: glob's ordering depends on the filesystem, and the order of
# the files determines the row order of everything built from them downstream.
centroid_files = sorted(glob.glob('/data/uscuni-ulce/centroids/*.parquet'))

In [7]:
# Read every centroid file first and concatenate once at the end. Calling
# pd.concat inside the loop re-copies all previously loaded rows on every
# iteration, which is quadratic in the total number of rows.
frames = []
for f in centroid_files:
    # `df` stays bound to the last frame read because a later cell runs `del df`.
    df = pd.read_parquet(f)
    frames.append(df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
# pd.concat raises ValueError if no files were found, rather than leaving
# `res` as None and failing later with a less obvious error.
res = pd.concat(frames, ignore_index=True)
del frames  # drop the list's references to the per-file frames

In [19]:
import gc

# Keep only the columns at positions 1 and 2, as a plain NumPy array.
# NOTE(review): these are presumably the x/y coordinates — confirm against the
# parquet schema.
data = res.iloc[:, [1, 2]].values

# The DataFrames are no longer needed; unbind them and force a collection so
# their memory can be reclaimed.
del res, df
gc.collect()

In [8]:
from sklearn.neighbors import KDTree
from sklearn.cluster import DBSCAN
from fast_hdbscan.numba_kdtree import kdtree_to_numba
from fast_hdbscan.boruvka import parallel_boruvka

In [20]:
%%time
# Build a scikit-learn KD-tree over the rows of `data` (default KDTree settings).
sklearn_tree = KDTree(data)


CPU times: user 20.6 s, sys: 267 ms, total: 20.9 s
Wall time: 20.7 s


In [23]:
%%time
# Convert the scikit-learn tree into fast_hdbscan's numba tree representation,
# which is the input passed to parallel_boruvka below.
numba_tree = kdtree_to_numba(sklearn_tree)

CPU times: user 8 µs, sys: 2 µs, total: 10 µs
Wall time: 11.4 µs


In [24]:
%%time
# Run fast_hdbscan's parallel Boruvka routine on the numba tree.
# NOTE(review): presumably this returns the spanning-tree edges with the weight
# in column 2, and min_samples=1 presumably reduces mutual reachability to plain
# distance — confirm against the fast_hdbscan documentation.
edges = parallel_boruvka(numba_tree, min_samples=1)

CPU times: user 47min 37s, sys: 7.68 s, total: 47min 45s
Wall time: 3min 10s


In [25]:
# Reorder the edge rows by ascending value of column 2. Kept as a single
# expression so the temporary permutation array is not left bound in the kernel.
sorted_mst = edges[edges[:, 2].argsort()]

In [35]:
from fast_hdbscan.cluster_trees import get_cluster_labelling_at_cut, mst_to_linkage_tree

In [36]:
%%time
# Convert the sorted edge list into the linkage tree that
# get_cluster_labelling_at_cut consumes.
linkage_tree = mst_to_linkage_tree(sorted_mst)

CPU times: user 6.57 s, sys: 1.16 s, total: 7.73 s
Wall time: 7.59 s


In [53]:
%%time
# Cut the linkage tree at a fixed threshold to obtain one label per point.
# NOTE(review): epsilon is presumably a distance in the coordinate units, and
# the trailing 2 is presumably the minimum cluster size — confirm against
# fast_hdbscan.cluster_trees.
epsilon = 250
clusters = get_cluster_labelling_at_cut(linkage_tree, epsilon, 2)

CPU times: user 7.37 s, sys: 1.35 s, total: 8.72 s
Wall time: 6.42 s


In [54]:
# Tally how many points carry each distinct label, then display the ten largest
# tallies in ascending order. Every distinct value in `clusters` is counted,
# including any special label the labelling may use for unclustered points.
vals, counts = np.unique(clusters, return_counts=True)
idxs = counts.argsort()[-10:]
counts[idxs]

array([ 324181,  395525,  400167,  437219,  454098,  581896,  610748,
        615673,  690129, 3125331])

In [50]:
# NOTE(review): this cell indexes `data` by column name ("id", "x", "y"), so it
# expects a DataFrame, whereas the cell that builds the clustering input binds
# `data` to a NumPy array. The execution count (In [50]) suggests it was run
# against different kernel state — confirm which object is intended before
# relying on a top-to-bottom run.
#
# Snap each point to the nearest multiple of 100 in x and y (decimals=-2), and
# store the snapped coordinates as integer columns on `data` itself.
data[["x_100", "y_100"]] = np.around(data[["x", "y"]], decimals=-2).astype(int)
# Count the rows that fall into each snapped (x_100, y_100) cell; after
# reset_index the count is held in the "id" column.
grid = data[["id", "x_100", "y_100"]].groupby(["x_100", "y_100"]).count().reset_index()

In [94]:
# Euclidean distance from the first row to every other row, using the columns
# at positions 1 and 2.
# NOTE(review): `.iloc` means `data` must be a DataFrame here; the columns are
# presumably the original x/y — confirm.
dists = (
    (data.iloc[[0], [1, 2]].values - data.iloc[1:, [1, 2]].values) ** 2
).sum(axis=1) ** 0.5

In [95]:
# Same first-row-to-all-others Euclidean distance, but using the columns at
# positions 3 and 4.
# NOTE(review): presumably these are the x_100/y_100 columns snapped to the
# 100-unit grid — confirm the column order of `data`.
dist_grid = (
    (data.iloc[[0], [3, 4]].values - data.iloc[1:, [3, 4]].values) ** 2
).sum(axis=1) ** 0.5

In [96]:
# Summary statistics of `dists`; .iloc[1:] drops the first entry of describe()
# (the count) so only mean/std/min/quartiles/max are shown.
pd.Series(dists).describe().iloc[1:]

mean    179092.288873
std     124926.395093
min         50.632440
25%      83659.753069
50%     154873.615802
75%     240605.405899
max     529213.354645
dtype: float64

In [97]:
# Summary statistics of `dist_grid`, with the leading count entry dropped, for
# comparison with the same summary of `dists`.
pd.Series(dist_grid).describe().iloc[1:]

mean    179116.726038
std     124927.318729
min          0.000000
25%      83696.176735
50%     154898.224651
75%     240624.022076
max     529251.868206
dtype: float64