In [1]:

import numpy as np
import pandas as pd

In [2]:
import glob

In [3]:
# Collect every centroid parquet file. glob.glob returns matches in arbitrary
# (filesystem-dependent) order, and the row order of the concatenated table
# built from this list follows the file order, so sort to make runs reproducible.
centroid_files = sorted(glob.glob("/data/uscuni-ulce/centroids/*.parquet"))

In [21]:
# Load every centroid file and concatenate them once.
# Concatenating inside the loop re-copies the accumulated frame on every
# iteration (quadratic in the number of files); collecting the pieces and
# calling pd.concat a single time yields the same table with a fresh 0..n-1 index.
# pd.concat raises ValueError if `centroid_files` is empty, which surfaces a
# wrong/missing input directory here rather than in a later cell.
frames = []
for f in centroid_files:
    # `df` is kept as a notebook-level name because a later cell deletes it explicitly.
    df = pd.read_parquet(f)
    frames.append(df)

res = pd.concat(frames, ignore_index=True)
# Drop the list so the per-file frames can be freed once `res`/`df` are deleted.
del frames

In [24]:
import gc

# Keep only what the clustering needs, then release the full table.
# `ids` is the "id" column; `data` is a NumPy array of the columns at
# positions 1 and 2 (presumably the x/y coordinates, since they are later
# passed to points_from_xy - confirm against the parquet schema).
ids = res['id']
data = res.iloc[:, [1, 2]].values
del res, df
# Last expression of the cell: shows the number of objects the collector found.
gc.collect()

14804

In [6]:
from fast_hdbscan.boruvka import parallel_boruvka
from fast_hdbscan.numba_kdtree import kdtree_to_numba
from sklearn.neighbors import KDTree

In [7]:
%%time
# Build a scikit-learn KD-tree over the two-column `data` array with default settings.
sklearn_tree = KDTree(data)

CPU times: user 20.3 s, sys: 236 ms, total: 20.6 s
Wall time: 20.4 s


In [8]:
%%time
# Convert the scikit-learn tree with fast_hdbscan's kdtree_to_numba; the result
# is what parallel_boruvka is called with below (presumably a numba-compatible
# view of the same tree - confirm in the fast_hdbscan docs).
numba_tree = kdtree_to_numba(sklearn_tree)

CPU times: user 9 μs, sys: 0 ns, total: 9 μs
Wall time: 11 μs


In [9]:
%%time
# Run fast_hdbscan's parallel Boruvka routine on the tree. `edges` is a 2-D
# array whose third column is used as the sort key in the next step
# (presumably the edge length of a minimum spanning tree - confirm).
# NOTE(review): min_samples=1 presumably means plain point-to-point distances
# rather than density-adjusted ones - verify against fast_hdbscan.
edges = parallel_boruvka(numba_tree, min_samples=1)

CPU times: user 49min 52s, sys: 7.5 s, total: 50min
Wall time: 3min 24s


In [10]:
# Reorder the edge rows by ascending value of their third column.
sorted_mst = edges[edges[:, 2].argsort()]

In [17]:
# np.save('../data/sorted_mst_eubucco_centroids.npy', sorted_mst)

In [11]:
from fast_hdbscan.cluster_trees import get_cluster_labelling_at_cut, mst_to_linkage_tree

In [12]:
%%time
# Turn the sorted edge list into the linkage tree that is cut at a fixed
# threshold in the following cell.
linkage_tree = mst_to_linkage_tree(sorted_mst)

CPU times: user 6.57 s, sys: 1.19 s, total: 7.76 s
Wall time: 7.67 s


In [74]:
%%time
# Threshold at which the linkage tree is cut; it is also embedded in the
# output names as "cluster_{epsilon}m", so it is presumably in metres.
epsilon = 400
# Third positional argument of get_cluster_labelling_at_cut - presumably the
# minimum cluster size; confirm against the fast_hdbscan source.
min_size = 2
clusters = get_cluster_labelling_at_cut(linkage_tree, epsilon, min_size)

CPU times: user 6.73 s, sys: 1.23 s, total: 7.96 s
Wall time: 6.26 s


In [75]:
# Sizes of the ten most populous labels, in ascending order.
# NOTE(review): if the labelling uses a sentinel label for unclustered points,
# it is counted here like any other label - confirm.
vals, counts = np.unique(clusters, return_counts=True)
idxs = counts.argsort()[-10:]
counts[idxs]

array([ 486607,  518308,  589589,  730388,  754610,  810908, 1636365,
       2231215, 3989463, 4478821])

In [79]:
# Attach the ids as the index of the label vector.
# This cell rebinds `clusters` from an array to a Series, so on a re-run the
# input is already a Series; np.asarray strips any existing index so the labels
# are always assigned positionally instead of being re-aligned against `ids`
# (which would fail if ids are not unique).
clusters = pd.Series(np.asarray(clusters), index=ids, name=f'cluster_{epsilon}m')

In [81]:
import geopandas as gpd

In [82]:
%%time
# One point per row of `data` (column 0 -> x, column 1 -> y), paired
# positionally with the cluster labels and declared as EPSG:3035.
centroid_points = gpd.points_from_xy(x=data[:, 0], y=data[:, 1])
grid_cores = gpd.GeoDataFrame(clusters, geometry=centroid_points, crs=3035)

CPU times: user 7.31 s, sys: 3.08 s, total: 10.4 s
Wall time: 10.4 s


In [84]:
%%time
# Merge all points sharing a cluster label into a single geometry per label.
grid_cores_dissolved = grid_cores.dissolve(by=f"cluster_{epsilon}m")

CPU times: user 27.2 s, sys: 896 ms, total: 28.1 s
Wall time: 28.1 s


In [88]:
%%time
# Convex hull of each dissolved cluster geometry.
# NOTE(review): the following cell assigns the concave hulls to this same name,
# so this result is discarded when the notebook is run top to bottom.
convex_hulls = grid_cores_dissolved.convex_hull


CPU times: user 8.61 s, sys: 8.05 ms, total: 8.61 s
Wall time: 8.61 s


In [100]:
%%time
# Concave hull of each dissolved cluster geometry (ratio=.02, holes disallowed).
# NOTE(review): despite its name, `convex_hulls` holds *concave* hulls from here
# on; the name is kept because later cells refer to it.
convex_hulls = grid_cores_dissolved.concave_hull(ratio=.02, allow_holes=False)


CPU times: user 2min 51s, sys: 56 ms, total: 2min 51s
Wall time: 2min 51s


In [101]:
# Turn the hull series into a two-column frame: the former index (the cluster
# label) becomes "region_id" and the hull becomes the active "geometry" column.
# NOTE(review): this rebinds `convex_hulls` from a series to a frame, so the
# cell is not safe to run twice in a row.
hull_frame = convex_hulls.to_frame().reset_index()
hull_frame.columns = ['region_id', 'geometry']
convex_hulls = hull_frame.set_geometry('geometry')

In [102]:
import lonboard

In [103]:
# Only hulls whose geometry type is Polygon are drawn; any other geometry types
# are left out of the layer. The selection is reprojected to EPSG:4326 before
# being handed to lonboard.
polygon_hulls = convex_hulls[convex_hulls.geom_type == 'Polygon']
layer = lonboard.PolygonLayer.from_geopandas(polygon_hulls.to_crs(4326), opacity=0.15)

In [105]:
# m = lonboard.Map([layer], basemap_style=lonboard.basemap.CartoBasemap.Positron)
# m

In [None]:
# Persist the id -> cluster label mapping; the cut threshold is part of the file name.
out_path = f'../data/id_to_region_mst_{epsilon}m.parquet'
clusters.to_frame().to_parquet(out_path)

In [50]:
# Snap x/y to the nearest multiple of 100 and count ids per snapped cell.
# NOTE(review): this cell indexes `data` by column label ("id", "x", "y"), i.e.
# it expects a DataFrame, whereas the cell that creates `data` earlier in this
# notebook produces a NumPy array via `.values`. It appears to rely on kernel
# state from a different session - confirm how `data` should be built here.
data[["x_100", "y_100"]] = data[["x", "y"]].round(-2).astype(int)
grid = data.groupby(["x_100", "y_100"])["id"].count().reset_index()

In [94]:
# Euclidean distance from the first row to every other row, using the columns
# at positions 1 and 2.
# NOTE(review): uses `data.iloc`, so `data` must be a DataFrame here, not the
# NumPy array created earlier in this notebook - confirm.
first_xy = data.iloc[[0], [1, 2]].values
rest_xy = data.iloc[1:, [1, 2]].values
dists = ((first_xy - rest_xy) ** 2).sum(axis=1) ** (1 / 2)

In [95]:
# Same first-row-to-others Euclidean distance, but on the columns at positions
# 3 and 4 (presumably the snapped "x_100"/"y_100" columns - confirm).
# NOTE(review): uses `data.iloc`, so `data` must be a DataFrame here, not the
# NumPy array created earlier in this notebook - confirm.
first_cell = data.iloc[[0], [3, 4]].values
rest_cells = data.iloc[1:, [3, 4]].values
dist_grid = ((first_cell - rest_cells) ** 2).sum(axis=1) ** (1 / 2)

In [96]:
# Summary statistics of the distances, without the "count" row.
pd.Series(dists).describe().drop("count")

mean    179092.288873
std     124926.395093
min         50.632440
25%      83659.753069
50%     154873.615802
75%     240605.405899
max     529213.354645
dtype: float64

In [97]:
# Summary statistics of the grid-snapped distances, without the "count" row.
pd.Series(dist_grid).describe().drop("count")

mean    179116.726038
std     124927.318729
min          0.000000
25%      83696.176735
50%     154898.224651
75%     240624.022076
max     529251.868206
dtype: float64