In [112]:
import glob

import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler

# Root folder of the project data; every other path in this cell is derived from it.
regions_datadir = "/data/uscuni-ulce/"
data_dir = regions_datadir + "processed_data/"
# Files matching the `eubucco_raw/*` pattern at the time this cell runs.
eubucco_files = glob.glob(regions_datadir + "eubucco_raw/*")
graph_dir = data_dir + "neigh_graphs/"
# Derived from data_dir (like graph_dir) so the two folders cannot drift apart.
chars_dir = data_dir + "chars/"

In [113]:
from core.cluster_validation import generate_validation_groups
from core.utils import used_keys

In [114]:
import glob

In [115]:
# Identifier of the region under study; it is interpolated into the per-region
# parquet file names read in the following cells.
region_id = 69300

In [116]:
# Primary characters table of the selected region.
primary_chars_path = chars_dir + f'primary_chars/primary_chars_{region_id}.parquet'
primary = pd.read_parquet(primary_chars_path)

In [119]:
# Tessellation graph of the selected region (file suffix `knn1`).
# NOTE: the name `graph` is rebound to a scipy sparse matrix further down.
tessellation_graph_path = graph_dir + f"tessellation_graph_{region_id}_knn1.parquet"
graph = read_parquet(tessellation_graph_path)

In [None]:
# Context characters of the selected region. The path has to be an f-string so
# that `{region_id}` is interpolated; as a plain literal the braces are kept
# verbatim and the read does not target the region's file.
X = pd.read_parquet(f'/data/uscuni-ulce/processed_data/context_data/context_chars_{region_id}.parquet')

In [205]:
# Keep only the columns whose name contains "_median" and work on the raw array.
is_median_column = X.columns.str.contains('_median')
data = X.loc[:, is_median_column].to_numpy()
data.shape

(304554, 62)

In [206]:

# data = X.values
# data.shape

In [207]:
from fast_hdbscan.boruvka import parallel_boruvka
from fast_hdbscan.numba_kdtree import kdtree_to_numba
from sklearn.neighbors import KDTree

In [208]:
%%time
# Build a scikit-learn KDTree over the rows of `data`; it is converted to the
# numba representation in the next cell.
sklearn_tree = KDTree(data)

CPU times: user 1.37 s, sys: 12 ms, total: 1.38 s
Wall time: 1.36 s


In [209]:
%%time
# Convert the scikit-learn tree into the form passed to fast_hdbscan's
# parallel_tree_query and parallel_boruvka below.
numba_tree = kdtree_to_numba(sklearn_tree)

CPU times: user 563 μs, sys: 0 ns, total: 563 μs
Wall time: 567 μs


In [256]:
from fast_hdbscan.numba_kdtree import parallel_tree_query

In [273]:
# Neighbour count passed to parallel_tree_query; also used below as the repeat
# count when the neighbour array is flattened into edges.
k = 5

In [274]:
%%time
# Query the tree with the same rows it was built from, asking for `k` results per row.
# result[1] is used below as the neighbour index array; result[0] is summarised
# below (presumably the matching distances — confirm against fast_hdbscan).
result = parallel_tree_query(numba_tree, data, k=k)

CPU times: user 1h 16min 15s, sys: 208 ms, total: 1h 16min 15s
Wall time: 4min 22s


In [275]:
# np.save('../data/prague_knn_100.npy', result[1])

In [276]:
# Second element of the query result: one row of neighbour indices per input row.
knns = result[1]

In [277]:
# Summary statistics of the strictly positive entries of result[0];
# the leading `count` row of the summary is dropped.
positive_values = result[0][result[0] > 0]
pd.Series(positive_values).describe().iloc[1:]

mean      1.370761
std       0.840700
min       0.000197
25%       0.918144
50%       1.257618
75%       1.674024
max     186.227676
dtype: float64

In [278]:
# Same summary as above, but reporting the upper percentiles instead of the quartiles.
upper_percentiles = [.75, .85, .95]
pd.Series(result[0][result[0] > 0]).describe(percentiles=upper_percentiles).iloc[1:]

mean      1.370761
std       0.840700
min       0.000197
50%       1.257618
75%       1.674024
85%       1.944311
95%       2.535178
max     186.227676
dtype: float64

In [279]:
# Display the neighbour index array for a visual check.
knns

array([[     0,  24589,  24599,  24598],
       [     1,  24604,  24698,  24599],
       [     2,  25905,  24598,      0],
       ...,
       [304550,    994, 304548, 304551],
       [304550,    994, 304548, 304551],
       [304550,    994, 304548, 304551]], dtype=int32)

In [280]:
from scipy import sparse
from scipy.sparse.csgraph import connected_components

In [281]:
# Source index of every kNN edge: row index i is repeated once per neighbour column.
# The repeat count is taken from `knns` itself rather than from the global `k`,
# so the source and target arrays always have the same length even if `k` was
# changed after the query ran.
s = np.repeat(np.arange(knns.shape[0])[..., np.newaxis], knns.shape[1], axis=1)
b1 = s.flatten()
# Target index of every edge, flattened in the same row-major order as b1.
b2 = knns.flatten()

In [282]:
# Directed adjacency matrix with one unit-weight entry per (source, target) pair.
# NOTE: this rebinds `graph`, which an earlier cell bound to the graph read from parquet.
n_points = data.shape[0]
unit_weights = np.ones(b1.shape[0])
graph = sparse.csr_matrix((unit_weights, (b1, b2)), shape=(n_points, n_points))

In [283]:
# Strongly connected components of the directed graph: two points share a label
# only when each is reachable from the other along the stored edges.
n_components, labels = connected_components(
    csgraph=graph,
    directed=True,
    connection='strong',
    return_labels=True,
)

In [284]:
# Number of components found by connected_components.
n_components

69782

In [285]:
# Size of every connected component. Only the counts are needed, so the result is
# indexed instead of being unpacked into `_`: assigning to `_` shadows IPython's
# last-output variable for the rest of the session.
counts = np.unique(labels, return_counts=True)[1]
pd.Series(counts).describe()

count    69782.000000
mean         4.364363
std          6.225525
min          1.000000
25%          1.000000
50%          3.000000
75%          5.000000
max        238.000000
dtype: float64

In [286]:
# Tessellation geometries (with characters) of the selected region.
tessellation_path = chars_dir + f"tessellations/chars_{region_id}.parquet"
tessellation = gpd.read_parquet(tessellation_path)

In [287]:
# Attach the component label of every row as a new "label" column.
# `assign` returns a new frame, so the loaded `tessellation` is left unmodified
# and this cell can be re-run without side effects on it.
# Assumes `labels` is in the same row order as `tessellation` — TODO confirm.
plotting = tessellation.assign(label=labels)

In [288]:
import glasbey
from lonboard import PolygonLayer, Map
from lonboard.basemap import CartoBasemap
from lonboard.colormap import apply_categorical_cmap
from palettable.colorbrewer.qualitative import Set3_12

def hex_to_rgb(hexa):
    """Convert a six-digit hex colour string (no leading ``#``) to an ``(r, g, b)`` tuple of ints."""
    red, green, blue = (int(hexa[offset : offset + 2], 16) for offset in range(0, 6, 2))
    return (red, green, blue)


# One colour per distinct label (plus one spare), grown from the Set3 base palette.
# NOTE(review): the recorded run of this cell ended in
# "ValueError: cannot convert float NaN to integer" — confirm that the requested
# palette size is workable before relying on `colors`.
n_label_colors = plotting["label"].unique().shape[0] + 1
gb_cols = glasbey.extend_palette(Set3_12.hex_colors, palette_size=n_label_colors)
# Drop the first character of each hex string (presumably "#") before converting to RGB.
gb_cols = [hex_to_rgb(hex_color[1:]) for hex_color in gb_cols]

# Map every distinct label to one colour and apply the mapping to the label column.
label_to_rgb = dict(zip(np.unique(plotting["label"]), gb_cols, strict=False))
colors = apply_categorical_cmap(plotting["label"], cmap=label_to_rgb)

  Jp = Jp * self.KL
  h_rad = np.arctan2(b, a)
  r = np.hypot(a, b)
  M = (np.exp(self.c2*Mp) - 1) / self.c2
  / (np.sqrt(J / 100) * (1.64 - 0.29**self.n) ** 0.73)
  t = (C
  t = (C
  A = self.A_w * (J / 100) ** (1 / (self.c * self.z))


ValueError: cannot convert float NaN to integer

In [None]:
# Polygon layer of the labelled tessellation, filled with the per-label colours.
label_polygons = plotting[["geometry", "label"]]
layer = PolygonLayer.from_geopandas(gdf=label_polygons, get_fill_color=colors, opacity=0.15)

In [None]:
# m = Map(layer, basemap_style=CartoBasemap.Positron)
# m

In [None]:
# Distinct component labels together with the number of rows carrying each one.
np.unique(labels, return_counts=True)

In [9]:
%%time
# Edge list produced by fast_hdbscan's parallel_boruvka; it is sorted and turned
# into a linkage tree in the following cells.
# NOTE(review): this cell's execution count (9) is lower than that of the cells
# that build `numba_tree` above (208-209), so the recorded run may have used a
# different tree — confirm before trusting the timings and outputs below.
edges = parallel_boruvka(numba_tree, min_samples=1)

CPU times: user 49min 52s, sys: 7.5 s, total: 50min
Wall time: 3min 24s


In [10]:
# Order the edges by ascending value of their third column (index 2).
weight_order = np.argsort(edges[:, 2])
sorted_mst = edges[weight_order]

In [17]:
# np.save('../data/sorted_mst_eubucco_centroids.npy', sorted_mst)

In [11]:
from fast_hdbscan.cluster_trees import get_cluster_labelling_at_cut, mst_to_linkage_tree

In [12]:
%%time
# Convert the sorted edge list into the linkage tree that is cut into clusters below.
linkage_tree = mst_to_linkage_tree(sorted_mst)

CPU times: user 6.57 s, sys: 1.19 s, total: 7.76 s
Wall time: 7.67 s


In [74]:
%%time
# Cut value applied to the linkage tree; it is also interpolated into the
# `cluster_{epsilon}m` column name and the output file name further down.
epsilon = 400
# NOTE(review): the positional 2 is presumably a minimum cluster size — confirm
# against the fast_hdbscan signature.
clusters = get_cluster_labelling_at_cut(linkage_tree, epsilon, 2)

CPU times: user 6.73 s, sys: 1.23 s, total: 7.96 s
Wall time: 6.26 s


In [75]:
# Sizes of the ten most frequent cluster labels, shown in ascending order.
vals, counts = np.unique(clusters, return_counts=True)
idxs = np.argsort(counts)[-10:]
counts[idxs]

array([ 486607,  518308,  589589,  730388,  754610,  810908, 1636365,
       2231215, 3989463, 4478821])

In [79]:
# Wrap the label array in a named Series indexed by `ids`.
# NOTE(review): `ids` is not defined in any cell visible here — it must come from
# kernel state outside this view; confirm its source before re-running.
clusters = pd.Series(clusters, index=ids, name=f'cluster_{epsilon}m')

In [81]:
import geopandas as gpd

In [82]:
%%time
# Point geometry per row, built from the first two columns of `data`, with the
# cluster label as the only attribute column (EPSG:3035).
core_points = gpd.points_from_xy(data[:, 0], data[:, 1])
grid_cores = gpd.GeoDataFrame(clusters, geometry=core_points, crs=3035)

CPU times: user 7.31 s, sys: 3.08 s, total: 10.4 s
Wall time: 10.4 s


In [84]:
%%time
# Group the point geometries by their cluster label column.
grid_cores_dissolved = grid_cores.dissolve(f"cluster_{epsilon}m")

CPU times: user 27.2 s, sys: 896 ms, total: 28.1 s
Wall time: 28.1 s


In [88]:
%%time
# Convex hull of every dissolved cluster geometry.
# NOTE: `convex_hulls` is rebound by the concave-hull cell that follows.
convex_hulls = grid_cores_dissolved.convex_hull


CPU times: user 8.61 s, sys: 8.05 ms, total: 8.61 s
Wall time: 8.61 s


In [100]:
%%time
# NOTE(review): despite its name, `convex_hulls` holds concave hulls from here on
# (ratio=.02, holes not allowed); the name is kept because later cells reference it.
convex_hulls = grid_cores_dissolved.concave_hull(ratio=.02, allow_holes=False)


CPU times: user 2min 51s, sys: 56 ms, total: 2min 51s
Wall time: 2min 51s


In [101]:
# Turn the hull series into a two-column frame: the former index becomes
# `region_id` and the hull column becomes the active `geometry`.
hull_frame = convex_hulls.to_frame().reset_index()
hull_frame.columns = ['region_id', 'geometry']
convex_hulls = hull_frame.set_geometry('geometry')

In [102]:
import lonboard

In [103]:
# Only plain Polygon hulls are plotted; they are reprojected to EPSG:4326 first.
polygon_hulls = convex_hulls[convex_hulls.geom_type == 'Polygon']
layer = lonboard.PolygonLayer.from_geopandas(polygon_hulls.to_crs(4326), opacity=0.15)

In [105]:
# m = lonboard.Map([layer], basemap_style=lonboard.basemap.CartoBasemap.Positron)
# m

In [None]:
# Persist the id -> cluster label lookup; the cut value is part of the file name.
region_lookup_path = f'../data/id_to_region_mst_{epsilon}m.parquet'
clusters.to_frame().to_parquet(region_lookup_path)

In [50]:
# Round x/y to the nearest multiple of 100 and store the result as integer columns.
# NOTE(review): `data` is indexed by column name here, whereas earlier cells bind
# `data` to a NumPy array — this cell relies on a different `data` object; confirm.
data[["x_100", "y_100"]] = np.around(data[["x", "y"]], decimals=-2).astype(int)
# Number of non-null `id` entries per distinct (x_100, y_100) pair.
grid = data.groupby(["x_100", "y_100"])[["id"]].count().reset_index()

In [94]:
# Euclidean distance from the first row to every other row, using columns 1 and 2.
first_point = data.iloc[[0], [1, 2]].values
other_points = data.iloc[1:, [1, 2]].values
dists = ((first_point - other_points) ** 2).sum(axis=1) ** 0.5

In [95]:
# Same distance as above, but computed on columns 3 and 4
# (presumably the rounded x_100 / y_100 coordinates — confirm the column order).
first_grid_point = data.iloc[[0], [3, 4]].values
other_grid_points = data.iloc[1:, [3, 4]].values
dist_grid = ((first_grid_point - other_grid_points) ** 2).sum(axis=1) ** 0.5

In [96]:
# Summary statistics of `dists`, without the leading `count` row.
pd.Series(dists).describe().iloc[1:]

mean    179092.288873
std     124926.395093
min         50.632440
25%      83659.753069
50%     154873.615802
75%     240605.405899
max     529213.354645
dtype: float64

In [97]:
# Summary statistics of `dist_grid`, without the leading `count` row, for
# comparison with the summary of `dists` above.
pd.Series(dist_grid).describe().iloc[1:]

mean    179116.726038
std     124927.318729
min          0.000000
25%      83696.176735
50%     154898.224651
75%     240624.022076
max     529251.868206
dtype: float64