In [1]:
import glob

import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler

# Root of the project data share; every other location below is derived from it.
regions_datadir = "/data/uscuni-ulce/"
data_dir = f"{regions_datadir}processed_data/"
graph_dir = f"{data_dir}neigh_graphs/"
chars_dir = f"{data_dir}chars/"
# Every entry found directly inside the raw EUBUCCO folder.
eubucco_files = glob.glob(f"{regions_datadir}eubucco_raw/*")

In [2]:
from core.cluster_validation import generate_neigbhourhood_groups
from core.utils import used_keys

In [3]:
import glob

In [4]:
# Identifier of the region; it is interpolated into the per-region parquet file names loaded below.
region_id = 69300

In [5]:
# Primary characters table of the selected region.
primary_chars_path = f'{chars_dir}primary_chars/primary_chars_{region_id}.parquet'
primary = pd.read_parquet(primary_chars_path)

In [6]:
# Tessellation neighbourhood graph of the selected region (libpysal graph parquet).
# NOTE: the name `graph` is reassigned to a scipy sparse matrix further down the notebook.
tessellation_graph_path = f"{graph_dir}tessellation_graph_{region_id}_knn1.parquet"
graph = read_parquet(tessellation_graph_path)

In [7]:
# X = pd.read_parquet('/data/uscuni-ulce/processed_data/context_data/context_chars_{region_id}.parquet')
# data = X.loc[:, X.columns.str.contains('_median')].values
# data.shape

In [16]:
# NOTE(review): this path has no `primary_chars/` sub-folder, unlike the path used
# to load `primary` earlier in the notebook — confirm which location is intended.
X_train = pd.read_parquet(f'{chars_dir}primary_chars_{region_id}.parquet')

# Keep only the rows whose index value is non-negative.
X_train = X_train.loc[X_train.index >= 0]

# Standardise every column, then replace any NaN/inf left in the scaled values
# with finite numbers (NaN becomes 0.0) so the tree build receives a dense matrix.
vals = np.nan_to_num(StandardScaler().fit_transform(X_train))
X_train = pd.DataFrame(vals, columns=X_train.columns, index=X_train.index)

# X_train = X_train.clip(-10, 10)

In [27]:
# C-contiguous array holding the scaled feature matrix; used for the tree build and the queries.
data = np.ascontiguousarray(X_train.to_numpy())

In [28]:

# data = X.values
# data.shape

In [29]:
from fast_hdbscan.boruvka import parallel_boruvka
from fast_hdbscan.numba_kdtree import kdtree_to_numba
from sklearn.neighbors import KDTree

In [30]:
%%time
# Build a scikit-learn KDTree over the rows of `data`.
sklearn_tree = KDTree(data)

CPU times: user 1.26 s, sys: 8.41 ms, total: 1.27 s
Wall time: 1.26 s


In [31]:
%%time
# Convert the scikit-learn tree with fast_hdbscan's helper; the result is what the
# fast_hdbscan query and Boruvka routines below take as input.
numba_tree = kdtree_to_numba(sklearn_tree)

CPU times: user 0 ns, sys: 9 μs, total: 9 μs
Wall time: 10.3 μs


In [32]:
from fast_hdbscan.numba_kdtree import parallel_tree_query

In [33]:
# Number of neighbours requested per point in the tree query below.
# Presumably this count includes the query point itself (column 0) — TODO confirm.
k = 5

In [34]:
%%time
# Query the tree with every row of `data`; the recorded run took ~17 min wall time,
# so consider caching the result instead of recomputing it.
result = parallel_tree_query(numba_tree, data, k=k)

CPU times: user 5h 16min 5s, sys: 6.81 s, total: 5h 16min 12s
Wall time: 16min 40s


In [275]:
# np.save('../data/prague_knn_100.npy', result[1])

In [79]:
# Split the query result: first element -> `dists`, second -> `knns`
# (presumably neighbour distances and neighbour row positions — TODO confirm).
dists = result[0]
knns = result[1]

In [162]:
# Summary statistics of column 1 of `dists`, without the `count` row.
pd.Series(dists[:, 1]).describe().drop("count")

mean     2.944037
std      1.346554
min      0.004227
25%      2.102059
50%      2.865647
75%      3.671932
max     80.164490
dtype: float64

In [163]:
# Summary statistics of column 2 of `dists` with upper-tail percentiles, without the `count` row.
upper_percentiles = [.75, .85, .95]
pd.Series(dists[:, 2]).describe(percentiles=upper_percentiles).drop("count")

mean      3.364537
std       1.376522
min       0.005325
50%       3.268719
75%       4.037583
85%       4.521188
95%       5.545440
max     135.846375
dtype: float64

In [154]:
# How many neighbour columns of `knns` (starting at column 1) become graph edges per point.
mutual_neighbours = 2

In [155]:
from scipy import sparse
from scipy.sparse.csgraph import connected_components

In [156]:
# Source side of every edge: each row position repeated once per retained neighbour.
n_queries = knns.shape[0]
s = np.tile(np.arange(n_queries).reshape(-1, 1), (1, mutual_neighbours))
b1 = s.flatten()
# Target side: the retained neighbour columns of `knns`, skipping column 0.
b2 = knns[:, 1:1 + mutual_neighbours].flatten()

In [157]:
# Directed adjacency matrix with one unit-weight entry per (source, neighbour) pair.
# NOTE: this rebinds `graph`, which earlier held the graph read from parquet.
n_nodes = data.shape[0]
edge_weights = np.ones(b1.shape[0])
graph = sparse.csr_matrix((edge_weights, (b1, b2)), shape=(n_nodes, n_nodes))

In [158]:
# Strong connectivity on the directed graph: two points share a label only when
# each is reachable from the other along the directed neighbour edges.
n_components, labels = connected_components(
    graph,
    directed=True,
    connection='strong',
    return_labels=True,
)

In [159]:
# Number of strongly connected components returned above.
n_components

161324

In [161]:
# Count the components that contain at least two points.
component_sizes = pd.Series(labels).value_counts()
component_sizes.ge(2).sum()

np.int64(65520)

In [136]:
# Number of largest components that get their own cluster id in the next cell.
num_top_clusters = 50

In [137]:
# Labels of the `num_top_clusters` largest components, biggest first.
top_clusters = pd.Series(labels).value_counts().iloc[:num_top_clusters].index.values

# Relabel: points in the r-th largest component get id r; every other point stays at -1.
clusters = np.full(labels.shape[0], -1)
for i, c in enumerate(top_clusters):
    clusters[labels == c] = i

In [141]:
%%time
import lonboard
from core.cluster_validation import get_color
plotting = tessellation.loc[tessellation.index.isin(X_train.index)].copy()
# plotting = tessellation.loc[X_train.iloc[dists[:, 1] > 6].index].copy()
layer = lonboard.PolygonLayer.from_geopandas(plotting, opacity=.08)



CPU times: user 4.75 s, sys: 408 ms, total: 5.16 s
Wall time: 5.15 s


In [142]:
from sidecar import Sidecar

# Build the map first, then render it inside a side panel titled 'Clusters'.
m = lonboard.Map(layer, basemap_style=lonboard.basemap.CartoBasemap.Positron)
sc = Sidecar(title='Clusters')
with sc:
    display(m)

In [146]:
# layer.get_fill_color = get_color(clusters)

In [None]:
# Tessellation geometries of the selected region.
tessellations_dir = '/data/uscuni-ulce/processed_data/tessellations/'
tessellation_path = f"{tessellations_dir}tessellation_{region_id}.parquet"
tessellation = gpd.read_parquet(tessellation_path)

In [9]:
%%time
# Run fast_hdbscan's parallel Boruvka routine on the tree; judging by the names used
# downstream (`sorted_mst`), the result is presumably a spanning-tree edge list — TODO confirm.
edges = parallel_boruvka(numba_tree, min_samples=1)

CPU times: user 49min 52s, sys: 7.5 s, total: 50min
Wall time: 3min 24s


In [10]:
# Order the edge rows by their third column, ascending.
edge_order = np.argsort(edges[:, 2])
sorted_mst = edges[edge_order]

In [17]:
# np.save('../data/sorted_mst_eubucco_centroids.npy', sorted_mst)

In [11]:
from fast_hdbscan.cluster_trees import get_cluster_labelling_at_cut, mst_to_linkage_tree

In [12]:
%%time
# Turn the sorted edge list into the linkage tree consumed by the cut-labelling call below.
linkage_tree = mst_to_linkage_tree(sorted_mst)

CPU times: user 6.57 s, sys: 1.19 s, total: 7.76 s
Wall time: 7.67 s


In [74]:
%%time
# Cut value passed to the labelling routine; it is also embedded in the column and
# file names used later (`cluster_{epsilon}m`).
epsilon = 400
# NOTE: this overwrites the `clusters` array derived from the connected-component labels.
# The trailing 2 is presumably the minimum cluster size — TODO confirm against fast_hdbscan.
clusters = get_cluster_labelling_at_cut(linkage_tree, epsilon, 2)

CPU times: user 6.73 s, sys: 1.23 s, total: 7.96 s
Wall time: 6.26 s


In [75]:
# Sizes of the ten most populated labels, smallest of the ten first.
# NOTE: `vals` is rebound here; it previously held the scaled feature values.
vals, counts = np.unique(clusters, return_counts=True)
idxs = counts.argsort()[-10:]
counts[idxs]

array([ 486607,  518308,  589589,  730388,  754610,  810908, 1636365,
       2231215, 3989463, 4478821])

In [79]:
# Wrap the label array in a named Series so it can be joined and saved by id.
# NOTE(review): `ids` is not defined in any cell of this notebook as shown; it must come
# from a deleted or out-of-order cell — define it explicitly before this line.
clusters = pd.Series(clusters, index=ids, name=f'cluster_{epsilon}m')

In [81]:
import geopandas as gpd

In [82]:
%%time
grid_cores = gpd.GeoDataFrame(
    clusters,
    geometry=gpd.points_from_xy(data[:, 0], data[:, 1]),
    crs=3035,
)

CPU times: user 7.31 s, sys: 3.08 s, total: 10.4 s
Wall time: 10.4 s


In [84]:
%%time
# Merge all points sharing a cluster label into one geometry per label.
grid_cores_dissolved = grid_cores.dissolve(f"cluster_{epsilon}m")

CPU times: user 27.2 s, sys: 896 ms, total: 28.1 s
Wall time: 28.1 s


In [88]:
%%time
# Convex hull of every dissolved cluster.
# NOTE: the next cell rebinds `convex_hulls` to concave hulls, so this result is discarded.
convex_hulls = grid_cores_dissolved.convex_hull


CPU times: user 8.61 s, sys: 8.05 ms, total: 8.61 s
Wall time: 8.61 s


In [100]:
%%time
# Concave hull of every dissolved cluster, with holes disallowed.
# NOTE: stored under the name `convex_hulls` although the geometries are concave hulls.
convex_hulls = grid_cores_dissolved.concave_hull(ratio=.02, allow_holes=False)


CPU times: user 2min 51s, sys: 56 ms, total: 2min 51s
Wall time: 2min 51s


In [101]:
# Turn the hull series into a two-column frame (cluster label, hull geometry)
# and register the hull column as the active geometry.
hull_frame = convex_hulls.to_frame().reset_index()
hull_frame.columns = ['region_id', 'geometry']
convex_hulls = hull_frame.set_geometry('geometry')

In [102]:
import lonboard

In [103]:
# Only rows whose hull is a Polygon are drawn; they are reprojected to EPSG:4326 first.
polygon_hulls = convex_hulls.loc[convex_hulls.geom_type == 'Polygon']
layer = lonboard.PolygonLayer.from_geopandas(polygon_hulls.to_crs(4326), opacity=0.15)

In [105]:
# m = lonboard.Map([layer], basemap_style=lonboard.basemap.CartoBasemap.Positron)
# m

In [None]:
# Persist the id -> cluster label mapping for the current cut value.
clusters_path = f'../data/id_to_region_mst_{epsilon}m.parquet'
clusters.to_frame().to_parquet(clusters_path)

In [50]:
# Round x/y to the nearest hundred and store the result as integer grid coordinates.
# NOTE(review): this cell needs `data` to be a DataFrame with `id`, `x`, `y` columns,
# whereas earlier cells bind `data` to a NumPy array — confirm the intended input.
snapped_xy = np.around(data[["x", "y"]], decimals=-2).astype(int)
data[["x_100", "y_100"]] = snapped_xy
# Number of ids that fall on each rounded (x_100, y_100) position.
grid = data.groupby(["x_100", "y_100"])[["id"]].count().reset_index()

In [94]:
# Euclidean distance from the first row to every other row, using columns 1 and 2
# (presumably the raw x/y coordinates — TODO confirm column order).
# NOTE: this rebinds `dists`, which earlier held the neighbour-query distances.
first_xy = data.iloc[[0], [1, 2]].values
other_xy = data.iloc[1:, [1, 2]].values
dists = ((first_xy - other_xy) ** 2).sum(axis=1) ** (1 / 2)

In [95]:
# Same first-row-to-others Euclidean distance, computed on columns 3 and 4
# (presumably the rounded x_100/y_100 coordinates — TODO confirm column order).
first_grid_xy = data.iloc[[0], [3, 4]].values
other_grid_xy = data.iloc[1:, [3, 4]].values
dist_grid = ((first_grid_xy - other_grid_xy) ** 2).sum(axis=1) ** (1 / 2)

In [96]:
# Summary statistics of the distances in `dists`, without the `count` row.
pd.Series(dists).describe().drop("count")

mean    179092.288873
std     124926.395093
min         50.632440
25%      83659.753069
50%     154873.615802
75%     240605.405899
max     529213.354645
dtype: float64

In [97]:
# Summary statistics of the distances in `dist_grid`, without the `count` row.
pd.Series(dist_grid).describe().drop("count")

mean    179116.726038
std     124927.318729
min          0.000000
25%      83696.176735
50%     154898.224651
75%     240624.022076
max     529251.868206
dtype: float64