In [1]:
%%time

import glob

import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler

from fast_hdbscan.numba_kdtree import kdtree_to_numba
from fast_hdbscan.numba_kdtree import parallel_tree_query
from sklearn.neighbors import KDTree

CPU times: user 11 s, sys: 395 ms, total: 11.4 s
Wall time: 9.07 s


In [2]:
# Identifier of the region whose files are loaded throughout this notebook.
region_id = 69300

# Input directories; all three live under the same processed-data root and
# keep their trailing slash so file names can be appended directly.
tessellations_dir, chars_dir, graph_dir = (
    f"/data/uscuni-ulce/processed_data/{subdir}/"
    for subdir in ("tessellations", "chars", "neigh_graphs")
)

In [3]:
# Load the per-region characters table and keep only rows whose index is
# non-negative.
# NOTE(review): presumably negative index values mark rows that should not be
# clustered — confirm against the data preparation step.
X_train = pd.read_parquet(f"{chars_dir}primary_chars_{region_id}.parquet").loc[
    lambda frame: frame.index >= 0
]


In [7]:
# Quick look at a single column of the loaded table.
X_train.loc[:, "sdbAre"]

0         127.597664
1          52.955441
2          42.766552
3         125.082615
4          17.892685
             ...    
299059     75.225865
299060     99.143049
299061    208.004116
299062     75.241771
299063    116.559504
Name: sdbAre, Length: 299064, dtype: float64

In [13]:
from fast_hdbscan.boruvka import parallel_boruvka
from fast_hdbscan.cluster_trees import (
    cluster_tree_from_condensed_tree,
    condense_tree,
    extract_eom_clusters,
    get_cluster_label_vector,
    mst_to_linkage_tree,
)
from fast_hdbscan.numba_kdtree import kdtree_to_numba
from sklearn.neighbors import KDTree

In [36]:
# Used below both as `min_cluster_size` when condensing the tree and as
# `min_samples` for the Boruvka MST construction.
min_cluster_size = 1_000

In [50]:
# Replace missing values with finite numbers so the KD-tree can be built.
# Note that `np.nan_to_num` maps NaN to 0.0 and also maps +/-inf to the
# largest/smallest finite float, which may matter for distance computations.
vals = np.nan_to_num(X_train)
X_train = pd.DataFrame(
    data=vals,
    index=X_train.index,
    columns=X_train.columns,
)

In [51]:
# Cluster every column of `X_train` on its own (one-dimensional input) and
# collect the resulting label vectors in `res`, in column order.
res = []
for i, column in enumerate(X_train.columns):
    # KD-tree over just this one column, converted to the numba representation.
    sklearn_tree = KDTree(X_train.iloc[:, [i]])
    numba_tree = kdtree_to_numba(sklearn_tree)

    # Spanning-tree edges, ordered by their third column before building the
    # linkage tree (presumably the edge weight — confirm in fast_hdbscan).
    edges = parallel_boruvka(numba_tree, min_samples=min_cluster_size)
    sorted_mst = edges[edges[:, 2].argsort()]
    linkage_tree = mst_to_linkage_tree(sorted_mst)

    # Condense the hierarchy and select clusters with the EOM extraction.
    condensed_tree = condense_tree(linkage_tree, min_cluster_size=min_cluster_size)
    cluster_tree = cluster_tree_from_condensed_tree(condensed_tree)
    selected_clusters = extract_eom_clusters(
        condensed_tree, cluster_tree, allow_single_cluster=False
    )
    clusters = get_cluster_label_vector(condensed_tree, selected_clusters, 0)
    res.append(clusters)

    # Number of distinct label values for this column; any special label
    # (e.g. a noise label, if one is emitted) is counted as well.
    print(column, len(np.unique(clusters)))


sdbAre 11
sdbPer 4
sdbCoA 1
ssbCCo 46
ssbCor 16
ssbSqu 37
ssbERI 16
ssbElo 7
ssbCCM 3
ssbCCD 11
stbOri 43
mtbSWR 5
libNCo 6
ldbPWL 26
ltcBuA 27
mtbAli 3
mtbNDi 5
ltbIBD 3
stbCeA 45
stbSAl 28
sdsLen 22
sssLin 59
ldsMSL 24
ldsRea 129
ldsAre 52
sisBpM 10
sdsSPW 30
sdsSPO 77
sdsSWD 59
mtdDeg 6
lcdMes 123
linP3W 111
linP4W 107
linPDE 122
lcnClo 16
lddNDe 61
linWID 55
ldsCDL 52
xcnSCl 21
mtdMDi 43
sddAre 62
midRea 94
midAre 20
stcOri 4
sdcLAL 3
sdcAre 3
sscCCo 3
sscERI 3
mtcWNe 3
mdcAre 4
ltcWRB 4
sicCAR 3
stcSAl 55
ldkAre 58
ldkPer 70
lskCCo 85
lskERI 73
lskCWA 71
ltkOri 81
ltkWNB 71
likWBB 74
sdsAre 53
likWCe 87


In [55]:
from core.cluster_validation import get_feature_importance, generate_detailed_clusters

In [56]:
# Tessellation geometries for the same region as the characters table.
tessellation = gpd.read_parquet(f"{tessellations_dir}tessellation_{region_id}.parquet")

In [57]:
# Example cluster labels derived from the tessellation.
# NOTE(review): presumably these are hand-labelled reference areas — confirm
# in core.cluster_validation.generate_detailed_clusters.
tess_groups = generate_detailed_clusters(tessellation, include_random_sample=False)

# Drop labelled cells that are not present in the training table.
tess_groups = tess_groups.loc[tess_groups.index.isin(X_train.index)]

# Row positions within `X_train` of the labelled cells, in `tess_groups` order.
tess_groups_ilocs = (
    pd.Series(np.arange(len(X_train)), index=X_train.index)
    .loc[tess_groups.index]
    .values
)

from sklearn.metrics import davies_bouldin_score
def check_score(data, example_clusters):
    """Score how well ``example_clusters`` separate the rows of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table. Rows are matched to cluster labels through the index.
    example_clusters : pandas.Series
        Cluster label per observation, indexed by the same kind of labels as
        ``data``. Entries whose index label is absent from ``data`` are ignored.

    Returns
    -------
    The value returned by ``davies_bouldin_score`` for the matched rows of
    ``data`` and their labels.
    """
    # Only labels that actually have a row in `data` can be scored.
    groups = example_clusters[example_clusters.index.isin(data.index)]
    # Select the rows by label directly; this yields the same rows, in the
    # same order, as translating labels to positions and using `iloc`, without
    # allocating a positional lookup Series on every call.
    return davies_bouldin_score(data.loc[groups.index], groups.values)

# Score of the example clusters on the full feature table.
check_score(data=X_train, example_clusters=tess_groups)

4.3201251547917

In [58]:
# Feature importances computed on the labelled rows only, paired with their
# example cluster labels.
imps = get_feature_importance(
    X_train.iloc[tess_groups_ilocs],
    tess_groups.values,
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 co

In [70]:
# Number of leading rows of `imps` to aggregate.
cutoff = 25

# Column-wise total of every second column (the `*_vals` columns) over the
# first `cutoff` rows.
imps.iloc[:, 1::2].head(cutoff).sum(axis=0)

cluster_commie blocks vn_vals        0.904575
cluster_fancy commie blocks_vals     0.920511
cluster_holyne_vals                  0.863776
cluster_housing blocks_vals          0.832241
cluster_housing houses_vals          0.820851
cluster_josefov_vals                 0.830666
cluster_karlin IT offices_vals       0.888425
cluster_karlin old_vals              0.916720
cluster_karlin river offices_vals    0.914667
cluster_karlin square_vals           0.836318
cluster_mala strana_vals             0.855983
cluster_malesice_vals                0.934919
cluster_prague castle_vals           0.813969
cluster_row houses1_vals             0.903431
cluster_row houses2_vals             0.983296
cluster_smickov_vals                 0.836964
cluster_stare mesto_vals             0.833912
cluster_vinohrady blocks_vals        0.925373
cluster_vinohrady squares_vals       0.960631
cluster_vinohrady villas_vals        0.885074
dtype: float64

In [73]:
# Distinct entries of the even-positioned columns of `imps` within the first
# `cutoff` rows; they are compared against the column names of `X_train`.
un = np.unique(imps.iloc[:cutoff, 0::2].values)

# Columns of `X_train` that never appear among those entries, in table order.
X_train.columns[~X_train.columns.isin(un)].tolist()

['sdbCoA', 'ssbERI', 'mtdDeg', 'sscERI']