In [165]:
%%time

import glob

import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import StandardScaler

from fast_hdbscan.numba_kdtree import kdtree_to_numba
from fast_hdbscan.numba_kdtree import parallel_tree_query
from sklearn.neighbors import KDTree

CPU times: user 16 μs, sys: 0 ns, total: 16 μs
Wall time: 17.6 μs


In [166]:
region_id = 69300

tessellations_dir = '/data/uscuni-ulce/processed_data/tessellations/'
chars_dir = "/data/uscuni-ulce/processed_data/chars/"
graph_dir = "/data/uscuni-ulce/processed_data/neigh_graphs/"
val_path = '../data/prague_clusters.parquet'

In [167]:
# region_id = 'freiburg'
# buildings_dir = streets_dir = enclosures_dir = tessellations_dir = graph_dir = '../data/freiburg/'
# chars_dir = '../data/freiburg/chars/'
# val_path = '../data/fbg_cluster_validation.pq'

In [168]:
graph = read_parquet(graph_dir + f"tessellation_graph_{region_id}_knn1.parquet")

In [169]:
graph.cardinalities.describe()

count    300997.000000
mean          6.744223
std           1.972381
min           1.000000
25%           6.000000
50%           7.000000
75%           8.000000
max          71.000000
Name: cardinalities, dtype: float64

In [170]:
from core.cluster_validation import print_distance, generate_neigbhourhood_groups, generate_detailed_clusters

In [171]:
tessellation = gpd.read_parquet(
        tessellations_dir + f"tessellation_{region_id}.parquet"
)

In [172]:
X_train = pd.read_parquet(chars_dir + f'primary_chars_{region_id}.parquet')



X_train = X_train[X_train.index >= 0]



spatial_lag = 3

# lag = pd.read_parquet(f'/data/uscuni-ulce/processed_data/context_data/context_chars_{region_id}_lag_{spatial_lag}.parquet')

lag = pd.read_parquet(f'/data/uscuni-ulce/processed_data/context_data/unprocessed_context_chars_{region_id}_lag_{spatial_lag}_sw.parquet')


X_train = X_train.join(lag[[c for c in lag.columns if '_median' in c]], how='inner')

# X_train = X_train.join(lag, how='inner')


In [173]:
# for c in X_train.columns:
#     X_train[c] = X_train[c].clip(*np.percentile(X_train[c], [5, 95]))

In [174]:
to_drop = ['stcSAl',
 'ltkOri',
 'stbOri',
 'stcOri',
 'stbCeA',

#not in barcelona
 # 'ltcBuA', 'midRea', 'midAre', 'likWBB'
]


all_drop = []
for c in to_drop:
    all_drop += X_train.columns[X_train.columns.str.contains(c)].tolist()


X_train = X_train.drop(all_drop, axis=1)

In [175]:

# vals = StandardScaler().fit_transform(X_train)
# X_train = pd.DataFrame(vals, columns=X_train.columns, index=X_train.index)

# vals = np.nan_to_num(X_train)
# X_train = pd.DataFrame(vals, columns=X_train.columns, index=X_train.index)

In [176]:
# t1 = X_train[[c for c in X_train.columns if '_' not in c]]
# X_train = X_train[[c for c in X_train.columns if '_median' in c]]

# X_train = t1.join(t2)
# X_train.shape

In [177]:
# X_train = X_train.drop(stats.columns[stats.loc['std'] == 0], axis=1)

In [178]:
X_train.shape

(299064, 116)

In [179]:
tess_groups = generate_detailed_clusters(tessellation,
                                         include_random_sample=False, path=val_path)
tess_groups = tess_groups[tess_groups.index.isin(X_train.index)]
tess_groups_ilocs = (
    pd.Series(np.arange(len(X_train)), index=X_train.index)
    .loc[tess_groups.index]
    .values
)

from sklearn.metrics import davies_bouldin_score

def check_score(data, example_clusters):
    groups = example_clusters[example_clusters.index.isin(data.index)]
    groups_ilocs = (
        pd.Series(np.arange(len(data)), index=data.index).loc[groups.index].values
    )
    return davies_bouldin_score(data.iloc[groups_ilocs], groups.values)

# check_score(X_train, tess_groups)

In [180]:
# tessellation.loc[tess_groups.index].explore()

In [181]:
from core.cluster_validation import print_distance
print_distance( pd.DataFrame(X_train.loc[tess_groups.index]).groupby(tess_groups.values).mean(), metric='euclidean')

Unnamed: 0,commie blocks vn,fancy commie blocks,holyne,housing blocks,housing houses,josefov,karlin IT offices,karlin old,karlin river offices,karlin square,mala strana,malesice,prague castle,row houses1,row houses2,smickov,stare mesto,vinohrady blocks,vinohrady squares,vinohrady villas
commie blocks vn,0.0,246723.864323,279841.506816,141748.614512,191851.278703,226638.249453,412708.941461,446018.290232,630313.317587,163512.859772,567142.348168,1608165.459634,163732.005086,271974.315302,366412.880579,163644.359544,212516.304979,363022.890532,221220.660531,249402.226468
fancy commie blocks,246723.864323,0.0,244626.830892,257280.800885,367746.118313,452101.386765,230622.303244,313115.43382,457481.201477,298650.467435,511598.147435,1364417.984585,116035.120703,271875.528668,277468.897355,288591.989253,437970.448035,184556.4312,342862.951859,285116.733629
holyne,279841.506816,244626.830892,0.0,386302.957733,467679.478415,496778.66024,455471.533828,550603.947569,409284.016281,421331.619034,296272.531695,1454648.453422,173886.547478,469388.355799,507184.881759,419013.250727,483412.646283,185757.23303,480468.576888,466553.970719
housing blocks,141748.614512,257280.800885,386302.957733,0.0,121771.573854,219132.146366,361262.858778,357761.52142,693500.72907,89313.769548,678819.232307,1596236.888619,232480.290791,163934.54458,270820.376965,71298.923864,206646.144352,422008.676496,115350.590409,135729.002202
housing houses,191851.278703,367746.118313,467679.478415,121771.573854,0.0,122304.288579,453849.869098,427828.347278,794509.381182,97153.710921,756929.166901,1701476.484388,326403.158284,214038.12468,335563.09396,90874.058434,113416.415924,523851.673401,77401.089793,173984.201434
josefov,226638.249453,452101.386765,496778.66024,219132.146366,122304.288579,0.0,563378.117936,548771.018194,853283.649218,199487.808257,768787.457038,1805341.967209,385665.959788,335054.653062,456597.607367,203464.603121,14929.993889,588095.040407,197771.621527,295601.381151
karlin IT offices,412708.941461,230622.303244,455471.533828,361262.858778,453849.869098,563378.117936,0.0,181190.571944,557796.771855,385382.280811,692874.621462,1264555.069808,319406.054439,268832.951474,188293.263798,370506.396407,550663.835031,337066.215163,398999.251295,306934.619048
karlin old,446018.290232,313115.43382,550603.947569,357761.52142,427828.347278,548771.018194,181190.571944,0.0,668673.754728,364086.655378,811443.108971,1327668.546453,399449.890275,216357.431856,97841.034062,349317.803887,538481.902652,455917.843438,356252.696248,254357.891435
karlin river offices,630313.317587,457481.201477,409284.016281,693500.72907,794509.381182,853283.649218,557796.771855,668673.754728,0.0,715813.001212,391064.08642,1138919.599562,488159.617721,708426.034889,681136.106483,718857.810609,839600.464407,293584.429826,780166.891366,724399.795187
karlin square,163512.859772,298650.467435,421331.619034,89313.769548,97153.710921,199487.808257,385382.280811,364086.655378,715813.001212,0.0,715950.701042,1624477.01901,270811.957753,161624.539319,276181.66824,37888.082127,188517.884995,454108.642373,86125.480664,126574.569431


In [182]:
from scipy.spatial.distance import pdist, cdist
for i, g in X_train.loc[tess_groups.index].groupby(tess_groups.values):
    print(i, np.mean(pdist(g)))

commie blocks vn nan
fancy commie blocks nan
holyne nan
housing blocks nan
housing houses nan
josefov nan
karlin IT offices nan
karlin old nan
karlin river offices nan
karlin square nan
mala strana nan
malesice nan
prague castle nan
row houses1 nan
row houses2 221643.2893416968
smickov nan
stare mesto nan
vinohrady blocks 213786.26425387833
vinohrady squares nan
vinohrady villas nan


In [183]:
# tessellation.loc[tess_groups.index].explore(column=tess_groups.values, categorical=True)

In [184]:
# np.min(cdist(X_train.loc[tess_groups[tess_groups == 'josefov'].index], 
#              X_train.loc[tess_groups[tess_groups == 'stare mesto'].index]))

In [185]:
mean_clusters = pd.DataFrame(X_train.loc[tess_groups.index]).groupby(tess_groups.values).mean()

In [186]:
(mean_clusters.loc['josefov'] - mean_clusters.loc['stare mesto']).abs().sort_values(ascending=False).iloc[:10]

ldkAre           8175.171864
ldkAre_median    8165.589945
ldsAre           6909.648682
ldsAre_median    4032.116312
midAre_median    3100.751373
mdcAre           1897.801800
sdsAre           1771.245909
sdsAre_median    1723.851031
sddAre_median    1716.373871
sddAre           1260.038013
dtype: float64

In [187]:
from core.utils import used_keys
used_keys['lcnClo']

'local closeness of street network'

In [188]:
# training_data = X_train.loc[tess_groups.index]
# tess_groups_ilocs = (
#     pd.Series(np.arange(len(training_data)), index=training_data.index)
#     .loc[tess_groups.index]
#     .values
# )
# training_data.shape

In [189]:
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import fcluster
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import davies_bouldin_score
from core.cluster_validation import get_linkage_matrix

In [190]:
q1 = read_parquet(graph_dir + f"tessellation_graph_{region_id}_knn1.parquet")

# clustering_graph = q1.higher_order(k=3, lower_order=True, diagonal=True).subgraph(X_train.index.values)

clustering_graph = q1.copy()

In [191]:
graph_labels = q1.subgraph(X_train.index.values).component_labels
graph_labels.value_counts()

component labels
444    166757
97      18934
550      6503
443      3902
438      3828
        ...  
605         1
101         1
602         1
104         1
125         1
Name: count, Length: 821, dtype: int64

In [192]:
clustering_graph = clustering_graph.subgraph(graph_labels[graph_labels == 444].index.values)
core_ids = clustering_graph.unique_ids


In [193]:
# clustering_graph = q1.subgraph(q1.component_labels[q1.component_labels == 25].index.values)
# clustering_graph = clustering_graph.subgraph(clustering_graph.unique_ids[clustering_graph.unique_ids >= 0])
# core_ids = clustering_graph.unique_ids

In [194]:
training_data = X_train[X_train.index.isin(core_ids)]

In [195]:


# training_data = training_data[[c for c in training_data.columns if '_' not in c]]


In [196]:
# t1 = training_data[[c for c in training_data.columns if '_' not in c]]
# t2 = training_data[[c for c in training_data.columns if '_median' in c]]

# training_data = t1.join(t2)

In [197]:
# training_data = X_train[X_train.index >=0]
# clustering_graph = graph.subgraph(X_train.index)

In [198]:
training_data.shape

(166757, 116)

In [199]:
vals = StandardScaler().fit_transform(training_data)
training_data = pd.DataFrame(vals, columns=training_data.columns, index=training_data.index)

vals = np.nan_to_num(training_data)
training_data = pd.DataFrame(vals, columns=training_data.columns, index=training_data.index)

stats = training_data.describe()
training_data = training_data.drop(stats.columns[stats.loc['std'] == 0], axis=1)

In [200]:
%%time
clusterer = AgglomerativeClustering(
    linkage='ward',
    # metric='cityblock',
                                                  
    connectivity = clustering_graph.transform('B').sparse, 
    
                                    # connectivity=q1.subgraph(X_train.index.values).transform('B').sparse, 
                                    compute_full_tree=True,
                                    compute_distances=True)
model = clusterer.fit(training_data)

CPU times: user 5.93 s, sys: 76 ms, total: 6.01 s
Wall time: 6.01 s


In [201]:
linkage_matrix = get_linkage_matrix(model)

In [202]:
# fix, ax = plt.subplots(figsize=(40,40))
# # Plot the corresponding dendrogram
# _ = dendrogram(linkage_matrix, truncate_mode="level", p=5, ax=ax)

In [203]:
from sklearn.metrics import calinski_harabasz_score

In [204]:
tess_groups = generate_detailed_clusters(tessellation,
                                         include_random_sample=False, path=val_path)
tess_groups = tess_groups[tess_groups.index.isin(training_data.index)]
tess_groups_ilocs = (
    pd.Series(np.arange(len(training_data)), index=training_data.index)
    .loc[tess_groups.index]
    .values
)

In [205]:
first = linkage_matrix[0, 2]
last = linkage_matrix[-1, 2]
step = (last - first) / 25

In [206]:
res = []
i = 0
for i in range(0, 25):

    t = first + step *  (i + 1)
    if t >= linkage_matrix[-1, 2]:
        break
    
    r = fcluster(linkage_matrix, t=t, criterion='distance')
    r = pd.Series(r, index=training_data.index)
    ssplits = clustering_graph.describe(r, statistics=['nunique'])['nunique']

    res.append((t,
          adjusted_rand_score(tess_groups.values, r.iloc[tess_groups_ilocs]),
          (ssplits > 1).sum() / ssplits.shape[0],
          davies_bouldin_score(training_data, r),
          calinski_harabasz_score(training_data, r)
         ))
pd.DataFrame(res, columns = ['cutoff', 'rand', 'ssplits', 'db_score', 'ch_score']).set_index('cutoff')

Unnamed: 0_level_0,rand,ssplits,db_score,ch_score
cutoff,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
58.786244,0.327807,0.351559,2.735153,244.685359
117.557434,0.518663,0.188202,3.705065,476.598565
176.328624,0.567873,0.114184,4.29337,743.956229
235.099814,0.60494,0.079457,4.612556,1065.064517
293.871005,0.539077,0.062204,5.396985,1340.57011
352.642195,0.485292,0.047866,5.804665,1702.922502
411.413385,0.437567,0.042577,5.71907,1921.071632
470.184575,0.309201,0.034733,5.144289,2218.160321
528.955766,0.309201,0.031267,5.914264,2468.587006
587.726956,0.204184,0.024383,6.510962,3971.260264


In [207]:
# res = []
# for t in range(5, 25, 1):

#     if t >= linkage_matrix[-1, 2]:
#         break
    
#     r = fcluster(linkage_matrix, t=t, criterion='distance')
#     r = pd.Series(r, index=training_data.index)
#     ssplits = clustering_graph.describe(r, statistics=['nunique'])['nunique']

#     res.append((t,
#           adjusted_rand_score(tess_groups.values, r.iloc[tess_groups_ilocs]),
#           (ssplits > 1).sum() / ssplits.shape[0],
#           davies_bouldin_score(training_data, r),
#           calinski_harabasz_score(training_data, r)
#          ))
# pd.DataFrame(res, columns = ['cutoff', 'rand', 'ssplits', 'db_score', 'ch_score']).set_index('cutoff')

In [208]:
# try hdbscan extraction
from fast_hdbscan.boruvka import parallel_boruvka
from fast_hdbscan.cluster_trees import (
    cluster_tree_from_condensed_tree,
    condense_tree,
    extract_eom_clusters,
    get_cluster_label_vector,
    mst_to_linkage_tree,
)
from fast_hdbscan.numba_kdtree import kdtree_to_numba
from sklearn.neighbors import KDTree

In [209]:
res = []

for min_cluster_size in range(25, 500, 25):


    condensed_tree = condense_tree(linkage_matrix, 
                                   min_cluster_size=min_cluster_size)
    cluster_tree = cluster_tree_from_condensed_tree(condensed_tree)
    selected_clusters = extract_eom_clusters(
        condensed_tree, cluster_tree, allow_single_cluster=False
    )
    r = get_cluster_label_vector(condensed_tree, selected_clusters, 0)


    r = pd.Series(r, index=training_data.index)
    ssplits = clustering_graph.describe(r, statistics=['nunique'])['nunique']

    res.append((min_cluster_size,
          adjusted_rand_score(tess_groups.values, r.iloc[tess_groups_ilocs]),
          (ssplits > 1).sum() / ssplits.shape[0],
          davies_bouldin_score(training_data, r),
          calinski_harabasz_score(training_data, r)
         ))

pd.DataFrame(res, columns = ['min__cluster_size', 'rand', 'ssplits', 'db_score', 'ch_score']).set_index('min__cluster_size')

Unnamed: 0_level_0,rand,ssplits,db_score,ch_score
min__cluster_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
25,0.118227,0.60067,2.569605,103.302489
50,0.187996,0.452341,2.974373,148.520132
75,0.288916,0.377226,3.271861,190.306123
100,0.393281,0.323926,3.567746,222.257677
125,0.403216,0.297211,3.746784,251.881602
150,0.453353,0.270945,3.930556,281.574891
175,0.506431,0.252133,4.105786,305.14283
200,0.522036,0.238725,4.240837,320.041909
225,0.502539,0.228176,4.391548,337.84396
250,0.51521,0.22131,4.494005,346.930269


In [210]:


condensed_tree = condense_tree(linkage_matrix, 
                               min_cluster_size=275)
cluster_tree = cluster_tree_from_condensed_tree(condensed_tree)
selected_clusters = extract_eom_clusters(
    condensed_tree, cluster_tree, allow_single_cluster=False
)
clusters = get_cluster_label_vector(condensed_tree, selected_clusters, 0)

In [211]:
plotting = tessellation.loc[training_data.index].reset_index()

In [212]:
%%time
import lonboard
# plotting = tessellation[tessellation.index.isin(X_train.index)].copy()
layer = lonboard.SolidPolygonLayer.from_geopandas(plotting, opacity=.08)



CPU times: user 2.87 s, sys: 292 ms, total: 3.16 s
Wall time: 3.16 s


In [213]:
from sidecar import Sidecar
sc = Sidecar(title='Clusters')
m = lonboard.Map(layer, basemap_style=lonboard.basemap.CartoBasemap.Positron)
with sc:
    display(m)

In [214]:
from core.cluster_validation import get_color

In [215]:
clusters = fcluster(linkage_matrix, t=101, criterion='distance')

In [216]:


condensed_tree = condense_tree(linkage_matrix, 
                               min_cluster_size=102)
cluster_tree = cluster_tree_from_condensed_tree(condensed_tree)
selected_clusters = extract_eom_clusters(
    condensed_tree, cluster_tree, allow_single_cluster=False
)
clusters = get_cluster_label_vector(condensed_tree, selected_clusters, 0)

In [217]:
# np.unique(clusters, return_counts=True)

In [218]:
layer.get_fill_color = get_color(clusters)

In [158]:
new_data = training_data.groupby(clusters).mean()

In [157]:
from core.cluster_validation import get_feature_importance
from core.utils import used_keys

In [69]:
clusters_subset = [11597, 11615, 17742]
clusters_subset = np.where(np.isin(clusters, clusters[clusters_subset]))

In [70]:
imps = get_feature_importance(training_data.iloc[clusters_subset], clusters[clusters_subset])

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [72]:
imps.loc[:10, [c for c in imps.columns if '_vals' not in c]]

Unnamed: 0,cluster_48,cluster_66,cluster_69
0,linWID_higher,lcdMes_higher,ssbCCD_lower
1,ldsMSL_lower,linPDE,ltkOri_lower
2,lddNDe,linP3W_higher,ssbSqu_lower
3,ltkOri_lower,mtbAli_median,lcdMes_median
4,ldsMSL,lskCWA_lower,linWID_higher
5,ldsMSL_median,lskERI_median,sicCAR_lower
6,midRea_median,ltkWNB_median,lskCWA_lower
7,ldkAre_higher,linWID_lower,stcOri_lower
8,lskCCo_lower,mtbNDi_higher,ssbSqu_median
9,ssbCCD_lower,linPDE_lower,sdcLAL_median


In [77]:
used_keys['ltkOri']

'orientation of enclosure'

In [112]:
imps = get_feature_importance(training_data, clusters)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    0.2s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    0.2s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    0.2s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    0.2s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 co

In [119]:
imps.loc[:10, [c for c in imps.columns if '_vals' not in c]]

Unnamed: 0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7
0,ldkPer,lcdMes,libNCo,lskCWA,ldsAre,ltkWNB,likWBB
1,ltkWNB,likWBB,likWBB,likWBB,midAre,ldkPer,lcdMes
2,lskCCo,ltkWNB,sdsSPO,ltkWNB,ltkOri,lskCCo,ltkWNB
3,ltkOri,lskERI,ldbPWL,lskCCo,linP4W,lskCWA,linWID
4,lskCWA,lskCWA,ltcBuA,ldkAre,sdsSPO,ltkOri,ltkOri
5,ldkAre,ldkAre,sicCAR,lskERI,ldsMSL,likWBB,lskCWA
6,likWBB,linP4W,mtbSWR,ldsAre,lddNDe,ldkAre,ldkAre
7,lcdMes,ldkPer,ssbSqu,lddNDe,lcdMes,lskERI,lddNDe
8,lskERI,ltkOri,lskERI,midRea,ldkPer,linP3W,linP3W
9,linP4W,linPDE,sdbPer,linPDE,likWBB,ldsAre,lcnClo


In [122]:
used_keys['lcdMes']

'local meshedness of street network'

In [None]:
imps[[c for c in imps.columns if '_vals' in c]].cumsum(axis=1)

In [96]:
josefov_joins = []

josefov_joins.append(np.isin(linkage_matrix[:, 0], 
                             tess_groups_ilocs[tess_groups == 'josefov']))
josefov_joins.append(np.isin(linkage_matrix[:, 1], 
                             tess_groups_ilocs[tess_groups == 'josefov']))


In [97]:
indxs = linkage_matrix[josefov_joins[0] | josefov_joins[1]]
indxs = np.union1d(indxs[:, 0], indxs[:, 1])
indxs = indxs[indxs <= X_train.shape[0]]

In [150]:
indxs = linkage_matrix[linkage_matrix[:, 2] <= 2]
indxs = np.union1d(indxs[:, 0], indxs[:, 1])
indxs = indxs[indxs < X_train.shape[0]]
indxs.shape

(64349,)

In [151]:
plotting = tessellation.loc[X_train.iloc[indxs].index]

In [42]:
cluster_means = training_data.groupby(clusters).mean()

In [68]:
c1 = 6
c2 = 10

(cluster_means.loc[c1] - cluster_means.loc[c2]).sort_values(ascending=False)

libNCo           6.284668
libNCo_median    5.846288
libNCo_higher    4.233946
linPDE_higher    3.014914
linPDE           2.122206
                   ...   
lcnClo          -1.414220
linWID_lower    -1.422183
linP3W          -1.489277
lcnClo_lower    -1.702326
linP3W_lower    -2.339139
Length: 248, dtype: float64

In [72]:
from core.utils import used_keys
used_keys['libNCo']

'number of courtyards within adjacent buildings'

In [169]:
bgraph = read_parquet(graph_dir + f"building_graph_{region_id}_knn1.parquet")

In [170]:
buildings_dir = '/data/uscuni-ulce/processed_data/buildings/'

buildings = gpd.read_parquet(
        buildings_dir + f"buildings_{region_id}.parquet"

)

In [178]:
buildings

Unnamed: 0,index,id,geometry
0,0,v0.1-CZE.12.2_1-35164,"POLYGON ((4614847.626 2975218.938, 4614848.235..."
1,1,v0.1-CZE.12.2_1-35123,"POLYGON ((4615276.357 2976034.184, 4615282.866..."
2,2,v0.1-CZE.12.2_1-35159,"POLYGON ((4615315.503 2975986.2, 4615322.056 2..."
3,3,v0.1-CZE.12.2_1-35166,"POLYGON ((4615222.339 2976016.91, 4615224.582 ..."
4,4,v0.1-CZE.12.2_1-35228,"POLYGON ((4615300.348 2975924.258, 4615301.6 2..."
...,...,...,...
299059,299060,v0.1-CZE.13.3_1-13696,"POLYGON ((4618611.169 3033535.197, 4618623.01 ..."
299060,299061,v0.1-CZE.13.3_1-13674,"POLYGON ((4618611.989 3033568.153, 4618617.119..."
299061,299062,v0.1-CZE.13.3_1-13591,"POLYGON ((4618614.831 3033550.704, 4618628.289..."
299062,299063,v0.1-CZE.13.3_1-13328,"POLYGON ((4618625.628 3033512.926, 4618625.634..."


In [181]:
buildings = buildings.join(X_train, how='inner').drop(['index', 'id'], axis=1)

In [185]:
r = buildings.dissolve(bgraph.component_labels, aggfunc='mean')

In [189]:
plotting = r

In [190]:
%%time
import lonboard
# plotting = tessellation[tessellation.index.isin(X_train.index)].copy()
layer = lonboard.PolygonLayer.from_geopandas(plotting, opacity=.08)



CPU times: user 1.65 s, sys: 178 ms, total: 1.83 s
Wall time: 1.82 s


In [191]:
from sidecar import Sidecar
sc = Sidecar(title='Clusters')
m = lonboard.Map(layer, basemap_style=lonboard.basemap.CartoBasemap.Positron)
with sc:
    display(m)

In [156]:
enclosures = gpd.read_parquet(f"/data/uscuni-ulce/processed_data/enclosures/enclosure_{region_id}.parquet")
encl_counts = tessellation.groupby('enclosure_index').count()
encl_counts.columns = ['tessellation']
enclosures['lieWCe'] = encl_counts['tessellation'] / enclosures.geometry.area

7        0.000325
8        0.000224
11       0.000030
12       0.000122
14       0.000638
           ...   
25157    0.000100
25158    0.000029
25159    0.000101
25160    0.000399
25161    0.000081
Length: 15958, dtype: float64

In [251]:
enclosures['lieWCe'] 

7        0.000325
8        0.000224
11       0.000030
12       0.000122
14       0.000638
           ...   
25157    0.000100
25158    0.000029
25159    0.000101
25160    0.000399
25161    0.000081
Name: lieWCe, Length: 15958, dtype: float64

In [164]:
encl_counts['tessellation']

enclosure_index
7        199
8         52
11         1
12        25
14       962
        ... 
25157      9
25158      1
25159     13
25160     15
25161      3
Name: tessellation, Length: 15935, dtype: int64

In [None]:
# Measure weighted cells within enclosure
merged = enclosures[['eID', 'ldeAre']].merge(encl_counts[['tessellation']], how='left', on='eID')
enclosures['lieWCe'] = merged['tessellation'] / merged['ldeAre']

In [1]:
import geopandas as gpd

In [11]:
gpd.read_parquet('/data/uscuni-ulce/cadastre_buildings_raw/buildings_germany_mv_0.pq').head()

Unnamed: 0,gml_id,identifier,oid,aktualit,gebnutzbez,funktion,gfkzshh,gmdschl,lagebeztxt,name,rellage,geometry
0,DEMVAL040000ACAEBL,urn:adv:oid:DEMVAL040000ACAEBL,DEMVAL040000ACAEBL,2014-11-04Z,Gebäude,Wohngebäude mit Handel und Dienstleistungen,31001_1120,13004000,,,,"MULTIPOLYGON (((33263213.163 5948562.945, 3326..."
1,DEMVAL040000ACAdBL,urn:adv:oid:DEMVAL040000ACAdBL,DEMVAL040000ACAdBL,2014-11-04Z,Gebäude,Wohngebäude mit Handel und Dienstleistungen,31001_1120,13004000,Lindenstr. 7,,,"MULTIPOLYGON (((33263198.96 5948568.636, 33263..."
2,DEMVAL040000ACAjBL,urn:adv:oid:DEMVAL040000ACAjBL,DEMVAL040000ACAjBL,2014-11-04Z,Gebäude,Schuppen,31001_2723,13004000,,,,"MULTIPOLYGON (((33263207.634 5948544.851, 3326..."
3,DEMVAL040000ACApBL,urn:adv:oid:DEMVAL040000ACApBL,DEMVAL040000ACApBL,2014-11-04Z,Gebäude,Garage,31001_2463,13004000,,,,"MULTIPOLYGON (((33263213.625 5948581.535, 3326..."
4,DEMVAL040000ACAuBL,urn:adv:oid:DEMVAL040000ACAuBL,DEMVAL040000ACAuBL,2014-11-04Z,Gebäude,Schuppen,31001_2723,13004000,,,,"MULTIPOLYGON (((33263212.119 5948542.451, 3326..."


In [12]:
gpd.read_parquet('/data/uscuni-ulce/cadastre_buildings_raw/buildings_germany_mv_10000.pq').head()

Unnamed: 0,gml_id,identifier,oid,aktualit,gebnutzbez,funktion,gfkzshh,gmdschl,lagebeztxt,name,rellage,geometry
0,DEMVAL04000dfzxjBL,urn:adv:oid:DEMVAL04000dfzxjBL,DEMVAL04000dfzxjBL,2014-11-04Z,Gebäude,Garage,31001_2463,13004000,,,,"MULTIPOLYGON (((33262346.344 5951813.908, 3326..."
1,DEMVAL04000dfzxpBL,urn:adv:oid:DEMVAL04000dfzxpBL,DEMVAL04000dfzxpBL,2014-11-04Z,Gebäude,Garage,31001_2463,13004000,,,,"MULTIPOLYGON (((33262350.432 5951814.077, 3326..."
2,DEMVAL04000dfzxuBL,urn:adv:oid:DEMVAL04000dfzxuBL,DEMVAL04000dfzxuBL,2014-11-04Z,Gebäude,Wohngebäude,31001_1000,13004000,"Seitenweg 5, 6",,,"MULTIPOLYGON (((33262357.645 5951805.808, 3326..."
3,DEMVAL04000dfzxzBL,urn:adv:oid:DEMVAL04000dfzxzBL,DEMVAL04000dfzxzBL,2014-11-04Z,Gebäude,Gebäude für Wirtschaft oder Gewerbe,31001_2000,13004000,,,,"MULTIPOLYGON (((33262377.065 5951815.087, 3326..."
4,DEMVAL04000dfzyCBL,urn:adv:oid:DEMVAL04000dfzyCBL,DEMVAL04000dfzyCBL,2014-11-04Z,Gebäude,Gebäude für Wirtschaft oder Gewerbe,31001_2000,13004000,,,,"MULTIPOLYGON (((33262421.737 5951538.959, 3326..."


In [13]:
gpd.read_parquet('/data/uscuni-ulce/cadastre_buildings_raw/buildings_germany_mv_20000.pq').head()

Unnamed: 0,gml_id,identifier,oid,aktualit,gebnutzbez,funktion,gfkzshh,gmdschl,lagebeztxt,anzahlgs,rellage,name,geometry
0,DEMVAL040000fCpABL,urn:adv:oid:DEMVAL040000fCpABL,DEMVAL040000fCpABL,2014-11-04Z,Sonstiges Bauwerk oder sonstige Einrichtung,Carport,51009_1611,13004000,,,,,"MULTIPOLYGON (((33257489.011 5951551.659, 3325..."
1,DEMVAL040000fCpDBL,urn:adv:oid:DEMVAL040000fCpDBL,DEMVAL040000fCpDBL,2014-11-04Z,Sonstiges Bauwerk oder sonstige Einrichtung,Carport,51009_1611,13004000,,,,,"MULTIPOLYGON (((33257499.872 5951546.253, 3325..."
2,DEMVAL040000fCpGBL,urn:adv:oid:DEMVAL040000fCpGBL,DEMVAL040000fCpGBL,2014-11-04Z,Sonstiges Bauwerk oder sonstige Einrichtung,Carport,51009_1611,13004000,,,,,"MULTIPOLYGON (((33257516.544 5951540.715, 3325..."
3,DEMVAL040000fCpaBL,urn:adv:oid:DEMVAL040000fCpaBL,DEMVAL040000fCpaBL,2014-11-04Z,Sonstiges Bauwerk oder sonstige Einrichtung,Carport,51009_1611,13004000,,,,,"MULTIPOLYGON (((33258550.944 5951297.674, 3325..."
4,DEMVAL040000fCpdBL,urn:adv:oid:DEMVAL040000fCpdBL,DEMVAL040000fCpdBL,2014-11-04Z,Sonstiges Bauwerk oder sonstige Einrichtung,Carport,51009_1611,13004000,,,,,"MULTIPOLYGON (((33258609.477 5951444.515, 3325..."
