In [1]:
%%time

import glob

import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler

from fast_hdbscan.numba_kdtree import kdtree_to_numba
from fast_hdbscan.numba_kdtree import parallel_tree_query
from sklearn.neighbors import KDTree

CPU times: user 11.3 s, sys: 402 ms, total: 11.7 s
Wall time: 9.31 s


In [2]:
# Region analysed throughout the notebook.
region_id = 69300

# Input directories: tessellation geometries, morphometric characters and
# neighbourhood graphs.
tessellations_dir, chars_dir, graph_dir = (
    "/data/uscuni-ulce/processed_data/tessellations/",
    "/data/uscuni-ulce/processed_data/chars/",
    "/data/uscuni-ulce/processed_data/neigh_graphs/",
)

In [3]:
# Load the region's tessellation graph with libpysal's parquet reader.
# NOTE(review): the "_knn1" suffix is part of the stored file name; its exact
# meaning is not visible here — confirm against the graph-building code.
graph = read_parquet(graph_dir + f"tessellation_graph_{region_id}_knn1.parquet")

In [4]:
graph.cardinalities.describe()

count    304554.000000
mean          6.751085
std           2.060782
min           1.000000
25%           6.000000
50%           7.000000
75%           8.000000
max          82.000000
Name: cardinalities, dtype: float64

In [5]:
# graph2 = graph.higher_order(lower_order=True, k=2, diagonal=True)
# graph2.cardinalities.describe()

In [6]:
# graph3 = graph.higher_order(lower_order=True, k=3, diagonal=True)
# graph3

In [7]:
from core.cluster_validation import print_distance, generate_neigbhourhood_groups

In [8]:
# Tessellation cells (geometries) of the selected region.
tessellation = gpd.read_parquet(f"{tessellations_dir}tessellation_{region_id}.parquet")

In [9]:
# Primary characters of the region, standardised column-wise and clipped to
# [-10, 10] to limit the influence of extreme values.
X_train = pd.read_parquet(chars_dir + f'primary_chars_{region_id}.parquet')
X_train = pd.DataFrame(
    StandardScaler().fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
).clip(-10, 10)

# Keep only rows with a non-negative index, then replace the remaining NaNs
# with zeros. Note the scaler above was fitted before this filter.
X_train = X_train.loc[X_train.index >= 0]
vals = np.nan_to_num(X_train)
X_train = pd.DataFrame(vals, columns=X_train.columns, index=X_train.index)

# Append the contextual characters computed for the chosen spatial lag.
# An inner join keeps only cells present in both tables.
# (Alternative: join only the lag columns whose name contains '_median'.)
spatial_lag = 3
lag = pd.read_parquet(
    f'/data/uscuni-ulce/processed_data/context_data/context_chars_{region_id}_lag_{spatial_lag}.parquet'
)
X_train = X_train.join(lag, how='inner')


In [10]:
X_train.shape

(286282, 236)

In [11]:
# Label the tessellation cells that are present in X_train with a named group.
# NOTE(review): generate_neigbhourhood_groups is a project helper; presumably it
# returns a label Series indexed by cell id for a hand-picked subset of cells — confirm.
tess_groups = generate_neigbhourhood_groups(tessellation[tessellation.index.isin(X_train.index)])
# Mean feature vector of each named group.
neighbourhoods = X_train.loc[tess_groups.index].groupby(tess_groups.values).mean()


In [18]:

print_distance(neighbourhoods, metric='euclidean')

Unnamed: 0,holyne,housing estate,josefov,karlin,mala strana,malesice,nusle,stare mesto,vinohrady
holyne,0.0,9.408168,20.275634,20.267738,23.634079,21.807441,11.098592,25.711941,20.476095
housing estate,9.408168,0.0,17.041665,16.715115,23.373414,21.141172,7.284706,23.800751,16.517832
josefov,20.275634,17.041665,0.0,13.518879,16.117197,28.351109,13.391806,11.559087,12.283674
karlin,20.267738,16.715115,13.518879,0.0,18.345765,23.978051,15.000365,15.933493,9.447479
mala strana,23.634079,23.373414,16.117197,18.345765,0.0,30.024914,20.639448,10.694829,22.008131
malesice,21.807441,21.141172,28.351109,23.978051,30.024914,0.0,22.025995,31.566459,26.861895
nusle,11.098592,7.284706,13.391806,15.000365,20.639448,22.025995,0.0,20.765271,14.263284
stare mesto,25.711941,23.800751,11.559087,15.933493,10.694829,31.566459,20.765271,0.0,18.574415
vinohrady,20.476095,16.517832,12.283674,9.447479,22.008131,26.861895,14.263284,18.574415,0.0


In [19]:
from scipy.spatial.distance import pdist

In [20]:
# Median pairwise distance (pdist's default metric) between the members of
# each named group — a measure of within-group spread.
grouped_members = X_train.loc[tess_groups.index].groupby(tess_groups.values)
for group_name, members in grouped_members:
    print(group_name, np.median(pdist(members)))

holyne 14.03218150641602
housing estate 12.383752002444172
josefov 14.71117622126815
karlin 16.576160420588984
mala strana 23.47845901881677
malesice 18.480590087666315
nusle 15.86275441337353
stare mesto 19.204272279843416
vinohrady 12.115487750287807


In [158]:
# tessellation.loc[tess_groups.index].explore(column=tess_groups.values, categorical=True)

In [21]:
# Restrict the graph to the cells that are still present in X_train.
newgraph = graph.subgraph(X_train.index.values)

In [22]:
newgraph.component_labels.value_counts()

component labels
713    58362
384    24363
691    16572
66     15002
654    12865
       ...  
853        1
851        1
30         1
29         1
28         1
Name: count, Length: 884, dtype: int64

In [23]:
# Label of the largest connected component. It is derived from the data rather
# than hard-coded, because component labels are not stable across regions or
# graph rebuilds. value_counts() sorts by size, so idxmax() picks the label
# with the most cells.
main_comp = newgraph.component_labels.value_counts().idxmax()

In [24]:
import lonboard
from sidecar import Sidecar
from libpysal.graph import Graph
from lonboard.colormap import apply_continuous_cmap
from core.cluster_validation import get_color

In [25]:
# Tessellation cells that belong to the selected component (main_comp).
plotting = tessellation.loc[newgraph.component_labels[newgraph.component_labels == main_comp].index]

In [26]:
%%time
layer = lonboard.PolygonLayer.from_geopandas(plotting, opacity=.15)



CPU times: user 1.12 s, sys: 95.9 ms, total: 1.22 s
Wall time: 1.22 s


Create a Sidecar view (assumes JupyterLab) for a more comfortable experience.

In [27]:


sc = Sidecar(title='buildings')

Create a Map object

In [28]:
m = lonboard.Map(layer)

Display the map within the Sidecar panel.

In [29]:
with sc:
    display(m)

In [30]:
# Feature rows of the cells that belong to the selected component (main_comp).
plot_data = X_train.loc[newgraph.component_labels[newgraph.component_labels == main_comp].index]

In [31]:
# Graph restricted to the cells in plot_data.
plot_graph = newgraph.subgraph(plot_data.index)

In [32]:
# Edge list of the component graph as two aligned id arrays: element k of
# `focals` and element k of `neighbours` are the two endpoints of edge k.
# The public `adjacency` property is used instead of the private `_adjacency`
# attribute, so the cell does not depend on libpysal internals.
adjacency_index = plot_graph.adjacency.index
focals = adjacency_index.get_level_values(0).values
neighbours = adjacency_index.get_level_values(1).values

In [33]:
# Feature rows for both endpoints of every edge; row k of each array belongs
# to edge k because `focals` and `neighbours` come from the same edge index.
fvals = plot_data.loc[focals].values
nvals = plot_data.loc[neighbours].values

In [34]:
# Euclidean distance in feature space between the two endpoints of each edge.
# np.square is used instead of np.pow, which only exists in NumPy >= 2.0 and
# raises AttributeError on older installations.
distances = np.sqrt(np.square(fvals - nvals).sum(axis=1))

In [35]:
# %%time
# from scipy.spatial.distance import cosine
# distances = [cosine(fvals[i], nvals[i]) for i in range(fvals.shape[0])]
# distances = np.array(distances)

In [36]:
pd.Series(distances).describe().iloc[1:]

mean     6.548039
std      4.288149
min      0.000000
25%      4.305032
50%      6.484732
75%      8.661463
max     36.553408
dtype: float64

In [74]:
# Largest feature-space distance an edge may have and still be kept when the
# thresholded graph is built from `distances`.
threshold = 10




In [75]:
# Keep only the edges whose endpoints are at most `threshold` apart in
# feature space, then rebuild a graph from the surviving edges.
within_threshold = distances <= threshold
new_focals = focals[within_threshold]
new_neighbours = neighbours[within_threshold]
new_distances = distances[within_threshold]

subgraph = Graph.from_arrays(new_focals, new_neighbours, new_distances)

In [76]:
# tessellation.loc[subgraph.component_labels[subgraph.component_labels == 78865].index].explore()

In [77]:
# Number of largest components that get their own label.
ntop = 50

# Component sizes, largest first, and the ids of the ntop largest ones.
top_components = subgraph.component_labels.value_counts()
top_labels = top_components.iloc[:ntop].index.values

# Cells outside the top components are all set to -1; the top components are
# renumbered 0..ntop-1 in order of decreasing size.
labels = subgraph.component_labels.copy()
labels[~labels.isin(top_labels)] = -1
repl_dict = dict(zip(top_labels.tolist(), range(ntop)))
labels = labels.replace(repl_dict)

In [78]:
# Colour the map polygons by component rank. `layer` was built from `plotting`,
# so the labels are first aligned to its rows. A cell that lost all its edges
# in the thresholding step has no entry in `labels`; it gets -1, the value
# already used for components outside the top `ntop`.
layer.get_fill_color = get_color(labels.reindex(plotting.index, fill_value=-1))

In [130]:
def print_distance(groups, metric='euclidean'):
    """Return a styled matrix of pairwise distances between the rows of ``groups``.

    Parameters
    ----------
    groups : pandas.DataFrame or array-like
        One row per group, one column per feature.
    metric : str, default 'euclidean'
        Any metric accepted by ``scipy.spatial.distance.pdist``.

    Returns
    -------
    pandas.io.formats.style.Styler
        Square distance table with a row-wise ``BuGn`` background gradient.
        Rows and columns carry the index of ``groups`` when it has one,
        otherwise a default integer range.
    """
    from scipy.spatial.distance import pdist, squareform

    # Label the matrix with the index of the data actually passed in. The
    # previous version used the notebook-level `neighbourhoods` frame, which
    # mislabels (or fails on) any other input.
    row_labels = getattr(groups, "index", None)

    vals = squareform(pdist(groups, metric=metric))
    df = pd.DataFrame(vals, index=row_labels, columns=row_labels)
    return df.style.background_gradient(axis=1, cmap="BuGn")

In [192]:
# Build a scikit-learn KD-tree over the full feature matrix and convert it with
# fast_hdbscan's kdtree_to_numba; the converted tree is what parallel_tree_query
# receives later in the notebook.
sklearn_tree = KDTree(X_train.values)
numba_tree = kdtree_to_numba(sklearn_tree)

In [193]:
from core.cluster_validation import generate_neigbhourhood_groups

In [194]:
groups = generate_neigbhourhood_groups(tessellation)

In [195]:
karlins = groups[groups == 'karlin'].index.values

In [196]:
tess_id = 261793

In [197]:
# tessellation.loc[graph[tess_id].index.values].explore()

In [198]:
graph[tess_id].index.values

array([261783, 261788, 261789, 261793, 261796, 261800, 261819])

In [199]:
# Positional row numbers (within X_train) of the cells listed in graph[tess_id].
xilocs = np.where(X_train.index.isin(graph[tess_id].index.values))[0]

In [200]:
num_neighbours = 15

In [201]:
# For each selected row, query its num_neighbours nearest neighbours in feature
# space. Row k of both results corresponds to row k of X_train.iloc[xilocs].
nn_dists, nn_inds = parallel_tree_query(numba_tree, np.ascontiguousarray(X_train.iloc[xilocs].values), k=num_neighbours)

In [202]:
# Pick the query row that belongs to tess_id itself. Query rows follow
# X_train.iloc[xilocs], so the position is looked up instead of hard-coding 3.
# That constant is only correct while every graph neighbour of tess_id is
# present in X_train and sorts the same way.
focal_row = np.flatnonzero(X_train.index[xilocs] == tess_id)[0]
neigh_ids = nn_inds[focal_row]
neigh_dists = nn_dists[focal_row]

In [203]:
# Geometries of the nearest feature-space neighbours, annotated with their
# distance to the focal cell and their rank in the query result.
# NOTE(review): this rebinds `neighbours`, which earlier held the edge-target id
# array — confirm nothing run afterwards relies on the old value.
neighbours = tessellation.loc[X_train.iloc[neigh_ids].index].reset_index()
neighbours['nn_dists'] = neigh_dists
neighbours['nn_order'] = np.arange(num_neighbours)

In [204]:
# Compare the feature-space nearest neighbours of tess_id with its neighbours
# in the spatial graph.
knn_cells = X_train.iloc[neigh_ids].index
graph_cells = graph[tess_id].index.values

intersection = np.intersect1d(knn_cells, graph_cells)
union = np.union1d(knn_cells, graph_cells)

# First ratio: share of graph neighbours recovered by the kNN query.
# Second ratio: intersection over union of the two sets.
print('graph intersection: ', intersection.shape[0] / graph_cells.shape[0],
      'total intersection: ', intersection.shape[0]/union.shape[0])

graph intersection:  0.2857142857142857 total intersection:  0.1


In [213]:
from scipy.spatial.distance import pdist, cdist

array([ -5490,  -5490,  -5490, ..., 299063, 299063, 299063])

In [211]:
pdist(X_train.iloc[neigh_ids].values)

array([4.87532534, 5.08568228, 5.11036757, 5.3147674 , 5.3152624 ,
       5.31833994, 5.33959303, 5.34781153, 5.39578952, 5.41562768,
       5.46998235, 5.4751668 , 5.49217131, 5.50442596, 5.52366858,
       5.57984761, 5.44336868, 4.68245219, 5.81689727, 5.0899599 ,
       4.16571699, 4.86319586, 6.39377622, 4.93296405, 5.88576494,
       5.6822102 , 5.95724033, 5.23491609, 4.23075501, 4.81238908,
       6.80725969, 6.2814186 , 4.59715491, 4.52383647, 1.98955439,
       5.07671347, 6.32379318, 5.44723154, 5.518125  , 4.73477739,
       3.94028481, 5.10790316, 5.81491372, 5.09931867, 5.38463478,
       5.85268679, 4.56028338, 5.33840232, 5.06143164, 3.29403048,
       3.70073265, 6.51971316, 6.62836994, 5.00794782, 5.37130085,
       5.17112032, 6.0416938 , 6.48024856, 4.76729681, 6.2082616 ,
       5.47793756, 5.79603626, 4.42257168, 5.81657086, 5.56597565,
       5.38856602, 5.73479756, 4.45769596, 4.87576705, 4.02241014,
       6.44749171, 6.31187573, 7.46502314, 5.92518848, 3.97234

In [380]:
# m = neighbours.explore()
# m = tessellation.loc[[tess_id]].explore(color='red', m=m)
# m

In [206]:
X_train.iloc[neigh_ids]

Unnamed: 0,sdbAre,sdbPer,sdbCoA,ssbCCo,ssbCor,ssbSqu,ssbERI,ssbElo,ssbCCM,ssbCCD,...,stcSAl,ldkAre,ldkPer,lskCCo,lskERI,lskCWA,ltkOri,ltkWNB,likWBB,sdsAre
261793,0.399284,1.170116,-0.02239,-1.81434,-0.637299,-0.305332,0.635863,-1.971834,2.203463,-0.550868,...,-0.844166,-0.500347,-0.555598,1.46855,1.051844,-0.695529,0.161561,0.047395,2.629321,-0.396684
116039,0.305006,0.725154,-0.02239,0.087126,-0.637299,-0.343731,0.609423,-0.624599,1.266895,-0.575372,...,-0.670972,-0.510141,-0.574152,1.807997,1.149733,-0.731008,0.142096,0.047964,1.566661,-0.21988
235733,0.10757,0.61778,-0.02239,-1.988388,-0.637299,-0.220512,0.623702,-2.050118,1.379239,-0.540936,...,-0.802506,-0.57458,-0.656832,1.395335,1.157807,-0.762063,0.044643,0.150431,3.237948,-0.323684
242470,0.05745,0.426782,-0.02239,-1.530537,-0.637299,-0.369077,0.601152,-1.809503,1.028426,-0.579041,...,-0.195215,-0.529519,-0.57512,0.260475,1.02539,-0.65786,-0.982306,0.048788,2.888012,-0.342812
236829,0.630815,1.350235,-0.02239,-1.014296,-0.637299,-0.042947,0.61146,-1.451214,2.309372,-0.391424,...,-0.959591,-0.572121,-0.645981,0.830897,0.964403,-0.735043,0.268851,0.099317,3.93744,-0.339094
282617,0.386274,0.99279,-0.02239,-0.984762,-0.637299,-0.383045,0.627505,-1.502184,1.796915,-0.586188,...,-0.936444,-0.541114,-0.602275,1.191243,1.052597,-0.721296,-0.553007,0.040427,3.019633,-0.411958
261789,-0.181041,-0.395971,-0.02239,-0.734107,-0.637299,-0.390733,0.625213,-1.337652,-0.25951,-0.59212,...,-0.843178,-0.500347,-0.555598,1.46855,1.051844,-0.695529,0.161561,0.047395,2.629321,-0.396684
230350,-0.129525,-0.177631,-0.02239,-0.97932,-0.637299,-0.276331,0.646393,-1.469099,0.077922,-0.57148,...,-0.780903,-0.594039,-0.700705,1.045393,1.062087,-0.76983,-0.665001,0.157594,1.818376,-0.407993
272435,0.630769,1.193292,-0.02239,0.02045,-0.637299,-0.112657,0.57505,-0.522858,1.920432,-0.471399,...,-0.468349,-0.565854,-0.639842,1.569107,1.067973,-0.749727,-0.38851,0.112164,3.350177,-0.269307
282590,0.233247,0.776884,-0.02239,-1.35258,-0.637299,-0.309804,0.582854,-1.68891,1.529188,-0.552438,...,-0.862768,-0.549967,-0.604904,0.983593,0.820175,-0.698835,-0.361934,0.079422,1.874727,0.0


In [44]:
from lonboard.colormap import apply_categorical_cmap
from palettable.colorbrewer.qualitative import Set3_12

def get_color(labels_to_color):
    """Map categorical labels to RGB colours for a lonboard layer.

    Parameters
    ----------
    labels_to_color : array-like
        One label per geometry (for example component or cluster ids).

    Returns
    -------
    The colour array produced by ``lonboard.colormap.apply_categorical_cmap``,
    with one colour per input label.

    Notes
    -----
    The 12-colour ``Set3`` palette is used as is when it has enough colours.
    Otherwise ``glasbey`` extends it so that every distinct label gets a colour.
    """
    import glasbey

    def hex_to_rgb(hexa):
        # "RRGGBB" -> (R, G, B) integers.
        return tuple(int(hexa[i : i + 2], 16) for i in (0, 2, 4))

    unique_labels = np.unique(labels_to_color)
    base_palette = Set3_12.hex_colors

    # Decide on the number of distinct labels, not on the largest label value.
    # The old `max() > 12` check left label sets such as -1..11 (13 labels,
    # max 11) one colour short, so the last label had no entry in the colormap.
    if unique_labels.shape[0] > len(base_palette):
        gb_cols = glasbey.extend_palette(
            base_palette, palette_size=unique_labels.shape[0] + 1
        )
    else:
        gb_cols = base_palette

    # Palette entries are "#RRGGBB" strings; drop the leading "#".
    gb_cols = [hex_to_rgb(c[1:]) for c in gb_cols]

    colors = apply_categorical_cmap(
        labels_to_color, cmap=dict(zip(unique_labels, gb_cols, strict=False))
    )
    return colors