In [1]:
%%time

import glob

import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler

from fast_hdbscan.numba_kdtree import kdtree_to_numba
from fast_hdbscan.numba_kdtree import parallel_tree_query
from sklearn.neighbors import KDTree

CPU times: user 11.1 s, sys: 383 ms, total: 11.5 s
Wall time: 9.08 s


In [2]:
# Region whose precomputed artefacts this notebook inspects.
region_id = 69300

# Every input directory sits under the same processed-data root.
processed_root = "/data/uscuni-ulce/processed_data/"
tessellations_dir = processed_root + "tessellations/"
chars_dir = processed_root + "chars/"
graph_dir = processed_root + "neigh_graphs/"

In [3]:
# region_id = 'freiburg'
# buildings_dir = streets_dir = enclosures_dir = tessellations_dir = graph_dir = '../data/freiburg/'
# chars_dir = '../data/freiburg/chars/'

In [4]:
# Read the stored tessellation neighbourhood graph for the selected region.
graph_path = graph_dir + f"tessellation_graph_{region_id}_knn1.parquet"
graph = read_parquet(graph_path)

In [5]:
# Summary statistics of the graph's `cardinalities` series
# (per libpysal's Graph API, the number of neighbours of each observation).
graph.cardinalities.describe()

count    304554.000000
mean          6.751085
std           2.060782
min           1.000000
25%           6.000000
50%           7.000000
75%           8.000000
max          82.000000
Name: cardinalities, dtype: float64

In [6]:
from core.cluster_validation import print_distance, generate_neigbhourhood_groups, generate_detailed_clusters

In [7]:
# Read the tessellation GeoDataFrame for the selected region.
tessellation_path = tessellations_dir + f"tessellation_{region_id}.parquet"
tessellation = gpd.read_parquet(tessellation_path)

In [9]:
# Lag value used to pick the context-data file below.
spatial_lag = 1

# Alternative input (processed context characters), kept for reference:
# lag = pd.read_parquet(f'/data/uscuni-ulce/processed_data/context_data/context_chars_{region_id}_lag_{spatial_lag}.parquet')
lag = pd.read_parquet(
    f'/data/uscuni-ulce/processed_data/context_data/unprocessed_context_chars_{region_id}_lag_{spatial_lag}.parquet'
)
median_lag_columns = [name for name in lag.columns if '_median' in name]

# Primary characters: keep rows with a non-negative index, then inner-join the
# '_median' context columns so only rows present in both tables remain.
# To join every context column instead: X_train.join(lag, how='inner')
X_train = pd.read_parquet(chars_dir + f'primary_chars_{region_id}.parquet')
X_train = X_train[X_train.index >= 0].join(lag[median_lag_columns], how='inner')


In [11]:
# for c in X_train.columns:
#     X_train[c] = X_train[c].clip(*np.percentile(X_train[c], [5, 95]))

In [12]:
# Character codes to exclude; every column whose name contains one of these
# codes is removed from the feature matrix.
to_drop = [
    'stcSAl',
    'ltkOri',
    'stbOri',
    'stcOri',
    'stbCeA',
    # not in barcelona
    'ltcBuA',
    'midRea',
    'midAre',
    'likWBB',
]

# Gather the names of all columns matching any of the codes.
all_drop = [
    column
    for code in to_drop
    for column in X_train.columns[X_train.columns.str.contains(code)]
]

X_train = X_train.drop(columns=all_drop)

In [14]:

# Standardise every column, then replace the remaining NaNs with 0
# (np.nan_to_num defaults) and rebuild the frame with the original labels.
vals = np.nan_to_num(StandardScaler().fit_transform(X_train))
X_train = pd.DataFrame(vals, columns=X_train.columns, index=X_train.index)

In [15]:
# t1 = X_train[[c for c in X_train.columns if '_' not in c]]
# t2 = X_train[[c for c in X_train.columns if '_median' in c]]

# X_train = t1.join(t2)
# X_train.shape

In [19]:
# Keep only the columns whose name contains '_median'.
median_columns = [name for name in X_train.columns if '_median' in name]
X_train = X_train[median_columns]


In [20]:
# X_train = X_train.drop(stats.columns[stats.loc['std'] == 0], axis=1)

In [21]:
# (rows, columns) of the feature matrix after the selection steps above.
X_train.shape

(299064, 54)

In [22]:
# Example cluster labels for the tessellation, limited to the cells that are
# present in the feature matrix.
tess_groups = generate_detailed_clusters(tessellation, include_random_sample=False)
tess_groups = tess_groups[tess_groups.index.isin(X_train.index)]

# Map each index label of X_train to its row position, then look up the
# positions of the labelled cells.
row_positions = pd.Series(np.arange(len(X_train)), index=X_train.index)
tess_groups_ilocs = row_positions.loc[tess_groups.index].values

from sklearn.metrics import davies_bouldin_score
def check_score(data, example_clusters):
    """Davies-Bouldin score of example cluster labels within ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix, one row per observation.
    example_clusters : pandas.Series
        Cluster label per observation, indexed by the same labels as ``data``.
        Entries whose index label is absent from ``data`` are ignored.

    Returns
    -------
    float
        Value returned by ``sklearn.metrics.davies_bouldin_score`` for the
        labelled rows; per scikit-learn's documentation, lower values
        indicate better separation between clusters.
    """
    # Keep only the labelled observations that exist in ``data``.
    groups = example_clusters[example_clusters.index.isin(data.index)]
    # Label-based selection yields the same rows, in the same order, as the
    # earlier positional round-trip (arange Series -> .loc -> .iloc), without
    # allocating a len(data) helper Series on every call.
    return davies_bouldin_score(data.loc[groups.index], groups.values)

# Davies-Bouldin score of the example clusters in the current feature space.
check_score(X_train, tess_groups)

3.0651731848710613

In [23]:
# tessellation.loc[tess_groups.index].explore()

In [24]:
from core.cluster_validation import print_distance
# Mean feature vector of each example cluster, passed to print_distance with
# the Euclidean metric.
group_centroids = X_train.loc[tess_groups.index].groupby(tess_groups.values).mean()
print_distance(group_centroids, metric='euclidean')

Unnamed: 0,commie blocks vn,fancy commie blocks,holyne,housing blocks,housing houses,josefov,karlin IT offices,karlin old,karlin river offices,karlin square,mala strana,malesice,prague castle,row houses1,row houses2,smickov,stare mesto,vinohrady blocks,vinohrady squares,vinohrady villas
commie blocks vn,0.0,4.441923,6.107191,4.942585,5.783314,8.189324,5.271118,9.26159,9.917141,6.052183,11.49842,6.643514,9.450746,7.633713,8.374431,7.530078,11.421425,5.059909,8.421288,5.843975
fancy commie blocks,4.441923,0.0,7.11297,4.155676,6.929423,9.45042,4.959502,9.235848,8.405695,7.245949,11.619347,5.513294,8.486804,8.565262,9.162312,8.17233,11.809285,6.02698,8.892897,6.668193
holyne,6.107191,7.11297,0.0,6.158452,4.393501,7.941864,6.710297,9.539992,11.126789,6.616856,10.92487,6.536003,8.753095,8.28277,8.598854,7.483976,10.795722,4.913697,8.606638,4.971454
housing blocks,4.942585,4.155676,6.158452,0.0,5.61426,7.654685,4.156918,8.364216,7.911509,6.315435,10.584063,5.326268,7.328921,8.590495,9.208565,6.718895,10.216122,4.447822,8.148796,5.570932
housing houses,5.783314,6.929423,4.393501,5.61426,0.0,6.506034,5.384909,8.474281,11.321667,4.780753,11.610249,7.14166,9.612544,7.220445,7.69678,5.698197,10.658717,4.66976,6.051525,2.335755
josefov,8.189324,9.45042,7.941864,7.654685,6.506034,0.0,8.074734,6.898853,12.597732,4.395707,8.65697,9.887239,8.516125,9.678453,10.326224,3.651283,6.664465,7.346577,5.725439,6.572689
karlin IT offices,5.271118,4.959502,6.710297,4.156918,5.384909,8.074734,0.0,8.116608,8.298454,5.986525,11.416264,4.582869,8.968921,7.687912,8.357099,6.360884,10.997797,3.921093,7.35142,4.975192
karlin old,9.26159,9.235848,9.539992,8.364216,8.474281,6.898853,8.116608,0.0,12.653454,5.279301,6.907057,9.192733,6.871229,8.680924,9.327161,4.477056,6.041232,8.243764,5.806301,8.026541
karlin river offices,9.917141,8.405695,11.126789,7.911509,11.321667,12.597732,8.298454,12.653454,0.0,11.606416,13.917129,7.825961,11.423155,12.864405,13.328001,11.629609,14.143174,8.350846,12.834765,10.924442
karlin square,6.052183,7.245949,6.616856,6.315435,4.780753,4.395707,5.986525,5.279301,11.606416,0.0,9.357597,7.800659,8.123637,7.024429,7.780639,2.480073,8.148621,5.717671,3.670372,4.69233


In [26]:
from scipy.spatial.distance import pdist, cdist
# Print, for each example cluster, the mean pairwise distance between its members.
labelled_rows = X_train.loc[tess_groups.index]
for group_name, members in labelled_rows.groupby(tess_groups.values):
    print(group_name, pdist(members).mean())

commie blocks vn 6.3207648031311185
fancy commie blocks 6.935119126701156
holyne 7.308764573381597
housing blocks 7.883704219913125
housing houses 5.135322727983048
josefov 6.356178230906789
karlin IT offices 7.155537239802385
karlin old 8.050946434491378
karlin river offices 12.169244436929116
karlin square 6.938723552989892
mala strana 14.218476603531737
malesice 11.329911185540475
prague castle 16.388396204045836
row houses1 6.2293092467141635
row houses2 5.239432591678582
smickov 7.552513700331859
stare mesto 10.996734667724422
vinohrady blocks 5.918274571028474
vinohrady squares 4.868591841477332
vinohrady villas 5.068479625510638


In [27]:
# tessellation.loc[tess_groups.index].explore(column=tess_groups.values, categorical=True)

In [28]:
# Smallest distance between any 'josefov' row and any 'stare mesto' row.
josefov_rows = X_train.loc[tess_groups[tess_groups == 'josefov'].index]
stare_mesto_rows = X_train.loc[tess_groups[tess_groups == 'stare mesto'].index]
cdist(josefov_rows, stare_mesto_rows).min()

1.4722669131095896

In [29]:
# Per-cluster mean of every feature over the labelled rows.
labelled_features = X_train.loc[tess_groups.index]
mean_clusters = labelled_features.groupby(tess_groups.values).mean()

In [30]:
# Ten features on which the 'josefov' and 'stare mesto' centroids differ most.
centroid_gap = (mean_clusters.loc['josefov'] - mean_clusters.loc['stare mesto']).abs()
centroid_gap.sort_values(ascending=False).head(10)

libNCo_median    4.900441
ssbERI_median    1.881506
ldbPWL_median    1.836020
lcnClo_median    1.223214
sscERI_median    1.189771
ssbCCo_median    1.164828
ssbCCD_median    1.081226
linPDE_median    1.058301
linP3W_median    0.966330
lcdMes_median    0.888653
dtype: float64

In [31]:
from core.utils import used_keys
# Look up the description stored for the character code 'lcnClo'.
used_keys['lcnClo']

'local closeness of street network'