In [1]:
%%time

import glob

import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler

from fast_hdbscan.numba_kdtree import kdtree_to_numba
from fast_hdbscan.numba_kdtree import parallel_tree_query
from sklearn.neighbors import KDTree
from core.cluster_validation import print_distance, generate_neigbhourhood_groups
from core.utils import char_names

CPU times: user 11.4 s, sys: 493 ms, total: 11.9 s
Wall time: 9.53 s


## Download old data


In [2]:
# # r = pd.read_csv('https://figshare.com/ndownloader/files/31252825', compression='gzip')
# r = pd.read_csv('https://figshare.com/ndownloader/files/31250227', compression='gzip')
# r = r.drop([ 'mm_len', 'cdsbool', 'node_start', 'node_end', 'nID', 'nodeID', 'Unnamed: 0', 'bID',], axis=1).set_index('uID').sort_index()
# X_train.join(r).reset_index().to_parquet('../data/old_prague_data/old_prague_data.parquet')


# import requests 
# def download_url(url, save_path, chunk_size=128):
#     r = requests.get(url, stream=True)
#     with open(save_path, 'wb') as fd:
#         for chunk in r.iter_content(chunk_size=chunk_size):
#             fd.write(chunk)


# download_url('https://figshare.com/ndownloader/files/31252828', '../data/old_prague_geometries.zip') 
# import zipfile
# z = zipfile.ZipFile('../data/old_prague_geometries.zip')
# z.extractall("../data/old_prague_geometries/")



In [3]:
region_id = 'prague'
buildings_dir = streets_dir = enclosures_dir = tessellations_dir = graph_dir = '../data/old_prague_data/reprocessed_data/'
chars_dir = '../data/old_prague_data/reprocessed_data/chars/'
geometry_file ='../data/old_prague_data/prg_geometry.gpkg'

## Process old data through the current pipeline

need to reorder the data and reprocess the blocks to get rid of the dependancy on id columns

In [4]:
buildings = gpd.read_file(geometry_file, layer='buildings')
tessellations = gpd.read_file(geometry_file, layer='tessellation')
streets = gpd.read_file(geometry_file, layer='edges')

In [5]:
build_tess = pd.merge(buildings, tessellations, left_on='uID', right_on='uID').reset_index(drop=True)

In [6]:
# m = build_tess.iloc[:100].set_geometry('geometry_y').explore()
# m = build_tess.iloc[:100].set_geometry('geometry_x').explore(color='red', m=m)
# m

In [7]:
%%time
import momepy as mm
enclosures, ids = mm.generate_blocks(tessellation=build_tess['geometry_y'].to_frame(),
                            edges=streets,
                            buildings=build_tess['geometry_x'].to_frame())

CPU times: user 1min 29s, sys: 1.81 s, total: 1min 31s
Wall time: 1min 30s


In [8]:
build_tess['enclosure_index'] = ids

In [9]:
crs = buildings.crs

In [16]:
buildings = build_tess[['geometry_x']]
buildings.columns = ['geometry']
buildings = buildings.set_geometry('geometry').set_crs(crs).to_crs(epsg=3035)
buildings.to_parquet(buildings_dir + f"buildings_{region_id}.parquet")

In [18]:
tessellations = build_tess[['geometry_y', 'enclosure_index']]
tessellations.columns = ['geometry', 'enclosure_index']
tessellations = tessellations.set_geometry('geometry').set_crs(crs).to_crs(epsg=3035)
tessellations.to_parquet(tessellations_dir + f"tessellation_{region_id}.parquet")

In [22]:
streets.to_crs(epsg=3035)[['geometry']].to_parquet(streets_dir + f"streets_{region_id}.parquet")

In [31]:
enclosures.rename_geometry('geometry').to_crs(epsg=3035).to_parquet(enclosures_dir + f"enclosure_{region_id}.parquet")

### Push data through pipeline


In [32]:
from core.generate_ngraphs import process_region_graphs

In [33]:
%%time
process_region_graphs(
    region_id,
    graph_dir,
    buildings_dir,
    streets_dir,
    enclosures_dir,
    tessellations_dir,
)

Built tess graph knn=1
Built buildings graph knn=1
Built streets graph knn=1
Built enclosure graph knn=1


 There are 20 disconnected components.


Built nodes graph knn=1
CPU times: user 44.2 s, sys: 1.3 s, total: 45.5 s
Wall time: 45.4 s


## Characters

In [4]:
from core.generate_chars import process_single_region_chars

In [5]:
%%time
process_single_region_chars(
    region_id,
    graph_dir,
    buildings_dir,
    streets_dir,
    enclosures_dir,
    tessellations_dir,
    chars_dir
)

2024-08-05 13:55:13.466290 ----Processing ------ prague
Processing streets
Processing enclosures
Processing buildings


  angles = np.arccos(cosine_angle)
  angles = np.arccos(cosine_angle)
  return np.nanmean(np.abs(90 - degrees[true_angles]))
  angles = np.arccos(cosine_angle)
  return Series({"mean": np.nanmean(dists), "std": np.nanstd(dists)})
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


Processing tessellation
CPU times: user 11min 19s, sys: 3.77 s, total: 11min 23s
Wall time: 11min 22s


In [6]:
tessellation = gpd.read_parquet(chars_dir + f"tessellations_chars_{region_id}.parquet")
buildings = gpd.read_parquet(chars_dir + f"buildings_chars_{region_id}.parquet")
enclosures = gpd.read_parquet(chars_dir + f"enclosures_chars_{region_id}.parquet")
streets = gpd.read_parquet(chars_dir + f"streets_chars_{region_id}.parquet")
nodes = gpd.read_parquet(chars_dir + f"nodes_chars_{region_id}.parquet")


merged = pd.merge(
    tessellation.drop(columns=["geometry"]),
    buildings.drop(columns=["nodeID", "geometry", 'nID']),
    right_index=True,
    left_index=True,
    how="left",
)

merged = merged.merge(
    enclosures.drop(columns="geometry"),
    right_index=True,
    left_on="enclosure_index",
    how="left",
)

merged = merged.merge(streets.drop(columns="geometry"), on="nID", how="left")
merged = merged.merge(nodes.drop(columns="geometry"), on="nodeID", how="left")

merged = merged.drop(
    columns=[
        "nID",
        # "eID",
        "nodeID",
        "mm_len",
        "cdsbool",
        "node_start",
        "node_end",
        "x",
        "y",
        "enclosure_index",
        # "id",
        # "osm_id",
        # "index",  ## maybe keep
    ]
)
merged = merged.set_index(tessellation.index)

In [7]:
from core.utils import used_keys

In [8]:
primary = merged[list(used_keys.keys())]
primary.shape

(140315, 62)

In [9]:
primary.to_parquet(chars_dir + f'primary_chars_{region_id}.parquet')

## Comparison

In [4]:
import lonboard
from core.utils import char_names
from mapclassify import classify
import matplotlib as mpl
from lonboard.colormap import apply_continuous_cmap
from sidecar import Sidecar
import momepy as mm
from core.utils import used_keys

In [6]:
old_prague_data = pd.read_parquet('../data/old_prague_data/old_prague_data.parquet')

In [7]:
new_prague_data = pd.read_parquet(chars_dir + f'primary_chars_{region_id}.parquet')

In [8]:
vals = {}
for c in new_prague_data.columns:
    if c not in old_prague_data.columns:
        print(c, 'not in old data')
        continue
        
    new_values = new_prague_data[c].values
    old_values = old_prague_data[c].values

    # skip nas
    nas = np.isnan(new_values) | np.isnan(old_values)
    new_values = new_values[~nas]
    old_values = old_values[~nas]

    vals[c] = np.corrcoef(new_values, old_values)[0,1]

ldsAre not in old data


In [11]:
vals = pd.Series(vals)
non_correlated = vals[ vals < .80]
non_correlated.sort_values()

ldsRea    0.432357
lddNDe    0.448191
likWBB    0.474663
sisBpM    0.482521
linWID    0.526898
sdsSPW    0.542902
stcSAl    0.563658
ltkOri    0.623173
stbSAl    0.678290
stbOri    0.682444
stcOri    0.692131
stbCeA    0.693603
sdsSWD    0.716397
mtbAli    0.782767
xcnSCl    0.788661
ltcBuA    0.796996
dtype: float64

In [13]:
[used_keys[c] for c in non_correlated.sort_values()[1: ].index]

['local node density of street network',
 'total of building areas within the enclosure, normalised by enclosure area',
 'buildings per meter of street segment',
 'local degree weighted node density of street network',
 'width of street profile',
 'street alignment of ETC',
 'orientation of enclosure',
 'street alignment of building',
 'orientation of building',
 'orientation of ETC',
 'cell alignment of building',
 'width deviation of street profile',
 'alignment of neighbouring buildings',
 'square clustering of street network',
 'level of building adjacency']

In [17]:
tessellation = gpd.read_parquet(chars_dir + f"tessellations_chars_{region_id}.parquet")
buildings = gpd.read_parquet(chars_dir + f"buildings_chars_{region_id}.parquet")
enclosures = gpd.read_parquet(chars_dir + f"enclosures_chars_{region_id}.parquet")
streets = gpd.read_parquet(chars_dir + f"streets_chars_{region_id}.parquet")

In [163]:
c = 'sisBpM' # this is due to street atachment and outliers
new_values = new_prague_data[c].values
old_values = old_prague_data[c].values

# skip nas
nas = np.isnan(new_values) | np.isnan(old_values) | (new_values > 1)
new_values = new_values[~nas]
old_values = old_values[~nas]
print(np.corrcoef(new_values, old_values)[0,1], new_values.shape)

0.47426017561413764 (140299,)


In [39]:
new_values, old_values

(array([0.02647987, 0.0759601 , 0.06455886, ..., 0.12482955, 0.09237362,
        0.0490265 ]),
 array([0.02647958, 0.08507903, 0.07319508, ..., 0.11549315, 0.11549315,
        0.0776904 ]))

In [131]:
blg_nid = mm.get_nearest_street(buildings, streets)

In [132]:
norm_values = (blg_nid.value_counts() / streets.length).fillna(0)

In [133]:
new_values = pd.Series(0.0, index=blg_nid.index)
new_values.loc[blg_nid[~blg_nid.isna()].index] = norm_values.loc[blg_nid[~blg_nid.isna()].values].values

In [134]:
blg_nid.loc[new_values[new_values > 100].index]

115873    13525.0
dtype: float64

In [135]:
blg_nid[blg_nid == 13525]

115873    13525.0
dtype: float64

In [136]:
c = 'sisBpM' # this is not street profile
# skip nas

old_values = old_prague_data[c].values

nas = np.isnan(new_values) | np.isnan(old_values) | (new_values > 1)
new_values = new_values[~nas]
old_values = old_values[~nas]

print(np.corrcoef(new_values.values, old_values)[0,1], new_values.shape)

0.566507730319834 (140289,)


In [142]:
c = 'sdsAre' ## wrong function call
new_values = new_prague_data[c].values
old_values = old_prague_data[c].values

# skip nas
nas = np.isnan(new_values) | np.isnan(old_values) | (new_values > 1000)
new_values = new_values[~nas]
old_values = old_values[~nas]
print(np.corrcoef(new_values, old_values)[0,1])

0.0003960158131754729


In [144]:
new_values, old_values

(array([ 69.52126932, 140.07100718, 433.31757827, ..., 105.42647526,
        193.53507041, 610.28173117]),
 array([140512.90018443,   2149.02240926,  23804.69458061, ...,
          4768.48813425,   4768.48813425, 165178.71929707]))

In [156]:
tess_nid = mm.get_nearest_street(
    tessellation, streets
)


r = mm.describe_agg(
    tessellation.geometry.area, tess_nid, statistics=["count", "sum"]
)["sum"]

In [157]:
new_values = pd.Series(0.0, index=tess_nid.index)
new_values.loc[tess_nid[~tess_nid.isna()].index] = r.loc[tess_nid[~tess_nid.isna()].values].values

In [161]:
old_values = old_prague_data[c].values
print(np.corrcoef(new_values.values, old_values)[0,1])

0.9325711978637471


In [168]:
c = 'ltcWRB' # missing weighting
new_values = new_prague_data[c].values
old_values = old_prague_data[c].values

nas = np.isnan(new_values) | np.isnan(old_values)
new_values = new_values[~nas]
old_values = old_values[~nas]

print(np.corrcoef(new_values, old_values)[0,1], new_values.shape)

0.3709241389248268 (140315,)


In [170]:
new_values, old_values

(array([1., 7., 4., ..., 4., 5., 8.]),
 array([4.66207372e-06, 4.98967630e-05, 7.90916122e-05, ...,
        2.14705039e-04, 2.53273374e-04, 1.07831087e-05]))

In [172]:
q1 = read_parquet(graph_dir + f'tessellation_graph_{region_id}_knn1.parquet')

In [180]:
from core.utils import partial_apply
def partial_block_count(partial_focal, partial_higher, y):
    return partial_higher.describe(
        y.loc[partial_higher.unique_ids], statistics=["nunique"]
    )["nunique"]

block_counts = partial_apply(
    q1,
    higher_order_k=3,
    n_splits=30,
    func=partial_block_count,
    y=tessellation.geometry.area,
)

In [181]:
def partial_tess_area(partial_focal, partial_higher, y):
    return partial_higher.describe(
        y.loc[partial_higher.unique_ids], statistics=["sum"]
    )["sum"]

block_sums = partial_apply(
    q1,
    higher_order_k=3,
    n_splits=30,
    func=partial_tess_area,
    y=tessellation.geometry.area,
)

In [183]:
c = 'ltcWRB'

new_values = (block_counts / block_sums).values

old_values = old_prague_data[c].values

nas = np.isnan(new_values) | np.isnan(old_values)
new_values = new_values[~nas]
old_values = old_values[~nas]

print(np.corrcoef(new_values, old_values)[0,1], new_values.shape)

0.993995413058081 (140315,)


In [184]:
c = 'ldsRea' # this is a mistake in the original code ldsRea, overwrites ldsAre
new_values = new_prague_data[c].values
old_values = old_prague_data[c].values

nas = np.isnan(new_values) | np.isnan(old_values)
new_values = new_values[~nas]
old_values = old_values[~nas]

print(np.corrcoef(new_values, old_values)[0,1], new_values.shape)

0.43235673978534844 (140315,)


In [186]:
new_values = new_prague_data['ldsAre'].values
old_values = old_prague_data['ldsRea'].values

nas = np.isnan(new_values) | np.isnan(old_values)
new_values = new_values[~nas]
old_values = old_values[~nas]

print(np.corrcoef(new_values, old_values)[0,1], new_values.shape)

0.9488393632590233 (140315,)


In [187]:
new_values, old_values

(array([1016050.21608919,  325556.13043329,  450921.84507893, ...,
         145407.71169884,  147016.42751407,  850895.6161568 ]),
 array([1006508.27732186,  350439.6350221 ,  415939.44711489, ...,
         141141.46195491,  141141.46195491,  454043.49986155]))

In [36]:
# c = 'lddNDe' ## lddWNI should have the same issue, probably due to attaching nodes to buildings, streets and cells...


new_values = new_prague_data['lddNDe'].values
old_values = old_prague_data['linWID'] .values

nas = np.isnan(new_values) | np.isnan(old_values)
new_values = new_values[~nas]
old_values = old_values[~nas]

print(np.corrcoef(new_values, old_values)[0,1], new_values.shape)

0.49498992446907103 (140315,)


In [19]:
new_values, old_values

(array([0.00202637, 0.00691966, 0.00377668, ..., 0.00753816, 0.00753816,
        0.00300644]),
 array([0.00332259, 0.01247859, 0.01512655, ..., 0.02363852, 0.02363852,
        0.00939324]))

In [20]:
graph = mm.gdf_to_nx(streets, preserve_index=True)
graph = mm.node_degree(graph)

In [21]:
res = mm.node_density(graph, radius=5, verbose=False)

In [22]:
import networkx as nx

In [23]:
nodes, edges, sw = mm.nx_to_gdf(res, spatial_weights=True)

 There are 20 disconnected components.


In [24]:
nodes.loc[nodes.nodeID == 9310, 'node_density']

9310    0.002026
Name: node_density, dtype: float64

In [25]:
nodes_w5 = mm.sw_high(k=5, weights=sw)

  nodes_w5 = mm.sw_high(k=5, weights=sw)


In [26]:
old_calc = mm.NodeDensity(nodes, edges, nodes_w5, verbose=False).series

  old_calc = mm.NodeDensity(nodes, edges, nodes_w5, verbose=False).series


In [27]:
assert np.allclose(old_calc, nodes['node_density'])

In [28]:
nvs = np.sort(np.unique(new_values))
ovs = np.sort(np.unique(old_values))

In [29]:
old_nodeid = mm.get_node_id(tessellation, nodes, edges, "nodeID", "nID", verbose=False)

  old_nodeid = mm.get_node_id(tessellation, nodes, edges, "nodeID", "nID", verbose=False)


In [30]:
%%time
tess_nid = mm.get_nearest_street(
    tessellation, streets
)
new_node_id = mm.get_nearest_node(
        tessellation, nodes, edges,  tess_nid
    )

CPU times: user 14.6 s, sys: 0 ns, total: 14.6 s
Wall time: 14.6 s


In [31]:
(new_node_id == old_nodeid).sum() / new_node_id.shape[0]

np.float64(0.9787406905890318)

In [None]:
old_calc = mm.NodeDensity(nodes, edges, nodes_w5, verbose=False).series

In [None]:
# difference between buildings and tessellation attachment below

In [45]:
### the rest is building alignment stuff


In [46]:
'''stcSAl
stbSAl
ltkOri
stbOri
stcOri
stbCeA
sdsSWD
mtbAli'''.split('\n')

['stcSAl',
 'stbSAl',
 'ltkOri',
 'stbOri',
 'stcOri',
 'stbCeA',
 'sdsSWD',
 'mtbAli']

In [None]:
## street profile stuff might be to do with different segment length
## why is orientation so different ?
## ltcWRB - ? I got different enclosures
## other street and node stuff ?

## maybe some indexing thing again ?

## have to check also the merging of the data is correct - the final merged data to the ETC should make sense

In [51]:
from core.utils import used_keys
column = 'sdsAre'
used_keys[column]

'area covered by edge-attached ETCs'

In [10]:
tessellation = gpd.read_parquet(chars_dir + f"tessellations_chars_{region_id}.parquet")
buildings = gpd.read_parquet(chars_dir + f"buildings_chars_{region_id}.parquet")
enclosures = gpd.read_parquet(chars_dir + f"enclosures_chars_{region_id}.parquet")
streets = gpd.read_parquet(chars_dir + f"streets_chars_{region_id}.parquet")
nodes = gpd.read_parquet(chars_dir + f"nodes_chars_{region_id}.parquet")

In [11]:
%%time
tessellation['geometry'] = tessellation.simplify(1)

CPU times: user 15.6 s, sys: 13.8 ms, total: 15.7 s
Wall time: 15.7 s


In [42]:
plotting = pd.merge(tessellation[['geometry']], 
                    new_prague_data[non_correlated.index],
                    right_index=True, left_index=True)

In [43]:
%%time
layer = lonboard.PolygonLayer.from_geopandas(plotting, opacity=.1)



CPU times: user 675 ms, sys: 51.8 ms, total: 727 ms
Wall time: 725 ms


In [44]:
sc = Sidecar(title='Clusters')
m = lonboard.Map(layer, basemap_style=lonboard.basemap.CartoBasemap.Positron)
with sc:
    display(m)

In [52]:
classifier = classify(plotting[column], 'equalinterval', k=20)
normalizer = mpl.colors.Normalize(0, classifier.bins.shape[0])
vals = normalizer(classifier.yb)
layer.get_fill_color = apply_continuous_cmap(vals, mpl.colormaps['viridis'])

In [53]:
column


'sdsAre'

In [74]:
percentage_diffs = pd.Series((old_prague_data[column] - new_prague_data[column]).abs() / old_prague_data[column])
percentage_diffs[percentage_diffs > .5].index.values.shape

(136528,)

In [75]:
old_prague_data[column] - new_prague_data[column]

0         140443.378915
1           2008.951402
2          23371.377002
3          25973.586637
4          39397.415858
              ...      
140310     21927.197728
140311     88212.780343
140312      4663.061659
140313      4574.953064
140314    164568.437566
Name: sdsAre, Length: 140315, dtype: float64

In [38]:
new_prague_data.loc[[43649,  93420,  97915, 106396, 118841, 123672], column]

43649     9.256229e+04
93420     5.782298e+01
97915     5.069084e+00
106396    5.649317e+07
118841    3.820619e-01
123672    6.079401e+00
Name: sisBpM, dtype: float64

In [66]:
# tessellation.loc[[43649,  93420,  97915, 106396, 118841, 123672]].explore()

In [3]:
X_train = pd.read_parquet('../data/old_prague_data/old_prague_data.parquet')
X_train = X_train.set_index('uID').sort_index()

In [12]:
tessellation = gpd.read_file('../data/old_prague_data/prg_geometry.gpkg', 
                             layer='buildings').set_index('uID').sort_index().to_crs(epsg=3035)
from libpysal.graph import Graph
graph = Graph.build_fuzzy_contiguity(tessellation, buffer=.001)

In [23]:
plotting = tessellation.join(X_train, how='inner')



building_chars = ['ssbCCo', 'ssbCor', 'ssbSqu',
       'ssbCCM', 'ssbCCD', 'sdbAre', 'sdbPer', 'sdbCoA', 'ssbERI', 'ssbElo',
       'stbOri', 'mtbSWR', 'libNCo', 'ldbPWL', 'ltcBuA', 'mtbAli', 'mtbNDi',
       'ltbIBD', 'stbCeA', 'stbSAl']

plotting  = plotting[building_chars + ['geometry']]
# plotting = plotting[[c for c in plotting.columns if '_' not in c]]

In [22]:
region = 69300
chars_dir = "/data/uscuni-ulce/processed_data/chars/"
buildings = gpd.read_parquet(f"{chars_dir}buildings_chars_{region}.parquet")

In [33]:
gdf = buildings.sjoin(plotting.to_crs(epsg=3035), how='inner')

In [130]:
vals = {}
for c in building_chars:
    if c == '': break
    new_values = gdf[c + '_left'].values
    old_values = gdf[c + '_right'].values
    vals[c] = np.corrcoef(new_values, old_values)[0,1]
    if c == 'ltcBuA': break

In [61]:
vals

{'ssbCCo': np.float64(0.5874129317304231),
 'ssbCor': np.float64(0.3358528301770491),
 'ssbSqu': np.float64(0.43030460814673804),
 'ssbCCM': np.float64(nan),
 'ssbCCD': np.float64(nan),
 'sdbAre': np.float64(0.4874463523459785),
 'sdbPer': np.float64(0.5247616904969362),
 'sdbCoA': np.float64(0.32875297036623863),
 'ssbERI': np.float64(0.4040099734890492),
 'ssbElo': np.float64(0.5962058263714757),
 'stbOri': np.float64(0.6574709179879735),
 'mtbSWR': np.float64(0.7057998910581672),
 'libNCo': np.float64(0.9403093425618054),
 'ldbPWL': np.float64(0.9134632024527115),
 'ltcBuA': np.float64(0.6936343323901484),
 'mtbAli': np.float64(0.5995580588598617),
 'mtbNDi': np.float64(0.6284560523138889),
 'ltbIBD': np.float64(0.7910494631754461),
 'stbCeA': np.float64(0.3353265648792697),
 'stbSAl': np.float64(0.4008147508457259)}

array([1.        , 1.        , 1.        , ..., 0.75      , 0.77777778,
       0.77777778])

array([1.        , 1.        , 1.        , ..., 0.67346939, 0.6875    ,
       0.5625    ])

In [68]:
plotting = gdf.copy()

In [69]:
%%time
import lonboard
layer = lonboard.PolygonLayer.from_geopandas(plotting, opacity=.15)



CPU times: user 1.28 s, sys: 100 ms, total: 1.39 s
Wall time: 1.38 s


In [70]:
from sidecar import Sidecar
sc = Sidecar(title='Clusters')
m = lonboard.Map(layer, basemap_style=lonboard.basemap.CartoBasemap.Positron)
with sc:
    display(m)

In [71]:
from core.utils import char_names
from mapclassify import classify
import matplotlib as mpl
from lonboard.colormap import apply_continuous_cmap

plotting.columns

Index(['index', 'id', 'geometry', 'ssbCCo_left', 'ssbCor_left', 'ssbSqu_left',
       'ssbCCM_left', 'ssbCCD_left', 'sdbAre_left', 'sdbPer_left',
       'sdbCoA_left', 'ssbERI_left', 'ssbElo_left', 'stbOri_left',
       'mtbSWR_left', 'libNCo_left', 'ldbPWL_left', 'ltcBuA_left',
       'mtbAli_left', 'mtbNDi_left', 'ltbIBD_left', 'stbCeA_left', 'nID',
       'stbSAl_left', 'nodeID', 'uID', 'ssbCCo_right', 'ssbCor_right',
       'ssbSqu_right', 'ssbCCM_right', 'ssbCCD_right', 'sdbAre_right',
       'sdbPer_right', 'sdbCoA_right', 'ssbERI_right', 'ssbElo_right',
       'stbOri_right', 'mtbSWR_right', 'libNCo_right', 'ldbPWL_right',
       'ltcBuA_right', 'mtbAli_right', 'mtbNDi_right', 'ltbIBD_right',
       'stbCeA_right', 'stbSAl_right'],
      dtype='object')

In [161]:

column = 'stbSAl_right'

classifier = classify(plotting[column], 'quantiles', k=20)
normalizer = mpl.colors.Normalize(0, classifier.bins.shape[0])
vals = normalizer(classifier.yb)
layer.get_fill_color = apply_continuous_cmap(vals, mpl.colormaps['viridis'])

In [162]:
char_names[column.split('_')[0]]

'street alignment of building'

In [150]:
for c in building_chars:
    print(char_names[c])

circular compactness of building
corners of building
squareness of building
centroid - corner mean distance of building
centroid - corner distance deviation of building
area of building
perimeter of building
courtyard area of building
equivalent rectangular index of building
elongation of building
orientation of building
shared walls ratio of buildings
number of courtyards within adjacent buildings
perimeter wall length of adjacent buildings
level of building adjacency
alignment of neighbouring buildings
mean distance between neighbouring buildings
mean inter-building distance
cell alignment of building
street alignment of building


In [151]:
plotting.loc[plotting['uID'] == 139832, ['mtbAli_left', 'mtbAli_right']]

Unnamed: 0,mtbAli_left,mtbAli_right
250903,6.865233,15.705758


In [152]:
plotting.loc[plotting['uID'] == 1637, ['mtbAli_left', 'mtbAli_right']]

Unnamed: 0,mtbAli_left,mtbAli_right
259403,2.46992,8.480825
259408,3.732664,8.480825


In [163]:
vals = StandardScaler().fit_transform(X_train)
X_train = pd.DataFrame(vals, columns=X_train.columns, index=X_train.index)

vals = np.nan_to_num(X_train)
X_train = pd.DataFrame(vals, columns=X_train.columns, index=X_train.index)


# X_train = X_train.clip(-10, 10)

In [167]:
X_train = X_train[[c for c in X_train.columns if '_' not in c]]
X_train.head()

Unnamed: 0_level_0,stcOri,sdcLAL,sdcAre,sscCCo,sscERI,stcSAl,sicCAR,sicFAR,mtcWNe,mdcAre,...,ldsMSL,ldsRea,ldkAre,ldkPer,lskCCo,lskERI,lskCWA,ltkOri,ltkWNB,likWBB
uID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.825688,-0.786247,-0.44253,-0.451685,-1.274222,-0.141566,-0.72149,-0.06472,-0.439435,1.74374,...,4.531367,2.137539,1.1529,1.183337,-0.220904,-1.022269,1.090058,0.818561,-1.593575,-1.182433
1,-1.127246,-0.189693,-0.173219,0.600316,-0.563265,-0.713182,2.679297,2.532736,1.481949,-0.356004,...,-0.328878,0.000974,0.584194,0.776684,-0.311741,-0.884127,0.741569,-0.756939,-0.839673,-0.788743
2,-0.475648,-0.76237,-0.42012,0.069817,-0.46978,0.599144,-0.58432,-0.617202,-0.074388,-0.599903,...,1.197299,0.214282,-0.283929,-0.131219,-1.290892,-0.464161,0.05577,0.843529,-0.656292,-0.650109
3,-0.818247,-0.509175,-0.320073,0.520287,0.401367,-0.384563,-0.660649,-0.418102,-0.162181,-0.424048,...,2.364452,4.025627,-0.1255,-0.173832,2.005441,0.989704,-0.513183,-0.915369,-0.780289,-0.38344
4,0.838593,1.662779,1.211467,0.287516,1.122736,1.945001,-1.191274,-0.716759,-1.401671,1.852296,...,-0.272797,-0.018644,-0.491407,-0.69141,0.406371,1.329854,-0.700744,0.886679,-0.305596,-1.131909


In [168]:
tess_groups = generate_neigbhourhood_groups(tessellation[tessellation.index.isin(X_train.index)], buffer=200)
tess_groups = tess_groups[tess_groups.index.isin(X_train.index)]
tess_groups_ilocs = (
    pd.Series(np.arange(len(X_train)), index=X_train.index)
    .loc[tess_groups.index]
    .values
)

In [171]:
neighbourhoods = X_train.loc[tess_groups.index].groupby(tess_groups.values).mean()
print_distance(neighbourhoods, metric='euclidean')

Unnamed: 0,holyne,housing estate,josefov,karlin,mala strana,malesice,nusle,stare mesto,vinohrady
holyne,0.0,6.952326,12.082053,10.023715,13.184114,19.741305,6.518297,14.075976,11.572726
housing estate,6.952326,0.0,9.029941,8.452315,11.273443,19.545098,4.68111,11.733279,9.043545
josefov,12.082053,9.029941,0.0,9.979376,9.748899,24.102852,8.681833,6.712387,6.29975
karlin,10.023715,8.452315,9.979376,0.0,10.243241,19.165014,9.018954,10.344686,6.976078
mala strana,13.184114,11.273443,9.748899,10.243241,0.0,23.488353,10.641655,5.79308,11.571782
malesice,19.741305,19.545098,24.102852,19.165014,23.488353,0.0,19.87854,24.938207,22.724833
nusle,6.518297,4.68111,8.681833,9.018954,10.641655,19.87854,0.0,11.095765,9.557223
stare mesto,14.075976,11.733279,6.712387,10.344686,5.79308,24.938207,11.095765,0.0,8.989595
vinohrady,11.572726,9.043545,6.29975,6.976078,11.571782,22.724833,9.557223,8.989595,0.0


In [172]:
from scipy.spatial.distance import pdist

In [173]:
for i, g in X_train.loc[tess_groups.index].groupby(tess_groups.values):
    print(i, np.median(pdist(g)))

holyne 8.244984263448703
housing estate 8.74870145309343
josefov 8.553119044645014
karlin 8.354128769765401
mala strana 14.482514103835978
malesice 11.533082137324413
nusle 8.694874644400908
stare mesto 11.430023617143105
vinohrady 5.211477105320186


### Tesselation filtering vs building atachment

In [4]:
import momepy as mm

In [24]:
region_id = 69300

tessellations_dir = '/data/uscuni-ulce/processed_data/tessellations/'
streets_dir = '/data/uscuni-ulce/processed_data/streets/'
buildings_dir = '/data/uscuni-ulce/processed_data/buildings/'

tessellation = gpd.read_parquet(
        tessellations_dir + f"tessellation_{region_id}.parquet"
)
streets = gpd.read_parquet(
        streets_dir + f"streets_{region_id}.parquet"
)
buildings = gpd.read_parquet(
        buildings_dir + f"buildings_{region_id}.parquet"
)

In [6]:
graph = mm.gdf_to_nx(streets, preserve_index=True)
graph = mm.node_degree(graph)

In [7]:
import networkx as nx

In [9]:
nodes, edges, sw = mm.nx_to_gdf(graph, spatial_weights=True)

 There are 304 disconnected components.


In [12]:
%%time
old_tess_nid = mm.get_nearest_street(
    tessellation, streets
)


CPU times: user 16.4 s, sys: 1.98 ms, total: 16.4 s
Wall time: 16.4 s


In [13]:
tessellation['nID'] = tess_nid
streets["nID"] = edges.index.values

In [20]:
%%time
old_node_id = mm.get_nearest_node(
    tessellation, nodes, edges, tess_nid
)

CPU times: user 432 ms, sys: 18 μs, total: 432 ms
Wall time: 431 ms


In [31]:
old_blg_nid = old_tess_nid[old_tess_nid.index >= 0]

In [32]:
old_blg_node_id = old_node_id[old_node_id.index >= 0]

In [25]:
new_blg_nid = mm.get_nearest_street(
    buildings, streets
)

In [27]:
new_blg_node_id = mm.get_nearest_node(
    buildings, nodes, edges, new_blg_nid
)

In [35]:
(old_blg_nid == new_blg_nid).sum() / old_blg_nid.shape[0]

np.float64(0.8173936013696065)

In [37]:
(old_blg_node_id == new_blg_node_id).sum() / new_blg_node_id.shape[0]

np.float64(0.8985601744108285)

In [None]:
# difference between buildings and tessellation attachment

In [248]:
blg_nid1 = tess_nid[tess_nid >= 0]