In [2]:
%%time

import glob

import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler

from fast_hdbscan.numba_kdtree import kdtree_to_numba
from fast_hdbscan.numba_kdtree import parallel_tree_query
from sklearn.neighbors import KDTree
from core.cluster_validation import print_distance, generate_neigbhourhood_groups
from core.utils import char_names
import momepy as mm

CPU times: user 11.5 s, sys: 514 ms, total: 12 s
Wall time: 9.59 s


## Download old data


In [3]:
import requests 
def download_url(url, save_path, chunk_size=128, timeout=60):
    """Stream the resource at ``url`` into the file ``save_path``.

    Parameters
    ----------
    url : str
        Address of the resource to download.
    save_path : str or path-like
        Destination file. It is opened in binary write mode, so an existing
        file is overwritten.
    chunk_size : int, default 128
        Number of bytes requested per chunk from the response stream.
    timeout : float or tuple, default 60
        Forwarded to ``requests.get`` so that an unresponsive server cannot
        block the notebook forever.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Without this check the
        body of the error page would be written to ``save_path`` as if it
        were the requested data.
    """
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(save_path, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=chunk_size):
                fd.write(chunk)

# download_url('https://figshare.com/ndownloader/files/38855088', '../data/barcelona/old_barcelona.parquet')

# download_url('https://opendata-ajuntament.barcelona.cat/data/dataset/6fd03b36-7503-42f0-9082-9040cb812423/resource/a11a534d-4aeb-497f-b6bd-2e170b4724aa/download', '../data/barcelona/streets_barcelona.zip')

# download_url(
#     "https://opendata-ajuntament.barcelona.cat/data/dataset/808daafa-d9ce-48c0-925a-fa5afdb1ed41/resource/cd800462-f326-429f-a67a-c69b7fc4c50a/download", '../data/barcelona/admin_units.zip')

In [None]:
import osmnx as ox

In [None]:
streets = gpd.read_file('../data/barcelona/streets_barcelona.zip', layer = 'BCN_GrafVial_Trams_ETRS89_SHP')

In [None]:
# total_bounds is (minx, miny, maxx, maxy); name the sides once instead of
# indexing into the array at every call.
bounds = streets.to_crs(4326).total_bounds
west, south, east, north = bounds

# Water features inside the street network's bounding box.
tags = {'natural': ['water', 'coastline', 'spring']}
water = ox.geometries_from_bbox(north, south, east, west, tags).to_crs(streets.crs)
water[['natural', 'geometry']].to_parquet("../data/barcelona/water.pq")

# Railway lines: keep only LineStrings that are not tagged as tunnels and
# are neither miniature railways nor trams.
tags = {'railway': True}
railway = ox.geometries_from_bbox(north, south, east, west, tags).to_crs(streets.crs)
keep = (
    (railway.geom_type == 'LineString')
    & (railway.tunnel != 'yes')
    & ~railway.railway.isin(['miniature', 'tram'])
)
railway = railway[keep]
railway[['railway', 'geometry']].to_parquet("../data/barcelona/railway.pq")

In [4]:
region_id = 'barcelona'
buildings_dir = streets_dir = enclosures_dir = tessellations_dir = graph_dir = '../data/barcelona/'
chars_dir = '../data/barcelona/chars/'
old_file = '../data/barcelona/old_barcelona.parquet'

## Process old data through the current pipeline

In [5]:
streets = gpd.read_file('../data/barcelona/streets_barcelona.zip', layer = 'BCN_GrafVial_Trams_ETRS89_SHP')

In [6]:
water = gpd.read_parquet('../data/barcelona/water.pq')
railway = gpd.read_parquet('../data/barcelona/railway.pq')
admin = gpd.read_file("../data/barcelona/0301100100_UNITATS_ADM_POLIGONS.json")

In [7]:
extended_railway = mm.extend_lines(railway, 30, target=streets, extension=.1)

In [8]:
enclosures = mm.enclosures(streets, limit=admin.iloc[[0]], additional_barriers=[extended_railway])

In [9]:
old_gdf = gpd.read_parquet(old_file)

In [10]:
buildings = old_gdf[['uID', 'buildings']]

In [11]:
# Take an explicit copy: a new column is assigned to this frame further down,
# and assigning to a plain column slice of `old_gdf` triggers pandas'
# SettingWithCopyWarning (and may or may not write through to `old_gdf`).
tessellation = old_gdf[['tessellation', 'tID', 'buildings']].copy()

In [13]:
inp, res = enclosures.sindex.query(tessellation.representative_point().geometry, predicate='within')

In [14]:
# `res` is assigned positionally as the enclosure index of every tessellation
# cell, which is only valid if the spatial query returned exactly one match per
# cell, in row order. Enforce that instead of only displaying it, and derive
# the expected length from the data rather than hard-coding the row count.
expected_positions = np.arange(len(tessellation))
assert np.array_equal(inp, expected_positions), (
    "each tessellation cell must fall within exactly one enclosure"
)
inp == expected_positions

array([ True,  True,  True, ...,  True,  True,  True])

In [15]:
tessellation['enclosure_index'] = res

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [16]:
# Export the building footprints in the layout the pipeline reads:
# the two columns are renamed positionally to 'uID' and 'geometry' ...
buildings.columns = ['uID', 'geometry']
# ... and 'geometry' is made the active geometry column.
buildings = buildings.set_geometry('geometry')
# Rows with missing values are dropped only for the file that is written
# (reprojected to EPSG:3035); the result of dropna() is not assigned back,
# so the in-memory `buildings` frame still contains those rows.
buildings.dropna().to_crs(epsg=3035).to_parquet(buildings_dir + f"buildings_{region_id}.parquet")

In [17]:
# Cells without a building are marked by negating their index value.
# (Note that an index value of 0 cannot be marked this way, since -0 == 0.)
no_building = tessellation['buildings'].isna().to_numpy()
original_index = tessellation.index.values
new_index = np.where(no_building, -original_index, original_index)

# Keep only the columns the pipeline needs, under the names it expects.
tessellation = tessellation[['tessellation', 'tID', 'enclosure_index']]
tessellation.columns = ['geometry', 'tID', 'enclosure_index']
tessellation = tessellation.set_geometry('geometry')
tessellation.index = new_index

tessellation_path = tessellations_dir + f"tessellation_{region_id}.parquet"
tessellation.to_crs(epsg=3035).to_parquet(tessellation_path)

In [18]:
streets.to_crs(epsg=3035)[['geometry']].to_parquet(streets_dir + f"streets_{region_id}.parquet")

In [19]:
enclosures.to_crs(epsg=3035).to_parquet(enclosures_dir + f"enclosure_{region_id}.parquet")

### Push data through pipeline


In [20]:
from core.generate_ngraphs import process_region_graphs

In [21]:
%%time
process_region_graphs(
    region_id,
    graph_dir,
    buildings_dir,
    streets_dir,
    enclosures_dir,
    tessellations_dir,
)

Built tess graph knn=1
Built buildings graph knn=1
Built streets graph knn=1
Built enclosure graph knn=1
Built nodes graph knn=1
CPU times: user 20.3 s, sys: 624 ms, total: 20.9 s
Wall time: 20.9 s


 There are 41 disconnected components.


## Characters

In [24]:
from core.generate_chars import process_single_region_chars, process_building_chars

In [25]:
%%time
process_single_region_chars(
    region_id,
    graph_dir,
    buildings_dir,
    streets_dir,
    enclosures_dir,
    tessellations_dir,
    chars_dir
)

2024-08-06 17:09:12.504845 ----Processing ------ barcelona
Processing streets


  return lib.distance(a, b, **kwargs)
  return lib.distance(a, b, **kwargs)


Processing enclosures
Processing buildings


  angles = np.arccos(cosine_angle)
  angles = np.arccos(cosine_angle)
  return np.nanmean(np.abs(90 - degrees[true_angles]))
  angles = np.arccos(cosine_angle)
  return Series({"mean": np.nanmean(dists), "std": np.nanstd(dists)})
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  (geoms.distance(geometry.geometry, align=True)).groupby(level=0).mean()


Processing tessellation


  return lib.distance(a, b, **kwargs)
  return lib.distance(a, b, **kwargs)


CPU times: user 5min 50s, sys: 1.81 s, total: 5min 52s
Wall time: 5min 51s


In [26]:
tessellation = gpd.read_parquet(chars_dir + f"tessellations_chars_{region_id}.parquet")
buildings = gpd.read_parquet(chars_dir + f"buildings_chars_{region_id}.parquet")
enclosures = gpd.read_parquet(chars_dir + f"enclosures_chars_{region_id}.parquet")
streets = gpd.read_parquet(chars_dir + f"streets_chars_{region_id}.parquet")
nodes = gpd.read_parquet(chars_dir + f"nodes_chars_{region_id}.parquet")


# Build one table with a row per tessellation cell carrying the characters of
# its building, enclosure, street and node. Every join is a left join, so no
# tessellation cell is dropped along the way.
building_chars = buildings.drop(columns=["nodeID", "geometry", 'nID'])
enclosure_chars = enclosures.drop(columns="geometry")
street_chars = streets.drop(columns="geometry")
node_chars = nodes.drop(columns="geometry")

# Linking / helper columns that are not characters themselves.
# (Previously considered for removal as well: "eID", "id", "osm_id", "index".)
helper_columns = [
    "nID",
    "nodeID",
    "mm_len",
    "cdsbool",
    "node_start",
    "node_end",
    "x",
    "y",
    "enclosure_index",
]

merged = (
    pd.merge(
        tessellation.drop(columns=["geometry"]),
        building_chars,
        left_index=True,
        right_index=True,
        how="left",
    )
    .merge(enclosure_chars, left_on="enclosure_index", right_index=True, how="left")
    .merge(street_chars, on="nID", how="left")
    .merge(node_chars, on="nodeID", how="left")
    .drop(columns=helper_columns)
    # the column-based merges discard the original index; restore it
    .set_index(tessellation.index)
)

In [27]:
from core.utils import used_keys

In [28]:
primary = merged[list(used_keys.keys())]
primary.shape

(82375, 62)

In [29]:
primary.to_parquet(chars_dir + f'primary_chars_{region_id}.parquet')

## Comparison

In [30]:
import lonboard
from core.utils import char_names
from mapclassify import classify
import matplotlib as mpl
from lonboard.colormap import apply_continuous_cmap
from sidecar import Sidecar
import momepy as mm
from core.utils import used_keys

In [36]:
old_prague_data = pd.read_parquet(old_file)

In [32]:
new_prague_data = pd.read_parquet(chars_dir + f'primary_chars_{region_id}.parquet')

In [173]:
# Column name in the old data -> name under which a copy of it is added.
# (The target names presumably match the character codes of the new data —
# confirm against `used_keys`.)
graph_keys = dict(
    degree="mtdDeg",
    meshedness="lcdMes",
    local_closeness="lcnClo",
    proportion_3="linP3W",
    proportion_4="linP4W",
    proportion_0="linPDE",
)

for old_name, new_name in graph_keys.items():
    if old_name not in old_prague_data.columns:
        continue
    old_prague_data[new_name] = old_prague_data[old_name]

In [174]:
# Pearson correlation between the new and the old value of every character,
# compared positionally and ignoring rows where either side is NaN.
vals = {}
for c in new_prague_data.columns:
    # Old column names carry an 'e' wherever the new names carry a 'k'.
    old_c = c.replace('k', 'e')
    if old_c not in old_prague_data.columns:
        print(c, 'not in old data')
        continue

    new_values = new_prague_data[c].values
    old_values = old_prague_data[old_c].values

    both_present = ~(np.isnan(new_values) | np.isnan(old_values))
    new_values, old_values = new_values[both_present], old_values[both_present]

    vals[c] = np.corrcoef(new_values, old_values)[0, 1]

ltcBuA not in old data
ldsRea not in old data
ldsAre not in old data
midRea not in old data
midAre not in old data
ltcWRB not in old data
likWBB not in old data


In [180]:
vals = pd.Series(vals)
non_correlated = vals[ vals < .80]
non_correlated.sort_values()

mdcAre    0.125303
ldsMSL    0.417514
ltbIBD    0.559449
sdsAre    0.587823
mtbSWR    0.597466
lddNDe    0.639213
sddAre    0.727459
stcSAl    0.746526
linWID    0.770307
stbSAl    0.771552
dtype: float64

In [177]:
[used_keys[c] for c in non_correlated.sort_values().index]

['area covered by neighbouring cells',
 'mean segment length within 3 steps',
 'mean inter-building distance',
 'area covered by edge-attached ETCs',
 'shared walls ratio of buildings',
 'local node density of street network',
 'area covered by node-attached ETCs',
 'street alignment of ETC',
 'local degree weighted node density of street network',
 'street alignment of building']

In [181]:
# Notes on why each weakly correlated character differs between the old and new data.
# Every assignment overwrites the previous one, so `c` ends up as 'sddAre' after this cell.
c = 'mdcAre' # due to the tessellation topology issues
c = 'ldsMSL' # due to a bug in the momepy.SegmentsLengths code: if you pick .series it returns the sum
c = 'ltbIBD' # due to the tessellation topology issues
c = 'sdsAre' # in general higher because it uses all the edges that touch a tessellation cell (network ratio vs. get nearest street)
c = 'mtbSWR' # we capped it at 1, the old value can be higher; we also use a larger buffer
c = 'lddNDe' # in ~20% of cases the attached edge and node are different (match rate 0.79100455)
c = 'sddAre' # cause not identified yet
...

Ellipsis

In [40]:
tessellation = gpd.read_parquet(chars_dir + f"tessellations_chars_{region_id}.parquet")
buildings = gpd.read_parquet(chars_dir + f"buildings_chars_{region_id}.parquet")
enclosures = gpd.read_parquet(chars_dir + f"enclosures_chars_{region_id}.parquet")
streets = gpd.read_parquet(chars_dir + f"streets_chars_{region_id}.parquet")

In [219]:
c = 'lddNDe'

# Same comparison as for all characters, for one character only; the filtered
# arrays are kept in `new_values` / `old_values` for further inspection.
new_values = new_prague_data[c].values
old_values = old_prague_data[c].values

both_present = ~(np.isnan(new_values) | np.isnan(old_values))
new_values, old_values = new_values[both_present], old_values[both_present]
print(np.corrcoef(new_values, old_values)[0,1], new_values.shape)

0.6392134760710421 (82375,)


In [186]:
used_keys[c]

'local node density of street network'

In [187]:
np.sum(old_values - new_values)

502.80513484632695

In [188]:
graph = mm.gdf_to_nx(streets, preserve_index=True)
graph = mm.node_degree(graph)
res = mm.node_density(graph, radius=5, verbose=False)

In [189]:
nodes, edges, sw = mm.nx_to_gdf(res, spatial_weights=True)

 There are 41 disconnected components.


In [202]:
# Shift the node index by one.
# NOTE(review): presumably this aligns the index of the rebuilt nodes with the
# `nodeID` values of the old data, which are used to look nodes up below — confirm.
# Not idempotent: running this cell again shifts the index a second time.
nodes.index = nodes.index + 1

In [210]:
c = 'degree'

# First recorded value of `c` for every node in the old data, indexed by nodeID.
r = old_prague_data.groupby('nodeID')[[c]].first()

In [211]:
assert (nodes.loc[r.index, c] == r.values[:, 0]).all()

In [223]:
tess_nid = mm.get_nearest_street(
    tessellation, streets
)

In [229]:
(tess_nid.values == old_prague_data['edgeID_primary'].values).sum() / old_prague_data['edgeID_primary'].shape

array([0.79100455])

In [231]:
links = mm.get_network_ratio(tessellation, streets)

In [239]:
tessellation[['edgeID_keys', 'edgeID_values']] = links
keys = tessellation.edgeID_values.apply(lambda a: np.argmax(a))
tess_nid = np.array([inds[i] for inds, i in zip(tessellation.edgeID_keys, keys)])

In [241]:
(tess_nid == old_prague_data['edgeID_primary'].values).sum() / old_prague_data['edgeID_primary'].shape

array([0.96996662])

In [124]:
plotting = streets.copy()

In [125]:
%%time
layer = lonboard.PathLayer.from_geopandas(plotting, width_min_pixels=1)

CPU times: user 58 ms, sys: 7.98 ms, total: 65.9 ms
Wall time: 65.9 ms




In [126]:
sc = Sidecar(title='Clusters')
m = lonboard.Map(layer, basemap_style=lonboard.basemap.CartoBasemap.Positron)
with sc:
    display(m)

In [134]:
# Colour the street layer by the currently selected character `c`:
# classify the values into 20 equal-interval bins, scale the bin labels
# to [0, 1] and map them through the viridis colormap.
# NOTE(review): the recorded output of this cell is
# "ValueError: arange: cannot compute length"; presumably `streets[column]`
# contains NaN or infinite values that break the bin computation — confirm.
column = c
classifier = classify(streets[column], 'equalinterval', k=20)
normalizer = mpl.colors.Normalize(0, classifier.bins.shape[0])
vals = normalizer(classifier.yb)
layer.get_color = apply_continuous_cmap(vals, mpl.colormaps['viridis'])

ValueError: arange: cannot compute length

In [161]:
old_prague_data[c]

0          2118.532010
1          3833.421785
2          2118.532010
3          3833.421785
4          1593.936143
             ...      
82370     80307.253955
82371     95125.787387
82372     20724.432870
82373    126712.007579
82374     12903.849147
Name: sddAre, Length: 82375, dtype: float64

In [162]:
new_prague_data[c]

 0          4616.841622
 1          4616.841622
 2          4616.841622
 3          4616.841622
 4          4009.166100
              ...      
 82370     62656.839672
-82371     62656.839672
-82372     24762.223841
-82373    126797.127656
-82374     12912.474772
Name: sddAre, Length: 82375, dtype: float64