In [1]:
import gc
import glob

import geopandas as gpd
import momepy as mm
import numpy as np
import pandas as pd
import shapely
from libpysal.graph import Graph, read_parquet
import datetime

In [3]:
# Root of the data tree; the raw EUBUCCO files are globbed from directly beneath it.
regions_datadir = "/data/uscuni-ulce/"
eubucco_files = glob.glob(f"{regions_datadir}eubucco_raw/*")

# Per-region building inputs.
regions_buildings_dir = "/data/uscuni-ulce/regions/buildings/"

# One directory per stage of the processing pipeline.
buildings_dir = "/data/uscuni-ulce/processed_data/buildings/"
streets_dir = "/data/uscuni-ulce/processed_data/streets/"
enclosures_dir = "/data/uscuni-ulce/processed_data/enclosures/"
tessellations_dir = "/data/uscuni-ulce/processed_data/tessellations/"
graph_dir = "/data/uscuni-ulce/processed_data/neigh_graphs/"
chars_dir = "/data/uscuni-ulce/processed_data/chars/"

In [4]:
### used for testing
# osm_ids = ['v0.1-DEU.9.8.13.1_1-194', 'v0.1-DEU.9.8.13.1_1-232',
#        'v0.1-DEU.9.8.13.1_1-214', 'v0.1-DEU.9.8.13.1_1-273',
#        'v0.1-DEU.9.8.13.1_1-211', 'v0.1-DEU.9.8.13.1_1-188',
#        'v0.1-DEU.9.8.13.1_1-192', 'v0.1-DEU.9.8.13.1_1-276',
#        'v0.1-DEU.9.8.13.1_1-191', 'v0.1-DEU.9.8.13.1_1-184',
#        'v0.1-DEU.9.8.13.1_1-245', 'v0.1-DEU.9.8.13.1_1-275']

In [4]:
# prague - 69333
# freiburg - 5883

In [5]:
# Region to process; used as a label into the index of the region hull table.
region_name = 69333
region_hulls = gpd.read_parquet(
    f"{regions_datadir}regions/cadastre_regions_hull.parquet"
)
# The row's name is its index label; `convex_hull` is read off the same row.
region_id = region_hulls.loc[region_name].name
region_hull = region_hulls.loc[region_name].convex_hull

In [7]:
# ## freiburg - buildings come from a different source, so we don't use the building pre-processing pipeline; go directly to streets
# gdf = gpd.read_parquet('../data/freiburg/buildings_freiburg.parquet').to_crs(epsg=3035)
# region_id = region_name = 'freiburg'
# region_hull = gdf.union_all().convex_hull
# region_hulls = gpd.GeoDataFrame({'geometry': [region_hull]}, index=[region_id], crs=gdf.crs)
# buildings_dir = streets_dir = enclosures_dir = tessellations_dir = graph_dir = '../data/freiburg/'
# chars_dir = '../data/freiburg/chars/'

## Buildings


In [8]:
from core.generate_buildings import read_region_buildings, process_region_buildings

In [9]:
# ## need to link eubucco building polygons to regions, this will change in the future
# building_region_mapping = pd.read_parquet(
#     regions_datadir + "regions/" + "id_to_region.parquet", engine="pyarrow"
# )
# typed_dict = pd.Series(
#     np.arange(building_region_mapping["id"].values.shape[0]),
#     index=building_region_mapping["id"].values,
# )
# region_ids = building_region_mapping.groupby("region")["id"].unique()
# del building_region_mapping  # its 2/3 gb

In [10]:
%%time

# Raw building footprints for the selected region.
buildings = gpd.read_parquet(f"{regions_buildings_dir}buildings_{region_id}.pq")

CPU times: user 218 ms, sys: 156 ms, total: 374 ms
Wall time: 360 ms


In [12]:
%%time

# NOTE(review): the meaning of the positional `True` is defined in
# core.generate_buildings and is not visible here — confirm before changing it.
buildings = process_region_buildings(
    buildings,
    True,
    simplification_tolerance=0.1,
    merge_limit=25,
)

Percent polygons:  1.0
Final polygons:  457127 , dropped:  0.02918639964321368
CPU times: user 1min 14s, sys: 324 ms, total: 1min 14s
Wall time: 1min 15s


In [13]:
# Persist the processed buildings so that later stages can read them by region id.
buildings.to_parquet(f"{buildings_dir}buildings_{region_id}.parquet")

## Streets

In [17]:
from core.generate_streets import process_region_streets, read_overture_region_streets

In [18]:
## overture is indexed based on 4326
# Keep the selection as a one-row frame so it can be reprojected, then take
# the single hull geometry out of the resulting series.
overture_hull = (
    region_hulls.loc[[region_name]]
    .to_crs(epsg=4326)
    .convex_hull
    .iloc[0]
)

In [19]:
%%time
## process streets
# Takes the EPSG:4326 hull, the region id and the processed-buildings directory.
# The helper comes from core.generate_streets; its internals are not visible
# here — presumably it downloads and simplifies the street network (TODO confirm).
streets = process_region_streets(overture_hull, region_id, buildings_dir)


  additions, splits = snap_to_targets(
  additions, splits = snap_to_targets(
can only convert an array of size 1 to a Python scalar
  roads_cleaned = simplify_singletons(
  additions, splits = snap_to_targets(
  nx_gx_cluster(
  nx_gx_cluster(
  nx_gx_cluster(
  additions, splits = snap_to_targets(


CPU times: user 9min 37s, sys: 4.67 s, total: 9min 42s
Wall time: 12min 42s


In [20]:
# streets.explore(tiles='cartodbpositron', prefer_canvas=True)

In [22]:
## save streets
streets.to_parquet(f"{streets_dir}streets_{region_id}.parquet")

In [1]:
import geopandas as gpd
# gpd.read_parquet(streets_dir + f'streets_{region_id}.parquet').explore()

In [2]:
# gpd.read_parquet(enclosures_dir + f'enclosure_{region_id}.parquet').explore()

## Enclosures & Tessellation

In [23]:
from core.generate_elements import process_region_elements

In [24]:
%%time
# Builds enclosures and the tessellation for the region from the buildings and
# streets written to disk above. The helper reads from the two directories
# itself — presumably by region id (TODO confirm in core.generate_elements).
enclosures, tesselations = process_region_elements(buildings_dir, streets_dir, region_id)

---- Processing region:  69333 2024-09-13 19:56:49.895025




Retrying tesselation with less buildings, potentially changing building data.




Dropping 2 buildings due to tesselation problems
CPU times: user 3min 30s, sys: 4.47 s, total: 3min 35s
Wall time: 4min 50s


In [25]:
## save files
# Enclosures first, then the tessellation; each write is followed by a progress message.
enclosures.to_parquet(f"{enclosures_dir}enclosure_{region_id}.parquet")
print("Processed enclosures")

tesselations.to_parquet(f"{tessellations_dir}tessellation_{region_id}.parquet")
print("processed tesselations")

Processed enclosures
processed tesselations


In [26]:
# import lonboard
# layer = lonboard.PolygonLayer.from_geopandas(tesselations, opacity=0.15)
# m = lonboard.Map([layer])
# m

In [27]:
# layer = lonboard.PolygonLayer.from_geopandas(enclosures, opacity=0.15)
# m = lonboard.Map([layer])
# m

## Graphs

In [28]:
from core.generate_ngraphs import process_region_graphs

In [29]:
%%time
# The return value is not captured. Judging by the log output, the helper builds
# graphs for the tessellation, buildings, streets, enclosures and nodes —
# presumably writing them under graph_dir (TODO confirm in core.generate_ngraphs).
process_region_graphs(
    region_id,
    graph_dir,
    buildings_dir,
    streets_dir,
    enclosures_dir,
    tessellations_dir,
)

Built tess graph knn=1
Built buildings graph knn=1
Built streets graph knn=1
Built enclosure graph knn=1


 There are 243 disconnected components.


Built nodes graph knn=1
CPU times: user 43.5 s, sys: 834 ms, total: 44.3 s
Wall time: 44.2 s


## Characters

In [30]:
from core.generate_chars import process_single_region_chars, process_street_chars

In [31]:
%%time
# The return value is not captured; the per-element character tables are read
# back from chars_dir in the next section, so the helper is expected to write
# them there (TODO confirm in core.generate_chars).
process_single_region_chars(
    region_id,
    graph_dir,
    buildings_dir,
    streets_dir,
    enclosures_dir,
    tessellations_dir,
    chars_dir
)

2024-09-13 20:03:29.594669 ----Processing ------ 69333
Processing streets
Processing enclosures
Processing buildings


  return np.nanmean(np.abs(90 - degrees[true_angles]))
  return Series({"mean": np.nanmean(dists), "std": np.nanstd(dists)})
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


Processing tessellation
CPU times: user 10min 21s, sys: 3.83 s, total: 10min 24s
Wall time: 10min 23s


## Generate primary data

In [32]:
# Load the character tables written by the previous step, one per element type.
# Note that `buildings`, `enclosures` and `streets` are rebound to these tables.
tessellation = gpd.read_parquet(f"{chars_dir}tessellations_chars_{region_id}.parquet")
buildings = gpd.read_parquet(f"{chars_dir}buildings_chars_{region_id}.parquet")
enclosures = gpd.read_parquet(f"{chars_dir}enclosures_chars_{region_id}.parquet")
streets = gpd.read_parquet(f"{chars_dir}streets_chars_{region_id}.parquet")
nodes = gpd.read_parquet(f"{chars_dir}nodes_chars_{region_id}.parquet")

In [33]:

# Building characters are joined onto the tessellation cells index-to-index.
merged = tessellation.drop(columns=["geometry"]).merge(
    buildings.drop(columns=["nodeID", "geometry", "nID"]),
    how="left",
    left_index=True,
    right_index=True,
)

# Enclosure characters are matched through each cell's `enclosure_index`.
merged = merged.merge(
    enclosures.drop(columns="geometry"),
    how="left",
    left_on="enclosure_index",
    right_on="eID",
)

# Street and node characters are matched through their shared id columns.
merged = merged.merge(streets.drop(columns="geometry"), how="left", on="nID")
merged = merged.merge(nodes.drop(columns="geometry"), how="left", on="nodeID")

# Remove the linkage / bookkeeping columns, then restore the tessellation index
# (the column-based merges above do not carry it through).
merged = merged.drop(
    columns=[
        "nID",
        "eID",
        "nodeID",
        "mm_len",
        "cdsbool",
        "node_start",
        "node_end",
        "x",
        "y",
        "enclosure_index",
        # "id",
        # "osm_id",
        "index",  ## maybe keep
    ]
).set_index(tessellation.index)

In [34]:
from core.utils import used_keys

In [35]:
# Restrict the merged table to the columns named by the keys of `used_keys`.
primary = merged.loc[:, list(used_keys)]
primary.shape

(460909, 63)

In [36]:
# Persist the primary characters next to the per-element character tables.
primary.to_parquet(f"{chars_dir}primary_chars_{region_id}.parquet")

----

In [12]:
%%time
# Scratch section: fetch the unprocessed Overture segments for the region hull.
# This rebinds `streets`, which until now held the street character table.
streets = read_overture_region_streets(overture_hull, region_id)

CPU times: user 3.85 s, sys: 1.97 s, total: 5.82 s
Wall time: 2min 50s


In [14]:
## service road removed
approved_roads = [
    "living_street",
    "motorway",
    "motorway_link",
    "pedestrian",
    "primary",
    "primary_link",
    "residential",
    "secondary",
    "secondary_link",
    "tertiary",
    "tertiary_link",
    "trunk",
    "trunk_link",
    "unclassified",
]
# Keep only the segments whose `class` is on the whitelist above.
streets = streets.loc[streets["class"].isin(approved_roads)]

In [16]:
# Inspect which attributes the raw street segments carry.
streets.columns

Index(['id', 'geometry', 'bbox', 'version', 'sources', 'subtype', 'class',
       'names', 'connector_ids', 'connectors', 'routes', 'subclass',
       'subclass_rules', 'access_restrictions', 'level_rules', 'destinations',
       'prohibited_transitions', 'road_surface', 'road_flags', 'speed_limits',
       'width_rules'],
      dtype='object')

In [114]:
# Level rules of the segments that carry any road flags.
streets.loc[streets.road_flags.notna(), "level_rules"]

26850     [{'value': 1, 'between': [0.050136164, 0.05940...
109595    [{'value': 1, 'between': [0.915219696, 0.94196...
109620    [{'value': 1, 'between': [0.151529713, 0.27965...
109827    [{'value': 1, 'between': [0.375746636, 0.41757...
109851    [{'value': 1, 'between': [0.509854034, 0.54010...
                                ...                        
209391    [{'value': 1, 'between': [0.670127379, 0.94802...
209376    [{'value': 1, 'between': [0.772954601, 0.86785...
209412    [{'value': 1, 'between': [0.004426714, 0.01168...
199133                                                 None
200798    [{'value': 1, 'between': [0.658627462, 0.80559...
Name: level_rules, Length: 2690, dtype: object

In [69]:
# Segments that carry road flags, reprojected to EPSG:3035 so that geometry
# lengths can be compared against a threshold in metres.
# The raw coordinates are WGS84 (EPSG:4326) — the same CRS the full table is
# tagged with further down. EPSG:4236 is a different datum and was a
# transposition typo that skewed the measured lengths.
to_filter = streets.loc[streets.road_flags.notna()].set_crs(epsg=4326).to_crs(epsg=3035)

In [106]:
def to_drop_tunnel(row, max_tunnel_length=50):
    """Decide whether a street segment should be dropped because it is a tunnel.

    Parameters
    ----------
    row : object
        Row exposing ``geometry.length`` and ``road_flags``. ``road_flags`` is
        an iterable of dict-like flags; a flag may hold a ``'values'``
        collection and a ``'between'`` pair of fractional positions
        ``[start, end]`` along the segment.
    max_tunnel_length : float, default 50
        Longest tunnelled stretch, in the units of ``row.geometry.length``,
        that is still kept.

    Returns
    -------
    bool
        True when any tunnel flag covers the whole segment (its ``'between'``
        is missing or None), or when the summed length of the partial tunnel
        stretches exceeds ``max_tunnel_length``. False otherwise, including
        when the row has no flags at all.
    """
    flags = row.road_flags
    # None / NaN means no flags were recorded for this segment.
    if flags is None or isinstance(flags, float):
        return False

    segment_length = row.geometry.length
    tunnel_proportion = 0.0
    for flag in flags:
        if 'values' not in flag or flag['values'] is None:
            continue
        if 'is_tunnel' not in flag['values']:
            continue
        # `between` could be missing or None to indicate the whole segment is a
        # tunnel; that decides the outcome regardless of any partial flags.
        if 'between' not in flag or flag['between'] is None:
            return True
        start, end = flag['between'][0], flag['between'][1]
        tunnel_proportion += end - start

    return bool(tunnel_proportion * segment_length > max_tunnel_length)

In [110]:
# Row-wise evaluation (axis=1): one boolean per flagged segment, True meaning
# the segment should be removed as a tunnel.
tunnels_to_drop = to_filter.apply(to_drop_tunnel, axis=1)

In [111]:
# Remove the flagged tunnel segments from the full street table by index label.
streets = streets.drop(index=to_filter.loc[tunnels_to_drop].index)

In [118]:
# Declare the source coordinates as EPSG:4326, then reproject to EPSG:3035.
streets = streets.set_crs(epsg=4326).to_crs(epsg=3035)

(49026, 17)

In [120]:
# streets[(~streets.level_rules.isna()) & (streets.road_flags.isna())].explore()

In [123]:
# def to_drop_tunnel(tunnel):
#     tunnel_length = tunnel.geometry.length
#     tunnel = json.loads(tunnel.road)
#     if 'flags' in tunnel:
#         total_tunnel_proportion = 0.0
#         for flag in tunnel['flags']:
#             if 'between' in flag:
#                 s,e = flag['between'][0], flag['between'][1]
#                 total_tunnel_proportion += (e - s)
#         if (total_tunnel_proportion*tunnel_length) > 50:
#             return True
#         elif total_tunnel_proportion == 0.0:
#             return True
#     return False

In [1]:
import sgeop
import geopandas
# Inputs for the standalone simplification experiment below.
# NOTE(review): region 69300 is hard-coded here while the pipeline above
# processes 69333 — confirm this is intentional.
buildings = geopandas.read_parquet(
    "/data/uscuni-ulce/processed_data/buildings/buildings_69300.parquet",
    columns=["geometry"],
)
roads = geopandas.read_parquet(
    "/data/uscuni-ulce/processed_data/streets/streets_69300.parquet"
)


In [2]:
%%time
# Simplify the road network, passing the building geometries as an exclusion mask.
# NOTE(review): the meaning and units of artifact_threshold_fallback=7 are
# defined by sgeop and are not visible here — confirm against its documentation.
new_roads = sgeop.simplify_network(
    roads,
    exclusion_mask=buildings.geometry,
    artifact_threshold_fallback=7,
)

  additions, splits = snap_to_targets(
can only convert an array of size 1 to a Python scalar
  roads_cleaned = simplify_singletons(


CPU times: user 5min 28s, sys: 2.15 s, total: 5min 30s
Wall time: 5min 25s


In [7]:
# new_roads.explore(tiles='cartodbpositron', prefer_canvas=True)