In [1]:
import glob

import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler
import momepy as mm

# Root folders of the project data. Paths are built by plain string
# concatenation, so the trailing slashes are significant.
regions_datadir = "/data/uscuni-ulce/"
data_dir = "/data/uscuni-ulce/processed_data/"

# Locations derived from the two roots above.
eubucco_files = glob.glob(f"{regions_datadir}eubucco_raw/*")  # every entry directly under eubucco_raw/
graph_dir = f"{data_dir}neigh_graphs/"
chars_dir = f"{data_dir}chars/"

In [2]:
from core.cluster_validation import generate_enc_groups
from core.utils import used_keys

In [3]:
# Read the region hull table and reproject it to EPSG:4326; the bounds of the
# selected hull are used further down as the bbox filter for the Overture download.
region_hulls = gpd.read_parquet(f"{regions_datadir}regions/regions_hull.parquet").to_crs(
    "epsg:4326"
)

In [4]:
# Pick the single region processed by this notebook. A direct label lookup
# raises KeyError when the id is absent, whereas scanning with iterrows() and
# breaking on a match would silently leave the *last* region selected.
# Assumes the index of region_hulls holds unique region ids — TODO confirm.
region_id = 69300
region_hull = region_hulls.loc[region_id, "convex_hull"]
region_id

69300

In [5]:
# Bounding box of the selected hull as returned by `.bounds`. It is passed below as
# the `bbox` argument of record_batch_reader, which unpacks it as (xmin, ymin, xmax, ymax).
hull_boundary = region_hull.bounds

In [7]:
# !conda install -c conda-forge overturemaps -y

In [8]:
## from overturemaps-py
from typing import List, Optional

import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds
import pyarrow.fs as fs
import json
import os
import sys
from typing import Optional
import pyarrow.parquet as pq
import shapely.wkb

def record_batch_reader(overture_type, bbox=None) -> Optional[pa.RecordBatchReader]:
    """
    Open a streaming reader over one Overture feature type stored on S3.

    Parameters
    ----------
    overture_type
        Feature type name; resolved to an S3 prefix by ``_dataset_path``.
    bbox
        Optional ``(xmin, ymin, xmax, ymax)``. When truthy, only rows whose
        ``bbox`` struct column overlaps this rectangle are requested.

    Returns
    -------
    pa.RecordBatchReader
        Reader over the non-empty record batches, using the dataset schema with
        the geometry field tagged by ``geoarrow_schema_adapter``.
    """
    path = _dataset_path(overture_type)

    row_filter = None
    if bbox:
        xmin, ymin, xmax, ymax = bbox

        def bbox_edge(name):
            # Reference to one member of the nested "bbox" struct column.
            return pc.field("bbox", name)

        # Two rectangles overlap when each one starts before the other ends,
        # on both axes.
        row_filter = (
            (bbox_edge("xmin") < xmax)
            & (bbox_edge("xmax") > xmin)
            & (bbox_edge("ymin") < ymax)
            & (bbox_edge("ymax") > ymin)
        )

    # Anonymous access to the bucket in us-west-2.
    s3 = fs.S3FileSystem(anonymous=True, region="us-west-2")
    dataset = ds.dataset(path, filesystem=s3)

    def non_empty_batches():
        # to_batches() can yield many zero-row batches, which have been seen to
        # cause downstream problems (e.g. a ParquetWriter emitting an empty row
        # group for each one). Skip them lazily so batches are still streamed.
        for batch in dataset.to_batches(filter=row_filter):
            if batch.num_rows > 0:
                yield batch

    return pa.RecordBatchReader.from_batches(
        geoarrow_schema_adapter(dataset.schema), non_empty_batches()
    )


def geoarrow_schema_adapter(schema: pa.Schema) -> pa.Schema:
    """
    Convert a geoarrow-compatible schema to a proper geoarrow schema

    This assumes there is a single "geometry" column with WKB formatting

    Parameters
    ----------
    schema: pa.Schema

    Returns
    -------
    pa.Schema
    A copy of the input schema with the geometry field replaced with
    a new one with the proper geoarrow ARROW:extension metadata

    Raises
    ------
    KeyError
    If the schema has no field named "geometry", or the name is not unique

    """
    geometry_field_index = schema.get_field_index("geometry")
    if geometry_field_index < 0:
        # get_field_index reports a missing or ambiguous name as -1. Passing
        # that on would not produce a clear error about the geometry column
        # and could end up tagging a different field, so fail explicitly.
        raise KeyError(
            'schema must contain exactly one field named "geometry"; '
            f"available fields: {schema.names}"
        )

    geometry_field = schema.field(geometry_field_index)
    # The metadata passed here becomes the field's metadata; it marks the
    # column as WKB-encoded geoarrow.
    geoarrow_geometry_field = geometry_field.with_metadata(
        {b"ARROW:extension:name": b"geoarrow.wkb"}
    )

    # Schema.set returns a new schema; the input schema is left untouched.
    geoarrow_schema = schema.set(geometry_field_index, geoarrow_geometry_field)

    return geoarrow_schema


# Lookup from an Overture feature "type" to the "theme" partition that contains
# it. Written grouped by theme and flattened into type -> theme; the key order
# matches the groups as listed.
type_theme_map = {
    overture_type: theme
    for theme, overture_types in (
        ("admins", ("locality", "locality_area", "administrative_boundary")),
        ("buildings", ("building", "building_part")),
        ("divisions", ("division", "division_area")),
        ("places", ("place",)),
        ("transportation", ("segment", "connector")),
        ("base", ("infrastructure", "land", "land_cover", "land_use", "water")),
    )
    for overture_type in overture_types
}


def _dataset_path(overture_type: str, release: str = "2024-06-13-beta.1") -> str:
    """
    Returns the s3 path of the Overture dataset to use. This assumes overture_type has
    been validated, e.g. by the CLI

    Parameters
    ----------
    overture_type: str
        Key of ``type_theme_map``; an unknown type raises ``KeyError``.
    release: str
        Release folder to read from. Defaults to the release this notebook was
        written against, so existing callers are unaffected.
        NOTE(review): confirm the default release is still published before
        relying on it.

    Returns
    -------
    str
        Bucket-relative path of the form
        ``overturemaps-us-west-2/release/<release>/theme=<theme>/type=<type>/``

    """
    # Map of sub-partition "type" to parent partition "theme" for forming the
    # complete s3 path. Could be discovered by reading from the top-level s3
    # location but this allows to only read the files in the necessary partition.
    theme = type_theme_map[overture_type]
    return f"overturemaps-us-west-2/release/{release}/theme={theme}/type={overture_type}/"

In [9]:
%%time

# Overture feature type to fetch; it is looked up in type_theme_map by _dataset_path.
type_ = 'segment'
# Only referenced by the commented-out download call below (in the cells visible here).
output_format = 'geoparquet'

# download(hull_boundary, output_format, f'../data/prague_overture_{type_}.{output_format}', type_)

CPU times: user 2 μs, sys: 0 ns, total: 2 μs
Wall time: 4.05 μs


In [10]:
%%time
# Drain the whole reader into memory with read_all(), so despite its name `batches`
# holds the complete result rather than a stream. This is the slow, network-bound
# step (about 2.5 minutes wall time in the recorded run).
batches = record_batch_reader(type_, hull_boundary).read_all()

CPU times: user 2.3 s, sys: 1.39 s, total: 3.69 s
Wall time: 2min 30s


In [19]:
# Arrow result -> GeoDataFrame. The data is declared as EPSG:4326 and then
# reprojected to EPSG:3035.
gdf = (
    gpd.GeoDataFrame.from_arrow(batches)
    .set_crs(epsg=4326)
    .to_crs(epsg=3035)
)

In [14]:
# `region_hull` comes from the EPSG:4326 hull table, while `gdf` has been
# reprojected. The spatial index compares raw coordinates and knows nothing
# about CRS, so bring the hull into the frame's CRS before querying; otherwise
# the query compares unrelated coordinate systems and the result is meaningless.
hull_in_gdf_crs = gpd.GeoSeries([region_hull], crs="epsg:4326").to_crs(gdf.crs).iloc[0]
gdf = gdf.iloc[gdf.sindex.query(hull_in_gdf_crs, predicate='intersects')]

In [15]:
## service road removed
# Road classes kept for the street network, and the same list as a
# "|"-separated string.
approved_roads = [
    "living_street",
    "motorway",
    "motorway_link",
    "pedestrian",
    "primary",
    "primary_link",
    "residential",
    "secondary",
    "secondary_link",
    "tertiary",
    "tertiary_link",
    "trunk",
    "trunk_link",
    "unclassified",
]
query = "|".join(approved_roads)

In [16]:
# Keep only segments whose `class` is one of the approved road classes.
is_approved = gdf["class"].isin(approved_roads)
gdf = gdf.loc[is_approved]

In [17]:
# Order rows by the geometry column and renumber the index from 0.
# NOTE(review): the next cell re-sorts by 'id' and resets the index again, so
# this step looks redundant — confirm before removing.
gdf = gdf.sort_values('geometry').reset_index(drop=True)

In [18]:
# Final layout of the street table: ordered by id, reduced to the three
# columns that get written out, with a fresh 0..n-1 index.
gdf = (
    gdf.sort_values(by="id")[["id", "geometry", "class"]]
    .reset_index(drop=True)
)

In [19]:
# Persist the region's streets under <data_dir>streets/.
gdf.to_parquet(f"{data_dir}streets/streets_{region_id}.parquet")

In [12]:
# Reload the region hull table as stored on disk (no reprojection here).
region_hulls = gpd.read_parquet(f"{regions_datadir}regions/regions_hull.parquet")

In [13]:
# Select the hull of the region to process, reprojected to EPSG:4326. A label
# lookup raises KeyError when the id is absent; the previous iterrows()/break
# scan would instead fall through and keep the last region in the table.
# Assumes the index of region_hulls holds unique region ids — TODO confirm.
region_id = 69300
region_hull = region_hulls.to_crs('epsg:4326').loc[region_id, "convex_hull"]

In [14]:
from core.generate_streets import process_region_streets, read_overture_region_streets

In [16]:
%%time
# Run the packaged pipeline from core.generate_streets for the selected region.
# NOTE(review): presumably the module equivalent of the manual cells above (the
# recorded wall time is similar) — confirm against core.generate_streets.
streets = process_region_streets(region_hull, region_id)

CPU times: user 3.1 s, sys: 1.42 s, total: 4.52 s
Wall time: 2min 33s


In [18]:
# Persist the processed streets under <data_dir>streets/.
# NOTE(review): the output recorded for this notebook ends with
# "NameError: name 'gpd' is not defined"; re-run on a fresh kernel to find
# which call raises it.
streets.to_parquet(f"{data_dir}streets/streets_{region_id}.parquet")

NameError: name 'gpd' is not defined