In [1]:
# Downloading data from Overture Maps on Aamzon S3
### Relying on https://pypi.org/project/overturemapsdownloader/

In [42]:
import duckdb
from typing import Any
import json
import geopandas as gpd

from osgeo import ogr
from shapely import wkb


In [34]:
con = duckdb.connect()
print(con.execute('SELECT 42').fetchall())
# To install an extension (this is usually done only once)
try:
    con.install_extension('httpfs')
    con.install_extension('spatial')
    con.load_extension('httpfs')
    con.load_extension('spatial')
    con.execute("SET s3_region='us-west-2'")
except Exception as e:
    print(f"Failed to install extension: {e}")

[(42,)]


In [35]:
# Define the query to read from S3 and filter the data
query = '''
SELECT
    type,
    subType,
    localityType,
    adminLevel,
    isoCountryCodeAlpha2,
    JSON(names) AS names,
    JSON(sources) AS sources,
    ST_GeomFromWkb(geometry) AS geometry
FROM read_parquet(
    's3://overturemaps-us-west-2/release/2023-07-26-alpha.0/theme=admins/type=*/*',
    filename=true,
    hive_partitioning=1
)
WHERE adminLevel = 2
    AND ST_GeometryType(ST_GeomFromWkb(geometry)) IN ('POLYGON','MULTIPOLYGON')
LIMIT 5
'''

In [36]:
# Execute the query and store the result
df = con.execute(query).fetchdf()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

IOException: IO Error: Connection error for HTTP HEAD to 'https://overturemaps-us-west-2.s3.amazonaws.com/release/2023-07-26-alpha.0/theme%3Dadmins/type%3Dlocality/20230725_211237_00132_5p54t_e608b636-1b9e-4e3c-ad00-d93fb09ed323'

In [None]:
# Try to convert it to a Shapely geometry object
def try_wkb_loads(data):
    try:
        return wkb.loads(data)
    except Exception as e:
        print(f"Failed to convert geometry: {e}")
        return None
    
df['geometry'] = df['geometry'].apply(try_wkb_loads)
gdf = gpd.GeoDataFrame(df, geometry='geometry', crs=4326)

In [None]:
gdf

In [13]:
import dask.dataframe as dd
import geopandas as gpd
import dask_geopandas as dgpd 

from shapely.geometry import box

In [14]:
iso3 = 'KHM'

In [28]:
df = dd.read_parquet(
    's3://overturemaps-us-west-2/release/2023-07-26-alpha.0/theme=places/type=place/*',
    columns=['bbox', 'geometry'],
    engine='pyarrow',
    index='id',
    dtype_backend='pyarrow',
    storage_options={"anon": True},
    parquet_file_extensions=False,
)

In [30]:
# Get extent of selected country as a bounding box polygon
world_filepath = gpd.datasets.get_path('naturalearth_lowres')
world = gpd.read_file(world_filepath)

sel_country = world[world['iso_a3'] == iso3]
country_box = box(*sel_country.total_bounds)
str(country_box)

'POLYGON ((107.61454796756243 10.48654368737523, 107.61454796756243 14.570583807834282, 102.34809939983302 14.570583807834282, 102.34809939983302 10.48654368737523, 107.61454796756243 10.48654368737523))'

In [25]:
geometry = df["geometry"].map_partitions(gpd.GeoSeries.from_wkt, meta=gpd.GeoSeries(name="geometry")) #.set_crs(4326)
gdf = dgpd.from_dask_dataframe(df, geometry=geometry)

clipped_gdf = gdf[gdf.geometry.within(country_box)]
print(clipped_gdf.head())

AttributeError: 'Series' object has no attribute 'within'

In [24]:
geometry.__class__

dask_expr._collection.Series