# Experiment with STAC-GeoParquet

The Space2Stats (S2S) solution is currently built on a PostgreSQL database and an associated API. This notebook supports the transition to GeoParquet and STAC. It is experimental and may not run end to end.

1. Search through the existing S2S Parquet results and convert them to GeoParquet  
2. Generate a STAC catalog for the new GeoParquet inventory  
3. Convert the STAC catalog to GeoParquet


import sys, os

In [41]:
import datetime
import sys, os

import pystac, pystac_client
import h3

import boto3
from botocore.exceptions import ClientError

import pandas as pd
import geopandas as gpd

from shapely.geometry import Polygon
from tqdm.notebook import tqdm

# Suppress InsecureRequestWarning: Unverified HTTPS request is being made.
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [7]:
# Create the S3 client; SSL certificate verification is deliberately switched off
s3 = boto3.client('s3', verify=False)

# Where the WorldPop parquet results live inside the S2S bucket
bucket = "wbg-geography01"
prefix = "Space2Stats/h3_stats_data/GLOBAL/WorldPop_2025_Demographics/"

# Walk every page of the listing and keep only the keys ending in ".parquet".
# list_objects_v2 returns one page at a time; a NextContinuationToken in the
# response means another page has to be requested.
parquet_files = []
continuation_token = None

while True:
    list_kwargs = {"Bucket": bucket, "Prefix": prefix}
    if continuation_token:
        list_kwargs["ContinuationToken"] = continuation_token
    response = s3.list_objects_v2(**list_kwargs)

    for entry in response.get('Contents', []):
        object_key = entry['Key']
        if object_key.endswith('.parquet'):
            parquet_files.append(object_key)

    continuation_token = response.get('NextContinuationToken')
    if not continuation_token:
        break
print(f"Found {len(parquet_files)} parquet files.")




Found 7828 parquet files.


In [10]:
# Build the STAC catalog and the single collection that will hold one item per
# GeoParquet file.
new_catalog = pystac.Catalog(id="Space2Stats", description="Hexagonal statistics from Space2Stats")

# The temporal bounds must be datetime objects rather than ISO strings: pystac
# calls datetime methods on them when the collection is serialised
# (to_dict / save), which fails on plain strings.
temporal_start = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
temporal_end = datetime.datetime(2030, 12, 31, 23, 59, 59, tzinfo=datetime.timezone.utc)

collection = pystac.Collection(
    id='WorldPop_2025_Demographics',
    # Adjacent string literals inside the parentheses are concatenated, so no
    # backslash continuations are needed.
    description=(
        'Summarize demographic and summary population from WorldPops repository. '
        'Demographic breakdowns include age and gender for the year 2025. '
        'Total population counts are also included for 2015-2030.'
    ),
    extent=pystac.Extent(
        # Global bounding box: [min lon, min lat, max lon, max lat]
        spatial=pystac.SpatialExtent([[-180, -90, 180, 90]]),
        temporal=pystac.TemporalExtent([[temporal_start, temporal_end]]),
    ),
)

new_catalog.add_child(collection)

In [36]:
def h3_to_shapely_polygon(h3_index):
    """Build a shapely Polygon outlining the given H3 cell.

    The boundary returned by ``h3.cell_to_boundary`` is unpacked as
    (lat, lon) pairs, while shapely expects (lon, lat), so every vertex is
    swapped before the polygon is constructed.
    """
    lon_lat_ring = []
    for lat, lon in h3.cell_to_boundary(h3_index):
        lon_lat_ring.append((lon, lat))
    return Polygon(lon_lat_ring)


In [66]:
# Loop through the parquet files: write a GeoParquet copy of each one to S3
# (skipping files already converted) and register a STAC item for it.

out_prefix = "Space2Stats/h3_stats_data/GLOBAL/WorldPop_2025_Demographics_geoparquet/"

# Error codes treated as "object does not exist" when probing with head_object
missing_codes = {'404', 'NoSuchKey', 'NotFound'}

# Every item gets the same nominal datetime, so build it once outside the loop
item_datetime = datetime.datetime.fromisoformat('2025-01-01T00:00:00')

for parquet_file in tqdm(parquet_files):
    file_base = parquet_file.replace(prefix, "")
    out_file = out_prefix + file_base.replace(".parquet", ".geoparquet")

    # Probe for an existing output. Only a "not found" answer means the file
    # has to be created; any other failure (permissions, throttling, ...) is
    # re-raised so that file_exists is never left unset or carrying the value
    # from a previous iteration.
    try:
        s3.head_object(Bucket=bucket, Key=out_file)
        file_exists = True
    except ClientError as e:
        if e.response['Error']['Code'] not in missing_codes:
            raise
        file_exists = False

    # If the file does not exist, create it
    if not file_exists:
        # Open the existing parquet file as a data frame
        s3_path = f"s3://{bucket}/{parquet_file}"
        df = pd.read_parquet(s3_path)
        # Replace -1 (presumably a no-data marker -- confirm with the data
        # producer) with missing values. This is done before the geometry
        # column exists so the polygons are not scanned needlessly.
        df = df.replace(-1, pd.NA)
        # Add a geometry column using h3
        df['geometry'] = df['shape_id'].apply(h3_to_shapely_polygon)
        # Write out the geoparquet to S3
        gdf = gpd.GeoDataFrame(df, geometry='geometry', crs=4326)
        gdf.to_parquet(f"s3://{bucket}/{out_file}", engine='pyarrow', compression='gzip')

    # The directory directly above the file is used as the parent H3 cell index
    parent_cell = parquet_file.split("/")[-2]
    parent_polygon = h3_to_shapely_polygon(parent_cell)

    # Create a STAC item for the geoparquet file. STAC geometries are GeoJSON
    # dictionaries, so the shapely polygon is converted through
    # __geo_interface__; passing the shapely object itself would break
    # serialisation of the item.
    item = pystac.Item(
        id=file_base.replace(".parquet", ""),
        geometry=parent_polygon.__geo_interface__,
        bbox=list(parent_polygon.bounds),
        datetime=item_datetime,
        properties={}
    )
    # Attach the GeoParquet file as the item's data asset.
    # NOTE(review): the asset key 'image' is kept unchanged for compatibility,
    # although the asset is a parquet table rather than an image.
    item.add_asset(
        key='image',
        asset=pystac.Asset(
            href=f"s3://{bucket}/{out_file}",
            media_type=pystac.MediaType.PARQUET,
            roles=['data']
        )
    )
    collection.add_item(item)
    

  0%|          | 0/7828 [00:00<?, ?it/s]

In [67]:
# Display the STAC item currently bound to `item` (the last one created by the loop) as a sanity check
item