# Creating a STAC of Near-Continuous 4D Point Clouds

This notebook demonstrates creating STAC catalogs for high-frequency temporal datasets using the topo4d extension along with standard STAC extensions for better interoperability:

- **Topo4D Extension**: For topographic-specific metadata
- **Projection Extension**: For coordinate reference system information 
- **Point Cloud Extension**: For point count and point cloud metadata
- **Timestamps Extension**: For publication and creation timestamps
- **Common Metadata**: For instrument/sensor information
- 
## Workflow:
- Ingest metadata from data and user input
- Create a STAC Item with multiple extensions
- Create more Items (28 hourly acquisitions)
- Build the Collection
- Update the Collection
- Write the Collection locally

In [1]:
import json
from datetime import datetime
from pathlib import Path
import pandas as pd
from datetime import datetime, timezone, timedelta
import pytz
from dateutil import parser
import numpy as np

import pystac
from topo4d_ext import DataType, Topo4DExtension, ProductMeta, TrafoMeta
from builder import extract_metadata_from_las, make_item_asset

from tqdm import tqdm
import os

from typing import Dict, Any

## Define dirs and paths

In [2]:
data_dir = "../demo/kijkduin/data"
aux_dir = "../demo/kijkduin/auxilary"

## Extract metadata from las/laz file

In [3]:
file_dirs = os.listdir(data_dir)
file_paths = []

for f_dir in file_dirs:
    f_paths = Path(data_dir).glob(f"{f_dir}/*")
    for f_path in f_paths:
        if f_path.suffix.lower() in [".las", ".laz"]:
            file_paths.append(f_path)

# file_paths = [f for f in Path(f_dir).glob("*") if f.suffix.lower() in [".las", ".laz"]]

for file_path in file_paths:
    print(f"Processing {file_path}")
    meta_las = extract_metadata_from_las(file_path, if_save=True)

Processing ..\demo\kijkduin\data\161111\161111_200058.laz
Processing ..\demo\kijkduin\data\161111\161111_210058.laz
Processing ..\demo\kijkduin\data\161111\161111_220103.laz
Processing ..\demo\kijkduin\data\161111\161111_230104.laz
Processing ..\demo\kijkduin\data\161112\161112_000104.laz
Processing ..\demo\kijkduin\data\161112\161112_010104.laz
Processing ..\demo\kijkduin\data\161112\161112_020106.laz
Processing ..\demo\kijkduin\data\161112\161112_030107.laz
Processing ..\demo\kijkduin\data\161112\161112_040108.laz
Processing ..\demo\kijkduin\data\161112\161112_050109.laz
Processing ..\demo\kijkduin\data\161112\161112_060112.laz
Processing ..\demo\kijkduin\data\161112\161112_070110.laz
Processing ..\demo\kijkduin\data\161112\161112_080111.laz
Processing ..\demo\kijkduin\data\161112\161112_090111.laz
Processing ..\demo\kijkduin\data\161112\161112_100112.laz
Processing ..\demo\kijkduin\data\161112\161112_110113.laz
Processing ..\demo\kijkduin\data\161112\161112_120115.laz
Processing ..\

In [4]:
meta_las = extract_metadata_from_las(file_paths[0], if_save=True)
file_id = Path(file_paths[0]).stem.split(".")[0]

## Ingest metadata

In [5]:
global_trafo_path = f"{aux_dir}/_Global transformation matrix/Kijkduin_Global_Transformation.txt"
with open(global_trafo_path, "r") as f:
    # read trafo meta from txt file as array
    global_trafo_mat = [np.array(line.strip().split(), dtype=float).tolist() for line in f if line.strip()]

meta_userinput = {
    "id": file_id,
    "data_type": DataType.POINTCLOUD,
    "native_crs": "EPSG:7415",
    "sensor": "RIEGL VZ2000",
    "acquisition_mode": "TLS",
    "datetime": datetime.strptime(file_id, "%y%m%d_%H%M%S").isoformat(),
    "timezone": "Europe/Amsterdam",
    "asset_url": file_paths[0].as_posix(),
    "global_trafo": global_trafo_mat 
}

# merge meta_las and meta_userinput
metadata = meta_las
metadata.update(meta_userinput)

print(json.dumps(metadata, indent=2))

{
  "id": "161111_200058",
  "header": {
    "DEFAULT_POINT_FORMAT": "<PointFormat(3, 0 bytes of extra dims)>",
    "DEFAULT_VERSION": [
      1,
      2
    ],
    "are_points_compressed": true,
    "creation_date": "2021-04-01",
    "evlrs": [],
    "extra_header_bytes": null,
    "extra_vlr_bytes": null,
    "file_source_id": 0,
    "generating_software": "txt2las (version 171231)",
    "global_encoding": "<laspy.header.GlobalEncoding object at 0x000002154BEDFFD0>",
    "major_version": 1,
    "maxs": [
      162.585,
      1304.714,
      95.185
    ],
    "minor_version": 4,
    "mins": [
      -627.258,
      -1236.375,
      -42.409
    ],
    "number_of_evlrs": 0,
    "number_of_points_by_return": [
      1123106,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0
    ],
    "offset_to_point_data": 2065,
    "offsets": [
      0.0,
      0.0,
      0.0
    ],
    "point_count": 1123106,
    "point_format"

## Create a STAC Item from metadata

### Item `id`

In [6]:
def get_id(metadata: Dict[str, Any]) -> str:
    return metadata.get("id")

In [7]:
item_id = get_id(metadata)
item_id

'161111_200058'

### Item `datetime`

In [8]:
def get_datetime(metadata: Dict) -> datetime:
    dt_str = metadata.get("datetime")
    if not dt_str:
        raise ValueError("Missing 'datetime' in metadata")

    dt = parser.parse(dt_str)

    tz_str = metadata.get("timezone")
    if tz_str and tz_str.startswith("UTC"):
        sign = 1 if "+" in tz_str else -1
        offset_str = tz_str[3:].replace("+", "").replace("-", "")
        parts = offset_str.split(":")
        hours = int(parts[0]) if parts[0] else 0
        minutes = int(parts[1]) if len(parts) > 1 else 0
        offset = timedelta(hours=sign * hours, minutes=sign * minutes)
        tz = timezone(offset)
        dt = dt.replace(tzinfo=tz)
    elif tz_str and "/" in tz_str:
        tz = pytz.timezone(tz_str)
        dt = tz.localize(dt)
    elif dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)

    return dt

In [9]:
item_datetime = get_datetime(metadata)
item_datetime

datetime.datetime(2016, 11, 11, 20, 0, 58, tzinfo=<DstTzInfo 'Europe/Amsterdam' CET+1:00:00 STD>)

### Item `bbox`, `geometry` and CRS information

Extract spatial information with global transformation and prepare for standard STAC Projection extension:

In [10]:
from shapely.geometry import box, mapping
from pyproj import CRS, Transformer

def get_geo(metadata: Dict[str, Any]) -> Dict[str, Any]:
    header = metadata.get("header")
    vlrs = metadata.get("vlrs", [])
    native_crs = metadata.get("native_crs", None)
    global_trafo = metadata.get("global_trafo", None)

    if not header or "mins" not in header or "maxs" not in header:
        raise ValueError("Invalid header format. 'mins' and 'maxs' are required.")

    xmin, ymin, zmin = header["mins"]
    xmax, ymax, zmax = header["maxs"]

    if global_trafo:
        # Apply global transformation matrix to the bounding box
        trafo_matrix = np.array(global_trafo)
        bbox_corners = np.array([[xmin, ymin, zmin, 1],
                                 [xmax, ymin, zmin, 1],
                                 [xmin, ymax, zmin, 1],
                                 [xmax, ymax, zmin, 1]])

        transformed_corners = bbox_corners @ trafo_matrix.T
        xmin, ymin, zmin = transformed_corners.min(axis=0)[:3]
        xmax, ymax, zmax = transformed_corners.max(axis=0)[:3]

    crs = None
    for vlr in vlrs:
        if vlr.get("user_id", "").lower() == "lasf_projection":
            if "wkt" in vlr and vlr["wkt"]:
                crs = CRS.from_wkt(vlr["wkt"])
                break
            elif "epsg_code" in vlr and vlr["epsg_code"]:
                crs = CRS.from_epsg(vlr["epsg_code"])
                break

    if not crs and native_crs:
        try:
            crs = CRS.from_user_input(native_crs)
        except Exception:
            crs = None

    bbox = [xmin, ymin, xmax, ymax]

    if crs and crs.to_epsg() != 4326:
        try:
            transformer = Transformer.from_crs(crs, CRS.from_epsg(4326), always_xy=True)

            min_lon, min_lat = transformer.transform(xmin, ymin)
            max_lon, max_lat = transformer.transform(xmax, ymax)

            # geom in WGS84
            bbox = [min_lon, min_lat, max_lon, max_lat]
            geom = mapping(box(min_lon, min_lat, max_lon, max_lat))
        except Exception:
            pass 
    else:
        # geom in native CRS
        geom = mapping(box(xmin, ymin, xmax, ymax))

    return {
        "bbox": bbox,
        "geometry": geom,
        "native_crs": crs.to_string() if crs else None
    }

In [11]:
item_geo = get_geo(metadata)
print(json.dumps(item_geo, indent=2))

{
  "bbox": [
    4.2032353058274445,
    52.060984149209446,
    4.236990109111119,
    52.082514395889724
  ],
  "geometry": {
    "type": "Polygon",
    "coordinates": [
      [
        [
          4.236990109111119,
          52.060984149209446
        ],
        [
          4.236990109111119,
          52.082514395889724
        ],
        [
          4.2032353058274445,
          52.082514395889724
        ],
        [
          4.2032353058274445,
          52.060984149209446
        ],
        [
          4.236990109111119,
          52.060984149209446
        ]
      ]
    ]
  },
  "native_crs": "EPSG:7415"
}


### Create the `item`

In [17]:
item = pystac.Item(
    id=item_id,
    geometry=item_geo["geometry"],
    bbox=item_geo["bbox"],
    datetime=item_datetime.replace(tzinfo=None),
    properties={}
)

# Add standard STAC extensions first
item.stac_extensions.extend([
    "https://stac-extensions.github.io/projection/v2.0.0/schema.json",
    "https://stac-extensions.github.io/pointcloud/v2.0.0/schema.json", 
    "https://stac-extensions.github.io/timestamps/v1.1.0/schema.json"
])

In [14]:
# item.validate() # Uncomment this line to validate the item

### Adding STAC Extensions

The updated topo4d approach leverages multiple standard STAC extensions for better interoperability:

In [15]:
topo4d_ext = Topo4DExtension.ext(item, add_if_missing=True)
print(f"Topo4D Extension added: {Topo4DExtension.has_extension(item)}")

Topo4D Extension added: True


In [24]:
# Add standard STAC extensions
item.stac_extensions.extend([
    "https://stac-extensions.github.io/projection/v2.0.0/schema.json",
    "https://stac-extensions.github.io/pointcloud/v2.0.0/schema.json", 
    "https://stac-extensions.github.io/timestamps/v1.1.0/schema.json"
])

# Apply topo4d extension
topo4d_ext = Topo4DExtension.ext(item, add_if_missing=True)
topo4d_ext.tz = metadata.get("timezone", "UTC")
topo4d_ext.data_type = DataType(metadata.get("data_type", "pointcloud"))
topo4d_ext.acquisition_mode = metadata.get("acquisition_mode", None)
topo4d_ext.global_trafo = metadata.get("global_trafo", None)

# Use standard STAC extensions for formerly topo4d properties
item.properties["proj:code"] = item_geo.get("native_crs", None)
item.properties["instruments"] = [metadata.get("sensor")] if metadata.get("sensor") else []
item.properties["pc:count"] = metadata.get("header", {}).get("point_count", None)
item.properties["pc:type"] = 'lidar' if metadata.get('acquisition_mode') == 'ULS' or 'TLS' else 'other'

In [25]:
item.properties

{'topo4d:tz': 'Europe/Amsterdam',
 'topo4d:data_type': 'pointcloud',
 'topo4d:acquisition_mode': 'TLS',
 'topo4d:global_trafo': [[0.708647477,
   0.705554059,
   0.003496133,
   75124.664999332],
  [-0.705561978, 0.708645222, 0.002060151, 454173.49757747503],
  [-0.00102397, -0.003926659, 0.999991766, 37.853088875],
  [0.0, 0.0, 0.0, 1.0]],
 'proj:code': 'EPSG:7415',
 'instruments': ['RIEGL VZ2000'],
 'pc:count': 1123106,
 'pc:type': 'lidar'}

### Adding assets

In [26]:
asset_name, item_asset = make_item_asset(asset_url=metadata.get("asset_url"), user_input=metadata)

item.add_asset(
    key=asset_name,
    asset=item_asset
)

item

## Create more items

In [27]:
def item_from_metadata(metadata: Dict[str, Any]) -> pystac.Item:
    item_id = get_id(metadata)
    item_datetime = get_datetime(metadata)
    item_geo = get_geo(metadata)

    item = pystac.Item(
        id=item_id,
        geometry=item_geo["geometry"],
        bbox=item_geo["bbox"],
        datetime=item_datetime.replace(tzinfo=None),
        properties={}
    )

    # Add standard STAC extensions
    item.stac_extensions.extend([
        "https://stac-extensions.github.io/projection/v2.0.0/schema.json",
        "https://stac-extensions.github.io/pointcloud/v2.0.0/schema.json", 
        "https://stac-extensions.github.io/timestamps/v1.1.0/schema.json"
    ])

    # Apply topo4d extension
    topo4d_ext = Topo4DExtension.ext(item, add_if_missing=True)
    topo4d_ext.tz = metadata.get("timezone", "UTC")
    topo4d_ext.data_type = DataType(metadata.get("data_type", "pointcloud"))
    topo4d_ext.acquisition_mode = metadata.get("acquisition_mode", None)  # Fixed typo: was acquisition_date
    topo4d_ext.global_trafo = metadata.get("global_trafo", None)

    # Use standard STAC extensions for formerly topo4d properties
    item.properties["proj:code"] = item_geo.get("native_crs", None)
    item.properties["instruments"] = [metadata.get("sensor")] if metadata.get("sensor") else []
    item.properties["pc:count"] = metadata.get("header", {}).get("point_count", None)
    item.properties["pc:type"] = 'lidar' if metadata.get('acquisition_mode') in ['ULS', 'TLS'] else 'other'

    # Add timestamps if available
    if metadata.get("header", {}).get("creation_date"):
        try:
            creation_date = datetime.strptime(metadata["header"]["creation_date"], "%Y-%m-%d").isoformat() + "Z"
            item.properties["timestamps:published"] = creation_date
        except:
            pass

    asset_name, item_asset = make_item_asset(asset_url=metadata.get("asset_url"), user_input=metadata)

    item.add_asset(
        key=asset_name,
        asset=item_asset
    )

    return item

In [28]:
global_trafo_path = f"{aux_dir}/_Global transformation matrix/Kijkduin_Global_Transformation.txt"
with open(global_trafo_path, "r") as f:
    # read trafo meta from txt file as array
    global_trafo_mat = [np.array(line.strip().split(), dtype=float).tolist() for line in f if line.strip()]

item_list = []
metadata_list = []

for file_path in tqdm(file_paths, desc="Processing files"):
    meta_las = extract_metadata_from_las(file_path, if_save=True)
    file_id = Path(file_path).stem.split(".")[0]

    meta_userinput = {
        "id": file_id,
        "data_type": DataType.POINTCLOUD,
        "native_crs": "EPSG:7415",
        "sensor": "RIEGL VZ2000",
        "acquisition_mode": "TLS",
        "datetime": datetime.strptime(file_id, "%y%m%d_%H%M%S").isoformat(),
        "timezone": "Europe/Amsterdam",
        "asset_url": file_path.as_posix(),
        "global_trafo": global_trafo_mat 
    }

    metadata = meta_las
    metadata.update(meta_userinput)

    metadata_list.append(metadata)

    item = item_from_metadata(metadata)
    # item.validate() # Uncomment when the Schema url is public

    item.set_self_href(f"{data_dir}/{item.id}.json")
    item_list.append(item)

Processing files: 100%|██████████| 28/28 [00:00<00:00, 71.98it/s]


In [29]:
item_list[0]

### Adding TrafoMeta

Note: TrafoMeta.create() now requires reference_epoch as a mandatory parameter:

In [30]:
file_paths[0].stem

'161111_200058'

In [31]:
trafo_paths = [f"{f.parent}/{f.stem}_trafomat.txt" for f in file_paths]
ref_id = "161111_200058"
# find the match id in item_list
if ref_id in [item.id for item in item_list]:
    reference_epoch = [item for item in item_list if item.id == ref_id][0]

    reference_epoch_link = pystac.Link(
        rel=reference_epoch.id,
        target=reference_epoch.get_self_href(),
        title="Reference Epoch",
        media_type=pystac.MediaType.JSON)

    for idx, trafo_path in enumerate(trafo_paths):
        with open(trafo_path, "r") as f:
            # read trafo meta from txt file as array
            trafo_mat = [np.array(line.strip().split(), dtype=float).tolist() for line in f if line.strip()]

        trafo_meta = TrafoMeta.create(
            reference_epoch=reference_epoch_link.to_dict(),
            transformation=trafo_mat,
            )

        item = item_list[idx]
        Topo4DExtension.ext(item, add_if_missing=True).trafometa = trafo_meta

### Adding ProductMeta

ProductMeta no longer includes lastupdate - use standard timestamps extension instead:

In [32]:
for idx, meta in enumerate(metadata_list):
    item = item_list[idx]
    if "product_name" in meta:
        product_meta = ProductMeta.create(
            product_name=meta["product_name"],
            param=meta.get("param", None),
            derived_from=meta.get("derived_from", None),
            product_level=meta.get("product_level", None),
        )
        Topo4DExtension.ext(item, add_if_missing=True).productmeta = product_meta
        
        # Use timestamps extension for publication date instead of ProductMeta.lastupdate
        if meta.get('header', {}).get("creation_date"):
            try:
                creation_date = datetime.strptime(meta['header']["creation_date"], "%Y-%m-%d").isoformat() + "Z"
                item.properties["timestamps:published"] = creation_date
            except:
                pass

## Create Collection

### Collection `id`

In [33]:
collection_id = f"Kijkduin"
collection_id

'Kijkduin'

### Collection `title`

In [34]:
collection_title = "Kijkduin TLS Point Clouds Collection"
collection_title

'Kijkduin TLS Point Clouds Collection'

### Collection `description`

In [35]:
collection_desc = f'''### {collection_title}

A collection of point cloud from Kijkduin Sandy Beach, The Netherland.
'''
print(collection_desc)

### Kijkduin TLS Point Clouds Collection

A collection of point cloud from Kijkduin Sandy Beach, The Netherland.



### Collection `license`

In [36]:
collection_license = "CC-BY-4.0"
collection_license

'CC-BY-4.0'

### Collection `provider`

In [37]:
collection_providers = [
    pystac.Provider(
        name="TU Delft",
        roles=[
            pystac.ProviderRole.PROCESSOR,
            pystac.ProviderRole.PRODUCER,
            pystac.ProviderRole.LICENSOR,
            pystac.ProviderRole.HOST
            ],
        url="https://coastscan.citg.tudelft.nl/",
    ),
]

### Collection `extend`

In [38]:
spatial_extent = pystac.SpatialExtent([[-180.0, -90.0, 180.0, 90.0]])
temporal_extent = pystac.TemporalExtent([[datetime(2013, 6, 1), None]])
collection_extent = pystac.Extent(spatial_extent, temporal_extent)

In [39]:
collection = pystac.Collection(
    id=collection_id,
    title=collection_title,
    description=collection_desc,
    extent=collection_extent,
    license=collection_license,
    providers=collection_providers,
)

In [40]:
collection.to_dict()

{'type': 'Collection',
 'id': 'Kijkduin',
 'stac_version': '1.1.0',
 'description': '### Kijkduin TLS Point Clouds Collection\n\nA collection of point cloud from Kijkduin Sandy Beach, The Netherland.\n',
 'links': [],
 'title': 'Kijkduin TLS Point Clouds Collection',
 'extent': {'spatial': {'bbox': [[-180.0, -90.0, 180.0, 90.0]]},
  'temporal': {'interval': [['2013-06-01T00:00:00Z', None]]}},
 'license': 'CC-BY-4.0',
 'providers': [{'name': 'TU Delft',
   'roles': ['processor', 'producer', 'licensor', 'host'],
   'url': 'https://coastscan.citg.tudelft.nl/'}]}

In [41]:
# add items to the collection
for item in item_list:
    collection.add_item(item)

### Collection `summaries`

In [42]:
# add collection summaries
collection.summaries.add("num_items", {"count": len(item_list)})
collection.summaries.add("timestamp_list", {"list": [item.datetime.isoformat() for item in item_list]})
collection.summaries.add("temporal_resolution", {"resolution": "1 hour"})

In [43]:
collection.update_extent_from_items()
collection.extent.to_dict()
collection.to_dict()

{'type': 'Collection',
 'id': 'Kijkduin',
 'stac_version': '1.1.0',
 'description': '### Kijkduin TLS Point Clouds Collection\n\nA collection of point cloud from Kijkduin Sandy Beach, The Netherland.\n',
 'links': [{'rel': 'item',
   'href': 'c:/Users/jiapan/02_projects/2025_4DWORKS/4D-WORKS/demo/kijkduin/data/161111_200058.json',
   'type': 'application/geo+json'},
  {'rel': 'item',
   'href': 'c:/Users/jiapan/02_projects/2025_4DWORKS/4D-WORKS/demo/kijkduin/data/161111_210058.json',
   'type': 'application/geo+json'},
  {'rel': 'item',
   'href': 'c:/Users/jiapan/02_projects/2025_4DWORKS/4D-WORKS/demo/kijkduin/data/161111_220103.json',
   'type': 'application/geo+json'},
  {'rel': 'item',
   'href': 'c:/Users/jiapan/02_projects/2025_4DWORKS/4D-WORKS/demo/kijkduin/data/161111_230104.json',
   'type': 'application/geo+json'},
  {'rel': 'item',
   'href': 'c:/Users/jiapan/02_projects/2025_4DWORKS/4D-WORKS/demo/kijkduin/data/161112_000104.json',
   'type': 'application/geo+json'},
  {'rel

### Save STAC

In [44]:
from pathlib import Path

root_path = str(Path(f"{data_dir}"))
print(f"Root path for collection: {root_path}")

from pystac.layout import TemplateLayoutStrategy

# Set up a flatten layout strategy
strategy = TemplateLayoutStrategy(
    item_template="${id}.json"
)

collection.normalize_hrefs(root_path, strategy=strategy)

Root path for collection: ..\demo\kijkduin\data


In [140]:
# collection.validate_all() # Uncomment when the Schema url is public

In [45]:
collection.save(pystac.CatalogType.SELF_CONTAINED)

In [46]:
collection.describe()

* <Collection id=Kijkduin>
  * <Item id=161111_200058>
  * <Item id=161111_210058>
  * <Item id=161111_220103>
  * <Item id=161111_230104>
  * <Item id=161112_000104>
  * <Item id=161112_010104>
  * <Item id=161112_020106>
  * <Item id=161112_030107>
  * <Item id=161112_040108>
  * <Item id=161112_050109>
  * <Item id=161112_060112>
  * <Item id=161112_070110>
  * <Item id=161112_080111>
  * <Item id=161112_090111>
  * <Item id=161112_100112>
  * <Item id=161112_110113>
  * <Item id=161112_120115>
  * <Item id=161112_130115>
  * <Item id=161112_140116>
  * <Item id=161112_150118>
  * <Item id=161112_160118>
  * <Item id=161112_170118>
  * <Item id=161112_180119>
  * <Item id=161112_190130>
  * <Item id=161112_200120>
  * <Item id=161112_210121>
  * <Item id=161112_220123>
  * <Item id=161112_230123>
