In [30]:
# Configure AWS profile for local development.
# `%env` is an IPython line magic: it sets AWS_PROFILE in the kernel's process
# environment, so the value applies to every cell executed afterwards.
# NOTE(review): the profile name is hard-coded — change it here if your local
# AWS configuration uses a different profile.
%env AWS_PROFILE=platform-developer

# Example: Load the Glue (S3) Iceberg table (the next cell fetches its first 10 records)
#
# Prerequisites:
#   - AWS credentials with permission to call Glue + read underlying S3 data are available in your environment
#   - (Optional) Override defaults with env vars: AWS_REGION, AWS_ACCOUNT_ID, S3_TABLES_BUCKET,
#       GLUE_TABLE_NAME, GLUE_NAMESPACE
#
# This reuses the existing helper in `table_config.get_glue_table`.
# If you just want to experiment locally without AWS, see Cell 2 (you can switch it to use get_local_table).

from config import (
    S3_TABLES_BUCKET,
    GLUE_TABLE_NAME,
    GLUE_NAMESPACE,
    AWS_REGION,
    AWS_ACCOUNT_ID,
)
from table_config import get_glue_table

# Gather the Glue catalog settings (all imported from `config`) in one place,
# then pass them to the shared helper that loads the S3-backed Iceberg table.
_glue_table_kwargs = {
    "s3_tables_bucket": S3_TABLES_BUCKET,
    "table_name": GLUE_TABLE_NAME,
    "namespace": GLUE_NAMESPACE,
    "region": AWS_REGION,
    "account_id": AWS_ACCOUNT_ID,
}
table = get_glue_table(**_glue_table_kwargs)

# Confirm which table was loaded.
print(f"Loaded Iceberg table: {GLUE_NAMESPACE}.{GLUE_TABLE_NAME}")


env: AWS_PROFILE=platform-developer
Using wellcomecollection_catalogue.ebsco_adapter_table in wellcomecollection_catalogue catalog
Using wellcomecollection_catalogue.ebsco_adapter_table in wellcomecollection_catalogue catalog
Loaded Iceberg table: wellcomecollection_catalogue.ebsco_adapter_table
Loaded Iceberg table: wellcomecollection_catalogue.ebsco_adapter_table


In [31]:
# Retrieve up to 10 rows from the table loaded in the previous cell.
# Note: `selected_fields` IS a projection — only the namespace, id and content
# columns are read; drop that argument to fetch every column.
first_10 = table.scan(
    selected_fields=("namespace", "id", "content"),
    limit=10,
).to_arrow()

print(f"Fetched {first_10.num_rows} rows")

# Display nicely (Arrow -> pandas DataFrame) if pandas is available
try:
    display(first_10.to_pandas())  # type: ignore[name-defined]
except (ImportError, NameError):
    # ImportError: pandas is not installed, so the Arrow -> pandas conversion
    # cannot run. NameError: `display` is undefined outside IPython/Jupyter.
    # Only these two cases fall back to plain output; any other exception is
    # a genuine failure and is allowed to surface instead of being hidden.
    from pprint import pprint
    pprint(first_10.to_pylist())

Fetched 0 rows


Unnamed: 0,namespace,id,content


In [None]:
# Delete all rows using pyiceberg's row-level delete API only (no fallback).
# Run the table-loading cell first so `table` is defined.

# WARNING: This will irreversibly delete all data in the table!
# DO NOT run this cell unless you are absolutely sure what you are doing.
# The code below is left commented out (presumably as a safety catch); uncomment it to run.

# from pyiceberg.expressions import AlwaysTrue

# try:
#     _ = table.schema()
# except NameError as e:  # pragma: no cover
#     raise RuntimeError("`table` is not defined. Run the table-loading cell first.") from e

# before_count = table.scan().count()
# print(f"Rows before delete: {before_count}")

# with table.transaction() as tx:  # type: ignore[attr-defined]
#     try:
#         tx.delete(delete_filter=AlwaysTrue())  # type: ignore[attr-defined]
#     except Exception as e:
#         raise RuntimeError("Row-level delete failed.") from e

# after_count = table.scan().count()
# print(f"Rows after delete:  {after_count}")
# assert after_count == 0, "Delete operation failed: table not empty"
# print("All rows deleted successfully via row-level delete.")


Rows before delete: 183740
Rows after delete:  0
All rows deleted successfully via row-level delete.
Rows after delete:  0
All rows deleted successfully via row-level delete.
