In [21]:
# Configure AWS profile for local development
%env AWS_PROFILE=platform-developer

# Example: Load the S3 Tables Iceberg REST API backed table (rows are fetched in the cells below)
#
# Prerequisites:
#   - AWS credentials are available, with permission to call S3 Tables (Iceberg REST API) and to read the underlying S3 data
#   - (Optional) Override defaults with env vars: AWS_REGION, AWS_ACCOUNT_ID, S3_TABLES_BUCKET,
#       GLUE_TABLE_NAME, GLUE_NAMESPACE
#
# This reuses the existing helper in `table_config.get_rest_api_table`.
# If you just want to experiment locally without AWS, see Cell 2 (you can switch it to use get_local_table).

from adapters.ebsco.config import (
    AWS_ACCOUNT_ID,
    AWS_REGION,
    REST_API_NAMESPACE,
    REST_API_TABLE_NAME,
    S3_TABLES_BUCKET,
)
from adapters.ebsco.table_config import get_rest_api_table

# Open the Iceberg table through the S3 Tables Iceberg REST API. Every setting
# is taken from the constants imported from `adapters.ebsco.config` above.
table = get_rest_api_table(
    # Where the catalog lives
    account_id=AWS_ACCOUNT_ID,
    region=AWS_REGION,
    s3_tables_bucket=S3_TABLES_BUCKET,
    # Which table to load
    namespace=REST_API_NAMESPACE,
    table_name=REST_API_TABLE_NAME,
)
print(f"Loaded Iceberg table: {REST_API_NAMESPACE}.{REST_API_TABLE_NAME}")

env: AWS_PROFILE=platform-developer
Using wellcomecollection_catalogue.ebsco_adapter_table in wellcomecollection_catalogue catalog
Loaded Iceberg table: wellcomecollection_catalogue.ebsco_adapter_table


In [None]:
# Fetch up to 10 rows, projecting only the namespace, id and content columns.
# Requires `table` from the table-loading cell.
first_10 = table.scan(
    limit=10,
    selected_fields=("namespace", "id", "content"),
).to_arrow()

print(f"Fetched {first_10.num_rows} rows")

# Prefer the rich notebook rendering (Arrow -> pandas DataFrame). If that fails
# for any reason, fall back to pretty-printing the rows as a list of dicts.
try:
    display(first_10.to_pandas())  # type: ignore[name-defined]
except Exception:
    from pprint import pprint

    pprint(first_10.to_pylist())

In [None]:
# Delete all rows using pyiceberg's row-level delete API only (no fallback).
# Run the table-loading cell first so `table` is defined.

# WARNING: This will irreversibly delete all data in the table!!!
# DO NOT run this cell if you are not absolutely sure what you're doing.


# Touch `table` so that a fresh kernel fails with an actionable message rather
# than a bare NameError.
try:
    _ = table.schema()
except NameError as e:  # pragma: no cover
    raise RuntimeError(
        "`table` is not defined. Run the table-loading cell first."
    ) from e

# Row count before any deletion, printed for reference.
before_count = table.scan().count()
print(f"Rows before delete: {before_count}")

# The destructive part below is deliberately left commented out, so running
# this cell as-is only reports the current row count.
# NOTE(review): `AlwaysTrue` is not imported anywhere in the visible notebook;
# presumably `from pyiceberg.expressions import AlwaysTrue` is needed before
# uncommenting -- confirm.
# with table.transaction() as tx:  # type: ignore[attr-defined]
#     try:
#         tx.delete(delete_filter=AlwaysTrue())  # type: ignore[attr-defined]
#     except Exception as e:
#         raise RuntimeError("Row-level delete failed.") from e

# Post-delete verification: re-count the rows and require the table to be empty.
# after_count = table.scan().count()
# print(f"Rows after delete:  {after_count}")
# assert after_count == 0, "Delete operation failed: table not empty"
# print("All rows deleted successfully via row-level delete.")

In [23]:
# Look up a single record by its id, keeping only the namespace, id and
# content columns. Requires `table` from the table-loading cell.
record_id = "ebs1109790e"  # Swap in the id of the record you want to inspect
record = table.scan(
    row_filter=f"id = '{record_id}'",
    selected_fields=("namespace", "id", "content"),
).to_arrow()

if record.num_rows:
    print(f"Record with id {record_id}:")
    # Rich DataFrame rendering first; plain pretty-print if that fails.
    try:
        display(record.to_pandas())  # type: ignore[name-defined]
    except Exception:
        from pprint import pprint

        pprint(record.to_pylist())
else:
    print(f"No record found with id: {record_id}")


Record with id ebs1109790e:


Unnamed: 0,namespace,id,content
0,ebsco,ebs1109790e,"<record xmlns=""http://www.loc.gov/MARC21/slim""..."


In [24]:
# Parse the XML held in the `content` column of the fetched record and
# pretty-print it. Requires `record` and `record_id` from the lookup cell.
import xml.etree.ElementTree as ET

# The lookup cell only prints a message when nothing matched, so `record` may
# be an empty Arrow table here. Fail with a clear message instead of letting
# the `[0]` index below raise an opaque IndexError.
if record.num_rows == 0:
    raise RuntimeError(
        f"No record found with id: {record_id}; there is no XML content to parse. "
        "Re-run the lookup cell with an id that exists in the table."
    )

xml_value = record["content"].to_pylist()[0]

root = ET.fromstring(xml_value)

ET.indent(root)  # re-indents the tree in place (available from Python 3.9)
ET.dump(root)  # writes the serialised tree to stdout

<ns0:record xmlns:ns0="http://www.loc.gov/MARC21/slim">
  <ns0:leader>00000pam a22000003a 4500</ns0:leader>
  <ns0:controlfield tag="001">ebs1109790e</ns0:controlfield>
  <ns0:controlfield tag="003">EBZ</ns0:controlfield>
  <ns0:controlfield tag="006">m     o  d  ||||||</ns0:controlfield>
  <ns0:controlfield tag="007">cr|unu||||||||</ns0:controlfield>
  <ns0:controlfield tag="008">831121s1984    nyu     ob    001 0 eng  </ns0:controlfield>
  <ns0:datafield tag="020" ind1=" " ind2=" ">
    <ns0:subfield code="z">9780195033823</ns0:subfield>
  </ns0:datafield>
  <ns0:datafield tag="020" ind1=" " ind2=" ">
    <ns0:subfield code="a">9780198020493 (online)</ns0:subfield>
  </ns0:datafield>
  <ns0:datafield tag="020" ind1=" " ind2=" ">
    <ns0:subfield code="a">9781280522949 (online)</ns0:subfield>
  </ns0:datafield>
  <ns0:datafield tag="035" ind1=" " ind2=" ">
    <ns0:subfield code="a">(EBZ)ebs1109790e</ns0:subfield>
  </ns0:datafield>
  <ns0:datafield tag="040" ind1=" " ind2=" ">
    <

In [17]:
# Transform the record and print the transformed output

%load_ext autoreload
%autoreload 2

import json
from adapters.ebsco.steps.transformer import transform

# `transform` returns a pair: the first element is kept (it is indexed with
# `[0]` below), the second is discarded. Requires `record_id` and `xml_value`
# from the lookup and XML-parsing cells.
transformed_record, _ = transform(record_id, xml_value)

# Serialise the first transformed item via its `model_dump()` and print it as
# indented JSON.
print(
    json.dumps(
        transformed_record[0].model_dump(),
        indent=2,
    )
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[Period(id=Unidentifiable(canonical_id=None, type='Unidentifiable'), label='1984-©2003', type='Period', range=DateTimeRange(from_time='1984-01-01T00:00:00Z', to_time='2003-12-31T23:59:59.999999Z', label='1984-©2003'))]
{
  "version": 1761755395,
  "type": "Visible",
  "data": {
    "title": "IMA journal of mathematics applied in medicine and biology",
    "otherIdentifiers": [
      {
        "identifierType": {
          "id": "issn"
        },
        "ontologyType": "Work",
        "value": "1471-6879"
      }
    ],
    "alternativeTitles": [
      "IMA journal of mathematics applied in medicine and biology (Online)",
      "Institute of Mathematics and Its Applications journal of mathematics applied in medicine and biology",
      "Journal of mathematics applied in medicine and biology",
      "Mathematical Medicine and Biology: A Journal of the IMA"
    ],
    "format": {
      "id": "j",
    

In [22]:
# Show the raw repr of `transformed_record`; as the bare last expression it is
# rendered as this cell's output. Requires the transform cell to have been run.
transformed_record

[VisibleSourceWork(version=1761723669, type='Visible', data=WorkData(title='Science, reform, and politics in Victorian Britain : the Social Science Association, 1857-1886 / Lawrence Goldman.', other_identifiers=[SourceIdentifier(identifier_type=Id(id='isbn'), ontology_type='Work', value='9780511039348 (online)'), SourceIdentifier(identifier_type=Id(id='isbn'), ontology_type='Work', value='9780511052354 (online)'), SourceIdentifier(identifier_type=Id(id='isbn'), ontology_type='Work', value='9780511157677 (online)'), SourceIdentifier(identifier_type=Id(id='isbn'), ontology_type='Work', value='9780511176784 (online)'), SourceIdentifier(identifier_type=Id(id='isbn'), ontology_type='Work', value='9780511490545 (online)'), SourceIdentifier(identifier_type=Id(id='isbn'), ontology_type='Work', value='9781280416521 (online)')], alternative_titles=['Science, reform, and politics in Victorian Britain (Online)', 'Science, Reform, and Politics in Victorian Britain: The Social Science Association 18