In [1]:
# Configure AWS profile for local development.
# `%env` sets the environment variable for the running kernel process.
# NOTE(review): presumably the AWS SDK picks this profile up when the table is
# loaded further down in this cell — confirm.
%env AWS_PROFILE=platform-developer

# Example: Fetch the first 10 records from the S3 Tables Iceberg REST API backed table
#
# Prerequisites:
#   - AWS credentials with permission to call S3 Tables (Iceberg REST API) + read underlying S3 data are available
#   - (Optional) Override defaults with env vars: AWS_REGION, AWS_ACCOUNT_ID, S3_TABLES_BUCKET,
#       GLUE_TABLE_NAME, GLUE_NAMESPACE
#
# This reuses the existing helper in `table_config.get_rest_api_table`.
# If you just want to experiment locally without AWS, swap the `get_rest_api_table` call below
# for `get_local_table`; the later cells only need `table` to be defined.

from adapters.ebsco.config import (
    AWS_ACCOUNT_ID,
    AWS_REGION,
    REST_API_NAMESPACE,
    REST_API_TABLE_NAME,
    S3_TABLES_BUCKET,
)
from adapters.ebsco.table_config import get_rest_api_table

# Gather the connection settings exported by the adapter config, then open the
# Iceberg table through the S3 Tables Iceberg REST API.
rest_api_settings = {
    "s3_tables_bucket": S3_TABLES_BUCKET,
    "table_name": REST_API_TABLE_NAME,
    "namespace": REST_API_NAMESPACE,
    "region": AWS_REGION,
    "account_id": AWS_ACCOUNT_ID,
}
table = get_rest_api_table(**rest_api_settings)
print(f"Loaded Iceberg table: {REST_API_NAMESPACE}.{REST_API_TABLE_NAME}")

env: AWS_PROFILE=platform-developer
Using wellcomecollection_catalogue.ebsco_adapter_table in wellcomecollection_catalogue catalog
Loaded Iceberg table: wellcomecollection_catalogue.ebsco_adapter_table


In [2]:
# Retrieve up to 10 rows, projecting only the namespace, id and content columns.
preview_scan = table.scan(
    selected_fields=("namespace", "id", "content"),
    limit=10,
)
first_10 = preview_scan.to_arrow()

print(f"Fetched {first_10.num_rows} rows")

# Prefer the rich DataFrame rendering (Arrow -> pandas). If the conversion or
# `display` fails for any reason, fall back to plain-text output.
try:
    preview_frame = first_10.to_pandas()
    display(preview_frame)  # type: ignore[name-defined]
except Exception:
    # Fallback: print as list of dicts
    from pprint import pprint

    preview_rows = first_10.to_pylist()
    pprint(preview_rows)

Fetched 10 rows


Unnamed: 0,namespace,id,content
0,ebsco,ebs977679e,"<record xmlns=""http://www.loc.gov/MARC21/slim""..."
1,ebsco,ebs467156e,
2,ebsco,ebs28841802e,"<record xmlns=""http://www.loc.gov/MARC21/slim""..."
3,ebsco,ebs28841803e,"<record xmlns=""http://www.loc.gov/MARC21/slim""..."
4,ebsco,ebs28841804e,"<record xmlns=""http://www.loc.gov/MARC21/slim""..."
5,ebsco,ebs28841805e,"<record xmlns=""http://www.loc.gov/MARC21/slim""..."
6,ebsco,ebs28841806e,"<record xmlns=""http://www.loc.gov/MARC21/slim""..."
7,ebsco,ebs28841807e,"<record xmlns=""http://www.loc.gov/MARC21/slim""..."
8,ebsco,ebs28841808e,"<record xmlns=""http://www.loc.gov/MARC21/slim""..."
9,ebsco,ebs28841809e,"<record xmlns=""http://www.loc.gov/MARC21/slim""..."


In [3]:
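# Delete all rows using pyiceberg's row-level delete API only (no fallback).
# The delete itself is commented out below; as written, this cell only reports the current row count.
# Run the table-loading cell first so `table` is defined.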

# WARNING: This will irreversibly delete all data in the table!!!
# DO NOT run this cell if you are not absolutely sure what you're doing.


# Guard: referencing `table` raises NameError when the table-loading cell has
# not been run yet; surface that as an actionable error instead.
try:
    _ = table.schema()
except NameError as missing_table:  # pragma: no cover
    guidance = "`table` is not defined. Run the table-loading cell first."
    raise RuntimeError(guidance) from missing_table

# Count the rows returned by an unfiltered scan of the table.
full_scan = table.scan()
before_count = full_scan.count()
print(f"Rows before delete: {before_count}")

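# NOTE: `AlwaysTrue` is not imported anywhere in this notebook; before uncommenting, add
#       `from pyiceberg.expressions import AlwaysTrue`.
# with table.transaction() as tx:  # type: ignore[attr-defined]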
#     try:
#         tx.delete(delete_filter=AlwaysTrue())  # type: ignore[attr-defined]
#     except Exception as e:
#         raise RuntimeError("Row-level delete failed.") from e

# after_count = table.scan().count()
# print(f"Rows after delete:  {after_count}")
# assert after_count == 0, "Delete operation failed: table not empty"
# print("All rows deleted successfully via row-level delete.")

Rows before delete: 183718


In [4]:
# Look up a single record by its id and show it.
record_id = "ebs2706851e"  # Replace with an actual record ID
id_filter = f"id = '{record_id}'"
record = table.scan(
    selected_fields=("namespace", "id", "content"),
    row_filter=id_filter,
).to_arrow()

if record.num_rows > 0:
    print(f"Record with id {record_id}:")
    # Rich DataFrame rendering first; plain-text rows if that fails.
    try:
        record_frame = record.to_pandas()
        display(record_frame)  # type: ignore[name-defined]
    except Exception:
        from pprint import pprint

        record_rows = record.to_pylist()
        pprint(record_rows)
else:
    print(f"No record found with id: {record_id}")


Record with id ebs2706851e:


Unnamed: 0,namespace,id,content
0,ebsco,ebs2706851e,"<record xmlns=""http://www.loc.gov/MARC21/slim""..."


In [5]:
# Parse the XML content of the fetched record and pretty-print it.
#
# Requires `record` and `record_id` from the lookup cell above.
# Raises ValueError when there is nothing to parse, instead of an opaque
# IndexError / parser error further down.
import xml.etree.ElementTree as ET

content_values = record["content"].to_pylist()

# The lookup cell only prints a message when no row matches, so `record` may
# be empty here.
if not content_values:
    raise ValueError(f"No record found with id: {record_id}; nothing to parse.")

xml_value = content_values[0]

# Rows can carry blank content (the 10-row preview output shows one such row),
# which the XML parser cannot handle.
if not xml_value:
    raise ValueError(f"Record {record_id} has no XML content to parse.")

root = ET.fromstring(xml_value)

# Re-indent the tree in place so the dump is human-readable.
ET.indent(root)
ET.dump(root)

<ns0:record xmlns:ns0="http://www.loc.gov/MARC21/slim">
  <ns0:leader>00000cas a22000003a 4500</ns0:leader>
  <ns0:controlfield tag="001">ebs2706851e</ns0:controlfield>
  <ns0:controlfield tag="003">EBZ</ns0:controlfield>
  <ns0:controlfield tag="006">m     o  d  ||||||</ns0:controlfield>
  <ns0:controlfield tag="007">cr|unu||||||||</ns0:controlfield>
  <ns0:controlfield tag="008">841012d19141916nyumr p o     0   a0eng c</ns0:controlfield>
  <ns0:datafield tag="022" ind1=" " ind2=" ">
    <ns0:subfield code="a">2768-2692</ns0:subfield>
    <ns0:subfield code="y">2641-0796</ns0:subfield>
  </ns0:datafield>
  <ns0:datafield tag="035" ind1=" " ind2=" ">
    <ns0:subfield code="a">(OCoLC)11258754</ns0:subfield>
  </ns0:datafield>
  <ns0:datafield tag="035" ind1=" " ind2=" ">
    <ns0:subfield code="a">(EBZ)ebs2706851e</ns0:subfield>
  </ns0:datafield>
  <ns0:datafield tag="040" ind1=" " ind2=" ">
    <ns0:subfield code="a">IUL</ns0:subfield>
    <ns0:subfield code="b">eng</ns0:subfield>
  

In [6]:
# Transform the record and print the transformed output

%load_ext autoreload
%autoreload 2

import json
from adapters.ebsco.steps.transformer import transform

(transformed_record, _) = transform(record_id, xml_value)
print(json.dumps(transformed_record[0].model_dump(), indent=2))

{
  "version": 1761659063,
  "type": "Visible",
  "data": {
    "title": "The New York times current history of the European war.",
    "otherIdentifiers": [
      {
        "identifierType": {
          "id": "issn"
        },
        "ontologyType": "Work",
        "value": "2768-2692"
      }
    ],
    "alternativeTitles": [
      "New York times current history of the European war. (Online)",
      "Current history of the European war",
      "New York times current history, the European war",
      "Current history"
    ],
    "format": {
      "id": "j",
      "label": "E-journals"
    },
    "description": null,
    "physicalDescription": null,
    "lettering": null,
    "createdDate": null,
    "subjects": [
      {
        "id": {
          "canonicalId": null,
          "type": "Unidentifiable"
        },
        "label": "World War, 1914-1918 - Periodicals",
        "type": "Subject",
        "concepts": [
          {
            "id": {
              "sourceIdentifier": {


In [7]:
# Show the raw repr of the transformed result (the list unpacked from
# `transform` in the previous cell) for inspection.
transformed_record

[VisibleSourceWork(version=1761659063, type='Visible', data=WorkData(title='The New York times current history of the European war.', other_identifiers=[SourceIdentifier(identifier_type=Id(id='issn'), ontology_type='Work', value='2768-2692')], alternative_titles=['New York times current history of the European war. (Online)', 'Current history of the European war', 'New York times current history, the European war', 'Current history'], format=Format(id='j', label='E-journals'), description=None, physical_description=None, lettering=None, created_date=None, subjects=[Subject(id=Unidentifiable(canonical_id=None, type='Unidentifiable'), label='World War, 1914-1918 - Periodicals', type='Subject', concepts=[Concept(id=Identifiable(source_identifier=SourceIdentifier(identifier_type=Id(id='label-derived'), ontology_type='Concept', value='world war, 1914-1918'), other_identifiers=[], canonical_id=None, type='Identifiable', identifiedType='Identified'), label='World War, 1914-1918', type='Concep