In [None]:
# Configure AWS profile for local development
%env AWS_PROFILE=platform-developer

from adapters.ebsco.helpers import build_adapter_table as build_adapter_table_ebsco
from adapters.axiell.helpers import build_adapter_table as build_adapter_table_axiell

# If false will attempt to use a local Iceberg table instead of using the S3 Tables REST API
USE_REST_API_TABLE = False

# Load the Iceberg table via the S3 Tables Iceberg REST API
ebsco_adapter_table = build_adapter_table_ebsco(use_rest_api_table=USE_REST_API_TABLE)
axiell_adapter_table = build_adapter_table_axiell(use_rest_api_table=USE_REST_API_TABLE)
print(f"Adapter tables loaded")

print(ebsco_adapter_table)
print(axiell_adapter_table)

# Choose which adapter table to query
table = ebsco_adapter_table

In [None]:
# Preview up to 10 rows, projecting only the columns named in `selected_fields`.
# Requires `table` from the table-loading cell.
preview_scan = table.scan(
    limit=10,
    selected_fields=("namespace", "id", "content", "changeset", "last_modified"),
)
first_10 = preview_scan.to_arrow()

print(f"Fetched {first_10.num_rows} rows")
preview_frame = first_10.to_pandas()
display(preview_frame)

In [None]:
# Delete all rows using pyiceberg's row-level delete API only (no fallback).
# Run the table-loading cell first so `table` is defined.
#
# NOTE: the delete itself is commented out below. As written, this cell only
# checks that `table` exists and prints the current row count.

# WARNING: This will irreversibly delete all data in the table!!!
# DO NOT run this cell if you are not absolutely sure what you're doing.


# Touch `table` so that a missing name surfaces as a clear RuntimeError.
# Only NameError is caught; any other error from `table.schema()` propagates as-is.
try:
    _ = table.schema()
except NameError as e:  # pragma: no cover
    raise RuntimeError(
        "`table` is not defined. Run the table-loading cell first."
    ) from e

# Row count before any delete (full scan, no filter)
before_count = table.scan().count()
print(f"Rows before delete: {before_count}")

# --- Disabled destructive section: uncomment deliberately, and only if you mean it ---
# NOTE(review): `AlwaysTrue` is not imported anywhere in this notebook's visible cells;
# presumably it is `pyiceberg.expressions.AlwaysTrue` — confirm and import before enabling.
# with table.transaction() as tx:  # type: ignore[attr-defined]
#     try:
#         tx.delete(delete_filter=AlwaysTrue())  # type: ignore[attr-defined]
#     except Exception as e:
#         raise RuntimeError("Row-level delete failed.") from e

# after_count = table.scan().count()
# print(f"Rows after delete:  {after_count}")
# assert after_count == 0, "Delete operation failed: table not empty"
# print("All rows deleted successfully via row-level delete.")

In [None]:
# Look up a single record by its id.
# Requires `table` from the table-loading cell.
record_id = "collect:15101"  # Replace with an actual record ID
id_filter = f"id = '{record_id}'"
record = table.scan(
    row_filter=id_filter,
    selected_fields=("namespace", "id", "content"),
).to_arrow()

if record.num_rows > 0:
    print(f"Record with id {record_id}:")
    try:
        # Rich table rendering when `display` and pandas conversion both work
        display(record.to_pandas())  # type: ignore[name-defined]
    except Exception:
        # Best-effort fallback: plain-text dump of the rows
        from pprint import pprint

        pprint(record.to_pylist())
else:
    print(f"No record found with id: {record_id}")



In [None]:
# Parse the XML content of the looked-up record and pretty-print it.
# Requires `record` from the record-lookup cell.
import xml.etree.ElementTree as ET

# Fail with a clear message instead of a bare IndexError on `[0]`
# when the lookup matched no rows.
if record.num_rows == 0:
    raise ValueError(
        "`record` has no rows; re-run the record-lookup cell with an id that exists."
    )

# First (index 0) value of the `content` column
xml_value = record["content"].to_pylist()[0]

root = ET.fromstring(xml_value)

ET.indent(root)  # re-indents the tree in place
ET.dump(root)  # writes the XML to stdout

In [None]:
# Transform the record and print the transformed output.
# NOTE: the transform call at the bottom is commented out, so as written this cell
# only enables autoreload and imports `json` and `transform`.

# Enable IPython's autoreload extension in mode 2
%load_ext autoreload
%autoreload 2

import json
from adapters.ebsco.steps.transformer import transform

# Disabled example: uses `record_id` and `xml_value` from the earlier cells.
# NOTE(review): `transform` comes from the EBSCO adapter — presumably only meaningful
# when `table` is the EBSCO adapter table; confirm before using with another table.
# (transformed_record, _) = transform(record_id, xml_value)
# print(json.dumps(transformed_record[0].model_dump(), indent=2))