In [None]:
# Configure AWS profile for local development
%env AWS_PROFILE=platform-developer

from adapters.ebsco.helpers import build_adapter_table as build_adapter_table_ebsco
from adapters.axiell.runtime import AXIELL_CONFIG
from adapters.folio.runtime import FOLIO_CONFIG

# Backend switch passed to every table builder below:
#   True  -> use the S3 Tables Iceberg REST API
#   False -> attempt to use a local Iceberg table instead
USE_REST_API_TABLE = False

# Build one adapter table handle per source, using the backend selected above.
ebsco_adapter_table = build_adapter_table_ebsco(
    use_rest_api_table=USE_REST_API_TABLE,
)
axiell_adapter_table = AXIELL_CONFIG.build_adapter_table(
    use_rest_api_table=USE_REST_API_TABLE,
)
folio_adapter_table = FOLIO_CONFIG.build_adapter_table(
    use_rest_api_table=USE_REST_API_TABLE,
)

# Name -> table lookup used by the rest of the notebook.
tables = dict(
    ebsco_adapter_table=ebsco_adapter_table,
    axiell_adapter_table=axiell_adapter_table,
    folio_adapter_table=folio_adapter_table,
)

print("Adapter tables loaded")

for table in tables.values():
    print(table)

In [None]:
# Summarise each adapter table: total records + earliest/latest last_modified
# Note: computing earliest/latest requires scanning the `last_modified` column and may take a while on large tables.

from __future__ import annotations

import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc


def _table_summary(name: str, t):
    """Build a one-row summary dict for an adapter table.

    Args:
        name: Label stored under the "table" key and used in the warning message.
        t: Table object exposing `scan().count()` and
            `scan(selected_fields=...).to_arrow()`.

    Returns:
        A dict with keys "table", "records", "earliest_last_modified" and
        "latest_last_modified". The two timestamp entries stay None when the
        `last_modified` column cannot be read or aggregated.
    """
    summary = {
        "table": name,
        # The row count is taken first, exactly once, from an unfiltered scan.
        "records": t.scan().count(),
        "earliest_last_modified": None,
        "latest_last_modified": None,
    }

    try:
        # Only the `last_modified` column is materialised for the min/max.
        last_modified = t.scan(selected_fields=("last_modified",)).to_arrow()["last_modified"]
        # No explicit null filtering happens here; null handling is left to
        # pc.min / pc.max themselves.
        summary["earliest_last_modified"] = pc.min(last_modified).as_py()
        summary["latest_last_modified"] = pc.max(last_modified).as_py()
    except Exception as e:
        # Best effort: report the problem and keep whatever was computed so far.
        print(f"Could not compute earliest/latest last_modified for {name}: {e}")

    return summary

# One summary row per adapter table, ordered by table name.
summary_df = pd.DataFrame(
    [_table_summary(table_name, adapter_table) for table_name, adapter_table in tables.items()]
)
summary_df = summary_df.sort_values("table")

display(summary_df)

In [None]:
from adapters.utils.schemata import ARROW_FIELDS

# Column names (taken from ARROW_FIELDS) that the scans below project.
FIELD_NAMES = [arrow_field.name for arrow_field in ARROW_FIELDS]

def get_sample(table, limit=20):
    """Return up to `limit` rows of `table` as a pandas DataFrame.

    The scan projects the columns listed in FIELD_NAMES and is capped at
    `limit` rows; the Arrow result is then converted to pandas.
    """
    limited_scan = table.scan(selected_fields=FIELD_NAMES, limit=limit)
    return limited_scan.to_arrow().to_pandas()


def get_record_by_id(table, record_id: str):
    """Fetch the row(s) of `table` whose `id` column equals `record_id`.

    The filter is passed as a typed expression instead of interpolating the ID
    into a filter string, so an ID containing a single quote (or other filter
    syntax) can no longer break or change the meaning of the filter.

    Args:
        table: Table object exposing `scan(selected_fields=..., row_filter=...)`.
        record_id: ID to look up; it is compared as a string, matching the
            quoted literal the previous string filter produced.

    Returns:
        The Arrow data returned by `scan(...).to_arrow()`, projected to the
        columns in FIELD_NAMES.
    """
    # Imported locally, like the other pyiceberg expression import in this notebook.
    from pyiceberg.expressions import EqualTo

    return table.scan(
        selected_fields=FIELD_NAMES,
        row_filter=EqualTo("id", str(record_id)),
    ).to_arrow()


# Sample rows (as pandas DataFrames) for every adapter table, keyed by table name.
table_samples = {
    table_name: get_sample(adapter_table) for table_name, adapter_table in tables.items()
}

# Show each sample under its table name.
for name, sample in table_samples.items():
    print(name)
    display(sample)

In [None]:
import xml.etree.ElementTree as ET

# Which adapter table to inspect (a key of `tables`).
example_table_name = "ebsco_adapter_table"
sample_table = tables.get(example_table_name)

# Use the first ID from that table's sample by default;
# uncomment the override below to look up a specific record instead.
sample_df = table_samples.get(example_table_name)
example_id = sample_df["id"].to_list()[0]
# example_id = "12345"

# Look the record up by ID and show the raw Arrow result.
example_record = get_record_by_id(sample_table, example_id)
display(example_record)

# Take the first row's `content` value, parse it as XML and pretty-print it.
xml_value = example_record.column("content").to_pylist()[0]
root = ET.fromstring(xml_value)
ET.indent(root)
ET.dump(root)

In [None]:
# Transform the record and print the transformed output

%reload_ext autoreload
%autoreload 2

import json
from adapters.transformers.ebsco_transformer import EbscoTransformer
from adapters.transformers.axiell_transformer import AxiellTransformer
from adapters.transformers.marcxml_transformer import MarcXmlTransformer
from adapters.utils.adapter_store import AdapterStore

# Transformer class to use for each adapter table that has one.
table_to_transform = {
    "ebsco_adapter_table": EbscoTransformer,
    "axiell_adapter_table": AxiellTransformer,
    # "folio_adapter_table": FolioTransformer,  # TODO: Add when FOLIO transformer is implemented
}

# Resolve the table for the selected example BEFORE building the store.
# Previously `AdapterStore(table)` ran before `table` was assigned in this cell,
# so it silently picked up whatever `table` an earlier cell had left behind
# (e.g. a loop variable) rather than the table named by `example_table_name`.
table = tables.get(example_table_name)

# Strictly not needed, unless we are triggering with a changeset
adapter_store = AdapterStore(table)
changeset_ids = ["dummy_changeset"]

# Fail with a readable message when no transformer is registered for the table,
# instead of a "'NoneType' object is not callable" error.
transformer_cls = table_to_transform.get(example_table_name)
if transformer_cls is None:
    raise ValueError(
        f"No transformer registered for '{example_table_name}'. "
        f"Available: {sorted(table_to_transform)}"
    )
transformer: MarcXmlTransformer = transformer_cls(adapter_store, changeset_ids)

# Transform the example record's rows and print the first result as JSON.
transformed_record = list(transformer.transform(example_record.to_pylist()))
print(json.dumps(transformed_record[0].model_dump(), indent=2))

In [None]:
from datetime import datetime
from pathlib import Path

# Backup and delete utilities for pyiceberg tables

# Directory that receives the parquet backups. The path is relative, so it
# resolves against the kernel's current working directory.
BACKUP_DIR = Path("./data/backups")


def backup_table(table, table_name: str) -> Path:
    """Dump every row of `table` into a timestamped parquet file under BACKUP_DIR.

    Args:
        table: Table object exposing `scan().to_arrow()`.
        table_name: Prefix of the backup file name.

    Returns:
        The target path `BACKUP_DIR/<table_name>_<YYYYmmdd_HHMMSS>.parquet`.
        When the table has no rows nothing is written, but the path that
        would have been used is still returned.
    """
    BACKUP_DIR.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    destination = BACKUP_DIR / f"{table_name}_{stamp}.parquet"

    # Materialise the whole table in memory before deciding whether to write.
    snapshot = table.scan().to_arrow()
    total_rows = snapshot.num_rows
    if total_rows == 0:
        print(f"Table '{table_name}' is empty, nothing to backup.")
        return destination

    # The parquet writer is only imported once there is something to write.
    import pyarrow.parquet as pq

    pq.write_table(snapshot, destination)
    print(f"Backed up {total_rows} rows to: {destination}")
    return destination


def restore_backup(table, backup_path: str | Path) -> int:
    """Restore rows from a parquet backup file to the table."""
    import pyarrow.parquet as pq

    backup_path = Path(backup_path)
    if not backup_path.exists():
        raise FileNotFoundError(f"Backup file not found: {backup_path}")

    arrow_table = pq.read_table(backup_path)
    row_count = arrow_table.num_rows
    if row_count == 0:
        print("Backup file is empty, nothing to restore.")
        return 0

    table.append(arrow_table)
    print(f"Restored {row_count} rows from: {backup_path}")
    return row_count


def list_backups(table_name: str | None = None) -> list[Path]:
    """Print and return the parquet backups found in BACKUP_DIR.

    Args:
        table_name: When given (and non-empty), only files whose name starts
            with `<table_name>_` are kept.

    Returns:
        The matching backup paths in reverse-sorted order; an empty list when
        BACKUP_DIR does not exist.
    """
    if not BACKUP_DIR.exists():
        return []

    prefix = f"{table_name}_" if table_name else None
    found = [
        candidate
        for candidate in sorted(BACKUP_DIR.glob("*.parquet"), reverse=True)
        if prefix is None or candidate.name.startswith(prefix)
    ]

    for candidate in found:
        print(f"  {candidate.name}")
    return found


def delete_all_rows(table, table_name: str, *, backup: bool = True, dry_run: bool = False):
    """Delete all rows from a table, with optional backup first.

    The steps run in this order: count rows, return early if the table is
    already empty, optionally back up, stop here on a dry run, delete every
    row inside a transaction, then re-count to verify the table is empty.

    Args:
        table: The pyiceberg table to delete from.
        table_name: Name used for backup file prefix.
        backup: If True, backup the table before deleting. No backup is taken
            when the table is already empty, because the function returns first.
        dry_run: If True, perform backup (when enabled) but skip the actual deletion.

    Returns:
        None.

    Raises:
        RuntimeError: If the `tx.delete(...)` call raises; the original
            exception is chained as the cause.
        AssertionError: If the table still reports rows after the delete.
    """
    from pyiceberg.expressions import AlwaysTrue

    before_count = table.scan().count()
    print(f"Rows before delete: {before_count}")

    if before_count == 0:
        print("Table is already empty, nothing to delete.")
        return

    # The backup runs before the dry-run check, so a dry run still produces a backup.
    if backup:
        backup_table(table, table_name)

    if dry_run:
        print("[DRY RUN] Skipping deletion. Backup was created if enabled.")
        return

    # Only the delete call itself is wrapped; errors raised while leaving the
    # `with` block are not converted to RuntimeError.
    # NOTE(review): presumably the transaction commits on leaving the block - confirm against pyiceberg.
    with table.transaction() as tx:
        try:
            tx.delete(delete_filter=AlwaysTrue())
        except Exception as e:
            raise RuntimeError("Row-level delete failed.") from e

    # Re-count to verify that the delete really emptied the table.
    after_count = table.scan().count()
    print(f"Rows after delete:  {after_count}")
    assert after_count == 0, "Delete operation failed: table not empty"
    print("All rows deleted successfully via row-level delete.")

# Example usage: pick the adapter table the backup/delete/restore helpers act on.
table_to_target_name = "axiell_adapter_table"
table_to_target = tables.get(table_to_target_name)

# The calls below are left commented out on purpose: delete and restore modify table data.
# As written, the delete call uses dry_run=True (backup only, no deletion).
# Set dry_run=False to actually delete
# delete_all_rows(table_to_target, table_to_target_name, backup=True, dry_run=True) 
# list_backups(table_to_target_name)
# restore_backup(table_to_target, f"./data/backups/{table_to_target_name}_20260115_155259.parquet")