In [None]:
import sys
from pathlib import Path
from datetime import datetime, UTC, timedelta
from uuid import uuid1

import pyarrow as pa

from adapters.utils.adapter_store import AdapterStore
from adapters.utils.iceberg import get_local_table

In [None]:
# Build a local Iceberg table under a short random-looking suffix so that
# repeated runs of this notebook are unlikely to collide on the table name.
table_name = "demo_" + uuid1().hex[:8]
table = get_local_table(table_name=table_name, namespace="demo", db_name="demo_catalog")

# Wrap the table in the adapter store used by every scenario below
store = AdapterStore(table, default_namespace="test_namespace")

print(f"Created table: {table_name}")

In [None]:
def create_record_table(records: list[dict], namespace: str = "test_namespace") -> pa.Table:
    """Create a PyArrow table from a list of record dicts.

    Every row is stamped with ``namespace``, overriding any ``namespace`` key
    a record already carries. The stamp is applied to shallow copies, so the
    caller's dicts are left untouched and can be reused safely.

    Args:
        records: Row dicts keyed by column name (``id``, ``content``,
            ``last_modified``).
        namespace: Value written to the ``namespace`` column of every row.

    Returns:
        A table with the columns ``namespace``, ``id``, ``content`` and
        ``last_modified`` (microsecond-precision UTC timestamp).
    """
    record_schema = pa.schema([
        pa.field("namespace", pa.string(), nullable=False),
        pa.field("id", pa.string(), nullable=False),
        pa.field("content", pa.string(), nullable=True),
        pa.field("last_modified", pa.timestamp("us", "UTC"), nullable=True),
    ])

    # Copy each record rather than writing into it: mutating the input would
    # leak the namespace key back into the caller's data.
    stamped_rows = [{**record, "namespace": namespace} for record in records]

    return pa.Table.from_pylist(stamped_rows, schema=record_schema)

def display_table(title: str):
    """Print a titled dump of the demo table, ordered by ``id``.

    Reads the notebook-level ``table`` and returns the scanned Arrow table
    so the caller can inspect it further.
    """
    rule = "=" * 80
    print(f"\n{title}")
    print(rule)

    snapshot = table.scan().to_arrow().sort_by("id")
    if snapshot.num_rows:
        print(snapshot.to_pandas().to_string(index=False))
    else:
        print("(empty table)")
    return snapshot

In [None]:
# Define some timestamps
now = datetime.now(UTC)
yesterday = now - timedelta(days=1)
last_week = now - timedelta(days=7)

print(f"Reference times:")
print(f"  Last week: {last_week}")
print(f"  Yesterday: {yesterday}")
print(f"  Now:       {now}")

## Scenario 1: Insert Initial Records

First, let's insert some initial records with various timestamps.

In [None]:
# Seed the table with three records: two timestamped rows and one legacy row
# whose last_modified is null.
initial_records = create_record_table([
    {"id": "rec001", "content": "Record 1 (last week)", "last_modified": last_week},
    {"id": "rec002", "content": "Record 2 (yesterday)", "last_modified": yesterday},
    {"id": "rec003", "content": "Record 3 (legacy, no timestamp)", "last_modified": None},
])

result = store.incremental_update(initial_records)
# The other cells in this notebook check `if result:` before reading its
# attributes; do the same here so an empty outcome is reported clearly
# instead of raising AttributeError on result.changeset_id.
if result:
    print(f"Insert result: {result.changeset_id}")
    print(f"Updated records: {result.updated_record_ids}")
else:
    print("❌ Insert produced no changes")

display_table("Initial State")

## Scenario 2: Update with Newer Timestamp (Should Succeed)

Try to update rec001 with a newer timestamp (yesterday > last_week).

In [None]:
# rec001 was inserted with last_week; offer a revision stamped yesterday
newer_row = {"id": "rec001", "content": "Record 1 UPDATED (yesterday)", "last_modified": yesterday}
update_newer = create_record_table([newer_row])

result = store.incremental_update(update_newer)
if not result:
    print("❌ Update rejected (no changes)")
else:
    print(f"✅ Update applied: {result.updated_record_ids}")

display_table("After Update with Newer Timestamp")

## Scenario 3: Update with Older Timestamp (Should Be Rejected)

Try to update rec001 with an older timestamp (last_week < yesterday).

In [None]:
# Offer rec001 a revision stamped last_week, which is older than the
# timestamp it received in the previous scenario (yesterday).
update_older = create_record_table([
    {"id": "rec001", "content": "Record 1 OLD DATA (last week)", "last_modified": last_week},
])

result = store.incremental_update(update_older)
# Rejection is the expected outcome in this scenario, so it gets the success
# marker; an applied update is flagged as unexpected.
if result:
    print(f"Unexpected: update applied: {result.updated_record_ids}")
else:
    print("✅ Update rejected (timestamp is older)")

display_table("After Attempted Update with Older Timestamp")

## Scenario 4: Update with Equal Timestamp (Should Be Rejected)

Try to update rec002 with new content but the same timestamp it already has (yesterday).

In [None]:
# Offer rec002 different content under the same timestamp it was inserted
# with (yesterday).
update_equal = create_record_table([
    {"id": "rec002", "content": "Record 2 SAME TIME (yesterday)", "last_modified": yesterday},
])

result = store.incremental_update(update_equal)
# Rejection is the expected outcome in this scenario, so it gets the success
# marker; an applied update is flagged as unexpected.
if result:
    print(f"Unexpected: update applied: {result.updated_record_ids}")
else:
    print("✅ Update rejected (timestamp is equal)")

display_table("After Attempted Update with Equal Timestamp")

## Scenario 4.5: Newer Timestamp But Unchanged Content (Should Not Update)

Demonstrate that even if the incoming record has a newer `last_modified`, if the `content` is identical to the existing record, no update is applied. This preserves idempotency and avoids unnecessary churn.

In [None]:
# Offer rec002 the content it already holds, but under a fresher timestamp (now)
same_content_row = {"id": "rec002", "content": "Record 2 (yesterday)", "last_modified": now}
update_newer_same_content = create_record_table([same_content_row])

result = store.incremental_update(update_newer_same_content)
if not result:
    print("✅ Update rejected (content unchanged despite newer timestamp)")
else:
    print(f"Unexpected: update applied: {result.updated_record_ids}")

display_table("After Attempted Update: Newer Timestamp, Same Content")

## Scenario 5: Update Record with Null Timestamp (Should Succeed)

Update rec003, which has a null timestamp (legacy data). An existing null timestamp should never block an incoming timestamped update.

In [None]:
# rec003 was inserted with a null last_modified; send it a timestamped revision
update_null_existing = create_record_table(
    [{"id": "rec003", "content": "Record 3 UPDATED (now)", "last_modified": now}]
)

result = store.incremental_update(update_null_existing)
print(f"✅ Update applied: {result.updated_record_ids}" if result else "❌ Update rejected")

display_table("After Update of Legacy Record (null timestamp)")

## Scenario 6: Mixed Batch Update

Process a batch with multiple records having different timestamp relationships.

In [None]:
tomorrow = now + timedelta(days=1)

# One batch mixing every timestamp relationship exercised so far.
# NOTE(review): rec002 appears twice in this batch — confirm that
# incremental_update is meant to accept duplicate ids within a single call.
mixed_rows = [
    # rec001: incoming now is newer than stored yesterday, content differs - SHOULD UPDATE
    {"id": "rec001", "content": "Record 1 FINAL (now)", "last_modified": now},
    # rec002: incoming last_week is older than stored yesterday - SHOULD NOT UPDATE
    {"id": "rec002", "content": "Record 2 OLD (last week)", "last_modified": last_week},
    # rec002: incoming now is newer, but content matches what is stored - SHOULD NOT UPDATE
    {"id": "rec002", "content": "Record 2 (yesterday)", "last_modified": now},
    # rec003: incoming tomorrow is newer than the timestamp set in scenario 5 (now) - SHOULD UPDATE
    {"id": "rec003", "content": "Record 3 NEWEST (tomorrow)", "last_modified": tomorrow},
    # rec004: id not present in the table yet - SHOULD INSERT
    {"id": "rec004", "content": "Record 4 NEW (now)", "last_modified": now},
]
mixed_batch = create_record_table(mixed_rows)

result = store.incremental_update(mixed_batch)
if not result:
    print("❌ No updates applied")
else:
    print(f"✅ Updates applied: {result.updated_record_ids}")
    print("   Expected updates: ['rec001', 'rec003', 'rec004'] (rec002 unchanged due to identical content)")

display_table("After Mixed Batch Update (including unchanged-content case)")

## Scenario 7: Require Timestamps

Incremental updates strictly require a `last_modified` column. Attempting to update without timestamps should raise an error.

In [None]:
# Same columns as the helper's schema, minus last_modified
schema_without_timestamp = pa.schema([
    pa.field("namespace", pa.string(), nullable=False),
    pa.field("id", pa.string(), nullable=False),
    pa.field("content", pa.string(), nullable=True),
])
no_timestamp_records = pa.Table.from_pylist(
    [{"namespace": "test_namespace", "id": "rec001", "content": "Record 1 NO TIMESTAMP"}],
    schema=schema_without_timestamp,
)

# The update is expected to be refused with a ValueError; reaching the
# else branch means it was accepted without a last_modified column.
try:
    store.incremental_update(no_timestamp_records)
except ValueError as err:
    print(f"Expected error: {err}")
else:
    print("Unexpected: update succeeded without timestamps")

display_table("After Attempted Update Without Timestamp (should be unchanged)")

## Summary

This notebook demonstrated:

1. ✅ Newer timestamps allow updates
2. ✅ Older timestamps prevent updates (protecting newer data)
3. ✅ Equal timestamps prevent updates (idempotency)
4. ✅ Null existing timestamps always allow updates (legacy data migration)
5. ✅ Mixed batches correctly filter based on individual record timestamps
6. ✅ Incremental updates strictly require a last_modified timestamp; missing timestamps raise an error
7. ✅ Unchanged content prevents updates even when the incoming timestamp is newer (no unnecessary churn)

This feature ensures data integrity during incremental harvesting from sources like OAI-PMH where records may be processed out of order.

## Cleanup

In [None]:
# Remove the throwaway table created at the top of the notebook
table_identifier = f"demo.{table_name}"
table.catalog.drop_table(table_identifier)
print(f"Dropped table: {table_name}")