In [None]:
# Configure AWS profile for local development.
# `%env` is an IPython line magic: it sets AWS_PROFILE in this kernel's process
# environment for the rest of the session.
# NOTE(review): presumably this has to run before the cells that build the
# stores/clients so they pick up the profile — confirm.
%env AWS_PROFILE=platform-developer

In [None]:
from datetime import datetime, timedelta, timezone

import pandas as pd

from adapters.axiell import config
from adapters.axiell.clients import build_oai_client
from adapters.axiell.helpers import build_adapter_table, build_window_store
from adapters.axiell.steps.loader import AXIELL_NAMESPACE
from adapters.utils.adapter_store import AdapterStore
from adapters.utils.window_harvester import WindowHarvestManager

# --- Notebook switches -------------------------------------------------------
# True selects the remote REST API-backed Iceberg catalog; False selects a
# local/dev catalog.
USE_REST_API_TABLE = True

# True writes harvested records back into the adapter table.
ENABLE_RECORD_WRITER = False
NOTEBOOK_JOB_ID = "notebook-window-harvest"
WINDOW_RANGE_LABEL = "ad-hoc"

# --- Stores and OAI client ---------------------------------------------------
# The window store and the adapter table both follow the same catalog switch.
window_store = build_window_store(use_rest_api_table=USE_REST_API_TABLE)
adapter_table = build_adapter_table(use_rest_api_table=USE_REST_API_TABLE)
adapter_store = AdapterStore(adapter_table, default_namespace=AXIELL_NAMESPACE)
oai_client = build_oai_client()

# --- Optional record writer --------------------------------------------------
# The writer module is imported only when write-back is switched on.
if not ENABLE_RECORD_WRITER:
    record_writer = None
else:
    from adapters.axiell.record_writer import WindowRecordWriter

    record_writer = WindowRecordWriter(
        table_client=adapter_store,
        namespace=AXIELL_NAMESPACE,
        window_range=WINDOW_RANGE_LABEL,
        job_id=NOTEBOOK_JOB_ID,
    )

# --- Harvester ---------------------------------------------------------------
# default_tags carries the notebook job id only when a record writer is attached.
harvest_tags = {"job_id": NOTEBOOK_JOB_ID} if record_writer else None
harvester = WindowHarvestManager(
    client=oai_client,
    store=window_store,
    metadata_prefix=config.OAI_METADATA_PREFIX,
    set_spec=config.OAI_SET_SPEC,
    window_minutes=config.WINDOW_MINUTES,
    max_parallel_requests=config.WINDOW_MAX_PARALLEL_REQUESTS,
    record_callback=record_writer,
    default_tags=harvest_tags,
)

# One status line per setting so the active configuration is visible in the output.
print(
    "Axiell window harvester ready.",
    f"- Using REST API table: {USE_REST_API_TABLE}",
    f"- Record writer enabled: {ENABLE_RECORD_WRITER}",
    f"- Window size (minutes): {harvester.window_minutes}",
    sep="\n",
)

In [None]:
# Reporting window: ends two hours before the current UTC time and reaches
# back 25 days from that end point.
COVERAGE_RANGE_END = datetime.now(timezone.utc) - timedelta(hours=2)
COVERAGE_RANGE_START = COVERAGE_RANGE_END - timedelta(days=25)

# Ask the harvester for a coverage report over that window.
coverage_report = harvester.coverage_report(
    range_end=COVERAGE_RANGE_END,
    range_start=COVERAGE_RANGE_START,
)

# Headline figures: the bounds reported back, then the totals.
report_start = coverage_report.range_start.isoformat()
report_end = coverage_report.range_end.isoformat()
print(f"Coverage report from {report_start} to {report_end}")
print(f"Total windows evaluated: {coverage_report.total_windows}")
print(f"Total coverage hours: {coverage_report.coverage_hours:.2f}")

# Table of window counts per state, largest count first.
window_states = coverage_report.state_counts
if not window_states:
    print("No window state data available.")
else:
    state_df = (
        pd.DataFrame(list(window_states.items()), columns=["state", "count"])
        .sort_values(by="count", ascending=False)
        .reset_index(drop=True)
    )
    # Thousands separators on the count column; caption labels the table.
    state_table = state_df.style.format({"count": "{:,}"}).set_caption(
        "Windows by state"
    )
    display(state_table)

# Table of coverage gaps, most recent gap start first.
reported_gaps = coverage_report.coverage_gaps
if not reported_gaps:
    print("No coverage gaps detected in this range.")
else:
    gap_records = [
        {
            "gap_start": gap.start,
            "gap_end": gap.end,
            # Gap length converted from seconds to hours.
            "duration_hours": (gap.end - gap.start).total_seconds() / 3600.0,
        }
        for gap in reported_gaps
    ]
    gap_df = pd.DataFrame(gap_records).sort_values(by="gap_start", ascending=False)

    # Both timestamp columns share one minute-resolution format.
    timestamp_format = "{:%Y-%m-%d %H:%M}"
    gap_table = gap_df.style.format(
        {
            "gap_start": timestamp_format,
            "gap_end": timestamp_format,
            "duration_hours": "{:.2f}",
        }
    ).set_caption("Coverage gaps")
    display(gap_table)

# Report how many failures the coverage report holds, then preview up to ten.
recorded_failures = coverage_report.failures
failure_count = len(recorded_failures)
print(f"Recorded failures in range: {failure_count}")

if failure_count > 0:
    # Attributes copied from each failure, in the column order shown.
    preview_columns = (
        "window_key",
        "window_start",
        "window_end",
        "attempts",
        "last_error",
    )
    failure_preview = pd.DataFrame(
        [
            {column: getattr(failure, column) for column in preview_columns}
            for failure in recorded_failures[:10]
        ]
    )

    # Window bounds are shown at minute resolution.
    timestamp_format = "{:%Y-%m-%d %H:%M}"
    preview_table = failure_preview.style.format(
        {
            "window_start": timestamp_format,
            "window_end": timestamp_format,
        }
    ).set_caption("Failure preview (first 10)")
    display(preview_table)

In [None]:
# Re-harvest every gap listed in the coverage report produced by the previous cell.
gaps_to_fill = coverage_report.coverage_gaps
if not gaps_to_fill:
    print("No coverage gaps to process.")
else:
    gap_total = len(gaps_to_fill)
    print(f"Found {gap_total} coverage gaps. Attempting to fill...")
    # Gaps are processed one at a time in the order the report lists them;
    # an exception from harvest_range stops the loop.
    for position, gap in enumerate(gaps_to_fill, start=1):
        print(f"Processing gap {position}/{gap_total}: {gap.start} -> {gap.end}")
        harvester.harvest_range(start_time=gap.start, end_time=gap.end)