In [None]:
# Configure AWS profile for local development
%env AWS_PROFILE=platform-developer

neptune_client = NeptuneClient("prod")

# Reindex by ID

Extract, transform, and optionally index specific **works** or **concepts** by their canonical IDs.
This bypasses the normal window-based pipeline and fetches items directly from the merged index / Neptune graph.

## Prerequisites
- The notebook kernel is running in the `catalogue_graph` uv environment (set up with `uv sync`)
- `platform-developer` AWS profile configured
- VPN access to the Wellcome Collection network

## Works

Extract specific works from the merged ES index, enrich them with Neptune graph data (hierarchy, concepts), transform them, and optionally index them.

In [None]:
from collections.abc import Generator

from ingestor.extractors.base_extractor import GraphBaseExtractor
from ingestor.extractors.works_extractor import (
    ExtractedWork,
    GraphWorksExtractor,
    VisibleExtractedWork,
    get_related_works_query,
)
from ingestor.models.merged.work import MergedWork
from ingestor.transformers.works_transformer import ElasticsearchWorksTransformer
from models.events import BasePipelineEvent, PipelineIndexDates
from sources.merged_works_source import MergedWorksSource

from utils.elasticsearch import ElasticsearchMode, get_client
from clients.neptune_client import NeptuneClient

# === Configuration ===
# Pipeline date; also passed to the Elasticsearch client created below.
PIPELINE_DATE = "2025-10-02"
# Merged index date (forwarded as `PipelineIndexDates(merged=...)`);
# set if different from PIPELINE_DATE.
MERGED_INDEX_DATE = "2025-10-02"
# Change these to the work IDs you want to process.
WORK_IDS = ["tsayk6g3"]

es_client = get_client("works_ingestor", PIPELINE_DATE, "public")

class NotebookWorksExtractor(GraphWorksExtractor):
    """Works extractor driven by an explicit list of work IDs.

    Rather than consuming the window-based ES stream, the merged index is
    queried for the given IDs. Afterwards, any IDs found in
    ``self.related_ids`` that are not in ``self.streamed_ids`` are fetched
    in a second pass (both sets are presumably filled in by
    ``process_es_works`` -- TODO confirm).
    """

    def __init__(self, work_ids: list[str], pipeline_date: str, merged_index_date: str | None = None):
        # Start the MRO lookup *after* GraphWorksExtractor so that its __init__
        # (which creates a full MergedWorksSource) is skipped; the next
        # initialiser in line receives the notebook-level Neptune client.
        super(GraphWorksExtractor, self).__init__(neptune_client)
        self.streamed_ids: set[str] = set()
        self.related_ids: set[str] = set()
        self._work_ids = work_ids
        self._pipeline_date = pipeline_date
        self._merged_index_date = merged_index_date

    def _make_event(self) -> BasePipelineEvent:
        """Build the event that is handed to every ``MergedWorksSource``."""
        index_dates = PipelineIndexDates(merged=self._merged_index_date)
        return BasePipelineEvent(
            pipeline_date=self._pipeline_date,
            index_dates=index_dates,
        )

    def extract_raw(self) -> Generator[ExtractedWork]:
        """Yield extracted works for the requested IDs, then for outstanding related IDs."""
        pipeline_event = self._make_event()

        def merged_works(es_query):
            # One MergedWorksSource per query, sharing the same event and ES client.
            works_source = MergedWorksSource(
                event=pipeline_event,
                query=es_query,
                es_client=es_client,
            )
            return (MergedWork.from_raw_document(raw) for raw in works_source.stream_raw())

        yield from self.process_es_works(merged_works({"ids": {"values": self._work_ids}}))

        # Process related works (ancestors/children) for hierarchy consistency
        outstanding_ids = list(self.related_ids.difference(self.streamed_ids))
        if not outstanding_ids:
            return

        print(f"Processing {len(outstanding_ids)} related works (ancestors/children)")
        yield from self.process_es_works(merged_works(get_related_works_query(outstanding_ids)))


class NotebookWorksTransformer(ElasticsearchWorksTransformer):
    """Works transformer whose source is a ``NotebookWorksExtractor`` for fixed IDs."""

    def __init__(self, work_ids: list[str], pipeline_date: str, merged_index_date: str | None = None):
        # The parent __init__ is intentionally not called: it would create its
        # own extractor, whereas here the ID-based extractor is plugged in directly.
        self.source = NotebookWorksExtractor(
            work_ids=work_ids,
            pipeline_date=pipeline_date,
            merged_index_date=merged_index_date,
        )


transformer = NotebookWorksTransformer(WORK_IDS, PIPELINE_DATE, MERGED_INDEX_DATE)
# Materialise the stream so the results can be inspected in later cells.
indexed_works = [*transformer.stream_es_documents()]
print(f"Transformed {len(indexed_works)} works")

In [None]:
# Print a short summary of every transformed work
rule = "=" * 80
for work in indexed_works:
    identifier = work.get_id()
    kind = work.type
    print(f"\n{rule}")
    print(f"Work: {identifier} ({kind})")
    print(rule)

    if hasattr(work, "display"):
        shown = work.display
        print(f"  Title: {shown.title}")
        # Subjects and genres are printed only when the attribute exists,
        # and only the first five labels of each are listed.
        for heading, attr in (("Subjects", "subjects"), ("Genres", "genres")):
            if hasattr(shown, attr):
                labels = [entry.label for entry in getattr(shown, attr)[:5]]
                print(f"  {heading}: {labels}")

    has_concept_ids = hasattr(work, "query") and hasattr(work.query, "concept_ids")
    if has_concept_ids:
        print(f"  Concept IDs: {work.query.concept_ids[:10]}")

    if hasattr(work, "debug"):
        print(f"  Debug source: {work.debug.source}")

In [None]:
# Optional: Index the transformed works into Elasticsearch
# Uncomment and run to actually write to the index

# import json
# import elasticsearch.helpers
# from utils.elasticsearch import get_client, get_standard_index_name

# INDEX_DATE = "2025-10-02"  # Change to target index date
# es_client = get_client("works_ingestor", PIPELINE_DATE, "public")
# index_name = get_standard_index_name("works-indexed", INDEX_DATE)

# def generate_operations(works):
#     for work in works:
#         source = json.loads(work.model_dump_json(exclude_none=True))
#         yield {"_index": index_name, "_id": work.get_id(), "_source": source}

# success_count, errors = elasticsearch.helpers.bulk(es_client, generate_operations(indexed_works))
# print(f"Indexed {success_count} works to {index_name}")
# if errors:
#     print(f"Errors: {errors}")

## Concepts

Extract specific concepts from the Neptune graph (with same-as expansion and related concepts), transform them, and optionally index them.

In [None]:
from collections.abc import Generator

from ingestor.extractors.concepts_extractor import GraphConceptsExtractor, CONCEPT_QUERY_PARAMS
from ingestor.transformers.concepts_transformer import ElasticsearchConceptsTransformer


class NotebookConceptsExtractor(GraphConceptsExtractor):
    """Concepts extractor fed by a fixed list of concept IDs instead of the ES source."""

    def __init__(self, concept_ids: list[str]):
        # Start the MRO lookup after GraphConceptsExtractor so that class's own
        # __init__ is skipped; the next initialiser receives the Neptune client.
        super(GraphConceptsExtractor, self).__init__(neptune_client)
        self._concept_ids = concept_ids
        self.neptune_params = CONCEPT_QUERY_PARAMS
        # Attributes the skipped __init__ would presumably have set up -- TODO confirm.
        self.same_as_map: dict[str, list[str]] = {}
        self.primary_map: dict[str, str] = {}

    def get_concept_stream(self) -> Generator[set[str]]:
        """Yield one batch: the IDs returned by ``get_same_as`` for each requested concept."""
        requested = self._concept_ids
        self._update_same_as_map(requested)
        yield {
            alias
            for concept_id in requested
            for alias in self.get_same_as(concept_id)
        }


class NotebookConceptsTransformer(ElasticsearchConceptsTransformer):
    """Transform specific concepts by ID without needing a full pipeline event.

    Args:
        concept_ids: IDs of the concepts to extract and transform.
        only_specified: When True, only documents whose ID is in
            ``concept_ids`` are yielded; when False, every document produced
            by the parent stream is passed through.
    """

    def __init__(self, concept_ids: list[str], only_specified: bool = False):
        # The parent __init__ is not called; the ID-based extractor is set directly.
        self.source = NotebookConceptsExtractor(concept_ids)
        # None means "no filtering"; a set (possibly empty) is the allow-list.
        self._only_specified = set(concept_ids) if only_specified else None

    def stream_es_documents(self):
        """Yield transformed documents, restricted to the allow-list when one is set."""
        allowed = self._only_specified
        for doc in super().stream_es_documents():
            # Compare against None instead of relying on truthiness, so that an
            # empty allow-list filters everything out rather than silently
            # disabling the filter.
            if allowed is not None and doc.get_id() not in allowed:
                continue
            yield doc


# === Configuration ===
# CONCEPT_IDS = [
#     "zbus63qt", "dujvfptt", "u33bzxsb", "euehm7ng", "ec77rqzq", "g6f9sn7t",
#     "bun3pg62", "cqm7r9pj", "d4r983x6", "ce7rratv", "a4wyrvq2", "ky24m9en",
#     "hv3ueb5k", "qj5kj8rz", "rasp7aye", "byqnbpfc", "pestkwqm", "nns7bsba",
#     "e8kur96g", "gk2eca5r", "up98mqb8", "usgkq8dj", "c24wmx3e", "umqzyxwk",
#     "q7c2xvdk", "w7yp9m3v", "rtwg3paj", "d5ghwutb", "a7gmt7ff", "caew98cx",
#     "h8fuyw3g", "vcqcqced", "u4y59z2p", "zwu7frtk", "nb2nvbwj", "khvwwfrk",
# ]
CONCEPT_IDS = ["w7yp9m3v"]
# Set to False to also index the same-as concepts
ONLY_SPECIFIED = True

concepts_transformer = NotebookConceptsTransformer(CONCEPT_IDS, only_specified=ONLY_SPECIFIED)
# Materialise the stream so the results can be inspected in later cells.
indexed_concepts = [*concepts_transformer.stream_es_documents()]
print(f"Transformed {len(indexed_concepts)} concepts")

In [None]:
# Display transformed concepts
for concept in indexed_concepts:
    concept_id = concept.get_id()
    print(f"\n{'='*80}")
    print(f"Concept: {concept_id}")
    print(f"{'='*80}")

    d = concept.display
    print(f"  Label: {d.label}")
    print(f"  Display Label: {d.displayLabel}")
    print(f"  Type: {d.type}")
    print(f"  Alt Labels: {d.alternativeLabels[:5]}")
    print(f"  Same As: {d.sameAs}")
    if d.description:
        print(f"  Description: {d.description.text[:200]}")
    if d.displayImages:
        print(f"  Images: {[img.url for img in d.displayImages[:3]]}")

    rc = d.relatedConcepts
    for field_name in ["relatedTo", "narrowerThan", "broaderThan", "people", "relatedTopics"]:
        items = getattr(rc, field_name, [])
        if items:
            print(f"  {field_name}: {[f'{c.label} ({c.id})' for c in items[:5]]}")

In [None]:
import json
import pprint

import elasticsearch.helpers
from utils.elasticsearch import get_client, get_standard_index_name

# CONCEPTS_INDEX_DATE = "2026-02-16"  # Change to target index date
# CONCEPTS_PIPELINE_DATE = "2025-10-02"
# concepts_es_client = get_client("concepts_ingestor_rkenny_test", CONCEPTS_PIPELINE_DATE, "public")
# concepts_index_name = get_standard_index_name("concepts-indexed", CONCEPTS_INDEX_DATE)

# def generate_concept_operations(concepts):
#     for c in concepts:
#         source = json.loads(c.model_dump_json(exclude_none=True))
#         yield {"_index": concepts_index_name, "_id": c.get_id(), "_source": source}

# success_count, errors = elasticsearch.helpers.bulk(
#     concepts_es_client,
#     generate_concept_operations(indexed_concepts),
#     raise_on_error=False,
# )
# print(f"Indexed {success_count} concepts to {concepts_index_name}")
# if errors:
#     print(f"{len(errors)} error(s):")
#     for err in errors:
#         pprint.pprint(err)