In [None]:
# Configure AWS profile for local development
%env AWS_PROFILE=platform-developer

from typing import Any

import pandas as pd
from tabulate import tabulate

from utils.aws import get_neptune_client

# Module-level Neptune client; run_query() sends every openCypher query through it.
# NOTE(review): use_public_endpoint=True presumably selects an endpoint reachable from a
# local machine rather than from inside the VPC — confirm against utils.aws.
neptune_client = get_neptune_client(use_public_endpoint=True)


def run_query(query: str, parameters: dict[str, Any] | None = None) -> list[dict]:
    """Send an openCypher query through the shared Neptune client.

    `parameters` is forwarded to the client unchanged (including `None` when
    the caller omits it), and whatever the client returns is handed back.
    """
    rows = neptune_client.run_open_cypher_query(query, parameters)
    return rows


def display_results(results: list[dict], limit: int = 50) -> None:
    """Print query rows as a grid table, showing at most `limit` rows.

    An empty `results` prints a placeholder message instead. Otherwise the
    output is a row/column count, the first `limit` rows, and a trailing note
    stating how many rows were left out (only when some were).
    """
    if results:
        frame = pd.DataFrame(results)
        total_rows = len(results)
        print(f"Results: {total_rows} rows, {len(frame.columns)} columns")
        preview = frame.head(limit)
        print(tabulate(preview, headers="keys", tablefmt="grid", showindex=False))
        hidden_rows = total_rows - limit
        if hidden_rows > 0:
            print(f"... and {hidden_rows} more rows")
    else:
        print("No results to display")

# Wellcome Collection Authority — Graph Inspection

Inspect the weco-authority SourceConcept nodes in Neptune and their connectivity to Concept nodes. This helps diagnose why weco-authority data for some concepts (e.g. images, descriptions) may not appear in the ES index.

## Background

The `WeCoConceptsTransformer` loads each row from `wellcome_collection_authority.csv` as:
- A **SourceConcept** node with `id=weco:<concept_id>`, `source=weco-authority`, and `image_urls`/`description` properties
- Two **SAME_AS** edges: `weco:<concept_id> → <concept_id>` and `<concept_id> → weco:<concept_id>`

During ingest, `SOURCE_CONCEPT_QUERY` traverses **directed** `SAME_AS` edges from a Concept's linked source concepts to discover all equivalent SourceConcepts. If the weco-authority node is unreachable via this directed traversal, the concept won't get images or the weco description.

## Load CSV

Read all weco-authority concepts from the source CSV.

In [None]:
from sources.weco_concepts.concepts_source import WeCoConceptsSource

def _count_image_urls(cell):
    """Count the `||`-separated entries in a cell; a falsy cell counts as zero."""
    if not cell:
        return 0
    return len(cell.split("||"))


def _shorten_description(text):
    """Trim text longer than 80 characters to 80 plus "..."; return anything else as-is."""
    if text and len(text) > 80:
        return text[:80] + "..."
    return text


# Materialise every raw row the source streams before building the frame.
source = WeCoConceptsSource()
csv_rows = list(source.stream_raw())

# Derived columns: number of image URLs per row and a shortened description for display.
csv_df = pd.DataFrame(csv_rows).assign(
    image_count=lambda frame: frame["image_url"].apply(_count_image_urls),
    desc_snippet=lambda frame: frame["description"].apply(_shorten_description),
)

concept_ids = csv_df["id"].tolist()
print(f"Loaded {len(csv_df)} weco-authority concepts from CSV")
preview_columns = ["id", "label", "desc_snippet", "image_count"]
print(tabulate(csv_df[preview_columns], headers="keys", tablefmt="grid", showindex=False))

## Check weco-authority SourceConcept nodes in Neptune

Query Neptune for all SourceConcept nodes with `source='weco-authority'` and compare against the CSV.

In [None]:
# Fetch every SourceConcept node whose source is 'weco-authority'
weco_nodes_query = """
MATCH (sc:SourceConcept {source: 'weco-authority'})
RETURN sc.id AS node_id, sc.label AS label, sc.description AS description,
       sc.image_urls AS image_urls
ORDER BY sc.id
"""
weco_nodes = run_query(weco_nodes_query)

# Node IDs found in the graph vs. the "weco:"-prefixed IDs derived from the CSV
neptune_ids = set(node["node_id"] for node in weco_nodes)
csv_prefixed_ids = set(map("weco:{}".format, concept_ids))

missing_from_neptune = csv_prefixed_ids.difference(neptune_ids)
extra_in_neptune = neptune_ids.difference(csv_prefixed_ids)

print(f"weco-authority SourceConcept nodes in Neptune: {len(weco_nodes)}")
print(f"Concepts in CSV: {len(concept_ids)}")
if not missing_from_neptune:
    print("\nAll CSV concepts have a weco-authority SourceConcept node in Neptune.")
else:
    print(f"\nMISSING from Neptune (in CSV but no node): {sorted(missing_from_neptune)}")
if extra_in_neptune:
    print(f"\nExtra in Neptune (not in CSV): {sorted(extra_in_neptune)}")

display_results(weco_nodes)

## Check SAME_AS edges from weco-authority nodes

For each weco-authority SourceConcept, find its outgoing and incoming SAME_AS edges. The `WeCoConceptsTransformer` creates bidirectional edges: `weco:<id> ↔ <id>`. Check whether both directions exist and whether the target node (`<id>`) actually exists as a SourceConcept.

In [None]:
# Check SAME_AS edges connected to weco-authority SourceConcepts.
# Both OPTIONAL MATCH clauses keep nodes that have no edge in that direction, so every
# weco-authority node is returned, including ones with no SAME_AS edge at all.
same_as_query = """
MATCH (weco:SourceConcept {source: 'weco-authority'})
OPTIONAL MATCH (weco)-[:SAME_AS]->(outgoing)
OPTIONAL MATCH (incoming)-[:SAME_AS]->(weco)
RETURN weco.id AS weco_id,
       collect(DISTINCT outgoing.id) AS outgoing_targets,
       collect(DISTINCT incoming.id) AS incoming_sources
ORDER BY weco.id
"""
same_as_results = run_query(same_as_query)

edge_rows = []
for row in same_as_results:
    weco_id = row["weco_id"]
    # Defensive: drop any null IDs from the collected lists.
    outgoing = [t for t in row["outgoing_targets"] if t is not None]
    incoming = [s for s in row["incoming_sources"] if s is not None]
    edge_rows.append({
        "weco_id": weco_id,
        "outgoing_SAME_AS": outgoing,
        "incoming_SAME_AS": incoming,
        # True when at least one edge exists in each direction (not necessarily the same peer).
        "has_both_directions": len(outgoing) > 0 and len(incoming) > 0,
    })

# Explicit columns keep the frame well-formed when the query returns no rows;
# without them the column lookup below raises KeyError on an empty frame.
edge_columns = ["weco_id", "outgoing_SAME_AS", "incoming_SAME_AS", "has_both_directions"]
edge_df = pd.DataFrame(edge_rows, columns=edge_columns)

# Count only nodes that really have an edge: disconnected nodes are also present in edge_df.
connected_count = sum(
    1 for entry in edge_rows if entry["outgoing_SAME_AS"] or entry["incoming_SAME_AS"]
)
print(f"weco-authority nodes inspected: {len(edge_df)}")
print(f"weco-authority nodes with SAME_AS edges: {connected_count}")

# astype(bool) guarantees a boolean mask even when the frame is empty (object dtype).
missing_edges = edge_df[~edge_df["has_both_directions"].astype(bool)]
if len(missing_edges) > 0:
    print(f"\nNodes MISSING bidirectional SAME_AS edges ({len(missing_edges)}):")
    print(tabulate(missing_edges, headers="keys", tablefmt="grid", showindex=False))
else:
    print("All weco-authority nodes have bidirectional SAME_AS edges.")
print()
print(tabulate(edge_df, headers="keys", tablefmt="grid", showindex=False))

## SOURCE_CONCEPT_QUERY — what the ingestor sees

Run the actual `SOURCE_CONCEPT_QUERY` for each weco concept ID to see exactly which SourceConcepts the ingestor discovers. If `weco-authority` doesn't appear in `source_concepts`, that concept won't get images or the weco description.

This uses the same directed `SAME_AS` traversal as the real pipeline:
```
Concept → HAS_SOURCE_CONCEPT → linked_source_concept → SAME_AS*0.. → source_concepts
```

In [None]:
from ingestor.queries.concept_queries import SOURCE_CONCEPT_QUERY

# Run SOURCE_CONCEPT_QUERY for all weco concept IDs at once
results = run_query(SOURCE_CONCEPT_QUERY, {"ids": concept_ids})
results_by_id = {r["id"]: r for r in results}

# Build the id -> label lookup once instead of filtering csv_df for every concept.
# setdefault keeps the first row for a duplicated id, matching a first-match lookup.
label_by_id = {}
if "label" in csv_df.columns:
    for csv_id, csv_label in zip(csv_df["id"].tolist(), csv_df["label"].tolist()):
        label_by_id.setdefault(csv_id, csv_label)


def _format_source_concepts(nodes):
    """Join nodes as "<id> (<source>)", reading both values from each node's '~properties'."""
    return ", ".join(
        f"{node['~properties']['id']} ({node['~properties']['source']})"
        for node in nodes
    )


summary_rows = []
for cid in concept_ids:
    label = label_by_id.get(cid, "")
    result = results_by_id.get(cid)

    # No row returned for this concept ID: record it as absent and move on.
    if result is None:
        summary_rows.append({
            "concept_id": cid,
            "label": label,
            "linked_sources": "NOT FOUND IN GRAPH",
            "all_sources": "",
            "has_weco_authority": False,
        })
        continue

    # `or []` also covers keys that are present but null, not only missing keys.
    linked = result.get("linked_source_concepts") or []
    source_concepts = result.get("source_concepts") or []

    has_weco = any(
        sc["~properties"]["source"] == "weco-authority"
        for sc in source_concepts
    )

    summary_rows.append({
        "concept_id": cid,
        "label": label,
        "linked_sources": _format_source_concepts(linked),
        "all_sources": _format_source_concepts(source_concepts),
        "has_weco_authority": has_weco,
    })

# Explicit columns keep the frame well-formed when there are no concepts to summarise;
# without them the column lookups below raise KeyError on an empty frame.
summary_columns = ["concept_id", "label", "linked_sources", "all_sources", "has_weco_authority"]
summary_df = pd.DataFrame(summary_rows, columns=summary_columns)

# astype(bool) guarantees a boolean mask even when the frame is empty (object dtype).
is_reachable = summary_df["has_weco_authority"].astype(bool)
reachable = summary_df[is_reachable]
unreachable = summary_df[~is_reachable]

print(f"Total weco concepts: {len(summary_df)}")
print(f"  weco-authority REACHABLE (will have images): {len(reachable)}")
print(f"  weco-authority UNREACHABLE (NO images): {len(unreachable)}")

if len(unreachable) > 0:
    print(f"\n{'='*80}")
    print("UNREACHABLE — these concepts will NOT get images/weco description:")
    print(f"{'='*80}")
    print(tabulate(unreachable[["concept_id", "label", "linked_sources", "all_sources"]], headers="keys", tablefmt="grid", showindex=False))

print(f"\n{'='*80}")
print("Full summary:")
print(f"{'='*80}")
print(tabulate(summary_df, headers="keys", tablefmt="grid", showindex=False))