# Concept Overrides Transformation

This notebook documents the `ConceptOverrides` transformation, which allows manual overrides of concept labels, descriptions and images in our dataset.

It resolves each image ID to its IIIF image URL and writes the result as a transformed CSV. We may want to convert this into a script if we end up using it long term.

In [None]:
import csv
from pathlib import Path
from itertools import islice
import json
import urllib.request
import urllib.error
from functools import lru_cache

# Where the overrides are read from and where the transformed CSV is written.
# Both are relative paths.
input_path = Path("./data/combined_overrides.csv")
output_path = Path("../src/ingestor/transformers/wellcome_collection_authority.csv")

# Endpoint prefix; an image ID is appended to it to look up a single image.
API_BASE = "https://api.wellcomecollection.org/catalogue/v2/images/"

# Header names the input CSV must provide: the three concept fields plus
# the four image-ID slots (imageid1 .. imageid4).
expected_columns = {"conceptid", "labeloverride", "description"} | {
    f"imageid{slot}" for slot in range(1, 5)
}

class ImageResolutionError(Exception):
    """Signal that an image ID could not be turned into a IIIF URL."""

@lru_cache(maxsize=2048)
def resolve_image_id(image_id: str) -> str:
    """Return the IIIF image URL recorded for ``image_id``.

    Fetches ``API_BASE + image_id`` and returns the ``url`` of the first entry
    in the response's ``locations`` whose ``locationType.id`` is
    ``"iiif-image"``. Successful results are memoised by ``lru_cache``; a
    failure raises, so the same ID is looked up again on the next call.

    Args:
        image_id: Image identifier; surrounding whitespace is ignored.

    Returns:
        The ``url`` value of the first matching location.

    Raises:
        ImageResolutionError: If the ID is blank, the request fails or times
            out, the response is not HTTP 200 with a JSON object body, or no
            ``iiif-image`` location carrying a ``url`` is present.
    """
    # Function-scope imports: these names are not part of the file's
    # top-level import block.
    import http.client
    from urllib.parse import quote

    image_id = (image_id or "").strip()
    if not image_id:
        raise ImageResolutionError("Blank image_id")

    # Percent-encode the ID so a malformed cell (spaces, non-ASCII text, "/",
    # "?") cannot yield an invalid URL or address a different endpoint.
    url = API_BASE + quote(image_id, safe="")
    try:
        with urllib.request.urlopen(url, timeout=10) as resp:  # nosec B310 (read-only public API)
            if resp.status != 200:
                raise ImageResolutionError(f"HTTP {resp.status} for {image_id}")
            data = json.loads(resp.read().decode("utf-8"))
    except (OSError, http.client.HTTPException, ValueError) as e:
        # OSError covers URLError/HTTPError, timeouts and connection resets;
        # HTTPException covers InvalidURL and truncated reads; ValueError
        # covers JSONDecodeError and UnicodeDecodeError.
        raise ImageResolutionError(f"Request/parse failed for {image_id}: {e}") from e

    if not isinstance(data, dict):
        raise ImageResolutionError(f"Unexpected response shape for {image_id}")

    locations = data.get("locations")
    if not isinstance(locations, list):
        locations = []

    for loc in locations:
        # Skip malformed entries instead of failing the whole lookup.
        if not isinstance(loc, dict):
            continue
        location_type = loc.get("locationType")
        if (
            isinstance(location_type, dict)
            and location_type.get("id") == "iiif-image"
            and "url" in loc
        ):
            return loc["url"]
    raise ImageResolutionError(f"No iiif-image location for {image_id}")

# Image IDs that could not be resolved; collected across all rows so they can
# be reported together once the output has been written.
unresolved_ids = set()

# "utf-8-sig" drops a leading byte-order mark while decoding, before the csv
# module parses the header, so a BOM in front of a quoted first column name
# cannot interfere with quote handling. Files without a BOM decode as before.
with input_path.open(newline="", encoding="utf-8-sig") as in_f:
    reader = csv.DictReader(in_f)

    # Normalize fieldnames (strip whitespace + BOM) to avoid false missing-column errors.
    # Whitespace is stripped again after the BOM so that padding which
    # followed a BOM does not survive.
    raw_fieldnames = reader.fieldnames or []
    normalized_fieldnames = [fn.strip().lstrip("\ufeff").strip() for fn in raw_fieldnames]
    fieldname_map = dict(zip(raw_fieldnames, normalized_fieldnames))

    if normalized_fieldnames != raw_fieldnames:
        print("Normalized header names:")
        for before, after in zip(raw_fieldnames, normalized_fieldnames):
            if before != after:
                print(f"  '{before}' -> '{after}'")

    missing = expected_columns - set(normalized_fieldnames)
    if missing:
        raise ValueError(
            "Input CSV is missing expected columns after normalization: "
            f"{missing}. Found: {normalized_fieldnames}"
        )

    # The output is opened only after the header check, so an invalid input
    # does not truncate an existing output file.
    with output_path.open("w", newline="", encoding="utf-8") as out_f:
        fieldnames = ["id", "label", "description", "image_url"]
        writer = csv.DictWriter(out_f, fieldnames=fieldnames)
        writer.writeheader()

        for row in reader:
            # Re-key the row using normalized names
            norm_row = {fieldname_map.get(k, k): v for k, v in row.items()}
            image_ids = [
                (norm_row.get(col) or "").strip()
                for col in ("imageid1", "imageid2", "imageid3", "imageid4")
            ]
            image_ids = [iid for iid in image_ids if iid]  # drop blanks

            resolved_urls = []
            for iid in image_ids:
                try:
                    resolved_urls.append(resolve_image_id(iid))
                except ImageResolutionError:
                    # Omit this ID from the row but remember it for the
                    # summary error raised after the file is complete.
                    unresolved_ids.add(iid)

            writer.writerow(
                {
                    "id": (norm_row.get("conceptid") or "").strip(),
                    "label": (norm_row.get("labeloverride") or "").strip(),
                    "description": (norm_row.get("description") or "").strip(),
                    # Multiple URLs share one cell, separated by "||".
                    "image_url": "||".join(resolved_urls),
                }
            )

# Raised after the CSV has been fully written: the file exists, minus the
# unresolved images, and the run still ends in an error listing them.
if unresolved_ids:
    raise ImageResolutionError(
        f"Failed to resolve {len(unresolved_ids)} image IDs; they were omitted: "
        + ", ".join(sorted(unresolved_ids))
    )

# Report where the output landed, as an absolute path.
print(f"Wrote transformed CSV to {output_path.resolve()}")

# Read the written file back and echo its first five data rows as a sanity check.
with output_path.open(newline="", encoding="utf-8") as preview_file:
    print("Preview:")
    preview_reader = csv.DictReader(preview_file)
    for preview_row in islice(preview_reader, 5):
        print(preview_row)