# Bedrock multimodal test notebook (config-driven)

This notebook targets non-text scenarios:

- Image generation
- Video generation
- Image understanding
- Video understanding

Design principle:
- Keep **all tunable parameters** in `bedrock_harness.yaml`.
- Keep this notebook focused on loading config and executing scenarios.

## One-time setup

```bash
export AWS_BEARER_TOKEN_BEDROCK="ABSK..."
export AWS_REGION="us-east-1"   # optional; defaults to auth.default_region from the YAML (us-east-1 if unset)
pip install requests pyyaml
```

## Files expected in the same folder

- `test_bedrock_multimodal.ipynb`
- `bedrock_harness.yaml`

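Optional local inputs (used by the image/video understanding scenarios; relative paths in the YAML are resolved against the folder containing `bedrock_harness.yaml`):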

- `./samples/demo_image.jpg`
- `./samples/demo_video.mp4`


In [None]:
# =========================
# 0) Config (YAML) + Auth (env vars only)
# =========================
import os
from pathlib import Path

try:
    import yaml
except ImportError as e:
    raise ImportError("Missing dependency: pyyaml. Run: pip install pyyaml") from e

CONFIG_ENV = "BEDROCK_HARNESS_CONFIG"
# An unset *or empty* override falls back to the file next to the notebook.
CONFIG_PATH = os.environ.get(CONFIG_ENV) or "bedrock_harness.yaml"
cfg_path = Path(CONFIG_PATH).expanduser().resolve()
# Handed to the harness later so relative paths in the YAML resolve against
# the config file's folder rather than the kernel's working directory.
CFG_BASE_DIR = cfg_path.parent

# is_file() (not exists()) so that pointing the override at a directory gives
# the same actionable message instead of failing inside read_text().
if not cfg_path.is_file():
    raise FileNotFoundError(
        f"Missing config file: {cfg_path}\n"
        "Put 'bedrock_harness.yaml' next to this notebook, or set:\n"
        f"  export {CONFIG_ENV}=/path/to/bedrock_harness.yaml"
    )

# An empty YAML document parses to None; treat it as an empty mapping.
cfg = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) or {}
if not isinstance(cfg, dict):
    raise ValueError(
        f"Config root must be a YAML mapping, got {type(cfg).__name__}: {cfg_path}"
    )

# `or` fallbacks also cover keys that are present but set to null in the YAML.
auth_cfg = cfg.get("auth") or {}
TOKEN_ENV = auth_cfg.get("token_env") or "AWS_BEARER_TOKEN_BEDROCK"
REGION_ENV = auth_cfg.get("region_env") or "AWS_REGION"
DEFAULT_REGION = auth_cfg.get("default_region") or "us-east-1"

# The token value itself is never printed or echoed, only validated.
token = (os.environ.get(TOKEN_ENV) or "").strip()
if len(token) < 20:
    problem = (
        f"Missing env var {TOKEN_ENV}."
        if not token
        else f"Env var {TOKEN_ENV} is set but too short to be a valid token ({len(token)} chars)."
    )
    raise RuntimeError(
        f"{problem}\n\n"
        "Export it before launching VS Code / Jupyter, e.g.:\n"
        f"  export {TOKEN_ENV}='ABSK...'\n"
        f"  export {REGION_ENV}='us-east-1'"
    )

# Strip before falling back so a whitespace-only env var does not become an
# empty region; then publish the effective value for later cells.
AWS_REGION = (os.environ.get(REGION_ENV) or "").strip() or str(DEFAULT_REGION).strip()
os.environ[REGION_ENV] = AWS_REGION

mm_cfg = cfg.get("multimodal") or {}
scenario_names = sorted((mm_cfg.get("scenarios") or {}).keys())
default_run = mm_cfg.get("default_run") or []

print("Loaded config:", str(cfg_path))
print("Region:", AWS_REGION)
print("Token env:", TOKEN_ENV, "=", "set" if token else "missing")
print("Multimodal scenarios:", ", ".join(scenario_names) if scenario_names else "(none)")
print("Default multimodal run:", default_run)


In [None]:
# =========================
# 1) Multimodal harness (invoke + bearer token; config-driven)
# =========================
from __future__ import annotations

import base64
import json
import mimetypes
import re
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import requests


# A string consisting of a single "{name}" placeholder (identifier-style name).
_FULL_PLACEHOLDER = re.compile(r"^\{([A-Za-z_][A-Za-z0-9_]*)\}$")
# Any "{name}" placeholder embedded somewhere inside a longer string.
_ANY_PLACEHOLDER = re.compile(r"\{([A-Za-z_][A-Za-z0-9_]*)\}")

# Lower-case file suffixes (with leading dot) regarded as images / videos.
# NOTE(review): nothing in the visible notebook code reads these two sets; the
# display helpers keep their own inline copies of the same suffixes.
_IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp"}
_VIDEO_EXTS = {".mp4", ".mov", ".m4v", ".avi", ".webm", ".mkv"}


def _deep_get(data: Any, path: str) -> Any:
    if not path:
        return data
    cur = data
    for part in path.split("."):
        if isinstance(cur, dict):
            cur = cur.get(part)
        elif isinstance(cur, list) and part.isdigit():
            idx = int(part)
            if idx < 0 or idx >= len(cur):
                return None
            cur = cur[idx]
        else:
            return None
    return cur


def _render_template(value: Any, ctx: Dict[str, Any]) -> Any:
    """Recursively substitute ``{name}`` placeholders in *value* from *ctx*.

    Dicts and lists are rebuilt with every contained value rendered. A string
    matched by ``_FULL_PLACEHOLDER`` is replaced by the context value itself
    (its type is kept); otherwise each embedded placeholder is replaced by
    ``str()`` of its context value. Any other value is returned unchanged.

    Raises:
        KeyError: If a placeholder names a variable that is not in *ctx*.
    """

    def lookup(name: str) -> Any:
        # Single place that enforces "every placeholder must be defined".
        if name not in ctx:
            raise KeyError(f"Missing template var: {name}")
        return ctx[name]

    if isinstance(value, str):
        whole = _FULL_PLACEHOLDER.match(value)
        if whole is not None:
            return lookup(whole.group(1))
        return _ANY_PLACEHOLDER.sub(lambda hit: str(lookup(hit.group(1))), value)

    if isinstance(value, list):
        return [_render_template(element, ctx) for element in value]

    if isinstance(value, dict):
        return {key: _render_template(inner, ctx) for key, inner in value.items()}

    return value


class BedrockMultimodalHarness:
    """Run config-defined Bedrock multimodal scenarios over HTTPS with a bearer token.

    Everything is read from the parsed YAML mapping: auth env-var names, the
    HTTP timeout, the runtime endpoint template, the model registry
    (``models``) and the scenario definitions (``multimodal.scenarios``).

    Two scenario kinds are supported:

    * ``invoke_template``: renders ``request_template`` with template variables
      (including base64-encoded media) and POSTs it to the model endpoint.
    * ``anthropic_messages``: builds an Anthropic messages payload from the
      scenario's media and prompts.
    """

    # MIME subtypes that differ from the short container name; used only when
    # a media item does not set ``format`` explicitly. Subtypes not listed here
    # (e.g. "jpeg", "png", "mp4") are used as-is.
    _FORMAT_ALIASES: Dict[str, str] = {
        "quicktime": "mov",
        "x-matroska": "mkv",
        "x-msvideo": "avi",
        "x-m4v": "m4v",
        "x-ms-wmv": "wmv",
        "x-flv": "flv",
        "3gpp": "three_gp",
    }

    def __init__(self, cfg: Dict[str, Any], config_dir: Path):
        """Capture settings from *cfg* and open a reusable HTTP session.

        Args:
            cfg: Parsed harness configuration (may be empty or ``None``).
            config_dir: Folder that relative paths in the config are resolved
                against.
        """
        self.cfg = cfg or {}
        self.config_dir = Path(config_dir)

        # `or` fallbacks also cover keys that are present but null in the YAML.
        auth = self.cfg.get("auth") or {}
        self.token_env = auth.get("token_env") or "AWS_BEARER_TOKEN_BEDROCK"
        self.region_env = auth.get("region_env") or "AWS_REGION"
        self.default_region = auth.get("default_region") or "us-east-1"

        http = self.cfg.get("http") or {}
        self.timeout_seconds = int(http.get("timeout_seconds") or 120)

        endpoints = self.cfg.get("endpoints") or {}
        self.runtime_base_tmpl = (
            endpoints.get("bedrock_runtime_base") or "https://bedrock-runtime.{region}.amazonaws.com"
        )

        providers = self.cfg.get("providers") or {}
        self.anthropic_version = (providers.get("anthropic") or {}).get("anthropic_version") or "bedrock-2023-05-31"

        self.models = self.cfg.get("models") or {}

        self.mm = self.cfg.get("multimodal") or {}
        self.scenarios = self.mm.get("scenarios") or {}
        self.default_run = list(self.mm.get("default_run") or [])
        self.output_dir = self._resolve_path(self.mm.get("output_dir") or "./outputs")

        self._session = requests.Session()

    # ---- env ----
    def region(self) -> str:
        """Return the region from the environment, else the configured default."""
        # Strip before falling back so a whitespace-only env var does not
        # yield an empty region (and therefore a malformed endpoint host).
        from_env = (os.environ.get(self.region_env) or "").strip()
        return from_env or str(self.default_region).strip()

    def bearer_token(self) -> str:
        """Return the bearer token read from the configured env var.

        Raises:
            RuntimeError: If the variable is unset/empty, or holds fewer than
                20 characters after stripping whitespace.
        """
        tok = (os.environ.get(self.token_env) or "").strip()
        if not tok:
            raise RuntimeError(f"Missing env var {self.token_env}.")
        if len(tok) < 20:
            # Report the length only; never echo the token itself.
            raise RuntimeError(
                f"Env var {self.token_env} is set but too short to be a valid token ({len(tok)} chars)."
            )
        return tok

    def runtime_base(self) -> str:
        """Return the runtime base URL for the current region, without a trailing slash."""
        return self.runtime_base_tmpl.format(region=self.region()).rstrip("/")

    def headers_json(self) -> Dict[str, str]:
        """Return the request headers: bearer authorization plus JSON content type."""
        return {
            "Authorization": f"Bearer {self.bearer_token()}",
            "Content-Type": "application/json",
        }

    # ---- helpers ----
    def _resolve_path(self, p: str) -> Path:
        """Resolve *p* to an absolute path; relative paths are anchored at ``config_dir``."""
        raw = Path(p).expanduser()
        if raw.is_absolute():
            return raw.resolve()
        return (self.config_dir / raw).resolve()

    def _resolve_model(self, model_ref: str) -> Dict[str, str]:
        """Translate a ``model_ref`` key into its ``provider`` / ``model_id`` pair.

        Raises:
            ValueError: If *model_ref* is unknown or its entry lacks
                ``provider`` or ``model_id``.
        """
        if model_ref not in self.models:
            raise ValueError(f"Unknown model_ref '{model_ref}'. Available: {sorted(self.models.keys())}")
        m = self.models[model_ref] or {}
        provider = m.get("provider")
        model_id = m.get("model_id")
        if not provider or not model_id:
            raise ValueError(f"Invalid model config for '{model_ref}': provider/model_id required")
        return {"provider": provider, "model_id": model_id}

    def _prepare_media(self, scenario: Dict[str, Any]) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
        """Load and base64-encode every item listed under the scenario's ``media``.

        Returns:
            ``(media_list, ctx)``. ``media_list`` holds one dict per item with
            the keys ``name``, ``type``, ``path``, ``media_type``, ``format``
            and ``base64``. ``ctx`` exposes the same values as template
            variables named ``<name>_path``, ``<name>_media_type``,
            ``<name>_format`` and ``<name>_base64``.

        Raises:
            ValueError: If an item is not a mapping or has no ``path``.
            FileNotFoundError: If the resolved path is not an existing file.
        """
        media_list: List[Dict[str, Any]] = []
        ctx: Dict[str, Any] = {}

        for idx, item in enumerate(scenario.get("media") or [], start=1):
            if not isinstance(item, dict):
                raise ValueError(
                    f"Scenario media entry #{idx} must be a mapping, got {type(item).__name__}"
                )

            name = item.get("name") or f"media_{idx}"
            mtype = item.get("type", "image")
            path_val = item.get("path")
            if not path_val:
                raise ValueError(f"Scenario media '{name}' missing path")

            path = self._resolve_path(path_val)
            # is_file() so a directory is reported here rather than failing in read_bytes().
            if not path.is_file():
                raise FileNotFoundError(
                    f"Media file not found for scenario item '{name}': {path}"
                )

            media_type = item.get("media_type")
            if not media_type:
                guessed, _ = mimetypes.guess_type(str(path))
                media_type = guessed or ("image/jpeg" if mtype == "image" else "video/mp4")

            fmt = item.get("format")
            if not fmt:
                if "/" in media_type:
                    # Subtype without any ";param" suffix, e.g. "jpeg" from "image/jpeg".
                    subtype = media_type.split("/", 1)[1].split(";", 1)[0].strip()
                else:
                    subtype = path.suffix.lstrip(".")
                # Map MIME-only names ("quicktime") to the short container name ("mov").
                fmt = self._FORMAT_ALIASES.get(subtype.lower(), subtype)

            b64 = base64.b64encode(path.read_bytes()).decode("ascii")

            media_list.append(
                {
                    "name": name,
                    "type": mtype,
                    "path": str(path),
                    "media_type": media_type,
                    "format": fmt,
                    "base64": b64,
                }
            )

            ctx[f"{name}_path"] = str(path)
            ctx[f"{name}_media_type"] = media_type
            ctx[f"{name}_format"] = fmt
            ctx[f"{name}_base64"] = b64

        return media_list, ctx

    def _extract_first(self, data: Dict[str, Any], paths: List[str]) -> Tuple[Any, Optional[str]]:
        """Return ``(value, path)`` for the first dotted path that resolves to a non-None value.

        Returns ``(None, None)`` when no candidate path yields a value.
        """
        for p in paths:
            v = _deep_get(data, p)
            if v is not None:
                return v, p
        return None, None

    def _save_base64_to_file(self, b64_text: str, save_dir: str, file_name: str) -> Path:
        """Decode *b64_text* and write it to ``output_dir / save_dir / file_name``.

        The target directory is created if needed. Returns the written path.
        """
        out_dir = (self.output_dir / save_dir).resolve()
        out_dir.mkdir(parents=True, exist_ok=True)
        out_path = out_dir / file_name
        out_path.write_bytes(base64.b64decode(b64_text))
        return out_path

    def _parse_response(self, scenario_name: str, response_cfg: Dict[str, Any], resp_json: Dict[str, Any]) -> Dict[str, Any]:
        """Extract text, saved artifacts and/or a job id from a response body.

        The ``type`` key of *response_cfg* selects the handling:

        * ``anthropic_text``: concatenates the ``text`` of all text blocks in ``content``.
        * ``text_path``: stringifies the value at the dotted ``path``.
        * ``base64_images``: saves each non-empty base64 string of the list at ``path``.
        * ``base64_video_or_job``: saves an inline base64 video if one is found
          and records the first job identifier found.

        Any other type (including the default ``json``) leaves all fields empty.

        Returns:
            Dict with the keys ``response_text``, ``saved_files`` and ``job_id``.
        """
        parsed: Dict[str, Any] = {
            "response_text": None,
            "saved_files": [],
            "job_id": None,
        }

        response_cfg = response_cfg or {}
        rtype = response_cfg.get("type", "json")

        if rtype == "anthropic_text":
            blocks = resp_json.get("content") or []
            # `or ""` keeps join() working if a text block carries a null text.
            parts = [
                b.get("text") or ""
                for b in blocks
                if isinstance(b, dict) and b.get("type") == "text"
            ]
            parsed["response_text"] = "".join(parts).strip()

        elif rtype == "text_path":
            v = _deep_get(resp_json, response_cfg.get("path", ""))
            parsed["response_text"] = "" if v is None else str(v)

        elif rtype == "base64_images":
            images = _deep_get(resp_json, response_cfg.get("path", "images"))
            save_dir = response_cfg.get("save_dir", scenario_name)
            ext = (response_cfg.get("file_ext", "png") or "png").lstrip(".")

            if isinstance(images, list):
                for i, b64_text in enumerate(images, start=1):
                    if isinstance(b64_text, str) and b64_text.strip():
                        file_name = f"{scenario_name}_{i:02d}.{ext}"
                        out_path = self._save_base64_to_file(b64_text, save_dir, file_name)
                        parsed["saved_files"].append(str(out_path))

        elif rtype == "base64_video_or_job":
            save_dir = response_cfg.get("save_dir", scenario_name)
            ext = (response_cfg.get("file_ext", "mp4") or "mp4").lstrip(".")
            video_paths = response_cfg.get(
                "video_path_candidates",
                ["video", "videoBase64", "output.video", "result.video"],
            )
            job_paths = response_cfg.get(
                "job_id_candidates",
                ["invocationArn", "jobArn", "jobId", "id"],
            )

            video_blob, _ = self._extract_first(resp_json, video_paths)
            # Only strings longer than 100 characters are treated as an inline video payload.
            if isinstance(video_blob, str) and len(video_blob) > 100:
                file_name = f"{scenario_name}.{ext}"
                out_path = self._save_base64_to_file(video_blob, save_dir, file_name)
                parsed["saved_files"].append(str(out_path))

            job_id, _ = self._extract_first(resp_json, job_paths)
            if job_id is not None:
                parsed["job_id"] = str(job_id)

        return parsed

    def _scenario_timeout(self, scenario: Dict[str, Any]) -> int:
        """Return the scenario's ``timeout_seconds``, else the harness-wide default."""
        return int(scenario.get("timeout_seconds") or self.timeout_seconds)

    def _post_json(self, endpoint_path: str, payload: Any, timeout: int, failure_label: str) -> Any:
        """POST *payload* as JSON to the runtime endpoint and return the decoded body.

        Raises:
            RuntimeError: If the HTTP status is not 200. The message starts
                with *failure_label* and includes at most the first 1200
                characters of the response body.
        """
        url = f"{self.runtime_base()}{endpoint_path}"
        r = self._session.post(url, headers=self.headers_json(), data=json.dumps(payload), timeout=timeout)
        if r.status_code != 200:
            raise RuntimeError(f"{failure_label} {r.status_code}: {r.text[:1200]}")
        return r.json()

    # ---- scenario runners ----
    def _run_invoke_template(self, scenario_name: str, scenario: Dict[str, Any], model: Dict[str, str]) -> Dict[str, Any]:
        """Render the scenario's ``request_template`` and invoke the model with it.

        Template variables are the scenario's ``template_vars`` overlaid with
        the media context from ``_prepare_media``.

        Raises:
            ValueError: If the scenario has no ``request_template``.
            RuntimeError: If the endpoint answers with a non-200 status.
        """
        template_vars = dict(scenario.get("template_vars") or {})
        media_list, media_ctx = self._prepare_media(scenario)
        template_vars.update(media_ctx)

        req_tmpl = scenario.get("request_template")
        if req_tmpl is None:
            raise ValueError(f"Scenario '{scenario_name}' kind=invoke_template requires request_template")

        payload = _render_template(req_tmpl, template_vars)

        endpoint_path = (scenario.get("endpoint_path") or "/model/{model_id}/invoke").format(
            model_id=model["model_id"]
        )
        resp_json = self._post_json(
            endpoint_path, payload, self._scenario_timeout(scenario), "Invoke failed"
        )

        parsed = self._parse_response(scenario_name, scenario.get("response") or {}, resp_json)

        return {
            "endpoint_path": endpoint_path,
            "media_count": len(media_list),
            "response_json": resp_json,
            **parsed,
        }

    def _run_anthropic_messages(self, scenario_name: str, scenario: Dict[str, Any], model: Dict[str, str]) -> Dict[str, Any]:
        """Build an Anthropic messages payload from media + prompts and invoke the model.

        Media items come first in the user message, followed by the text
        prompt when one is set. The response is parsed as ``anthropic_text``
        unless the scenario configures a ``response`` section.

        Raises:
            ValueError: If the model's provider is not ``anthropic`` or the
                scenario supplies neither media nor ``user_prompt``.
            RuntimeError: If the endpoint answers with a non-200 status.
        """
        if model["provider"] != "anthropic":
            raise ValueError(
                f"Scenario '{scenario_name}' kind=anthropic_messages requires an anthropic provider model_ref"
            )

        media_list, _ = self._prepare_media(scenario)
        user_prompt = str(scenario.get("user_prompt", "") or "").strip()
        system_prompt = str(scenario.get("system_prompt", "") or "").strip()

        content: List[Dict[str, Any]] = [
            {
                "type": m["type"],
                "source": {
                    "type": "base64",
                    "media_type": m["media_type"],
                    "data": m["base64"],
                },
            }
            for m in media_list
        ]

        if user_prompt:
            content.append({"type": "text", "text": user_prompt})

        if not content:
            raise ValueError(f"Scenario '{scenario_name}' must provide media or user_prompt")

        payload: Dict[str, Any] = {
            "anthropic_version": self.anthropic_version,
            "messages": [{"role": "user", "content": content}],
            "max_tokens": int(scenario.get("max_output_tokens") or 1024),
        }
        if system_prompt:
            payload["system"] = system_prompt
        if scenario.get("temperature") is not None:
            payload["temperature"] = float(scenario.get("temperature"))

        endpoint_path = f"/model/{model['model_id']}/invoke"
        resp_json = self._post_json(
            endpoint_path, payload, self._scenario_timeout(scenario), "Anthropic multimodal invoke failed"
        )

        response_cfg = scenario.get("response") or {"type": "anthropic_text"}
        parsed = self._parse_response(scenario_name, response_cfg, resp_json)

        return {
            "endpoint_path": endpoint_path,
            "media_count": len(media_list),
            "usage": resp_json.get("usage"),
            "response_json": resp_json,
            **parsed,
        }

    def run_scenario(self, scenario_name: str) -> Dict[str, Any]:
        """Run one scenario and return a result record.

        Failures while preparing, sending or parsing the request are captured
        in the record's ``error`` field instead of being raised, so one failing
        scenario does not abort a suite.

        Raises:
            ValueError: For configuration problems detected up front: unknown
                scenario, missing ``model_ref``, or an invalid model entry.
        """
        if scenario_name not in self.scenarios:
            raise ValueError(f"Unknown scenario '{scenario_name}'. Available: {sorted(self.scenarios.keys())}")

        scenario = self.scenarios[scenario_name] or {}
        kind = scenario.get("kind", "invoke_template")
        model_ref = scenario.get("model_ref")
        if not model_ref:
            raise ValueError(f"Scenario '{scenario_name}' missing model_ref")

        model = self._resolve_model(model_ref)

        # Monotonic clock: the measured latency cannot be skewed by wall-clock adjustments.
        t0 = time.perf_counter()
        base_record: Dict[str, Any] = {
            "scenario": scenario_name,
            "kind": kind,
            "model_ref": model_ref,
            "provider": model["provider"],
            "model_id": model["model_id"],
            "latency_s": None,
            "error": None,
            "response_text": None,
            "saved_files": [],
            "job_id": None,
            "usage": None,
            "response_json": None,
            "media_count": 0,
            "endpoint_path": None,
        }

        try:
            if kind == "invoke_template":
                out = self._run_invoke_template(scenario_name, scenario, model)
            elif kind == "anthropic_messages":
                out = self._run_anthropic_messages(scenario_name, scenario, model)
            else:
                raise ValueError(f"Unsupported scenario kind: {kind}")
            base_record.update(out)
        except Exception as e:
            # Deliberate catch-all at the scenario boundary: record and carry on.
            base_record["error"] = f"{type(e).__name__}: {e}"
        finally:
            base_record["latency_s"] = round(time.perf_counter() - t0, 3)

        return base_record

    def run_suite(self, scenario_names: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        """Run several scenarios in order and return their records.

        When *scenario_names* is empty or ``None``, the configured
        ``default_run`` list is used; if that is empty too, every configured
        scenario is run in sorted-name order.
        """
        names = scenario_names or self.default_run or sorted(self.scenarios.keys())
        return [self.run_scenario(name) for name in names]


In [None]:
# =========================
# 2) Display helpers
# =========================
from IPython.display import Markdown, display, Image, Video
import html
from pathlib import Path
from typing import Any, Dict, List


def _as_text(x: Any) -> str:
    s = "" if x is None else str(x)
    return html.escape(s)


def _is_image_file(p: Path) -> bool:
    return p.suffix.lower() in {".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp"}


def _is_video_file(p: Path) -> bool:
    return p.suffix.lower() in {".mp4", ".mov", ".m4v", ".avi", ".webm", ".mkv"}


def _fenced_text(text: str) -> str:
    """Wrap *text* in a Markdown ``text`` code fence that the text cannot close.

    The fence is at least three backticks and always one backtick longer than
    the longest backtick run inside *text*, so content that itself contains
    code fences still renders as a single block.
    """
    longest = run = 0
    for ch in text:
        run = run + 1 if ch == "`" else 0
        if run > longest:
            longest = run
    fence = "`" * max(3, longest + 1)
    return f"{fence}text\n{text}\n{fence}"


def display_multimodal_results(records: List[Dict[str, Any]]) -> None:
    """Render scenario result records in the notebook.

    For each record this shows a heading and a metadata list, then (when
    present) the error, the response text, and every saved artifact. Saved
    files that exist and carry an image or video suffix are embedded inline.

    Args:
        records: Result dicts; the keys read here are ``scenario``, ``kind``,
            ``model_ref``, ``provider``, ``model_id``, ``endpoint_path``,
            ``latency_s``, ``media_count``, ``job_id``, ``usage``, ``error``,
            ``response_text`` and ``saved_files``.
    """
    for rec in records:
        title = f"{rec.get('scenario')} | {rec.get('kind')} | {rec.get('model_ref')}"
        meta_lines = [
            f"provider: {rec.get('provider')}",
            f"model_id: {rec.get('model_id')}",
            f"endpoint: {rec.get('endpoint_path')}",
            f"latency_s: {rec.get('latency_s')}",
            f"media_count: {rec.get('media_count')}",
        ]

        if rec.get("job_id"):
            meta_lines.append(f"job_id: {rec.get('job_id')}")
        if rec.get("usage"):
            meta_lines.append(f"usage: {rec.get('usage')}")

        body = "\n".join(f"- {line}" for line in meta_lines)
        display(Markdown(f"### {title}\n{body}"))

        # Errors can carry multi-line server bodies; a fenced block keeps them
        # from breaking the metadata bullet list or being parsed as Markdown.
        if rec.get("error"):
            display(Markdown("**Error**"))
            display(Markdown(_fenced_text(str(rec.get("error")))))

        if rec.get("response_text"):
            display(Markdown("**Response text**"))
            display(Markdown(_fenced_text(str(rec.get("response_text")))))

        saved_files = rec.get("saved_files") or []
        if saved_files:
            display(Markdown("**Saved artifacts**"))
            for fp in saved_files:
                p = Path(fp)
                display(Markdown(f"- `{p}`"))
                if not p.exists():
                    continue
                if _is_image_file(p):
                    display(Image(filename=str(p)))
                elif _is_video_file(p):
                    display(Video(filename=str(p), embed=True))


In [None]:
# =========================
# 3) Run default multimodal suite from YAML
# =========================
# Build the harness from the config loaded in cell 0; relative paths in the
# YAML (media inputs, output_dir) are resolved against the config file's folder.
mm_harness = BedrockMultimodalHarness(cfg, config_dir=CFG_BASE_DIR)

# No explicit scenario list: run_suite() falls back to multimodal.default_run,
# or to every configured scenario (sorted by name) when that list is empty.
results = mm_harness.run_suite()
display_multimodal_results(results)

# Last expression of the cell: echoes the raw record list (including each
# record's full response_json) as the cell output.
results
