# Bedrock local test notebook (invoke + bearer token)

This notebook runs **locally** (VS Code / Jupyter) and calls **Amazon Bedrock Runtime** via HTTPS using a team-owned **Bedrock long-term API key** (bearer token).

## One-time setup (recommended)

1. Export these env vars **before you launch VS Code** (so the Jupyter kernel inherits them):

```bash
export AWS_BEARER_TOKEN_BEDROCK="ABSK..."
export AWS_REGION="us-east-1"   # optional: any Bedrock-enabled region; if unset, the YAML's auth.default_region is used
```

> If VS Code was already running when you exported the variables, fully quit it (Cmd+Q on macOS) and re-open it **after** exporting them; otherwise the Jupyter kernel will not inherit the new values.

2. Put these two files in the same folder:

- `test_bedrock.ipynb`
- `bedrock_harness.yaml`

3. Install deps in your venv once:

```bash
pip install requests pyyaml
```

## What you get

- A single helper: `get_completion(system, user, model="preset_name")`
- Preset-based fallback across providers (Claude-first, with explicit Nova / gpt-oss fallbacks)
- A side-by-side A/B compare display that shows:
  - preset, provider, model_ref, model_id
  - latency + token usage


In [None]:
# =========================
# 0) Config (YAML) + Auth (env vars only)
# =========================

import os
from pathlib import Path

try:
    import yaml
except ImportError as e:
    raise ImportError("Missing dependency: pyyaml. Run: pip install pyyaml") from e

# Config location: the BEDROCK_HARNESS_CONFIG env var wins; otherwise the
# file name below is used (a relative path is resolved by Path.resolve()).
CONFIG_ENV = "BEDROCK_HARNESS_CONFIG"
CONFIG_PATH = os.environ.get(CONFIG_ENV, "bedrock_harness.yaml")
cfg_path = Path(CONFIG_PATH).expanduser().resolve()

if not cfg_path.exists():
    raise FileNotFoundError(
        f"Missing config file: {cfg_path}\n"
        "Put 'bedrock_harness.yaml' next to this notebook, or set:\n"
        f"  export {CONFIG_ENV}=/path/to/bedrock_harness.yaml"
    )

# An empty YAML file parses to None; treat that as an empty config.
cfg = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) or {}
# Everything below calls cfg.get(...), so fail early with a clear message if
# the top-level YAML node is a list or scalar instead of a mapping.
if not isinstance(cfg, dict):
    raise TypeError(
        f"Config file {cfg_path} must contain a YAML mapping at the top level, "
        f"got {type(cfg).__name__}."
    )

auth_cfg = cfg.get("auth", {}) or {}
TOKEN_ENV = auth_cfg.get("token_env", "AWS_BEARER_TOKEN_BEDROCK")
REGION_ENV = auth_cfg.get("region_env", "AWS_REGION")
DEFAULT_REGION = auth_cfg.get("default_region", "us-east-1")

# --- Token must be provided via env var (no prompts, no YAML secrets) ---
# Tokens shorter than this are rejected as obviously incomplete.
MIN_TOKEN_LENGTH = 20
token = (os.environ.get(TOKEN_ENV) or "").strip()
if len(token) < MIN_TOKEN_LENGTH:
    # Distinguish "not set at all" from "set, but truncated / placeholder".
    if not token:
        problem = f"Missing env var {TOKEN_ENV}."
    else:
        problem = (
            f"Env var {TOKEN_ENV} is set but shorter than {MIN_TOKEN_LENGTH} "
            "characters, so it is rejected as incomplete."
        )
    raise RuntimeError(
        f"{problem}\n\n"
        "Export it *before launching VS Code / Jupyter*, e.g.:\n"
        f"  export {TOKEN_ENV}='ABSK...'\n"
        f"  export {REGION_ENV}='us-east-1'\n\n"
        "Then fully quit VS Code (Cmd+Q) and re-open it."
    )

# Region: env var overrides YAML default_region
AWS_REGION = (os.environ.get(REGION_ENV) or DEFAULT_REGION).strip()
# Write the resolved region back so later reads of the env var agree with it.
os.environ[REGION_ENV] = AWS_REGION

defaults_cfg = cfg.get("defaults", {}) or {}
DEFAULT_PRESET = defaults_cfg.get("preset", "analysis_max")
DEBUG_FALLBACK_DEFAULT = bool(defaults_cfg.get("debug_fallback", True))

# Summary for the notebook user; the token value itself is never printed.
print("✅ Loaded config:", str(cfg_path))
print("   Region:", AWS_REGION)
print("   Token env:", TOKEN_ENV, "=", "set" if os.environ.get(TOKEN_ENV) else "missing")
print("   Presets:", ", ".join(sorted((cfg.get("presets", {}) or {}).keys())))
print("   Default preset:", DEFAULT_PRESET)
print("   Debug fallback:", DEBUG_FALLBACK_DEFAULT)


In [None]:
# =========================
# 1) Harness (invoke + bearer token; config-driven)
# =========================
from __future__ import annotations

import json
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union

import requests


@dataclass(frozen=True)
class ModelSpec:
    """Immutable description of one model candidate inside a preset.

    Instances are built by ``BedrockHarness.build_preset_specs`` from the
    config's ``presets`` and ``models`` sections and are consumed by the
    provider-specific invoke methods.
    """

    # Name of the preset this candidate belongs to.
    preset: str
    # Key of the model entry in the config's ``models`` section.
    model_ref: str
    provider: str          # "anthropic" | "nova" | "openai_compat"
    model_id: str          # Bedrock modelId / inference profile id
    # Sampling temperature; None means the field is left out of the request.
    temperature: Optional[float]
    # Upper bound on generated tokens sent with every request.
    max_output_tokens: int

    # Anthropic-only knobs
    # Whether a "thinking" section is added to Anthropic payloads.
    anthropic_thinking_enabled: bool = False
    # Optional "budget_tokens" value for that thinking section.
    anthropic_thinking_budget_tokens: Optional[int] = None


def _normalize_usage(provider: str, resp_json: Dict[str, Any]) -> Dict[str, Optional[int]]:
    """Normalize usage fields into {input_tokens, output_tokens, total_tokens}."""
    u = (resp_json or {}).get("usage") or {}
    inp = out = total = None

    if provider == "anthropic":
        inp = u.get("input_tokens")
        out = u.get("output_tokens")
        total = u.get("total_tokens")

    elif provider == "nova":
        # Nova tends to use camelCase in examples; handle both.
        inp = u.get("inputTokens", u.get("input_tokens"))
        out = u.get("outputTokens", u.get("output_tokens"))
        total = u.get("totalTokens", u.get("total_tokens"))

    elif provider == "openai_compat":
        inp = u.get("prompt_tokens", u.get("input_tokens"))
        out = u.get("completion_tokens", u.get("output_tokens"))
        total = u.get("total_tokens")

    # Best-effort total
    if total is None and (inp is not None or out is not None):
        total = (inp or 0) + (out or 0)

    return {
        "input_tokens": int(inp) if inp is not None else None,
        "output_tokens": int(out) if out is not None else None,
        "total_tokens": int(total) if total is not None else None,
    }


class BedrockHarness:
    """Config-driven Bedrock Runtime client with preset-based model fallback.

    Requests go over HTTPS with a bearer token read from an environment
    variable. A preset is an ordered list of model candidates;
    ``get_completion`` tries them in order and returns the first success.
    """

    # Tokens shorter than this are treated as missing by ``bearer_token``.
    _MIN_TOKEN_LEN = 20
    # How much of an error response body is quoted in raised messages.
    _ERROR_BODY_CHARS = 1200

    def __init__(self, cfg: Dict[str, Any]):
        """Read all settings from the parsed config mapping.

        Args:
            cfg: Parsed config. Missing or empty sections fall back to the
                defaults coded in this method.
        """
        self.cfg = cfg or {}

        auth = self.cfg.get("auth", {}) or {}
        self.token_env = auth.get("token_env", "AWS_BEARER_TOKEN_BEDROCK")
        self.region_env = auth.get("region_env", "AWS_REGION")
        self.default_region = auth.get("default_region", "us-east-1")

        http = self.cfg.get("http", {}) or {}
        self.timeout_seconds = int(http.get("timeout_seconds", 60))

        defaults = self.cfg.get("defaults", {}) or {}
        self.default_preset = defaults.get("preset", "analysis_max")
        self.debug_fallback_default = bool(defaults.get("debug_fallback", True))

        endpoints = self.cfg.get("endpoints", {}) or {}
        self.runtime_base_tmpl = endpoints.get("bedrock_runtime_base", "https://bedrock-runtime.{region}.amazonaws.com")

        providers = self.cfg.get("providers", {}) or {}
        self.anthropic_version = ((providers.get("anthropic", {}) or {}).get("anthropic_version")) or "bedrock-2023-05-31"

        openai_cfg = providers.get("openai_compat", {}) or {}
        self.openai_chat_path = openai_cfg.get("chat_completions_path", "/openai/v1/chat/completions")
        self.openai_system_role = openai_cfg.get("system_role", "developer")
        self.openai_max_tokens_param = openai_cfg.get("max_tokens_param", "max_completion_tokens")

        self._models = self.cfg.get("models", {}) or {}
        self._presets = self.cfg.get("presets", {}) or {}

        # Reuse connections
        self._session = requests.Session()

    # ---- env ----
    def region(self) -> str:
        """Return the region from the env var, else the configured default."""
        return (os.environ.get(self.region_env) or self.default_region).strip()

    def bearer_token(self) -> str:
        """Return the bearer token from the environment.

        Raises:
            RuntimeError: If the env var is unset, blank, or too short.
        """
        tok = (os.environ.get(self.token_env) or "").strip()
        if len(tok) < self._MIN_TOKEN_LEN:
            raise RuntimeError(f"Missing env var {self.token_env}.")
        return tok

    def runtime_base(self) -> str:
        """Return the runtime base URL for the current region, no trailing slash."""
        return self.runtime_base_tmpl.format(region=self.region()).rstrip("/")

    def headers_json(self) -> Dict[str, str]:
        """Return the auth and content-type headers for a JSON request."""
        return {"Authorization": f"Bearer {self.bearer_token()}", "Content-Type": "application/json"}

    # ---- config -> ModelSpec ----
    def build_preset_specs(self, preset: str) -> List["ModelSpec"]:
        """Expand a preset name into its ordered list of model candidates.

        Args:
            preset: Key in the config's ``presets`` section.

        Returns:
            One ModelSpec per preset entry, in config order. A preset whose
            value is empty or null yields an empty list.

        Raises:
            ValueError: If the preset, or a model_ref it names, is unknown.
            KeyError: If an entry lacks a required key.
        """
        if preset not in self._presets:
            raise ValueError(f"Unknown preset '{preset}'. Available: {sorted(self._presets.keys())}")

        out: List["ModelSpec"] = []
        # `or []`: a preset written as a bare key in YAML parses to None.
        for item in (self._presets[preset] or []):
            ref = item["model_ref"]
            if ref not in self._models:
                raise ValueError(f"Preset '{preset}' references unknown model_ref '{ref}'.")

            m = self._models[ref]
            provider = m["provider"]
            model_id = m["model_id"]

            temperature = item.get("temperature", None)
            max_output_tokens = int(item["max_output_tokens"])

            thinking_enabled = False
            thinking_budget = None
            if provider == "anthropic":
                anth = item.get("anthropic", {}) or {}
                thinking = (anth.get("thinking", {}) or {})
                thinking_enabled = bool(thinking.get("enabled", False))
                if "budget_tokens" in thinking:
                    thinking_budget = int(thinking["budget_tokens"])

                # --- Guardrails for Claude extended thinking ---
                # 1) When thinking is enabled, temperature must be 1.
                if thinking_enabled:
                    temperature = 1

                # 2) max_output_tokens must be > thinking.budget_tokens.
                if thinking_enabled and thinking_budget is not None and thinking_budget >= max_output_tokens:
                    thinking_budget = max_output_tokens - 1

            out.append(ModelSpec(
                preset=preset,
                model_ref=ref,
                provider=provider,
                model_id=model_id,
                temperature=None if temperature is None else float(temperature),
                max_output_tokens=max_output_tokens,
                anthropic_thinking_enabled=thinking_enabled,
                anthropic_thinking_budget_tokens=thinking_budget,
            ))

        return out

    # ---- HTTP ----
    def _post_json(self, url: str, payload: Dict[str, Any], what: str) -> Dict[str, Any]:
        """POST a JSON payload and return the decoded JSON response.

        Args:
            url: Full request URL.
            payload: Request body; serialized with ``json.dumps``.
            what: Short call description used as the error-message prefix.

        Raises:
            RuntimeError: On any status other than 200 (or a missing token).
        """
        r = self._session.post(url, headers=self.headers_json(), data=json.dumps(payload), timeout=self.timeout_seconds)
        if r.status_code != 200:
            raise RuntimeError(f"{what} failed {r.status_code}: {r.text[:self._ERROR_BODY_CHARS]}")
        return r.json()

    # ---- provider adapters ----
    # Each adapter returns (response_text, raw_response_json, normalized_usage).
    def _invoke_anthropic(self, spec: "ModelSpec", system_prompt: Optional[str], user_prompt: str) -> Tuple[str, Dict[str, Any], Dict[str, Optional[int]]]:
        """Call an Anthropic model through the ``/invoke`` endpoint."""
        url = f"{self.runtime_base()}/model/{spec.model_id}/invoke"

        payload: Dict[str, Any] = {
            "anthropic_version": self.anthropic_version,
            "messages": [{"role": "user", "content": [{"type": "text", "text": user_prompt}]}],
            "max_tokens": int(spec.max_output_tokens),
        }
        if system_prompt:
            payload["system"] = system_prompt
        if spec.temperature is not None:
            payload["temperature"] = float(spec.temperature)

        if spec.anthropic_thinking_enabled:
            thinking: Dict[str, Any] = {"type": "enabled"}
            if spec.anthropic_thinking_budget_tokens is not None:
                thinking["budget_tokens"] = int(spec.anthropic_thinking_budget_tokens)
            payload["thinking"] = thinking

        resp = self._post_json(url, payload, "Anthropic invoke")
        # Keep only "text" content parts; other part types are dropped.
        parts = [c.get("text", "") for c in resp.get("content", []) if isinstance(c, dict) and c.get("type") == "text"]
        text = "".join(parts).strip()
        return text, resp, _normalize_usage("anthropic", resp)

    def _invoke_nova(self, spec: "ModelSpec", system_prompt: Optional[str], user_prompt: str) -> Tuple[str, Dict[str, Any], Dict[str, Optional[int]]]:
        """Call a Nova model through the ``/invoke`` endpoint."""
        url = f"{self.runtime_base()}/model/{spec.model_id}/invoke"

        payload: Dict[str, Any] = {
            "messages": [{"role": "user", "content": [{"text": user_prompt}]}],
            "inferenceConfig": {"maxTokens": int(spec.max_output_tokens)},
        }
        if spec.temperature is not None:
            payload["inferenceConfig"]["temperature"] = float(spec.temperature)
        if system_prompt:
            payload["system"] = [{"text": system_prompt}]

        resp = self._post_json(url, payload, "Nova invoke")
        content = (((resp.get("output") or {}).get("message") or {}).get("content")) or []
        parts = [c.get("text", "") for c in content if isinstance(c, dict) and "text" in c]
        text = "".join(parts).strip()
        return text, resp, _normalize_usage("nova", resp)

    def _openai_compat_chat(self, spec: "ModelSpec", system_prompt: Optional[str], user_prompt: str) -> Tuple[str, Dict[str, Any], Dict[str, Optional[int]]]:
        """Call a model through the OpenAI-compatible chat completions path.

        Raises:
            RuntimeError: On a non-200 status, or when the response carries
                no ``choices`` entry to read the answer from.
        """
        url = f"{self.runtime_base()}{self.openai_chat_path}"

        messages: List[Dict[str, Any]] = []
        if system_prompt:
            messages.append({"role": self.openai_system_role, "content": system_prompt})
        messages.append({"role": "user", "content": user_prompt})

        payload: Dict[str, Any] = {
            "model": spec.model_id,
            "messages": messages,
            "stream": False,
        }
        if spec.temperature is not None:
            payload["temperature"] = float(spec.temperature)
        # The max-tokens parameter name is configurable.
        payload[self.openai_max_tokens_param] = int(spec.max_output_tokens)

        resp = self._post_json(url, payload, "OpenAI-compat chat")
        choices = resp.get("choices") or []
        if not choices:
            # Fail with a readable message (instead of IndexError/TypeError)
            # so the fallback log in get_completion explains what happened.
            raise RuntimeError("OpenAI-compat chat returned no choices.")
        text = ((choices[0] or {}).get("message") or {}).get("content", "") or ""
        return text, resp, _normalize_usage("openai_compat", resp)

    # ---- public API ----
    def get_completion(
        self,
        system_prompt: Optional[str],
        user_prompt: str,
        model: Optional[str] = None,
        *,
        return_record: bool = False,
        label: str = "",
        debug_fallback: Optional[bool] = None,
    ) -> Union[str, Dict[str, Any]]:
        """Run a prompt against a preset, falling back through its candidates.

        Args:
            system_prompt: Optional system instruction.
            user_prompt: The user message.
            model: Preset name; defaults to the configured default preset.
            return_record: When True, return a dict with metadata (chosen
                model, latency, usage, prompts, response), not just text.
            label: Display label stored in the record; defaults to the preset.
            debug_fallback: Print a line for each failed candidate. None
                means "use the configured default".

        Returns:
            The response text, or the record dict when ``return_record`` is
            True. If every candidate fails (or the preset has none), the
            text/"response" is "An error occurred: <last error>" and no
            exception is raised.

        Raises:
            ValueError: If the preset or one of its model_refs is unknown.
        """
        preset = model or self.default_preset
        if debug_fallback is None:
            debug_fallback = self.debug_fallback_default

        candidates = self.build_preset_specs(preset)

        adapters = {
            "anthropic": self._invoke_anthropic,
            "nova": self._invoke_nova,
            "openai_compat": self._openai_compat_chat,
        }

        last_err: Optional[Exception] = None
        if not candidates:
            # Without this the caller would see "An error occurred: None".
            last_err = ValueError(f"preset '{preset}' has no model candidates configured")

        # Started once, before the loop: the reported latency therefore
        # includes time spent on candidates that failed before the winner.
        t0 = time.perf_counter()

        for idx, spec in enumerate(candidates, start=1):
            try:
                invoke = adapters.get(spec.provider)
                if invoke is None:
                    raise ValueError(f"Unknown provider: {spec.provider}")
                text, raw, usage = invoke(spec, system_prompt, user_prompt)
            except Exception as e:
                # Deliberately broad: any failure moves on to the next candidate.
                last_err = e
                if debug_fallback:
                    print(f"[fallback] preset={preset} model_ref={spec.model_ref} provider={spec.provider} failed: {type(e).__name__}: {e}")
                continue

            if not return_record:
                return text

            latency = time.perf_counter() - t0
            return {
                "label": label or preset,
                "preset": preset,
                "picked_index": idx,
                "model_ref": spec.model_ref,
                "provider": spec.provider,
                "model_id": spec.model_id,
                "temperature": spec.temperature,
                "max_output_tokens": spec.max_output_tokens,
                "anthropic_thinking_enabled": spec.anthropic_thinking_enabled,
                "anthropic_thinking_budget_tokens": spec.anthropic_thinking_budget_tokens,
                "latency_s": round(latency, 3),
                "usage": usage,
                "usage_raw": raw.get("usage"),
                "system_prompt": system_prompt or "",
                "user_prompt": user_prompt,
                "response": text,
            }

        failure_text = f"An error occurred: {last_err}"
        if not return_record:
            return failure_text

        return {
            "label": label or preset,
            "preset": preset,
            "picked_index": None,
            "model_ref": None,
            "provider": None,
            "model_id": None,
            "temperature": None,
            "max_output_tokens": None,
            "anthropic_thinking_enabled": None,
            "anthropic_thinking_budget_tokens": None,
            "latency_s": None,
            "usage": None,
            "usage_raw": None,
            "system_prompt": system_prompt or "",
            "user_prompt": user_prompt,
            "response": failure_text,
        }


# One shared harness instance for the whole notebook
h = BedrockHarness(cfg)


# Notebook-friendly alias (same signature as your original notebook)
def get_completion(system_prompt, user_prompt, model=None, return_record: bool = False, label: str = "", debug_fallback: Optional[bool] = None):
    """Forward the call unchanged to the shared harness ``h``."""
    options = {
        "return_record": return_record,
        "label": label,
        "debug_fallback": debug_fallback,
    }
    return h.get_completion(system_prompt, user_prompt, model, **options)


In [None]:
# =========================
# 2) Display helper (side-by-side compare + token usage)
# =========================
from IPython.display import Markdown, display
import html
from typing import Any, Dict, Optional


def display_responses(*records: Dict[str, Any], title: Optional[str] = None, max_col_width_px: int = 520):
    """Render harness records as one HTML table, one column per record.

    Args:
        *records: Record dicts, or a single list/tuple holding them.
        title: Optional heading row spanning all columns.
        max_col_width_px: CSS max-width applied to every column.

    The table has a header row (label plus model metadata), a row with the
    system/user prompts, and a row with the responses.
    """
    # A lone list/tuple argument is unpacked into the columns.
    single_sequence = len(records) == 1 and isinstance(records[0], (list, tuple))
    cols = list(records[0]) if single_sequence else list(records)

    def esc(x: Any) -> str:
        # HTML-escape, then keep line breaks visible.
        raw = "" if x is None else str(x)
        return html.escape(raw).replace("\n", "<br/>")

    def token_summary(rec: Dict[str, Any]) -> str:
        usage = rec.get("usage") or {}
        if not (isinstance(usage, dict) and usage):
            return ""
        tagged = (("in", "input_tokens"), ("out", "output_tokens"), ("total", "total_tokens"))
        pieces = [f"{tag}:{usage.get(key)}" for tag, key in tagged if usage.get(key) is not None]
        if not pieces:
            return ""
        return "tokens: " + " · ".join(pieces)

    def header_meta(rec: Dict[str, Any]) -> str:
        # Identity fields are shown only when truthy.
        lines = [
            f"{name}: {rec.get(name)}"
            for name in ("preset", "provider", "model_ref", "model_id")
            if rec.get(name)
        ]

        # Numeric fields are shown whenever they are not None (0 counts).
        numeric = (
            ("picked_index", "picked: #{}"),
            ("latency_s", "latency: {}s"),
            ("max_output_tokens", "max_out: {}"),
            ("temperature", "temp: {}"),
        )
        for key, template in numeric:
            value = rec.get(key)
            if value is not None:
                lines.append(template.format(value))

        thinking_flag = rec.get("anthropic_thinking_enabled")
        if thinking_flag:
            lines.append("thinking: on")
            budget = rec.get("anthropic_thinking_budget_tokens")
            if budget is not None:
                lines.append(f"budget: {budget}")
        elif thinking_flag is False and rec.get("provider") == "anthropic":
            lines.append("thinking: off")

        tokens = token_summary(rec)
        if tokens:
            lines.append(tokens)

        return "<br/>".join(esc(line) for line in lines)

    width_css = f"max-width:{max_col_width_px}px;'>"
    td_open = "<td style='vertical-align:top; border:1px solid #ddd; padding:10px; " + width_css
    th_open = (
        "<th style='text-align:left; vertical-align:top; border:1px solid #ddd; padding:10px; "
        "background:#f7f7f7; min-width:320px; " + width_css
    )

    def header_cell(position: int, rec: Dict[str, Any]) -> str:
        caption = rec.get("label") or f"Variant {position}"
        meta = header_meta(rec)
        return (
            th_open
            + f"{esc(caption)}<br/><span style='font-weight:normal;color:#666'>{meta}</span></th>"
        )

    def prompt_cell(rec: Dict[str, Any]) -> str:
        system_text = rec.get("system_prompt", "")
        user_text = rec.get("user_prompt", "")
        return (
            td_open
            + f"<div style='margin-bottom:10px;'><b>System</b><br/>{esc(system_text)}</div>"
            + f"<div><b>User</b><br/>{esc(user_text)}</div>"
            + "</td>"
        )

    def response_cell(rec: Dict[str, Any]) -> str:
        answer = rec.get("response", "")
        return (
            td_open
            + "<div><b>Response</b></div>"
            + f"<div style='margin-top:6px; line-height:1.35;'>{esc(answer)}</div>"
            + "</td>"
        )

    def row(cells) -> str:
        return "<tr>" + "".join(cells) + "</tr>"

    chunks = [
        "<div style='overflow-x:auto; padding:6px 2px;'>",
        "<table style='border-collapse:collapse; width:max-content; min-width:100%;'>",
    ]

    if title:
        chunks.append(
            f"<tr><th colspan='{len(cols)}' "
            "style='text-align:left; font-size:16px; padding:10px; border:1px solid #ddd; background:#fafafa;'>"
            f"{esc(title)}</th></tr>"
        )

    chunks.append(row(header_cell(pos, rec) for pos, rec in enumerate(cols, start=1)))
    chunks.append(row(prompt_cell(rec) for rec in cols))
    chunks.append(row(response_cell(rec) for rec in cols))
    chunks.append("</table></div>")

    display(Markdown("".join(chunks)))


In [None]:
# =========================
# 3) Quick A/B test (analysis_max vs cheap_fast)
# =========================
SYSTEM = "You are an AI engineering expert. Explain with metaphors and examples."
USER = "What's the difference of RAG, MCP, and SKILL? Explain it in Chinese."


def _ab_record(preset_name):
    """Run the shared prompt pair on one preset and return its full record."""
    return get_completion(
        SYSTEM,
        USER,
        model=preset_name,
        return_record=True,
        label=preset_name,
    )


# Same prompts, two presets: the column label is the preset name itself.
resp_max = _ab_record("analysis_max")
resp_fast = _ab_record("cheap_fast")

display_responses(resp_max, resp_fast, title="Bedrock A/B: analysis_max vs cheap_fast")
