# Arrow Judge Notebook

Uses a multimodal LLM judge to score how closely reapplied arrows match the original arrows in shape, style, and edge quality.

In [None]:
import base64
import json
import os
from pathlib import Path
from typing import Any, Dict, List

from openai import OpenAI

In [None]:
# Chat model used as the judge; it must accept image inputs.
MODEL = "gpt-4o"  # Advanced multimodal model

# The key comes from the environment so it never lives in the notebook.
# A missing or empty value aborts here rather than on the first request.
API_KEY = os.environ.get("OPENAI_API_KEY")
if API_KEY is None or API_KEY == "":
    raise ValueError("OPENAI_API_KEY is not set in your environment.")

# Single client instance shared by the cells below.
client = OpenAI(api_key=API_KEY)

In [None]:
def image_to_data_url(path: str) -> str:
    """Encode the image file at *path* as a base64 ``data:`` URL.

    The MIME subtype is taken from the file extension (case-insensitive).
    Extensions that differ from the registered ``image/*`` subtype are
    translated first, e.g. ``jpg`` -> ``jpeg`` and ``tif`` -> ``tiff``.

    Args:
        path: Filesystem path of the image to encode.

    Returns:
        A string of the form ``data:image/<subtype>;base64,<payload>``.

    Raises:
        ValueError: If *path* has no file extension, since no media type
            can be derived and the resulting URL would be malformed.
        OSError: If the file cannot be opened or read.
    """
    # Extensions whose spelling is not the media subtype itself.
    subtype_aliases = {
        "jpg": "jpeg",
        "jpe": "jpeg",
        "tif": "tiff",
        "svg": "svg+xml",
    }

    image_path = Path(path)
    extension = image_path.suffix.lower().lstrip(".")
    if not extension:
        # Without this guard the result would be "data:image/;base64,...".
        raise ValueError(
            f"Cannot determine image type: {path!r} has no file extension."
        )

    subtype = subtype_aliases.get(extension, extension)
    payload = base64.b64encode(image_path.read_bytes()).decode("ascii")
    return f"data:image/{subtype};base64,{payload}"

def build_rubric_prompt() -> str:
    """Return the instruction text sent to the judge model.

    The prompt describes the ORIGINAL / REAPPLIED image pair, defines three
    criteria (shape, style, edges) each scored 1-5, and requests a JSON-only
    reply with ``overall_score``, ``per_metric`` and ``notes``.

    The criteria are numbered 1-3 and the per-metric ranges in the JSON
    schema are 1-5, so they agree with both the rubric scale and the stated
    ``overall_score`` range of 3-15 (the sum of three 1-5 scores).

    Returns:
        The full prompt as a single multi-line string.
    """
    return """You are an expert annotator for arrow segmentation and overlay quality.
You will receive an ORIGINAL image and a REAPPLIED image. The original image will be a medical image with overlaid arrows.
The reapplied image will have additional arrows that have been lifted off of the original image with segmentation and re-overlayed at random locations on the image.
Your job is to determine how natural and inconspicuous the new arrows look compared to the original arrows.

Judge similarity of arrow placement and appearance using this rubric:

1) Arrow General Shape (1-5)
   - 5: Shape of new arrows matches the shape of the original arrows almost exactly
   - 4: Shape of new arrows is close to the original arrows but still distinguishable due to very minor imperfections
   - 3: New arrows are clearly distinguishable based on shape imperfections
   - 2: New arrows are barely reminiscent of original arrow
   - 1: Shape is not at all reminiscent of original arrow

2) Arrow Style (1-5)
   - 5: Style and color almost identical to original arrows
   - 4: Style and color match well
   - 3: Noticeable differences in style and color
   - 2: Major differences in style and color
   - 1: Style and color are completely different

3) Arrow Edges (1-5)
   - 5: Edges of new arrows are crisp, clean, and straight
   - 4: Edges of new arrows are clear and straight, with minor speckle or jaggedness
   - 3: Edges of new arrows make them look clearly segmented and different
   - 2: Edges very jagged or have a lot of speckle. Not clean lines.
   - 1: Cannot tell where the edges of the arrow are.

Output JSON only with:
{
  "overall_score": 3-15,
  "per_metric": {
    "shape": 1-5,
    "style": 1-5,
    "edges": 1-5
  },
  "notes": "justification for scores"
}

Compute overall_score as a sum:
shape + style + edges
"""


In [None]:
def judge_arrow_similarity(reapplied_path: str, original_path: str) -> Dict[str, Any]:
    """Ask the judge model to score the reapplied arrows against the originals.

    Both images are inlined as base64 data URLs and sent with the rubric
    prompt in one chat-completion request. The request uses temperature 0
    and asks for a JSON-object reply.

    Args:
        reapplied_path: Path to the image with the re-overlaid arrows.
        original_path: Path to the original image.

    Returns:
        The model's reply parsed from JSON into a dict. The rubric asks for
        the keys ``overall_score``, ``per_metric`` and ``notes``, but the
        reply is not validated against that schema here.

    Raises:
        ValueError: If the reply has no text content, is not valid JSON, or
            is valid JSON but not an object.
    """
    reapplied_url = image_to_data_url(reapplied_path)
    original_url = image_to_data_url(original_path)

    messages = [
        {
            "role": "system",
            "content": "You are a precise visual evaluator. Follow the rubric exactly."
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": build_rubric_prompt()},
                # Text labels tell the model which image is which.
                {"type": "text", "text": "ORIGINAL IMAGE:"},
                {"type": "image_url", "image_url": {"url": original_url}},
                {"type": "text", "text": "REAPPLIED IMAGE:"},
                {"type": "image_url", "image_url": {"url": reapplied_url}},
            ],
        },
    ]

    resp = client.chat.completions.create(
        model=MODEL,
        messages=messages,
        temperature=0.0,
        response_format={"type": "json_object"},
    )

    # message.content is a JSON *string* (or None), so decode it to honor
    # the declared Dict return type.
    content = resp.choices[0].message.content
    if not content:
        raise ValueError("Judge model returned no content.")
    try:
        verdict = json.loads(content)
    except json.JSONDecodeError as err:
        raise ValueError(f"Judge model returned invalid JSON: {err}") from err
    if not isinstance(verdict, dict):
        raise ValueError(
            f"Judge model returned {type(verdict).__name__}, expected a JSON object."
        )
    return verdict

In [None]:
# Example run on one image pair. Both paths are relative, so they are
# resolved against the notebook's current working directory.
ORIGINAL_PATH = "../pics/pic5.jpeg"
REAPPLIED_PATH = "reapplied/pic5.jpeg"

# Query the judge and show whatever it returned.
result_json = judge_arrow_similarity(
    reapplied_path=REAPPLIED_PATH,
    original_path=ORIGINAL_PATH,
)
print(result_json)