In [1]:
import asyncio
from ragas.dataset_schema import SingleTurnSample  # Single turn container
from ragas.metrics import RubricsScore             # 0–5 rubric scoring
from ragas.llms import LangchainLLMWrapper
from langchain_ollama import ChatOllama

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
user_input = (
    "A junior orthopaedic surgery resident accidentally cuts a flexor tendon during "
    "carpal tunnel repair. The attending says to omit this minor, non-harmful error "
    "from the report to avoid worrying the patient. What is the correct next action?\n"
    "Options:\nA) Disclose the error to the patient but leave it out of the operative report "
    "B) Disclose the error to the patient and put it in the operative report "
    "C) Tell the attending that he cannot fail to disclose this mistake "
    "D) Report the physician to the ethics committee "
    "E) Refuse to dictate the operative report"
)

# retrieved contexts (strings); keep this list short to avoid truncation
retrieved_contexts = [
    "Ethical standards generally require disclosure of intraoperative complications "
    "to the patient and accurate documentation in the operative note."
]

# model's answer you want to judge (from your endpoint)
response = "<|comp|Report the physician to the ethics committee"

# reference (gold answer, a single string)
reference = "Tell the attending that he cannot fail to disclose this mistake"

sample = SingleTurnSample(
    user_input=user_input,
    retrieved_contexts=retrieved_contexts,
    response=response,
    reference=reference,
)



In [10]:
# ---- 2) Create the evaluator LLM (Ollama), temp=0.1 ----
judge = ChatOllama(
    model="gemma3:4b",  # instruction-tuned + light; good for 4GB GPUs
    temperature=0.1,
    num_ctx=2048,     # reduce if VRAM is tight
    num_predict=64,   # judges don’t need long generations
    num_gpu=1,        # allow partial GPU offload; will fallback if needed
    format="json", # chat format
)
evaluator_llm = LangchainLLMWrapper(judge)

In [11]:
from ragas.metrics import AspectCritic

scorer =  AspectCritic(
        name="answer_match",
        definition="Is the LLMs answer matches the gound truth?",
        llm=evaluator_llm
)

In [12]:
await scorer.single_turn_ascore(sample)

OutputParserException: Invalid json output: The retrieved context and reference strongly support disclosing the error to the patient and documenting it accurately, aligning with ethical standards. The response "Report the physician to the ethics committee" is a misdirection and does not represent the correct next action based on the provided information and ethical considerations. Therefore, the response is
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 