In [1]:
distract_evaluation_instruction = """
I will give you a set of input, and here is the evaluation guideline, please score the given input

## 1. Objective
Evaluate the quality of three distractor options (incorrect candidates) accompanying the correct answer (object) in a multiple-choice setting derived from biomedical fill-in-the-blank prompts.
Each distractor should be:
- Plausible given the question
- Incorrect (not the original object)
- Relevant in context and domain

## 2. Input
You will be shown:
- Original Sentence
- Triple (Subject, Relation, Object)
- Fill-in-the-Blank Prompt
- Answer Choices

## 3. Evaluation Criteria
### 1) Plausibility in Context
Is the distractor believable given the prompt and domain knowledge (biomedical)?**

- **What it checks:** subject, relation, and expected answer type (should be the object)
- **Focuses on:** Be cautious of meaning shifts, incorrect substitutions, or role reversals.

| **Score** | **Description** |
| --- | --- |
| 3 | Highly plausible: Very convincing as an answer; can confuse even experts; fits subject, relation, domain well. |
| 2 | Moderately plausible: Makes sense in general; fits domain and context somewhat; can be ruled out by basic domain knowledge. |
| 1 | Barely plausible: Awkward or uncommon; easily ruled out by surface cues or common sense without any domain knowledge. |
| 0 | Implausible: Irrelevant, nonsensical, or grammatically incorrect; not a valid answer option. |

### 2) Incorrectness

**Is the distractor clearly incorrect given the original sentence and triple?**

| **Score** | **Description** |
| --- | --- |
| 3 | Definitely wrong: the distractor is contradictory or not supported for the original sentence. |
| 2 | Likely wrong: the distractor is ambiguous or partially true based on the given sentence. |
| 1 | Borderline: the distractor is possibly true or partially correct based on the given sentence, which creates ambiguity. |
| 0 | Incorrectly labeled: the distractor is actually correct or the original answer given the original sentence. |

Please summarize the score and the reason for each metric. Please do reasoning in English" \
"""

In [2]:
english_crosslang_evaluation_instruction = """
Given the English knowledge triple and Japanese abstract, please do the evaluation about the cross-lingual correctness of the knowledge.

## Cross-lingual Correctness
Can the same knowledge (triple) be found or inferred from the Japanese abstract?

Score = 2: Direct match — same triple is clearly expressed in one sentence in the abstract.
Score = 1: Inferable — not in one sentence, but can be reasonably inferred from the paragraph as a whole.
Score = 0: Not supported — the triple cannot be inferred from the abstract, or unrelated.

If score is not 0, please also identify the sentences talking about the triple knowledge. 
Please do reasoning in English.
"""

In [3]:
japanese_crosslang_evaluation_instruction = """
Given the Japanese knowledge triple and English abstract, please do the evaluation about the cross-lingual correctness of the knowledge.

## Cross-lingual Correctness
Can the same knowledge (triple) be found or inferred from the Japanese abstract?

Score = 2: Direct match — same triple is clearly expressed in one sentence in the abstract.
Score = 1: Inferable — not in one sentence, but can be reasonably inferred from the paragraph as a whole.
Score = 0: Not supported — the triple cannot be inferred from the abstract, or unrelated.

If score is not 0, please also identify the sentences talking about the triple knowledge. 
Please do reasoning in English.
"""

In [15]:
lang = "en"

task = "crosslang_evaluation"
# task = "distractor_evaluation"

In [5]:
import os
from langchain_openai import AzureChatOpenAI

os.environ["AZURE_MODEL_NAME"] = "gpt-4.1-2025-04-14"
os.environ["OPENAI_API_TYPE"] = "azure_ad"
os.environ["OPENAI_API_VERSION"] = "2025-04-01-preview"
os.environ["AZURE_OPENAI_API_KEY"] = "<API_KEY>"
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://llm-jp-openai-mmwg-01.openai.azure.com"

llm = AzureChatOpenAI(
        deployment_name=os.environ["AZURE_MODEL_NAME"],
        temperature=0,
        top_p=1,
        max_retries=1,
        timeout=120,
        logprobs=False,
        max_tokens=2048,  
    )

In [22]:
import os

from utils import EXP_ROOT, load_jsonl

lang = "ja"
qa_root = os.path.join(EXP_ROOT, "datasets/kg-datasets/ja-0.5/eval_qa/03_en_qa")
ner_root = os.path.join(EXP_ROOT, "datasets/kg-datasets/ja-0.5/eval_qa/01_fact_check")

if lang == "ja":
    response_path = os.path.join(qa_root, "ja_generation.jsonl")
    ner_path = os.path.join(ner_root, "bionlp_ners.jsonl")
else:
    response_path = os.path.join(qa_root, "en_generation.jsonl")
    ner_path = os.path.join(ner_root, "ja_ners.jsonl")

responses = load_jsonl(response_path)
en_docs = {}
for item in load_jsonl(ner_path):
    en_docs[item['docid']] = item['abstract']

for item in responses:
    if "target_lang_abs" not in item['metadata']:
        assert item['metadata']['docid'] in en_docs, f"Missing docid {item['docid']} in en_docs"
        item['metadata']['target_lang_abs'] = en_docs[item['metadata']['docid']]

request_ids = [item['id'] for item in load_jsonl(f"../99_adaxeval_datasets/{lang}_knowledge_memorization.jsonl")]
print(f"Total {len(request_ids)} requests, {len(responses)} responses")
responses = [item for item in responses if item['request_id'] in request_ids]
print(f"Filtered to {len(responses)} responses")

Total 2553 requests, 3934 responses
Filtered to 2553 responses


In [24]:
item['metadata']['target_lang_abs']

'We report a case of severe multiple blunt trauma in a 51-year-old male who had fallen from a height. After the arrival of the air ambulance, he was diagnosed with a massive left hemothorax and an unstable pelvic fracture. At the scene, he underwent tracheal intubation and left thoracostomy. Upon arrival at hospital, he was in a state of cardiopulmonary arrest caused by hemorrhagic shock due to the massive left hemothorax. Therefore, resuscitative thoracotomy was performed for the massive left hemothorax, and immediate pulmonary hilum clamping was conducted as a damage control measure against active bleeding from a deep pulmonary laceration. After aortic occlusion had been performed to achieve primary hemostasis, the patient was transported to a hybrid operating room, and left lower lobectomy was performed along with gauze packing. After the lobectomy, transcatheter arterial embolization and external fixation were carried out for the unstable pelvic fracture. Thus, this case shows that

In [25]:
def get_responses(method="random", preload_file=None):
    assert method in ["random", "preloaded"], "Method must be 'random' or 'preloaded'"
    if method == "random":   
        random.seed(42)
        random.shuffle(responses)
        for idx in range(100):
            yield responses[idx]
    else:
        import pandas as pd
        assert preload_file is not None, "Preload file must be specified for 'preloaded' method"
        assert os.path.exists(preload_file), f"Preload file {preload_file} does not exist"
        loaded_responses = pd.read_csv(preload_file)
        request_ids = loaded_responses["request_id"].tolist()

        id2response = {response["request_id"]: response for response in responses}
        for request_id in request_ids:
            if request_id in id2response:
                yield id2response[request_id]
            else:
                raise ValueError(f"Request ID {request_id} not found in responses")

In [26]:
from tqdm import tqdm
from utils import dump_jsonl

if lang == "en":
    preloaded_file = "./en_evaluation.csv"
else:
    preloaded_file = "./ja_evaluation.csv"

if task == "crosslang_evaluation":
    if lang == "en":
        system_prompt = english_crosslang_evaluation_instruction
    elif lang == "ja":
        system_prompt = japanese_crosslang_evaluation_instruction
    dump_file = f"./{lang}_crosslang_evaluation.jsonl"
else:
    system_prompt = distract_evaluation_instruction
    dump_file = f"./{lang}_distractor_annotations.jsonl"

annotations = load_jsonl(dump_file) if os.path.exists(dump_file) else []
loaded_annotations = {annotation['request_id']: annotation for annotation in annotations}

all_annotations, user_prompts, openai_requests = [], [], []
for idx, response in tqdm(enumerate(get_responses(method="preloaded", preload_file=preloaded_file))):
    request_id = response['request_id']
    generation = response["generation"]
    triple = generation['triple']
    prompt = generation["fill_in_blank"]
    paraphrase = generation["question"]
    distractors = generation["distractors"]
    
    correct_ans = generation["answer"]
    sentence = response['metadata']['sentence']
    abstract = response["metadata"]["target_lang_abs"]
    
    user_prompt = ""

    if task == "crosslang_evaluation":
        if lang == "en":
            user_prompt += f"Japanese abstract: {abstract}\n"
            user_prompt += f"English triple: {triple}\n"
        else:
            user_prompt += f"English abstract: {abstract}\n"
            user_prompt += f"Japanese triple: {triple}\n"
    else:
        user_prompt += f"sentence: {sentence}\n"
        user_prompt += f"triple: {triple}\n"
        user_prompt += f"question: {paraphrase}\n"
        user_prompt += f"distractor 1: {distractors[0]}\n"
        user_prompt += f"distractor 2: {distractors[1]}\n"
        user_prompt += f"distractor 3: {distractors[2]}\n"
                
    if response['request_id'] in loaded_annotations:
        print(f"Skipping request {response['request_id']} as it is already annotated.")
        user_prompts.append(user_prompt)
        all_annotations.append(loaded_annotations[response['request_id']])
        continue

    request = {
        "request_id": response['request_id'],
        "message": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        }
    annotation = llm.invoke(request['message']).model_dump()
    annotation["request_id"] = request["request_id"]
    all_annotations.append(annotation)
    user_prompts.append(user_prompt)
    dump_jsonl(all_annotations, dump_file)
    # break

0it [00:00, ?it/s]

95it [04:56,  3.12s/it]


BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'jailbreak': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'medium'}}}}}

In [27]:
if task == "crosslang_evaluation":
    dump_root = f"./{lang}_crosslang_evaluation"
else:
    dump_root = f"./{lang}_distractor_annotations"
os.makedirs(dump_root, exist_ok=True)

for idx, (user_prompt, annotation) in enumerate(zip(user_prompts, all_annotations)):
    with open(f"{dump_root}/{idx+2}.md", "w") as f:
        f.write(user_prompt + "\n" + annotation['content'])