In [None]:
# Direct evaluation: score each biomedical hypothesis on four metrics (novelty, relevance, significance, verifiability) and write the results to a CSV file.
import json
import csv
import logging
import backoff
import re
from statistics import mode, mean
from pathlib import Path
import openai
import autogen

# Model settings for the direct-scoring cell.
# Only "chat_model", "temperature" and "max_output_tokens" are read by
# call_openai_metrics; the remaining keys are kept as-is.
config_list = autogen.config_list_from_models(model_list=["gpt-4.1"])
gpt_config = dict(
    chat_model="gpt-4.1",
    cache_seed=42,
    temperature=0.7,
    config_list=config_list,
    timeout=540000,
    max_output_tokens=1500,
)

# Logger: one stream handler with a timestamped format, INFO level.
logger = logging.getLogger(__name__)
# Drop handlers left over from a previous run of this cell so messages are
# not emitted once per re-execution.
logger.handlers.clear()
_log_format = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s")
handler = logging.StreamHandler()
handler.setFormatter(_log_format)
logger.setLevel(logging.INFO)
logger.addHandler(handler)


# Prompt template for direct (single-hypothesis) scoring.
# It is rendered with str.format(background=..., hypothesis=...), so the doubled
# braces ({{ }}) below become literal braces of the JSON example in the prompt.
# NOTE(review): each metric definition ends with "Output is an integer" while the
# return format asks for <float> scores — confirm which scale is intended.
METRICS_PROMPT = r"""
You are a senior biomedical reviewer.

Task:
Evaluate the following hypothesis by assigning a score for each metric (Novelty, Relevance, Significance, Verifiability) and providing a concise reason (<=15 words).

Metric definitions:
Novelty: Evaluate the novelty of the generated scientific hypothesis. The score range should be 0 to 3. 0 means there's no novelty, which indicates that the hypothesis is a paraphrase of the input. 1 means there's slight novelty. 2 means there's moderate novelty. 3 means the hypothesis has strong novelty, which gives new insights beyond the background. Output is an integer.
Relevance: Evaluate the relevance of the generated scientific hypothesis. The score range should be 0 to 3. 0 means there's no relevance. 1 means there's slight relevance. 2 means there's moderate relevance. 3 means they are strongly related. Output is an integer.
Significance: Evaluate the significance of the generated scientific hypothesis. The score range should be 0 to 3. 0 means there's no significance, which indicates that the hypothesis is just a common knowledge. 1 means there's slight significance. 2 means there's moderate significance. 3 means the hypothesis has strong significance, which gives significant insights beyond the background. Output is an integer.
Verifiability: Evaluate the verifiability of the generated scientific hypothesis. The score range should be 0 to 3. 0 means there's no verifiability, which indicates that the hypothesis is not possible to be verified in future work. 1 means there's slight verifiability. 2 means there's moderate verifiability. 3 means the hypothesis has strong verifiability, which means the hypothesis is very likely to be verified in future work. Output is an integer.

Return format:
{{
  "novelty":{{"score":<float>,"reason":"≤15 words"}},
  "relevance":{{"score":<float>,"reason":"≤15 words"}},
  "significance":{{"score":<float>,"reason":"≤15 words"}},
  "verifiability":{{"score":<float>,"reason":"≤15 words"}}
}}

Background: {background}

Hypothesis: {hypothesis}
"""

@backoff.on_exception(backoff.expo, openai.OpenAIError, max_time=90)
def call_openai_metrics(bg: str, hp: str, samples: int) -> list:
    """Ask the chat model to score one hypothesis and return the parsed replies.

    A single API request is sent with ``n=samples`` so that several
    independent completions come back at once.

    Args:
        bg: Background text substituted into ``METRICS_PROMPT``.
        hp: Hypothesis text substituted into ``METRICS_PROMPT``.
        samples: Number of completions requested from the API.

    Returns:
        One dict per completion whose content parsed as a JSON object.
        Completions that are empty, not valid JSON, or not a JSON object are
        logged and dropped, so the list may be shorter than ``samples`` or
        empty.

    Raises:
        openai.OpenAIError: propagated once the ``backoff`` decorator stops
            retrying (it is configured with ``max_time=90``).
    """
    prompt = METRICS_PROMPT.format(background=bg, hypothesis=hp)
    resp = openai.chat.completions.create(
        model=gpt_config["chat_model"],
        messages=[
            {"role": "system", "content": "Use half-point scale."},
            {"role": "user",   "content": prompt}
        ],
        temperature=gpt_config["temperature"],
        max_tokens=gpt_config["max_output_tokens"],
        n=samples
    )
    out = []
    for c in resp.choices:
        # The content can be None; treat that as an empty reply instead of
        # letting re.sub raise TypeError.
        raw = c.message.content or ""
        # Strip Markdown code fences the model may wrap around the JSON.
        txt = re.sub(r"```(?:json)?", "", raw).strip()
        try:
            parsed = json.loads(txt)
        except json.JSONDecodeError:
            logger.warning("Still bad metrics JSON: %s", txt[:120])
            continue
        if not isinstance(parsed, dict):
            # Valid JSON but not an object (e.g. a bare number or list):
            # downstream code expects a mapping of metric name -> result.
            logger.warning("Metrics JSON is not an object: %s", txt[:120])
            continue
        out.append(parsed)
    return out

def aggregate_metrics(samples: list) -> dict:
    """Collapse several sampled scorings into one score per metric.

    For each of the four metrics, the numeric ``score`` values found in
    ``samples`` are combined as follows: if the most common value occurs more
    than once it is used as-is; otherwise the mean is rounded to the nearest
    half point.

    Args:
        samples: Parsed model replies. Each usable entry is a dict shaped like
            ``{"novelty": {"score": 2, ...}, ...}``. Entries that are not
            dicts, metrics that are missing, and scores that are not real
            numbers (including booleans) are ignored.

    Returns:
        A dict with the keys ``novelty``, ``relevance``, ``significance`` and
        ``verifiability``; a value is ``None`` when no usable score exists.
    """
    final = {}
    for metric in ("novelty", "relevance", "significance", "verifiability"):
        scores = []
        for sample in samples:
            # A reply can be valid JSON without being an object.
            if not isinstance(sample, dict):
                continue
            entry = sample.get(metric)
            if not isinstance(entry, dict):
                continue
            score = entry.get("score")
            # bool is a subclass of int; true/false is not a score.
            if isinstance(score, bool) or not isinstance(score, (int, float)):
                continue
            scores.append(score)

        if not scores:
            final[metric] = None
            continue

        most_common = mode(scores)
        if scores.count(most_common) > 1:
            final[metric] = most_common
        else:
            # No value repeats: fall back to the mean on a half-point grid.
            final[metric] = round(mean(scores) * 2) / 2
    return final

def load_entries_multi(path: Path):
    """Read a JSONL file and return one (background, hypothesis) pair per hypothesis.

    Each non-blank line is parsed as a JSON object with the optional keys
    ``disease``, ``core_genes`` and ``hypotheses``. The background string is
    built from the disease and the comma-joined core genes and is repeated
    for every hypothesis on that line.

    Args:
        path: Location of the JSONL file.

    Returns:
        A list of ``(background, hypothesis)`` tuples with hypotheses stripped
        of surrounding whitespace and empty ones omitted. Returns ``[]`` when
        ``path`` does not exist. Lines that cannot be parsed or have an
        unexpected shape are reported on stdout and skipped.
    """
    if not path.exists():
        print(f"file {path} Nothing exists")
        return []
    out = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Blank lines are not records; skip them silently.
                continue
            try:
                o = json.loads(line)
                disease = o.get("disease", "")
                # `or []` also covers an explicit JSON null.
                core_genes = o.get("core_genes") or []
                if isinstance(core_genes, str):
                    # A bare string would otherwise be joined character by character.
                    core_genes = [core_genes]
                background = f"Disease: {disease}\nCore genes: {', '.join(core_genes)}"
                hypos = o.get("hypotheses") or []
                if isinstance(hypos, str):
                    # A single hypothesis given as a string, not a list:
                    # iterating it directly would yield single characters.
                    hypos = [hypos]
                for hypo in hypos:
                    h = hypo.strip()
                    if h:
                        out.append((background, h))
            except (ValueError, AttributeError, TypeError) as e:
                # ValueError covers json.JSONDecodeError; the other two cover
                # records whose fields are not of the expected type.
                print("Line parsing error:", e)
    print(f"Successfully generated {len(out)} hypothesis records")
    return out

# Source hypotheses (JSONL), destination scores (CSV), and the number of
# completions requested per hypothesis.
input_path = Path(r"../data/data_raw/raw_baseline.jsonl")
output_path = Path("direct_baseline.csv")
samples = 3

entries = load_entries_multi(input_path)

with open(output_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["index", "novelty", "relevance", "significance", "verifiability"])
    # One CSV row per (background, hypothesis) pair, numbered from 1.
    for idx, pair in enumerate(entries, start=1):
        bg, hp = pair
        sampled = call_openai_metrics(bg, hp, samples)
        m = aggregate_metrics(sampled)
        writer.writerow(
            [idx] + [m[key] for key in ("novelty", "relevance", "significance", "verifiability")]
        )
        # Flush after every row so completed rows are on disk if a later call fails.
        f.flush()
        logger.info(f"[{idx}] Novelty={m['novelty']} | Relevance={m['relevance']} | Significance={m['significance']} | Verifiability={m['verifiability']}")

logger.info(f"Saved {output_path}")


In [None]:
# Pairwise evaluation of two biomedical hypotheses on four metrics (novelty, relevance, significance, verifiability); outputs comparison results for A vs. B.
import json
import openai
import backoff
import re
import logging

import autogen
# Model settings for the pairwise-comparison cell.
# Building the autogen config list is best-effort: any failure leaves it empty.
config_list = []
try:
    config_list = autogen.config_list_from_models(model_list=["gpt-4.1"])
except Exception:
    config_list = []

gpt_config = dict(
    model_name="gpt-4.1",
    temperature=0.2,
    top_p=1.0,
    max_tokens=400,
    cache_seed=42,
    config_list=config_list,
)

# Logger for this cell: start from a clean handler list so re-running the
# cell does not stack handlers, then attach one timestamped stream handler.
logger = logging.getLogger(__name__)
del logger.handlers[:]
handler = logging.StreamHandler()
_formatter = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s")
handler.setFormatter(_formatter)
logger.setLevel(logging.INFO)
logger.addHandler(handler)

# Names of the four comparison metrics. They are the keys requested in the
# model's JSON reply and the keys read back when collecting winners.
metrics = ["Novelty", "Relevance", "Significance", "Verifiability"]
# Per-metric definition texts, in the same order as `metrics`.
# call_all_metrics interpolates entries 0-3 into its system prompt under
# "Definitions:".
# NOTE(review): each text also asks for 0-3 scores, while the system prompt in
# call_all_metrics only requests a winner and a reason — confirm this is intended.
metric_defs = [
    # Novelty
    """
You are an expert in biomedicine.

Evaluate the novelty of two scientific hypotheses (H_A and H_B) given the user input.

For each, assign a novelty score from 0 to 3. 0 means there's no novelty, which indicates that the hypothesis is a paraphrase of the background. 1 means there's slight novelty. 2 means there's moderate novelty. 3 means the hypothesis has strong novelty, which gives new insights beyond the background. Score two hypotheses and compare which one is more novel(“A”, “B”, or “0” if equal or difference is unclear) 

""",

    # Relevance
    """
 You are an expert in biomedicine.

Evaluate the relevance of two scientific hypotheses (H_A and H_B) given the user input.

For each, assign a relevance score from 0 to 3. 0 means there's no relevance. 1 means there's slight relevance. 2 means there's moderate relevance. 3 means the hypothesis is strongly related to the background. Score both hypotheses and compare which one is more relevant (“A”, “B”, or “0” if equal or difference is unclear)

""",

    # Significance
    """
You are an expert in biomedicine.

Evaluate the significance of two scientific hypotheses (H_A and H_B) given the user input.

For each, assign a significance score from 0 to 3. 0 means there's no significance, which indicates that the hypothesis is just common knowledge. 1 means there's slight significance. 2 means there's moderate significance. 3 means the hypothesis has strong significance, providing significant insights beyond the background. Score both hypotheses and compare which one is more significant (“A”, “B”, or “0” if equal or difference is unclear)

""",

    # Verifiability
    """
You are an expert in biomedicine.

Evaluate the verifiability of two scientific hypotheses (H_A and H_B) given the user input.

For each, assign a verifiability score from 0 to 3. 0 means there's no verifiability, which indicates that the hypothesis is not possible to be verified in future work. 1 means there's slight verifiability. 2 means there's moderate verifiability. 3 means the hypothesis has strong verifiability, which means it is very likely to be verified in future work. Score both hypotheses and compare which one is more verifiable (“A”, “B”, or “0” if equal or difference is unclear)

"""
]

@backoff.on_exception(backoff.expo, openai.OpenAIError, max_time=60)
def call_all_metrics(bg: str, h_a: str, h_b: str, metrics: list, metric_defs: list):
    """Compare two hypotheses on every metric with a single chat completion.

    Args:
        bg: Background text shown to the model as "User Input".
        h_a: First hypothesis (H_A).
        h_b: Second hypothesis (H_B).
        metrics: Metric names expected as keys of the model's JSON reply.
        metric_defs: Definition texts; entries 0-3 are inserted into the
            system prompt for Novelty, Relevance, Significance and
            Verifiability respectively.

    Returns:
        A dict with an entry for every name in ``metrics``; each entry is a
        dict containing at least ``"winner"``. If the reply is not a JSON
        object, every metric is a tie (``"0"``) with reason "Parse failed".
        If the reply parses but an individual metric is missing or has no
        ``"winner"``, only that metric is replaced by a tie.

    Raises:
        openai.OpenAIError: propagated once the ``backoff`` decorator stops
            retrying (it is configured with ``max_time=60``).
    """
    SYSTEM_METRIC = {
        "role": "system",
        "content": f"""
You are a senior biomedical reviewer. Compare two hypotheses H_A and H_B on **four metrics**: Novelty, Relevance, Significance, Verifiability.

Instructions:
- For each metric, judge and select a winner:
    - "A" if H_A is clearly superior,
    - "B" if H_B is clearly superior,
    - "0" if they are equal or difference is unclear.
- For each, give a concise reason .
- Each metric is judged strictly independently.

Definitions:
Novelty: {metric_defs[0]}
Relevance: {metric_defs[1]}
Significance: {metric_defs[2]}
Verifiability: {metric_defs[3]}

Return **exactly one JSON** object:
{{
  "Novelty": {{"winner": "A"|"B"|"0", "reason": "..." }},
  "Relevance": {{"winner": "A"|"B"|"0", "reason": "..." }},
  "Significance": {{"winner": "A"|"B"|"0", "reason": "..." }},
  "Verifiability": {{"winner": "A"|"B"|"0", "reason": "..." }}
}}
No extra explanation.
""".strip()
    }
    user = {
        "role": "user",
        "content": f"User Input: {bg}\n\nH_A: {h_a}\nH_B: {h_b}"
    }
    resp = openai.chat.completions.create(
        model       = gpt_config["model_name"],
        messages    = [SYSTEM_METRIC, user], # type: ignore
        temperature = gpt_config["temperature"],
        top_p       = gpt_config["top_p"],
        # Read the limit from the shared config; 400 is the previous literal.
        max_tokens  = gpt_config.get("max_tokens", 400)
    )
    txt = resp.choices[0].message.content or ""
    # Strip Markdown code fences the model may wrap around the JSON.
    txt = re.sub(r"```(?:json)?", "", txt).strip()
    try:
        js = json.loads(txt)
    except json.JSONDecodeError:
        js = None
    if not isinstance(js, dict):
        # Either invalid JSON or valid JSON that is not an object.
        logger.warning("Multi-metric parse failed: %r", txt)
        return {m: {"winner": "0", "reason": "Parse failed"} for m in metrics}

    # The reply parsed, but callers index result[m]["winner"] for every
    # metric, so make sure each one is present and well-formed.
    for m in metrics:
        entry = js.get(m)
        if not (isinstance(entry, dict) and "winner" in entry):
            logger.warning("Metric %s missing from reply; recording a tie", m)
            js[m] = {"winner": "0", "reason": "Missing verdict"}
    return js


def format_background(disease, core_genes):
    """Build the one-line background string shown to the model.

    Args:
        disease: Disease name, inserted as-is.
        core_genes: Iterable of gene-name strings, joined with ", ".

    Returns:
        A string of the form ``"disease: <disease>, core_genes: <g1, g2, ...>"``.
    """
    gene_text = ', '.join(core_genes)
    return f"disease: {disease}, core_genes: {gene_text}"
def get_first_hypothesis(item):
    """Return the first hypothesis stored under ``item["hypotheses"]``.

    Args:
        item: Mapping for one record; ``"hypotheses"`` may be a list or a
            single string.

    Returns:
        The first list element, or the string itself, with surrounding
        whitespace removed. A non-string first element is returned unchanged.
        Returns ``""`` when the key is missing, the list is empty, the string
        is blank, or the value has any other type.
    """
    hypos = item.get("hypotheses", [])
    if isinstance(hypos, list) and hypos:
        first = hypos[0]
        # Strip list entries too, so both input shapes yield the same text.
        return first.strip() if isinstance(first, str) else first
    if isinstance(hypos, str):
        return hypos.strip()
    return ""
# Pairwise run: line i of file A is compared with line i of file B, and one
# JSON list of winners (in the order of `metrics`) is written per pair.
if __name__ == "__main__":
    input_path_a = r"../data/data_raw/raw_baseline.jsonl"
    input_path_b = r"../data/data_raw/raw_Multiagent.jsonl"
    output_path = "baseline_vs_Multiagent.jsonl"

    with open(input_path_a, "r", encoding="utf-8") as f_a, \
         open(input_path_b, "r", encoding="utf-8") as f_b, \
         open(output_path, "w", encoding="utf-8") as fout:

        # zip() stops at the shorter file; surplus lines in the other are ignored.
        for idx, (line_a, line_b) in enumerate(zip(f_a, f_b), 1):

            item_a = json.loads(line_a)
            item_b = json.loads(line_b)
            # The background is taken from file A's record only.
            disease = item_a.get("disease", "")
            core_genes = item_a.get("core_genes", [])
            bg = format_background(disease, core_genes)
            hypo_a = get_first_hypothesis(item_a)
            hypo_b = get_first_hypothesis(item_b)

            all_metric_result = call_all_metrics(bg, hypo_a, hypo_b, metrics, metric_defs)

            # A reply may parse as JSON yet omit a metric or its "winner";
            # record a tie ("0") for that metric instead of aborting the run
            # with KeyError/TypeError after earlier pairs were already paid for.
            winner_list = []
            for m in metrics:
                verdict = all_metric_result.get(m) if isinstance(all_metric_result, dict) else None
                if isinstance(verdict, dict) and "winner" in verdict:
                    winner_list.append(verdict["winner"])
                else:
                    logger.warning("Pair %d: no winner for %s; recording a tie", idx, m)
                    winner_list.append("0")

            print(f"Pair {idx}:", winner_list)
            fout.write(json.dumps(winner_list, ensure_ascii=False) + "\n")
            # Flush per pair so finished results are on disk if a later call fails.
            fout.flush()
