In [1]:
%cd ~/synth-persona

import logging
import os
import re
import sys
from abc import ABC, abstractmethod
from time import time

import pandas as pd
from datasets import Dataset, load_dataset
from litellm import batch_completion
from tqdm import tqdm
from vllm import LLM, SamplingParams

/home/susumu.ota/synth-persona


  from .autonotebook import tqdm as notebook_tqdm
2025-03-10 02:19:25,620	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger.setLevel("DEBUG")

In [3]:
class LanguageModel(ABC):
    def __init__(self, model: str, max_tokens=512, seed: int | None = None, temperature=1.0, top_p=0.95) -> None:
        self.model = model
        self.max_tokens = max_tokens
        self.seed = seed
        self.temperature = temperature
        self.top_p = top_p
        logger.debug(
            f"model: {model}, max_tokens: {max_tokens}, seed: {seed}, temperature: {temperature}, top_p: {top_p}"
        )

    @abstractmethod
    def __call__(self, messages_batch: list[list[dict[str, str]]]) -> list[str]:
        pass


class LiteLLMModel(LanguageModel):
    def __init__(self, model: str, max_tokens=512, seed: int | None = None, temperature=1.0, top_p=0.95) -> None:
        super().__init__(model, max_tokens, seed, temperature, top_p)

    def __call__(self, messages_batch: list[list[dict[str, str]]]) -> list[str]:
        contents = [
            response.choices[0].message.content or ""
            for response in batch_completion(
                model=self.model,
                messages=messages_batch,
                max_tokens=self.max_tokens,
                seed=self.seed,
                temperature=self.temperature,
                top_p=self.top_p,
            )
        ]
        assert len(contents) == len(messages_batch)
        return contents


class VLLMModel(LanguageModel):
    def __init__(self, model: str, max_tokens=512, seed: int | None = None, temperature=1.0, top_p=0.95) -> None:
        super().__init__(model, max_tokens, seed, temperature, top_p)
        self.vllm = LLM(model, seed=seed, gpu_memory_utilization=1.0, max_model_len=32 * 1024)  # TODO: parameterize
        self.tokenizer = self.vllm.get_tokenizer()

    def __call__(self, messages_batch: list[list[dict[str, str]]]) -> list[str]:
        sampling_params = SamplingParams(
            max_tokens=self.max_tokens, seed=self.seed, temperature=self.temperature, top_p=self.top_p
        )
        prompts = [
            self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            for messages in messages_batch
        ]
        outputs = self.vllm.generate(prompts, sampling_params=sampling_params, use_tqdm=False)
        contents = [output.outputs[0].text for output in outputs]
        assert len(contents) == len(messages_batch)
        return contents

In [4]:
SYSTEM_PROMPT = 'Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. You should choose the assistant that follows the user\'s instructions and answers the user\'s question better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. Begin your evaluation by comparing the two responses and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: "[[A]]" if assistant A is better, "[[B]]" if assistant B is better, and "[[C]]" for a tie.'
SYSTEM_PROMPT_RENAME = 'Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. You should choose the assistant that follows the user\'s instructions and answers the user\'s question better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. Begin your evaluation by comparing the two responses and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: "[[B]]" if assistant B is better, "[[A]]" if assistant A is better, and "[[C]]" for a tie.'

PROMPT_TEMPLATE = "[User Question]\n{question}\n\n[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]"
PROMPT_TEMPLATE_RENAME = "[User Question]\n{question}\n\n[The Start of Assistant B's Answer]\n{answer_a}\n[The End of Assistant B's Answer]\n\n[The Start of Assistant A's Answer]\n{answer_b}\n[The End of Assistant A's Answer]"

In [5]:
# answers need to be a list of dictionaries with keys "question", "answer_a", and "answer_b"
def generate_decision_texts(
    llm: LanguageModel, answers: list[dict[str, str]], system_prompt: str, prompt_template: str
) -> list[str]:
    prompts = [
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt_template.format(**answer)},
        ]
        for answer in answers
    ]
    return llm(prompts)

In [6]:
QUESTION_TEMPLATE = """Create a math problem related to the following persona:

{persona}

Note:

1. The math problem should be short, easy and involve basic mathematical skills and knowledge. Any average grade school student can solve it correctly.
2. You should make full use of the persona description to create the math problem to ensure that the math problem is unique and specific to the persona.
3. Your response should not include a solution to the created math problem.
4. 深い専門知識が必要な問題を避け、平均的な知識と常識の範囲内で解ける問題にしてください。
5. 簡潔に日本語で回答してください。
"""

In [7]:
def convert_answers(dataset_a: Dataset, dataset_b: Dataset, question_template: str) -> list[dict[str, str]]:
    return [
        {
            "question": question_template.format(persona=a["persona"]),
            "answer_a": a["problem_answer"],
            "answer_b": b["problem_answer"],
        }
        for a, b in zip(dataset_a, dataset_b)
    ]

In [8]:
def parse_response(text: str) -> dict[str, str]:
    m = re.match(r"^(<think>)?(.*)</think>(.*)$", text, flags=re.DOTALL | re.MULTILINE)
    return (
        {"think": m.group(2), "answer": m.group(3), "format_reward": 1.0}
        if m
        else {"think": "", "answer": text, "format_reward": 0.0}
    )

In [9]:
def parse_decision_text(decision_text: str) -> str:
    is_a = is_b = is_c = False
    if "[[A]]" in decision_text:
        is_a = True
    if "[[B]]" in decision_text:
        is_b = True
    if "[[C]]" in decision_text:
        is_c = True
    decision = "C"
    if is_a and is_b:
        # raise ValueError(f"Both A and B are chosen: {decision_text}")
        logger.debug(f"Both A and B are chosen: {decision_text}")
        decision = "C"
    elif is_a:
        decision = "A"
    elif is_b:
        decision = "B"
    elif is_c:
        decision = "C"
    else:
        # raise ValueError(f"Unknown decision: {decision_text}")
        logger.debug(f"Unknown decision: {decision_text}")
        decision = "C"
    # logger.debug(f"decision: {decision}")
    return decision

In [10]:
def judge_answers(
    llm: LanguageModel, answers: list[dict[str, str]], system_prompt: str, prompt_template: str
) -> list[str]:
    return [
        parse_decision_text(parse_response(decision_text)["answer"])
        for decision_text in generate_decision_texts(llm, answers, system_prompt, prompt_template)
    ]

In [33]:
def count_decisions(decisions: list[str]) -> dict[str, int]:
    return {decision: decisions.count(decision) for decision in ["A", "B", "C"]}

In [12]:
def swap_answers(answers: list[dict[str, str]]) -> list[dict[str, str]]:
    return [{"question": a["question"], "answer_a": a["answer_b"], "answer_b": a["answer_a"]} for a in answers]

In [13]:
def swap_decisions(decisions: list[str]) -> list[str]:
    return ["A" if d == "B" else "B" if d == "A" else "C" for d in decisions]

In [14]:
def compare_decisions(
    decisions: list[str], swapped_decisions: list[str], is_position_swapped: bool
) -> dict[str, dict[str, int]]:
    count = {"A": 0, "B": 0, "C": 0, "bias": 0}
    bias = {"first": 0, "second": 0}
    for d, sd in zip(decisions, swapped_decisions):
        if d == sd:
            count[d] += 1
        else:
            count["bias"] += 1
            if d == "A" and (sd == "B" or sd == "C"):
                # if is_position_swapped is True,  sd == "B" means that LLM prefers the first answer which is denoted as "first"
                # if is_position_swapped is False, sd == "B" means that LLM prefers the name "B" which is denoted as "second"
                bias["first" if is_position_swapped else "second"] += 1
            elif d == "B" and (sd == "A" or sd == "C"):
                bias["second" if is_position_swapped else "first"] += 1
            elif d == "C" and sd == "A":
                bias["second" if is_position_swapped else "first"] += 1
            elif d == "C" and sd == "B":
                bias["first" if is_position_swapped else "second"] += 1
            else:
                raise ValueError(f"Unknown decision: {d}, {sd}")
    return {"count": count, "bias": bias}

In [15]:
def get_unbiased_decisions(
    decisions: list[str], position_swapped_decisions: list[str], name_swapped_decisions: list[str]
) -> dict[str, dict[str, int]]:
    assert len(decisions) == len(position_swapped_decisions) == len(name_swapped_decisions)
    return [
        d if d == pd == nd else "C" for d, pd, nd in zip(decisions, position_swapped_decisions, name_swapped_decisions)
    ]

In [16]:
def get_bias(comparison):
    total = sum(comparison["count"].values())
    bias = comparison["count"]["bias"]
    first = comparison["bias"]["first"]
    second = comparison["bias"]["second"]
    assert bias == first + second
    return {
        "consistent": (total - bias) / total,
        "first": first / total,
        "second": second / total,
    }

In [17]:
def get_win_rate(decisions: list[str]) -> dict[str, float]:
    count = count_decisions(decisions)
    total = sum(count.values())
    decision_a = count.get("A", 0)
    decision_b = count.get("B", 0)
    decision_c = count.get("C", 0)
    assert total == decision_a + decision_b + decision_c
    return {
        "A": decision_a / (total - decision_c),
        "B": decision_b / (total - decision_c),
    }

In [18]:
def run_experiment_once(judge_llm: LanguageModel, answers: list[dict[str, str]]) -> dict[str, dict]:
    logger.debug("judging baseline...")
    time_start = time()
    decisions = judge_answers(judge_llm, answers, SYSTEM_PROMPT, PROMPT_TEMPLATE)
    logger.debug(f"judging baseline...done: {time() - time_start:.2f}s")

    logger.debug("judging position swapped...")
    time_start = time()
    position_swapped_decisions = swap_decisions(
        judge_answers(judge_llm, swap_answers(answers), SYSTEM_PROMPT, PROMPT_TEMPLATE)
    )
    logger.debug(f"judging position swapped...done: {time() - time_start:.2f}s")

    logger.debug("judging name swapped...")
    time_start = time()
    name_swapped_decisions = swap_decisions(
        judge_answers(judge_llm, answers, SYSTEM_PROMPT_RENAME, PROMPT_TEMPLATE_RENAME)
    )
    logger.debug(f"judging name swapped...done: {time() - time_start:.2f}s")

    unbiased_decisions = get_unbiased_decisions(decisions, position_swapped_decisions, name_swapped_decisions)

    position_comparison = compare_decisions(decisions, position_swapped_decisions, True)
    name_comparison = compare_decisions(decisions, name_swapped_decisions, False)

    position_bias = get_bias(position_comparison)
    name_bias = get_bias(name_comparison)

    return {
        "decisions": decisions,
        "position_swapped_decisions": position_swapped_decisions,
        "name_swapped_decisions": name_swapped_decisions,
        "unbiased_decisions": unbiased_decisions,
        "decisions_count": count_decisions(decisions),
        "position_swapped_decisions_count": count_decisions(position_swapped_decisions),
        "name_swapped_decisions_count": count_decisions(name_swapped_decisions),
        "unbiased_decisions_count": count_decisions(unbiased_decisions),
        "decisions_win_rate": get_win_rate(decisions),
        "position_swapped_decisions_win_rate": get_win_rate(position_swapped_decisions),
        "name_swapped_decisions_win_rate": get_win_rate(name_swapped_decisions),
        "unbiased_decisions_win_rate": get_win_rate(unbiased_decisions),
        "position_comparison": position_comparison,
        "name_comparison": name_comparison,
        "position_bias": position_bias,
        "name_bias": name_bias,
    }

In [19]:
def run_experiment(judge_model: str, input_1_jsonl: str, input_2_jsonl: str, n: int) -> dict[str, pd.DataFrame]:
    judge_llm = LiteLLMModel(judge_model, max_tokens=4096, seed=1, temperature=0.6, top_p=0.95)

    dataset_1 = load_dataset("json", data_files=input_1_jsonl, split="train", cache_dir=".cache")
    dataset_2 = load_dataset("json", data_files=input_2_jsonl, split="train", cache_dir=".cache")
    answers = convert_answers(dataset_1, dataset_2, QUESTION_TEMPLATE)

    results = [run_experiment_once(judge_llm, answers) for _ in tqdm(range(n))]
    return {key: pd.DataFrame([result[key] for result in results]) for key in results[0].keys()}

In [20]:
os.environ["DEEPSEEK_API_BASE"] = "http://192.168.11.25:39877/v1"
judge_model = "deepseek/deepseek-reasoner"
input_1_jsonl = "misc/data/output-r1-46023.jsonl"
input_2_jsonl = "misc/data/output-gpt-4o-mini-46023.jsonl"
n = 8

dfs = run_experiment(judge_model, input_1_jsonl, input_2_jsonl, n)
dfs

2025-03-09 11:04:49 - DEBUG - __main__ - model: deepseek/deepseek-reasoner, max_tokens: 4096, seed: 1, temperature: 0.6, top_p: 0.95


  0%|          | 0/8 [00:00<?, ?it/s]

2025-03-09 11:04:50 - DEBUG - __main__ - judging baseline...
2025-03-09 11:07:05 - DEBUG - __main__ - judging baseline...done: 134.66s
2025-03-09 11:07:05 - DEBUG - __main__ - judging position swapped...
2025-03-09 11:09:24 - DEBUG - __main__ - judging position swapped...done: 139.25s
2025-03-09 11:09:24 - DEBUG - __main__ - judging name swapped...
2025-03-09 11:11:28 - DEBUG - __main__ - judging name swapped...done: 124.23s


 12%|█▎        | 1/8 [06:38<46:26, 398.14s/it]

2025-03-09 11:11:28 - DEBUG - __main__ - judging baseline...
2025-03-09 11:13:19 - DEBUG - __main__ - judging baseline...done: 110.63s
2025-03-09 11:13:19 - DEBUG - __main__ - judging position swapped...
2025-03-09 11:14:56 - DEBUG - __main__ - judging position swapped...done: 97.17s
2025-03-09 11:14:56 - DEBUG - __main__ - judging name swapped...
2025-03-09 11:16:44 - DEBUG - __main__ - judging name swapped...done: 107.59s


 25%|██▌       | 2/8 [11:53<34:56, 349.46s/it]

2025-03-09 11:16:44 - DEBUG - __main__ - judging baseline...
2025-03-09 11:19:27 - DEBUG - __main__ - judging baseline...done: 163.50s
2025-03-09 11:19:27 - DEBUG - __main__ - judging position swapped...
2025-03-09 11:21:35 - DEBUG - __main__ - judging position swapped...done: 128.06s
2025-03-09 11:21:35 - DEBUG - __main__ - judging name swapped...
2025-03-09 11:23:30 - DEBUG - __main__ - judging name swapped...done: 114.54s


 38%|███▊      | 3/8 [18:39<31:16, 375.33s/it]

2025-03-09 11:23:30 - DEBUG - __main__ - judging baseline...
2025-03-09 11:25:15 - DEBUG - __main__ - judging baseline...done: 104.95s
2025-03-09 11:25:15 - DEBUG - __main__ - judging position swapped...
2025-03-09 11:27:18 - DEBUG - __main__ - judging position swapped...done: 122.85s
2025-03-09 11:27:18 - DEBUG - __main__ - judging name swapped...
2025-03-09 11:29:34 - DEBUG - __main__ - judging name swapped...done: 136.07s


 50%|█████     | 4/8 [24:43<24:43, 370.80s/it]

2025-03-09 11:29:34 - DEBUG - __main__ - judging baseline...
2025-03-09 11:31:29 - DEBUG - __main__ - judging baseline...done: 115.24s
2025-03-09 11:31:29 - DEBUG - __main__ - judging position swapped...
2025-03-09 11:33:28 - DEBUG - __main__ - judging position swapped...done: 119.34s
2025-03-09 11:33:28 - DEBUG - __main__ - judging name swapped...
2025-03-09 11:35:29 - DEBUG - __main__ - judging name swapped...done: 120.44s


 62%|██████▎   | 5/8 [30:38<18:15, 365.11s/it]

2025-03-09 11:35:29 - DEBUG - __main__ - judging baseline...
2025-03-09 11:37:37 - DEBUG - __main__ - judging baseline...done: 128.42s
2025-03-09 11:37:37 - DEBUG - __main__ - judging position swapped...
2025-03-09 11:39:18 - DEBUG - __main__ - judging position swapped...done: 100.85s
2025-03-09 11:39:18 - DEBUG - __main__ - judging name swapped...
2025-03-09 11:41:30 - DEBUG - __main__ - judging name swapped...done: 132.31s


 75%|███████▌  | 6/8 [36:40<12:07, 363.91s/it]

2025-03-09 11:41:30 - DEBUG - __main__ - judging baseline...
2025-03-09 11:43:44 - DEBUG - __main__ - judging baseline...done: 133.89s
2025-03-09 11:43:44 - DEBUG - __main__ - judging position swapped...
2025-03-09 11:46:00 - DEBUG - __main__ - judging position swapped...done: 135.52s
2025-03-09 11:46:00 - DEBUG - __main__ - judging name swapped...
2025-03-09 11:48:18 - DEBUG - __main__ - judging name swapped...done: 138.40s


 88%|████████▊ | 7/8 [43:27<06:18, 378.26s/it]

2025-03-09 11:48:18 - DEBUG - __main__ - judging baseline...
2025-03-09 11:50:20 - DEBUG - __main__ - judging baseline...done: 121.43s
2025-03-09 11:50:20 - DEBUG - __main__ - judging position swapped...
2025-03-09 11:52:01 - DEBUG - __main__ - judging position swapped...done: 101.64s
2025-03-09 11:52:01 - DEBUG - __main__ - judging name swapped...
2025-03-09 11:53:55 - DEBUG - __main__ - judging name swapped...done: 113.45s


100%|██████████| 8/8 [49:04<00:00, 368.06s/it]


{'decisions':   0  1  2  3  4  5  6  7  8  9   ... 90 91 92 93 94 95 96 97 98 99
 0  A  A  B  B  B  A  B  A  A  B  ...  A  A  A  A  B  A  B  A  B  B
 1  A  A  B  B  B  B  B  B  A  A  ...  A  A  A  A  B  A  B  A  A  B
 2  A  B  B  B  B  A  C  A  A  B  ...  A  A  A  B  A  A  B  A  A  B
 3  A  B  B  B  A  A  B  A  A  A  ...  A  A  A  A  B  A  B  A  B  B
 4  A  A  A  B  A  A  C  B  A  A  ...  A  A  A  B  B  A  B  A  B  B
 5  A  A  B  B  A  A  B  A  A  A  ...  A  A  A  A  A  A  B  A  A  B
 6  A  A  A  B  A  A  B  B  A  C  ...  A  A  A  B  B  B  B  A  A  B
 7  A  A  A  B  A  A  A  B  A  B  ...  A  A  A  A  B  A  B  A  B  B
 
 [8 rows x 100 columns],
 'position_swapped_decisions':   0  1  2  3  4  5  6  7  8  9   ... 90 91 92 93 94 95 96 97 98 99
 0  B  B  B  B  B  B  B  B  A  B  ...  B  A  A  B  B  A  B  B  B  B
 1  A  B  B  B  B  B  B  B  A  B  ...  A  A  A  A  B  B  B  B  B  B
 2  B  B  B  B  B  B  C  B  A  B  ...  B  A  A  B  B  B  B  B  B  B
 3  B  B  B  B  B  B  B  B  A  B  ...  B  A  C

In [26]:
dfs.keys()

dict_keys(['decisions', 'position_swapped_decisions', 'name_swapped_decisions', 'unbiased_decisions', 'decisions_count', 'position_swapped_decisions_count', 'name_swapped_decisions_count', 'unbiased_decisions_count', 'decisions_win_rate', 'position_swapped_decisions_win_rate', 'name_swapped_decisions_win_rate', 'unbiased_decisions_win_rate', 'position_comparison', 'name_comparison', 'position_bias', 'name_bias'])

In [2]:
output_dir = "misc/data"
prefix = "results-r1-46023-vs-gpt-4o-mini-46023"

In [None]:
# save the results
os.makedirs(output_dir, exist_ok=True)
for key, df in dfs.items():
    df.to_csv(os.path.join(output_dir, f"{prefix}-{key}.csv"), index=False)

In [3]:
# read from the saved results
dfs = {}
prefix = "results-r1-46023-vs-gpt-4o-mini-46023"
for key in [
    "decisions",
    "position_swapped_decisions",
    "name_swapped_decisions",
    "unbiased_decisions",
    "decisions_count",
    "position_swapped_decisions_count",
    "name_swapped_decisions_count",
    "unbiased_decisions_count",
    "decisions_win_rate",
    "position_swapped_decisions_win_rate",
    "name_swapped_decisions_win_rate",
    "unbiased_decisions_win_rate",
    "position_comparison",
    "name_comparison",
    "position_bias",
    "name_bias",
]:
    dfs[key] = pd.read_csv(os.path.join(output_dir, f"{prefix}-{key}.csv"))

In [5]:
dfs["unbiased_decisions_win_rate"]

Unnamed: 0,A,B
0,0.738095,0.261905
1,0.822222,0.177778
2,0.777778,0.222222
3,0.777778,0.222222
4,0.730769,0.269231
5,0.787234,0.212766
6,0.803571,0.196429
7,0.8,0.2


In [4]:
dfs["unbiased_decisions_win_rate"].mean(), dfs["unbiased_decisions_win_rate"].std()

(A    0.779681
 B    0.220319
 dtype: float64,
 A    0.031576
 B    0.031576
 dtype: float64)

In [8]:
dfs["position_bias"].mean(), dfs["position_bias"].std()

(consistent    0.61000
 first         0.36375
 second        0.02625
 dtype: float64,
 consistent    0.032950
 first         0.029731
 second        0.014079
 dtype: float64)

In [10]:
dfs["name_bias"].mean(), dfs["name_bias"].std()

(consistent    0.80875
 first         0.17125
 second        0.02000
 dtype: float64,
 consistent    0.046733
 first         0.050267
 second        0.007559
 dtype: float64)