In [1]:
%cd ~/synth-persona

import logging
import os
import re
import sys
from abc import ABC, abstractmethod
from time import time

import pandas as pd
from datasets import Dataset, load_dataset
from litellm import batch_completion
from tqdm import tqdm
from vllm import LLM, SamplingParams

/home/susumu.ota/synth-persona


In [2]:
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger.setLevel("DEBUG")

In [3]:
class LanguageModel(ABC):
    def __init__(self, model: str, max_tokens=512, seed: int | None = None, temperature=1.0, top_p=0.95) -> None:
        self.model = model
        self.max_tokens = max_tokens
        self.seed = seed
        self.temperature = temperature
        self.top_p = top_p
        logger.debug(
            f"model: {model}, max_tokens: {max_tokens}, seed: {seed}, temperature: {temperature}, top_p: {top_p}"
        )

    @abstractmethod
    def __call__(self, messages_batch: list[list[dict[str, str]]]) -> list[str]:
        pass


class LiteLLMModel(LanguageModel):
    def __init__(self, model: str, max_tokens=512, seed: int | None = None, temperature=1.0, top_p=0.95) -> None:
        super().__init__(model, max_tokens, seed, temperature, top_p)

    def __call__(self, messages_batch: list[list[dict[str, str]]]) -> list[str]:
        contents = [
            response.choices[0].message.content or ""
            for response in batch_completion(
                model=self.model,
                messages=messages_batch,
                max_tokens=self.max_tokens,
                seed=self.seed,
                temperature=self.temperature,
                top_p=self.top_p,
            )
        ]
        assert len(contents) == len(messages_batch)
        return contents


class VLLMModel(LanguageModel):
    def __init__(self, model: str, max_tokens=512, seed: int | None = None, temperature=1.0, top_p=0.95) -> None:
        super().__init__(model, max_tokens, seed, temperature, top_p)
        self.vllm = LLM(model, seed=seed, gpu_memory_utilization=1.0, max_model_len=32 * 1024)  # TODO: parameterize
        self.tokenizer = self.vllm.get_tokenizer()

    def __call__(self, messages_batch: list[list[dict[str, str]]]) -> list[str]:
        sampling_params = SamplingParams(
            max_tokens=self.max_tokens, seed=self.seed, temperature=self.temperature, top_p=self.top_p
        )
        prompts = [
            self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            for messages in messages_batch
        ]
        outputs = self.vllm.generate(prompts, sampling_params=sampling_params, use_tqdm=False)
        contents = [output.outputs[0].text for output in outputs]
        assert len(contents) == len(messages_batch)
        return contents

In [4]:
SYSTEM_PROMPT = 'Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. You should choose the assistant that follows the user\'s instructions and answers the user\'s question better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. Begin your evaluation by comparing the two responses and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: "[[A]]" if assistant A is better, "[[B]]" if assistant B is better, and "[[C]]" for a tie.'
SYSTEM_PROMPT_RENAME = 'Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. You should choose the assistant that follows the user\'s instructions and answers the user\'s question better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. Begin your evaluation by comparing the two responses and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: "[[B]]" if assistant B is better, "[[A]]" if assistant A is better, and "[[C]]" for a tie.'

PROMPT_TEMPLATE = "[User Question]\n{question}\n\n[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]"
PROMPT_TEMPLATE_RENAME = "[User Question]\n{question}\n\n[The Start of Assistant B's Answer]\n{answer_a}\n[The End of Assistant B's Answer]\n\n[The Start of Assistant A's Answer]\n{answer_b}\n[The End of Assistant A's Answer]"

In [5]:
# answers need to be a list of dictionaries with keys "question", "answer_a", and "answer_b"
def generate_decision_texts(
    llm: LanguageModel, answers: list[dict[str, str]], system_prompt: str, prompt_template: str
) -> list[str]:
    prompts = [
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt_template.format(**answer)},
        ]
        for answer in answers
    ]
    return llm(prompts)

In [None]:
# def convert_answers(dataset_a: Dataset, dataset_b: Dataset, question_template: str) -> list[dict[str, str]]:
#     return [
#         {
#             "question": question_template.format(persona=a["persona"]),
#             "answer_a": a["problem_answer"],
#             "answer_b": b["problem_answer"],
#         }
#         for a, b in zip(dataset_a, dataset_b)
#     ]

In [7]:
def parse_response(text: str) -> dict[str, str]:
    m = re.match(r"^(<think>)?(.*)</think>(.*)$", text, flags=re.DOTALL | re.MULTILINE)
    return (
        {"think": m.group(2), "answer": m.group(3), "format_reward": 1.0}
        if m
        else {"think": "", "answer": text, "format_reward": 0.0}
    )

In [8]:
def parse_decision_text(decision_text: str) -> str:
    is_a = is_b = is_c = False
    if "[[A]]" in decision_text:
        is_a = True
    if "[[B]]" in decision_text:
        is_b = True
    if "[[C]]" in decision_text:
        is_c = True
    decision = "C"
    if is_a and is_b:
        # raise ValueError(f"Both A and B are chosen: {decision_text}")
        logger.debug(f"Both A and B are chosen: {decision_text}")
        decision = "C"
    elif is_a:
        decision = "A"
    elif is_b:
        decision = "B"
    elif is_c:
        decision = "C"
    else:
        # raise ValueError(f"Unknown decision: {decision_text}")
        logger.debug(f"Unknown decision: {decision_text}")
        decision = "C"
    # logger.debug(f"decision: {decision}")
    return decision

In [9]:
def judge_answers(
    llm: LanguageModel, answers: list[dict[str, str]], system_prompt: str, prompt_template: str
) -> list[str]:
    return [
        parse_decision_text(parse_response(decision_text)["answer"])
        for decision_text in generate_decision_texts(llm, answers, system_prompt, prompt_template)
    ]

In [10]:
def count_decisions(decisions: list[str]) -> dict[str, int]:
    return {decision: decisions.count(decision) for decision in ["A", "B", "C"]}

In [11]:
def swap_answers(answers: list[dict[str, str]]) -> list[dict[str, str]]:
    return [{"question": a["question"], "answer_a": a["answer_b"], "answer_b": a["answer_a"]} for a in answers]

In [12]:
def swap_decisions(decisions: list[str]) -> list[str]:
    return ["A" if d == "B" else "B" if d == "A" else "C" for d in decisions]

In [13]:
def compare_decisions(
    decisions: list[str], swapped_decisions: list[str], is_position_swapped: bool
) -> dict[str, dict[str, int]]:
    count = {"A": 0, "B": 0, "C": 0, "bias": 0}
    bias = {"first": 0, "second": 0}
    for d, sd in zip(decisions, swapped_decisions):
        if d == sd:
            count[d] += 1
        else:
            count["bias"] += 1
            if d == "A" and (sd == "B" or sd == "C"):
                # if is_position_swapped is True,  sd == "B" means that LLM prefers the first answer which is denoted as "first"
                # if is_position_swapped is False, sd == "B" means that LLM prefers the name "B" which is denoted as "second"
                bias["first" if is_position_swapped else "second"] += 1
            elif d == "B" and (sd == "A" or sd == "C"):
                bias["second" if is_position_swapped else "first"] += 1
            elif d == "C" and sd == "A":
                bias["second" if is_position_swapped else "first"] += 1
            elif d == "C" and sd == "B":
                bias["first" if is_position_swapped else "second"] += 1
            else:
                raise ValueError(f"Unknown decision: {d}, {sd}")
    return {"count": count, "bias": bias}

In [14]:
def get_unbiased_decisions(
    decisions: list[str], position_swapped_decisions: list[str], name_swapped_decisions: list[str]
) -> dict[str, dict[str, int]]:
    assert len(decisions) == len(position_swapped_decisions) == len(name_swapped_decisions)
    return [
        d if d == pd == nd else "C" for d, pd, nd in zip(decisions, position_swapped_decisions, name_swapped_decisions)
    ]

In [15]:
def get_bias(comparison):
    total = sum(comparison["count"].values())
    bias = comparison["count"]["bias"]
    first = comparison["bias"]["first"]
    second = comparison["bias"]["second"]
    assert bias == first + second
    return {
        "consistent": (total - bias) / total,
        "first": first / total,
        "second": second / total,
    }

In [16]:
def get_win_rate(decisions: list[str]) -> dict[str, float]:
    count = count_decisions(decisions)
    total = sum(count.values())
    decision_a = count.get("A", 0)
    decision_b = count.get("B", 0)
    decision_c = count.get("C", 0)
    assert total == decision_a + decision_b + decision_c
    return {
        "A": decision_a / (total - decision_c),
        "B": decision_b / (total - decision_c),
    }

In [17]:
def run_experiment_once(judge_llm: LanguageModel, answers: list[dict[str, str]]) -> dict[str, dict]:
    logger.debug("judging baseline...")
    time_start = time()
    decisions = judge_answers(judge_llm, answers, SYSTEM_PROMPT, PROMPT_TEMPLATE)
    logger.debug(f"judging baseline...done: {time() - time_start:.2f}s")

    logger.debug("judging position swapped...")
    time_start = time()
    position_swapped_decisions = swap_decisions(
        judge_answers(judge_llm, swap_answers(answers), SYSTEM_PROMPT, PROMPT_TEMPLATE)
    )
    logger.debug(f"judging position swapped...done: {time() - time_start:.2f}s")

    logger.debug("judging name swapped...")
    time_start = time()
    name_swapped_decisions = swap_decisions(
        judge_answers(judge_llm, answers, SYSTEM_PROMPT_RENAME, PROMPT_TEMPLATE_RENAME)
    )
    logger.debug(f"judging name swapped...done: {time() - time_start:.2f}s")

    unbiased_decisions = get_unbiased_decisions(decisions, position_swapped_decisions, name_swapped_decisions)

    position_comparison = compare_decisions(decisions, position_swapped_decisions, True)
    name_comparison = compare_decisions(decisions, name_swapped_decisions, False)

    position_bias = get_bias(position_comparison)
    name_bias = get_bias(name_comparison)

    return {
        "decisions": decisions,
        "position_swapped_decisions": position_swapped_decisions,
        "name_swapped_decisions": name_swapped_decisions,
        "unbiased_decisions": unbiased_decisions,
        "decisions_count": count_decisions(decisions),
        "position_swapped_decisions_count": count_decisions(position_swapped_decisions),
        "name_swapped_decisions_count": count_decisions(name_swapped_decisions),
        "unbiased_decisions_count": count_decisions(unbiased_decisions),
        "decisions_win_rate": get_win_rate(decisions),
        "position_swapped_decisions_win_rate": get_win_rate(position_swapped_decisions),
        "name_swapped_decisions_win_rate": get_win_rate(name_swapped_decisions),
        "unbiased_decisions_win_rate": get_win_rate(unbiased_decisions),
        "position_comparison": position_comparison,
        "name_comparison": name_comparison,
        "position_bias": position_bias,
        "name_bias": name_bias,
    }

In [18]:
def run_experiment(
    judge_model: str, input_1_jsonl: str, input_2_jsonl: str, question_template: str, n: int
) -> dict[str, pd.DataFrame]:
    judge_llm = LiteLLMModel(judge_model, max_tokens=4096, seed=1, temperature=0.6, top_p=0.95)

    dataset_1 = load_dataset("json", data_files=input_1_jsonl, split="train", cache_dir=".cache")
    dataset_2 = load_dataset("json", data_files=input_2_jsonl, split="train", cache_dir=".cache")
    answers = convert_answers(dataset_1, dataset_2, question_template)

    results = [run_experiment_once(judge_llm, answers) for _ in tqdm(range(n))]
    return {key: pd.DataFrame([result[key] for result in results]) for key in results[0].keys()}

In [19]:
PERSONA_TEMPLATE = """You are an expert in analyzing the text content and assigning finding the general type of persona that could be associated with such a way of expressing. Please use one or two sentences for the definition, but try to make it as fine-grained if input texts involve many detailed elements. The persona definition must go straight to the point, be assertive. The following are starts of persona definitions:
A machine learning researcher...
A pediatric nurse whose...
An urban planner focused on...

What is the likely profession, interest, or role of the person who would write or be interested in this text?

## Text
{text}

Note:
1. Your response should always start with "ペルソナ:".
2. Your response should be one sentence. You should not include any notes or translations.
3. 簡潔に日本語で回答してください。
"""

PERSONA_TEMPLATE_EN = """You are an expert in analyzing the text content and assigning finding the general type of persona that could be associated with such a way of expressing. Please use one or two sentences for the definition, but try to make it as fine-grained if input texts involve many detailed elements. The persona definition must go straight to the point, be assertive. The following are starts of persona definitions:
A machine learning researcher...
A pediatric nurse whose...
An urban planner focused on...

What is the likely profession, interest, or role of the person who would write or be interested in this text?

## Text
{text}

Note:
1. Your response should always start with "ペルソナ:".
2. Your response should be one sentence. You should not include any notes or translations.
3. Please respond in English.
"""


def convert_answers(dataset_a: Dataset, dataset_b: Dataset, question_template: str) -> list[dict[str, str]]:
    return [
        {
            "question": question_template.format(text=a["text"]),
            "answer_a": a["persona_answer"],
            "answer_b": b["persona_answer"],
        }
        for a, b in zip(dataset_a, dataset_b)
    ]

In [20]:
os.environ["DEEPSEEK_API_BASE"] = "http://192.168.11.43:39877/v1"
judge_model = "deepseek/deepseek-reasoner"
# input_1_jsonl = "eval/data/persona-rinna-r1-3407.jsonl"
# input_2_jsonl = "eval/data/persona-gpt-4o-mini-3407.jsonl"
# question_template = PERSONA_TEMPLATE
input_1_jsonl = "eval/data/persona-rinna-r1-en-3407.jsonl"
input_2_jsonl = "eval/data/persona-gpt-4o-mini-en-3407.jsonl"
question_template = PERSONA_TEMPLATE_EN
n = 8

dfs = run_experiment(judge_model, input_1_jsonl, input_2_jsonl, question_template, n)
dfs

2025-03-13 01:02:44 - DEBUG - __main__ - model: deepseek/deepseek-reasoner, max_tokens: 4096, seed: 1, temperature: 0.6, top_p: 0.95


  0%|          | 0/8 [00:00<?, ?it/s]

2025-03-13 01:02:45 - DEBUG - __main__ - judging baseline...
2025-03-13 01:04:25 - DEBUG - __main__ - judging baseline...done: 100.47s
2025-03-13 01:04:25 - DEBUG - __main__ - judging position swapped...
2025-03-13 01:06:33 - DEBUG - __main__ - judging position swapped...done: 127.94s
2025-03-13 01:06:33 - DEBUG - __main__ - judging name swapped...
2025-03-13 01:07:51 - DEBUG - __main__ - judging name swapped...done: 77.82s


 12%|█▎        | 1/8 [05:06<35:43, 306.23s/it]

2025-03-13 01:07:51 - DEBUG - __main__ - judging baseline...
2025-03-13 01:10:03 - DEBUG - __main__ - judging baseline...done: 131.89s
2025-03-13 01:10:03 - DEBUG - __main__ - judging position swapped...
2025-03-13 01:12:29 - DEBUG - __main__ - judging position swapped...done: 145.86s
2025-03-13 01:12:29 - DEBUG - __main__ - judging name swapped...
2025-03-13 01:14:27 - DEBUG - __main__ - judging name swapped...done: 118.20s


 25%|██▌       | 2/8 [11:42<35:54, 359.01s/it]

2025-03-13 01:14:27 - DEBUG - __main__ - judging baseline...
2025-03-13 01:16:46 - DEBUG - __main__ - judging baseline...done: 138.53s
2025-03-13 01:16:46 - DEBUG - __main__ - judging position swapped...
2025-03-13 01:18:29 - DEBUG - __main__ - judging position swapped...done: 103.84s
2025-03-13 01:18:29 - DEBUG - __main__ - judging name swapped...
2025-03-13 01:19:45 - DEBUG - __main__ - judging name swapped...done: 75.11s


 38%|███▊      | 3/8 [16:59<28:20, 340.05s/it]

2025-03-13 01:19:45 - DEBUG - __main__ - judging baseline...
2025-03-13 01:21:37 - DEBUG - __main__ - judging baseline...done: 112.40s
2025-03-13 01:21:37 - DEBUG - __main__ - judging position swapped...
2025-03-13 01:23:31 - DEBUG - __main__ - judging position swapped...done: 113.65s
2025-03-13 01:23:31 - DEBUG - __main__ - judging name swapped...
2025-03-13 01:24:49 - DEBUG - __main__ - judging name swapped...done: 78.63s


 50%|█████     | 4/8 [22:04<21:44, 326.08s/it]

2025-03-13 01:24:49 - DEBUG - __main__ - judging baseline...
2025-03-13 01:26:34 - DEBUG - __main__ - judging baseline...done: 104.82s
2025-03-13 01:26:34 - DEBUG - __main__ - judging position swapped...
2025-03-13 01:28:15 - DEBUG - __main__ - judging position swapped...done: 101.43s
2025-03-13 01:28:15 - DEBUG - __main__ - judging name swapped...
2025-03-13 01:29:34 - DEBUG - __main__ - judging name swapped...done: 78.59s


 62%|██████▎   | 5/8 [26:49<15:33, 311.21s/it]

2025-03-13 01:29:34 - DEBUG - __main__ - judging baseline...
2025-03-13 01:31:22 - DEBUG - __main__ - judging baseline...done: 108.26s
2025-03-13 01:31:22 - DEBUG - __main__ - judging position swapped...
2025-03-13 01:32:54 - DEBUG - __main__ - judging position swapped...done: 91.44s
2025-03-13 01:32:54 - DEBUG - __main__ - judging name swapped...
2025-03-13 01:34:24 - DEBUG - __main__ - judging name swapped...done: 90.50s


 75%|███████▌  | 6/8 [31:39<10:08, 304.07s/it]

2025-03-13 01:34:24 - DEBUG - __main__ - judging baseline...
2025-03-13 01:36:03 - DEBUG - __main__ - judging baseline...done: 99.09s
2025-03-13 01:36:03 - DEBUG - __main__ - judging position swapped...
2025-03-13 01:37:46 - DEBUG - __main__ - judging position swapped...done: 102.70s
2025-03-13 01:37:46 - DEBUG - __main__ - judging name swapped...
2025-03-13 01:38:57 - DEBUG - __main__ - judging name swapped...done: 71.19s


 88%|████████▊ | 7/8 [36:12<04:53, 293.90s/it]

2025-03-13 01:38:57 - DEBUG - __main__ - judging baseline...
2025-03-13 01:40:26 - DEBUG - __main__ - judging baseline...done: 88.40s
2025-03-13 01:40:26 - DEBUG - __main__ - judging position swapped...
2025-03-13 01:42:17 - DEBUG - __main__ - judging position swapped...done: 110.91s
2025-03-13 01:42:17 - DEBUG - __main__ - judging name swapped...
2025-03-13 01:43:41 - DEBUG - __main__ - judging name swapped...done: 84.23s


100%|██████████| 8/8 [40:55<00:00, 306.99s/it]


{'decisions':   0  1  2  3  4  5  6  7  8  9   ... 90 91 92 93 94 95 96 97 98 99
 0  A  A  A  A  A  B  B  A  A  A  ...  A  A  B  A  A  B  A  B  A  A
 1  A  A  A  A  A  B  B  A  A  A  ...  A  A  A  B  A  A  A  B  A  A
 2  A  A  A  A  B  A  C  B  A  A  ...  A  A  A  B  A  B  A  B  B  A
 3  A  A  A  A  A  A  B  B  A  A  ...  A  A  A  B  A  A  A  B  A  A
 4  A  A  A  A  A  A  B  A  A  A  ...  A  A  B  A  A  A  A  B  B  A
 5  A  A  A  A  A  A  B  B  A  A  ...  A  A  A  B  A  A  A  B  A  A
 6  A  A  A  A  A  A  A  A  A  A  ...  A  A  A  B  A  B  A  B  A  A
 7  A  A  A  A  A  A  B  B  A  A  ...  A  A  A  B  A  B  A  B  A  A
 
 [8 rows x 100 columns],
 'position_swapped_decisions':   0  1  2  3  4  5  6  7  8  9   ... 90 91 92 93 94 95 96 97 98 99
 0  A  A  A  A  A  A  B  B  A  A  ...  A  B  A  B  A  B  A  B  A  A
 1  A  A  A  A  A  A  B  A  A  A  ...  A  A  A  B  A  B  A  B  A  A
 2  A  A  A  A  A  A  B  B  A  A  ...  B  A  A  B  A  B  A  B  A  A
 3  A  A  A  A  A  A  B  B  A  A  ...  A  B  A

In [34]:
output_dir = "eval/data"
# prefix = "results-persona-en-rinna-vs-gpt4-3407"
prefix = "results-persona-rinna-vs-gpt4-3407"

In [22]:
# save the results
os.makedirs(output_dir, exist_ok=True)
for key, df in dfs.items():
    df.to_csv(os.path.join(output_dir, f"{prefix}-{key}.csv"), index=False)

In [35]:
# read from the saved results
dfs = {}
for key in [
    "decisions",
    "position_swapped_decisions",
    "name_swapped_decisions",
    "unbiased_decisions",
    "decisions_count",
    "position_swapped_decisions_count",
    "name_swapped_decisions_count",
    "unbiased_decisions_count",
    "decisions_win_rate",
    "position_swapped_decisions_win_rate",
    "name_swapped_decisions_win_rate",
    "unbiased_decisions_win_rate",
    "position_comparison",
    "name_comparison",
    "position_bias",
    "name_bias",
]:
    dfs[key] = pd.read_csv(os.path.join(output_dir, f"{prefix}-{key}.csv"))

In [36]:
dfs["unbiased_decisions_win_rate"]

Unnamed: 0,A,B
0,0.883117,0.116883
1,0.914286,0.085714
2,0.888889,0.111111
3,0.905405,0.094595
4,0.909091,0.090909
5,0.933333,0.066667
6,0.897436,0.102564
7,0.906667,0.093333


In [37]:
dfs["unbiased_decisions_win_rate"].mean(), dfs["unbiased_decisions_win_rate"].std()

(A    0.904778
 B    0.095222
 dtype: float64,
 A    0.015602
 B    0.015602
 dtype: float64)

In [31]:
dfs["position_bias"].mean(), dfs["position_bias"].std()

(consistent    0.78750
 first         0.15375
 second        0.05875
 dtype: float64,
 consistent    0.028661
 first         0.033354
 second        0.019594
 dtype: float64)

In [33]:
dfs["name_bias"].mean(), dfs["name_bias"].std()

(consistent    0.84875
 first         0.12875
 second        0.02250
 dtype: float64,
 consistent    0.022321
 first         0.022321
 second        0.011650
 dtype: float64)

In [28]:
dfs["unbiased_decisions_count"]

Unnamed: 0,A,B,C
0,64,10,26
1,58,8,34
2,62,10,28
3,62,9,29
4,61,11,28
5,64,8,28
6,65,8,27
7,58,8,34
