In [None]:
import os
import re
import sys

sys.path.append("./api")

import generate_openai
import generate_claude
import generate_gemini

In [None]:
# Token budget handed to the Claude prompt/completion helpers in the cells below.
MAX_NEW_TOKEN = 512

In [None]:
from datasets import load_dataset
# Load the "test" split of the quiz benchmark identified by this dataset id.
dataset = load_dataset("thashiguchi/LizeHelesta_Benchmark", split="test")

In [None]:
# Show the dataset summary as the cell output.
dataset

In [None]:
def create_input(data):
    """Render one quiz record as the prompt text that is sent to each model.

    ``data`` is a mapping holding the question under ``問題文`` and the four
    options under ``選択肢①`` .. ``選択肢④``.  The returned string begins with a
    newline and ends with a newline followed by four spaces, exactly as the
    template below is laid out.
    """
    question = data['問題文']
    option_1 = data['選択肢①']
    option_2 = data['選択肢②']
    option_3 = data['選択肢③']
    option_4 = data['選択肢④']
    return f"""
以下の4択クイズを回答せよ。必ず回答を選択すること。
## 問題文
{question}

## 選択肢
- 選択肢1: {option_1}
- 選択肢2: {option_2}
- 選択肢3: {option_3}
- 選択肢4: {option_4}
    """

def add_input_output(example):
    """Attach the model prompt and the reference answer to one dataset row.

    Writes the rendered prompt to ``example["input"]`` and copies the value of
    the ``答え(正答率)`` field to ``example["output"]``, then returns the same
    (mutated) row so it can be used as a ``dataset.map`` callback.
    """
    prompt_text = create_input(example)
    reference_answer = example["答え(正答率)"]
    example["input"] = prompt_text
    example["output"] = reference_answer
    return example

# Add the "input" (prompt) and "output" (reference answer) fields to every row.
dataset = dataset.map(add_input_output)

In [None]:
# Sanity check: print the rendered prompt of the first quiz.
print(dataset["input"][0])

In [None]:
# Judge prompt template (Japanese) filled in with str.format().
# Placeholders: {question} = quiz prompt, {ref_answer_1} = reference answer,
# {answer} = the evaluated model's reply.  It asks the judge to end with a
# 1-10 rating written as "[[N]]", which is what the scoring cell parses.
gpt4_judge_prompt_ans = """
[指示]
公平な判断者として行動し、以下に表示される4択クイズに対するAIアシスタントのクイズの回答を評価してください。
あなたの評価は正確さと正答率を考慮すべきです。正答率の高いクイズを誤っていたら、より減点されるべきです。
AIアシスタントの返答の言語は、ユーザーが使用している言語と一致しているべきで、そうでない場合は減点されるべきです。
正解とアシスタントの答えが与えられます。あなたの評価は、アシスタントの答えと正解を比較してください。
正解について説明を提供した後、このフォーマットに厳密に従って1から10までのスケールで応答を評価する必要があります：\"[[評価]]\"、例えば：\"評価：[[5]]\"。

[質問]
{question}

[正解の開始]
{ref_answer_1}
[正解の終了]

[アシスタントの回答の開始]
{answer}
[アシスタントの回答の終了]
"""

In [None]:
def get_message_from_complettion_json(response_json):
    """Return the message text of the first choice in a chat-completion response.

    ``response_json`` must be subscriptable as
    ``response_json["choices"][0]["message"]["content"]``.
    """
    first_choice = response_json["choices"][0]
    message = first_choice["message"]
    return message["content"]

## Vtuberについて質問

In [None]:
# Open-ended question ("Please tell me about VTubers") used by the cells in this section.
user_prompt = "Vtuberについて教えてください"

In [None]:
# Ask the OpenAI model the current question and print its reply under a banner.
openai_model = "gpt-4o"
openai_result = get_message_from_complettion_json(
    generate_openai.chat_complettion(model=openai_model, user_prompt=user_prompt)
)
print(f"******{openai_model}******")
print(openai_result)

In [None]:
# Ask the Claude 3 model the current question and print its reply under a banner.
claude_model = "anthropic.claude-3-sonnet-20240229-v1:0"
prompt = generate_claude.format_claude_v3_prompt(user_prompt, MAX_NEW_TOKEN)
# Keep only the text of the first content item of the response.
claude_result = generate_claude.claude_v3_chat_complettion(
    model=claude_model,
    prompt=prompt,
)["content"][0]["text"]
print(f"******{claude_model}******")
print(claude_result)

In [None]:
# Ask the Gemini model the current question and print its reply under a banner.
gemini_model = "gemini-1.5-flash-latest"
# configure_model returns a (model, chat) pair; only the model is used here.
model, chat = generate_gemini.configure_model(gemini_model)
gemini_response = generate_gemini.generate_content(model=model, prompt=user_prompt)
print(f"******{gemini_model}******")
print(gemini_response.text)

In [None]:
# Load the local model and its tokenizer for the given model id.
# NOTE(review): `generate_local_llm` is not imported anywhere in the visible
# notebook — confirm it is imported (presumably from ./api) before this cell
# runs, otherwise this line raises NameError.
model_id = "elyza/Llama-3-ELYZA-JP-8B"
model, tokenizer = generate_local_llm.load_model(model_id)

In [None]:
# Build the prompt with build_llama_prompt and generate an answer with the local model.
prompt = generate_local_llm.build_llama_prompt(user_prompt, tokenizer)
result = generate_local_llm.generate(model, tokenizer, prompt)

In [None]:
# Show the local model's output as the cell result.
result

## リゼ・ヘルエスタについて質問

In [None]:
# Open-ended question ("Please tell me about Lize Helesta") used by the cells in this section.
user_prompt = "リゼ・ヘルエスタについて教えてください"

In [None]:
# Ask the OpenAI model the current question and print its reply under a banner.
openai_model = "gpt-4o"
openai_result = get_message_from_complettion_json(
    generate_openai.chat_complettion(model=openai_model, user_prompt=user_prompt)
)
print(f"******{openai_model}******")
print(openai_result)

In [None]:
# Ask the Claude 3 model the current question and print its reply under a banner.
claude_model = "anthropic.claude-3-sonnet-20240229-v1:0"
prompt = generate_claude.format_claude_v3_prompt(user_prompt, MAX_NEW_TOKEN)
# Keep only the text of the first content item of the response.
claude_result = generate_claude.claude_v3_chat_complettion(
    model=claude_model,
    prompt=prompt,
)["content"][0]["text"]
print(f"******{claude_model}******")
print(claude_result)

In [None]:
# Ask the Gemini model the current question and print its reply under a banner.
gemini_model = "gemini-1.5-flash-latest"
# Bind the Gemini model to its own name: the local-LLM cell that follows passes
# the global `model` to generate_local_llm.generate(), so rebinding `model`
# here would hand it a Gemini object instead of the local model.
gemini_client, chat = generate_gemini.configure_model(gemini_model)
gemini_response = generate_gemini.generate_content(model=gemini_client, prompt=user_prompt)
print(f"******{gemini_model}******")
print(gemini_response.text)

In [None]:
# Build the prompt with build_llama_prompt and generate an answer with the local model.
# NOTE(review): this needs `model` and `tokenizer` to be the pair returned by
# generate_local_llm.load_model; other cells in this notebook also assign to
# `model`, so re-run the load cell first if it may have been rebound.
prompt = generate_local_llm.build_llama_prompt(user_prompt, tokenizer)
result = generate_local_llm.generate(model, tokenizer, prompt)

In [None]:
# Print the local model's output.
print(result)

# クイズの正答率を確認

In [None]:
import time

# Benchmark the OpenAI models: each model answers every quiz, then gpt-4o
# grades the answer against the reference using the judge prompt.

# Create the output folder first so the final to_csv cannot fail after all the
# API calls have already been made.
os.makedirs("LizeHelesta_Benchmark", exist_ok=True)

results = []
for data in dataset:
    user_prompt = data["input"]
    # Reference answer of THIS quiz only; dataset["output"] would hand the
    # judge the answers of every quiz at once.
    ref_answer_1 = data["output"]
    print(data["問題文"])
    for openai_model in ["gpt-4o", "gpt-4-turbo", "gpt-3.5-turbo"]:
        openai_result = generate_openai.chat_complettion(model=openai_model, user_prompt=user_prompt)
        openai_result = get_message_from_complettion_json(openai_result)
        openai_eval = gpt4_judge_prompt_ans.format(question=user_prompt, ref_answer_1=ref_answer_1, answer=openai_result)
        openai_eval_result = generate_openai.chat_complettion(model="gpt-4o", user_prompt=openai_eval)
        openai_eval_result = get_message_from_complettion_json(openai_eval_result)
        results.append([user_prompt, openai_result, openai_eval_result, openai_model])
        # Pause between requests (presumably to stay under API rate limits).
        time.sleep(1)

import pandas as pd
# Passing columns= keeps this working even when `results` is empty.
df_ = pd.DataFrame(results, columns=["input", "output", "eval", "model"])
df_.to_csv("LizeHelesta_Benchmark/openai_result.csv", index=False)

In [None]:
import time

# Benchmark the Claude models: each model answers every quiz, then gpt-4o
# grades the answer against the reference using the judge prompt.

# Create the output folder first so the final to_csv cannot fail after all the
# API calls have already been made.
os.makedirs("LizeHelesta_Benchmark", exist_ok=True)

results = []
for data in dataset:
    user_prompt = data["input"]
    # Reference answer of THIS quiz only; dataset["output"] would hand the
    # judge the answers of every quiz at once.
    ref_answer_1 = data["output"]
    print(data["問題文"])
    for claude_model in ["anthropic.claude-v2:1", "anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"]:
        if claude_model == "anthropic.claude-v2:1":
            # Claude v2 goes through its own prompt formatter and completion
            # helper, and its text is read from the 'completion' field.
            prompt = generate_claude.format_claude_v2_prompt(user_prompt)
            claude_result = generate_claude.claude_v2_chat_complettion(model=claude_model, prompt=prompt, max_tokens_to_sample=MAX_NEW_TOKEN)
            claude_result = claude_result.get('completion')
        else:
            # Claude 3 models: the text is the first item of "content".
            prompt = generate_claude.format_claude_v3_prompt(user_prompt, MAX_NEW_TOKEN)
            claude_result = generate_claude.claude_v3_chat_complettion(model=claude_model, prompt=prompt)
            claude_result = claude_result["content"][0]["text"]
        claude_eval = gpt4_judge_prompt_ans.format(question=user_prompt, ref_answer_1=ref_answer_1, answer=claude_result)
        claude_eval_result = generate_openai.chat_complettion(model="gpt-4o", user_prompt=claude_eval)
        claude_eval_result = get_message_from_complettion_json(claude_eval_result)
        results.append([user_prompt, claude_result, claude_eval_result, claude_model])
        # Pause between requests (presumably to stay under API rate limits).
        time.sleep(1)

import pandas as pd
# Passing columns= keeps this working even when `results` is empty.
df_ = pd.DataFrame(results, columns=["input", "output", "eval", "model"])
df_.to_csv("LizeHelesta_Benchmark/claude_result.csv", index=False)

In [None]:
import time

# Benchmark the Gemini models: each model answers every quiz, then gpt-4o
# grades the answer against the reference using the judge prompt.

# Create the output folder first so the final to_csv cannot fail after all the
# API calls have already been made.
os.makedirs("LizeHelesta_Benchmark", exist_ok=True)

results = []

# Every listed harm category is set to BLOCK_NONE for these requests.
safety_settings = [
    {
        "category": "HARM_CATEGORY_HARASSMENT",
        "threshold": "BLOCK_NONE"
    },
    {
        "category": "HARM_CATEGORY_HATE_SPEECH",
        "threshold": "BLOCK_NONE"
    },
    {
        "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "threshold": "BLOCK_NONE"
    },
    {
        "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
        "threshold": "BLOCK_NONE"
    }
]
for data in dataset:
    user_prompt = data["input"]
    # Reference answer of THIS quiz only; dataset["output"] would hand the
    # judge the answers of every quiz at once.
    ref_answer_1 = data["output"]
    print(data["問題文"])
    for gemini_model in ["gemini-1.5-flash-latest", "gemini-1.5-pro-latest"]:
        # Use a Gemini-specific name so the global `model` (used by the
        # local-LLM cells) is not rebound by this loop.
        gemini_client, chat = generate_gemini.configure_model(gemini_model)
        gemini_response = generate_gemini.generate_content(model=gemini_client, prompt=user_prompt, safety_settings=safety_settings)
        gemini_result = gemini_response.text
        gemini_eval = gpt4_judge_prompt_ans.format(question=user_prompt, ref_answer_1=ref_answer_1, answer=gemini_result)
        gemini_eval_result = generate_openai.chat_complettion(model="gpt-4o", user_prompt=gemini_eval)
        gemini_eval_result = get_message_from_complettion_json(gemini_eval_result)
        results.append([user_prompt, gemini_result, gemini_eval_result, gemini_model])
        # Pause between requests (presumably to stay under API rate limits).
        time.sleep(1)

import pandas as pd
# Passing columns= keeps this working even when `results` is empty.
df_ = pd.DataFrame(results, columns=["input", "output", "eval", "model"])
df_.to_csv("LizeHelesta_Benchmark/gemini_result.csv", index=False)

In [None]:
from ollama import Client
# Ollama client pointed at port 8881 on host.docker.internal.
# NOTE(review): the host is hard-coded — presumably the notebook runs inside a
# Docker container and Ollama on the host machine; confirm for your setup.
client = Client(host='http://host.docker.internal:8881')

def ollama_chat(model_name, user_prompt):
    """Send ``user_prompt`` to the Ollama model ``model_name`` and return the reply text.

    The request consists of a fixed Japanese system message followed by the
    user's prompt, and goes through the module-level ``client``.
    """
    system_message = {
        'role': 'system',
        'content': "あなたはAIアシスタントです。聞かれた内容について誠実に回答してください。",
    }
    user_message = {'role': 'user', 'content': user_prompt}
    response = client.chat(model=model_name, messages=[system_message, user_message])
    return response['message']['content']

In [None]:
# Names of all models reported by the Ollama server.  A comprehension is used
# so that no loop variable is left behind rebinding the global `model`.
ollama_models = [entry["model"] for entry in client.list()["models"]]

In [None]:
# Benchmark every Ollama model: each model answers every quiz, then gpt-4o
# grades the answer against the reference using the judge prompt.

# Create the output folder first so the final to_csv cannot fail after all the
# model calls have already been made.
os.makedirs("LizeHelesta_Benchmark", exist_ok=True)

results = []
for ollama_model in ollama_models:
    print(ollama_model)
    for data in dataset:
        user_prompt = data["input"]
        # Reference answer of THIS quiz only; dataset["output"] would hand the
        # judge the answers of every quiz at once.
        ref_answer_1 = data["output"]
        local_result = ollama_chat(ollama_model, user_prompt)
        local_eval = gpt4_judge_prompt_ans.format(question=user_prompt, ref_answer_1=ref_answer_1, answer=local_result)
        local_eval_result = generate_openai.chat_complettion(model="gpt-4o", user_prompt=local_eval)
        local_eval_result = get_message_from_complettion_json(local_eval_result)
        results.append([user_prompt, local_result, local_eval_result, ollama_model])
import pandas as pd
# Passing columns= keeps this working even when `results` is empty.
df_ = pd.DataFrame(results, columns=["input", "output", "eval", "model"])
df_.to_csv("LizeHelesta_Benchmark/local_result.csv", index=False)

In [None]:
import pandas as pd
import os

# Combine every result CSV in the benchmark folder into one frame.
result_dir = "./LizeHelesta_Benchmark"
# Read only CSV files: os.listdir returns every entry (sub-folders, stray
# files), and any non-CSV entry would make read_csv fail.  Sorting makes the
# row order independent of the file system's listing order.
csv_names = sorted(
    name for name in os.listdir(result_dir) if name.lower().endswith(".csv")
)
frames = [pd.read_csv(os.path.join(result_dir, name)) for name in csv_names]
# One concat at the end instead of growing the frame inside the loop;
# ignore_index=True gives the same fresh 0..n-1 index as reset_index(drop=True).
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
def scoring(row):
    """Extract the judge's numeric score from ``row["eval"]``.

    The following patterns are tried in order and the first hit wins:

    1. ``[[N]]``  - the format the judge prompt asks for
    2. ``[N]``    - single-bracket variant
    3. ``評価]`` followed, later on the same line, by a number

    Args:
        row: mapping or DataFrame row exposing the judge output under ``"eval"``.

    Returns:
        int: the 1-2 digit score, or 0 when no score is found or the ``eval``
        value is not a string (e.g. NaN read back from an empty CSV cell).
        An int is always returned, so the result also works with a later
        ``astype(int)``.
    """
    text = row["eval"]
    if not isinstance(text, str):
        return 0

    patterns = (
        r'\[\[(\d{1,2})\]\]',
        r'\[(\d{1,2})\]',
        # Non-greedy ".*?" so both digits of a score such as "10" are captured
        # (a greedy ".*" would swallow the "1" and capture only "0").
        r'評価\].*?(\d{1,2})',
    )
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return int(match.group(1))
    return 0

In [None]:
# Parse a score out of every judge response, one row at a time.
df["score"] = df.apply(scoring, axis=1)

In [None]:
# scoring() can return digit strings as well as the integer 0; cast the column to int.
df["score"] = df["score"].astype(int)

In [None]:
# Bar chart of the mean judge score per model, highest first.
(
    df[["model", "score"]]
    .groupby("model")
    .mean()
    .sort_values("score", ascending=False)
    .plot.bar()
)