# Use ChatGPT as a judge (LLM-as-a-judge) to evaluate generated answers

In [25]:
import os
import pandas as pd
import openai
from gpt_score.gpt3_score import gpt3score
from tqdm import tqdm

# Register tqdm's `progress_apply` on pandas objects.
tqdm.pandas()

# Send HTTP and HTTPS traffic through the proxy; both lower- and upper-case
# variable names are set.
proxy = 'http://dell-1.star:7890' # 3090 docker
for env_name in ('http_proxy', 'HTTP_PROXY', 'https_proxy', 'HTTPS_PROXY'):
    os.environ[env_name] = proxy

# The OpenAI client reads the API key from this file.
openai.api_key_path = ".openai-key2"

In [26]:
# System message text ("You are a helpful assistant").
# NOTE(review): this dict is not referenced by the visible code, and it stores
# its text under "prompt" whereas the chat messages built in
# eval_questions_by_chat use the key "content" -- confirm the key before
# passing it to the API.
SYSTEM_PROMPT = {
    "role": "system",
    "prompt": "你是一个有用的助手"
}

# Judge prompt template with two placeholders, {question} and {answer}.
# It asks the model to act as an impartial judge, give a short explanation
# first, and then rate the answer from 1 to 10 in the form "[[score]]"
# (the example given in the prompt is "评分：[[5]]").
ANSWER_PROMPT = """[指令]
请充当一个公正的裁判，评估AI助手对下面显示的问题的回答质量。你的评估应该考虑回复的有用性、相关性、准确性、深度、创造力和详细程度等因素。请通过提供简短的解释来开始你的评估，并尽可能做到客观。提供解释后，你必须遵循以下格式对回复进行评分(从1到10)：\"[[评分]]\"，例如：\"评分：[[5]]\"。

[问题]
{question}

[AI回答开始]
{answer}
[AI回答结束]
"""

In [27]:
import logging


def eval_questions_by_chat(row, max_tokens=1000, max_retries=3, retry_delay=2.0):
    """Ask gpt-3.5-turbo to judge one question/answer pair; return its raw reply.

    Failed API calls (for example "The server is overloaded or not ready
    yet.") are retried with exponential backoff instead of immediately
    leaving the row without an evaluation.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing "question"
            and "answer", which are substituted into ANSWER_PROMPT.
        max_tokens: Upper bound on the number of tokens in the reply.
        max_retries: Number of additional attempts after a failed API call.
            Values below zero are treated as zero.
        retry_delay: Delay in seconds before the first retry; it doubles
            after every further failure.

    Returns:
        The text of the first completion choice, or "" if the prompt could
        not be built or every attempt failed. Errors are logged, not raised.
    """
    import time  # local import: only needed for the retry backoff

    # Build the prompt once, outside the retry loop: a malformed row will not
    # fix itself on retry, so fail fast with the usual "" result.
    try:
        messages = [
            {
                "role": "user",
                "content": ANSWER_PROMPT.format(
                    question=row["question"],
                    answer=row["answer"]
                )
            }
        ]
    except Exception as e:
        logging.error(e)
        return ""

    attempts = max(0, max_retries) + 1
    for attempt in range(attempts):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=messages,
                temperature=0,
                max_tokens=max_tokens,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
                # NOTE(review): the prompt asks for an explanation followed by
                # the score; stopping at the first blank line may truncate the
                # reply before the score is emitted -- confirm this is wanted.
                stop=["\n\n"]
            )
            # Downstream parsing expects a string, so never hand back None.
            return response.choices[0].message.content or ""
        except Exception as e:
            logging.error(e)
            if attempt + 1 < attempts:
                time.sleep(retry_delay * (2 ** attempt))
    return ""


In [28]:
# Load the QA CSV and rename its columns to the field names that the judge
# prompt expects.
filename = "test/manual-qa.csv"
column_aliases = {"prompt": "question", "completion": "answer"}
df = pd.read_csv(filename).rename(columns=column_aliases)

# One judge call per row (with a progress bar); keep the untouched reply text
# so the score can be parsed from it afterwards.
df["eval_raw"] = df.progress_apply(eval_questions_by_chat, axis=1)

  8%|▊         | 84/1076 [03:06<37:45,  2.28s/it]ERROR:root:The server is overloaded or not ready yet.
 30%|██▉       | 319/1076 [14:12<28:11,  2.23s/it]  ERROR:root:The server is overloaded or not ready yet.
 55%|█████▌    | 595/1076 [24:48<20:12,  2.52s/it]  ERROR:root:The server is overloaded or not ready yet.
100%|██████████| 1076/1076 [46:53<00:00,  2.61s/it] 


In [29]:
import re

# Matches e.g. "评分：[[5]]". Both the full-width and the ASCII colon are
# accepted, with optional surrounding whitespace, because the judge does not
# always reproduce the exact punctuation of the example in the prompt.
score_pattern = re.compile(r"评分\s*[：:]\s*\[\[(\d+)\]\]")


def extract_score(text):
    """Return the integer score found in a raw judge reply, or -1 if none.

    Non-string values (e.g. NaN from an empty cell) also yield -1.
    """
    if not isinstance(text, str):
        return -1
    match = score_pattern.search(text)  # search once and reuse the match
    return int(match.group(1)) if match else -1


df["eval_score"] = df["eval_raw"].apply(extract_score)
df.to_csv(filename.replace(".csv", "-eval.csv"), index=False)

In [30]:
# Keep only the rows whose reply yielded a score (-1 marks "no score found"),
# then export them sorted by score: highest first, then lowest first.
valid_df = df[df["eval_score"] != -1]
for suffix, ascending in (("-eval-descend.csv", False), ("-eval-ascend.csv", True)):
    ranked = valid_df.sort_values(by="eval_score", ascending=ascending)
    ranked.to_csv(filename.replace(".csv", suffix), index=False)

In [31]:
# Pairwise correlation between the ChatGPT score and the other score columns
# of the CSV, computed over the rows that have a parsed score.
score_columns = [
    "eval_score",
    "informative_score",
    "coherent_score",
    "relevant_score",
    "fluent_score",
]
corr = valid_df[score_columns].corr()
print(corr)

                   eval_score  informative_score  coherent_score  \
eval_score           1.000000           0.088628        0.087818   
informative_score    0.088628           1.000000        0.992092   
coherent_score       0.087818           0.992092        1.000000   
relevant_score       0.081309           0.990572        0.995123   
fluent_score         0.087120           0.993130        0.996107   

                   relevant_score  fluent_score  
eval_score               0.081309      0.087120  
informative_score        0.990572      0.993130  
coherent_score           0.995123      0.996107  
relevant_score           1.000000      0.993676  
fluent_score             0.993676      1.000000  


In [32]:
# Save the rows for which no score could be parsed (eval_score == -1) so they
# can be inspected separately.
unparsed_mask = df["eval_score"].eq(-1)
non_valid_df = df[unparsed_mask]
non_valid_path = filename.replace(".csv", "-eval-non-valid.csv")
non_valid_df.to_csv(non_valid_path, index=False)