### Simple Evaluator

In [1]:
from langsmith.schemas import Example, Run

def correct_label(inputs: dict, reference_outputs: dict, outputs: dict) -> dict:
    """Exact-match evaluator: score 1 when the predicted label equals the reference label.

    Args:
        inputs: Example inputs (unused; kept so the signature matches the
            other evaluators in this notebook).
        reference_outputs: Reference outputs; the expected label is read
            from the "label" key.
        outputs: Run outputs; the predicted label is read from the "output" key.

    Returns:
        ``{"score": 0 or 1, "key": "correct_label"}``.

    Note:
        Both lookups use ``.get``, so a missing key yields ``None``. If both
        keys are absent the two ``None`` values compare equal and the score is 1.
    """
    # The key was previously misspelled "lable", so the reference lookup
    # always returned None and correct predictions were scored 0.
    expected_label = reference_outputs.get("label")
    predicted_label = outputs.get("output")
    return {"score": int(predicted_label == expected_label), "key": "correct_label"}

### LLM-as-Judge Evaluation

In [None]:
from dotenv import load_dotenv
# Load variables from the .env file one directory above the notebook.
# override=True asks python-dotenv to replace variables that are already set
# in the process environment with the values from the file.
load_dotenv(dotenv_path="../.env", override=True)

True

In [3]:
from openai import OpenAI
from pydantic import BaseModel, Field

In [4]:
# Module-level OpenAI client used by the LLM-as-judge evaluator functions below.
# No arguments are passed here — presumably credentials come from the
# environment populated by load_dotenv; confirm against the .env file.
client = OpenAI()

In [5]:
class Similarity_Score(BaseModel):
    # Structured-output schema handed to the OpenAI parse call as
    # `response_format` by the evaluators in this notebook; it carries a
    # single integer field holding the judged similarity.
    similarity_score: int = Field(
        description="Semantic similarity score between 1 and 10, where 1 means unrelated and 10 means identical.",
    )

In [6]:
def compare_semantic_similarity_score(inputs: dict, reference_outputs: dict, outputs: dict):
    """LLM-as-judge evaluator: ask the model how close an answer is to the reference answer.

    Args:
        inputs: Example inputs; must contain "question".
        reference_outputs: Reference outputs; must contain "output"
            (the answer treated as correct).
        outputs: Run outputs; must contain "output" (the answer being judged).

    Returns:
        ``{"similarity_score": parsed}`` where ``parsed`` is the value the
        client exposes as ``message.parsed`` for the ``Similarity_Score``
        response format (returned as-is, not unwrapped to a bare int).

    Raises:
        KeyError: If one of the required keys is missing.
    """
    question = inputs["question"]
    reference_answer = reference_outputs["output"]
    candidate_answer = outputs["output"]

    system_prompt = (
        "You are a semantic similarity evaluator. Compare the meanings of two responses to a question, "
        "Reference Response and New Response, where the reference is the correct answer, and we are trying to judge if the new response is similar. "
        "Provide a score between 1 and 10, where 1 means completely unrelated, and 10 means identical in meaning."
    )
    # The candidate is labelled "New Response" so the user message uses the
    # exact names the system prompt introduces. It was previously labelled
    # "Run Response", a name the system prompt never mentions.
    user_prompt = (
        f"Question: {question}\n"
        f"Reference Response: {reference_answer}\n"
        f"New Response: {candidate_answer}"
    )

    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        response_format=Similarity_Score,
    )

    return {"similarity_score": completion.choices[0].message.parsed}

In [7]:
# Manual smoke test of compare_semantic_similarity_score: hand-written dicts
# stand in for a dataset example and a run so the evaluator can be called directly.

# From Dataset Example
inputs = {
  "question": "အဏုဇီဝဗေဒ ဆိုတာ ဘာလဲ ရှင်းပြပါ"
}
reference_outputs = {
  "output": "အဏုဇီဝဗေဒသည် မျက်စိဖြင့် မမြင်နိုင်သော အဏုဇီဝ သက်ရှိများကို လေ့လာသည့် သိပ္ပံပညာရပ် ဖြစ်သည် ။ ဤပညာရပ်တွင် ဘက်တီးရီးယား ၊ ဗိုင်းရပ်စ်၊ မှို၊ နှင့် protozoa များကို အဓိက လေ့လာသည် ။"
}


# From Run
outputs = {
  "output": "အဏုဇီဝဗေဒဆိုသည်မှာ လူနာများကို ကုသရာတွင် သုံးသော ဗေဒပညာဖြစ်သည်။"
}


# Note: `similarity_score` holds the evaluator's whole return dict
# ({"similarity_score": ...}), not a bare number, which is why the printed
# value is nested.
similarity_score = compare_semantic_similarity_score(inputs, reference_outputs, outputs)
print(f"Semantic similarity score: {similarity_score}")

Semantic similarity score: {'similarity_score': Similarity_Score(similarity_score=2)}


In [8]:
from langsmith.schemas import Run, Example

def _get_field(record, name: str):
    """Return `name` from `record`, by key if it is subscriptable, else by attribute."""
    try:
        return record[name]
    except TypeError:
        # Objects that do not support subscripting expose the field as an
        # attribute instead.
        return getattr(record, name)


def compare_semantic_similarity_v2(root_run: "Run | dict", example: "Example | dict"):
    """Run/example-style wrapper around ``compare_semantic_similarity_score``.

    Accepts either plain dicts (as in the sample cell of this notebook) or
    objects that expose ``inputs`` / ``outputs`` as attributes. The original
    body subscripted both arguments, which fails with ``TypeError`` for
    attribute-style objects.

    Args:
        root_run: The run being judged; its ``outputs["output"]`` is the
            candidate answer.
        example: The dataset example; ``inputs["question"]`` is the question
            and ``outputs["output"]`` is the reference answer.

    Returns:
        Whatever ``compare_semantic_similarity_score`` returns, i.e.
        ``{"similarity_score": ...}``.

    Raises:
        KeyError / AttributeError: If a required field is missing.
    """
    # The prompt, model and response parsing are identical to the
    # dict-based evaluator, so delegate instead of duplicating the API call.
    return compare_semantic_similarity_score(
        inputs=_get_field(example, "inputs"),
        reference_outputs=_get_field(example, "outputs"),
        outputs=_get_field(root_run, "outputs"),
    )

In [9]:
# Manual smoke test of compare_semantic_similarity_v2 using plain dicts.
# Hand-written stand-in for a run; only sample_run["outputs"]["output"] is
# read by the evaluator.
sample_run = {
  "name": "Sample Run",
  "inputs": {
    "question": "အဏုဇီဝဗေဒ ဆိုတာ ဘာလဲ ရှင်းပြပါ"
  },
  "outputs": {
    "output": "အဏုဇီဝဗေဒသည် မျက်စိဖြင့် မမြင်နိုင်သော အဏုဇီဝ သက်ရှိများကို လေ့လာသည့် သိပ္ပံပညာရပ် ဖြစ်သည် ။"
  },
  "is_root": True,
  "status": "success",
  "extra": {
    "metadata": {
      "key": "biology-test"
    }
  }
}

# Hand-written stand-in for a dataset example; the evaluator reads
# ["inputs"]["question"] and ["outputs"]["output"] (the reference answer).
# The run output above repeats the first sentence of this reference answer.
sample_example = {
  "inputs": {
    "question": "အဏုဇီဝဗေဒ ဆိုတာ ဘာလဲ ရှင်းပြပါ"
  },
  "outputs": {
    "output": "အဏုဇီဝဗေဒသည် မျက်စိဖြင့် မမြင်နိုင်သော အဏုဇီဝ သက်ရှိများကို လေ့လာသည့် သိပ္ပံပညာရပ် ဖြစ်သည် ။ ဤပညာရပ်တွင် ဘက်တီးရီးယား ၊ ဗိုင်းရပ်စ်၊ မှို၊ နှင့် protozoa များကို အဓိက လေ့လာသည် ။"
  },
  "metadata": {
    "dataset_split": [
      "biology",
      "base"
    ]
  }
}

# `similarity_score` is rebound here to the v2 evaluator's return dict.
similarity_score = compare_semantic_similarity_v2(sample_run, sample_example)
print(f"Semantic similarity score: {similarity_score}")


Semantic similarity score: {'similarity_score': Similarity_Score(similarity_score=9)}
