In [9]:
from typing import Literal

from dotenv import load_dotenv
from langchain import hub
from langchain.chat_models import init_chat_model
from langsmith import evaluate
from pydantic import BaseModel, Field

load_dotenv()

from langchain.output_parsers import StructuredOutputParser

# See the prompt: https://smith.langchain.com/hub/langchain-ai/pairwise-evaluation-2
# prompt = hub.pull("langchain-ai/pairwise-evaluation-2")
from langchain.prompts import ChatPromptTemplate


# Structured-output schema for the pairwise judge; it is handed to
# `model.with_structured_output(...)` when the chain is built.
# NOTE(review): the docstring and Field descriptions below are presumably
# forwarded to the model as part of the schema — treat them as prompt text.
class PreferenceResult(BaseModel):
    """Result of the preference evaluation between two AI responses"""

    # The judge's verdict: exactly one of "A", "B" or "Tie".
    preferred_assistant: Literal["A", "B", "Tie"] = Field(description="Which assistant provided the better response - A, B, or Tie if equal")
    # Free-text justification for the verdict.
    explanation: str = Field(description="Detailed explanation of the reasoning behind the preference, analyzing the quality, accuracy, and effectiveness of the responses")


# Pairwise judge prompt. The system message carries the judging rubric; the
# human message supplies the question and both candidate answers, each wrapped
# in matching "[The Start of ...] ... [The End of ...]" delimiters.
# NOTE(review): the rubric mentions "neither answer is satisfactory", but
# PreferenceResult only allows "A", "B" or "Tie" — confirm this is intended.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. You may choose one assistant that follows the user's instructions and answers the user's question better, indicate if both answers are equally good, or indicate if neither answer is satisfactory. Each evaluation should be made independently without comparing to previous evaluations. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. Begin your evaluation by comparing the two responses and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible."),
        # Assistant B's opening delimiter now has its leading "[" so both answers are framed identically.
        ("human", "[User Question] {question}\n[The Start of Assistant A's Answer] {answer_a} [The End of Assistant A's Answer]\n[The Start of Assistant B's Answer] {answer_b} [The End of Assistant B's Answer]"),
    ]
)


# Judge model; its output is constrained to the PreferenceResult schema.
model = init_chat_model("gpt-4o")
chain = prompt | model.with_structured_output(PreferenceResult)

# Display the assembled prompt template as the cell output.
prompt

ChatPromptTemplate(input_variables=['answer_a', 'answer_b', 'question'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template="Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. You may choose one assistant that follows the user's instructions and answers the user's question better, indicate if both answers are equally good, or indicate if neither answer is satisfactory. Each evaluation should be made independently without comparing to previous evaluations. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. Begin your evaluation by comparing the two responses and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not infl

In [10]:
# Smoke-test the judge chain on two hand-written answer pairs
test_question = "What is the capital of France?"
response_a = "The capital of France is Paris, a global center for art, fashion, and culture. It's known for iconic landmarks like the Eiffel Tower."
response_b = "Paris is France's capital city."

# Case 1: both answers are correct, but A carries more detail than B
result = chain.invoke(
    dict(question=test_question, answer_a=response_a, answer_b=response_b)
)
print(f"Evaluation result: {result}")

# Case 2: A and B are the same text, so a tie is the expected verdict
test_question_2 = "How do you make a peanut butter sandwich?"
response_a_2 = "To make a peanut butter sandwich: 1. Take two slices of bread 2. Spread peanut butter evenly on one slice 3. Place the second slice on top 4. Optional: cut diagonally for easier eating"
response_b_2 = response_a_2

result_2 = chain.invoke(
    dict(question=test_question_2, answer_a=response_a_2, answer_b=response_b_2)
)
print(f"Evaluation result 2: {result_2}")

Evaluation result: preferred_assistant='A' explanation="Assistant A provides a more comprehensive answer by not only stating that Paris is the capital of France but also adding additional context about Paris being a global center for art, fashion, and culture, and mentioning the Eiffel Tower as an iconic landmark. This additional detail makes the response more informative and engaging. On the other hand, Assistant B gives a correct answer but lacks the extra depth and context provided by Assistant A. Therefore, Assistant A's response is preferred."
Evaluation result 2: preferred_assistant='Tie' explanation='Both Assistant A and Assistant B provided identical responses to the question on how to make a peanut butter sandwich. Each response clearly outlines the steps needed to make the sandwich, including an optional step to cut it diagonally. The instructions are simple, straightforward, and easy to follow, making both responses equally satisfactory in terms of helpfulness, relevance, ac

In [None]:
def ranked_preference(inputs: dict, outputs: list[dict]) -> list:
    """Pairwise evaluator: ask the judge chain which of two outputs is better.

    Args:
        inputs: Example inputs; must contain a ``"question"`` key.
        outputs: Outputs of the two experiments being compared. ``outputs[0]``
            is shown to the judge as Assistant A and ``outputs[1]`` as
            Assistant B. Each is read via its ``"answer"`` key, falling back
            to ``"N/A"`` when that key is missing.

    Returns:
        One score per output, in the same order as ``outputs``:
        ``[1, 0]`` when A is preferred, ``[0, 1]`` when B is preferred, and
        ``[0, 0]`` for a tie (or any other verdict).
    """
    verdict = chain.invoke(
        {
            "question": inputs["question"],
            "answer_a": outputs[0].get("answer", "N/A"),
            "answer_b": outputs[1].get("answer", "N/A"),
        }
    )

    # The chain is built with `with_structured_output(PreferenceResult)`, so
    # the verdict is an object whose `preferred_assistant` is "A", "B" or
    # "Tie" — not a dict with a numeric "Preference" key.
    scores_by_choice = {"A": [1, 0], "B": [0, 1]}
    return scores_by_choice.get(verdict.preferred_assistant, [0, 0])

In [None]:
# Run the pairwise comparison: the first argument is a pair of existing
# experiments, and `ranked_preference` is the evaluator applied to them.
# NOTE(review): `randomize_order=True` presumably shuffles which experiment is
# presented first to counter position bias — confirm against the LangSmith docs.
evaluate(
    ("experiment-1", "experiment-2"),  # Replace with the names/IDs of your experiments
    evaluators=[ranked_preference],
    randomize_order=True,
    max_concurrency=4,
)

In [1]:
from dotenv import load_dotenv
from langsmith import Client
from pydantic import BaseModel, Field

# Load settings from a local .env file before constructing the client
# (presumably the LangSmith / OpenAI API keys — confirm).
load_dotenv()

# Shared LangSmith client used by the dataset-creation and evaluation cells.
client = Client()

In [2]:
# For other dataset creation methods, see:
# https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_programmatically
# https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_in_application

# Create inputs and reference outputs as (question, reference answer) pairs
examples = [
    (
        "Which country is Mount Kilimanjaro located in?",
        "Mount Kilimanjaro is located in Tanzania.",
    ),
    (
        "What is Earth's lowest point?",
        "Earth's lowest point is The Dead Sea.",
    ),
]

# Split the pairs into the parallel lists that `create_examples` takes
inputs = [{"question": input_prompt} for input_prompt, _ in examples]
outputs = [{"answer": output_answer} for _, output_answer in examples]

try:
    # Programmatically create a dataset in LangSmith
    dataset = client.create_dataset(dataset_name="Sample dataset", description="A sample dataset in LangSmith.")

    # Add examples to the dataset
    client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)
except Exception as e:
    # Stay best-effort so the notebook keeps running (presumably the common
    # case is re-running against an already-existing dataset), but report the
    # failure instead of hiding it — auth or network errors would otherwise
    # go unnoticed until the evaluation cell.
    print(f"Dataset setup skipped ({type(e).__name__}): {e}")

In [3]:
# Define the application logic you want to evaluate inside a target function
# The SDK will automatically send the inputs from the dataset to your target function
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI


def target(inputs: dict) -> dict:
    """Answer one dataset question with the application under evaluation.

    Args:
        inputs: Example inputs; the ``"question"`` value is sent as the user message.

    Returns:
        A dict holding the model's reply text under the ``"response"`` key.
    """
    question_prompt = ChatPromptTemplate.from_messages([("user", "{question}")])
    pipeline = question_prompt | ChatOpenAI(model="gpt-4o-mini")
    reply = pipeline.invoke({"question": inputs["question"]})
    return {"response": reply.content}

In [4]:
# Define instructions for the LLM judge evaluator.
# This text is used verbatim as the system message of the grading prompt,
# so any edit here changes what the judge model is told.
instructions = """Evaluate Student Answer against Ground Truth for conceptual similarity and classify true or false: 
- False: No conceptual match and similarity
- True: Most or full conceptual match and similarity
- Key criteria: Concept should match, not exact wording.
"""


# Define output schema for the LLM judge.
# It is passed to `llm.with_structured_output(...)` by the accuracy evaluator.
# NOTE(review): the Field descriptions are presumably sent to the model as
# part of the schema — treat them as prompt text.
class Grade(BaseModel):
    # True when the response is judged accurate relative to the reference answer.
    score: bool = Field(description="Boolean that indicates whether the response is accurate relative to the reference answer")
    # The judge's justification for the score.
    explanation: str = Field(description="Explanation of the grading decision")


# Define LLM judge that grades the accuracy of the response relative to reference output
def accuracy(outputs: dict, reference_outputs: dict) -> "Grade":
    """Grade an application response against the reference answer with an LLM judge.

    Args:
        outputs: Output of the application under evaluation; its
            ``"response"`` value is graded as the student's answer.
        reference_outputs: Reference outputs of the example; its ``"answer"``
            value is used as the ground truth.

    Returns:
        The structured ``Grade`` produced by the judge, exposing ``score``
        and ``explanation``. The previous ``-> bool`` annotation did not match
        this; the returned object itself is unchanged.

    NOTE(review): when this function is passed to ``client.evaluate``,
    LangSmith may expect a bool/number/dict rather than a ``Grade`` object —
    confirm, and return ``score`` (or a dict) from a thin wrapper if so.
    """
    grading_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", instructions),
            ("user", "Ground Truth answer: {answer}; Student's Answer: {response}"),
        ]
    )
    # Constrain the judge's reply to the Grade schema.
    judge = ChatOpenAI(model="gpt-4o-mini").with_structured_output(Grade)
    grade = (grading_prompt | judge).invoke(
        {"answer": reference_outputs["answer"], "response": outputs["response"]}
    )
    return grade

In [5]:
# Test with dummy data: two differently worded statements of the same fact,
# shaped like the dicts the evaluator receives ("response" vs. "answer" keys).
test_outputs = {"response": "The Earth revolves around the Sun"}
test_reference = {"answer": "Our planet Earth orbits the Sun in an elliptical path"}
res = accuracy(test_outputs, test_reference)
# Show the full structured grade (score and explanation).
print(res)

score=True explanation="Both answers describe the same concept: the movement of Earth in relation to the Sun. The term 'orbits' in the ground truth is synonymous with 'revolves' in the student's answer, and both imply the elliptical nature of this movement."


In [6]:
# Pull just the boolean verdict off the structured result from the previous cell.
res.score

True

In [5]:
# Run `target` over the dataset named "Sample dataset" and grade every output
# with the evaluators listed below.
# After running the evaluation, a link will be provided to view the results in langsmith
experiment_results = client.evaluate(
    target,
    data="Sample dataset",  # must match the dataset_name used when the dataset was created
    evaluators=[
        accuracy,
        # can add multiple evaluators here
    ],
    experiment_prefix="first-eval-in-langsmith",
    max_concurrency=2,
)

View the evaluation results for experiment: 'first-eval-in-langsmith-c36b9a4e' at:
https://smith.langchain.com/o/eb122562-97bd-51d4-9e3f-86c9acffa2bc/datasets/5089c08c-d8d9-41fe-9474-f412e34bdcec/compare?selectedSessions=852bf841-39cc-4d60-89d9-70b6cff9ef1b




0it [00:00, ?it/s]