In [None]:
# Install/upgrade the third-party packages this notebook imports
# (langsmith, pydantic, langchain_ollama) into the kernel's environment.
# NOTE(review): versions are unpinned and -U upgrades to the latest release;
# consider pinning exact versions so the notebook stays reproducible.
%pip install -U langsmith pydantic langchain_ollama


In [4]:
from getpass import getpass
import os

# Prompt only when the key is not already configured, so re-running this cell
# (or running where the variable is pre-set) does not ask for it again.
if not os.environ.get("LANGSMITH_API_KEY"):
    os.environ["LANGSMITH_API_KEY"] = getpass("Enter your LangSmith API Key: ")


In [5]:
from langsmith import wrappers, Client
from pydantic import BaseModel, Field
from langchain_ollama import ChatOllama


In [6]:
client = Client()

# Name of the LangSmith dataset; the evaluation cell passes the same literal
# as its `data=` argument, so keep the two in sync.
DATASET_NAME = "Sample dataset"

# Define dataset examples as (question, reference answer) pairs
examples = [
    ("Which country is Mount Kilimanjaro located in?", "Mount Kilimanjaro is in Tanzania."),
    ("What is Earth's lowest point?", "Earth's lowest point is The Dead Sea."),
]

# Format inputs and expected outputs
inputs = [{"question": q} for q, _ in examples]
outputs = [{"answer": a} for _, a in examples]

# Create the dataset in LangSmith only if it does not exist yet, so this cell
# can be re-run (Restart & Run All) without a name conflict or duplicate examples.
if client.has_dataset(dataset_name=DATASET_NAME):
    dataset = client.read_dataset(dataset_name=DATASET_NAME)
else:
    dataset = client.create_dataset(
        dataset_name=DATASET_NAME,
        description="A sample dataset for evaluation.",
    )
    client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)


In [7]:
def target(inputs: dict) -> dict:
    """Answer one dataset question with the ``llama3.2:latest`` ChatOllama model.

    Args:
        inputs: Example inputs; ``inputs["question"]`` is the question text.

    Returns:
        A dict ``{"response": <answer text>}``.
    """
    llm = ChatOllama(model="llama3.2:latest")

    response = llm.invoke(f"Answer this question accurately: {inputs['question']}")

    # Return the answer text rather than the whole message object, so anything
    # that formats outputs["response"] into a string gets the answer itself
    # instead of the message's repr.
    return {"response": response.content}


In [10]:
# Grading rubric for the evaluator: one rule per line, each newline-terminated.
instructions = (
    "Evaluate Student Answer against Ground Truth for conceptual similarity and classify true or false:\n"
    "- False: No conceptual match and similarity.\n"
    "- True: Most or full conceptual match and similarity.\n"
    "- Key criteria: Concept should match, not exact wording.\n"
)

# Output schema for an evaluation verdict: a single boolean `score` field.
class Grade(BaseModel):
    score: bool = Field(
        description="Indicates whether the response is conceptually accurate.",
    )

# Define function to compare the LLM's response with the expected answer
def accuracy(outputs: dict, reference_outputs: dict) -> bool:
    """Grade the target's answer against the reference answer with an LLM judge.

    The judge receives the module-level ``instructions`` as its system message
    and must reply in the ``Grade`` schema; the returned verdict is the judge's
    ``score``, not a property of the student's own text.

    Args:
        outputs: Output of the target function. ``outputs["response"]`` is the
            answer, either plain text or an object exposing ``.content``.
        reference_outputs: Reference outputs of the dataset example.
            ``reference_outputs["answer"]`` is the ground-truth answer.

    Returns:
        True when the judge scores the answer as conceptually matching the
        ground truth, otherwise False.
    """
    # Normalise the student's answer to text so the prompt contains the answer
    # itself rather than the repr of a message object.
    student = outputs["response"]
    student_answer = student.content if hasattr(student, "content") else str(student)

    evaluation_prompt = f"""
    Ground Truth: {reference_outputs['answer']};
    Student's Answer: {student_answer}
    Evaluate the similarity as per the given criteria.
    """

    # Constrain the judge's reply to the Grade schema instead of searching
    # free text for the substring "true".
    grader = ChatOllama(model="llama3.2:latest").with_structured_output(Grade)
    grade = grader.invoke([("system", instructions), ("human", evaluation_prompt)])

    # NOTE(review): some structured-output modes may yield None when the reply
    # cannot be parsed into Grade; treat that as "not accurate" — confirm this
    # is the desired fallback.
    return bool(grade.score) if grade is not None else False


In [None]:
# Evaluate `target` on the dataset named "Sample dataset", scoring each run
# with the `accuracy` evaluator.
evaluation_settings = {
    "data": "Sample dataset",
    "evaluators": [accuracy],  # accuracy function acts as the evaluator
    "experiment_prefix": "first-eval-in-langsmith",
    "max_concurrency": 2,
}
experiment_results = client.evaluate(target, **evaluation_settings)

print("Experiment completed. View results in LangSmith UI.")
