In [1]:
# Install/upgrade the notebook's dependencies into the kernel's environment.
# NOTE(review): `-U` with no version pins upgrades to whatever is newest; the
# recorded output of this cell shows the upgrade leaving dependency conflicts
# (e.g. langchain-community requiring langsmith<0.3) — consider pinning
# versions or using a dedicated environment.
%pip install -U langsmith pydantic langchain_ollama


Collecting langsmith
  Downloading langsmith-0.3.5-py3-none-any.whl.metadata (14 kB)
Downloading langsmith-0.3.5-py3-none-any.whl (333 kB)
Installing collected packages: langsmith
  Attempting uninstall: langsmith
    Found existing installation: langsmith 0.3.4
    Uninstalling langsmith-0.3.4:
      Successfully uninstalled langsmith-0.3.4
Successfully installed langsmith-0.3.5
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-community 0.3.13 requires langsmith<0.3,>=0.1.125, but you have langsmith 0.3.5 which is incompatible.
open-webui 0.3.35 requires aiohttp==3.10.8, but you have aiohttp 3.10.11 which is incompatible.
open-webui 0.3.35 requires boto3==1.35.0, but you have boto3 1.34.69 which is incompatible.
open-webui 0.3.35 requires chromadb==0.5.9, but you have chromadb 0.5.23 which is incompatible.
open-webui 0.3.35 requires google-generativeai==0.7.2, but you have google-generativeai 0.5.4 which is incompatible.
open-webui 0.3.35 requires langchain==0.2.15, but you have langchain 0.3.17 which is incompatible.
open-webui 0.3.35 requires langchain-community==0.2.12, but you have langchain-community 0.3.13 which is incompatible.
open-webui 0.3.35 requires pydantic==2.9.2, but you have pydantic 2.10.6 which is incompatible.

In [2]:
from getpass import getpass
import os

# Only prompt when the key is not already configured, so re-running the cell
# (or running in an environment that pre-sets the variable) neither asks again
# nor overwrites an existing key. getpass keeps the typed key out of the
# notebook's saved output.
if not os.environ.get("LANGSMITH_API_KEY"):
    os.environ["LANGSMITH_API_KEY"] = getpass("Enter your LangSmith API Key: ")


In [3]:
from langsmith import wrappers, Client
from pydantic import BaseModel, Field
from langchain_ollama import ChatOllama


In [4]:
client = Client()

# Dataset name; must match the `data=` argument passed to client.evaluate.
DATASET_NAME = "Sample dataset 0502"

# Dataset examples as (question, reference answer) pairs
examples = [
    ("Which country is Mount Kilimanjaro located in?", "Mount Kilimanjaro is in Tanzania."),
    ("What is Earth's lowest point?", "Earth's lowest point is The Dead Sea."),
]

# Format inputs and expected outputs in the key layout the target function
# ("question") and the evaluator ("answer") read.
inputs = [{"question": q} for q, _ in examples]
outputs = [{"answer": a} for _, a in examples]

# Make the cell safe to re-run: reuse the dataset when one with this name
# already exists instead of calling create_dataset with a duplicate name.
if client.has_dataset(dataset_name=DATASET_NAME):
    dataset = client.read_dataset(dataset_name=DATASET_NAME)
else:
    dataset = client.create_dataset(
        dataset_name=DATASET_NAME,
        description="A sample dataset for evaluation.",
    )

# Upload the examples only when the dataset is still empty, so repeated runs
# do not duplicate them and a previously created-but-empty dataset gets filled.
dataset_has_examples = any(
    True for _ in client.list_examples(dataset_id=dataset.id, limit=1)
)
if not dataset_has_examples:
    client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)


In [5]:
def target(inputs: dict) -> dict:
    """Answer one dataset question with the local Ollama model.

    Args:
        inputs: Example inputs; must contain the key ``"question"``.

    Returns:
        ``{"response": <answer text>}`` where the value is the model's reply
        as a plain string.
    """
    llm = ChatOllama(model="llama3.2:latest")

    response = llm.invoke(f"Answer this question accurately: {inputs['question']}")

    # Return only the text of the reply. Returning the whole message object
    # would put its repr (including metadata) into any prompt or log that
    # formats outputs["response"] as a string.
    return {"response": response.content}


In [6]:
# Grading rubric text: asks for a true/false classification of a student
# answer against the ground truth, judged on conceptual match rather than
# exact wording.
instructions = """Evaluate Student Answer against Ground Truth for conceptual similarity and classify true or false:
- False: No conceptual match and similarity.
- True: Most or full conceptual match and similarity.
- Key criteria: Concept should match, not exact wording.
"""

# Output schema for the evaluation: a single boolean verdict.
# (Documented with comments rather than a class docstring so the pydantic
# model definition itself is left untouched.)
class Grade(BaseModel):
    # True when the response is judged conceptually accurate, False otherwise.
    score: bool = Field(description="Indicates whether the response is conceptually accurate.")

# Define function to compare the LLM's response with the expected answer
def accuracy(outputs: dict, reference_outputs: dict) -> bool:
    """Grade a model answer against the reference answer with an LLM judge.

    The judge receives the rubric in ``instructions`` as a system message and
    is asked for output matching the ``Grade`` schema; the returned boolean is
    the judge's verdict, not a property of the student's own text.

    Args:
        outputs: Target outputs; must contain the key ``"response"`` (either a
            plain string or a message object exposing ``.content``).
        reference_outputs: Reference outputs; must contain the key ``"answer"``.

    Returns:
        True when the judge classifies the answer as conceptually matching the
        ground truth, False otherwise.

    Raises:
        ValueError: If the judge model returns no structured grade.
    """
    # Use only the answer text; a message object's repr would otherwise leak
    # metadata into the grading prompt.
    student_response = outputs["response"]
    student_answer = str(getattr(student_response, "content", student_response))

    # Constrain the judge to the Grade schema so the verdict is a real boolean
    # instead of a substring search over free-form text.
    grader = ChatOllama(model="llama3.2:latest").with_structured_output(Grade)

    evaluation_prompt = f"""
    Ground Truth: {reference_outputs['answer']};
    Student's Answer: {student_answer}
    Evaluate the similarity as per the given criteria.
    """

    grade = grader.invoke(
        [
            ("system", instructions),
            ("human", evaluation_prompt),
        ]
    )

    # Fail loudly rather than silently scoring an ungraded answer as False.
    if grade is None:
        raise ValueError("Grader returned no structured output for this example.")

    return bool(grade.score)


In [7]:
# Options for the evaluation run: which dataset to read, which evaluator(s)
# to apply to the target's outputs, the experiment name prefix, and the
# concurrency limit.
evaluation_options = dict(
    data="Sample dataset 0502",
    evaluators=[accuracy],
    experiment_prefix="first-eval-in-langsmith",
    max_concurrency=2,
)

# Run `target` through LangSmith's evaluate with the options above.
experiment_results = client.evaluate(target, **evaluation_options)

print("Experiment completed. View results in LangSmith UI.")


View the evaluation results for experiment: 'first-eval-in-langsmith-18df02be' at:
https://smith.langchain.com/o/3dd11e5e-2f8e-4ba7-9b0b-724b6b4535ca/datasets/746d011c-cfbe-41fd-82da-fe9633e003fc/compare?selectedSessions=14df6477-58ac-4bae-aea2-7e6ae280d702




0it [00:00, ?it/s]

Experiment completed. View results in LangSmith UI.
