<a href="https://colab.research.google.com/github/withpi/cookbook-withpi/blob/main/colabs/Langsmith_PI_Integration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://withpi.ai"><img src="https://play.withpi.ai/logo/logoFullBlack.svg" width="240"></a>

<a href="https://code.withpi.ai"><font size="4">Documentation</font></a>

<a href="https://build.withpi.ai"><font size="4">Copilot</font></a>

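This notebook shows how to use [Pi-Scorer](https://build.withpi.ai) as a custom evaluator in a LangSmith evaluation of a blog-post generator. Pi-Scorer offers an alternative to LLM-as-a-judge with several advantages: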

* Significantly faster

* Highly consistent — always returns the same score for the same inputs

* Eliminates the need for prompt tuning or adjustments

In [None]:
%%capture
# Install/upgrade the notebook's dependencies; %%capture hides pip's output.
# NOTE(review): versions are unpinned (-U), so results may change as packages update.
%pip install -U langsmith openevals openai datasets

In [None]:
# @title Setup API Keys

import os
from google.colab import userdata

# Turn on LangSmith tracing. Project keys come from the project settings page:
# https://smith.langchain.com/
os.environ["LANGSMITH_TRACING"] = "true"

# Copy each secret from Colab's userdata into the process environment:
#   LANGSMITH_API_KEY - LangSmith project key (settings page above)
#   OPENAI_API_KEY    - your OpenAI key
#   WITHPI_API_KEY    - Pi API key: https://build.withpi.ai/account/keys
for secret_name in ("LANGSMITH_API_KEY", "OPENAI_API_KEY", "WITHPI_API_KEY"):
    os.environ[secret_name] = userdata.get(secret_name)

In [None]:
# @title Load a blog-post dataset into LangSmith

from datasets import load_dataset
from langsmith import Client

# Load data from Hugging Face. Slicing the rows first (ds[:5]) reads only the
# five rows we need instead of materializing the whole "topic" column.
ds = load_dataset("withpi/mlmastery_com_blogs_condensed", split="train")
topics = ds[:5]["topic"]
examples = [{"inputs": {"topic": topic}} for topic in topics]
display(examples)

# Upload to LangSmith. create_dataset fails if a dataset with this name already
# exists, so reuse it on re-runs; examples are uploaded only when the dataset is
# first created, which also avoids duplicating them.
langsmith_client = Client()
if langsmith_client.has_dataset(dataset_name="blog_posts"):
    dataset = langsmith_client.read_dataset(dataset_name="blog_posts")
else:
    dataset = langsmith_client.create_dataset(dataset_name="blog_posts", description="Blog post generator.")
    display(langsmith_client.create_examples(dataset_id=dataset.id, examples=examples))

[{'inputs': {'topic': 'Tips for beginners to get started with deep learning, including mastering machine learning fundamentals, choosing a framework, understanding neural network architectures, starting with simple projects, and practicing regularly while engaging with the community.'}},
 {'inputs': {'topic': 'The topic of this blog post is: "Understanding the Data Science Mind Map: A comprehensive guide to essential Python packages for data preparation, visualization, and statistical analysis, with an emphasis on storytelling in data science."'}},
 {'inputs': {'topic': 'The 5 Most Influential Machine Learning Research Papers of 2024 and Their Contributions to AI Advancement'}},
 {'inputs': {'topic': 'Cross-validation techniques for comprehensive model evaluation beyond simple train-test splits'}},
 {'inputs': {'topic': 'Creating an effective machine learning portfolio that demonstrates practical skills and helps land job opportunities in the competitive ML industry.'}}]

{'example_ids': ['453b9839-e37b-4c38-a50d-412252eae45c',
  '377f5d49-a258-4cf2-b578-a0e4fd2debc5',
  '6f2aecf0-b72e-4875-be32-08d4c9f0a295',
  '6fa7f560-f011-44d7-82da-7a3f0ec93a82',
  'b3588b58-8b79-4f54-bc6c-42b1109fdad0'],
 'count': 5}

In [None]:
# @title My custom application setup: blog-post generator

from langsmith import wrappers
from openai import OpenAI

# Create the OpenAI client, then wrap it for LangSmith tracing
base_openai_client = OpenAI()
openai_client = wrappers.wrap_openai(base_openai_client)

# Define the application logic you want to evaluate inside a target function
# The SDK will automatically send the inputs from the dataset to your target function
def target(inputs: dict) -> dict:
    """Generate a blog post for one dataset example.

    Args:
        inputs: Example inputs; must contain the key "topic", which is sent to
            the model as the user message.

    Returns:
        A dict with the single key "blogpost" holding the generated text with
        surrounding whitespace stripped, or "" if the response has no text.

    Raises:
        KeyError: If ``inputs`` has no "topic" key.
    """
    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": """You are a specialized blog post writer. Given a topic, write a technical blog post. Here are specific instructions:
- Make sure that the blog is approximately under 500 words
- The blog should be technical in nature with clear instructions
"""},
            {"role": "user", "content": inputs["topic"]},
        ],
    )
    # The message content is optional in the API response; guard against None
    # so an empty completion is scored instead of raising AttributeError.
    content = response.choices[0].message.content
    return {"blogpost": (content or "").strip()}

In [None]:
# @title Pi-Scorer Setup

import os
import requests

# Pi scoring endpoint that get_score posts to
PI_API_URL = "https://api.withpi.ai/v1/scoring_system/score"

# Headers sent with every scoring request. The API key is read from the
# environment once, when this cell runs.
HEADERS = {"Content-Type": "application/json", "x-api-key": os.getenv("WITHPI_API_KEY")}

def get_score(input: str, output: str, question: str, timeout: float = 60.0):
    """Score one LLM response against a single question via the Pi scoring API.

    Args:
        input: The text that was given to the LLM (sent as "llm_input").
        output: The text the LLM produced (sent as "llm_output").
        question: The scoring question (sent as the only "scoring_spec" entry).
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The "total_score" field of the API's JSON response.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        requests.Timeout: If the API does not respond within ``timeout``.
    """
    payload = {
        "llm_input": input,
        "llm_output": output,
        "scoring_spec": [{"question": question}],
    }
    # Without a timeout a stalled connection would hang the whole evaluation run.
    response = requests.post(PI_API_URL, headers=HEADERS, json=payload, timeout=timeout)
    # Fail with the HTTP error (bad key, rate limit, ...) rather than an opaque
    # KeyError on "total_score" when the body is an error payload.
    response.raise_for_status()
    return response.json()["total_score"]

def make_evaluator(key, question):
    """Build an evaluator that scores a generated post against one question.

    Args:
        key: Name reported under "key" in the evaluator's result.
        question: Scoring question passed to ``get_score``.

    Returns:
        A callable ``evaluator(inputs, outputs)`` that reads ``inputs["topic"]``
        and ``outputs["blogpost"]`` and returns ``{"key": ..., "score": ...}``.
    """
    def evaluator(inputs, outputs):
        topic = inputs["topic"]
        blogpost = outputs["blogpost"]
        score = get_score(topic, blogpost, question)
        return {"key": key, "score": score}

    return evaluator

# Scores whether the post has a conclusion section
conclusion_evaluator = make_evaluator(
    key="conclusion",
    question="Does the post have a conclusion section?",
)

# Scores whether the post calls out pitfalls or common mistakes
pitfall_evaluator = make_evaluator(
    key="pitfall",
    question="Does the post call out potential pitfalls or common mistakes?",
)

In [None]:
# @title Run LangSmith evaluation

# Evaluate `target` on the "blog_posts" dataset with both Pi evaluators,
# one example at a time (max_concurrency=1).
experiment_results = langsmith_client.evaluate(
    target,
    data="blog_posts",
    evaluators=[conclusion_evaluator, pitfall_evaluator],
    experiment_prefix="blog_post_eval",
    max_concurrency=1,
)

See the results at https://smith.langchain.com/public/ff72f973-5b38-4d91-a6fe-11960eb17448/d