<a href="https://colab.research.google.com/github/withpi/cookbook-withpi/blob/main/colabs/Langfuse_PI_Integration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://withpi.ai"><img src="https://play.withpi.ai/logo/logoFullBlack.svg" width="240"></a>

<a href="https://code.withpi.ai"><font size="4">Documentation</font></a>

<a href="https://build.withpi.ai"><font size="4">Copilot</font></a>

This notebook shows how to score the outputs of a Langfuse dataset experiment with Pi-Scorer. [Pi-Scorer](https://build.withpi.ai) offers an alternative to LLM-as-a-judge with several advantages:

* Significantly faster

* Highly consistent — always returns the same score for the same inputs

* Eliminates the need for prompt tuning or adjustments

In [None]:
%%capture
%pip install langfuse openai langchain_openai langchain datasets --upgrade

In [None]:
# @title Setup API Keys

import os
from google.colab import userdata

# Langfuse project credentials (copy them from the project settings page at
# https://cloud.langfuse.com) are read from Colab secrets into the environment.
for secret_name in ("LANGFUSE_PUBLIC_KEY", "LANGFUSE_SECRET_KEY"):
  os.environ[secret_name] = userdata.get(secret_name)

# Langfuse region endpoint: US here; EU projects use "https://cloud.langfuse.com".
os.environ["LANGFUSE_HOST"] = "https://us.cloud.langfuse.com"

# OpenAI key, then the Pi key (create one at https://build.withpi.ai/account/keys).
for secret_name in ("OPENAI_API_KEY", "WITHPI_API_KEY"):
  os.environ[secret_name] = userdata.get(secret_name)

In [None]:
# @title Load a blog-post dataset to Langfuse

from datasets import load_dataset
from langfuse import Langfuse

# Load the blog-post topics from Hugging Face and keep the first five as
# the experiment inputs.
ds = load_dataset("withpi/mlmastery_com_blogs_condensed", split="train")
topics = ds["topic"][:5]
data = [{"input": topic} for topic in topics]
display(data)

# Upload to Langfuse
langfuse = Langfuse()
langfuse.create_dataset(name="blog_posts")
for index, item in enumerate(data):
  langfuse.create_dataset_item(
      dataset_name="blog_posts",
      # Deterministic id: Langfuse upserts dataset items on their id, so
      # re-running this cell updates these five items instead of appending
      # duplicates that every later experiment run would iterate over.
      id=f"blog_posts-{index}",
      # any python object or value
      input=item["input"]
  )

[{'input': 'Tips for beginners to get started with deep learning, including mastering machine learning fundamentals, choosing a framework, understanding neural network architectures, starting with simple projects, and practicing regularly while engaging with the community.'},
 {'input': 'The topic of this blog post is: "Understanding the Data Science Mind Map: A comprehensive guide to essential Python packages for data preparation, visualization, and statistical analysis, with an emphasis on storytelling in data science."'},
 {'input': 'The 5 Most Influential Machine Learning Research Papers of 2024 and Their Contributions to AI Advancement'},
 {'input': 'Cross-validation techniques for comprehensive model evaluation beyond simple train-test splits'},
 {'input': 'Creating an effective machine learning portfolio that demonstrates practical skills and helps land job opportunities in the competitive ML industry.'}]

In [None]:
# @title My custom application setup: blog-post generator

from langfuse.openai import openai
from langfuse.decorators import observe, langfuse_context

@observe()
def run_my_custom_llm_app(input, system_prompt):
  """Ask gpt-4o-mini for a completion and return its text.

  Wrapped with the `observe` decorator imported from `langfuse.decorators`.

  Args:
    input: Content of the user message (the blog-post topic in this notebook).
    system_prompt: Content of the system message that steers the model.

  Returns:
    The message content of the first choice in the chat completion.
  """
  response = openai.chat.completions.create(
      model="gpt-4o-mini",
      messages=[
          {"role": "system", "content": system_prompt},
          {"role": "user", "content": input},
      ],
  )
  first_choice = response.choices[0]
  return first_choice.message.content

In [None]:
# @title Pi-Scorer Setup

import os
import requests

# Pi scoring endpoint that get_score posts to.
PI_API_URL = "https://api.withpi.ai/v1/scoring_system/score"

# Headers sent with each scoring request: a JSON body plus the Pi API key
# read from the environment (None when WITHPI_API_KEY is not set).
HEADERS = {"Content-Type": "application/json"}
HEADERS["x-api-key"] = os.getenv("WITHPI_API_KEY")

def get_score(input: str, output: str, question: str, timeout: float = 30.0):
    """Score one LLM response against a single question with Pi-Scorer.

    Args:
        input: The prompt that was given to the LLM.
        output: The LLM response to evaluate.
        question: The scoring question; sent as a one-entry scoring spec.
        timeout: Seconds to wait for the Pi API before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The "total_score" field of the Pi API's JSON response.

    Raises:
        requests.HTTPError: If the Pi API answers with a 4xx/5xx status
            (for example when the API key is missing or invalid).
        requests.Timeout: If the API does not respond within `timeout`.
    """
    payload = {
        "llm_input": input,
        "llm_output": output,
        "scoring_spec": [{"question": question}],
    }
    response = requests.post(
        PI_API_URL, headers=HEADERS, json=payload, timeout=timeout
    )
    # Fail with the real HTTP error instead of a misleading
    # KeyError('total_score') raised from an error payload.
    response.raise_for_status()
    return response.json()["total_score"]

In [None]:
# @title Setup Langfuse experiment

def run_experiment(experiment_name, system_prompt):
  """Run the blog-post app over the "blog_posts" dataset and attach Pi scores.

  For every dataset item, the app is called with the item's input and the
  given system prompt, and the output is then scored on two questions; each
  score is recorded on the item's trace under the names "conclusion" and
  "pitfall".

  Args:
    experiment_name: Run name passed to `item.observe`.
    system_prompt: System prompt forwarded to `run_my_custom_llm_app`.
  """
  # (score name, Pi question) pairs, evaluated in this order for each item.
  pi_checks = (
      ("conclusion", "Does the post have a conclusion section?"),
      ("pitfall", "Does the post call out potential pitfalls or common mistakes?"),
  )

  blog_dataset = langfuse.get_dataset("blog_posts")
  for item in blog_dataset.items:
    # item.observe() hands back the trace id for this item under the given
    # run name; the scores below are attached to that trace.
    with item.observe(run_name=experiment_name) as trace_id:
      # Generate the post for this item's topic.
      output = run_my_custom_llm_app(item.input, system_prompt)

      # Optional custom evaluations: one Pi-Scorer call per question.
      for score_name, pi_question in pi_checks:
        langfuse.score(
          trace_id=trace_id,
          name=score_name,
          value=get_score(item.input, output, question=pi_question)
        )

In [None]:
# @title Run a Langfuse experiment

from langfuse.decorators import langfuse_context

# System prompt used for the "simple_prompt" experiment run.
SIMPLE_SYSTEM_PROMPT = """You are a specialized blog post writer. Given a topic, write a technical blog post. Here are specific instructions:
- Make sure that the blog is approximately under 500 words
- The blog should be technical in nature with clear instructions
"""

run_experiment(experiment_name="simple_prompt", system_prompt=SIMPLE_SYSTEM_PROMPT)

# Flush the decorator context and the client so all events are sent to the Langfuse API
langfuse_context.flush()
langfuse.flush()

Screenshot of the resulting dataset run in Langfuse:
![Result](https://raw.githubusercontent.com/withpi/cookbook-withpi/main/colabs/Langfuse_screenshot.png)