In [None]:
import asyncio
import json
import logging
import os
import random
from typing import Dict, Optional

import numpy as np
import pandas as pd
from dataset.dataset import get_examples, is_correct
from scipy.stats import norm
from tensorzero import AsyncTensorZeroGateway, InferenceResponse
from tqdm.asyncio import tqdm_asyncio

In [None]:
# Async client for the TensorZero gateway at http://localhost:3000.
# timeout=20.0 is presumably in seconds — confirm against the client docs.
tensorzero_client = AsyncTensorZeroGateway("http://localhost:3000", timeout=20.0)
# Module-level logger; the parsing helpers later in this notebook emit warnings through it.
logger = logging.getLogger(__name__)

In [None]:
# Shuffle with a fixed seed so that the `[:N]` slices taken in later cells select the
# same examples after every kernel restart. Without this, each variant would be
# evaluated on a different random subset of the test set, adding noise to comparisons.
SHUFFLE_SEED = 0
shuffle_rng = random.Random(SHUFFLE_SEED)

train_examples = get_examples("train")
shuffle_rng.shuffle(train_examples)
print(train_examples[0])
test_examples = get_examples("test")
shuffle_rng.shuffle(test_examples)

In [None]:
async def solve_math_problem(
    question: str,
    client: AsyncTensorZeroGateway,
    *,
    variant_name: Optional[str] = None,
    dryrun: bool = False,
) -> Optional[InferenceResponse]:
    """
    Call the `solve_math_problem` TensorZero function on a single question.

    Args:
        question: Text of the problem; sent as the `question` field of one user message.
        client: Gateway client used to issue the inference request.
        variant_name: Forwarded unchanged to `client.inference`.
        dryrun: Forwarded unchanged to `client.inference`.

    Returns:
        The inference response, or `None` if the call raised any exception.
    """
    try:
        response: InferenceResponse = await client.inference(
            function_name="solve_math_problem",
            input={"messages": [{"role": "user", "content": {"question": question}}]},
            variant_name=variant_name,
            dryrun=dryrun,
        )
    except Exception as e:
        # Best-effort by design: a single failed request must not abort a batch of
        # concurrent inferences, so report it concisely and signal failure with None.
        logger.warning(f"Inference failed ({type(e).__name__}): {e}")
        return None
    return response

In the function below, the only feedback provided to TensorZero is whether the output of the function is correct.
We do not provide the correct answer in cases of mistakes.


In [None]:
async def solve_grade_math_problem(
    example: Dict[str, str],
    client: AsyncTensorZeroGateway,
    *,
    variant_name: Optional[str] = None,
    dryrun: bool = False,
) -> Optional[bool]:
    """
    Solve one example, grade the answer, and report the grade as `correct` feedback.

    Args:
        example: Dataset example; its `question` entry is sent to the model and the
            whole example is handed to `is_correct` for grading.
        client: Gateway client used for both the inference and the feedback call.
        variant_name: Forwarded to the inference call.
        dryrun: Forwarded to both the inference and the feedback call.

    Returns:
        The grade returned by `is_correct`, or `None` when the inference failed,
        returned no content, or its first content block is not text.
    """
    response = await solve_math_problem(
        example["question"], client, variant_name=variant_name, dryrun=dryrun
    )
    if response is None:
        return None
    # An empty content list cannot be graded; treat it like a failed inference
    # instead of raising IndexError on `content[0]`.
    if not response.content:
        return None
    first_block = response.content[0]
    if first_block.type != "text":
        return None
    correct = is_correct(first_block.text, example)
    try:
        await client.feedback(
            metric_name="correct",
            value=correct,
            inference_id=response.inference_id,
            dryrun=dryrun,
        )
    except Exception as e:
        # Mirror the inference error handling: one failed feedback request should not
        # abort every other in-flight task. The grade itself is still valid.
        logger.warning(
            f"Feedback failed for inference {response.inference_id} "
            f"({type(e).__name__}): {e}"
        )
    return correct

In [None]:
# Upper bound on how many `run_inference` calls may hold the semaphore at once.
MAX_CONCURRENT_INFERENCES = 10

# Shared semaphore acquired by `run_inference` to enforce the bound above.
semaphore = asyncio.Semaphore(MAX_CONCURRENT_INFERENCES)


async def run_inference(
    example: Dict[str, str], *, variant_name: Optional[str] = None, dryrun: bool = False
) -> Optional[bool]:
    """
    Solve and grade one example while holding a slot of the shared semaphore.

    Delegates to `solve_grade_math_problem` with the notebook-wide
    `tensorzero_client` and returns whatever that call returns.
    """
    async with semaphore:
        graded = await solve_grade_math_problem(
            example, tensorzero_client, variant_name=variant_name, dryrun=dryrun
        )
    return graded

Run the TensorZero function on the training examples, grade the answers, and store the feedback in the database.


In [None]:
# Number of (shuffled) training examples to run through the function.
NUM_TRAINING_INFERENCES = 1000
# No variant_name and dryrun left at its default (False) are passed through here.
coroutines = [
    run_inference(example) for example in train_examples[:NUM_TRAINING_INFERENCES]
]
# Await all coroutines with a progress bar; concurrency is limited inside `run_inference`.
results = await tqdm_asyncio.gather(*coroutines, desc="Running training inferences")

In the cell below, we evaluate the accuracy of a variant on some of the test examples. If you generate a new variant, you should run this cell with the new variant name to evaluate it.
BONUS: Try evaluating the variants named `gpt-35-turbo-baseline` and `gpt-35-turbo-best-of-5` in this cell (they evaluate vanilla GPT 3.5 Turbo and our best-of-5 variant, respectively).


In [None]:
NUM_TEST_INFERENCES = 200
variant_name = "llama-8b-baseline"
# We use dryrun=True here to avoid leaking the test set into the database
coroutines = [
    run_inference(example, variant_name=variant_name, dryrun=True)
    for example in test_examples[:NUM_TEST_INFERENCES]
]

results = await tqdm_asyncio.gather(*coroutines, desc="Running test inferences")
# Filter out None values from results
total_results = len(results)
results = [result for result in results if result is not None]
success_rate = len(results) / total_results
print(f"Success rate: {success_rate:.1%}")

accuracy = sum(results) / len(results)
n = len(results)
z = norm.ppf(0.975)  # 95% confidence interval
margin_of_error = z * np.sqrt((accuracy * (1 - accuracy)) / n)

print(f"Accuracy: {accuracy:.4f}")
print(
    f"95% Confidence Interval: ({accuracy - margin_of_error:.4f}, {accuracy + margin_of_error:.4f})"
)

At this point, we could run any TensorZero recipe to generate a new variant which might perform better using this historical data. You can go try this!

Below, we include an example of how to use an external library, [DSPy](https://dspy-docs.vercel.app/), to automatically optimize a prompt for this function.
Because TensorZero stores historical inference and feedback data in ClickHouse, which is easy to query into Pandas DataFrames, you can integrate this data with nearly any ML library yourself.


## Automated Prompt Engineering with DSPy

The rest of this notebook shows how we can pull data from the TensorZero data model in ClickHouse and use it to optimize a prompt for a function using DSPy.
Given that there are many strategies for prompt optimization in DSPy, we can use the same code skeleton to try a lot of different strategies.
However, there are a few things (table name, feedback name, chat function type, etc.) that we have set specifically for this example.
You can change them to fit your needs.
At a high level, the notebook below does the following:

1. Pull data from ClickHouse and convert it into a DSPy dataset.
2. Run a prompt optimization loop using one of the teleprompting classes supported by DSPy.
3. Print the optimized prompt from the history so that you can write it to a minijinja file.

**Note:** DSPy does not model the chat completion interface commonly used by language models. So, we only support functions that have inputs into the user prompt, that only use text output, that are single-turn functions, and that have a flat JSON schema for input, i.e. functions that take a list of primitive types as input into the user schema and output text or a flat JSON object.

To get started:

- Set the `TENSORZERO_CLICKHOUSE_URL` environment variable. 
- Set the `OPENAI_API_KEY` environment variable.
- Update the following parameters to those that apply to your use case.


In [None]:
import dspy
from clickhouse_connect import get_client
from dspy.datasets import Dataset

You can swap the client below for any of the ones supported [here](https://dspy-docs.vercel.app/api/category/language-model-api-clients) in case you want DSPy to use a different language model.


In [None]:
# Model identifier handed to the DSPy Together client.
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
lm_client = dspy.Together(model=model_name)
# Register the client as the LM in DSPy's configuration; `lm_client.history` is read later
# in this notebook to recover the generated prompt.
dspy.configure(lm=lm_client)

In [None]:
# A simple function signature for the `solve_math_problem` function.
# The field names match how the data is used below: the dataset column and the
# keyword argument of the example inference are both called `input`.
function_signature = "input -> output"

Initialize the ClickHouse client.


In [None]:
# Fail early with a readable message if the connection string is missing.
assert "TENSORZERO_CLICKHOUSE_URL" in os.environ, (
    "TENSORZERO_CLICKHOUSE_URL environment variable not set"
)
# The environment variable's value is passed as the client's DSN.
clickhouse_client = get_client(dsn=os.environ["TENSORZERO_CLICKHOUSE_URL"])

Grab the dataset of examples which were successful according to the metric.


In [None]:
# Select inferences of `solve_math_problem` whose most recent `correct` feedback is true.
# The subquery ranks every `correct` feedback row per inference by recency (rn = 1 is
# the newest). The `value = true` filter is applied only AFTER the newest row has been
# chosen; filtering inside the subquery would instead pick the newest *true* row and
# wrongly keep inferences whose feedback was later changed to false.
query = """
SELECT
    i.variant_name,
    i.input,
    i.output,
    i.episode_id,
    f.value
FROM
    ChatInference i
JOIN
    (SELECT
        target_id,
        value,
        ROW_NUMBER() OVER (PARTITION BY target_id ORDER BY timestamp DESC) as rn
    FROM BooleanMetricFeedback
    WHERE
        metric_name = 'correct'
    ) f ON i.id = f.target_id and f.rn = 1
WHERE
    i.function_name = 'solve_math_problem'
    AND f.value = true
LIMIT %(max_samples)s
"""

# Bound into the query's `%(max_samples)s` placeholder.
params = {
    "max_samples": 1000,
}

df = clickhouse_client.query_df(query, params)

df.head()

In [None]:
def parse_dspy_compatible_inputs(input_raw: str) -> Optional[Dict[str, str]]:
    """
    Checks that the input of this Inference is in the correct format for DSPy.
    Then returns the dictionary of inputs.

    The accepted shape is a JSON object with exactly one message, whose `content`
    is a list holding exactly one item of type `text`; that item's `value` is returned.

    Args:
        input_raw: The serialized input of an inference.

    Returns:
        The `value` of the single text content item, or `None` (after logging a
        warning) if the input does not have the accepted shape. Malformed input
        never raises.
    """
    try:
        parsed_input = json.loads(input_raw)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers non-string values such as None or NaN in the column
        logger.warning(f"Input is not valid JSON: {input_raw}")
        return None
    if not isinstance(parsed_input, dict):
        logger.warning(f"Input is not a JSON object: {input_raw}")
        return None
    messages = parsed_input.get("messages")
    if messages is None:
        logger.warning(f"Input contains no messages: {input_raw}")
        return None
    if not isinstance(messages, list) or len(messages) != 1:
        # Covers zero messages as well as several (single-turn functions only)
        logger.warning(f"Input must contain exactly one message: {input_raw}")
        return None
    message = messages[0]
    if not isinstance(message, dict):
        logger.warning(f"Input message is not a JSON object: {input_raw}")
        return None
    content = message.get("content")
    if content is None:
        logger.warning(f"Input contains no content: {input_raw}")
        return None
    if not isinstance(content, list) or len(content) != 1:
        logger.warning(f"Input must contain exactly one content item: {input_raw}")
        return None
    item = content[0]
    # `.get` so that an item without a `type` key is rejected instead of raising KeyError
    if not isinstance(item, dict) or item.get("type") != "text":
        logger.warning(f"Input contains non-text content: {input_raw}")
        return None
    value = item.get("value")
    if value is None:
        logger.warning(f"Input contains no value: {input_raw}")
        return None
    return value

In [None]:
# Apply the parser to every row of the `input` column; rows it rejects come back as None
# and are removed later by `dropna`.
parsed_inputs = df["input"].apply(parse_dspy_compatible_inputs)

In [None]:
def parse_outputs(output_raw: str) -> Optional[str]:
    """
    Extract the text of an inference output that consists of a single text block.

    Args:
        output_raw: The serialized output of an inference, expected to be a JSON
            list with exactly one item of type `text`.

    Returns:
        The `text` of that single block, or `None` (after logging a warning) if the
        output does not have the expected shape. Malformed output never raises.
    """
    try:
        parsed_output = json.loads(output_raw)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers non-string values such as None or NaN in the column
        logger.warning(f"Output is not valid JSON: {output_raw}")
        return None
    if not isinstance(parsed_output, list) or len(parsed_output) != 1:
        # Covers empty lists, multiple blocks, and non-list JSON values
        logger.warning(f"Output must contain exactly one content block: {output_raw}")
        return None
    block = parsed_output[0]
    # `.get` so that a block without a `type` key is rejected instead of raising KeyError
    if not isinstance(block, dict) or block.get("type") != "text":
        logger.warning(f"Output contains non-text content: {output_raw}")
        return None
    value = block.get("text")
    if value is None:
        logger.warning(f"Output contains no value: {output_raw}")
        return None
    return value

In [None]:
# Apply the parser to every row of the `output` column; rows it rejects come back as None
# and are removed later by `dropna`.
parsed_outputs = df["output"].apply(parse_outputs)

In [None]:
# Place the parsed inputs and outputs side by side as columns, aligned on the row index.
all_data = pd.concat([parsed_inputs, parsed_outputs], axis=1)
# Drop every row where either parser returned None.
all_data = all_data.dropna()

In [None]:
# Preview the first rows of the cleaned input/output pairs.
all_data.head()

In [None]:
class TensorZeroDSPyDataset(Dataset):
    """
    DSPy dataset built from a DataFrame of parsed inference inputs and outputs.

    The rows are shuffled with a fixed seed, each `input` value is replaced by its
    `question` entry, and the trailing `dev_fraction` of the rows is held out as
    the dev split. No test split is created.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        dev_fraction: float = 0.1,
    ):
        # Shuffle deterministically; this yields a new frame, so the column
        # assignment below does not write into the caller's DataFrame.
        shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

        # Keep only the question text from each input dictionary
        shuffled["input"] = shuffled["input"].apply(lambda item: item["question"])

        # The dev split takes the rounded-down fraction; train takes the rest
        row_count = len(shuffled)
        train_count = row_count - int(row_count * dev_fraction)

        # Turn each split into a list of per-row dictionaries
        self._train = shuffled.iloc[:train_count].to_dict(orient="records")
        self._dev = shuffled.iloc[train_count:].to_dict(orient="records")
        self._test = None

        self.train_size = len(self._train)
        self.dev_size = len(self._dev)
        super().__init__(
            train_size=self.train_size,
            dev_size=self.dev_size,
            test_size=0,
        )

        print(f"Train set: {len(self._train)} samples")
        print(f"Dev set: {len(self._dev)} samples")

In [None]:
# Build the train/dev splits from the cleaned pairs, using the default dev_fraction of 0.1.
dataset = TensorZeroDSPyDataset(all_data)

In [None]:
# NOTE(review): `dspy_function` is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
dspy_function = dspy.Predict(function_signature)


class Predictor(dspy.Module):
    """Minimal DSPy module that wraps a single `dspy.Predict` step."""

    def __init__(self, signature: dspy.Signature):
        # NOTE(review): this notebook passes the string `function_signature` here rather
        # than a `dspy.Signature` instance — confirm whether the annotation should be widened.
        super().__init__()
        # The wrapped predictor; `forward` delegates every call to it.
        self.prog = dspy.Predict(signature)

    def forward(self, **inputs):
        """Pass all keyword arguments to the wrapped predictor and return its result."""
        return self.prog(**inputs)

You can swap the teleprompter with any of the teleprompting classes supported by DSPy [here](https://dspy-docs.vercel.app/docs/building-blocks/optimizers).


In [None]:
from dspy.teleprompt import LabeledFewShot

# k=5 is presumably the number of labeled demonstrations to include — confirm in the DSPy docs.
teleprompter = LabeledFewShot(k=5)
# Compile a fresh `Predictor` against the training split of the dataset.
optimized_function = teleprompter.compile(
    Predictor(function_signature), trainset=dataset.train
)

In [None]:
# Peek at the question of the first dev example.
# NOTE(review): this raises IndexError if the dev split is empty, which happens when
# int(total_samples * dev_fraction) is 0 (fewer than 10 rows at the default fraction).
input_ = dataset.dev[0]["input"]
print(input_)

We run an example inference to get the prompt from the history.


In [None]:
# The placeholder input is irrelevant: the cell that reads the history discards the final
# `---` section of the prompt, which is where the input is expected to appear.
optimized_function(input="test_input")

Let's parse out the prompt from the history.


In [None]:
# Take the prompt of the most recent call recorded in the LM client's history
dspy_prompt = lm_client.history[-1]["prompt"]
# we parse the actual inference input out of the prompt (DSPy does not separate the prompt from the inputs in this history)
# Concretely: split on the `---` separator, drop the last section, and re-join the rest.
dspy_prompt = "---".join(dspy_prompt.split("---")[:-1])

# DSPy does not know the output format for GSM8k, so we add it manually:
# the answer-format instructions are placed before and after the DSPy-generated prompt.
merged_prompt = f"""
You are tasked with solving a math problem. You will be given an open-ended question that should require arithmetic to solve.

Feel free to work through the problem step-by-step in your response, but once you have found the solution, please complete your response with:
#### your_answer


---

{dspy_prompt}

---

REMEMBER: End your response with `#### your_answer`, where `your_answer` should be an integer with no other punctuation.
""".strip()

print(merged_prompt)

Write the optimized user prompt to a minijinja file and try it out! You can go back to the evaluation cell above and use the new variant name to evaluate it.
