In [None]:
import asyncio
import os
from pprint import pprint

import pandas as pd
from sklearn.metrics import confusion_matrix
from tensorzero import AsyncTensorZeroGateway, DICLOptimizationConfig
from tqdm.asyncio import tqdm

## Load the SMS spam classification dataset

In [None]:
def load_data(path="data/clean_data.csv"):
    """Read the SMS spam CSV, print summary statistics, and return its two splits.

    Args:
        path: Location of the CSV file. The file must provide an `is_train`
            column (1 marks a training row, 0 a validation row) and a `class`
            column (1 marks spam). Defaults to the path used by this notebook.

    Returns:
        A `(train_df, val_df)` tuple of DataFrames. Rows whose `is_train`
        value is neither 0 nor 1 end up in neither split.
    """
    df = pd.read_csv(path)

    # Split dataset into training and validation sets
    train_df = df[df["is_train"] == 1]
    val_df = df[df["is_train"] == 0]

    # Print dataset statistics
    print("Training Samples: ", train_df.shape[0])
    print("Validation Samples: ", val_df.shape[0])

    # An empty file would otherwise raise ZeroDivisionError; report NaN instead.
    total_rows = df.shape[0]
    spam_rows = df[df["class"] == 1].shape[0]
    spam_share = spam_rows / total_rows if total_rows else float("nan")
    print(f"Spam {spam_share:.2%}")

    return train_df, val_df


# Load both splits once; later cells reuse `train_df` and `val_df`.
train_df, val_df = load_data()
# Show the first five training rows as this cell's output.
train_df.head(5)

## Set up the TensorZero Gateway client

In [None]:
t0 = await AsyncTensorZeroGateway.build_embedded(
    clickhouse_url="http://chuser:chpassword@localhost:8123/tensorzero",
    config_file="config/tensorzero.toml",
)

## Building a dataset for optimization

Let's convert the SMS spam dataset to the TensorZero format.
For educational purposes, let's store the dataset in TensorZero and query it back later.

Alternatively, you could use historical inferences and feedback to build samples for optimization.
See the documentation for `t0.experimental_list_inferences` for more information.


In [None]:
def df_to_tensorzero_datapoints(df):
    """Build one `classify_spam` datapoint dict per row of `df`.

    Args:
        df: DataFrame with a `text` column (the message body) and a `class`
            column, where the value 1 marks spam.

    Returns:
        A list of dicts, in row order. Each dict has the keys
        `function_name`, `input` (a single user message holding the row's
        text) and `output` (`{"spam": <class == 1>}`).
    """

    def to_datapoint(record):
        # One user turn carrying the raw SMS text.
        user_message = {"role": "user", "content": record["text"]}
        return {
            "function_name": "classify_spam",
            "input": {"messages": [user_message]},
            "output": {"spam": record["class"] == 1},
        }

    return [to_datapoint(record) for _, record in df.iterrows()]


async def insert_datapoints(t0, df, dataset_name="spam_train"):
    """Convert `df` to TensorZero datapoints and bulk-insert them into a dataset.

    Args:
        t0: Gateway client exposing an awaitable `bulk_insert_datapoints`.
        df: DataFrame accepted by `df_to_tensorzero_datapoints`.
        dataset_name: Name of the target dataset. Defaults to "spam_train".

    Returns:
        None. The first datapoint is pretty-printed as a sanity check. When
        `df` yields no datapoints, a notice is printed and the gateway is not
        called.
    """
    # Convert our DataFrame into a list of TensorZero datapoints
    datapoints = df_to_tensorzero_datapoints(df)

    # Indexing an empty list below would raise an opaque IndexError.
    if not datapoints:
        print(f"No datapoints to insert into {dataset_name!r}; skipping.")
        return

    # Print the first datapoint for sanity checking
    pprint(datapoints[0])

    # Insert the datapoints into the TensorZero dataset
    await t0.bulk_insert_datapoints(dataset_name=dataset_name, datapoints=datapoints)


# Convert the training dataset to TensorZero datapoints and store them in TensorZero
# NOTE(review): every run of this cell issues another bulk insert; presumably that
# duplicates the rows when the cell is re-run — confirm, and reset the dataset if so.
await insert_datapoints(t0, train_df)

## Launch the dynamic in-context learning optimization workflow

Let's load the stored datapoints from TensorZero.

In [None]:
# Read the datapoints back from the "spam_train" dataset.
# NOTE(review): `limit=100_000` presumably caps the number of results — confirm it
# exceeds the dataset size, otherwise the training set would be truncated.
stored_datapoints = await t0.list_datapoints(dataset_name="spam_train", limit=100_000)

Let's render these datapoints. The stored datapoint is a variant-agnostic representation of the datapoint.
Rendering makes the datapoints ready for the optimization workflow.
The `experimental_render_samples` function applies templates and schemas, loads input files from object storage, and so on.


In [None]:
# Render the stored datapoints into samples for the optimization workflow.
# The `variants` mapping pairs the `classify_spam` function with its "baseline"
# variant — presumably the variant whose templates are applied; confirm in the docs.
train_samples = await t0.experimental_render_samples(
    stored_samples=stored_datapoints,
    variants={"classify_spam": "baseline"},
)

Finally, let's launch the optimization workflow.

In [None]:
# Launch DICL optimization for `classify_spam` with the rendered training samples,
# using the "dicl" variant name that the evaluation cells refer to later.
# NOTE(review): `job_handle` is not used again in the visible cells — confirm the
# optimization has finished before evaluating the "dicl" variant.
job_handle = await t0.experimental_launch_optimization(
    train_samples=train_samples,
    optimization_config=DICLOptimizationConfig(
        embedding_model="openai::text-embedding-3-small",
        function_name="classify_spam",
        variant_name="dicl",
        append_to_existing_variants=True,
    ),
)

## Compare the baseline and the DICL variants

Let's define a function that runs inference and parses the classification result.

In [None]:
async def infer_spam(t0, text, variant_name=None):
    """Classify one message with the `classify_spam` function.

    Args:
        t0: Gateway client exposing an awaitable `inference` method.
        text: Message body, sent as a single user message.
        variant_name: Variant to pass to the gateway; `None` is forwarded as-is.

    Returns:
        The boolean stored under the `spam` key of the parsed output.

    Raises:
        AssertionError: If the parsed output is missing or its `spam` value is
            not a bool.
    """
    user_turn = {"role": "user", "content": text}

    response = await t0.inference(
        function_name="classify_spam",
        variant_name=variant_name,
        input={"messages": [user_turn]},
        cache_options={"enabled": "on"},
    )

    parsed_output = response.output.parsed
    assert parsed_output is not None

    verdict = parsed_output.get("spam")
    assert isinstance(verdict, bool)
    return verdict

Let's create a semaphore to limit the number of concurrent inference requests to the API.
Adjust this value based on your API rate limit.

In [None]:
# Upper bound on inference requests in flight at once; tune to your API rate limit.
MAX_CONCURRENT_INFERENCES = 50
semaphore = asyncio.Semaphore(MAX_CONCURRENT_INFERENCES)

Let's define a function that evaluates a variant's performance on an entire dataset.

In [None]:
async def process_row(row, variant_name=None):
    """Classify one dataset row and pair the prediction with its true label.

    Args:
        row: Mapping-like row providing `text` and `class` entries.
        variant_name: Variant forwarded to `infer_spam`.

    Returns:
        A `(predicted_is_spam, real_is_spam)` tuple; the second element is
        `bool(row["class"])`.
    """
    message = row["text"]

    # The module-level semaphore limits how many inferences run concurrently.
    async with semaphore:
        prediction = await infer_spam(t0, message, variant_name=variant_name)

    label = bool(row["class"])
    return prediction, label


async def evaluate_variant(df, variant_name):
    """Run a variant over every row of `df` and print its classification metrics.

    Args:
        df: DataFrame whose rows are accepted by `process_row`.
        variant_name: Name of the variant to evaluate.

    Returns:
        None. Prints the confusion-matrix counts followed by F1, precision and
        recall. A metric whose denominator is zero is printed as `nan` rather
        than raising `ZeroDivisionError`.
    """
    results = await tqdm.gather(
        *[process_row(row, variant_name) for _, row in df.iterrows()]
    )

    # Pin the label order so the matrix is always 2x2. Without `labels`, a run in
    # which only one class appears yields a 1x1 matrix and the unpacking fails.
    tn, fp, fn, tp = (
        confusion_matrix(
            y_true=[actual for _, actual in results],
            y_pred=[predicted for predicted, _ in results],
            labels=[False, True],
        )
        .ravel()
        .tolist()
    )

    def ratio(numerator, denominator):
        # Undefined metrics (e.g. precision with no positive predictions) become NaN.
        return numerator / denominator if denominator else float("nan")

    print(f"True Positives: {tp}")
    print(f"True Negatives: {tn}")
    print(f"False Positives: {fp}")
    print(f"False Negatives: {fn}")
    print(f"F1 Score: {ratio(2 * tp, 2 * tp + fp + fn):.2f}")
    print(f"Precision: {ratio(tp, tp + fp):.2f}")
    print(f"Recall: {ratio(tp, tp + fn):.2f}")

In [None]:
# Evaluate the "baseline" variant on the validation split (rows with is_train == 0).
await evaluate_variant(val_df, "baseline")

In [None]:
# Evaluate the "dicl" variant on the same validation split for a like-for-like comparison.
await evaluate_variant(val_df, "dicl")

At the time of writing, the DICL variant materially outperforms the baseline variant:

- False Positives: 37 → 21
- False Negatives: 7 → 3
- F1 Score: 0.85 → 0.91
- Precision: 77% → 86%
- Recall: 95% → 97%
