# Example: Optimizing Data Extraction (NER) with TensorZero

## Setup

In [None]:
import asyncio
import json
from collections import Counter
from typing import Dict, List, Optional, Tuple

import altair as alt
import pandas as pd
from tensorzero import AsyncTensorZeroGateway, InferenceResponse
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio

> **IMPORTANT:** Update the gateway URL below if you're not using the standard setup provided in this example

In [None]:
# Base URL of the TensorZero gateway; passed to `AsyncTensorZeroGateway` when the client is created
TENSORZERO_GATEWAY_URL = "http://localhost:3000"

## Load the Dataset

In [None]:
# Select only a subset of the dataset to speed things up.
# These are upper bounds on the number of rows `load_dataset` keeps from each split.
NUM_TRAIN_DATAPOINTS = 500
NUM_VAL_DATAPOINTS = 500

In [None]:
def load_dataset(path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the CSV at ``path`` and return ``(train_df, val_df)``.

    The CSV must contain a ``split`` column (``0`` marks training rows, ``1``
    marks validation rows) and an ``output`` column of JSON-encoded strings,
    which are decoded with ``json.loads``.

    Each split is shuffled with a fixed seed (so the selection is
    reproducible), re-indexed from zero, and truncated to at most
    ``NUM_TRAIN_DATAPOINTS`` / ``NUM_VAL_DATAPOINTS`` rows respectively.
    """
    # Load the dataset
    df = pd.read_csv(path)
    # Bracket indexing always targets the column, whereas attribute access
    # can be shadowed by a DataFrame attribute of the same name.
    df["output"] = df["output"].apply(json.loads)

    # Split the dataset into train and validation sets
    train_df = df[df["split"] == 0]
    val_df = df[df["split"] == 1]

    # Shuffle the splits
    train_df = train_df.sample(frac=1, random_state=0).reset_index(drop=True)
    val_df = val_df.sample(frac=1, random_state=0).reset_index(drop=True)

    # Select only a subset of the dataset to speed things up
    train_df = train_df.iloc[:NUM_TRAIN_DATAPOINTS]
    val_df = val_df.iloc[:NUM_VAL_DATAPOINTS]

    return train_df, val_df

In [None]:
# Load the shuffled, truncated train and validation splits from the CSV file
train_df, val_df = load_dataset("data/conllpp.csv")

# Sanity check: report (rows, columns) for each split
print(f"Train data shape: {train_df.shape}")
print(f"Validation data shape: {val_df.shape}")

## Extract Entities

> **IMPORTANT:** Reduce the number of concurrent requests if you're running into rate limits

In [None]:
# Maximum number of inference requests allowed in flight at once
# (used as the size of the semaphore that `get_entities` acquires)
MAX_CONCURRENT_REQUESTS = 10

In [None]:
# Shared async client for all inference and feedback calls in this notebook
# NOTE(review): `timeout=15` is presumably in seconds; confirm against the TensorZero client docs
tensorzero_client = AsyncTensorZeroGateway(TENSORZERO_GATEWAY_URL, timeout=15)
# Caps the number of concurrent `get_entities` calls at MAX_CONCURRENT_REQUESTS
semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

In [None]:
async def get_entities(
    text: str,
    variant_name: Optional[str] = None,
    dryrun: bool = False,
) -> Optional[InferenceResponse]:
    """Call the ``extract_entities`` function on ``text`` through the gateway.

    ``variant_name`` and ``dryrun`` are forwarded unchanged to the client.
    Any exception raised by the call is printed and swallowed, in which case
    ``None`` is returned instead of a response.
    """
    request_kwargs = {
        "function_name": "extract_entities",
        "input": {"messages": [{"role": "user", "content": text}]},
        "dryrun": dryrun,
        "variant_name": variant_name,
    }

    # Hold a semaphore slot for the duration of the request to avoid rate limits
    async with semaphore:
        try:
            result = await tensorzero_client.inference(**request_kwargs)
        except Exception as e:
            # Best-effort: report the failure and let the caller handle `None`
            print(f"Error occurred: {type(e).__name__}: {e}")
            result = None

    return result

In [None]:
# Run inference in parallel to speed things up
responses = await tqdm_asyncio.gather(
    *[get_entities(text) for text in train_df["input"]]
)

## Evaluate the Performance

In [None]:
def flatten_dict(d: Dict[str, List[str]]) -> List[str]:
    """Flatten an entity mapping into a list of type-tagged strings.

    Every element of every list becomes ``"__<KEY>__::<element>"``, with the
    key upper-cased. Output order follows the dict's iteration order, then
    each list's order; duplicates are preserved. Each value must be a list
    (checked with ``assert``).
    """
    tagged: List[str] = []
    for entity_type, entities in d.items():
        assert isinstance(entities, list)
        tagged.extend(
            f"__{entity_type.upper()}__::{entity}" for entity in entities
        )
    return tagged


# Exact match between the predicted and ground truth entities (the strictest metric we use to evaluate NER)
def compute_exact_match(
    predicted: Dict[str, List[str]], ground_truth: Dict[str, List[str]]
) -> bool:
    """Return True when both mappings flatten to the same set of tagged entities.

    The comparison is set-based, so entity order and duplicate counts are ignored.
    """
    predicted_entities = set(flatten_dict(predicted))
    expected_entities = set(flatten_dict(ground_truth))
    return predicted_entities == expected_entities


# Jaccard similarity between the predicted and ground_truth entities
# (a more lenient metric that gives partial credit for correct entities)
# This is a different implementation from the original code by Predibase, so the metrics won't be directly comparable.
def compute_jaccard_similarity(
    predicted: Dict[str, List[str]], ground_truth: Dict[str, List[str]]
) -> float:
    """Return the multiset Jaccard similarity of the two entity mappings.

    Both mappings are flattened to tagged entity strings and counted. The
    score is the sum of per-entity minimum counts divided by the sum of
    per-entity maximum counts, so duplicated entities are weighted by how
    often they occur. When neither side contains any entity the score is
    ``1.0`` (two empty predictions agree perfectly).
    """
    target_count = Counter(flatten_dict(ground_truth))
    pred_count = Counter(flatten_dict(predicted))

    # Counter `&` keeps the per-key minimum and `|` the per-key maximum,
    # which are exactly the multiset intersection and union.
    num = sum((target_count & pred_count).values())
    den = sum((target_count | pred_count).values())

    if den == 0:
        # Return a float so the result always matches the annotated type
        return 1.0
    return num / den

In [None]:
def evaluate_response(
    response: Optional[InferenceResponse], ground_truth_data: Dict[str, List[str]]
):
    """Score a single inference response against its ground truth.

    Returns a ``(valid_output, exact_match, jaccard_similarity)`` tuple:

    - ``valid_output``: whether a parsed output is available at all.
    - ``exact_match``: result of ``compute_exact_match``; ``False`` when there
      is no parsed output.
    - ``jaccard_similarity``: result of ``compute_jaccard_similarity``; ``0.0``
      when there is no parsed output.

    A missing response (``None``) is treated the same as an unparsable one.
    """
    predicted = response.output.parsed if response is not None else None

    # `predicted` is None if the model failed to return a valid JSON that complies with the output schema
    valid_output = predicted is not None

    # Gate the metrics on the same `is not None` check as `valid_output`.
    # Testing truthiness instead would score a valid but empty prediction
    # (e.g. `{}`) as a failure even when the ground truth is empty too.
    if not valid_output:
        return valid_output, False, 0.0

    exact_match = compute_exact_match(predicted, ground_truth_data)
    jaccard_similarity = compute_jaccard_similarity(predicted, ground_truth_data)

    return valid_output, exact_match, jaccard_similarity

In [None]:
for response, ground_truth in tqdm(
    zip(responses, train_df["output"]), total=len(responses)
):
    # Don't send feedback if the request failed completely
    if response is None:
        continue

    # Evaluate the example
    valid_output, exact_match, jaccard_similarity = evaluate_response(
        response, ground_truth
    )

    # Send the metrics feedback to TensorZero
    await tensorzero_client.feedback(
        metric_name="valid_output",
        value=valid_output,
        inference_id=response.inference_id,
    )

    await tensorzero_client.feedback(
        metric_name="exact_match",
        value=exact_match,
        inference_id=response.inference_id,
    )

    await tensorzero_client.feedback(
        metric_name="jaccard_similarity",
        value=jaccard_similarity,
        inference_id=response.inference_id,
    )

    # Send the demonstration feedback to TensorZero
    await tensorzero_client.feedback(
        metric_name="demonstration",
        value=ground_truth,
        inference_id=response.inference_id,
    )

## Validation Set

> **IMPORTANT:** Update the list below when you create new variants in `tensorzero.toml`

In [None]:
# Include the variants in `tensorzero.toml` that we want to evaluate
VARIANTS_TO_EVALUATE = [
    "gpt_4o",
    "gpt_4o_mini",
    # "gpt_4o_mini_fine_tuned",
]

In [None]:
scores = {}  # variant_name => (valid_output, exact_match, jaccard_similarity)

for variant_name in VARIANTS_TO_EVALUATE:
    # Run inference on the validation set
    responses = await tqdm_asyncio.gather(
        *[
            get_entities(
                text,
                variant_name=variant_name,  # pin to the specific variant we want to evaluate
                dryrun=True,  # don't store results to avoid leaking data
            )
            for text in val_df["input"]
        ],
        desc=f"Evaluating variant: {variant_name}",
    )

    # Evaluate the performance of the variant
    valid_output_scores = []
    exact_match_scores = []
    jaccard_similarity_scores = []

    for response, ground_truth in zip(responses, val_df["output"]):
        valid_output, exact_match, jaccard_similarity = evaluate_response(
            response, ground_truth
        )
        valid_output_scores.append(valid_output)
        exact_match_scores.append(exact_match)
        jaccard_similarity_scores.append(jaccard_similarity)

    scores[variant_name] = {
        "valid_output": valid_output_scores,
        "exact_match": exact_match_scores,
        "jaccard_similarity": jaccard_similarity_scores,
    }

    # Print the performance of the variant
    print(f"Valid Output: {sum(valid_output_scores) / len(valid_output_scores):.1%}")
    print(f"Exact Match: {sum(exact_match_scores) / len(exact_match_scores):.1%}")
    print(
        f"Jaccard Similarity (mean): {sum(jaccard_similarity_scores) / len(jaccard_similarity_scores):.1%}"
    )
    print()

## Plot Results

In [None]:
# Build one row per (variant, metric) pair holding the mean score,
# in long format so the chart can group bars by metric.
PLOTTED_METRICS = ("exact_match", "jaccard_similarity")

score_rows = []
for variant_name, variant_scores in scores.items():
    for metric_name in PLOTTED_METRICS:
        metric_values = variant_scores[metric_name]
        score_rows.append(
            {
                "Variant": variant_name,
                "Metric": metric_name,
                "Score": sum(metric_values) / len(metric_values),
            }
        )

scores_df = pd.DataFrame(score_rows)

In [None]:
chart = (
    alt.Chart(scores_df)
    .encode(
        x=alt.X("Score:Q", axis=alt.Axis(format="%"), scale=alt.Scale(domain=[0, 1])),
        y="Variant:N",
        yOffset="Metric:N",
        color="Metric:N",
        text=alt.Text("Score:Q", format=".1%"),
    )
    .properties(title="Metrics by Variant")
)

chart = chart.mark_bar() + chart.mark_text(align="left", dx=2)

chart