# Understanding Failure Modes with AI-Assisted Root Cause Analysis

This recipe allows TensorZero users to analyze the failure modes of their LLM applications with the help of AI-assisted root cause analysis.
Since TensorZero automatically logs all inferences and feedback, it is straightforward to analyze the failure modes of your LLM application on your data.

To get started:

- Set the `TENSORZERO_CLICKHOUSE_URL` environment variable. For example: `TENSORZERO_CLICKHOUSE_URL="http://chuser:chpassword@localhost:8123/tensorzero"`
- Set the `OPENAI_API_KEY` environment variable.
- Update the following parameters:

In [None]:
# Path to the TensorZero configuration file of the application being analyzed
CONFIG_PATH = "../../examples/data-extraction-ner/config/tensorzero.toml"

# Name of the function (declared in the configuration above) whose inferences are analyzed
FUNCTION_NAME = "extract_entities"

# Name of the metric (declared in the configuration above) whose feedback identifies failures
METRIC_NAME = "jaccard_similarity"

# The name of the variant to use to grab the templates used for root cause analysis
TEMPLATE_VARIANT_NAME = "gpt_4o_mini"

# Optional list of tools available if your function supports them.
# Each entry is formatted as a dictionary:
# {"name": "<The tool's identifier.>", "description": "<A brief description of what the tool does.>"}
# These will be passed to the assistant to aid in root cause analysis.
TOOLS_AVAILABLE = []

# If the metric is a float metric, you can set the threshold to define a failure and filter the data.
# Feedback below the threshold counts as a failure when the metric is optimized for "max",
# feedback above it otherwise.
FLOAT_METRIC_THRESHOLD = 0.5

# Maximum number of samples to use for root cause analysis (applied as the query's LIMIT)
MAX_SAMPLES = 100_000

# The name of the variant to use for root cause and failure mode analysis
ANALYSIS_VARIANT_NAME = "o4-mini"

# Embedding model to use for root cause and failure mode analysis
EMBEDDING_MODEL = "text-embedding-3-small"

# Number of root cause clusters to use (passed as the mixture model's `n_components`)
N_CLUSTERS = 3

In [None]:
import asyncio
import json
import os
from pathlib import Path
from pprint import pprint
from typing import Any, Dict, List, Optional

import altair as alt
import numpy as np
import pandas as pd
import toml
from clickhouse_connect import get_client
from minijinja import Environment
from openai import AsyncOpenAI
from sklearn.decomposition import PCA
from sklearn.mixture import BayesianGaussianMixture
from tensorzero import AsyncTensorZeroGateway
from tqdm.asyncio import tqdm_asyncio
from utils import generate_root_causes

Load the TensorZero configuration file.

In [None]:
# Resolve the configuration path and fail early if it is missing or not a regular file.
config_path = Path(CONFIG_PATH)
assert config_path.exists(), f"{CONFIG_PATH} does not exist"
assert config_path.is_file(), f"{CONFIG_PATH} is not a file"

# Read the whole file as text and parse the TOML document into a dictionary.
config = toml.loads(config_path.read_text())

Retrieve the metric configuration.

In [None]:
# The configuration must declare a `[metrics]` section containing `METRIC_NAME`.
assert "metrics" in config, "No `[metrics]` section found in config"
available_metrics = config["metrics"]
assert METRIC_NAME in available_metrics, (
    f"No metric named `{METRIC_NAME}` found in config"
)

# Keep the metric's settings around; later cells read its `type`, `level` and `optimize` fields.
metric = available_metrics[METRIC_NAME]

# Display the metric configuration as the cell output.
metric

Retrieve the configuration for the variant with the templates we want to analyze.

In [None]:
# Walk down the configuration one level at a time, checking each level before using it.
assert "functions" in config, "No `[functions]` section found in config"
functions_section = config["functions"]

assert FUNCTION_NAME in functions_section, (
    f"No function named `{FUNCTION_NAME}` found in config"
)
function_config = functions_section[FUNCTION_NAME]

assert "variants" in function_config, (
    f"No variants section found for function `{FUNCTION_NAME}`"
)
assert TEMPLATE_VARIANT_NAME in function_config["variants"], (
    f"No variant named `{TEMPLATE_VARIANT_NAME}` found in function `{FUNCTION_NAME}`"
)

# The function type selects the inference table and how outputs are rendered later on.
function_type = function_config["type"]
variant = function_config["variants"][TEMPLATE_VARIANT_NAME]

# Display the variant configuration as the cell output.
variant

Retrieve the system, user, and assistant templates in the variant (if any), and initialize a minijinja environment with them.

In [None]:
# (role, variant key) pairs for the templates a variant may declare.
# Template paths in the variant are resolved relative to the configuration file's directory.
template_keys = (
    ("assistant", "assistant_template"),
    ("system", "system_template"),
    ("user", "user_template"),
)

# Load the text of every template the variant actually declares, keyed by role.
templates = {
    role: (config_path.parent / variant[key]).read_text()
    for role, key in template_keys
    if key in variant
}

# `None` when the variant has no system template; later cells branch on this.
system_template = templates.get("system")

env = Environment(templates=templates)

Initialize the ClickHouse client.

In [None]:
# The ClickHouse connection string must be supplied through the environment.
clickhouse_url = os.environ.get("TENSORZERO_CLICKHOUSE_URL")
assert clickhouse_url is not None, (
    "TENSORZERO_CLICKHOUSE_URL environment variable not set"
)

clickhouse_client = get_client(dsn=clickhouse_url)

Determine the ClickHouse table name for the function.

In [None]:
# Each supported function type has its own inference table.
inference_tables_by_type = {"chat": "ChatInference", "json": "JsonInference"}

if function_type not in inference_tables_by_type:
    raise ValueError(f"Unsupported function type: {function_type}")

inference_table_name = inference_tables_by_type[function_type]

Determine the ClickHouse table name for the metric.

In [None]:
# Each supported metric type has its own feedback table.
feedback_tables_by_type = {
    "float": "FloatMetricFeedback",
    "boolean": "BooleanMetricFeedback",
}

if metric["type"] not in feedback_tables_by_type:
    raise ValueError(f"Unsupported metric type: {metric['type']}")

feedback_table_name = feedback_tables_by_type[metric["type"]]

Determine the correct join key to use for the metric on the inference table.

In [None]:
# Feedback targets either a whole episode or a single inference, depending on the metric level;
# pick the inference-table column that the feedback's `target_id` is joined against.
join_keys_by_level = {
    "episode": "episode_id",
    "inference": "id",
}

if metric["level"] not in join_keys_by_level:
    raise ValueError(f"Unsupported metric level: {metric['level']}")

inference_join_key = join_keys_by_level[metric["level"]]

Query the inferences and feedback from ClickHouse.

If the metric is a float metric, we need to filter the data for failures based on the threshold.

In [None]:
assert "optimize" in metric, "Metric is missing the `optimize` field"

# Float metrics use the user-supplied threshold. Other (boolean) metrics use 0.5,
# which presumably separates False from True once the value is compared numerically.
threshold = FLOAT_METRIC_THRESHOLD if metric["type"] == "float" else 0.5
# A failure is a value below the threshold when maximizing, above it otherwise.
comparison_operator = "<" if metric["optimize"] == "max" else ">"

# The subquery ranks all feedback for the metric per target (newest first) and the join keeps
# only the newest row (`rn = 1`). The failure filter is applied AFTER that selection, in the
# outer WHERE clause: filtering inside the subquery would pick the newest *failing* feedback
# and report targets whose most recent feedback actually passes.
#
# Table names, the join key and the operator are interpolated from the fixed lookups above;
# all user-provided values are passed as query parameters.
query = f"""
SELECT
    i.variant_name,
    i.input,
    i.output,
    i.id,
    f.value,
    i.episode_id,
FROM {inference_table_name} AS i
JOIN (
    SELECT
        target_id,
        value,
        ROW_NUMBER() OVER (PARTITION BY target_id ORDER BY timestamp DESC) AS rn
    FROM
        {feedback_table_name}
    WHERE
        metric_name = %(metric_name)s
) f ON i.{inference_join_key} = f.target_id AND f.rn = 1
WHERE
    i.function_name = %(function_name)s
    AND f.value {comparison_operator} %(threshold)s
LIMIT %(max_samples)s
"""

params = {
    "function_name": FUNCTION_NAME,
    "metric_name": METRIC_NAME,
    "threshold": threshold,
    "max_samples": MAX_SAMPLES,
}

df = clickhouse_client.query_df(query, params)

# Preview the failing samples as the cell output.
df.head()

Render the inputs using the templates.

In [None]:
def render_message(content: List[Dict[str, Any]], role: str, env: Environment) -> str:
    """Render the content blocks of one message into a single string.

    Args:
        content: Content blocks of the message. Each block is a dictionary with a
            `type` of `text` (with a `value` that is either a string or a dictionary
            of template arguments), `tool_call` (with `name` and `arguments`) or
            `tool_result` (with `name` and `result`).
        role: Role of the message, `user` or `assistant`. It is also the name of the
            template used to render `text` blocks whose value is not a string.
        env: Template environment used to render template arguments.

    Returns:
        The rendered blocks concatenated in order. When the message has more than one
        block, every block (including the last) is followed by a newline.

    Raises:
        AssertionError: If `role` is not `user` or `assistant`.
        ValueError: If a block has an unsupported `type`.
    """
    assert role in ["user", "assistant"], f"Invalid role: {role}"
    delimiter = "\n" if len(content) > 1 else ""

    parts: List[str] = []
    for block in content:
        block_type = block["type"]
        if block_type == "text":
            value = block["value"]
            if isinstance(value, str):
                parts.append(value)
            else:
                # Non-string values are the arguments of the role's template.
                parts.append(env.render_template(role, **value))  # type: ignore
        elif block_type == "tool_call":
            parts.append(f"Tool call {block['name']}: {block['arguments']}")
        elif block_type == "tool_result":
            parts.append(f"Tool result {block['name']}: {block['result']}")
        else:
            raise ValueError(f"Unsupported content type: {block_type}")

    # Every block gets the delimiter, including template-rendered ones, so that
    # multi-block messages never run two blocks together.
    return "".join(part + delimiter for part in parts)


def render_input(sample: Dict[str, Any]) -> Dict[str, List[Dict[str, Any]]]:
    """Render one sampled inference into a system prompt plus a list of chat messages.

    Args:
        sample: A row holding the JSON-encoded `input` and `output` of an inference.

    Returns:
        A dictionary with a `messages` list (the rendered input messages followed by
        the model output as an `assistant` message) and, when a system template or
        system data is present, a `system` entry.

    Raises:
        ValueError: If a chat output does not consist of exactly one content block,
            or if the output block type or the function type is unsupported.
    """
    parsed_input = json.loads(sample["input"])
    rendered: Dict[str, Any] = {}

    # A system entry exists whenever there is a system template or system data.
    # With a template the data is used as template arguments; without one it is kept as is.
    system_data = parsed_input.get("system", {})
    has_system_data = len(system_data) > 0
    if system_template:
        rendered["system"] = env.render_template("system", **system_data)
    elif has_system_data:
        rendered["system"] = system_data

    # Render every input message in order.
    conversation: List[Dict[str, Any]] = []
    for turn in parsed_input["messages"]:
        text = render_message(turn["content"], turn["role"], env)
        conversation.append({"role": turn["role"], "content": text})

    # Work out the text of the model output, which closes the conversation.
    completion = json.loads(sample["output"])
    if function_type == "chat":
        if len(completion) != 1:
            raise ValueError(
                f"Output {completion} must have exactly one content block."
            )
        block = completion[0]
        block_type = block["type"]
        if block_type == "text":
            reply = block["text"]
        elif block_type == "tool_call":
            reply = f"Tool call {block['name']}: {block['arguments']}"
        elif block_type == "tool_result":
            reply = f"Tool result {block['name']}: {block['result']}"
        else:
            raise ValueError(f"Unsupported output type: {block_type}")
    elif function_type == "json":
        reply = completion["raw"]
    else:
        raise ValueError(f"Unsupported function type: {function_type}")

    conversation.append({"role": "assistant", "content": reply})
    rendered["messages"] = conversation
    return rendered


# Render every sampled row (`axis=1` hands `render_input` one row at a time) and keep
# the result next to the raw data in a new `rendered_input` column.
df["rendered_input"] = df.apply(render_input, axis=1)
# Preview the result as the cell output.
df.head()

## Root Cause Analysis

Generate a list of root causes for each failure.

In [None]:
gateway = await AsyncTensorZeroGateway.build_embedded(
    config_file="config/tensorzero.toml",
    clickhouse_url=os.environ["TENSORZERO_CLICKHOUSE_URL"],
)
semaphore = asyncio.Semaphore(10)

In [None]:
tasks = [
    generate_root_causes(
        gateway=gateway,
        row=row.to_dict(),
        variant_name=ANALYSIS_VARIANT_NAME,
        metric_name=METRIC_NAME,
        semaphore=semaphore,
        tools_available=TOOLS_AVAILABLE,
        dryrun=True,
    )
    for _, row in df.iterrows()
]

root_causes = await tqdm_asyncio.gather(*tasks)

In [None]:
# Entries of `root_causes` may be `None` (presumably when the analysis of a sample
# failed); those samples are left out of the rest of the notebook.
succeeded = [root_cause is not None for root_cause in root_causes]

# Join each sample's list of root causes into a single newline-separated string.
root_causes_concat = [
    "\n".join(root_cause) for root_cause in root_causes if root_cause is not None
]

# Drop the same samples from `df` so that it stays aligned row-for-row with
# `root_causes_concat`; otherwise assigning the root causes as a column later on
# fails with a length mismatch. The length check makes re-running this cell a no-op
# once the rows have already been dropped.
if len(df) == len(succeeded):
    df = df[succeeded].reset_index(drop=True)

## Failure Mode Analysis

**Step 1**: Get a vector representation of each root cause.

In [None]:
# Async OpenAI client used for the embedding requests below.
# Presumably picks up credentials from `OPENAI_API_KEY`, which the setup steps ask you to set.
openai_client = AsyncOpenAI()

In [None]:
async def get_embedding(text: str) -> Optional[list[float]]:
    """Request the embedding of `text` from the configured embedding model.

    The request is made while holding the shared semaphore. Any error is printed
    rather than raised.

    Args:
        text: The text to embed.

    Returns:
        The embedding vector, or `None` if the request failed.
    """
    embedding: Optional[list[float]] = None
    try:
        async with semaphore:
            result = await openai_client.embeddings.create(
                model=EMBEDDING_MODEL, input=text
            )
        embedding = result.data[0].embedding
    except Exception as error:
        print(f"Error getting embedding: {error}")
    return embedding

In [None]:
tasks = [get_embedding(root_cause) for root_cause in root_causes_concat]

embeddings = await tqdm_asyncio.gather(*tasks, desc="Embedding inputs")
embeddings = np.array(embeddings)

**Step 2**: Find failure modes by clustering the root cause embeddings.

In [None]:
# Cluster with a Bayesian Gaussian mixture (Dirichlet-process weight prior) rather than
# KMeans; the seed is fixed so that repeated runs give the same clusters.
bgmm = BayesianGaussianMixture(
    random_state=42,
    weight_concentration_prior_type="dirichlet_process",
    covariance_type="full",
    n_components=N_CLUSTERS,
)

# Fit on the embeddings, then assign each embedding to a component.
labels = bgmm.fit(embeddings).predict(embeddings)

# Store each sample's root cause text and its cluster next to the sample.
df["root_cause"] = root_causes_concat
df["cluster"] = labels

In [None]:
# Make sure the embeddings are a NumPy array before projecting them.
embeddings = np.asarray(embeddings)

# Project the embeddings onto their first two principal components.
pca = PCA(n_components=2)
vis_dims2 = pca.fit_transform(embeddings)

# One row per sample: its 2-D coordinates and the cluster it was assigned to.
pca_df = pd.DataFrame(
    {
        "PC1": vis_dims2[:, 0],
        "PC2": vis_dims2[:, 1],
        "failure_mode": df["cluster"],
    }
)

# One row per cluster: the mean position of its samples, plus a readable label.
cluster_means = pca_df.groupby("failure_mode")[["PC1", "PC2"]].mean()
centroids_df = cluster_means.reset_index()
centroids_df["label"] = [
    f"Cluster {cluster_id}" for cluster_id in centroids_df["failure_mode"]
]

# Semi-transparent circles for the individual samples, colored by cluster.
points_chart = alt.Chart(pca_df).mark_circle(opacity=0.3, size=60).encode(
    x=alt.X("PC1", title="Principal Component 1"),
    y=alt.Y("PC2", title="Principal Component 2"),
    color=alt.Color("failure_mode:N", title="Failure Mode"),
    tooltip=["failure_mode"],
)

# Filled crosses for the cluster centroids.
centroids_chart = alt.Chart(centroids_df).mark_point(
    filled=True, size=100, shape="cross"
).encode(
    x="PC1",
    y="PC2",
    color=alt.Color("failure_mode:N"),
    tooltip=["label"],
)

# Layer both charts; the titled chart is the cell output.
alt.layer(points_chart, centroids_chart).properties(
    title="Failure Modes visualized using Principal Component Analysis (PCA)"
)

**Step 3**: Summarize the failure modes in natural language.

In [None]:
summaries = []
for i in range(N_CLUSTERS):
    print(f"Cluster {i}:", end=" ")

    # Sample for summarization
    root_causes_sample = df[df.cluster == i].root_cause.to_list()
    # break

    gateway_input = {
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "arguments": {
                            "root_causes": root_causes_sample,
                            "system_template": system_template,
                        },
                    }
                ],
            }
        ]
    }
    response = await gateway.inference(
        input=gateway_input,
        function_name="summarize_failure_modes",
        variant_name=ANALYSIS_VARIANT_NAME,
    )
    summary = response.output.parsed["summary"]

    # Sample for representative examples (different random seed to avoid duplication)
    examples = df[df.cluster == i]["rendered_input"].to_list()
    examples = [example["messages"][1:] for example in examples]
    summaries.append(summary)
    pprint(f"\nSummary: {summary}")

    # Show example root causes
    print("\nRepresentative examples:")
    for ex in examples[:10]:
        print(f" - {ex}")
    print("-" * 100)

You're all set!

We encourage you to experiment with other parameters (e.g. `N_CLUSTERS`, `EMBEDDING_MODEL`, or the clustering algorithm).

We use OpenAI o4-mini for the root cause and failure mode analysis.
You can try using other models by adding variants to `config/tensorzero.toml` and updating `ANALYSIS_VARIANT_NAME`.