# 📊 Client-Side Custom Metrics with TruLens

This notebook demonstrates how to create and use client-side custom metrics with the TruLens SDK, and how to export the computed metrics (eval spans) to Snowflake using batch evaluation runs. Client-side custom metrics let you define your own evaluation functions that run locally on the client instead of on the server (Snowflake).

## Key Features

- **Custom Metric**: Implement a client-side custom metric with arbitrary Python code
- **MetricConfig**: Explicit configuration for mapping metric parameters to span attributes
- **Flexible Selectors**: Map metric parameters to span attributes using selectors
- **Client-Side Computation**: Metrics are computed locally and results uploaded as OTel spans

In [None]:
import os

from dotenv import load_dotenv
from trulens.apps.app import TruApp
from trulens.core.feedback.custom_metric import MetricConfig
from trulens.core.feedback.selector import Selector
from trulens.core.otel.instrument import instrument
from trulens.core.run import RunConfig
from trulens.otel.semconv.trace import SpanAttributes

load_dotenv()


class Text2SQLApp:
    """Toy text-to-SQL app that answers with a canned SQL statement."""

    @instrument(
        span_type=SpanAttributes.SpanType.RECORD_ROOT,
        attributes={
            SpanAttributes.RECORD_ROOT.INPUT: "query",
            SpanAttributes.RECORD_ROOT.OUTPUT: "return",
        },
    )
    def generate_sql(self, query: str) -> str:
        """Return a canned SQL statement chosen by keyword match.

        The match is case-insensitive and "users" takes precedence over
        "orders". When neither keyword occurs, "SELECT 1" is returned.

        Args:
            query: The natural language query.

        Returns:
            The SQL statement as a string.
        """
        lowered = query.lower()
        # Ordered (keyword, statement) pairs: the first match wins.
        canned_statements = (
            ("users", "SELECT * FROM users"),
            ("orders", "SELECT * FROM orders"),
        )
        for keyword, statement in canned_statements:
            if keyword in lowered:
                return statement
        return "SELECT 1"

### Fill out Snowflake account credentials

In [None]:
# Placeholder values: replace each "..." with your real Snowflake setting.
# NOTE: these assignments unconditionally overwrite whatever is already in
# os.environ for these keys, including anything set by the earlier
# load_dotenv() call.
os.environ["SNOWFLAKE_ACCOUNT"] = "..."
os.environ["SNOWFLAKE_USER"] = "..."
os.environ["SNOWFLAKE_USER_PASSWORD"] = "..."
os.environ["SNOWFLAKE_DATABASE"] = "..."
os.environ["SNOWFLAKE_SCHEMA"] = "..."
os.environ["SNOWFLAKE_WAREHOUSE"] = "..."
os.environ["SNOWFLAKE_ROLE"] = "..."

## Define custom metrics with plain Python functions

In [None]:
def text2sql_quality(query: str, sql: str) -> float:
    """
    Score a generated SQL statement with a simple keyword heuristic.

    Args:
        query: The natural language query
        sql: The generated SQL query

    Returns:
        0.3 when the SQL has no "SELECT" (case-insensitive); otherwise 0.9
        if the query is longer than 10 characters, else 0.7.
    """
    # Guard clause: anything without a SELECT gets the lowest score.
    if "SELECT" not in sql.upper():
        return 0.3
    return 0.9 if len(query) > 10 else 0.7


def custom_accuracy(query: str) -> float:
    """
    Custom accuracy metric based on query length.

    The score grows linearly with the query length and saturates at 1.0
    once the query reaches 100 characters.

    Args:
        query: The input query

    Returns:
        Accuracy score between 0 and 1
    """
    # `min` caps the score at 1.0. The previous `max` made every score
    # at least 1.0, which contradicted the documented 0-1 range.
    return min(len(query) / 100.0, 1.0)

## Create MetricConfig Objects with selector

Evaluation configurations map OTel span attributes to metric function parameters. This tells each custom metric which OTel spans to query from the Snowflake event table, which is where the spans emitted by the app are uploaded and ingested.


Notice we define 2 configs using the same metric implementation `text2sql_quality`, but with different names so that it can be used in multiple places in the app flow.

In [None]:
# Client-side config for `text2sql_quality`. The keys of `selectors` are the
# metric function's parameter names; each Selector names the span type and
# span attribute whose value is passed for that parameter.
text2sql_config = MetricConfig(
    metric_name="text2sql_evaluation_v1",  # Unique semantic identifier
    metric_implementation=text2sql_quality,
    metric_type="text2sql",  # Implementation identifier
    computation_type="client",
    description="Evaluates text-to-SQL conversion quality",
    selectors={
        "query": Selector(  # Parameter name in the function
            span_type=SpanAttributes.SpanType.RECORD_ROOT,
            span_attribute=SpanAttributes.RECORD_ROOT.INPUT,
        ),
        "sql": Selector(  # Parameter name in the function
            span_type=SpanAttributes.SpanType.RECORD_ROOT,
            span_attribute=SpanAttributes.RECORD_ROOT.OUTPUT,
        ),
    },
)

# Client-side config for `custom_accuracy`, which takes only `query`, so a
# single selector (the record-root input) is enough.
accuracy_config = MetricConfig(
    metric_name="query_length_accuracy_v1",  # Unique semantic identifier
    metric_implementation=custom_accuracy,
    metric_type="accuracy",  # Implementation identifier
    computation_type="client",
    description="Custom accuracy based on query length",
    selectors={
        "query": Selector(
            span_type=SpanAttributes.SpanType.RECORD_ROOT,
            span_attribute=SpanAttributes.RECORD_ROOT.INPUT,
        ),
    },
)


# Example: Using the same implementation with different configurations
# NOTE(review): apart from `metric_name` and `description`, this config is
# identical to `text2sql_config` (same implementation, same selectors). The
# description mentions "different thresholds", but nothing here changes any
# threshold -- confirm whether the description or the config should change.
text2sql_config_v2 = MetricConfig(
    metric_name="text2sql_evaluation_strict",  # Different semantic identifier
    metric_implementation=text2sql_quality,  # Same implementation
    metric_type="text2sql",  # Same implementation identifier
    computation_type="client",
    description="Strict text-to-SQL evaluation with different thresholds",
    selectors={
        "query": Selector(
            span_type=SpanAttributes.SpanType.RECORD_ROOT,
            span_attribute=SpanAttributes.RECORD_ROOT.INPUT,
        ),
        "sql": Selector(
            span_type=SpanAttributes.SpanType.RECORD_ROOT,
            span_attribute=SpanAttributes.RECORD_ROOT.OUTPUT,
        ),
    },
)

In [None]:
# Create snowpark session.
# `os` is re-imported here so this cell can run on its own.
import os

from snowflake.snowpark import Session
from trulens.connectors.snowflake import SnowflakeConnector

# Connection settings are read from the SNOWFLAKE_* environment variables.
# Indexing os.environ directly raises KeyError if any of them is missing.
snowflake_connection_parameters = {
    "account": os.environ["SNOWFLAKE_ACCOUNT"],
    "user": os.environ["SNOWFLAKE_USER"],
    "password": os.environ["SNOWFLAKE_USER_PASSWORD"],
    "database": os.environ["SNOWFLAKE_DATABASE"],
    "schema": os.environ["SNOWFLAKE_SCHEMA"],
    "role": os.environ["SNOWFLAKE_ROLE"],
    "warehouse": os.environ["SNOWFLAKE_WAREHOUSE"],
}
snowpark_session = Session.builder.configs(
    snowflake_connection_parameters
).create()

# TruSession is no longer required as long as snowflake connector exists
# The connector wraps the Snowpark session created above.
sf_connector = SnowflakeConnector(snowpark_session=snowpark_session)

In [None]:
# Create TruLens instrumented app from custom app.
import uuid

# NOTE(review): APP_NAME and APP_VERSION are computed here but never passed
# to TruApp below, which uses the hard-coded "Text2SQLApp" / "v1" instead.
# Confirm which pair is intended. Also, os.getlogin() may raise OSError in
# environments without a controlling terminal -- verify for your runtime.
APP_NAME = f"{os.getlogin()} custom metrics client-side flow {uuid.uuid4()}"
APP_VERSION = "V1"

# Wrap the plain app object; `main_method` is the instrumented entry point,
# and the Snowflake connector is passed as the connector for this app.
app = Text2SQLApp()
tru_app = TruApp(
    app,
    app_name="Text2SQLApp",
    app_version="v1",
    main_method=app.generate_sql,
    connector=sf_connector,
)

# Metrics for the run: plain strings and MetricConfig objects can be mixed
# in the same list.
metrics_to_compute = [
    # Server-side metrics (strings)
    "answer_relevance",
    # Client-side metrics (MetricConfig objects)
    text2sql_config,
    accuracy_config,
    text2sql_config_v2,  # Same implementation, different configuration
]

In [None]:
import pandas as pd

# Two sample inputs in a single "query" column. The run configuration's
# dataset_spec refers to this column name.
test_data_entries = {"query": ["select * from users", "select * from orders"]}


# One row per query; this dataframe is the input passed to the run.
user_input_data_df = pd.DataFrame(test_data_entries)

In [None]:
import uuid

from trulens.core.run import Run

# A fresh UUID suffix gives every execution of this cell a distinct run name.
run_name = f"test_run_for_custom_metrics_{uuid.uuid4()}"

# The input comes from an in-memory dataframe (source_type="DATAFRAME").
# dataset_spec maps the "RECORD_ROOT.INPUT" field to the "query" column --
# presumably the dataframe column fed to the app's main method; confirm
# against the RunConfig documentation.
run_config = RunConfig(
    run_name=run_name,
    dataset_name="dummy_test_rag_set",
    source_type="DATAFRAME",
    dataset_spec={"RECORD_ROOT.INPUT": "query"},
)

# Register the run on the instrumented app and keep the handle for later.
run: Run = tru_app.add_run(run_config=run_config)

In [None]:
# Start the run with the test dataframe as input.
# NOTE(review): presumably this invokes the app once per dataframe row --
# confirm against the TruLens Run documentation.
run.start(input_df=user_input_data_df)

### Compute the out-of-the-box metric (answer_relevance) and the 3 client-side custom metrics using the Snowflake batch evaluation flow

In [None]:
import time

# Upper bound on how long to wait for the invocation phase to finish.
# Without it the loop below would spin forever if the run never reaches
# INVOCATION_COMPLETED (for example because it failed). Adjust as needed.
MAX_INVOCATION_WAIT_SECONDS = 900

invocation_deadline = time.monotonic() + MAX_INVOCATION_WAIT_SECONDS

# Poll every 3 seconds until the run reports that invocation is complete.
while run.get_status() != "INVOCATION_COMPLETED":
    if time.monotonic() >= invocation_deadline:
        raise TimeoutError(
            "Run did not reach INVOCATION_COMPLETED within "
            f"{MAX_INVOCATION_WAIT_SECONDS}s "
            f"(last status: {run.get_status()})."
        )
    time.sleep(3)

# Invocation has finished; request computation of all configured metrics.
run.compute_metrics(metrics_to_compute)

In [None]:
# Show the run's current status (displayed as the cell output in a notebook).
run.get_status()