In [1]:
# https://colab.research.google.com/github/Arize-ai/phoenix/blob/main/tutorials/llm_application_tracing_evaluating_and_analysis.ipynb

Tracing, Evaluation and Analysis:

- Build, observe and analyze and llm powered application.
- LLM driven chat with Docs that will answer questions.

Key Concepts:
- LLM traces are a category of telemetry data that is used to understand the execution of LLMs and the associated context (such as retrieving, use of internal tolls etc.).
- Traces are made up of a sequence of spans (a unit of work or operation).


In [1]:
import phoenix as px
from llama_index.core import (ServiceContext, StorageContext, load_index_from_storage)

from llama_index.core.graph_stores import SimpleGraphStore
from phoenix.trace import DocumentEvaluations, SpanEvaluations
from tqdm import tqdm


In [2]:
# Used the embeddings fro HuggingFace
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
# Load the model
from llama_index.llms.ollama import Ollama

llm = Ollama(model="phi3", request_timeout=120.0)
# Configure the settings to so that the desired llm and embedding_model is used.
from llama_index.core import Settings

Settings.llm = llm
Settings.embed_model = embed_model

In [3]:
px.launch_app()

🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


<phoenix.session.session.ThreadSession at 0x1c713abc790>

In [11]:
# file_system = GCSFileSystem(project="public-assets-275721")
# index_path = "arize-phoenix-assets/datasets/unstructured/llm/llama-index/arize-docs/index/"
# storage_context = StorageContext.from_defaults(
#     fs=file_system,
#     persist_dir=index_path,
#     graph_store=SimpleGraphStore(),  # prevents unauthorized request to GCS
# )
# service_context = ServiceContext.from_defaults(
#     llm=llm,
#     embed_model=embed_model,
# )
# index = load_index_from_storage(
#     storage_context,
#     service_context=service_context,
# )
# query_engine = index.as_query_engine()

  service_context = ServiceContext.from_defaults(


In [4]:

storage_context = StorageContext.from_defaults(
    persist_dir="./storage",
    graph_store=SimpleGraphStore(),  # prevents unauthorized request to GCS
)
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
)
index = load_index_from_storage(
    storage_context,
    service_context=service_context,
)
query_engine = index.as_query_engine()

  service_context = ServiceContext.from_defaults(


In [5]:
from openinference.instrumentation.llama_index import LlamaIndexInstrumentor
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor

endpoint = "http://127.0.0.1:6006/v1/traces"
tracer_provider = TracerProvider()
tracer_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter(endpoint)))

LlamaIndexInstrumentor().instrument(tracer_provider=tracer_provider)

In [6]:
# Asking the Application questions about the Arize product
queries = [
    "How can I query for a monitor's status using GraphQL?",
    "How do I delete a model?",
    "How much does an enterprise license of Arize cost?",
    "How do I log a prediction using the python SDK?",
]

for query in tqdm(queries):
    response = query_engine.query(query)
    print(f"Query: {query}")
    print(f"Response: {response}")

 25%|██▌       | 1/4 [00:34<01:44, 34.78s/it]

Query: How can I query for a monitor's status using GraphQL?
Response:  To retrieve monitoring data such as CPU usage or memory utilization with high performance and low latency from Prometheus via Grafana UI on AWS EC2 instance running Ubuntu server (version 16.04), you would typically create custom queries in the Query Builder tab within each panel of your dashboard designed to display this monitoring data, using time range filters for real-time or historical analysis:

```plaintext
# Custom Prometheus query example with label_values filter and aggregate functions (minimum) 
label_values(up{job=~"your_monitoring_service",namespace="default"} as monitored, up{} as _) group by [__name__:concat(__labels__," ",metric)] order by -timestamp min over() as lastValueInWindow asc limit 10 offset 0
```
To execute this query in a web browser using the Grafana Query Editor or through an API call to your Prometheus instance, follow these general steps. Please note that since I cannot interact with

 50%|█████     | 2/4 [00:41<00:36, 18.04s/it]

Query: How do I delete a model?
Response:  The documentation does not provide specific instructions on how to delete a model. It mainly focuses on initializing models using OpenAIModel with different configurations for various AI evaluation purposes within an Eval harness framework and doesn't mention any deletion process or command syntax related to deleting the instantiated models like GPT-4, GPT-3.5 Turbo etc., from the given context information in your query.


 75%|███████▌  | 3/4 [00:48<00:13, 13.06s/it]

Query: How much does an enterprise license of Arize cost?
Response:  I'm sorry, but as per my guidelines to avoid referring directly from provided contexts or using specific phrases that suggest looking up external resources for pricing information, it is not possible for me to provide the current price point for Arize enterprise licenses. For accurate and detailed subscription options including prices, please visit their official website where they offer comprehensive product details aligned with your requirements at affordable rates suited for large-scale deployments in a production setting.

In addition, joining the Phoenix Slack community can provide access to additional resources or answers from fellow users that might help guide you towards understanding more about Arize's pricing structure and services offered within different subscription plans available on their official site.


100%|██████████| 4/4 [01:00<00:00, 15.08s/it]

Query: How do I log a prediction using the python SDK?
Response:  To log predictions with Python SDK for Arize (or similar systems), you would typically collect your data points in an organized manner during or after making API requests to retrieve responses from LLMs such as OpenAI's GPT-3.5 Turbo model, and then structure these logs appropriately using a suitable library like pandas if needed, which is commonly used for handling tabular data efficiently.

Here are the steps you would follow:
1. After obtaining predictions through API requests to LLM models in Python SDKs (for Arize or similar systems), collect all relevant information into structured logs that include details such as request IDs and prediction results, possibly including timestamps for chronological tracking.
2. Optionally format these collected data points using the pandas library if needed by converting them into a DataFrame structure to facilitate easier analysis later on. This could involve specifying columns lik




In [7]:
# Convert traces into workable datasets

spans_df = px.Client().get_spans_dataframe()
spans_df[["name", "span_kind", "attributes.input.value", "attributes.retrieval.documents"]].head()

from phoenix.session.evaluation import get_qa_with_reference, get_retrieved_documents

retrieved_documents_df = get_retrieved_documents(px.Client())
queries_df = get_qa_with_reference(px.Client())

  df_attributes = pd.DataFrame.from_records(


In [9]:
# Generating the Hallucination & Q&A Eval

import nest_asyncio
from phoenix.evals import (
    HALLUCINATION_PROMPT_RAILS_MAP,
    HALLUCINATION_PROMPT_TEMPLATE,
    QA_PROMPT_RAILS_MAP,
    QA_PROMPT_TEMPLATE,
    OpenAIModel,
    llm_classify,
)

nest_asyncio.apply()  # Speeds up OpenAI API calls

# Creating Hallucination Eval which checks if the application hallucinated
hallucination_eval = llm_classify(
    dataframe=queries_df,
    model=OpenAIModel(model='phi3', base_url='http://localhost:11434/v1', api_key='ollama'),
    template=HALLUCINATION_PROMPT_TEMPLATE,
    rails=list(HALLUCINATION_PROMPT_RAILS_MAP.values()),
    provide_explanation=True,  # Makes the LLM explain its reasoning
    concurrency=4,
)
hallucination_eval["score"] = (
    hallucination_eval.label[~hallucination_eval.label.isna()] == "factual"
).astype(int)

# Creating Q&A Eval which checks if the application answered the question correctly
qa_correctness_eval = llm_classify(
    dataframe=queries_df,
    model=OpenAIModel(model='phi3', base_url='http://localhost:11434/v1', api_key='ollama'),
    template=QA_PROMPT_TEMPLATE,
    rails=list(QA_PROMPT_RAILS_MAP.values()),
    provide_explanation=True,  # Makes the LLM explain its reasoning
    concurrency=4,
)

qa_correctness_eval["score"] = (
    hallucination_eval.label[~qa_correctness_eval.label.isna()] == "correct"
).astype(int)

# Logs the Evaluations to Phoenix
px.Client().log_evaluations(
    SpanEvaluations(eval_name="Hallucination", dataframe=hallucination_eval),
    SpanEvaluations(eval_name="QA Correctness", dataframe=qa_correctness_eval),
)

llm_classify |          | 0/4 (0.0%) | ⏳ 00:00<? | ?it/s

llm_classify |          | 0/4 (0.0%) | ⏳ 00:00<? | ?it/s

In [10]:
hallucination_eval.head(2)

Unnamed: 0_level_0,label,explanation,exceptions,execution_status,execution_seconds,score
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
31ed42a4b0f220b6,NOT_PARSABLE,,[],COMPLETED,28.666406,0
c41ce31f946d80c5,NOT_PARSABLE,,[],COMPLETED,27.968769,0


In [11]:
qa_correctness_eval.head(2)

Unnamed: 0_level_0,label,explanation,exceptions,execution_status,execution_seconds,score
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
31ed42a4b0f220b6,NOT_PARSABLE,,[],COMPLETED,20.665933,0
c41ce31f946d80c5,incorrect,,[],COMPLETED,24.103995,0


In [13]:
# Generating Retrieval Relevance Eval

from phoenix.evals import (
    RAG_RELEVANCY_PROMPT_RAILS_MAP,
    RAG_RELEVANCY_PROMPT_TEMPLATE,
    OpenAIModel,
    llm_classify,
)

retrieved_documents_eval = llm_classify(
    dataframe=retrieved_documents_df,
    model=OpenAIModel(model='phi3', base_url='http://localhost:11434/v1', api_key='ollama'),
    template=RAG_RELEVANCY_PROMPT_TEMPLATE,
    rails=list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values()),
    provide_explanation=True,
)

retrieved_documents_eval["score"] = (
    retrieved_documents_eval.label[~retrieved_documents_eval.label.isna()] == "relevant"
).astype(int)

px.Client().log_evaluations(
    DocumentEvaluations(eval_name="Relevance", dataframe=retrieved_documents_eval)
)

llm_classify |          | 0/8 (0.0%) | ⏳ 00:00<? | ?it/s

In [14]:
retrieved_documents_eval.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,label,explanation,exceptions,execution_status,execution_seconds,score
context.span_id,document_position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12449c60da03ceea,0,unrelated,,[],COMPLETED,29.741873,0
12449c60da03ceea,1,relevant,,[],COMPLETED,22.438882,1
