In [1]:
# https://colab.research.google.com/github/Arize-ai/phoenix/blob/main/tutorials/llm_application_tracing_evaluating_and_analysis.ipynb

Tracing, Evaluation and Analysis:

- Build, observe and analyze and llm powered application.
- LLM driven chat with Docs that will answer questions.

Key Concepts:
- LLM traces are a category of telemetry data that is used to understand the execution of LLMs and the associated context (such as retrieving, use of internal tolls etc.).
- Traces are made up of a sequence of spans (a unit of work or operation).


In [1]:
import phoenix as px
from llama_index.core import (ServiceContext, StorageContext, load_index_from_storage)

from llama_index.core.graph_stores import SimpleGraphStore
from phoenix.trace import DocumentEvaluations, SpanEvaluations
from tqdm import tqdm


In [2]:
# Used the embeddings fro HuggingFace
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
# Load the model
from llama_index.llms.ollama import Ollama

llm = Ollama(model="phi3", request_timeout=120.0)
# Configure the settings to so that the desired llm and embedding_model is used.
from llama_index.core import Settings

Settings.llm = llm
Settings.embed_model = embed_model

In [3]:
px.launch_app()

🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


<phoenix.session.session.ThreadSession at 0x1e2163a6550>

In [11]:
# file_system = GCSFileSystem(project="public-assets-275721")
# index_path = "arize-phoenix-assets/datasets/unstructured/llm/llama-index/arize-docs/index/"
# storage_context = StorageContext.from_defaults(
#     fs=file_system,
#     persist_dir=index_path,
#     graph_store=SimpleGraphStore(),  # prevents unauthorized request to GCS
# )
# service_context = ServiceContext.from_defaults(
#     llm=llm,
#     embed_model=embed_model,
# )
# index = load_index_from_storage(
#     storage_context,
#     service_context=service_context,
# )
# query_engine = index.as_query_engine()

  service_context = ServiceContext.from_defaults(


In [5]:

storage_context = StorageContext.from_defaults(
    persist_dir="./storage",
    graph_store=SimpleGraphStore(),  # prevents unauthorized request to GCS
)
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
)
index = load_index_from_storage(
    storage_context,
    service_context=service_context,
)
query_engine = index.as_query_engine()

  service_context = ServiceContext.from_defaults(


In [6]:
from openinference.instrumentation.llama_index import LlamaIndexInstrumentor
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor

endpoint = "http://127.0.0.1:6006/v1/traces"
tracer_provider = TracerProvider()
tracer_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter(endpoint)))

LlamaIndexInstrumentor().instrument(tracer_provider=tracer_provider)

In [7]:
# Asking the Application questions about the Arize product
queries = [
    "How can I query for a monitor's status using GraphQL?",
    "How do I delete a model?",
    "How much does an enterprise license of Arize cost?",
    "How do I log a prediction using the python SDK?",
]

for query in tqdm(queries):
    response = query_engine.query(query)
    print(f"Query: {query}")
    print(f"Response: {response}")

 25%|██▌       | 1/4 [00:14<00:44, 14.84s/it]

Query: How can I query for a monitor's status using GraphQL?
Response:  To find out how to query for a specific piece of hardware equipment such as a computer or network monitoring device named "monitor" in your system via its unique identifier (UID) and return details like the UID, hostname, CPU usage, memory information along with their latest values using GraphQL API.

Firstly ensure you have an endpoint that exposes these details which could be something similar to this: 
```graphql
query GetHardwareStatus($uid: ID!) {
  hardware(id: $uid) {
    uid
    hostname
    cpuUsage
    memoryInfo
  }
}
```
Here is an example of how you could implement it in JavaScript using fetch API, assuming that the GraphQL endpoint URL and query are correct according to your setup. This request sends a variable 'uid' representing unique identifiers for hardware devices which helps return specific details:

```javascript
const uid = "your_unique_identifier"; // Replace with actual UID of desired device

 50%|█████     | 2/4 [00:20<00:19,  9.50s/it]

Query: How do I delete a model?
Response:  Incorrect. The provided text does not contain instructions on how to delete a model. It discusses instantiating models using OpenAIModel in Python for evaluation purposes. To remove or deactivate a pre-tested evals setup, you would typically look at the specific system's documentation regarding its functionality management rather than inferring from this context which does not address such an action directly.


 75%|███████▌  | 3/4 [00:26<00:07,  7.71s/it]

Query: How much does an enterprise license of Arize cost?
Response:  I'm sorry, but based on the provided document content alone, it doesn't contain specific details about pricing for different types of Arize licenses such as individual or enterprise ones. It encourages users to sign up and visit their documentation page or contact them directly via email at support@arize.com if they have questions regarding costs, features, or deployment guides related to Arize's observability platform.


100%|██████████| 4/4 [00:33<00:00,  8.43s/it]

Query: How do I log a prediction using the python SDK?
Response:  You can make an API call to get predictions for each example in your dataset by following these steps with pseudocode as we are avoiding direct references from provided text:

1. Initialize OpenAI's LLM model and instrument it if necessary (for logging purposes).
2. Define a function that constructs the prompt using user input, system_prompt, or any other context required for prediction generation by your task-specific logic in Python.
3. Create an API call within this function to send requests to OpenAI's LLM service with appropriate parameters and handle responses accordingly, capturing spans/traces if needed.
4. Iterate through each example from the dataset using a loop structure where you apply your prediction generation logic for individual examples (like `task` defined above).
5. Call this function within another that orchestrates running predictions over all of them while also invoking evaluation functions to anal




In [8]:
# Convert traces into workable datasets

spans_df = px.Client().get_spans_dataframe()
spans_df[["name", "span_kind", "attributes.input.value", "attributes.retrieval.documents"]].head()

from phoenix.session.evaluation import get_qa_with_reference, get_retrieved_documents

retrieved_documents_df = get_retrieved_documents(px.Client())
queries_df = get_qa_with_reference(px.Client())

  df_attributes = pd.DataFrame.from_records(


In [10]:
# Generating the Hallucination & Q&A Eval

import nest_asyncio
from phoenix.evals import (
    HALLUCINATION_PROMPT_RAILS_MAP,
    HALLUCINATION_PROMPT_TEMPLATE,
    QA_PROMPT_RAILS_MAP,
    QA_PROMPT_TEMPLATE,
    OpenAIModel,
    llm_classify,
)

nest_asyncio.apply()  # Speeds up OpenAI API calls

# Creating Hallucination Eval which checks if the application hallucinated
hallucination_eval = llm_classify(
    dataframe=queries_df,
    model=llm,
    template=HALLUCINATION_PROMPT_TEMPLATE,
    rails=list(HALLUCINATION_PROMPT_RAILS_MAP.values()),
    provide_explanation=True,  # Makes the LLM explain its reasoning
    concurrency=4,
)
hallucination_eval["score"] = (
    hallucination_eval.label[~hallucination_eval.label.isna()] == "factual"
).astype(int)

# Creating Q&A Eval which checks if the application answered the question correctly
qa_correctness_eval = llm_classify(
    dataframe=queries_df,
    model=llm,
    template=QA_PROMPT_TEMPLATE,
    rails=list(QA_PROMPT_RAILS_MAP.values()),
    provide_explanation=True,  # Makes the LLM explain its reasoning
    concurrency=4,
)

qa_correctness_eval["score"] = (
    hallucination_eval.label[~qa_correctness_eval.label.isna()] == "correct"
).astype(int)

# Logs the Evaluations to Phoenix
px.Client().log_evaluations(
    SpanEvaluations(eval_name="Hallucination", dataframe=hallucination_eval),
    SpanEvaluations(eval_name="QA Correctness", dataframe=qa_correctness_eval),
)

AttributeError: 'Ollama' object has no attribute 'reload_client'