In [50]:
import os
import pandas as pd
import numpy as np
import nest_asyncio
from langchain.chains import RetrievalQA
from langchain.retrievers import KNNRetriever
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.chat_models import ChatOllama
from urllib.request import urlopen
import json
# from langchain_openai import ChatOpenAI, OpenAIEMbeddings
from phoenix.evals import (
    HallucinationEvaluator,
    OpenAIModel,
    QAEvaluator,
    RelevanceEvaluator,
    run_evals, 
)
from langchain_huggingface import HuggingFaceEmbeddings
from openai import OpenAI
import phoenix as px
from phoenix.session.evaluation import get_qa_with_reference, get_retrieved_documents
from phoenix.trace import DocumentEvaluations, SpanEvaluations
from phoenix.trace.langchain import LangChainInstrumentor
from tqdm import tqdm, tqdm_notebook

nest_asyncio.apply()


In [51]:
session = px.launch_app()
     

🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


In [5]:
llm = ChatOllama(model='phi3', temperature=0.0)
embedding_model = HuggingFaceEmbeddings(
    model_name='mixedbread-ai/mxbai-embed-large-v1',
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': False}
)


In [9]:
df = pd.read_parquet(
    "http://storage.googleapis.com/arize-phoenix-assets/datasets/"
    "unstructured/llm/context-retrieval/langchain/database.parquet"
)
print(df.shape)
df.head(3)

(1086, 2)


Unnamed: 0,text,text_vector
0,\nAccess tutorials of what's possible with Ari...,"[-0.0036313518733896957, 0.0026182831234325553..."
1,"\n{% hint style=""info"" %}\nYour model type det...","[-0.006279695251247202, -0.0005314257337369415..."
2,\nExamples for logging explainability metrics....,"[-0.007306394403318994, 0.0007199545863166625,..."


In [26]:
# df['text_embedding_bge'] = df.text.apply(lambda x: embedding_model.embed_query(x))
embedding_model.embed_query(df.iloc[2]['text'])

[0.5222705006599426,
 0.3841533660888672,
 -0.15000851452350616,
 0.2376929521560669,
 -0.43989720940589905,
 0.006094262003898621,
 -0.8401291966438293,
 -0.20802666246891022,
 -0.12276878952980042,
 0.6840602159500122,
 -0.20088237524032593,
 0.7620967626571655,
 -0.014210522174835205,
 -0.4881494641304016,
 0.04106529802083969,
 0.5747261047363281,
 -0.38870906829833984,
 -0.3908386528491974,
 -0.9535424709320068,
 -0.340037077665329,
 0.6715689301490784,
 0.5610557794570923,
 -0.9577788710594177,
 -0.40011611580848694,
 -0.45081111788749695,
 0.902858316898346,
 0.31196388602256775,
 -0.12017923593521118,
 0.6604458093643188,
 0.3868192732334137,
 -0.15026453137397766,
 0.1389276087284088,
 -0.11175327003002167,
 -1.4587552547454834,
 -0.08540473133325577,
 -0.3283596932888031,
 0.6989537477493286,
 -0.18379484117031097,
 -0.334386944770813,
 -0.8390719890594482,
 -0.34405890107154846,
 0.266710102558136,
 0.8795782327651978,
 -1.0262759923934937,
 -0.8161568641662598,
 -0.29941201

In [33]:
tqdm.pandas()
df['text_vector_bge'] = df.progress_apply(lambda x: embedding_model.embed_query(x['text']), axis=1)

100%|██████████| 1086/1086 [11:05<00:00,  1.63it/s]


In [36]:
df.to_feather("./data/arize_phoenix_dataset_bge.feather")

In [39]:
knn_retriever = KNNRetriever(
    index=np.stack(df['text_vector_bge']),
    texts = df['text'].tolist(),
    embeddings=embedding_model
)

In [42]:
chain_type = "stuff"

chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type=chain_type,
    retriever=knn_retriever,
    metadata={'application_type': 'question_answering'}
)


In [52]:
LangChainInstrumentor().instrument()

WARNI [opentelemetry.instrumentation.instrumentor] Attempting to instrument while already instrumented


In [48]:
url = "http://storage.googleapis.com/arize-phoenix-assets/datasets/unstructured/llm/context-retrieval/arize_docs_queries.jsonl"
queries = []
with urlopen(url) as response:
    for line in response:
        line = line.decode("utf-8").strip()
        data = json.loads(line)
        queries.append(data["query"])
queries[:10]

['How do I use the SDK to upload a ranking model?',
 'What drift metrics are supported in Arize?',
 'Does Arize support batch models?',
 'Does Arize support training data?',
 'How do I configure a threshold if my data has seasonality trends?',
 'How are clusters in the UMAP calculated? When are the clusters refreshed?',
 'How does Arize calculate AUC?',
 'Can I send truth labels to Arize separtely? ',
 'How do I send embeddings to Arize?',
 'Can I copy a dashboard']

In [53]:
for query in tqdm(queries[:10]):
    chain.invoke(query)

100%|██████████| 10/10 [02:18<00:00, 13.83s/it]


# Export and Evaluate the trace data


In [54]:
queries_df = get_qa_with_reference(px.Client())
retrieved_documents_df = get_retrieved_documents(px.Client())

In [55]:
queries_df.head()

Unnamed: 0_level_0,input,output,reference
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
47bd3278edb320e3,How do I use the SDK to upload a ranking model?,To upload your ranking model using the ARIZE ...,\n```python\nfrom arize.utils.types import Env...
ae5456d885ec0720,What drift metrics are supported in Arize?,"In Arize, users have access to a variety of d...",\nArize calculates drift metrics such as Popul...
4c72e90563eb7d27,Does Arize support batch models?,"Yes, Arize supports both single record and ba...",\nArize helps you visualize your model perform...
56624d801f1ffe88,Does Arize support training data?,"No, the context provided does not explicitly ...","\nArize integrates with your ML stack, no matt..."
fc61f64b9fb4c6a0,How do I configure a threshold if my data has ...,To accommodate for seasonality in your time s...,"\nWith auto thresholds turned off, set the thr..."


In [56]:
retrieved_documents_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,context.trace_id,input,reference
context.span_id,document_position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d034eb9eb2a821a6,0,1d03007b975be7de51bae7b5a3190eae,How do I use the SDK to upload a ranking model?,\n```python\nfrom arize.utils.types import Env...
d034eb9eb2a821a6,1,1d03007b975be7de51bae7b5a3190eae,How do I use the SDK to upload a ranking model?,"response = arize_client.log(\n model_id=""de..."
d034eb9eb2a821a6,2,1d03007b975be7de51bae7b5a3190eae,How do I use the SDK to upload a ranking model?,"response = arize_client.log(\n model_id=""de..."
d034eb9eb2a821a6,3,1d03007b975be7de51bae7b5a3190eae,How do I use the SDK to upload a ranking model?,"response = arize_client.log(\n model_id=""de..."
493f7d6338b0c42e,0,9f95ac74603cb6b0eca563ed92bd45d4,What drift metrics are supported in Arize?,\nArize calculates drift metrics such as Popul...


In [57]:
eval_model = OpenAIModel(model='phi3', base_url='http://localhost:11434/v1', api_key='ollama')
hallucination_evaluator = HallucinationEvaluator(eval_model)
qa_correctness_evaluator = QAEvaluator(eval_model)
relevance_evaluator = RelevanceEvaluator(eval_model)

hallucination_eval_df, qa_correctness_eval_df = run_evals(
    dataframe=queries_df,
    evaluators=[hallucination_evaluator, qa_correctness_evaluator],
    provide_explanation=True,
)
relevance_eval_df = run_evals(
    dataframe=retrieved_documents_df,
    evaluators=[relevance_evaluator],
    provide_explanation=True,
)[0]

px.Client().log_evaluations(
    SpanEvaluations(eval_name="Hallucination", dataframe=hallucination_eval_df),
    SpanEvaluations(eval_name="QA Correctness", dataframe=qa_correctness_eval_df),
    DocumentEvaluations(eval_name="Relevance", dataframe=relevance_eval_df),
)

run_evals |          | 0/20 (0.0%) | ⏳ 00:00<? | ?it/s

Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing
Worker timeo

run_evals |          | 0/40 (0.0%) | ⏳ 00:00<? | ?it/s