In [145]:
import warnings
warnings.filterwarnings('ignore')


# Test Cases

In [102]:
test_cases = []

In [104]:
# 1. Need to retrieve many documents
texts = [
    "X = 5",
    "Y = 3",
    "Z = 2",
    "A = 12",
    "B = 50",
]
query = "What are the values of A, B, X, Y, Z?"
expected = "A is 12, B is 50, X is 5, Y is 3, Z is 2"
test_case = {
    "name": "Many documents 1",
    "texts": texts,
    "query": query,
    "expected": expected
}
test_cases.append(test_case)

In [105]:
obj_color_pairs = [
    ("dog", "green"), ("cat", "blue"), ("building", "grey"), ("car", "orange")
]
texts = ["The {obj} is {color}".format(obj=obj, color=color) for obj, color in obj_color_pairs]
query = "What colors are the {objects}?".format(objects =", ".join((obj for obj, _ in obj_color_pairs)))
expected = ", ".join(texts).capitalize()
test_case = {
    "name": "Many documents 2",
    "texts": texts,
    "query": query,
    "expected": expected
}
test_cases.append(test_case)

In [106]:
# 2. Redundant docs
texts = [
    "The color of the cat is blue",
    "Blue is the color of the cat",
    "The cat's color is blue",
    "The cat is blue",
    "I believe the cat was blue",
    "The cat was definitely blue",
    "The dog is green"
]
query = "What colors are the cat and the dog?"
expected = "The cat is blue and the dog is green"
test_case = {
    "name": "Redundant documents",
    "texts": texts,
    "query": query,
    "expected": expected
}
test_cases.append(test_case)

In [107]:
# 3. Split required information across two docs in a way that changes semantic meaning of each half
texts = [
    "The cat was fat and it's color",
    " was green. The cat liked whole milk.",
    "The dog was blue."
]
query = "What color was the cat?"
expected = "The cat is green"
test_case = {
    "name": "Split statement",
    "texts": texts,
    "query": query,
    "expected": expected
}
test_cases.append(test_case)

In [108]:
# 4. Metadata question (temporal), e.g. "What did I say write before I said X"
import datetime

now = datetime.datetime.now()
now_timestamp = now.strftime("%Y-%m-%d %H:%M:%S")
future = now + datetime.timedelta(seconds=5)
future_timestamp = future.strftime("%Y-%m-%d %H:%M:%S")

texts = [
    "The cat is green",
    "The dog is yellow",
]
metadatas = [{"timestamp": now_timestamp}, {"timestamp": future_timestamp}]
query = "What did I say right before I mentioned the dog?"
expected = "You said the cat is green."
test_case = {
    "name": "Metadata question (temporal information)",
    "texts": texts,
    "query": query,
    "expected": expected,
    "metadatas": metadatas
}
test_cases.append(test_case)

In [109]:
# 5. Store conflicting statements, retrieve both and state there's an inconsistency
texts = [
    "The cat is green",
    "The cat is blue",
    "The dog is yellow",
]
query = "What color is the cat?"
expected = "I don't know, there are conflicting statements about the color of the cat."
test_case = {
    "name": "Conflicting statements",
    "texts": texts,
    "query": query,
    "expected": expected
}
test_cases.append(test_case)

In [110]:
# 6. Add K facts to a single document and ask about only one of them
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader

loader = TextLoader('../docs/modules/state_of_the_union.txt')
documents = loader.load()
documents[0].page_content += " The color of the cat is purple."
query = "What color is the cat?"
expected = "The cat is purple"
test_case = {
    "name": "One fact in long document",
    "texts": [d.page_content for d in documents],
    "query": query,
    "expected": expected
}
test_cases.append(test_case)

# Evaluate single retrieval qa system

In [111]:
from langchain.chains import RetrievalQA
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

In [18]:
import os
os.environ["OPENAI_API_KEY"] = "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"

In [38]:
embeddings = OpenAIEmbeddings()
for i, (texts, query, expected) in enumerate(test_cases):
    docsearch = Chroma.from_texts(texts, embeddings)
    qa = RetrievalQA.from_llm(llm=OpenAI(), retriever=docsearch.as_retriever(search_kwargs={"k": min(4, len(texts))}))
    pred = qa.apply([query])[0]['result']
    print(f"Example {i}")
    print(f"{query=}")
    print(f"{expected=}")
    print(f"{pred=}")
    print("-" * 20)

Using embedded DuckDB without persistence: data will be transient
Using embedded DuckDB without persistence: data will be transient


Example 0
query='What are the values of A, B, X, Y, Z?'
expected='A is 12, B is 50, X is 5, Y is 3, Z is 2'
pred=' A = 12, X = 5, Y = 3, Z = 2. There is no value for B.'
--------------------


Using embedded DuckDB without persistence: data will be transient


Example 1
query='What colors are the dog, cat, building, car?'
expected='The dog is green, the cat is blue, the building is grey, the car is orange'
pred=' The dog is green, the cat is blue, the building is grey, and the car is orange.'
--------------------


Using embedded DuckDB without persistence: data will be transient


Example 2
query='What color was the cat?'
expected='The cat is green'
pred=' The cat was fat and its color was green.'
--------------------


Using embedded DuckDB without persistence: data will be transient


Example 3
query='What color is the cat?'
expected='There are conflicting statements about the color of the cat.'
pred=' The cat is green.'
--------------------


Using embedded DuckDB without persistence: data will be transient


Example 4
query='What color is the cat?'
expected='The cat is purple'
pred=" I don't know."
--------------------
Example 5
query='What colors are the cat and the dog?'
expected='The cat is blue and the dog is green'
pred=" I don't know."
--------------------


# Compare multiple retrieval systems

In [125]:
candidates = []

In [126]:
def get_retrieval_qa(documents):
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    documents = text_splitter.split_documents(documents)
    docsearch = Chroma.from_documents(documents, OpenAIEmbeddings())
    retriever = docsearch.as_retriever(search_kwargs={"k": min(4, len(texts))})
    return RetrievalQA.from_llm(llm=OpenAI(), retriever=retriever)
candidate = {
    "params": {"search": "similarity", "k": 4, "chunk_size": 1000},
    "getter": get_retrieval_qa,
}
candidates.append(candidate)

In [None]:
def get_retrieval_qa(documents):
    text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=0)
    documents = text_splitter.split_documents(documents)
    docsearch = Chroma.from_documents(documents, OpenAIEmbeddings())
    retriever = docsearch.as_retriever(search_type="mmr", search_kwargs={"k": min(6, len(texts)), "fetch_k": min(20, len(texts))})
    return RetrievalQA.from_llm(llm=OpenAI(), retriever=retriever)
candidate = {
    "params": {"search": "mmr", "k": 6, "chunk_size": 200},
    "getter": get_retrieval_qa,
}
candidates.append(candidate)

In [116]:
from langchain.evaluation.qa import QAEvalChain

def default_evaluate(example, prediction) -> bool:
    """Return number of correct predictions."""
    eval_chain = QAEvalChain.from_llm(OpenAI(temperature=0))
    grades = eval_chain.evaluate([example], [prediction])
    return grades[0]['text'].strip().upper() == "CORRECT"

In [117]:
from langchain.schema import Document

In [129]:
test_results = []
for i, candidate in enumerate(candidates):
    res = {"params": candidate["params"], "test_cases": {}}
    for tc in test_cases:
        texts = tc["texts"]
        metadatas = tc.get("metadatas", [{}] * len(texts))
        docs = [Document(page_content=text, metadata=metadata) for text, metadata in zip(texts, metadatas)]
        qa = candidate["getter"](docs)
        example = {"query": tc["query"], "answer": tc["expected"]}
        prediction = qa.apply([example])[0]
        evaluate = tc.get("evaluate", default_evaluate)
        res["test_cases"][tc["name"]] = evaluate(example, prediction)
    print(f"{candidate['params']}: {sum(res['test_cases'].values())} / {len(test_cases)}")
    test_results.append(res)

Using embedded DuckDB without persistence: data will be transient
Using embedded DuckDB without persistence: data will be transient
Using embedded DuckDB without persistence: data will be transient
Using embedded DuckDB without persistence: data will be transient
Using embedded DuckDB without persistence: data will be transient
Using embedded DuckDB without persistence: data will be transient
Using embedded DuckDB without persistence: data will be transient
Using embedded DuckDB without persistence: data will be transient


candidate['params']={'store': 'Chroma', 'search': 'similarity', 'k': 4}: 1 / 7


Using embedded DuckDB without persistence: data will be transient
Using embedded DuckDB without persistence: data will be transient
Using embedded DuckDB without persistence: data will be transient
Using embedded DuckDB without persistence: data will be transient
Using embedded DuckDB without persistence: data will be transient
Using embedded DuckDB without persistence: data will be transient
Using embedded DuckDB without persistence: data will be transient


candidate['params']={'store': 'Chroma', 'search': 'similarity', 'k': 6}: 2 / 7


Using embedded DuckDB without persistence: data will be transient
Using embedded DuckDB without persistence: data will be transient
Using embedded DuckDB without persistence: data will be transient
Using embedded DuckDB without persistence: data will be transient
Using embedded DuckDB without persistence: data will be transient
Using embedded DuckDB without persistence: data will be transient


candidate['params']={'store': 'Chroma', 'search': 'mmr', 'k': 4}: 1 / 7


In [147]:
import pandas as pd

test_result_df = pd.DataFrame([{"System": f"{res['params']}", **res["test_cases"]} for res in test_results])

def highlight(s):
    return ['background-color: honeydew' if s_ else 'background-color: mistyrose' for s_ in s]

test_result_df.style.apply(highlight, subset=test_result_df.columns.drop("System"))

Unnamed: 0,System,Many documents 1,Many documents 2,Redundant documents,Split statement,Metadata question (temporal information),Conflicting statements,One fact in long document
0,"{'store': 'Chroma', 'search': 'similarity', 'k': 4}",False,True,False,False,False,False,False
1,"{'store': 'Chroma', 'search': 'similarity', 'k': 6}",True,True,False,False,False,False,False
2,"{'store': 'Chroma', 'search': 'mmr', 'k': 4}",False,True,False,False,False,False,False
