# Clear trulens modules

This section is needed to clear the 'trulens' modules from the sys.modules dictionary.
It ensures that any 'trulens' modules are loaded properly and alleviates potential state issues across notebooks.

In [None]:
import sys

# Forget every cached 'trulens*' module so the next import loads it afresh.
# The names are snapshotted first because entries are deleted from sys.modules.
stale_modules = [name for name in list(sys.modules) if name.startswith('trulens')]
for name in stale_modules:
    del sys.modules[name]

# 📓 _LangChain_ Quickstart

In this quickstart you will create a simple LCEL Chain and learn how to log it and get feedback on an LLM response.

For evaluation, we will leverage the RAG triad of groundedness, context relevance and answer relevance.

You'll also learn how to use feedbacks for guardrails, via filtering retrieved context.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/langchain_quickstart.ipynb)

## Setup
### Add API keys
For this quickstart you will need an OpenAI API key.

In [1]:
# ! pip install trulens_eval openai langchain langchain-openai langchain_community faiss-cpu bs4 tiktoken

In [1]:
import os

# Keep a key that is already exported in the environment; only fall back to
# the placeholder below when none is set. Replace "sk-..." with your own key
# (or export OPENAI_API_KEY before starting Jupyter) and never commit a real key.
os.environ.setdefault("OPENAI_API_KEY", "sk-...")

### Import from LangChain and TruLens

In [2]:
# Imports main tools:
from trulens_eval import TruChain, Tru
tru = Tru()
tru.reset_database()

# Imports from LangChain to build app.
# ChatOpenAI, WebBaseLoader and StrOutputParser are imported from the
# langchain_openai / langchain_community / langchain_core packages (already
# used elsewhere in this notebook) instead of the deprecated
# `langchain.chat_models`, `langchain.document_loaders` and `langchain.schema`
# paths.
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI



🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


### Load documents

In [3]:
# Only parse the parts of the page carrying these CSS classes
post_sections = bs4.SoupStrainer(
    class_=("post-content", "post-title", "post-header")
)

loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": post_sections},
)
docs = loader.load()

### Create Vector Store

In [4]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunk the loaded pages with the splitter's default settings
text_splitter = RecursiveCharacterTextSplitter()
documents = text_splitter.split_documents(docs)

# Embed every chunk with OpenAI embeddings and index the vectors with FAISS
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents, embeddings)

### Create RAG

In [5]:
# Expose the FAISS store through its retriever interface
retriever = vectorstore.as_retriever()

# Prompt published on the LangChain hub under "rlm/rag-prompt"
prompt = hub.pull("rlm/rag-prompt")
# temperature=0 asks the chat model for the least random sampling
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

def format_docs(docs):
    """Join the `page_content` of every document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a `page_content` string.

    Returns:
        A single string; empty when `docs` is empty.
    """
    separator = "\n\n"
    contents = [doc.page_content for doc in docs]
    return separator.join(contents)

# Inputs of the prompt: the retrieved documents rendered as one string, plus
# the user's question passed through unchanged.
prompt_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# prompt inputs -> prompt -> chat model -> plain string answer
rag_chain = prompt_inputs | prompt | llm | StrOutputParser()

  warn_deprecated(


### Send your first request

In [6]:
rag_chain.invoke("What is Task Decomposition?")

'Task Decomposition is a technique that breaks down complex tasks into smaller and simpler steps to enhance model performance. It involves transforming big tasks into manageable tasks by decomposing them into multiple steps. Task decomposition can be achieved through various methods, such as using simple prompting, task-specific instructions, or relying on external classical planners.'

## Initialize Feedback Function(s)

In [7]:
from trulens_eval.feedback.provider import OpenAI
from trulens_eval import Feedback
import numpy as np

# Initialize provider class
# (`OpenAI` here is the TruLens feedback provider imported at the top of this cell)
provider = OpenAI()

# select context to be used in feedback. the location of context is app specific.
# For this chain the selector resolves to the `page_content` of the documents
# returned by the retriever (see the selector paths printed below the cell).
from trulens_eval.app import App
context = App.select_context(rag_chain)

# Define a groundedness feedback function
# source = all context chunks collected together, statement = the app's main output
f_groundedness = (
    Feedback(provider.groundedness_measure_with_cot_reasons, name = "Groundedness")
    .on(context.collect()) # collect context chunks into a list
    .on_output()
)

# Question/answer relevance between overall question and answer.
# prompt = the app's main input, response = the app's main output
f_answer_relevance = (
    Feedback(provider.relevance_with_cot_reasons, name = "Answer Relevance")
    .on_input_output()
)
# Context relevance between question and each context chunk.
# question = the app's main input, context = each retrieved chunk; the
# per-chunk scores are then averaged into a single value with np.mean
f_context_relevance = (
    Feedback(provider.context_relevance_with_cot_reasons, name = "Context Relevance")
    .on_input()
    .on(context)
    .aggregate(np.mean)
)

✅ In Groundedness, input source will be set to __record__.app.first.steps__.context.first.invoke.rets[:].page_content.collect() .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input context will be set to __record__.app.first.steps__.context.first.invoke.rets[:].page_content .


## Instrument chain for logging with TruLens

In [8]:
# Wrap the chain in a TruChain recorder: calls it records are logged under
# `app_id` and evaluated with the three feedback functions defined above
tru_recorder = TruChain(rag_chain,
    app_id='Chain1_ChatApplication',
    feedbacks=[f_answer_relevance, f_context_relevance, f_groundedness])

In [9]:
# Invoke the chain inside the recorder's context manager; the resulting
# record(s) are available afterwards through `recording`
with tru_recorder as recording:
    llm_response = rag_chain.invoke("What is Task Decomposition?")

display(llm_response)

'Task Decomposition is a technique that breaks down complex tasks into smaller and simpler steps to enhance model performance. It involves transforming big tasks into manageable tasks by decomposing them into multiple steps. Task decomposition can be achieved through various methods, such as using simple prompting, task-specific instructions, or relying on external classical planners.'

Check results

In [11]:
tru.get_leaderboard()

Unnamed: 0_level_0,Answer Relevance,Groundedness,Context Relevance,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Chain1_ChatApplication,0.9,1.0,0.6,2.0,0.004985


By looking closer at context relevance, we see that our retriever is returning irrelevant context.

In [12]:
from trulens_eval.utils.display import get_feedback_result

# Show the "Context Relevance" results of the most recent record
last_record = recording.records[-1]
get_feedback_result(last_record, "Context Relevance")

Unnamed: 0,question,context,ret
0,What is Task Decomposition?,Fig. 1. Overview of a LLM-powered autonomous a...,0.8
1,What is Task Decomposition?,Fig. 10. A picture of a sea otter using rock t...,0.4
2,What is Task Decomposition?,(3) Task execution: Expert models execute on t...,0.4
3,What is Task Decomposition?,Fig. 6. Illustration of how Algorithm Distilla...,0.8


## Use guardrails

In addition to making informed iteration, we can also directly use feedback results as guardrails at inference time. In particular, here we show how to use the context relevance score as a guardrail to filter out irrelevant context before it gets passed to the LLM. This both reduces hallucination and improves efficiency.

Above, you can see the TruLens feedback display of the context relevance score for each chunk retrieved by our RAG.

Wouldn't it be great if we could automatically filter out context chunks with relevance scores below 0.75?

We can do so with the TruLens guardrail, *WithFeedbackFilterDocuments*. All we have to do is use the method `of_retriever` to create a new filtered retriever, passing in the original retriever along with the feedback function and threshold we want to use.

In [13]:
from trulens_eval.guardrails.langchain import WithFeedbackFilterDocuments

# note: feedback function used for guardrail must only return a score, not also reasons
f_context_relevance_score = Feedback(provider.context_relevance)

# Derive a retriever that filters the original retriever's documents using the
# context relevance score and the given threshold
filtered_retriever = WithFeedbackFilterDocuments.of_retriever(
    retriever=retriever,
    feedback=f_context_relevance_score,
    threshold=0.75,
)

# Same pipeline as before, now fed by the filtered retriever
filtered_inputs = {
    "context": filtered_retriever | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = filtered_inputs | prompt | llm | StrOutputParser()

Then we can operate as normal

In [14]:
# Record the chain that uses the filtered retriever under its own app_id so
# it appears as a separate app next to 'Chain1_ChatApplication'
tru_recorder = TruChain(rag_chain,
    app_id='Chain1_ChatApplication_Filtered',
    feedbacks=[f_answer_relevance, f_context_relevance, f_groundedness])

# Ask the same question as before so the two apps can be compared
with tru_recorder as recording:
    llm_response = rag_chain.invoke("What is Task Decomposition?")

display(llm_response)

'Task decomposition is a technique used to break down complex tasks into smaller and simpler steps. It can be achieved through prompting techniques like Chain of Thought or Tree of Thoughts, or by using task-specific instructions. Task decomposition can also involve outsourcing the planning step to an external classical planner, as seen in the LLM+P approach.'

## See the power of context filters!

If we inspect the context relevance of our retrieval now, we see only relevant context chunks!

In [15]:
from trulens_eval.utils.display import get_feedback_result

# Show the "Context Relevance" results of the most recent record
last_record = recording.records[-1]
get_feedback_result(last_record, "Context Relevance")

Unnamed: 0,question,context,ret
0,What is Task Decomposition?,Fig. 1. Overview of a LLM-powered autonomous a...,0.8


In [16]:
tru.run_dashboard()

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://192.168.4.206:1236 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>

## Retrieve records and feedback

In [16]:
# The record of the app invocation can be retrieved from the `recording`:

rec = recording.get() # use .get if only one record
# recs = recording.records # use .records if multiple

display(rec)

Record(record_id='record_hash_b58260de958b6a44f38e92067bd75220', app_id='Chain1_ChatApplication_Filtered', cost=Cost(n_requests=6, n_successful_requests=21, n_classes=0, n_tokens=15902, n_stream_chunks=0, n_prompt_tokens=15823, n_completion_tokens=79, cost=0.023838500000000002), perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 168333), end_time=datetime.datetime(2024, 7, 3, 5, 51, 13, 718205)), ts=datetime.datetime(2024, 7, 3, 5, 51, 13, 718234), tags='-', meta=None, main_input='What is Task Decomposition?', main_output='Task decomposition is a technique used to break down complex tasks into smaller and simpler steps. It can be achieved through prompting techniques like Chain of Thought or Tree of Thoughts, or by using task-specific instructions. Task decomposition can also involve outsourcing the planning step to an external classical planner, as seen in the LLM+P approach.', main_error=None, calls=[RecordAppCall(call_id='201a07bb-6b2f-4616-ae75-0320ec1bc97b', stack=[Reco

In [17]:
# The results of the feedback functions can be retrieved from
# `Record.feedback_results` or using the `wait_for_feedback_results` method. The
# results if retrieved directly are `Future` instances (see
# `concurrent.futures`). You can use `as_completed` to wait until they have
# finished evaluating or use the utility method:

for feedback, feedback_result in rec.wait_for_feedback_results().items():
    print(feedback.name, feedback_result.result)

# See more about wait_for_feedback_results:
# help(rec.wait_for_feedback_results)

Answer Relevance 0.9
Context Relevance 0.8
Groundedness 0.9666666666666667


In [18]:
records, feedback = tru.get_records_and_feedback(app_ids=[])

records.head()

Unnamed: 0,app_id,app_json,type,record_id,input,output,tags,record_json,cost_json,perf_json,ts,Groundedness,Context Relevance,Answer Relevance,Groundedness_calls,Context Relevance_calls,Answer Relevance_calls,latency,total_tokens,total_cost
0,Chain1_ChatApplication,"{""tru_class_info"": {""name"": ""TruChain"", ""modul...",RunnableSequence(langchain_core.runnables.base),record_hash_402adfcfca139d116e6a0dd79b7813c8,"""What is Task Decomposition?""","""Task Decomposition is a technique that breaks...",-,"{""record_id"": ""record_hash_402adfcfca139d116e6...","{""n_requests"": 2, ""n_successful_requests"": 3, ...","{""start_time"": ""2024-07-03T05:50:39.251674"", ""...",2024-07-03T05:50:41.272357,1.0,0.6,0.9,[{'args': {'source': ['Fig. 1. Overview of a L...,[{'args': {'question': 'What is Task Decomposi...,[{'args': {'prompt': 'What is Task Decompositi...,2,3314,0.004985
1,Chain1_ChatApplication_Filtered,"{""tru_class_info"": {""name"": ""TruChain"", ""modul...",RunnableSequence(langchain_core.runnables.base),record_hash_b58260de958b6a44f38e92067bd75220,"""What is Task Decomposition?""","""Task decomposition is a technique used to bre...",-,"{""record_id"": ""record_hash_b58260de958b6a44f38...","{""n_requests"": 6, ""n_successful_requests"": 21,...","{""start_time"": ""2024-07-03T05:51:11.168333"", ""...",2024-07-03T05:51:13.718234,0.966667,0.8,0.9,[{'args': {'source': ['Fig. 1. Overview of a L...,[{'args': {'question': 'What is Task Decomposi...,[{'args': {'prompt': 'What is Task Decompositi...,2,15902,0.023839


In [19]:
tru.get_leaderboard(app_ids=[])

Unnamed: 0_level_0,Answer Relevance,Groundedness,Context Relevance,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Chain1_ChatApplication,0.9,1.0,0.6,2.0,0.004985
Chain1_ChatApplication_Filtered,0.9,0.966667,0.8,2.0,0.023839


## Explore in a Dashboard

In [None]:
tru.run_dashboard() # open a local streamlit app to explore

# tru.stop_dashboard() # stop if needed

Alternatively, you can run `trulens-eval` from a command line in the same folder to start the dashboard.

## Learn more about the call stack

In [20]:
json_like = last_record.layout_calls_as_app()

In [21]:
json_like

Munch({'record_id': 'record_hash_b58260de958b6a44f38e92067bd75220', 'app_id': 'Chain1_ChatApplication_Filtered', 'cost': {'n_requests': 6, 'n_successful_requests': 21, 'n_classes': 0, 'n_tokens': 15902, 'n_stream_chunks': 0, 'n_prompt_tokens': 15823, 'n_completion_tokens': 79, 'cost': 0.023838500000000002}, 'perf': {'start_time': '2024-07-03T05:51:11.168333', 'end_time': '2024-07-03T05:51:13.718205'}, 'ts': '2024-07-03T05:51:13.718234', 'tags': '-', 'meta': None, 'main_input': 'What is Task Decomposition?', 'main_output': 'Task decomposition is a technique used to break down complex tasks into smaller and simpler steps. It can be achieved through prompting techniques like Chain of Thought or Tree of Thoughts, or by using task-specific instructions. Task decomposition can also involve outsourcing the planning step to an external classical planner, as seen in the LLM+P approach.', 'main_error': None, 'calls': [{'call_id': '201a07bb-6b2f-4616-ae75-0320ec1bc97b', 'stack': [{'path': 'app', 

In [22]:
from ipytree import Tree, Node

def display_call_stack(data):
    """Build an ipytree `Tree` summarising a record laid out as nested mappings.

    Args:
        data: Mapping with the keys 'record_id', 'app_id', 'cost', 'perf',
            'ts', 'tags', 'main_input', 'main_output', 'main_error' and
            'calls'. Every call has a 'stack' list whose steps carry a 'path'
            and, optionally, an 'expanded' list of further steps.

    Returns:
        The populated `Tree`: one node per summary field, followed by a
        'Calls' node holding one 'Call' node per call with a 'Step' node for
        each stack entry.
    """
    # (label shown in the tree, key read from `data`), in display order
    summary_fields = (
        ('Record ID', 'record_id'),
        ('App ID', 'app_id'),
        ('Cost', 'cost'),
        ('Performance', 'perf'),
        ('Timestamp', 'ts'),
        ('Tags', 'tags'),
        ('Main Input', 'main_input'),
        ('Main Output', 'main_output'),
        ('Main Error', 'main_error'),
    )

    tree = Tree()
    for label, key in summary_fields:
        tree.add_node(Node('{}: {}'.format(label, data[key])))

    calls_root = Node('Calls')
    tree.add_node(calls_root)

    for call in data['calls']:
        call_branch = Node('Call')
        calls_root.add_node(call_branch)

        for frame in call['stack']:
            frame_node = Node('Step: {}'.format(frame['path']))
            call_branch.add_node(frame_node)

            # Only some steps carry nested, expanded sub-steps
            if 'expanded' not in frame:
                continue

            expanded_branch = Node('Expanded')
            frame_node.add_node(expanded_branch)
            for sub_frame in frame['expanded']:
                expanded_branch.add_node(
                    Node('Step: {}'.format(sub_frame['path']))
                )

    return tree

# Usage
tree = display_call_stack(json_like)
tree

Tree(nodes=(Node(name='Record ID: record_hash_b58260de958b6a44f38e92067bd75220'), Node(name='App ID: Chain1_Ch…

# Clear trulens modules

This section is needed to clear the 'trulens' modules from the sys.modules dictionary.
It ensures that any 'trulens' modules are loaded properly and alleviates potential state issues across notebooks.

In [None]:
import sys

# Forget every cached 'trulens*' module so the next import loads it afresh.
# The names are snapshotted first because entries are deleted from sys.modules.
stale_modules = [name for name in list(sys.modules) if name.startswith('trulens')]
for name in stale_modules:
    del sys.modules[name]

# 📓 LlamaIndex Quickstart

In this quickstart you will create a simple Llama Index app and learn how to log it and get feedback on an LLM response.

You'll also learn how to use feedbacks for guardrails, via filtering retrieved context.

For evaluation, we will leverage the RAG triad of groundedness, context relevance and answer relevance.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/llama_index_quickstart.ipynb)

## Setup

### Install dependencies
Let's install some of the dependencies for this notebook if we don't have them already

In [None]:
# pip install trulens_eval llama_index openai

### Add API keys
For this quickstart, you will need an Open AI key. The OpenAI key is used for embeddings, completion and evaluation.

In [1]:
import os

# Keep a key that is already exported in the environment; only fall back to
# the placeholder below when none is set. Replace "sk-..." with your own key
# (or export OPENAI_API_KEY before starting Jupyter) and never commit a real key.
os.environ.setdefault("OPENAI_API_KEY", "sk-...")

### Import from TruLens

In [2]:
from trulens_eval import Tru
tru = Tru()
tru.reset_database()

🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


### Download data

This example uses the text of Paul Graham’s essay, [“What I Worked On”](https://paulgraham.com/worked.html), and is the canonical llama-index example.

The easiest way to get it is to [download it via this link](https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt) and save it in a folder called data. You can do so with the following command:

In [3]:
import os
import urllib.request

url = "https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt"
file_path = 'data/paul_graham_essay.txt'

# exist_ok=True makes directory creation idempotent, so no separate
# existence check is needed before creating the folder.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Download only once. The file is fetched under a temporary name and renamed
# when complete, so an interrupted download never leaves a truncated essay
# that later runs would mistake for the finished file.
if not os.path.exists(file_path):
    partial_path = file_path + '.part'
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, file_path)


### Create Simple LLM Application

This example uses LlamaIndex which internally uses an OpenAI LLM.

In [4]:
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.llms.openai import OpenAI

# Global LlamaIndex settings used when building and querying the index
Settings.chunk_size = 128
Settings.chunk_overlap = 16
Settings.llm = OpenAI()

# Read everything in the local "data" folder and build a vector index from it
reader = SimpleDirectoryReader("data")
documents = reader.load_data()
index = VectorStoreIndex.from_documents(documents)

# Query engine configured with similarity_top_k=3
query_engine = index.as_query_engine(similarity_top_k=3)

### Send your first request

In [5]:
response = query_engine.query("What did the author do growing up?")
print(response)

The author worked on writing and programming outside of school before college.


## Initialize Feedback Function(s)

In [6]:
from trulens_eval.feedback.provider import OpenAI
from trulens_eval import Feedback
import numpy as np

# Initialize provider class
# (`OpenAI` here is the TruLens feedback provider imported at the top of this
# cell; it rebinds the name previously imported from llama_index.llms.openai)
provider = OpenAI()

# select context to be used in feedback. the location of context is app specific.
# For this query engine the selector resolves to the text of the source nodes
# returned by the query (see the selector paths printed below the cell).
from trulens_eval.app import App
context = App.select_context(query_engine)

# Define a groundedness feedback function
# source = all context chunks collected together, statement = the app's main output
f_groundedness = (
    Feedback(provider.groundedness_measure_with_cot_reasons, name = "Groundedness")
    .on(context.collect()) # collect context chunks into a list
    .on_output()
)

# Question/answer relevance between overall question and answer.
# prompt = the app's main input, response = the app's main output
f_answer_relevance = (
    Feedback(provider.relevance_with_cot_reasons, name = "Answer Relevance")
    .on_input_output()
)
# Context relevance between question and each context chunk.
# question = the app's main input, context = each retrieved chunk; the
# per-chunk scores are then averaged into a single value with np.mean
f_context_relevance = (
    Feedback(provider.context_relevance_with_cot_reasons, name = "Context Relevance")
    .on_input()
    .on(context)
    .aggregate(np.mean)
)

✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text.collect() .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input context will be set to __record__.app.query.rets.source_nodes[:].node.text .


## Instrument app for logging with TruLens

In [7]:
from trulens_eval import TruLlama
# Wrap the query engine in a TruLlama recorder: calls it records are logged
# under `app_id` and evaluated with the three feedback functions defined above
tru_query_engine_recorder = TruLlama(query_engine,
    app_id='LlamaIndex_App1',
    feedbacks=[f_groundedness, f_answer_relevance, f_context_relevance])

In [8]:
# Use the recorder as a context manager: the query made inside the `with`
# block is recorded and available afterwards through `recording`
with tru_query_engine_recorder as recording:
    query_engine.query("What did the author do growing up?")

## Use guardrails

In addition to making informed iteration, we can also directly use feedback results as guardrails at inference time. In particular, here we show how to use the context relevance score as a guardrail to filter out irrelevant context before it gets passed to the LLM. This both reduces hallucination and improves efficiency.

Below, you can see the TruLens feedback display of each context relevance chunk retrieved by our RAG.

In [9]:
from trulens_eval.utils.display import get_feedback_result

# Show the "Context Relevance" results of the most recent record
last_record = recording.records[-1]
get_feedback_result(last_record, "Context Relevance")

Wouldn't it be great if we could automatically filter out context chunks with relevance scores below 0.5?

We can do so with the TruLens guardrail, *WithFeedbackFilterNodes*. All we have to do is wrap the original query engine in `WithFeedbackFilterNodes` to create a new filtered query engine, passing in the feedback function and threshold we want to use.

In [10]:
from trulens_eval.guardrails.llama import WithFeedbackFilterNodes

# note: feedback function used for guardrail must only return a score, not also reasons
f_context_relevance_score = Feedback(provider.context_relevance)

# Wrap the original query engine so that retrieved nodes are filtered with the
# context relevance score and a threshold of 0.5.
# NOTE(review): whether a score equal to the threshold is kept is decided
# inside WithFeedbackFilterNodes - confirm against its documentation.
filtered_query_engine = WithFeedbackFilterNodes(query_engine, feedback=f_context_relevance_score, threshold=0.5)

Then we can operate as normal

In [11]:
# Record the filtered query engine under its own app_id so it appears as a
# separate app next to 'LlamaIndex_App1'
tru_recorder = TruLlama(filtered_query_engine,
    app_id='LlamaIndex_App1_Filtered',
    feedbacks=[f_answer_relevance, f_context_relevance, f_groundedness])

# Ask the same question as before so the two apps can be compared
with tru_recorder as recording:
    llm_response = filtered_query_engine.query("What did the author do growing up?")

display(llm_response)

Response(response='The author focused on writing and programming outside of school before college. Specifically, the author wrote short stories, which were described as having characters with strong feelings but lacking in plot.', source_nodes=[NodeWithScore(node=TextNode(id_='a98829e7-c59e-4906-9ec8-d1a84ab231e4', embedding=None, metadata={'file_path': '/Users/jreini/Desktop/development/trulens/trulens_eval/examples/quickstart/data/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 75042, 'creation_date': '2024-07-03', 'last_modified_date': '2024-07-03'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='01d1924b-a1ae-4a1b-a728-02c0d2076cdd', node_type=<Obje

## See the power of context filters!

If we inspect the context relevance of our retrieval now, we see only relevant context chunks!

In [12]:
from trulens_eval.utils.display import get_feedback_result

# Show the "Context Relevance" results of the most recent record
last_record = recording.records[-1]
get_feedback_result(last_record, "Context Relevance")

In [13]:
tru.get_leaderboard()

Unnamed: 0_level_0,Groundedness,Context Relevance,Answer Relevance,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LlamaIndex_App1_Filtered,1.0,0.8,0.8,1.0,0.005268
LlamaIndex_App1,0.8,0.4,0.8,1.0,0.000713


## Retrieve records and feedback

In [14]:
# The record of the app invocation can be retrieved from the `recording`:

rec = recording.get() # use .get if only one record
# recs = recording.records # use .records if multiple

display(rec)

Record(record_id='record_hash_9f960b879e7fbb4c48a58b1cdfb87b3f', app_id='LlamaIndex_App1_Filtered', cost=Cost(n_requests=5, n_successful_requests=15, n_classes=0, n_tokens=3537, n_stream_chunks=0, n_prompt_tokens=3493, n_completion_tokens=44, cost=0.005267500000000001), perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 47, 20, 778856), end_time=datetime.datetime(2024, 7, 3, 5, 47, 22, 824169)), ts=datetime.datetime(2024, 7, 3, 5, 47, 22, 824425), tags='-', meta=None, main_input='What did the author do growing up?', main_output='The author focused on writing and programming outside of school before college. Specifically, the author wrote short stories, which were described as having characters with strong feelings but lacking in plot.', main_error=None, calls=[RecordAppCall(call_id='74117bff-ce0f-4fd8-9f34-d299e7a8d80f', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=trulens_eval.guardrails.llama.WithFeedbackFilterNodes, id=14295036320, init_bindings=None), na

In [None]:
tru.run_dashboard()

In [15]:
# The results of the feedback functions can be retrieved from
# `Record.feedback_results` or using the `wait_for_feedback_results` method. The
# results if retrieved directly are `Future` instances (see
# `concurrent.futures`). You can use `as_completed` to wait until they have
# finished evaluating or use the utility method:

for feedback, feedback_result in rec.wait_for_feedback_results().items():
    print(feedback.name, feedback_result.result)

# See more about wait_for_feedback_results:
# help(rec.wait_for_feedback_results)

Answer Relevance 0.8
Context Relevance 0.8
Groundedness 1.0


In [16]:
records, feedback = tru.get_records_and_feedback(app_ids=[])

records.head()

Unnamed: 0,app_id,app_json,type,record_id,input,output,tags,record_json,cost_json,perf_json,ts,Groundedness,Answer Relevance,Context Relevance,Groundedness_calls,Answer Relevance_calls,Context Relevance_calls,latency,total_tokens,total_cost
0,LlamaIndex_App1,"{""tru_class_info"": {""name"": ""TruLlama"", ""modul...",RetrieverQueryEngine(llama_index.core.query_en...,record_hash_d8d7b57e2f4927576f58e09f26469c42,"""What did the author do growing up?""","""The author worked on writing and programming ...",-,"{""record_id"": ""record_hash_d8d7b57e2f4927576f5...","{""n_requests"": 2, ""n_successful_requests"": 3, ...","{""start_time"": ""2024-07-03T05:47:14.007165"", ""...",2024-07-03T05:47:15.029467,0.8,0.8,0.4,[{'args': {'source': ['I remember taking the b...,[{'args': {'prompt': 'What did the author do g...,[{'args': {'question': 'What did the author do...,1,487,0.000713
1,LlamaIndex_App1_Filtered,"{""tru_class_info"": {""name"": ""TruLlama"", ""modul...",WithFeedbackFilterNodes(trulens_eval.guardrail...,record_hash_9f960b879e7fbb4c48a58b1cdfb87b3f,"""What did the author do growing up?""","""The author focused on writing and programming...",-,"{""record_id"": ""record_hash_9f960b879e7fbb4c48a...","{""n_requests"": 5, ""n_successful_requests"": 15,...","{""start_time"": ""2024-07-03T05:47:20.778856"", ""...",2024-07-03T05:47:22.824425,1.0,0.8,0.8,"[{'args': {'source': [""What I Worked On\n\nFeb...",[{'args': {'prompt': 'What did the author do g...,[{'args': {'question': 'What did the author do...,1,3537,0.005268


In [17]:
tru.get_leaderboard(app_ids=[])

Unnamed: 0_level_0,Answer Relevance,Context Relevance,Groundedness,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LlamaIndex_App1_Filtered,0.8,0.8,1.0,2.0,0.005268
LlamaIndex_App1,0.8,0.4,0.8,2.0,0.000713


## Explore in a Dashboard

In [None]:
tru.run_dashboard() # open a local streamlit app to explore

# tru.stop_dashboard() # stop if needed

Alternatively, you can run `trulens-eval` from a command line in the same folder to start the dashboard.

# Clear trulens modules

This section is needed to clear the 'trulens' modules from the sys.modules dictionary.
It ensures that any 'trulens' modules are loaded properly and alleviates potential state issues across notebooks.

In [None]:
import sys

# Forget every cached 'trulens*' module so the next import loads it afresh.
# The names are snapshotted first because entries are deleted from sys.modules.
stale_modules = [name for name in list(sys.modules) if name.startswith('trulens')]
for name in stale_modules:
    del sys.modules[name]

# 📓 TruLens Quickstart

In this quickstart you will create a RAG from scratch and learn how to log it and get feedback on an LLM response.

For evaluation, we will leverage the "hallucination triad" of groundedness, context relevance and answer relevance.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/quickstart.ipynb)

In [1]:
# ! pip install trulens_eval chromadb openai

In [2]:
import os

# Keep a key that is already exported in the environment; only fall back to
# the placeholder below when none is set. Replace "sk-..." with your own key
# (or export OPENAI_API_KEY before starting Jupyter) and never commit a real key.
os.environ.setdefault("OPENAI_API_KEY", "sk-...")

## Get Data

In this case, we'll just initialize some simple text in the notebook.

In [3]:
uw_info = """
The University of Washington, founded in 1861 in Seattle, is a public research university
with over 45,000 students across three campuses in Seattle, Tacoma, and Bothell.
As the flagship institution of the six public universities in Washington state,
UW encompasses over 500 buildings and 20 million square feet of space,
including one of the largest library systems in the world.
"""

wsu_info = """
Washington State University, commonly known as WSU, founded in 1890, is a public research university in Pullman, Washington.
With multiple campuses across the state, it is the state's second largest institution of higher education.
WSU is known for its programs in veterinary medicine, agriculture, engineering, architecture, and pharmacy.
"""

seattle_info = """
Seattle, a city on Puget Sound in the Pacific Northwest, is surrounded by water, mountains and evergreen forests, and contains thousands of acres of parkland.
It's home to a large tech industry, with Microsoft and Amazon headquartered in its metropolitan area.
The futuristic Space Needle, a legacy of the 1962 World's Fair, is its most iconic landmark.
"""

starbucks_info = """
Starbucks Corporation is an American multinational chain of coffeehouses and roastery reserves headquartered in Seattle, Washington.
As the world's largest coffeehouse chain, Starbucks is seen to be the main representation of the United States' second wave of coffee culture.
"""

## Create Vector Store

Create a chromadb vector store in memory.

In [4]:
import os

import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

# Embedding function backed by OpenAI's "text-embedding-ada-002" model; the
# key is read from the OPENAI_API_KEY environment variable
embedding_function = OpenAIEmbeddingFunction(
    model_name="text-embedding-ada-002",
    api_key=os.environ.get('OPENAI_API_KEY'),
)

# Fetch the "Washington" collection, creating it on first use
chroma_client = chromadb.Client()
vector_store = chroma_client.get_or_create_collection(
    embedding_function=embedding_function,
    name="Washington",
)

Populate the vector store.

In [5]:
# Add each text snippet to the collection under its own id
snippets = (
    ("uw_info", uw_info),
    ("wsu_info", wsu_info),
    ("seattle_info", seattle_info),
    ("starbucks_info", starbucks_info),
)
for snippet_id, snippet_text in snippets:
    vector_store.add(snippet_id, documents=snippet_text)

## Build RAG from scratch

Build a custom RAG from scratch, and add TruLens custom instrumentation.

In [6]:
from trulens_eval import Tru
from trulens_eval.tru_custom_app import instrument
tru = Tru()
tru.reset_database()

🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


In [7]:
from openai import OpenAI
oai_client = OpenAI()

In [8]:
from openai import OpenAI
oai_client = OpenAI()

class RAG_from_scratch:
    """Minimal retrieval-augmented generation app whose steps are instrumented for TruLens."""

    @instrument
    def retrieve(self, query: str) -> list:
        """
        Look up the documents in the vector store that match `query`.
        """
        hits = vector_store.query(query_texts=query, n_results=4)
        # `hits['documents']` is a list of lists; chain the inner lists together.
        flattened = []
        for doc_group in hits['documents']:
            flattened.extend(doc_group)
        return flattened

    @instrument
    def generate_completion(self, query: str, context_str: list) -> str:
        """
        Ask the chat model to answer `query` using the retrieved context.
        """
        # The context list is interpolated as-is between the two separator lines.
        prompt = (
            f"We have provided context information below. \n"
            f"---------------------\n"
            f"{context_str}"
            f"\n---------------------\n"
            f"Given this information, please answer the question: {query}"
        )
        response = oai_client.chat.completions.create(
            model="gpt-3.5-turbo",
            temperature=0,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content

    @instrument
    def query(self, query: str) -> str:
        """Retrieve context for `query`, then generate and return the answer."""
        retrieved = self.retrieve(query)
        return self.generate_completion(query, retrieved)

rag = RAG_from_scratch()

## Set up feedback functions.

Here we'll use groundedness, answer relevance and context relevance to detect hallucination.

In [9]:
from trulens_eval import Feedback, Select
# Alias the TruLens provider so it does not rebind `OpenAI`, the client class
# imported from the `openai` package in the cells above.
from trulens_eval.feedback.provider.openai import OpenAI as fOpenAI

import numpy as np

# LLM-backed provider that implements the feedback functions used below.
provider = fOpenAI(model_engine="gpt-4o")

# Define a groundedness feedback function
# (source: all values returned by `retrieve`, collected; statement: the app's main output).
f_groundedness = (
    Feedback(provider.groundedness_measure_with_cot_reasons, name = "Groundedness")
    .on(Select.RecordCalls.retrieve.rets.collect())
    .on_output()
)

# Question/answer relevance between overall question and answer.
f_answer_relevance = (
    Feedback(provider.relevance_with_cot_reasons, name = "Answer Relevance")
    .on_input()
    .on_output()
)

# Context relevance between question and each context chunk.
f_context_relevance = (
    Feedback(provider.context_relevance_with_cot_reasons, name = "Context Relevance")
    .on_input()
    .on(Select.RecordCalls.retrieve.rets[:])
    .aggregate(np.mean) # choose a different aggregation method if you wish
)

✅ In Groundedness, input source will be set to __record__.app.retrieve.rets.collect() .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input context will be set to __record__.app.retrieve.rets[:] .


## Construct the app
Wrap the custom RAG with TruCustomApp, add list of feedbacks for eval

In [10]:
from trulens_eval import TruCustomApp
tru_rag = TruCustomApp(rag,
    app_id = 'RAG v1',
    feedbacks = [f_groundedness, f_answer_relevance, f_context_relevance])

## Run the app
Use `tru_rag` as a context manager for the custom RAG-from-scratch app.

In [11]:
with tru_rag as recording:
    rag.query("When was the University of Washington founded?")

## Check results

We can view results in the leaderboard.

In [12]:
tru.get_leaderboard()

Unnamed: 0_level_0,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1
RAG v1,3.0,0.000511


In [13]:
last_record = recording.records[-1]

from trulens_eval.utils.display import get_feedback_result
get_feedback_result(last_record, "Context Relevance")

Unnamed: 0,question,context,ret
0,When was the University of Washington founded?,"\nThe University of Washington, founded in 186...",1.0
1,When was the University of Washington founded?,"\nWashington State University, commonly known ...",0.0
2,When was the University of Washington founded?,"\nSeattle, a city on Puget Sound in the Pacifi...",0.0
3,When was the University of Washington founded?,\nStarbucks Corporation is an American multina...,0.0


## Use guardrails

In addition to making informed iteration, we can also directly use feedback results as guardrails at inference time. In particular, here we show how to use the context relevance score as a guardrail to filter out irrelevant context before it gets passed to the LLM. This both reduces hallucination and improves efficiency.

To do so, we'll rebuild our RAG using the `@context_filter` decorator on the method we want to filter, and pass in the feedback function and threshold to use for guardrailing.

In [14]:
# note: feedback function used for guardrail must only return a score, not also reasons
f_context_relevance_score = (
    Feedback(provider.context_relevance, name = "Context Relevance")
)

from trulens_eval.guardrails.base import context_filter

class filtered_RAG_from_scratch:
    """RAG app whose retrieval step is guarded by a context-relevance filter."""

    @instrument
    @context_filter(f_context_relevance_score, 0.75, keyword_for_prompt="query")
    def retrieve(self, query: str) -> list:
        """
        Look up the documents in the vector store that match `query`.
        """
        hits = vector_store.query(query_texts=query, n_results=4)
        # `hits['documents']` is a list of lists; chain the inner lists together.
        flattened = []
        for doc_group in hits['documents']:
            flattened.extend(doc_group)
        return flattened

    @instrument
    def generate_completion(self, query: str, context_str: list) -> str:
        """
        Ask the chat model to answer `query` using the retrieved context.
        """
        # The context list is interpolated as-is between the two separator lines.
        prompt = (
            f"We have provided context information below. \n"
            f"---------------------\n"
            f"{context_str}"
            f"\n---------------------\n"
            f"Given this information, please answer the question: {query}"
        )
        response = oai_client.chat.completions.create(
            model="gpt-3.5-turbo",
            temperature=0,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content

    @instrument
    def query(self, query: str) -> str:
        """Retrieve (filtered) context for `query`, then generate and return the answer."""
        # Keyword arguments are kept: the filter is configured with keyword_for_prompt="query".
        retrieved = self.retrieve(query=query)
        return self.generate_completion(query=query, context_str=retrieved)

filtered_rag = filtered_RAG_from_scratch()

## Record and operate as normal

In [15]:
from trulens_eval import TruCustomApp
filtered_tru_rag = TruCustomApp(filtered_rag,
    app_id = 'RAG v2',
    feedbacks = [f_groundedness, f_answer_relevance, f_context_relevance])

with filtered_tru_rag as recording:
    filtered_rag.query(query="when was the university of washington founded?")

In [19]:
tru.get_leaderboard(app_ids=[])

Unnamed: 0_level_0,Answer Relevance,Groundedness,Context Relevance,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RAG v2,1.0,1.0,1.0,1.0,0.000203
RAG v1,1.0,1.0,0.25,1.0,0.000511


See the power of filtering!

In [17]:
last_record = recording.records[-1]

from trulens_eval.utils.display import get_feedback_result
get_feedback_result(last_record, "Context Relevance")

Unnamed: 0,question,context,ret
0,when was the university of washington founded?,"\nThe University of Washington, founded in 186...",1.0


In [23]:
tru.run_dashboard(port=3453, force=True)

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://192.168.4.206:3453 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>

# Clear trulens modules

This section is needed to clear the 'trulens' modules from the sys.modules dictionary.
It ensures that any 'trulens' modules are loaded properly and alleviates potential state issues across notebooks.

In [None]:
import sys

# Snapshot the matching names first so sys.modules is not resized while iterating.
stale_modules = [name for name in list(sys.modules) if name.startswith('trulens')]
for name in stale_modules:
    # Forget the cached module so the next import loads it afresh.
    sys.modules.pop(name)

# Prototype Evals
This notebook shows the use of the dummy feedback function provider which
behaves like the huggingface provider except it does not actually perform any
network calls and just produces constant results. It can be used to prototype
feedback function wiring for your apps before invoking potentially slow (to
run/to load) feedback functions.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/prototype_evals.ipynb)

## Import libraries

In [None]:
# ! pip install trulens_eval

In [None]:
from trulens_eval import Feedback
from trulens_eval import Tru

tru = Tru()

tru.run_dashboard()

## Set keys

In [None]:
import os
os.environ["OPENAI_API_KEY"] = "sk-..."

## Build the app

In [None]:
from openai import OpenAI
oai_client = OpenAI()

from trulens_eval.tru_custom_app import instrument

class APP:
    """Minimal LLM app: a single instrumented call to the OpenAI chat completions API."""

    @instrument
    def completion(self, prompt):
        """Ask the chat model to answer `prompt` and return the text of its reply."""
        user_message = {
            "role": "user",
            "content": f"Please answer the question: {prompt}",
        }
        response = oai_client.chat.completions.create(
            model="gpt-3.5-turbo",
            temperature=0,
            messages=[user_message],
        )
        return response.choices[0].message.content
    
llm_app = APP()

## Create dummy feedback

By setting the provider as `Dummy()`, you can erect your evaluation suite and then easily substitute in a real model provider (e.g. OpenAI) later.

In [None]:
from trulens_eval.feedback.provider.hugs import Dummy

# hugs = Huggingface()
hugs = Dummy()

f_positive_sentiment = Feedback(hugs.positive_sentiment).on_output()

## Create the app

In [None]:
# add trulens as a context manager for llm_app with dummy feedback
from trulens_eval import TruCustomApp
tru_app = TruCustomApp(llm_app,
                       app_id = 'LLM App v1',
                       feedbacks = [f_positive_sentiment])

## Run the app

In [None]:
with tru_app as recording:
    llm_app.completion('give me a good name for a colorful sock company')

In [None]:
tru.get_leaderboard(app_ids=[tru_app.app_id])

# Clear trulens modules

This section is needed to clear the 'trulens' modules from the sys.modules dictionary.
It ensures that any 'trulens' modules are loaded properly and alleviates potential state issues across notebooks.

In [None]:
import sys

# Snapshot the matching names first so sys.modules is not resized while iterating.
stale_modules = [name for name in list(sys.modules) if name.startswith('trulens')]
for name in stale_modules:
    # Forget the cached module so the next import loads it afresh.
    sys.modules.pop(name)

# 📓 Logging Human Feedback

In many situations, it can be useful to log human feedback from your users about your LLM app's performance. Combining human feedback along with automated feedback can help you drill down on subsets of your app that underperform, and uncover new failure modes. This example will walk you through a simple example of recording human feedback with TruLens.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/human_feedback.ipynb)

In [None]:
# ! pip install trulens_eval openai

In [None]:
import os

from trulens_eval import Tru
from trulens_eval import TruCustomApp

tru = Tru()

## Set Keys

For this example, you need an OpenAI key.

In [None]:
os.environ["OPENAI_API_KEY"] = "sk-..."

## Set up your app

Here we set up a custom application using just an OpenAI chat completion. The process for logging human feedback is the same however you choose to set up your app.

In [None]:
from openai import OpenAI
oai_client = OpenAI()

from trulens_eval.tru_custom_app import instrument

class APP:
    """Minimal LLM app: a single instrumented call to the OpenAI chat completions API."""

    @instrument
    def completion(self, prompt):
        """Ask the chat model to answer `prompt` and return the text of its reply."""
        user_message = {
            "role": "user",
            "content": f"Please answer the question: {prompt}",
        }
        response = oai_client.chat.completions.create(
            model="gpt-3.5-turbo",
            temperature=0,
            messages=[user_message],
        )
        return response.choices[0].message.content
    
llm_app = APP()

# add trulens as a context manager for llm_app
tru_app = TruCustomApp(llm_app, app_id = 'LLM App v1')


## Run the app

In [None]:
with tru_app as recording:
    llm_app.completion("Give me 10 names for a colorful sock company")

In [None]:
# Get the record to add the feedback to.
record = recording.get()

## Create a mechanism for recording human feedback.

Be sure to click one of the emoji buttons below so that `human_feedback` is set before you log it.

In [None]:
from ipywidgets import Button, HBox, VBox

# Latest rating: 1 for thumbs up, 0 for thumbs down, None until a button is clicked.
human_feedback = None


def on_thumbs_up_button_clicked(b):
    """Click handler that records a positive rating (the argument from `on_click` is unused)."""
    global human_feedback
    human_feedback = 1


def on_thumbs_down_button_clicked(b):
    """Click handler that records a negative rating (the argument from `on_click` is unused)."""
    global human_feedback
    human_feedback = 0


# Create each button and attach its handler.
thumbs_up_button = Button(description='👍')
thumbs_up_button.on_click(on_thumbs_up_button_clicked)

thumbs_down_button = Button(description='👎')
thumbs_down_button.on_click(on_thumbs_down_button_clicked)

# Last expression of the cell: display both buttons in one HBox.
HBox([thumbs_up_button, thumbs_down_button])

In [None]:
# add the human feedback to a particular app and record
# `human_feedback` is 1 (thumbs up), 0 (thumbs down), or None if no button was
# clicked in the widget cell above.
tru.add_feedback(
    name="Human Feedback",
    record_id=record.record_id,
    app_id=tru_app.app_id,
    result=human_feedback
)

## See the result logged with your app.

In [None]:
tru.get_leaderboard(app_ids=[tru_app.app_id])

# Clear trulens modules

This section is needed to clear the 'trulens' modules from the sys.modules dictionary.
It ensures that any 'trulens' modules are loaded properly and alleviates potential state issues across notebooks.

In [None]:
import sys

# Snapshot the matching names first so sys.modules is not resized while iterating.
stale_modules = [name for name in list(sys.modules) if name.startswith('trulens')]
for name in stale_modules:
    # Forget the cached module so the next import loads it afresh.
    sys.modules.pop(name)

# 📓 Ground Truth Evaluations

In this quickstart you will create and evaluate a simple LLM app using ground truth. Ground truth evaluation can be especially useful during early LLM experiments when you have a small set of example queries that are critical to get right.

Ground truth evaluation works by measuring the similarity between an LLM response and its matching verified response.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/groundtruth_evals.ipynb)

### Add API keys
For this quickstart, you will need Open AI keys.

In [None]:
# ! pip install trulens_eval openai

In [2]:
import os
os.environ["OPENAI_API_KEY"] = "sk-..."

In [3]:
from trulens_eval import Tru

tru = Tru()

### Create Simple LLM Application

In [4]:
from openai import OpenAI
oai_client = OpenAI()

from trulens_eval.tru_custom_app import instrument

class APP:
    """Minimal LLM app: a single instrumented call to the OpenAI chat completions API."""

    @instrument
    def completion(self, prompt):
        """Ask the chat model to answer `prompt` and return the text of its reply."""
        user_message = {
            "role": "user",
            "content": f"Please answer the question: {prompt}",
        }
        response = oai_client.chat.completions.create(
            model="gpt-3.5-turbo",
            temperature=0,
            messages=[user_message],
        )
        return response.choices[0].message.content
    
llm_app = APP()

## Initialize Feedback Function(s)

In [5]:
from trulens_eval import Feedback
from trulens_eval.feedback import GroundTruthAgreement

golden_set = [
    {"query": "who invented the lightbulb?", "response": "Thomas Edison"},
    {"query": "¿quien invento la bombilla?", "response": "Thomas Edison"}
]

f_groundtruth = Feedback(GroundTruthAgreement(golden_set).agreement_measure, name = "Ground Truth").on_input_output()

✅ In Ground Truth, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Ground Truth, input response will be set to __record__.main_output or `Select.RecordOutput` .


## Instrument chain for logging with TruLens

In [6]:
# add trulens as a context manager for llm_app
from trulens_eval import TruCustomApp
tru_app = TruCustomApp(llm_app, app_id = 'LLM App v1', feedbacks = [f_groundtruth])

In [7]:
# Instrumented query engine can operate as a context manager:
with tru_app as recording:
    llm_app.completion("¿quien invento la bombilla?")
    llm_app.completion("who invented the lightbulb?")

## See results

In [8]:
tru.get_leaderboard(app_ids=[tru_app.app_id])

Unnamed: 0_level_0,Ground Truth,positive_sentiment,Human Feedack,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LLM App v1,1.0,0.38994,1.0,1.75,7.6e-05


# Clear trulens modules

This section is needed to clear the 'trulens' modules from the sys.modules dictionary.
It ensures that any 'trulens' modules are loaded properly and alleviates potential state issues across notebooks.

In [None]:
import sys

# Snapshot the matching names first so sys.modules is not resized while iterating.
stale_modules = [name for name in list(sys.modules) if name.startswith('trulens')]
for name in stale_modules:
    # Forget the cached module so the next import loads it afresh.
    sys.modules.pop(name)

# Logging Methods

## Automatic Logging

The simplest method for logging with TruLens is by wrapping with TruChain and
including the tru argument, as shown in the quickstart.

This is done like so:

In [None]:
# Imports main tools:
from trulens_eval import Feedback
from trulens_eval import Huggingface
from trulens_eval import Tru
from trulens_eval import TruChain

tru = Tru()

Tru().migrate_database()

from langchain.chains import LLMChain
from langchain_community.llms import OpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.prompts import HumanMessagePromptTemplate
from langchain.prompts import PromptTemplate

# Single-variable instruction, wrapped as the human message of a chat prompt.
instruction_template = PromptTemplate(
    template="Provide a helpful response with relevant background information for the following: {prompt}",
    input_variables=["prompt"],
)
full_prompt = HumanMessagePromptTemplate(prompt=instruction_template)
chat_prompt_template = ChatPromptTemplate.from_messages([full_prompt])

llm = OpenAI(temperature=0.9, max_tokens=128)
chain = LLMChain(llm=llm, prompt=chat_prompt_template, verbose=True)

# Wrap the chain with TruChain and pass `tru`; the chain is then called inside
# the recorder's `with` block.
truchain = TruChain(chain, app_id="Chain1_ChatApplication", tru=tru)
with truchain:
    chain("This will be automatically logged.")

Feedback functions can also be logged automatically by providing them in a list
to the feedbacks arg.

In [None]:
# Initialize Huggingface-based feedback function collection class:
hugs = Huggingface()

# Define a language match feedback function using HuggingFace.
f_lang_match = Feedback(hugs.language_match).on_input_output()
# By default this will check language match on the main app input and main app
# output.

In [None]:
truchain = TruChain(
    chain,
    app_id='Chain1_ChatApplication',
    feedbacks=[f_lang_match], # feedback functions
    tru=tru
)
with truchain:
    chain("This will be automatically logged.")

## Manual Logging

### Wrap with TruChain to instrument your chain

In [None]:
tc = TruChain(chain, app_id='Chain1_ChatApplication')

### Set up logging and instrumentation

Making the first call to your wrapped LLM Application will now also produce a log or "record" of the chain execution.


In [None]:
prompt_input = 'que hora es?'
gpt3_response, record = tc.with_record(chain.__call__, prompt_input)

We can log the records but first we need to log the chain itself.

In [None]:
tru.add_app(app=truchain)

Then we can log the record:

In [None]:
tru.add_record(record)

### Log App Feedback
Capturing app feedback such as user feedback of the responses can be added with
one call.

In [None]:
thumb_result = True
tru.add_feedback(
    name="👍 (1) or 👎 (0)", 
    record_id=record.record_id, 
    result=thumb_result
)

### Evaluate Quality

Following the request to your app, you can then evaluate LLM quality using
feedback functions. This is completed in a sequential call to minimize latency
for your application, and evaluations will also be logged to your local machine.

To get feedback on the quality of your LLM, you can use any of the provided
feedback functions or add your own.

To assess your LLM quality, you can provide the feedback functions to
`tru.run_feedback_functions()` in a list provided to `feedback_functions`.


In [None]:
feedback_results = tru.run_feedback_functions(
    record=record,
    feedback_functions=[f_lang_match]
)
for result in feedback_results:
    display(result)

After capturing feedback, you can then log it to your local database.

In [None]:
tru.add_feedbacks(feedback_results)

### Out-of-band Feedback evaluation

In the above example, the feedback function evaluation is done in the same
process as the chain evaluation. The alternative approach is to use the
provided persistent evaluator started via
`tru.start_evaluator`. Then specify the `feedback_mode` for
`TruChain` as `deferred` to let the evaluator handle the feedback functions.

For demonstration purposes, we start the evaluator here but it can be started in
another process.

In [None]:
truchain: TruChain = TruChain(
    chain,
    app_id='Chain1_ChatApplication',
    feedbacks=[f_lang_match],
    tru=tru,
    feedback_mode="deferred"
)

with truchain:
    chain("This will be logged by deferred evaluator.")

tru.start_evaluator()
# tru.stop_evaluator()

# Clear trulens modules

This section is needed to clear the 'trulens' modules from the sys.modules dictionary.
It ensures that any 'trulens' modules are loaded properly and alleviates potential state issues across notebooks.

In [None]:
import sys

# Snapshot the matching names first so sys.modules is not resized while iterating.
stale_modules = [name for name in list(sys.modules) if name.startswith('trulens')]
for name in stale_modules:
    # Forget the cached module so the next import loads it afresh.
    sys.modules.pop(name)

# 📓 Custom Feedback Functions

Feedback functions are an extensible framework for evaluating LLMs. You can add your own feedback functions to evaluate the qualities required by your application by updating `trulens_eval/feedback.py`, or simply creating a new provider class and feedback function in your notebook. If your contributions would be useful for others, we encourage you to contribute to TruLens!

Feedback functions are organized by model provider into Provider classes.

The process for adding new feedback functions is:
1. Create a new Provider class or locate an existing one that applies to your feedback function. If your feedback function does not rely on a model provider, you can create a standalone class. Add the new feedback function method to your selected class. Your new method can either take a single text (str) as a parameter or both prompt (str) and response (str). It should return a float between 0 (worst) and 1 (best).

In [None]:
from trulens_eval import Provider, Feedback, Select, Tru

class StandAlone(Provider):
    """Provider subclass hosting a custom feedback function that calls no model."""

    def custom_feedback(self, my_text_field: str) -> float:
        """
        A dummy function of text inputs to float outputs.

        Parameters:
            my_text_field (str): Text to evaluate.

        Returns:
            float: 1 / (1 + n^2), where n is the length of the text. The score
                is 1.0 for empty text and approaches 0 as the text gets longer.
        """
        text_length = len(my_text_field)
        return 1.0 / (1.0 + text_length * text_length)


2. Instantiate your provider and feedback functions. The feedback function is wrapped by the trulens-eval Feedback class which helps specify what will get sent to your function parameters (For example: Select.RecordInput or Select.RecordOutput)

In [None]:
standalone = StandAlone()
f_custom_function = Feedback(standalone.custom_feedback).on(
    my_text_field=Select.RecordOutput
)

3. Your feedback function is now ready to use just like the out of the box feedback functions. Below is an example of it being used.

In [None]:
tru = Tru()
feedback_results = tru.run_feedback_functions(
    record=record,
    feedback_functions=[f_custom_function]
)
tru.add_feedbacks(feedback_results)

## Extending existing providers.

In addition to calling your own methods, you can also extend stock feedback providers (such as `OpenAI`, `AzureOpenAI`, `Bedrock`) to custom feedback implementations. This can be especially useful for tweaking stock feedback functions, or running custom feedback function prompts while letting TruLens handle the backend LLM provider.

This is done by subclassing the provider you wish to extend, and using the `generate_score` method that runs the provided prompt with your specified provider, and extracts a float score from 0-1. Your prompt should request the LLM respond on the scale from 0 to 10, then the `generate_score` method will normalize to 0-1.

See below for example usage:

In [None]:
from trulens_eval.feedback.provider import AzureOpenAI
from trulens_eval.utils.generated import re_0_10_rating

class Custom_AzureOpenAI(AzureOpenAI):
    """AzureOpenAI provider extended with a custom professional-style feedback function."""

    def style_check_professional(self, response: str) -> float:
        """
        Custom feedback function to grade the professional style of the response, extending AzureOpenAI provider.

        Args:
            response (str): text to be graded for professional style.

        Returns:
            float: A value between 0 and 1. 0 being "not professional" and 1 being "professional".
        """
        # The prompt asks for a 0-10 rating; the accompanying notes say
        # `generate_score` normalizes the LLM's answer to the 0-1 range.
        professional_prompt = (
            "Please rate the professionalism of the following text on a scale from 0 to 10, "
            "where 0 is not at all professional and 10 is extremely professional: \n\n"
            f"{response}"
        )
        return self.generate_score(system_prompt=professional_prompt)

Running "chain of thought evaluations" is another use case for extending providers. Doing so follows a similar process as above, where the base provider (such as `AzureOpenAI`) is subclassed.

For this case, the method `generate_score_and_reasons` can be used to extract both the score and chain of thought reasons from the LLM response.

To use this method, the prompt used should include the `COT_REASONS_TEMPLATE` available from the TruLens prompts library (`trulens_eval.feedback.prompts`).

See below for example usage:

In [None]:
from typing import Tuple, Dict
from trulens_eval.feedback import prompts

# NOTE: this re-declares `Custom_AzureOpenAI`; once this cell runs, the name no
# longer refers to the class from the previous example (`style_check_professional`).
class Custom_AzureOpenAI(AzureOpenAI):
    def context_relevance_with_cot_reasons_extreme(self, question: str, context: str) -> Tuple[float, Dict]:
        """
        Tweaked version of context relevance, extending AzureOpenAI provider.
        A function that completes a template to check the relevance of the statement to the question.
        Scoring guidelines for scores 5-8 are removed to push the LLM to more extreme scores.
        Also uses chain of thought methodology and emits the reasons.

        Args:
            question (str): A question being asked.
            context (str): A statement to the question.

        Returns:
            Tuple[float, Dict]: The score, a value between 0 and 1 (0 being "not relevant"
                and 1 being "relevant"), together with the chain of thought reasons.
        """

        # remove scoring guidelines around middle scores
        system_prompt = prompts.CONTEXT_RELEVANCE_SYSTEM.replace(
            "- STATEMENT that is RELEVANT to most of the QUESTION should get a score of 5, 6, 7 or 8. Higher score indicates more RELEVANCE.\n\n",
            "",
        )

        user_prompt = prompts.CONTEXT_RELEVANCE_USER.format(question=question, context=context)
        # Replace the plain "RELEVANCE:" cue with the chain-of-thought template.
        user_prompt = user_prompt.replace("RELEVANCE:", prompts.COT_REASONS_TEMPLATE)

        return self.generate_score_and_reasons(system_prompt, user_prompt)

## Multi-Output Feedback functions
TruLens also supports multi-output feedback functions. Whereas a typical feedback function outputs a float between 0 and 1, a multi-output feedback function should output a dictionary mapping each `output_key` to a float between 0 and 1. The feedbacks table will display the feedback with column `feedback_name:::outputkey`

In [None]:
multi_output_feedback = Feedback(lambda input_param: {'output_key1': 0.1, 'output_key2': 0.9}, name="multi").on(
    input_param=Select.RecordOutput
)
feedback_results = tru.run_feedback_functions(
    record=record,
    feedback_functions=[multi_output_feedback]
)
tru.add_feedbacks(feedback_results)

In [None]:
# Aggregators will run on the same dict keys.
import numpy as np
multi_output_feedback = Feedback(lambda input_param: {'output_key1': 0.1, 'output_key2': 0.9}, name="multi-agg").on(
    input_param=Select.RecordOutput
).aggregate(np.mean)
feedback_results = tru.run_feedback_functions(
    record=record,
    feedback_functions=[multi_output_feedback]
)
tru.add_feedbacks(feedback_results)


In [None]:
# For multi-context chunking, an aggregator can operate on a list of multi output dictionaries.
def dict_aggregator(list_dict_input):
    """
    Add up the 'output_key1' value of every dictionary in `list_dict_input`.

    Args:
        list_dict_input: iterable of dictionaries, each with an 'output_key1' entry.

    Returns:
        The sum of the 'output_key1' values; 0 when the input is empty.
    """
    total = 0
    for result_dict in list_dict_input:
        total = total + result_dict['output_key1']
    return total
multi_output_feedback = Feedback(lambda input_param: {'output_key1': 0.1, 'output_key2': 0.9}, name="multi-agg-dict").on(
    input_param=Select.RecordOutput
).aggregate(dict_aggregator)
feedback_results = tru.run_feedback_functions(
    record=record,
    feedback_functions=[multi_output_feedback]
)
tru.add_feedbacks(feedback_results)
