In [1]:
%load_ext autoreload
%autoreload 2
from pathlib import Path
import sys

# Put the directory three levels above the working directory on the import
# path (presumably the repository root, so the in-tree package is importable
# -- confirm against the checkout layout). The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry each time.
repo_root = str(Path.cwd().parent.parent.parent.resolve())
if repo_root not in sys.path:
    sys.path.append(repo_root)

from pprint import PrettyPrinter
pp = PrettyPrinter()

# Flip to True to get more debugging printouts.
# This is an explicit flag rather than a triple-quoted string: a bare string
# literal as the last expression of a cell is echoed back as the cell's output.
DEBUG_LOGGING = False

if DEBUG_LOGGING:
    import logging

    root = logging.getLogger()
    root.setLevel(logging.DEBUG)

    # Only attach a stdout handler once, so re-running this cell does not make
    # every log record print multiple times.
    already_attached = any(
        isinstance(h, logging.StreamHandler) and h.stream is sys.stdout
        for h in root.handlers
    )
    if not already_attached:
        handler = logging.StreamHandler(sys.stdout)
        handler.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        root.addHandler(handler)

In [2]:
# Imported for its side effects: per the recorded output below, this locates a
# .env file and reports which API keys are set.
# NOTE(review): the wildcard import also pulls every public name from
# trulens_eval.keys into the notebook namespace -- confirm whether any are needed.
from trulens_eval.keys import *

2023-06-20 19:21:06,120 - numexpr.utils - INFO - Note: NumExpr detected 10 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2023-06-20 19:21:06,134 - numexpr.utils - INFO - NumExpr defaulting to 8 threads.
Using /Users/piotrm/Dropbox/repos/github/trulens/trulens_eval/examples/.env
KEY SET: OPENAI_API_KEY
KEY SET: PINECONE_API_KEY
KEY SET: PINECONE_ENV
KEY SET: HUGGINGFACE_API_KEY
KEY SET: SLACK_TOKEN
KEY SET: SLACK_SIGNING_SECRET
KEY SET: COHERE_API_KEY


In [3]:
# numpy provides the aggregation function (np.min) that is applied to the
# per-chunk relevance scores when the feedback functions are constructed.
import numpy as np

from trulens_eval import feedback, Feedback, Tru, TruLlama

In [4]:
# Build the feedback functions that will be attached to the app.

# Providers that back the individual feedback functions.
hugs = feedback.Huggingface()
openai = feedback.OpenAI()

# Language match between question and answer. `on_input_output()` evaluates
# the feedback on the main app input and the main app output.
f_lang_match = Feedback(hugs.language_match).on_input_output()

# Relevance of the overall answer to the overall question; same default
# inputs as the language-match feedback.
f_qa_relevance = Feedback(openai.relevance).on_input_output()

# Relevance of each retrieved context chunk to the question. The first feedback
# argument is the main app input; the second is the text of the context chunks
# retrieved from the main app output (the output of `query`). The per-chunk
# results are aggregated with np.min.
f_qs_relevance = (
    Feedback(openai.qs_relevance)
    .on_input()
    .on(TruLlama.select_source_nodes().node.text)
    .aggregate(np.min)
)

feedbacks = [f_lang_match, f_qa_relevance, f_qs_relevance]

2023-06-20 19:21:08,373 - trulens_eval.util - DEBUG - *** Creating new HuggingfaceEndpoint singleton instance for name = huggingface ***
2023-06-20 19:21:08,374 - trulens_eval.provider_apis - DEBUG - *** Creating huggingface endpoint ***
2023-06-20 19:21:08,375 - trulens_eval.provider_apis - DEBUG - Instrumenting requests.post for huggingface
2023-06-20 19:21:08,375 - trulens_eval.provider_apis - DEBUG - Instrumenting post for huggingface .
2023-06-20 19:21:08,375 - trulens_eval.util - DEBUG - *** Creating new OpenAIEndpoint singleton instance for name = openai ***
2023-06-20 19:21:08,376 - trulens_eval.provider_apis - DEBUG - *** Creating openai endpoint ***
2023-06-20 19:21:08,376 - trulens_eval.provider_apis - DEBUG - Instrumenting openai.*.create for openai
2023-06-20 19:21:08,376 - trulens_eval.provider_apis - DEBUG - Instrumenting ChatCompletion.create for openai
2023-06-20 19:21:08,376 - trulens_eval.provider_apis - DEBUG - Instrumenting create for openai .
2023-06-20 19:21:08,3

In [5]:
# Call the language-match feedback directly on a German question paired with an
# English answer. The recorded result below is close to 0 (~0.008).
hugs.language_match("Wie gehts?", "How are you?")

2023-06-20 19:21:11,286 - trulens_eval.util - DEBUG - *** Creating new TP singleton instance for name = None ***
2023-06-20 19:21:11,636 - trulens_eval.provider_apis - DEBUG - Calling wrapped post for huggingface.
2023-06-20 19:21:11,643 - trulens_eval.provider_apis - DEBUG - Calling wrapped post for huggingface.
2023-06-20 19:21:11,702 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): api-inference.huggingface.co:443
2023-06-20 19:21:11,702 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): api-inference.huggingface.co:443
2023-06-20 19:21:12,058 - urllib3.connectionpool - DEBUG - https://api-inference.huggingface.co:443 "POST /models/papluca/xlm-roberta-base-language-detection HTTP/1.1" 200 905
2023-06-20 19:21:12,060 - trulens_eval.util - DEBUG - <code object wrapper at 0x160754920, file "/Users/piotrm/Dropbox/repos/github/trulens/trulens_eval/trulens_eval/provider_apis.py", line 294>
2023-06-20 19:21:12,060 - trulens_eval.util - DEBUG - <code 

0.00838542406927445

In [6]:
# Usage tally accumulated on the Huggingface endpoint's global callback.
# The recorded output below shows 2 requests, both successful, at zero cost.
hugs.endpoint.global_callback.cost

Cost(n_requests=2, n_successful_requests=2, n_classes=40, n_tokens=0, n_prompt_tokens=0, n_completion_tokens=0, cost=0.0)

In [7]:
# Call the question/statement relevance feedback directly on a single pair.
# NOTE(review): the logged prompt asks the model for a 1-10 rating while the
# recorded result is 1.0 -- presumably rescaled to [0, 1]; confirm.
openai.qs_relevance("Who is Piotr?", "Piotr is a person.")

2023-06-20 19:21:30,702 - trulens_eval.provider_apis - DEBUG - Calling wrapped create for openai.
2023-06-20 19:21:30,703 - openai - DEBUG - message='Request to OpenAI API' method=post path=https://api.openai.com/v1/chat/completions
2023-06-20 19:21:30,703 - openai - DEBUG - api_version=None data='{"model": "gpt-3.5-turbo", "temperature": 0.0, "messages": [{"role": "system", "content": "You are a RELEVANCE classifier; providing the relevance of the given STATEMENT to the given QUESTION.\\nRespond only as a number from 1 to 10 where 1 is the least relevant and 10 is the most relevant.\\nNever elaborate.\\n\\nQUESTION: Who is Piotr?\\n\\nSTATEMENT: Piotr is a person.\\n\\nRELEVANCE: "}]}' message='Post details'
2023-06-20 19:21:30,704 - urllib3.util.retry - DEBUG - Converted retries value: 2 -> Retry(total=2, connect=None, read=None, redirect=None, status=None)
2023-06-20 19:21:30,736 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): api.openai.com:443
2023-06-20 19:2

1.0

In [8]:
# Inspect the OpenAI endpoint's global callback. The recorded repr below
# reports request counts, token counts and the total cost in USD.
openai.endpoint.global_callback

OpenAICallback(cost=Cost(n_requests=1, n_successful_requests=1, n_classes=0, n_tokens=82, n_prompt_tokens=81, n_completion_tokens=1, cost=0.000164), langchain_handler=Tokens Used: 82
	Prompt Tokens: 81
	Completion Tokens: 1
Successful Requests: 1
Total Cost (USD): $0.000164)

In [None]:
from llama_index import SimpleDirectoryReader, VectorStoreIndex

# Load the documents found under the relative 'data' directory, build a vector
# store index from them, and expose that index as a query engine.
reader = SimpleDirectoryReader('data')
documents = reader.load_data()
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

# Optional smoke test of the bare (unwrapped) query engine:
# response = query_engine.query("What did the author do growing up?")
# print(response)

In [None]:
# Inspect the selectors attached to the language-match feedback, which was
# built with .on_input_output() in an earlier cell.
f_lang_match.selectors

In [None]:
# Wrap the query engine with TruLlama, attaching the feedback functions defined
# earlier. The name `l` is referenced by later cells (print_instrumented,
# query_with_record, feedback run), so keep those in sync if it is renamed.
l = TruLlama(app=query_engine, feedbacks=feedbacks)

In [None]:
# Print which components of the llama index app have been instrumented; these
# are the ones that will be tracked as components in the dashboard.

l.print_instrumented()

In [None]:
# Run a single query through the wrapped app, keeping both the response (`res`)
# and the record (`record`); the record is passed to a feedback run in a later cell.
res, record = l.query_with_record("Who is Shayak?")

In [None]:
# Start the dashboard here. `_dev` is given the directory three levels above the
# working directory, the same location the first cell adds to sys.path.
# NOTE(review): `force=True` presumably restarts an already-running dashboard -- confirm.
proc = Tru().start_dashboard(force=True, _dev=Path.cwd().parent.parent.parent)

# If using deferred feedback evaluation, the evaluator needs to be started too:
# thread = Tru().start_evaluator(restart=True)

In [None]:
# Run the question/statement relevance feedback on the record captured from the
# query above, passing the wrapped app alongside it.
f = f_qs_relevance.run(record=record, app=l)

In [None]:
# Display the feedback result via its `.dict()` form for inspection.
f.dict()