In [None]:
%load_ext autoreload
%autoreload 2
from pathlib import Path
import sys

# Make the directory three levels above the working directory importable
# (presumably the repository root when the notebook runs from its usual
# location -- confirm if the notebook is moved).
repo_root = str(Path.cwd().parent.parent.parent.resolve())
if repo_root not in sys.path:
    # Guarded so that re-running this cell does not add duplicate entries.
    sys.path.append(repo_root)

from pprint import PrettyPrinter
pp = PrettyPrinter()

# Uncomment to get more debugging printouts.
# (Kept as `#` comments rather than a bare triple-quoted string: a string
# literal that is the last expression of a cell is echoed as the cell output.)
#
# import logging
#
# root = logging.getLogger()
# root.setLevel(logging.DEBUG)
#
# handler = logging.StreamHandler(sys.stdout)
# handler.setLevel(logging.DEBUG)
# formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# handler.setFormatter(formatter)
# root.addHandler(handler)

In [None]:
from trulens_eval.keys import *

In [None]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader

# Load documents from the local `data` directory (relative to the working
# directory) and build a vector store index from them.
documents = SimpleDirectoryReader('data').load_data()
index = VectorStoreIndex.from_documents(documents)

# Query engine over the index; this is the app that gets wrapped and evaluated
# in the cells below.
query_engine = index.as_query_engine()
# Optional manual check of the engine (uncomment to run):
# response = query_engine.query("What did the author do growing up?")
# print(response)

In [None]:
# numpy supplies the aggregation function (np.min) applied to feedback results below.
import numpy as np

from trulens_eval import feedback, Feedback, Tru, TruLlama

In [None]:
# Build the feedback functions used to score each app record.

# Objects whose methods implement the individual feedback functions.
# NOTE(review): `openai` here is a trulens_eval feedback object, not the
# `openai` package; the name would shadow that package if it were imported.
hugs = feedback.Huggingface()
openai = feedback.OpenAI()

# Language match between question and answer. `on_input_output()` evaluates
# the feedback on the main app input and the main app output.
f_lang_match = Feedback(hugs.language_match).on_input_output()

# Relevance of the overall answer to the overall question; uses the same
# input/output pair as the language-match feedback.
f_qa_relevance = Feedback(openai.relevance).on_input_output()

# Question/statement relevance between the question and each context chunk.
# The first feedback argument is the main app input; the second is the text of
# the context chunks retrieved from the main app output (the output of
# `query`). Per-chunk results are combined with `np.min`.
f_qs_relevance = (
    Feedback(openai.qs_relevance)
    .on_input()
    .on(TruLlama.select_source_nodes().node.text)
    .aggregate(np.min)
)

feedbacks = [f_lang_match, f_qa_relevance, f_qs_relevance]

In [None]:
# Display the selectors configured on the language-match feedback.
f_lang_match.selectors

In [None]:
# Wrap the query engine in a TruLlama app and attach the feedback functions
# defined above. (`l` is referenced by later cells, so the name is kept.)
l = TruLlama(app=query_engine, feedbacks=feedbacks)

In [None]:
# Show which components of the llama index app have been instrumented; these
# are the ones that will be tracked as components in the dashboard.

l.print_instrumented()

In [None]:
# Run a single query through the wrapped app, keeping both the response and
# the accompanying record object (used by the feedback evaluation below).
res, record = l.query_with_record("Who is Shayak?")

In [None]:
# Uncomment to inspect the contents of the record:
# record.dict()

In [None]:
# Launch the dashboard from this notebook. `_dev` receives the directory three
# levels above the working directory (presumably the repository root, for
# running against the development sources -- confirm). `force=True` presumably
# replaces a dashboard that is already running -- confirm.
proc = Tru().start_dashboard(
    force=True,
    _dev=Path.cwd().parent.parent.parent,
)

# With deferred feedback evaluation, the evaluator has to be started as well:
# thread = Tru().start_evaluator(restart=True)

In [None]:
# Run the question/statement relevance feedback directly on the record
# captured above, using the wrapped app `l`.
f = f_qs_relevance.run(record=record, app=l)

In [None]:
# Display the feedback result via its `dict()` method.
f.dict()