## Eval existing runs

This notebook demonstrates how to use TruLens to evaluate an app from logs of its results, without having the app itself available to run.

In [None]:
# ! pip install trulens_eval==0.20.3

In [None]:
%load_ext autoreload
%autoreload 2
from pathlib import Path
import sys

base = Path().cwd()
while not (base / "trulens_eval").exists():
    base = base.parent

print(base)

# If running from github repo, can use this:
sys.path.append(str(base))

from trulens_eval.keys import check_keys

check_keys(
    "OPENAI_API_KEY",
)

from trulens_eval import Tru
tru = Tru()
tru.reset_database()

tru.run_dashboard(_dev=base, force=True)

In [None]:
# VirtualApp setup. Any information you would like to keep about the app can be
# stored by passing a dictionary to TruVirtual (later): an index of components
# or versions, or anything else. These values can be referred to when
# evaluating feedback.

app = {
    "retriever": {
        "configkey1": "anything else you want to store about the app or its components",
    },
    "some_other_component": "can be put into the app dictionary",
}

In [None]:
# Data. Records can be added to the database either by building a `Record`
# directly or through the `VirtualRecord` helper, which constructs records for
# virtual models. `VirtualRecord` takes the same arguments as `Record`, except
# that calls are given as a mapping from selector to the call's args/rets.
# Two records are created below; both carry the inputs and outputs of a
# context retrieval component. Anything you do not wish to track or evaluate
# can simply be left out. The selectors name methods that the feedback
# functions defined later can select on.

from trulens_eval.schema import Record
from trulens_eval.schema import Select
from trulens_eval.tru_virtual import VirtualRecord

# First record: the retrieved context and the answer are both about Germany.
rec1 = VirtualRecord(
    main_input="Where is Germany?",
    main_output="Germany is in Europe",
    calls={
        Select.RecordCalls.retriever.get_context: {
            "args": ["Where is Germany?"],
            "rets": ["Germany is a country located in Europe."],
        },
        Select.RecordCalls.some_other_component.do_something: {
            "args": ["Some other inputs."],
            "rets": ["Some other output."],
        },
    },
)

# Second record: same question, but the context and the answer are about
# Poland.
rec2 = VirtualRecord(
    main_input="Where is Germany?",
    main_output="Poland is in Europe",
    calls={
        Select.RecordCalls.retriever.get_context: {
            "args": ["Where is Germany?"],
            "rets": ["Poland is a country located in Europe."],
        },
    },
)

data = [rec1, rec2]

In [None]:
# Feedback functions. These are the same as in the langchain quickstart; only
# the selector used to locate the context differs.

import numpy as np

from trulens_eval.app import App
from trulens_eval.feedback import Groundedness
from trulens_eval.feedback.feedback import Feedback
from trulens_eval.feedback.provider import OpenAI

# Provider instance backing the relevance feedback functions below.
openai = OpenAI()

# Where the context lives is app specific. Here it is every item in the return
# values of the `retriever.get_context` call.
context = Select.RecordCalls.retriever.get_context.rets[:]

# Groundedness of the output with respect to the retrieved context.
# NOTE(review): this builds a second OpenAI provider instead of reusing
# `openai`; presumably either would work -- confirm before changing.
grounded = Groundedness(groundedness_provider=OpenAI())
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons)
    .on(context.collect())  # collect context chunks into a list
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

# Question/answer relevance between the overall question and answer.
f_qa_relevance = Feedback(openai.relevance).on_input_output()

# Question/statement relevance between the question and each context chunk,
# combined across chunks with the mean.
f_context_relevance = (
    Feedback(openai.qs_relevance)
    .on_input()
    .on(context)
    .aggregate(np.mean)
)

In [None]:
# Build the virtual recorder from the app dictionary and the three feedback
# functions, then add each record in `data` to it.

from trulens_eval.tru_virtual import TruVirtual

virtual_recorder = TruVirtual(
    app=app,
    feedbacks=[
        f_groundedness,
        f_qa_relevance,
        f_context_relevance,
    ],
)

for rec in data:
    virtual_recorder.add_record(rec)