# TruBot

Example setup and monitoring of a conversational bot whose context is drawn
from the TruEra website. This example needs a vector db holding the contexts
used to answer questions: either a Pinecone vector db that you set up yourself,
or the local database provided here for use with hnswlib. To use hnswlib, some
additional requirements need to be installed with pip. Regardless of the vector
db provider, the example feedback functions here use the OpenAI and Hugging Face
free inference APIs and need their respective keys to be provided in a .env file.

## HNSWLIB additional requirements

Run the following in your shell or the equivalent in the following cell to
install additional requirements for use with HNSWLIB. This is not required if
you are running this example with a pinecone db.

```bash
pip install docarray hnswlib
```

In [None]:
# !pip install docarray hnswlib

In [None]:
# !pip install -U pydantic

In [None]:
%load_ext autoreload
%autoreload 2
from pathlib import Path
import sys

# If running from the github repo, make the directory four levels above the
# current working directory importable:
sys.path.append(str(Path().cwd().parent.parent.parent.parent.resolve()))

# Uncomment for more debugging printouts. The snippet below is wrapped in a
# string literal so it does not execute; remove the surrounding triple quotes
# to enable it. When enabled it sets the root logger to DEBUG and attaches a
# stdout handler with a timestamped format.
"""
import logging
root = logging.getLogger()
root.setLevel(logging.DEBUG)

handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
root.addHandler(handler)
"""
# NOTE(review): presumably this trailing `None` is here so the notebook cell
# does not echo the string literal above as its output -- confirm.
None

## API keys setup

In [None]:
from trulens.core.utils.keys import check_keys

# The feedback functions in this notebook use OpenAI and Hugging Face, so both
# keys are checked up front.
# NOTE(review): presumably `check_keys` fails if a key is missing from the
# environment / .env file -- confirm against trulens.core.utils.keys.
check_keys("OPENAI_API_KEY", "HUGGINGFACE_API_KEY")

In [None]:
import os
from pprint import PrettyPrinter

# Imports from LangChain to build app:
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.memory import ConversationSummaryBufferMemory
import numpy as np
from trulens.apps.langchain import WithFeedbackFilterDocuments

# Imports main tools:
from trulens.core import Feedback
from trulens.core import FeedbackMode
from trulens.core import Select
from trulens.core import TruSession
from trulens.core.utils.threading import TP
from trulens.dashboard import run_dashboard

pp = PrettyPrinter()

# Tru object manages the database of apps, records, and feedbacks; and the
# dashboard to display these.
tru = TruSession()

# Start the dashboard. If you are running from the github repo, you will need
# to adjust the path the dashboard streamlit app starts in by providing the
# _dev argument (here: four directory levels above the working directory).
run_dashboard(
    tru, force=True, _dev=Path().cwd().parent.parent.parent.parent.resolve()
)

# If needed, you can reset the trulens dashboard database by running the
# below line:

# tru.reset_database()

In [None]:
from trulens.providers.huggingface import Huggingface
from trulens.providers.openai import OpenAI

# Choose the vector db backend. "pinecone" needs a Pinecone database that you
# have set up beforehand; "hnsw" uses the local database included with trulens.
# To switch to the local backend use:
# db_host = "hnsw"
db_host = "pinecone"

model_name = "gpt-3.5-turbo"
app_name = "TruBot"

# Embedding used to query the vector db.
embedding = OpenAIEmbeddings(model="text-embedding-ada-002")  # 1536 dims

# Reject unsupported backends before doing any backend-specific setup.
if db_host not in ("pinecone", "hnsw"):
    raise RuntimeError("Unhandled db_host, select either 'pinecone' or 'hnsw'.")

if db_host == "hnsw":
    # Local pinecone alternative. Requires the precomputed 'hnswlib_trubot'
    # folder used as `work_dir` below.

    from langchain.vectorstores import DocArrayHnswSearch

    def get_doc_search():
        """Return a document search object backed by the local hnswlib db.

        The object has to be created in the thread in which it is used, hence
        the construction is deferred to call time via this function.
        """
        return DocArrayHnswSearch.from_params(
            embedding=embedding,
            work_dir="hnswlib_trubot",
            n_dim=1536,
            max_elements=1024,
        )

else:
    # Pinecone backend: the API key and environment come from the process
    # environment.
    check_keys("PINECONE_API_KEY", "PINECONE_ENV")

    from langchain_community.vectorstores import Pinecone
    import pinecone

    pinecone.init(
        api_key=os.environ.get("PINECONE_API_KEY"),  # find at app.pinecone.io
        environment=os.environ.get(
            "PINECONE_ENV"
        ),  # next to api key in console
    )

    # The index must exist under the name 'llmdemo'; change the name below if
    # yours differs.

    def get_doc_search():
        """Return a document search object over the existing Pinecone index."""
        return Pinecone.from_existing_index(
            index_name="llmdemo", embedding=embedding
        )

# LLM for completing prompts, and other tasks.
#
# The conversation memory and the retrieval chains built later in this
# notebook take this object as their `llm`, so it has to be a LangChain
# language model. The bare name `OpenAI` in this cell is bound to the trulens
# feedback provider (`trulens.providers.openai.OpenAI`), which is not a
# LangChain LLM, so the LangChain class is imported here under its own alias.
from langchain_community.llms import OpenAI as LangChainOpenAI

llm = LangChainOpenAI(temperature=0, max_tokens=256)

# Construct feedback functions.

# API endpoints for models used in feedback functions:

hugs = Huggingface()
openai = OpenAI()

# Language match between question/answer.
f_lang_match = Feedback(hugs.language_match).on_input_output()
# By default this will evaluate feedback on main app input and main app output.

# Question/answer relevance between overall question and answer.
f_qa_relevance = Feedback(openai.relevance).on_input_output()
# By default this will evaluate feedback on main app input and main app output.

# Question/statement relevance between question and each context chunk. The
# per-chunk results are aggregated with `np.min`.
f_context_relevance = (
    Feedback(openai.context_relevance)
    .on_input()
    .on(
        Select.Record.app.combine_docs_chain._call.args.inputs.input_documents[
            :
        ].page_content
    )
    .aggregate(np.min)
)
# First feedback argument is set to main app input, and the second is taken from
# the context sources as passed to an internal `combine_docs_chain._call`.

# The full set of feedback functions attached to every app version below.
all_feedbacks = [f_lang_match, f_qa_relevance, f_context_relevance]

# TruBot Version 1

In [None]:
from trulens.apps.langchain import TruChain


def v1_new_conversation(feedback_mode=FeedbackMode.WITH_APP):
    """
    Build version "v1" of the question-answering _LangChain_ app, starting a
    new conversation with empty memory, and wrap it in a TruLens recorder.

    `feedback_mode` selects when feedback is evaluated:

    - FeedbackMode.WITH_APP -- calls to the app do not return until feedback
      has been evaluated.

    - FeedbackMode.WITH_APP_THREAD -- calls to the app return right away and
      feedback is evaluated in a new thread.

    - FeedbackMode.DEFERRED -- calls to the app return and a separate runner
      thread (see usage later in this notebook) evaluates feedback.

    Returns the pair `(app, tc)`: the chain and the `TruChain` wrapping it.
    """

    # Every new conversation starts with empty memory.
    chat_memory = ConversationSummaryBufferMemory(
        llm=llm,
        max_token_limit=650,
        output_key="answer",
        memory_key="chat_history",
    )

    # Retriever over the configured vector db supplies the contexts.
    context_retriever = get_doc_search().as_retriever()

    # The conversational chain ties llm, retriever and memory together.
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        memory=chat_memory,
        retriever=context_retriever,
        return_source_documents=True,
        max_tokens_limit=4096,
        get_chat_history=lambda history: history,
    )

    # TruLens instrumentation of the chain.
    recorder = TruChain(
        app=chain,
        app_name=app_name,
        app_version="v1",
        feedback_mode=feedback_mode,
        feedbacks=all_feedbacks,
    )

    return chain, recorder

In [None]:
# Instantiate the app with fresh memory:

import traceback

try:
    app1, tc1 = v1_new_conversation()
except Exception:
    # Show the full traceback, then stop the cell. Continuing past a failed
    # construction would only produce an unrelated NameError on `tc1` below
    # and hide the real cause.
    print(traceback.format_exc())
    raise

# Call the app:

res, record = tc1.with_record(app1, "Who is Shayak?")
res

# Notice the `source_documents` returned include chunks about Shameek and the
# answer includes bits about Shameek as a result.

In [None]:
# The feedback should already be present in the dashboard, but we can check the
# context_relevance here manually as well by running the feedback function on
# the record produced by the previous call:
feedback = f_context_relevance.run(record=record, app=tc1)
feedback.model_dump()

In [None]:
# Now a question about QII (quantitative input influence, a base technology
# employed in TruEra's products), but asked in a non-English language:

# Start a new conversation as the app keeps prior questions in its memory which
# may cause you some testing woes.
app1, tc1 = v1_new_conversation()

# res, record = tc1.with_record(app1, "Co jest QII?") # Polish
res, record = tc1.with_record(app1, "Was ist QII?")  # German
res

# Note here the response is in English. This example sometimes matches language
# so other variants may need to be tested.

In [None]:
# Language match failure can be seen using the f_lang_match (and is visible in
# dashboard). This runs it on the record of the German question asked above:
feedback = f_lang_match.run(record=record, app=tc1)
feedback.model_dump()

# TruBot Version 2 - Language match fix

In [None]:
def v2_new_conversation(feedback_mode=FeedbackMode.WITH_APP):
    """
    Build version "v2" of the question-answering _LangChain_ app for a new
    conversation and wrap it in a TruLens recorder.

    The chain is assembled as in version 1, then its answering prompt is
    replaced by one that asks for the answer in the language of the question.

    Returns the pair `(app, tc)`: the chain and the `TruChain` wrapping it.
    """

    # Every new conversation starts with empty memory.
    chat_memory = ConversationSummaryBufferMemory(
        llm=llm,
        max_token_limit=650,
        output_key="answer",
        memory_key="chat_history",
    )

    # Retriever over the configured vector db supplies the contexts.
    context_retriever = get_doc_search().as_retriever()

    # The conversational chain ties llm, retriever and memory together.
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        memory=chat_memory,
        retriever=context_retriever,
        return_source_documents=True,
        max_tokens_limit=4096,
        get_chat_history=lambda history: history,
    )

    ### DIFFERENCES START HERE

    combine_chain = chain.combine_docs_chain
    answer_chain = combine_chain.llm_chain

    # Work on private copies of the prompts; otherwise various apps will
    # feature templates that point to the same objects.
    answer_chain.prompt = answer_chain.prompt.copy()
    combine_chain.document_prompt = combine_chain.document_prompt.copy()

    # Language mismatch fix via a prompt adjustment:
    answer_chain.prompt.template = (
        "Use the following pieces of context to answer the question at the end "
        "in the same language as the question. If you don't know the answer, "
        "just say that you don't know, don't try to make up an answer.\n\n"
        "{context}\n\n"
        "Question: {question}\n"
        "Helpful Answer: "
    )

    ### END OF DIFFERENCES

    # TruLens instrumentation of the chain.
    recorder = TruChain(
        app=chain,
        app_name=app_name,
        app_version="v2",
        feedback_mode=feedback_mode,
        feedbacks=all_feedbacks,
    )

    return chain, recorder

In [None]:
# Instantiate the version 2 app:

app2, tc2 = v2_new_conversation()

# Now the non-English question again:

res, record = tc2.with_record(app2, "Was ist QII?")
res

# Note that the response is now in the appropriate language.

In [None]:
# And the language match feedback is happy. This runs it on the record of the
# version 2 call above:

feedback = f_lang_match.run(record=record, app=tc2)
feedback.model_dump()

# TruBot Version 3: Context Filtering with Relevance

In [None]:
def v3_new_conversation(feedback_mode=FeedbackMode.WITH_APP):
    """
    Build version "v3" of the question-answering _LangChain_ app for a new
    conversation and wrap it in a TruLens recorder.

    Compared to version 1, the retriever is wrapped so that retrieved contexts
    are filtered with `f_context_relevance` before the chain uses them.

    Returns the pair `(app, tc)`: the chain and the `TruChain` wrapping it.
    """

    # Every new conversation starts with empty memory.
    chat_memory = ConversationSummaryBufferMemory(
        llm=llm,
        max_token_limit=650,
        output_key="answer",
        memory_key="chat_history",
    )

    # Plain retriever over the configured vector db.
    base_retriever = get_doc_search().as_retriever()

    ### DIFFERENCES START HERE

    # Wrap the retriever so that returned contexts are first filtered using
    # f_context_relevance with a minimum relevance threshold (of 0.5):
    filtering_retriever = WithFeedbackFilterDocuments.of_retriever(
        retriever=base_retriever,
        feedback=f_context_relevance,
        threshold=0.5,
    )

    ### END OF DIFFERENCES

    # The conversational chain ties llm, filtered retriever and memory
    # together.
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        memory=chat_memory,
        retriever=filtering_retriever,
        return_source_documents=True,
        max_tokens_limit=4096,
        get_chat_history=lambda history: history,
    )

    # TruLens instrumentation of the chain.
    recorder = TruChain(
        app=chain,
        app_name=app_name,
        app_version="v3",
        feedback_mode=feedback_mode,
        feedbacks=all_feedbacks,
    )

    return chain, recorder

In [None]:
# Instantiate the version 3 app:

app3, tc3 = v3_new_conversation()

# Call the app with the same question that was asked of version 1:

res, record = tc3.with_record(app3, "Who is Shayak?")
res

# Notice the `source_documents` returned now does not include the low-relevance
# chunks and the answer likewise does not reference them.

# TruBot Version 4: Lang match fix and context filter

This is left as an exercise to the reader. 

In [None]:
def v4_new_conversation(feedback_mode=FeedbackMode.WITH_APP):
    """
    Create a _LangChain_ app for a new conversation with a question-answering bot.

    Exercise template for version "v4": the construction of `app` is left as a
    placeholder to be filled in. As written, `app` is the `...` (Ellipsis)
    object, so the function is not usable until the marked section is
    completed.

    Returns the pair `(app, tc)`: the app and the `TruChain` wrapping it.
    """

    ### TO FILL IN HERE ###
    # Placeholder: replace with the construction of the chain.
    app = ...
    ### END OF TO FILL IN ###

    # Trulens instrumentation.
    tc = TruChain(
        app_name=app_name,
        app_version="v4",
        app=app,
        feedbacks=all_feedbacks,
        feedback_mode=feedback_mode,
    )

    return app, tc

# Test conversations

Let's try out the 3 (or 4) TruBot versions on a collection of test instances
about Shayak and some technical terms in several languages.

In [None]:
# Conversation factories, one per app version.
apps = [
    v1_new_conversation,
    v2_new_conversation,
    v3_new_conversation,
    # v4_new_conversation # include this if you completed the exercise
]

# The same two questions (about Shayak and about QII), each asked in four
# languages.
questions = [
    "Who is Shayak?",
    "Wer ist Shayak?",
    "Kim jest Shayak?",
    "¿Quién es Shayak?",
    "What is QII?",
    "Was ist QII?",
    "Co jest QII?",
    "¿Que es QII?",
]

# Comment out the next two lines to try all of the version and question
# combinations. Otherwise only the first 2 app versions and the first 2
# questions are selected to start with.
apps = apps[0:2]
questions = questions[0:2]


def test_app_on_question(new_convo, question):
    """
    Ask `question` in a brand new conversation created by `new_convo`.

    `new_convo` is one of the `v*_new_conversation` factories; it is called
    with deferred feedback evaluation. The factory name and the question are
    printed first, and the app's answer is returned.
    """
    print(new_convo.__name__, question)
    chain, recorder = new_convo(feedback_mode=FeedbackMode.DEFERRED)
    return recorder.with_(chain, question)


# This asks all of the questions in parallel: one task is submitted per
# (app version, question) pair. The objects returned by `submit` are not kept,
# so the answers are not collected here.
# NOTE(review): `TP` is presumably trulens' thread pool -- confirm.
for new_convo in apps:
    for question in questions:
        TP().submit(
            test_app_on_question, new_convo=new_convo, question=question
        )

In [None]:
# For deferred feedback evaluation. Start this. The test conversations above
# are created with FeedbackMode.DEFERRED, so their feedback is left for this
# evaluator to run:

TruSession().start_evaluator(restart=True)