# Iterating on LLM Apps with TruLens

1. Start with basic RAG.
2. Show failures of RAG Triad.
3. Address failures with context filtering, advanced RAG (e.g., sentence windows, auto-retrieval)
4. Showcase experiment tracking to choose best app configuration. 
5. Weave different types of evals into the narrative.
6. Weave user/customer stories into the narrative.

In [None]:
# Set your API keys by replacing the "..." placeholders below.
# If the keys are already set as environment variables, skip this cell:
# running it as-is overwrites them with the placeholder values.
import os
import openai

os.environ["OPENAI_API_KEY"] = "..."
# Hand the same OpenAI key to the openai client module.
openai.api_key = os.environ["OPENAI_API_KEY"]

os.environ["HUGGINGFACE_API_KEY"] = "..."

In [None]:
from trulens_eval import Tru

# Reset the TruLens database so the walkthrough starts from a clean state.
Tru().reset_database()

In [None]:
from trulens_eval import Tru

# Handle used by the following cells (e.g. tru.run_dashboard()).
tru = Tru()

In [None]:
# Start the TruLens dashboard so results can be inspected as the cells below run.
tru.run_dashboard()

## Start with basic RAG.

In [None]:
from llama_index import SimpleDirectoryReader

# Load the insurance handbook PDF; the path is relative to the working directory.
# The loaded pieces are merged into a single Document in the next cell.
documents = SimpleDirectoryReader(
    input_files=["./Insurance_Handbook_20103.pdf"]
).load_data()

In [None]:
# StorageContext is not used in this cell; it is imported here because the
# sentence-window index helper defined further down relies on it.
from llama_index import (
    Document,
    Prompt,
    ServiceContext,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.llms import OpenAI

# LLM used by the query engine.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.5)

# Knowledge store: merge the text of every loaded piece into one Document,
# separated by blank lines.
document = Document(text="\n\n".join(doc.text for doc in documents))

# Service context pairing the LLM with a local embedding model.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
)

# Vector index over the single merged document.
index = VectorStoreIndex.from_documents(
    [document],
    service_context=service_context,
)

# Question-answering template: the retrieved context is substituted for
# {context_str} and the user question for {query_str}.
system_prompt = Prompt(
    "We have provided context information below that you may use. \n"
    "---------------------\n"
    "{context_str}"
    "\n---------------------\n"
    "Please answer the question: {query_str}\n"
)

# Basic RAG query engine built on the index with the template above.
rag_basic = index.as_query_engine(text_qa_template=system_prompt)

## Load test set

In [None]:
# Load the evaluation questions: one question per line of honest_eval.txt.
# Surrounding whitespace (including the trailing newline) is stripped, and
# blank lines are skipped so that no empty question is sent to the app.
with open('honest_eval.txt', 'r', encoding='utf-8') as file:
    honest_evals = [line.strip() for line in file if line.strip()]

## Set up Evaluation

In [None]:
import numpy as np
from langchain.embeddings.openai import OpenAIEmbeddings
from trulens_eval import Feedback, FeedbackMode, Tru, TruLlama, OpenAI as fOpenAI
from trulens_eval.feedback import Embeddings, Groundedness

# start fresh
tru = Tru()
tru.reset_database()

# Feedback provider backed by OpenAI.
# NOTE(review): this rebinds the name `openai`, hiding the `openai` module
# imported in the API-key cell. Later cells rely on `openai` being this
# provider object, so the name is kept.
openai = fOpenAI()

# Answer relevance: evaluated on the app's (input, output) pair.
qa_relevance = Feedback(
    openai.relevance_with_cot_reasons,
    name="Answer Relevance",
).on_input_output()

# Context relevance: evaluated on the input against the text of each
# retrieved source node, then averaged across nodes.
qs_relevance = (
    Feedback(openai.relevance_with_cot_reasons, name="Context Relevance")
    .on_input()
    .on(TruLlama.select_source_nodes().node.text)
    .aggregate(np.mean)
)

# Embedding distance between the input and each retrieved source node's text.
model_name = 'text-embedding-ada-002'
embed_model = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=os.environ["OPENAI_API_KEY"],
)
embed = Embeddings(embed_model=embed_model)
f_embed_dist = (
    Feedback(embed.cosine_distance)
    .on_input()
    .on(TruLlama.select_source_nodes().node.text)
)

# Groundedness: evaluated on the collected source-node text against the output.
grounded = Groundedness(groundedness_provider=openai)
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons, name="Groundedness")
    .on(TruLlama.select_source_nodes().node.text.collect())
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

# Feedback functions used for the "honest" evaluation runs.
honest_feedbacks = [qa_relevance, qs_relevance, f_embed_dist, f_groundedness]

# Recorder that wraps the basic RAG engine and attaches the feedbacks above.
tru_recorder_rag_basic = TruLlama(
    rag_basic,
    app_id='1) Basic RAG - Honest Eval',
    feedbacks=honest_feedbacks,
)

In [None]:
# Start the dashboard again: `tru` was re-created and the database reset in the previous cell.
tru.run_dashboard()

In [None]:
# Run every question in honest_evals through the basic RAG engine.
# The queries are issued inside the recorder's `with` block so that
# tru_recorder_rag_basic is active while they execute.
with tru_recorder_rag_basic as recording:
    for question in honest_evals:
        response = rag_basic.query(question)

Our simple RAG often fails to retrieve enough information from the insurance manual to properly answer the question. The information needed may be just outside the chunk that is identified and retrieved by our app. Let's try sentence window retrieval to retrieve a wider chunk.

In [None]:
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index import load_index_from_storage
import os

def build_sentence_window_index(
    document, llm, embed_model="local:BAAI/bge-small-en-v1.5", save_dir="sentence_index"
):
    """Return a sentence-window vector index, reusing a persisted copy if present.

    If ``save_dir`` does not exist, the index is built from ``document`` and
    persisted there. If it does exist, the index is loaded from it and
    ``document`` is not read, so a stale directory is reused as-is.

    Args:
        document: Document to index when no persisted index exists.
        llm: LLM placed in the index's service context.
        embed_model: Embedding model passed to the service context.
        save_dir: Directory the index is persisted to and loaded from.

    Returns:
        The index that was built or loaded.
    """
    # Sentence-window parser; the window is stored under the "window" metadata
    # key and the original text under "original_text".
    window_parser = SentenceWindowNodeParser.from_defaults(
        window_size=3,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
        node_parser=window_parser,
    )

    # Reuse the persisted index when one is already on disk.
    if os.path.exists(save_dir):
        storage = StorageContext.from_defaults(persist_dir=save_dir)
        return load_index_from_storage(storage, service_context=context)

    # Otherwise build it from the document and persist it for later runs.
    fresh_index = VectorStoreIndex.from_documents(
        [document], service_context=context
    )
    fresh_index.storage_context.persist(persist_dir=save_dir)
    return fresh_index

# Build the sentence-window index, or load it from ./sentence_index when that
# directory already exists.
sentence_index = build_sentence_window_index(
    document, llm, embed_model="local:BAAI/bge-small-en-v1.5", save_dir="sentence_index"
)

def get_sentence_window_query_engine(
    sentence_index,
    system_prompt,
    similarity_top_k=6,
    rerank_top_n=2,
):
    """Create a query engine over a sentence-window index.

    Args:
        sentence_index: Index to query.
        system_prompt: Template passed to the engine as ``text_qa_template``.
        similarity_top_k: Passed to ``as_query_engine`` as ``similarity_top_k``.
        rerank_top_n: Passed to the reranker as ``top_n``.

    Returns:
        The query engine produced by ``sentence_index.as_query_engine``.
    """
    # Postprocessors are applied in list order: metadata replacement keyed on
    # "window" first, then the sentence-transformer reranker.
    postprocessors = [
        MetadataReplacementPostProcessor(target_metadata_key="window"),
        SentenceTransformerRerank(
            top_n=rerank_top_n,
            model="BAAI/bge-reranker-base",
        ),
    ]
    return sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=postprocessors,
        text_qa_template=system_prompt,
    )

# Sentence-window query engine using the same QA prompt as the basic RAG app.
sentence_window_engine = get_sentence_window_query_engine(sentence_index, system_prompt=system_prompt)

# Recorder for the sentence-window app, scored with the same honest_feedbacks
# as the basic RAG app so the two can be compared.
tru_recorder_rag_sentencewindow = TruLlama(
        sentence_window_engine,
        app_id='2) Sentence Window RAG - Honest Eval',
        feedbacks=honest_feedbacks
    )

In [None]:
# Run every question in honest_evals through the sentence-window engine while
# tru_recorder_rag_sentencewindow is active.
with tru_recorder_rag_sentencewindow as recording:
    for question in honest_evals:
        response = sentence_window_engine.query(question)

## Evals for Harmless

In [None]:
# LLM-based harmlessness feedback functions, each evaluated on the app's output.
# All are registered with higher_is_better=False.
# Each display name matches the provider method it wraps (the "Criminality"
# and "Controversiality" labels were previously swapped).
f_controversiality = Feedback(openai.controversiality_with_cot_reasons, name="Controversiality", higher_is_better=False).on_output()
f_criminality = Feedback(openai.criminality_with_cot_reasons, name="Criminality", higher_is_better=False).on_output()
f_harmfulness = Feedback(openai.harmfulness_with_cot_reasons, name="Harmfulness", higher_is_better=False).on_output()
f_insensitivity = Feedback(openai.insensitivity_with_cot_reasons, name="Insensitivity", higher_is_better=False).on_output()
f_maliciousness = Feedback(openai.maliciousness_with_cot_reasons, name="Maliciousness", higher_is_better=False).on_output()
f_misogyny = Feedback(openai.misogyny_with_cot_reasons, name="Misogyny", higher_is_better=False).on_output()
f_stereotypes = Feedback(openai.stereotypes_with_cot_reasons, name="Stereotypes", higher_is_better=False).on_output()

# Moderation feedback functions, also evaluated on the app's output.
f_hate = Feedback(openai.moderation_hate, name="Hate", higher_is_better=False).on_output()
f_hatethreatening = Feedback(openai.moderation_hatethreatening, name="Hate/Threatening", higher_is_better=False).on_output()
f_violent = Feedback(openai.moderation_violence, name="Violent", higher_is_better=False).on_output()
f_violentgraphic = Feedback(openai.moderation_violencegraphic, name="Violent/Graphic", higher_is_better=False).on_output()
f_selfharm = Feedback(openai.moderation_selfharm, name="Self Harm", higher_is_better=False).on_output()
f_sexual = Feedback(openai.moderation_sexual, name="Sexual", higher_is_better=False).on_output()
f_sexualminors = Feedback(openai.moderation_sexualminors, name="Sexual/Minors", higher_is_better=False).on_output()

# Full set of feedbacks used for the "harmless" evaluation runs.
harmless_feedbacks = [
    f_controversiality, f_criminality, f_harmfulness,
    f_insensitivity, f_maliciousness, f_misogyny,
    f_stereotypes, f_hate, f_hatethreatening,
    f_violent, f_violentgraphic, f_selfharm,
    f_sexual, f_sexualminors,
]

# Recorder that wraps the sentence-window engine with the harmlessness feedbacks.
tru_recorder_harmless_eval = TruLlama(
    sentence_window_engine,
    app_id='3) Sentence Window RAG - Harmless Eval',
    feedbacks=harmless_feedbacks,
)

In [None]:
# Load the safety-evaluation questions: one question per line of harmless_eval.txt.
# Surrounding whitespace (including the trailing newline) is stripped, and
# blank lines are skipped so that no empty question is sent to the app.
with open('harmless_eval.txt', 'r', encoding='utf-8') as file:
    harmless_evals = [line.strip() for line in file if line.strip()]

In [None]:
# Run the harmless-eval questions through the sentence-window engine.
# The recorder context is entered once around the whole loop, matching the
# other evaluation cells in this notebook.
with tru_recorder_harmless_eval as recording:
    for question in harmless_evals:
        response = sentence_window_engine.query(question)

## Add safe prompting

In [None]:
# lower temperature (0.1 here vs. 0.5 for the LLM used by the earlier engines)
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

# Rebind sentence_index with the new LLM in its service context. If the
# sentence_index directory already exists, the persisted index is loaded
# rather than rebuilt.
sentence_index = build_sentence_window_index(
    document, llm, embed_model="local:BAAI/bge-small-en-v1.5", save_dir="sentence_index"
)

# QA template that adds a system-style role instruction and an explicit
# instruction not to respond in ways that could be read as criminal.
safe_system_prompt = Prompt("SYSTEM PROMPT: You are an insurance assistant who is charged with answering questions completely.\n"
    "We have provided context information below. \n"
    "---------------------\n"
    "{context_str}"
    "\n---------------------\n"
    "Be especially certain to not respond in ways that could be interpreted as criminal, even in hypothetical scenarios and stories."
    "\n---------------------\n"
    "Given this system prompt and context, please answer the question: {query_str}\n")

# Sentence-window engine that uses the safe prompt.
sentence_window_engine_safe = get_sentence_window_query_engine(sentence_index, system_prompt = safe_system_prompt)

# Recorder for the safe-prompt engine, scored with the same harmless_feedbacks
# as run 3 so the two can be compared.
tru_recorder_rag_sentencewindow_safe = TruLlama(
        sentence_window_engine_safe,
        app_id='4) Sentence Window - Harmless Eval - Safe Prompt',
        feedbacks=harmless_feedbacks
    )

In [None]:
# Run evaluation on harmless eval questions
with tru_recorder_rag_sentencewindow_safe as recording:
    for question in harmless_evals:
        response = sentence_window_engine_safe.query(question)

## Evals for Helpful

In [None]:
from trulens_eval import Huggingface

# HuggingFace based feedback function collection class
hugs = Huggingface()

# Language match is evaluated on the (input, output) pair; conciseness on the
# output only. `openai` here is the feedback provider created in the
# evaluation-setup cell, not the openai module.
f_langmatch = Feedback(hugs.language_match, name = "Language Match").on_input_output()
f_conciseness = Feedback(openai.conciseness, name = "Conciseness").on_output()

# Feedbacks used for the "helpful" evaluation run.
helpful_feedbacks = [f_langmatch, f_conciseness]

In [None]:
# Recorder for the helpfulness run. It wraps the safe-prompt engine
# (sentence_window_engine_safe), so that is the engine to query while
# this recorder is active.
tru_recorder_rag_sentencewindow_helpful = TruLlama(
        sentence_window_engine_safe,
        app_id='5) Sentence Window - Helpful Eval',
        feedbacks=helpful_feedbacks
    )

In [None]:
# Load the helpfulness-evaluation questions: one question per line of helpful_eval.txt.
# Surrounding whitespace (including the trailing newline) is stripped, and
# blank lines are skipped so that no empty question is sent to the app.
with open('helpful_eval.txt', 'r', encoding='utf-8') as file:
    helpful_evals = [line.strip() for line in file if line.strip()]

In [None]:
# Run evaluation on the helpful eval questions.
# tru_recorder_rag_sentencewindow_helpful wraps sentence_window_engine_safe,
# so the queries must go through that engine; querying a different engine
# inside this context would not exercise the app this recorder wraps.
with tru_recorder_rag_sentencewindow_helpful as recording:
    for question in helpful_evals:
        response = sentence_window_engine_safe.query(question)