# Iterating on LLM Apps with TruLens

Now that we have improved our prototype RAG to reduce or stop hallucination and respond harmlessly, we can move on to ensuring it is helpful. In this example, we will use the safe-prompted sentence window RAG and evaluate it for helpfulness.

In [None]:
# Set your API keys. If they are already set as environment variables, you can skip this step.
import os
import openai

# Register both API keys in the process environment in one call.
os.environ.update(
    {
        "OPENAI_API_KEY": "sk-...",
        "HUGGINGFACE_API_KEY": "hf_...",
    }
)

In [None]:
from trulens_eval import Tru
# Create the Tru object that the leaderboard cell at the end of the notebook
# also uses.
tru = Tru()
# Presumably launches the TruLens dashboard so results can be inspected as
# they are recorded -- confirm against the trulens_eval docs.
tru.run_dashboard()

## Load data and helpful test set.

In [None]:
from llama_index import SimpleDirectoryReader

# Read the insurance handbook PDF through llama_index's directory reader.
documents = SimpleDirectoryReader(input_files=["./Insurance_Handbook_20103.pdf"]).load_data()

# Load the questions used for the helpfulness evaluation: one question per
# line of helpful_eval.txt.
helpful_evals = []
# Explicit encoding so the questions decode the same way on every platform
# instead of depending on the locale default.
with open('helpful_eval.txt', 'r', encoding='utf-8') as file:
    # Strip surrounding whitespace (including the trailing newline) lazily.
    stripped_lines = (line.strip() for line in file)
    # Drop blank lines so no empty question is sent to the query engine.
    helpful_evals = [question for question in stripped_lines if question]

## Set up helpful evaluations

In [None]:
from trulens_eval import Feedback
from trulens_eval.feedback.provider import OpenAI
from trulens_eval.feedback.provider import Huggingface

# Feedback providers: one OpenAI-backed, one Huggingface-backed.
provider = OpenAI()
hugs_provider = Huggingface()

# Coherence, evaluated on the app output.
f_coherence = Feedback(
    provider.coherence_with_cot_reasons,
    name="Coherence",
).on_output()

# Sentiment, evaluated separately on the app input and on the app output.
f_input_sentiment = Feedback(
    provider.sentiment_with_cot_reasons,
    name="Input Sentiment",
).on_input()
f_output_sentiment = Feedback(
    provider.sentiment_with_cot_reasons,
    name="Output Sentiment",
).on_output()

# Language match, evaluated on the input/output pair.
f_langmatch = Feedback(
    hugs_provider.language_match,
    name="Language Match",
).on_input_output()

# The full set of feedback functions attached to the helpfulness recorder.
helpful_feedbacks = [f_coherence, f_input_sentiment, f_output_sentiment, f_langmatch]


In [None]:
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index import load_index_from_storage
from llama_index import Document
from llama_index import ServiceContext, VectorStoreIndex, StorageContext
from llama_index.llms import OpenAI
import os

# LLM used for the first index build below.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.5)

# Knowledge store: the text of every loaded document, joined by blank lines,
# wrapped in a single Document.
document = Document(text="\n\n".join(doc.text for doc in documents))

# Baseline question-answering template with context and query placeholders.
from llama_index import Prompt

system_prompt = Prompt(
    "We have provided context information below that you may use. \n"
    "---------------------\n"
    "{context_str}"
    "\n---------------------\n"
    "Please answer the question: {query_str}\n"
)

def build_sentence_window_index(
    document, llm, embed_model="local:BAAI/bge-small-en-v1.5", save_dir="sentence_index"
):
    """Build a sentence-window vector index, or reload it from ``save_dir``.

    Args:
        document: Document to index. Only used when ``save_dir`` does not
            exist yet; otherwise the persisted index is loaded instead.
        llm: LLM placed in the service context.
        embed_model: Embedding model placed in the service context.
        save_dir: Directory the index is persisted to and reloaded from.

    Returns:
        The index, either freshly built and persisted or loaded from storage.
    """
    # Service context carrying a sentence-window node parser (window size 3).
    service_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
        node_parser=SentenceWindowNodeParser.from_defaults(
            window_size=3,
            window_metadata_key="window",
            original_text_metadata_key="original_text",
        ),
    )

    # An earlier run already persisted the index: load it rather than rebuild.
    if os.path.exists(save_dir):
        return load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir),
            service_context=service_context,
        )

    # First run: index the document, then persist it for later calls.
    fresh_index = VectorStoreIndex.from_documents(
        [document], service_context=service_context
    )
    fresh_index.storage_context.persist(persist_dir=save_dir)
    return fresh_index

# Build the sentence-window index for the handbook, or reload it if the
# "sentence_index" directory already exists.
sentence_index = build_sentence_window_index(
    document=document,
    llm=llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="sentence_index",
)

def get_sentence_window_query_engine(
    sentence_index,
    system_prompt,
    similarity_top_k=6,
    rerank_top_n=2,
):
    """Create a query engine over a sentence-window index.

    Args:
        sentence_index: Index whose ``as_query_engine`` method is called.
        system_prompt: Prompt passed as the engine's ``text_qa_template``.
        similarity_top_k: Passed through as ``similarity_top_k``.
        rerank_top_n: ``top_n`` given to the sentence-transformer reranker.

    Returns:
        The query engine built from ``sentence_index``.
    """
    # Postprocessors are listed in this order: metadata replacement using the
    # "window" key, then the BAAI/bge-reranker-base reranker.
    node_postprocessors = [
        MetadataReplacementPostProcessor(target_metadata_key="window"),
        SentenceTransformerRerank(top_n=rerank_top_n, model="BAAI/bge-reranker-base"),
    ]
    return sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=node_postprocessors,
        text_qa_template=system_prompt,
    )

# Switch to a lower-temperature LLM for the safe-prompted engine.
llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo")

# Call the index builder again with the new llm; when the "sentence_index"
# directory exists, it reloads the persisted index instead of rebuilding.
sentence_index = build_sentence_window_index(
    document=document,
    llm=llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="sentence_index",
)

# Safe prompt: the template carries an explicit instruction not to respond
# in ways that could be interpreted as criminal.
safe_system_prompt = Prompt(
    "SYSTEM PROMPT: You are an insurance assistant who is charged with answering questions completely.\n"
    "We have provided context information below. \n"
    "---------------------\n"
    "{context_str}"
    "\n---------------------\n"
    "Be especially certain to not respond in ways that could be interpreted as criminal, even in hypothetical scenarios and stories."
    "\n---------------------\n"
    "Given this system prompt and context, please answer the question: {query_str}\n"
)

# Query engine that answers with the safe prompt as its QA template.
sentence_window_engine_safe = get_sentence_window_query_engine(
    sentence_index,
    system_prompt=safe_system_prompt,
)

In [None]:
from trulens_eval import TruLlama
# Wrap the safe-prompt engine in a TruLlama recorder with the helpfulness
# feedback functions attached.
tru_recorder_rag_sentencewindow_helpful = TruLlama(
    sentence_window_engine_safe,
    app_id="5) Sentence Window - Helpful Eval",
    feedbacks=helpful_feedbacks,
)

In [None]:
# Run evaluation on helpful eval questions
# Every query is issued inside the recorder's `with` block; the returned
# response is not used afterwards in this cell.
with tru_recorder_rag_sentencewindow_helpful as recording:
    for question in helpful_evals:
        response = sentence_window_engine_safe.query(question)

## Check helpful evaluation results

In [None]:
# Request the leaderboard for only the app id given to the TruLlama recorder
# in this notebook.
tru.get_leaderboard(app_ids=["5) Sentence Window - Helpful Eval"])

Check helpful evaluation results. How can you improve the RAG on these evals? We'll leave that to you!