### Imports and Global Declarations

In [1]:
import csv
import os
import random
import re
from typing import List, NamedTuple

import dspy
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
from dspy.retrieve.weaviate_rm import WeaviateRM

from wcs_client_adapter import COLLECTION_TEXT_KEY, WCS_COLLECTION_NAME, WcsClientAdapter
from indexers import NaiveWcsIndexer

### Index Paper for Retrieval

In [2]:
# Source document for the retrieval corpus (an arXiv HTML page).
doc_uri = "https://arxiv.org/html/2312.10997v5"
# NOTE(review): presumably constructing the indexer is what fetches and indexes
# the document into the Weaviate collection queried below -- confirm in indexers.py.
indexer = NaiveWcsIndexer(doc_uri) # TODO: this calling syntax doesn't make it clear what side effects the constructor has

### Configure Language Model and Retrieval Model

In [3]:
# Generator LM used by every DSPy module unless overridden with dspy.context(...).
default_lm = dspy.OpenAI(model="gpt-3.5-turbo")

# Retrieval model backed by the Weaviate collection named by WCS_COLLECTION_NAME.
wcs_client = WcsClientAdapter.get_wcs_client()
wcs_rm = WeaviateRM(
    WCS_COLLECTION_NAME,
    weaviate_client=wcs_client,
    weaviate_collection_text_key=COLLECTION_TEXT_KEY,
)

# Register both as the process-wide DSPy defaults.
dspy.settings.configure(lm=default_lm, rm=wcs_rm)

### Load Questions Dataset

In [4]:
# CSV files holding the question dataset; the first column of each non-empty row
# is read as a question by load_questions_from_csv. Paths are relative to the
# notebook's working directory.
answerable_questions_path = "./data/answerable-questions.csv"
unanswerable_questions_path = "./data/unanswerable-questions.csv"

def load_questions_from_csv(file_path: str) -> List[str]:
    """Read the first column of every non-empty row of a UTF-8 CSV file.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The first cell of each non-empty row, in file order. No header row is
        skipped: every row that has at least one cell contributes an entry.
    """
    with open(file_path, mode='r', newline='', encoding='utf-8') as csv_file:
        # csv.reader yields [] for blank lines; those are filtered out.
        return [record[0] for record in csv.reader(csv_file) if record]

class DataSplits(NamedTuple):
    """Train/dev/test partitions returned by split_data."""
    train: List
    dev: List
    test: List

def split_data(data: List, train_size: float, dev_size: float, test_size: float) -> DataSplits:
    """Shuffle ``data`` and cut it into train/dev/test partitions.

    Args:
        data: Items to split. The list is shuffled IN PLACE with the global
            ``random`` generator, so the caller's list is reordered; seed
            ``random`` beforehand for a reproducible split.
        train_size: Fraction of items for the train partition.
        dev_size: Fraction of items for the dev partition.
        test_size: Fraction of items for the test partition.

    Returns:
        A DataSplits tuple. Train and dev sizes are truncated with ``int()``;
        the test partition receives every remaining item, so it can be slightly
        larger than ``test_size * len(data)``.

    Raises:
        ValueError: If any fraction is negative or the fractions do not sum to 1.
    """
    if min(train_size, dev_size, test_size) < 0:
        raise ValueError("train_size, dev_size, and test_size must be non-negative.")

    # Compare with a tolerance: an exact `!= 1` rejects valid inputs such as
    # 0.6 + 0.3 + 0.1, which evaluates to 0.9999999999999999 in binary floating point.
    if abs((train_size + dev_size + test_size) - 1.0) > 1e-9:
        raise ValueError("The sum of train_size, dev_size, and test_size must be 1.")

    random.shuffle(data)

    train_end = int(train_size * len(data))
    dev_end = train_end + int(dev_size * len(data))

    train_set = data[:train_end]
    dev_set = data[train_end:dev_end]
    test_set = data[dev_end:]

    return DataSplits(train=train_set, dev=dev_set, test=test_set)

# split_data shuffles with the global `random` generator; seed it so the
# train/dev/test membership is the same on every Restart & Run All.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

answerable_questions = load_questions_from_csv(answerable_questions_path)
unanswerable_questions = load_questions_from_csv(unanswerable_questions_path)
all_questions = answerable_questions + unanswerable_questions

# Unlabeled examples: only `question` is set and it is marked as the input field.
# (`trainset` is deliberately NOT aliased to the full dataset here; it is bound
# only once, to the train partition, below.)
all_qs_as_dspy_examples = [dspy.Example(question=question).with_inputs("question") for question in all_questions]

# Note: split_data shuffles all_qs_as_dspy_examples in place.
splits = split_data(all_qs_as_dspy_examples, 0.7, 0.15, 0.15)
trainset = splits.train
devset = splits.dev
testset = splits.test

### Build Signatures

In [5]:
# Judge LM: structured_llm_feedback switches to this model (via dspy.context)
# when rating answers, instead of the default generator LM.
metricLM = dspy.OpenAI(model='gpt-4-turbo', max_tokens=1000, model_type='chat')

# Signature for the answer-generation step of the RAG module: context + question -> answer.
# NOTE(review): intentionally left without a docstring. DSPy appears to use a
# Signature's docstring as the prompt instruction (the recorded prompt shows the
# auto-generated "Given the fields `context`, `question`, produce the fields
# `answer`." text), so adding one would change the prompt sent to the LM.
class GenerateAnswer(dspy.Signature):
    # Retrieved passages handed to the LM.
    context = dspy.InputField(desc="may contain relevant facts")
    # The user question being answered.
    question = dspy.InputField()
    # The generated answer; the description asks for a length limit.
    answer = dspy.OutputField(desc="between 1 and 4 sentences")
    
# Signature used by structured_llm_feedback to have the judge LM rate an answer
# against a single criterion.
# NOTE(review): the docstring below is presumably sent to the LM as the prompt
# instruction, so treat it as runtime text rather than documentation.
class Assess(dspy.Signature):
    """Assess the quality of an answer to a question."""
    
    # Passages the answer should be judged against (the metric passes "N/A" for
    # the detail criterion).
    context = dspy.InputField(desc="The context for answering the question.")
    # Despite the name, this carries the evaluation criterion, not the user question.
    assessed_question = dspy.InputField(desc="The evaluation criterion.")
    # The answer being rated.
    assessed_answer = dspy.InputField(desc="The answer to the question.")
    # Rating returned as text; the metric converts it to a number.
    assessment_answer = dspy.OutputField(desc="A rating between 1 and 5. Only output the rating and nothing else.")

def _parse_rating(raw_rating):
    """Convert the judge LM's rating text into a float.

    Plain numeric strings ("4", " 4.5 ") are converted directly. Otherwise the
    first number found in the text is used, so replies such as "Rating: 4" or
    "4/5" do not abort an evaluation run.

    Raises:
        ValueError: If the text contains no number at all.
    """
    text = str(raw_rating).strip()
    try:
        return float(text)
    except ValueError:
        pass

    match = re.search(r"\d+(?:\.\d+)?", text)
    if match is None:
        raise ValueError(f"Could not parse a numeric rating from assessment: {raw_rating!r}")
    return float(match.group())


def structured_llm_feedback(gold, pred, trace=None):
    """LLM-judged metric: rate a predicted answer for detail, faithfulness and overall quality.

    Args:
        gold: Example providing ``question``.
        pred: Prediction providing ``answer``.
        trace: Accepted for compatibility with the DSPy metric signature; unused.

    Returns:
        ``(detail + 2 * faithful + overall) / 5.0``. If the judge keeps each
        rating within the requested 1-5 range this lies between 0.8 and 4.0.
        NOTE(review): the result is therefore not normalised to [0, 1]
        (the recorded evaluation output shows "4.0 / 1 (400.0)") -- confirm
        this scale is intended.

    Raises:
        ValueError: If a rating returned by the judge contains no number.

    Side effects:
        Prints the question, the predicted answer and the three ratings, and
        issues one retrieval plus three judge-LM calls.
    """
    predicted_answer = pred.answer
    question = gold.question

    print(f"Test Question: {question}")
    print(f"Predicted Answer: {predicted_answer}")

    # Criteria are kept in their own names so they are not overwritten by the
    # judge's predictions below.
    detail_criterion = "Is the assessed answer detailed?"
    faithful_criterion = "Is the assessed text grounded in the context? Say no if it includes significant facts not in the context."
    overall_criterion = f"Please rate how well this answer answers the question, `{question}` based on the context.\n `{predicted_answer}`"

    with dspy.context(lm=metricLM):
        # NOTE(review): the context is re-retrieved here with k=5 rather than
        # taken from the prediction, so faithfulness may be judged against
        # passages the generator did not see -- confirm this is intended.
        context = dspy.Retrieve(k=5)(question).passages
        detail = dspy.ChainOfThought(Assess)(context="N/A", assessed_question=detail_criterion, assessed_answer=predicted_answer)
        faithful = dspy.ChainOfThought(Assess)(context=context, assessed_question=faithful_criterion, assessed_answer=predicted_answer)
        overall = dspy.ChainOfThought(Assess)(context=context, assessed_question=overall_criterion, assessed_answer=predicted_answer)

    print(f"Faithful: {faithful.assessment_answer}")
    print(f"Detail: {detail.assessment_answer}")
    print(f"Overall: {overall.assessment_answer}")

    # Faithfulness is weighted twice as heavily as the other two criteria.
    total = (
        _parse_rating(detail.assessment_answer)
        + _parse_rating(faithful.assessment_answer) * 2
        + _parse_rating(overall.assessment_answer)
    )

    return total / 5.0

### Build RAG Pipeline as DSPy Module

In [6]:
class RAG(dspy.Module):
    """Retrieve-then-generate pipeline: fetch passages for a question, then answer from them."""

    def __init__(self, num_passages=3):
        """Build the two stages.

        Args:
            num_passages: Number of passages requested from the retriever per question.
        """
        super().__init__()

        # Stage 1: retriever configured with k=num_passages.
        self.retrieve = dspy.Retrieve(k=num_passages)
        # Stage 2: chain-of-thought generation driven by the GenerateAnswer signature.
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    def forward(self, question):
        """Answer ``question`` and return a Prediction with ``context`` and ``answer``."""
        passages = self.retrieve(question).passages
        generated = self.generate_answer(context=passages, question=question)
        # Only the answer is surfaced from the generation step, together with
        # the passages it was given.
        return dspy.Prediction(context=passages, answer=generated.answer)

### Optimize Pipeline

In [7]:
def save_compiled_rag(compiled_rag, teleprompter, directory='dspy-optimized-programs'):
    """Save a compiled program under the next free sequence number.

    Files are named ``rag-<TeleprompterClassName>-<N>.dspy.txt``. ``N`` is one
    more than the highest number already present in ``directory`` for that
    teleprompter class, or 1 if there is none.

    Args:
        compiled_rag: Object exposing ``save(path)``; called with the chosen path.
        teleprompter: Optimizer instance; only its class name is used, for the file name.
        directory: Output directory, created if missing. Defaults to the
            notebook's original hard-coded location.

    Side effects:
        Creates ``directory`` if needed, writes the file via ``compiled_rag.save``
        and prints the destination path.
    """
    teleprompter_name = teleprompter.__class__.__name__
    os.makedirs(directory, exist_ok=True)

    # re.escape keeps an unusual class name from being read as regex syntax.
    pattern = re.compile(rf'^rag-{re.escape(teleprompter_name)}-(\d+)\.dspy\.txt$')
    matches = (pattern.match(filename) for filename in os.listdir(directory))
    # default=0 covers a directory with no earlier saves for this teleprompter.
    highest_number = max((int(m.group(1)) for m in matches if m), default=0)

    file_path = os.path.join(directory, f'rag-{teleprompter_name}-{highest_number + 1}.dspy.txt')

    compiled_rag.save(file_path)
    print(f"Saved optimized program to {file_path}")

# Optimizer used to compile the RAG module, scored with the LLM-judged metric
# structured_llm_feedback.
# NOTE(review): the small values below (2 candidate programs, 2 threads, 1 round)
# look chosen to keep LLM cost and runtime low -- confirm before relying on the
# resulting scores.
teleprompter = BootstrapFewShotWithRandomSearch(
    metric=structured_llm_feedback,
    max_bootstrapped_demos=4,
    max_labeled_demos=4,
    max_rounds=1,
    num_candidate_programs=2,
    num_threads=2
    ) 

import contextlib
import sys

# Compilation is very chatty (the metric prints every question, answer and
# rating), so stdout/stderr are redirected to files. Follow progress with
# `tail -f` on these paths.
stdout_file_path = 'dspy-optimized-programs/compile_stdout.txt'
stderr_file_path = 'dspy-optimized-programs/compile_stderr.txt'

# The output directory is otherwise only created inside save_compiled_rag,
# which runs AFTER compilation; create it up front so open() does not raise
# FileNotFoundError on a fresh checkout.
for log_path in (stdout_file_path, stderr_file_path):
    os.makedirs(os.path.dirname(log_path), exist_ok=True)

with open(stdout_file_path, 'w') as stdout_file, open(stderr_file_path, 'w') as stderr_file:
    with contextlib.redirect_stdout(stdout_file), contextlib.redirect_stderr(stderr_file):
        compiled_rag = teleprompter.compile(RAG(), trainset=trainset)
save_compiled_rag(compiled_rag, teleprompter)

NameError: name 'save_optimized_rag' is not defined

### Execute Pipeline

In [14]:
my_question = "What methodologies are used for evaluating RAG systems?"

# Instantiate the baseline pipeline here. This cell otherwise depends on
# `uncompiled_rag` from the evaluation cell further down, which raises
# NameError on Restart & Run All.
uncompiled_rag = RAG()
pred = uncompiled_rag(my_question)

print(f"Question: {my_question}")
print(f"Predicted Answer: {pred.answer}")
print(f"Retrieved Contexts (truncated): {[c[:200] + '...' for c in pred.context]}")

Question: What methodologies are used for evaluating RAG systems?
Predicted Answer: The methodologies used for evaluating RAG systems include assessing retrieval quality through metrics like Hit Rate, MRR, and NDCG, and evaluating generation quality based on metrics such as answer faithfulness, answer relevance, and counterfactual robustness. These evaluations aim to measure the effectiveness of the context sourced by the retriever component and the generator's capacity to synthesize coherent answers.
Retrieved Contexts (truncated): ['Target Historically, RAG models assessments have centered on their execution in specific downstream tasks. These evaluations employ established metrics suitable to the tasks at hand. For instance, que...', 'charting its evolution and anticipated future paths, with a focus on the integration of RAG within LLMs. This paper considers both technical paradigms and research methods, summarizing three main rese...', 'information from multiple documents to addres

In [15]:
# Show the most recent prompt/completion recorded by the default LM, i.e. the
# call made for the baseline query above.
default_lm.inspect_history(n=1)




Given the fields `context`, `question`, produce the fields `answer`.

---

Follow the following format.

Context: may contain relevant facts

Question: ${question}

Reasoning: Let's think step by step in order to ${produce the answer}. We ...

Answer: between 1 and 4 sentences

---

Context:
[1] «Target Historically, RAG models assessments have centered on their execution in specific downstream tasks. These evaluations employ established metrics suitable to the tasks at hand. For instance, question answering evaluations might rely on EM and F1 scores[45,72,59,7], whereas fact-checking tasks often hinge on Accuracy as the primary metric[4,42,14]. BLEU and ROUGE metrics are also commonly used to evaluate answer quality[26,78,52,32]. Tools like RALLE, designed for the automatic evaluation of RAG applications, similarly base their assessments on these task-specific metrics[160]. Despite this, there is a notable paucity of research dedicated to evaluating the distinct characteristics of 

"\n\n\nGiven the fields `context`, `question`, produce the fields `answer`.\n\n---\n\nFollow the following format.\n\nContext: may contain relevant facts\n\nQuestion: ${question}\n\nReasoning: Let's think step by step in order to ${produce the answer}. We ...\n\nAnswer: between 1 and 4 sentences\n\n---\n\nContext:\n[1] «Target Historically, RAG models assessments have centered on their execution in specific downstream tasks. These evaluations employ established metrics suitable to the tasks at hand. For instance, question answering evaluations might rely on EM and F1 scores[45,72,59,7], whereas fact-checking tasks often hinge on Accuracy as the primary metric[4,42,14]. BLEU and ROUGE metrics are also commonly used to evaluate answer quality[26,78,52,32]. Tools like RALLE, designed for the automatic evaluation of RAG applications, similarly base their assessments on these task-specific metrics[160]. Despite this, there is a notable paucity of research dedicated to evaluating the distinc

In [9]:
# Ask the optimized pipeline the same question that was used for the baseline.
my_question = "What methodologies are used for evaluating RAG systems?"

pred = compiled_rag(my_question)

# One print call, one line per item; contexts are cut to 200 characters each.
print(
    f"Question: {my_question}",
    f"Predicted Answer: {pred.answer}",
    f"Retrieved Contexts (truncated): {[c[:200] + '...' for c in pred.context]}",
    sep="\n",
)

Question: What methodologies are used for evaluating RAG systems?
Predicted Answer: The methodologies used for evaluating RAG systems include assessing retrieval quality through metrics like Hit Rate, MRR, and NDCG, and evaluating generation quality based on metrics such as answer faithfulness, answer relevance, and counterfactual robustness. These evaluations aim to measure the effectiveness of the context sourced by the retriever component and the generator's capacity to synthesize coherent answers.
Retrieved Contexts (truncated): ['Target Historically, RAG models assessments have centered on their execution in specific downstream tasks. These evaluations employ established metrics suitable to the tasks at hand. For instance, que...', 'charting its evolution and anticipated future paths, with a focus on the integration of RAG within LLMs. This paper considers both technical paradigms and research methods, summarizing three main rese...', 'information from multiple documents to addres

In [10]:
# Show the most recent prompt/completion recorded by the default LM, i.e. the
# call made for the compiled-pipeline query above.
default_lm.inspect_history(n=1)




Given the fields `context`, `question`, produce the fields `answer`.

---

Follow the following format.

Context: may contain relevant facts

Question: ${question}

Reasoning: Let's think step by step in order to ${produce the answer}. We ...

Answer: between 1 and 4 sentences

---

Context:
[1] «Target Historically, RAG models assessments have centered on their execution in specific downstream tasks. These evaluations employ established metrics suitable to the tasks at hand. For instance, question answering evaluations might rely on EM and F1 scores[45,72,59,7], whereas fact-checking tasks often hinge on Accuracy as the primary metric[4,42,14]. BLEU and ROUGE metrics are also commonly used to evaluate answer quality[26,78,52,32]. Tools like RALLE, designed for the automatic evaluation of RAG applications, similarly base their assessments on these task-specific metrics[160]. Despite this, there is a notable paucity of research dedicated to evaluating the distinct characteristics of 

"\n\n\nGiven the fields `context`, `question`, produce the fields `answer`.\n\n---\n\nFollow the following format.\n\nContext: may contain relevant facts\n\nQuestion: ${question}\n\nReasoning: Let's think step by step in order to ${produce the answer}. We ...\n\nAnswer: between 1 and 4 sentences\n\n---\n\nContext:\n[1] «Target Historically, RAG models assessments have centered on their execution in specific downstream tasks. These evaluations employ established metrics suitable to the tasks at hand. For instance, question answering evaluations might rely on EM and F1 scores[45,72,59,7], whereas fact-checking tasks often hinge on Accuracy as the primary metric[4,42,14]. BLEU and ROUGE metrics are also commonly used to evaluate answer quality[26,78,52,32]. Tools like RALLE, designed for the automatic evaluation of RAG applications, similarly base their assessments on these task-specific metrics[160]. Despite this, there is a notable paucity of research dedicated to evaluating the distinc

### Evaluate Complete RAG Pipeline

In [11]:
from dspy.evaluate.evaluate import Evaluate

# LLM-judged metric shared by the baseline and compiled evaluation runs.
metric = structured_llm_feedback

# Evaluator over the dev split: single-threaded, with a progress bar and a
# 5-row results table.
evaluate_with_llm_feedback = Evaluate(
    devset=devset,
    num_threads=1,
    display_progress=True,
    display_table=5,
)

In [12]:
# Baseline: an un-optimized RAG module scored on the dev split. The call is the
# last expression so its returned score is displayed as the cell output.
uncompiled_rag = RAG()
evaluate_with_llm_feedback(uncompiled_rag, metric=metric)

  0%|          | 0/15 [00:00<?, ?it/s]Test Question: What role does RAG play in the development of virtual assistants and conversational agents?
Predicted Answer: RAG plays a crucial role in the development of virtual assistants and conversational agents by providing chunked retrieval and on-demand input to improve operational efficiency. Additionally, RAG-based generation helps quickly locate original references for LLMs, enabling users to verify generated answers. The observable retrieval and reasoning process of RAG contrasts with the black box nature of generation solely relying on long context, making it an essential tool for developing advanced AI technologies.
Faithful: 5
Detail: 5
Overall: 5
Average Metric: 4.0 / 1  (400.0):   7%|▋         | 1/15 [00:17<04:06, 17.62s/it]Test Question: How is the integration of domain-specific information handled in RAG systems?
Predicted Answer: The integration of domain-specific information in Retrieval-Augmented Generation (RAG) systems is ac

Unnamed: 0,question,context,answer,structured_llm_feedback
0,What role does RAG play in the development of virtual assistants and conversational agents?,"['discussions on whether RAG is still necessary when LLMs are not constrained by context. In fact, RAG still plays an irreplaceable role. On one hand,...",RAG plays a crucial role in the development of virtual assistants and conversational agents by providing chunked retrieval and on-demand input to improve operational efficiency....,✔️ [4.0]
1,How is the integration of domain-specific information handled in RAG systems?,"['University College of Design and Innovation, Tongji University Abstract Large Language Models (LLMs) showcase impressive capabilities but encounter challenges like hallucination, outdated knowledge, and non-transparent,...",The integration of domain-specific information in Retrieval-Augmented Generation (RAG) systems is achieved by incorporating knowledge from external databases. This enhances the accuracy and credibility of...,✔️ [3.8]
2,How does RAG contribute to real-world applications such as chatbots?,"['paper introduces up-to-date evaluation framework and benchmark. At the end, this article delineates the challenges currently faced and points out prospective avenues for research and...",RAG contributes to real-world applications such as chatbots by enhancing the accuracy and reliability of large language models through the retrieval of relevant information from...,✔️ [3.8]
3,How does RAG deal with the retrieval of contradictory information from different sources?,['or contradictory information during retrieval can detrimentally affect RAG’s output quality. This situation is figuratively referred to as “Misinformation can be worse than no information...,RAG may encounter difficulties in handling contradictory information from different sources during retrieval due to precision and recall challenges. This can lead to the selection...,✔️ [4.0]
4,How is external knowledge integrated into the generation process in RAG systems?,"['University College of Design and Innovation, Tongji University Abstract Large Language Models (LLMs) showcase impressive capabilities but encounter challenges like hallucination, outdated knowledge, and non-transparent,...","External knowledge is integrated into the generation process in RAG systems through methods like Flare and Self-RAG, which refine the framework by determining optimal moments...",✔️ [4.0]


378.67

In [13]:
# Score the optimized program on the same dev split and metric as the baseline
# so the two results are directly comparable.
evaluate_with_llm_feedback(compiled_rag, metric=metric)

  0%|          | 0/15 [00:00<?, ?it/s]Test Question: What role does RAG play in the development of virtual assistants and conversational agents?
Predicted Answer: RAG plays a crucial role in the development of virtual assistants and conversational agents by providing chunked retrieval and on-demand input to improve operational efficiency. Additionally, RAG-based generation helps quickly locate original references for LLMs, enabling users to verify generated answers. The observable retrieval and reasoning process of RAG contrasts with the black box nature of generation solely relying on long context, making it an essential tool for developing advanced AI technologies.
Faithful: 5
Detail: 5
Overall: 5
Average Metric: 4.0 / 1  (400.0):   7%|▋         | 1/15 [00:00<00:13,  1.04it/s]Test Question: How is the integration of domain-specific information handled in RAG systems?
Predicted Answer: The integration of domain-specific information in Retrieval-Augmented Generation (RAG) systems is ac

Unnamed: 0,question,context,answer,structured_llm_feedback
0,What role does RAG play in the development of virtual assistants and conversational agents?,"['discussions on whether RAG is still necessary when LLMs are not constrained by context. In fact, RAG still plays an irreplaceable role. On one hand,...",RAG plays a crucial role in the development of virtual assistants and conversational agents by providing chunked retrieval and on-demand input to improve operational efficiency....,✔️ [4.0]
1,How is the integration of domain-specific information handled in RAG systems?,"['University College of Design and Innovation, Tongji University Abstract Large Language Models (LLMs) showcase impressive capabilities but encounter challenges like hallucination, outdated knowledge, and non-transparent,...",The integration of domain-specific information in Retrieval-Augmented Generation (RAG) systems is achieved by incorporating knowledge from external databases. This enhances the accuracy and credibility of...,✔️ [3.8]
2,How does RAG contribute to real-world applications such as chatbots?,"['paper introduces up-to-date evaluation framework and benchmark. At the end, this article delineates the challenges currently faced and points out prospective avenues for research and...",RAG contributes to real-world applications such as chatbots by enhancing the accuracy and reliability of large language models through the retrieval of relevant information from...,✔️ [3.8]
3,How does RAG deal with the retrieval of contradictory information from different sources?,['or contradictory information during retrieval can detrimentally affect RAG’s output quality. This situation is figuratively referred to as “Misinformation can be worse than no information...,RAG may encounter difficulties in handling contradictory information from different sources during retrieval due to precision and recall challenges. This can lead to the selection...,✔️ [4.0]
4,How is external knowledge integrated into the generation process in RAG systems?,"['University College of Design and Innovation, Tongji University Abstract Large Language Models (LLMs) showcase impressive capabilities but encounter challenges like hallucination, outdated knowledge, and non-transparent,...","External knowledge is integrated into the generation process in RAG systems through methods like Flare and Self-RAG, which refine the framework by determining optimal moments...",✔️ [4.0]


377.33