In [2]:
# import stuff
import numpy as np
import time

from datasets import load_dataset
import dspy
import weaviate

In [9]:
# dspy setup
lm = dspy.LM(
    "openai/gpt-4.1",
    cache=False,
    api_key=os.environ["OPENAI_API_KEY"]
)

dspy.configure(lm=lm, track_usage=True)

lm("say hello")

  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [input_value=Message(content='Hello! H...: None}, annotations=[]), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [input_value=Choices(finish_reason='st...ider_specific_fields={}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(


['Hello! How can I help you today? ðŸ˜Š']

In [10]:
class GenerateAnswerFromParameters(dspy.Signature):
    """Answer the question as well as you can."""

    question: str = dspy.InputField(description="The question to answer.")
    answer: str = dspy.OutputField(description="The answer to the question.")

qa_system = dspy.Predict(GenerateAnswerFromParameters)

qa_system(question="What is HyDE?")

  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [input_value=Message(content='[[ ## an...: None}, annotations=[]), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [input_value=Choices(finish_reason='st...ider_specific_fields={}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(


Prediction(
    answer='HyDE stands for Hydrogen Evolution (HyDE) and is a software tool designed for the automated identification and assessment of hydrogen bonds and other weak interactions in macromolecular structures, particularly those determined by X-ray crystallography or cryo-EM. Originally developed and used in structural biology and bioinformatics, HyDE analyzes protein-ligand interactions, emphasizing hydrogen bonding, hydrophobic contacts, and other intermolecular forces. The results from HyDE can help in drug design and understanding protein function.\n\nNote: In different scientific or technological contexts, "HyDE" could refer to other things. For example, in chemistry and drug discovery, HyDE may refer to a "Hydrogen bond and Dehydration scoring function" used for protein-ligand interaction assessment. Always consider the subject area to interpret "HyDE" correctly.'
)

In [None]:
# define RAG systems
from typing import Any, Literal
from weaviate.classes.query import Filter

class GenerateAnswer(dspy.Signature):
    """Assess the context and answer the question."""

    question: str = dspy.InputField(description="The question to answer.")
    context: list[str] | list[dspy.Image] = dspy.InputField(description="The context to use to answer the question.")
    answer: str = dspy.OutputField(description="The answer to the question.")

class RAGSystem(dspy.Module):
    def __init__(self, collection: Any, images_or_text: Literal["image", "text"], k: int = 5):
        self.generate_answer = dspy.Predict(GenerateAnswer)
        self.collection = collection
        self.images_or_text = images_or_text
        self.k = k
    def _get_objects(self, question: str) -> list[str] | list[dspy.Image]:
        if self.images_or_text == "image":
            response = self.collection.query.hybrid(
                query=question,
                return_properties=["base64_str"],
                limit=self.k
            )
            objects = []
            for o in response.objects:
                objects.append(o.properties["base64_str"])
            # do PIL magic and convert these to dspy.Image objects
            return objects
        elif self.images_or_text == "text":
            response = self.collection.query.hybrid(
                query=question,
                return_properties=["content"],
                limit=self.k
            )
            objects = []
            for o in response.objects:
                objects.append(o.properties["content"])
            return objects
        
    def _fetch_oracle_context(
        self,
        oracle_context_id: str, 
    ) -> str | dspy.Image:
        if self.images_or_text == "image":
            response = self.collection.query.fetch_objects(
                filters=Filter.by_property("dataset_id").like(oracle_context_id),
                return_properties=["base64_str"]
            )
            return response.objects[0].properties["base64_str"]
            
        elif self.images_or_text == "text":
            response = self.collection.query.fetch_objects(
                filters=Filter.by_property("dataset_id").like(oracle_context_id),
                return_properties=["content"]
            )
            return response.objects[0].properties["content"]

    def __call__(
        self, 
        question: str, 
        oracle_context_id: str = None
    ) -> str:
        if oracle_context_id is None:
            context = self._get_objects(question)
        else:
            context = self._fetch_oracle_context(oracle_context_id)
        return self.generate_answer(question=question, context=context)

In [21]:
weaviate_client = weaviate.connect_to_weaviate_cloud(
    cluster_url=os.environ["WEAVIATE_URL"],
    auth_credentials=weaviate.auth.AuthApiKey(os.environ["WEAVIATE_API_KEY"])
)

collection = weaviate_client.collections.get("IRPapersText_Default")

rag_system = RAGSystem(collection, "text")

rag_system(question="What is HyDE?")


  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [input_value=Message(content='[[ ## an...: None}, annotations=[]), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [input_value=Choices(finish_reason='st...ider_specific_fields={}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(


Prediction(
    answer='HyDE stands for "Hypothetical Document Embeddings." It is a method designed to build effective dense information retrievers in a completely unsupervised way, without relying on relevance labels or supervised fine-tuning. The core idea is to combine two types of models:\n\n1. **A generative, instruction-following language model (e.g., InstructGPT, GPT-3):** For each search query, the language model is tasked with generating a hypothetical document that would answer the query â€” even if it\'s not a real document and may include imaginary or hallucinated content. The goal is for this generated text to closely resemble something relevant to the query.\n\n2. **A contrastive text encoder (e.g., Contriever, mContriever):** This encoder then converts the hypothetical document into a dense vector embedding. This embedding is comparedâ€”using inner product similarityâ€”to the embeddings of all real documents in the search corpus.\n\nThe top-ranked (most similar) real doc

In [25]:
question, oracle_context_id = queries[0]["question"], str(queries[0]["dataset_id"])

rag_system(question=question, oracle_context_id=oracle_context_id)

1_1


  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [input_value=Message(content='[[ ## an...: None}, annotations=[]), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [input_value=Choices(finish_reason='st...ider_specific_fields={}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(


Prediction(
    answer='GPT-3.5 generated query variants achieved up to 71.1% overlap in document pooling at depth 100 compared to human-generated variants in the UQV100 test collection.'
)

In [26]:
# llm as judge
class AssessAlignmentScore(dspy.Signature):
    """You are an expert grader assessing if a system's answer is semantically aligned with the correct answer.
    Only return True if the system answer has essentially the same meaning as the correct answer.
    If the system answer misses key aspects or meaning, return False.
    """

    question: str = dspy.InputField(description="The question asked.")
    system_answer: str = dspy.InputField(description="The answer generated by the system.")
    correct_answer: str = dspy.InputField(description="The reference answer containing the correct and complete information.")
    score: bool = dspy.OutputField(description="True if system_answer is equivalent in meaning to correct_answer, otherwise False.")

judge = dspy.Predict(AssessAlignmentScore)

test_question = "What is HyDE?"
correct_answer = "HyDE stands for Hypothetical Document Embeddings, a technique for improving retrieval in AI systems by generating hypothetical answers and using their embeddings."

# System answer missing key aspect (embeddings)
incorrect_answer = "HyDE is a technique for improving retrieval in AI systems by generating hypothetical answers."
# System answer rewords but covers all key ideas
acceptable_answer = "Hypothetical Document Embeddings (HyDE) is a method to help AI retrieval by creating hypothetical documents as sample answers and using their vector representations."

response = judge(question=test_question, system_answer=incorrect_answer, correct_answer=correct_answer)
print(response)
response = judge(question=test_question, system_answer=acceptable_answer, correct_answer=correct_answer)
print(response)

Prediction(
    score=False
)


  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [input_value=Message(content='[[ ## sc...: None}, annotations=[]), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [input_value=Choices(finish_reason='st...ider_specific_fields={}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(


Prediction(
    score=True
)


In [27]:
response.get_lm_usage()

{'openai/gpt-4.1': {'completion_tokens': 12,
  'prompt_tokens': 349,
  'total_tokens': 361,
  'completion_tokens_details': {'accepted_prediction_tokens': 0,
   'audio_tokens': 0,
   'reasoning_tokens': 0,
   'rejected_prediction_tokens': 0,
   'text_tokens': None},
  'prompt_tokens_details': {'audio_tokens': 0,
   'cached_tokens': 0,
   'text_tokens': None,
   'image_tokens': None}}}

In [15]:
# load data
from datasets import load_dataset

queries = load_dataset("weaviate/irpapers-queries")["train"]

In [None]:
alignment_scores, input_tokens, output_tokens = [], [], []

K = 3

start = time.time()
for idx, query in enumerate(queries):
    if idx % 5 == 4:
        print(f"Processed {idx+1} queries in {time.time() - start} seconds...")
    test_query, ground_truth_answer, oracle_context_id = query["question"], query["answer"], str(query["dataset_id"])
    qa_system_response = rag_system(
        question=test_query,
        oracle_context_id=oracle_context_id
    )
    usage_dict = qa_system_response.get_lm_usage()["openai/gpt-4.1"]
    input_tokens.append(usage_dict["prompt_tokens"])
    output_tokens.append(usage_dict["completion_tokens"])

    ensemble_votes = 0
    for judge_predictions in range(K):
        lm_judge_response = judge(
            question=test_query,
            system_answer=qa_system_response.answer,
            correct_answer=ground_truth_answer
        )
        if lm_judge_response.score:
            ensemble_votes += 1
    if ensemble_votes >= K / 2:
        alignment_scores.append(1)
    else:
        alignment_scores.append(0)

alignment_scores = np.array(alignment_scores)
input_tokens = np.array(input_tokens)
output_tokens = np.array(output_tokens)

print(alignment_scores.mean())
print(input_tokens.mean())
print(output_tokens.mean())


1_1


  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [input_value=Message(content='[[ ## an...: None}, annotations=[]), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [input_value=Choices(finish_reason='st...ider_specific_fields={}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(
  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [input_value=Message(content='[[ ## sc...: None}, annotations=[]), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [input_value=Choices(finish_reason='st...ider_specific_fields={}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(


1_2
1_3
1_4
Processed 5 queries in 25.57102084159851 seconds...
1_5
2_1
2_2
2_3


In [24]:
alignment_scores = np.array(alignment_scores)
input_tokens = np.array(input_tokens)
output_tokens = np.array(output_tokens)

print(alignment_scores.mean())
print(input_tokens.mean())
print(output_tokens.mean())

0.17647058823529413
170.28846153846155
123.32692307692308
