In [None]:
# Set environment variables.
import os

# Credentials are expected to come from the shell environment. setdefault only
# fills in an empty placeholder when a variable is missing, so a key that is
# already exported is no longer wiped out by an empty string (which would make
# every later API call fail authentication). To hard-set a value for this
# session, assign it explicitly: os.environ["OPENAI_API_KEY"] = "..."
for _env_name in ("WEAVIATE_URL", "WEAVIATE_API_KEY", "OPENAI_API_KEY"):
    os.environ.setdefault(_env_name, "")

In [17]:
# import stuff
import numpy as np
import time

from datasets import load_dataset
import dspy
import weaviate

In [3]:
# dspy setup
# Build the GPT-4.1 client with caching switched off; the API key is read from
# the environment rather than written into the notebook.
lm = dspy.LM("openai/gpt-4.1", api_key=os.environ["OPENAI_API_KEY"], cache=False)

# Register the client as the default LM and enable usage tracking.
dspy.configure(track_usage=True, lm=lm)

# Smoke test: a single round trip to confirm the client is working.
lm("say hello")

['Hello! How can I help you today? 😊']

  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [input_value=Message(content='Hello! H...: None}, annotations=[]), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [input_value=Choices(finish_reason='st...ider_specific_fields={}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(


In [6]:
# Signature for closed-book question answering: the only input is the question
# itself (there is no context field), and the only output is a free-text answer.
# NOTE(review): the docstring and field descriptions are presumably sent to the
# LM as prompt text, so treat them as runtime strings when editing — confirm.
class GenerateAnswerFromParameters(dspy.Signature):
    """Answer the question as well as you can."""

    # The user question, passed through as a plain string.
    question: str = dspy.InputField(description="The question to answer.")
    # The model's answer, returned as a plain string.
    answer: str = dspy.OutputField(description="The answer to the question.")

# Wrap the closed-book signature in a dspy.Predict module; this is the system
# under evaluation in the scoring loop further down.
qa_system = dspy.Predict(GenerateAnswerFromParameters)

# Sanity check with one question; the Prediction is shown as the cell output.
qa_system(question="What is HyDE?")

  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [input_value=Message(content='[[ ## an...: None}, annotations=[]), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [input_value=Choices(finish_reason='st...ider_specific_fields={}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(


Prediction(
    answer='HyDE can refer to several things depending on the context, but most commonly:\n\n1. **HyDE (Hybrid Density Estimator)**: In artificial intelligence and large language models (LLMs), HyDE stands for "Hypothetical Document Embeddings". It is a method, introduced by OpenAI, for enhancing retrieval-augmented generation (RAG) systems. HyDE works by generating a hypothetical answer to a user\'s query, then using the embedding (vector representation) of that hypothetical document to better match relevant real documents during retrieval from a database or corpus. This leads to improved information retrieval performance, as the search is guided by a synthesized, context-rich sample.\n\n2. **Hyde (Other meanings)**:\n   - It could refer to the character "Hyde" from the classic novel "Strange Case of Dr Jekyll and Mr Hyde" by Robert Louis Stevenson.\n   - In music, Hyde is a well-known Japanese singer.\n   - There are cities named Hyde (e.g., Hyde, Greater Manchester in En

In [None]:
# define RAG systems
# (this cell only declares the answer-generation signature; no retriever or
# predictor is built here)

# Signature for context-grounded answering: takes a question plus context and
# produces a free-text answer.
class GenerateAnswer(dspy.Signature):
    """Assess the context and answer the question."""

    # The user question, passed through as a plain string.
    question: str = dspy.InputField(description="The question to answer.")
    # Context is either a list of strings or a list of dspy.Image objects.
    context: list[str] | list[dspy.Image] = dspy.InputField(description="The context to use to answer the question.")
    # The model's answer, returned as a plain string.
    answer: str = dspy.OutputField(description="The answer to the question.")

In [12]:
# llm as judge
# Signature for the LLM judge: given a question, the system's answer, and the
# reference answer, it outputs a single boolean verdict.
# NOTE(review): the docstring below is presumably the grading instruction sent
# to the LM, so it is left untouched — confirm before rewording it.
class AssessAlignmentScore(dspy.Signature):
    """You are an expert grader assessing if a system's answer is semantically aligned with the correct answer.
    Only return True if the system answer has essentially the same meaning as the correct answer.
    If the system answer misses key aspects or meaning, return False.
    """

    # The original question, included so the judge can grade in context.
    question: str = dspy.InputField(description="The question asked.")
    # The candidate answer being graded.
    system_answer: str = dspy.InputField(description="The answer generated by the system.")
    # The ground-truth answer the candidate is compared against.
    correct_answer: str = dspy.InputField(description="The reference answer containing the correct and complete information.")
    # Verdict: True only when the candidate matches the reference in meaning.
    score: bool = dspy.OutputField(description="True if system_answer is equivalent in meaning to correct_answer, otherwise False.")

# Predictor that runs the grading signature.
judge = dspy.Predict(AssessAlignmentScore)

# Reference question/answer pair used to sanity-check the judge.
test_question = "What is HyDE?"
correct_answer = "HyDE stands for Hypothetical Document Embeddings, a technique for improving retrieval in AI systems by generating hypothetical answers and using their embeddings."

# Candidate that leaves out a key aspect (the embeddings).
incorrect_answer = "HyDE is a technique for improving retrieval in AI systems by generating hypothetical answers."
# Candidate that paraphrases the reference while covering every key idea.
acceptable_answer = "Hypothetical Document Embeddings (HyDE) is a method to help AI retrieval by creating hypothetical documents as sample answers and using their vector representations."

# Grade the incomplete candidate first, then the paraphrase. `response` ends up
# holding the verdict for the last candidate graded.
for system_answer in (incorrect_answer, acceptable_answer):
    response = judge(
        question=test_question,
        system_answer=system_answer,
        correct_answer=correct_answer,
    )
    print(response)

  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [input_value=Message(content='[[ ## sc...: None}, annotations=[]), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [input_value=Choices(finish_reason='st...ider_specific_fields={}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(


Prediction(
    score=False
)
Prediction(
    score=True
)


In [13]:
# Show the LM usage recorded on `response` (the last judge verdict from the
# cell above); the result is keyed by model name.
response.get_lm_usage()

{'openai/gpt-4.1': {'completion_tokens': 12,
  'prompt_tokens': 349,
  'total_tokens': 361,
  'completion_tokens_details': {'accepted_prediction_tokens': 0,
   'audio_tokens': 0,
   'reasoning_tokens': 0,
   'rejected_prediction_tokens': 0,
   'text_tokens': None},
  'prompt_tokens_details': {'audio_tokens': 0,
   'cached_tokens': 0,
   'text_tokens': None,
   'image_tokens': None}}}

In [None]:
# load data
from datasets import load_dataset

# Request the "train" split directly instead of indexing the returned dict of splits.
queries = load_dataset("weaviate/irpapers-queries", split="train")

In [None]:
# Evaluate the closed-book QA system over every query:
#   1. answer the question with `qa_system`,
#   2. record the prompt/completion token counts for that answer,
#   3. have the LLM judge grade the answer K times and keep the majority verdict.
alignment_scores, input_tokens, output_tokens = [], [], []

# Number of judge calls per query. Use an odd value so a tie is impossible;
# an answer passes only with a strict majority of True votes.
K = 3

start = time.time()
for idx, query in enumerate(queries):
    test_query, ground_truth_answer = query["question"], query["answer"]
    qa_system_response = qa_system(
        question=test_query
    )

    # get_lm_usage() is keyed by model name. Summing over every entry avoids
    # hard-coding "openai/gpt-4.1" (a KeyError as soon as the model changes)
    # and yields 0 rather than crashing when no usage was recorded.
    usage_by_model = qa_system_response.get_lm_usage() or {}
    input_tokens.append(
        sum(usage.get("prompt_tokens", 0) for usage in usage_by_model.values())
    )
    output_tokens.append(
        sum(usage.get("completion_tokens", 0) for usage in usage_by_model.values())
    )

    # Ask the judge K independent times and count the True verdicts.
    ensemble_votes = 0
    for _ in range(K):
        lm_judge_response = judge(
            question=test_query,
            system_answer=qa_system_response.answer,
            correct_answer=ground_truth_answer
        )
        if lm_judge_response.score:
            ensemble_votes += 1

    # Strict majority: identical to the previous `>= K / 2` for K = 3, but an
    # even K no longer lets a tied vote count as a pass.
    alignment_scores.append(1 if ensemble_votes > K / 2 else 0)

    # Report progress only after the query has actually been answered and
    # judged, so the count in the message is accurate.
    if (idx + 1) % 5 == 0:
        print(f"Processed {idx+1} queries in {time.time() - start} seconds...")

alignment_scores = np.array(alignment_scores)
input_tokens = np.array(input_tokens)
output_tokens = np.array(output_tokens)

# Fraction of answers judged aligned, then mean prompt / completion tokens per answer.
print(alignment_scores.mean())
print(input_tokens.mean())
print(output_tokens.mean())


  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [input_value=Message(content='[[ ## an...: None}, annotations=[]), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [input_value=Choices(finish_reason='st...ider_specific_fields={}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(
