# Semantic F1 Evaluation Metric

The DSPy SemanticF1 evaluation metric uses an LLM to compare a candidate textual answer to a gold answer.
It computes "precision", "recall", and "F1" scores that measure how well the candidate answer covers the information expected in the gold answer.

In [30]:
import dspy
from sentence_transformers import SentenceTransformer
# Language model used as the DSPy default for the rest of the notebook.
lm = dspy.LM(
    'xai/grok-3-mini',
    max_tokens=6000,
    temperature=0.1,
    top_p=0.9,
)
dspy.configure(lm=lm)

# Sentence-embedding model loaded on the CPU; its `encode` method is wrapped
# in a dspy.Embedder so it can be handed to a DSPy retriever.
model = SentenceTransformer(
    "sentence-transformers/static-retrieval-mrl-en-v1",
    device="cpu",
)
embedder = dspy.Embedder(model.encode)

# Traverse a directory and read html files - extract text from the html files
import os
from bs4 import BeautifulSoup
def read_html_files(dir_name, directory="../PragmatiCQA-sources"):
    """Return the text content of every ``.html`` file in one source folder.

    Args:
        dir_name: Name of the sub-folder of ``directory`` to scan.
        directory: Root folder that contains ``dir_name``.

    Returns:
        A list with one string per HTML file (the result of
        ``BeautifulSoup(...).get_text()``), ordered by file name.

    Raises:
        FileNotFoundError: If the folder does not exist.
    """
    folder = os.path.join(directory, dir_name)
    texts = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the corpus
    # (and anything indexed from it) identical across runs and machines.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".html"):
            continue
        with open(os.path.join(folder, filename), 'r', encoding='utf-8') as file:
            texts.append(BeautifulSoup(file, 'html.parser').get_text())
    return texts

# Build a retriever for a specific topic: read the HTML files in the corresponding folder and index them.
def make_search(topic, k=5, brute_force_threshold=10000):
    """Build an embedding retriever over the HTML pages of one topic.

    Args:
        topic: Name of the topic folder, passed to ``read_html_files``.
        k: Number of documents to retrieve per search query.
        brute_force_threshold: Forwarded unchanged to
            ``dspy.retrievers.Embeddings``. This is not a character limit:
            documents are not truncated anywhere in this function.
            NOTE(review): in DSPy this appears to be the corpus size (number
            of documents) from which an approximate index is used instead of
            brute-force search -- confirm against the installed version.

    Returns:
        The ``dspy.retrievers.Embeddings`` retriever built over the topic's
        documents with the module-level ``embedder``.
    """
    corpus = read_html_files(topic)
    return dspy.retrievers.Embeddings(
        embedder=embedder,
        corpus=corpus,
        k=k,
        brute_force_threshold=brute_force_threshold,
    )

# Make a RAG module with a given retriever.
class RAG(dspy.Module):
    """Retrieve-then-answer module.

    The retriever passed to the constructor is called with the question; the
    passages it returns are given, together with the question, to a
    chain-of-thought predictor with signature
    ``context, question -> response``.
    """

    def __init__(self, search):
        """Store the retriever and create the answering predictor.

        Args:
            search: Callable taking a question and returning an object with a
                ``passages`` attribute.
        """
        # Let dspy.Module set up its own state before sub-modules are attached.
        super().__init__()
        self.search = search
        self.respond = dspy.ChainOfThought('context, question -> response')

    def forward(self, question):
        """Retrieve passages for ``question`` and return the predictor's output."""
        context = self.search(question).passages
        return self.respond(context=context, question=question)

In [31]:
# Load jsonl from dataset directory
import json
import os  

def read_data(filename, dataset_dir="../PragmatiCQA/data"):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        filename: Name of the ``.jsonl`` file inside ``dataset_dir``.
        dataset_dir: Folder that contains the dataset files.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    path = os.path.join(dataset_dir, filename)
    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding (consistent with how the HTML sources are read).
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. an empty last line) carry no record and would make
        # json.loads raise, so they are skipped.
        return [json.loads(line) for line in f if line.strip()]

# Load the test set; the file name is resolved against read_data's default dataset directory.
pcqa_test = read_data("test.jsonl")


In [32]:
from pprint import pprint
# Pretty-print the first item in the test set to inspect its structure.
pprint(pcqa_test[0])


{'community': 'The Legend of Zelda',
 'genre': 'Games',
 'qas': [{'a': 'The Legend of Zelda came out as early as 1986 for the Famicom '
               'in Japan, and was later released in the western world, '
               'including Europe and the US in 1987. Would you like to know '
               'about the story?',
          'a_meta': {'literal_obj': [{'endKey': '9cbccabd-66be-4a46-bd8b-f59a299c987d',
                                      'startKey': '1f4f808a-8560-4894-b892-15fa3c33887a',
                                      'text': 'FDS release February 21, '
                                              '1986\n'},
                                     {'endKey': '738bff65-b4f9-4660-bd18-79722ed67a40',
                                      'startKey': 'a0d9d5c5-18bb-4be4-825e-fca2900db18e',
                                      'text': 'The Legend of Zelda is the '
                                              'first installment of the Zelda '
                                   

In [33]:
# Evaluate simple RAG on a specific topic: take the first question of the first
# test item and collect its gold answers for comparison with the prediction.
s = pcqa_test[0]
topic, qa = s['topic'], s['qas'][0]
question, answer = qa['q'], qa['a']

# Literal and pragmatic gold answers: the annotated spans joined by single spaces.
lit_spans = [span['text'] for span in qa['a_meta']['literal_obj']]
lit_answer = ' '.join(lit_spans)
prag_spans = [span['text'] for span in qa['a_meta']['pragmatic_obj']]
prag_answer = ' '.join(prag_spans)

print(
    f"Topic: {topic}",
    f"Question: {question}",
    f"Literal spans: {lit_spans}",
    f"Pragmatic spans: {prag_spans}",
    f"Expected Answer: {answer}",
    sep="\n",
)


Topic: The Legend of Zelda
Question: What year did the Legend of Zelda come out?
Literal spans: ['FDS release February 21, 1986\n', 'The Legend of Zelda is the first installment of the Zelda series. ', ' It centers its plot around a boy named Link , who becomes the central protagonist throughout the series. ']
Pragmatic spans: ['It came out as early as 1986 for the Famicom in Japan, and was later released in the western world, including Europe and the US in 1987.']
Expected Answer: The Legend of Zelda came out as early as 1986 for the Famicom in Japan, and was later released in the western world, including Europe and the US in 1987. Would you like to know about the story?


In [34]:
# Build the retriever for the topic and wrap it in the RAG module.
search = make_search(topic)
rag = RAG(search)

# Answer the question, then query the retriever once more on its own so the
# retrieved passages can be displayed next to the answer.
pred_answer = rag(question).response
context = rag.search(question).passages

print(f"Predicted Answer: {pred_answer}", f"Context: {context}", sep="\n")



Predicted Answer: The Legend of Zelda was first released in 1986.
Context: ["\n\n\n\n\n\n\n\n      This article is a short summary of Shigeru Miyamoto.\n      NintendoWiki features\n      \n       a more in-depth article\n      \n      .\n     \n\n\n\n\n\n\n\n      Shigeru Miyamoto\n      宮本 茂\n      みやもと しげる\n     \n\n\n\n      Current Position\n     \n\n\n       General producer of\n       \n\n         The Legend of Zelda\n        \n        series\n       \n\n\n\n\n\n      Birthday\n     \n\n\n       November 16, 1952\n      \n\n\n\n\n      Birthplace\n     \n\n\n       Sonobe, Kyoto, Japan\n      \n\n\n\n\n\n\n    Shigeru Miyamoto\n   \n   is a video game designer and producer, creator of\n   \n\n     The Legend of Zelda\n    \n    series\n   \n   .\n  \n\n\n\n     Contents\n    \n\n\n\n\n\n       1\n      \n\n       Biography\n      \n\n\n\n\n\n       2\n      \n\n\n        The Legend of Zelda\n       \n       Games\n      \n\n\n\n\n\n       3\n      \n\n       Trivia\n      \n\n\n

In [35]:
from dspy.evaluate import SemanticF1

# Instantiate the metric.
metric = SemanticF1(decompositional=True)

# Gold example: the question, the expected response and the retrieved context
# are ordinary fields; the input field is declared with with_inputs().
# (Passing `inputs={...}` to the constructor does not mark inputs -- the
# keyword is simply stored as a field named "inputs".)
example = dspy.Example(question=question, response=answer, context=context).with_inputs('question')

# Produce a prediction from the RAG module.
pred = rag(example.question)

# Compute the metric score for the prediction.
score = metric(example, pred)

print(f"Question: \t {example.question}\n")
print(f"Gold Response: \t {example.response}\n")
print(f"Predicted Response: \t {pred.response}\n")
print(f"Semantic F1 Score: {score:.2f}")



Question: 	 What year did the Legend of Zelda come out?

Gold Response: 	 The Legend of Zelda came out as early as 1986 for the Famicom in Japan, and was later released in the western world, including Europe and the US in 1987. Would you like to know about the story?

Predicted Response: 	 The Legend of Zelda was first released in 1986.

Semantic F1 Score: 0.50


In [36]:
# Score the literal and pragmatic gold answers against the same gold example.
# Each candidate is wrapped in a dspy.Prediction carrying only the `response`
# field, instead of a dspy.Example with a stray `inputs=` keyword (which would
# just be stored as a field named "inputs").
print(f"Literal Answer: {lit_answer}")
print(
    "Score for Literal Answer: "
    f"{metric(example, dspy.Prediction(response=lit_answer)):.2f}"
)

print(f"Pragmatic Answer: {prag_answer}")
print(
    "Score for Pragmatic Answer: "
    f"{metric(example, dspy.Prediction(response=prag_answer)):.2f}"
)


Literal Answer: FDS release February 21, 1986
 The Legend of Zelda is the first installment of the Zelda series.   It centers its plot around a boy named Link , who becomes the central protagonist throughout the series. 
Score for Literal Answer: 0.33
Pragmatic Answer: It came out as early as 1986 for the Famicom in Japan, and was later released in the western world, including Europe and the US in 1987.
Score for Pragmatic Answer: 0.80


In [37]:
# Add up the cost recorded in each entry of the LM history, skipping entries
# whose cost is None.
cost = sum(
    entry['cost']
    for entry in lm.history
    if entry['cost'] is not None
)
print(f"Total Cost: {cost:.2f} usd")

Total Cost: 0.04 usd
