# PragmatiCQA With LLMs

## Dataset Analysis

Before you begin coding, read the original paper that introduces the PRAGMATICQA dataset. In a few paragraphs:

- Summarize the key motivations and contributions of the paper.
- Explain in a qualitative manner what makes this dataset challenging for NLP models. What specific pragmatic phenomena does it target?
- Select a few (about 5) sample conversations from the dataset (from different topics) and explain how the pragmatic answer enriches the literal answer that would be produced by a non-cooperative teacher.

## The "Traditional" NLP Approach

### Prepare Dataset

In [1]:
import dspy
from sentence_transformers import SentenceTransformer

# Load an extremely efficient local model for retrieval
model = SentenceTransformer("sentence-transformers/static-retrieval-mrl-en-v1", device="cpu")

# Create an embedder using the model's encode method
embedder = dspy.Embedder(model.encode)

# Traverse a directory and read html files - extract text from the html files
import os
from bs4 import BeautifulSoup
def read_html_files(directory):
    texts = []
    for filename in os.listdir(directory):
        if filename.endswith(".html"):
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                soup = BeautifulSoup(file, 'html.parser')
                texts.append(soup.get_text())
    return texts

modules.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/226 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

0_StaticEmbedding/model.safetensors:   0%|          | 0.00/125M [00:00<?, ?B/s]

In [4]:
# Parameters
max_characters = 10000  
topk_docs_to_retrieve = 5  # Number of passages per query

# Map topic name → DSPy retriever
topicToRetriever = {}

sources_root = "../PragmatiCQA-sources"

for topic in os.listdir(sources_root):
    topic_path = os.path.join(sources_root, topic)
    
    if not os.path.isdir(topic_path):
        continue  
    
    corpus = read_html_files(topic_path)
    corpus = [doc[:max_characters] for doc in corpus]  # Truncate documents to avoid very long ones

    retriever = dspy.retrievers.Embeddings(
        embedder=embedder,
        corpus=corpus,
        k=topk_docs_to_retrieve
    )

    topicToRetriever[topic] = retriever

In [5]:
from transformers import pipeline
qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Device set to use mps:0


In [6]:
# Load jsonl from dataset directory
import json 

def read_data(filename, dataset_dir="../PragmatiCQA/data"):
    corpus = []
    with open(os.path.join(dataset_dir, filename), 'r') as f:
        for line in f:
            corpus.append(json.loads(line))
    return corpus

pcqa_val = read_data("val.jsonl")

### Evaluate Model Performance

In [30]:
from dspy.evaluate import SemanticF1

metric = SemanticF1(decompositional=True)

def evaluate_configs_batch(dataset=pcqa_val, retrieverMap=topicToRetriever, model=qa_model):

    gold_examples = []
    lit_preds = []
    prag_preds = []
    model_preds = []

    for i, doc in enumerate(dataset):
        topic = doc['community']
        if topic not in retrieverMap:
            print(f"Skipping topic: {topic} (not found in retriever map)")
            continue
        print(f"Processing doc number {i} with topic: {topic}")
        qa = doc['qas'][0]
        question = qa['q']
        gold_ans = qa['a']

        # Extract contexts for model
        lit_context = ' '.join([l['text'] for l in qa['a_meta']['literal_obj']])
        prag_context = ' '.join([l['text'] for l in qa['a_meta']['pragmatic_obj']])
        model_context = ' '.join(retrieverMap[topic](question).passages)
        
        # Get answers from model
        lit_answer = model(question=question, context=lit_context)['answer']
        prag_answer = model(question=question, context=prag_context)['answer']
        model_answer = model(question=question, context=model_context)['answer']

        # Prepare gold example
        gold = dspy.Example(question=question, response=gold_ans).with_inputs("question")
        gold_examples.append(gold)

        # Add answers to preds
        lit_preds.append(dspy.Example(question=question, response=lit_answer).with_inputs("question"))
        prag_preds.append(dspy.Example(question=question, response=prag_answer).with_inputs("question"))
        model_preds.append(dspy.Example(question=question, response=model_answer).with_inputs("question"))  

    # Batched scoring
    lit_scores = SemanticF1.batch(gold_examples, lit_preds)
    prag_scores = SemanticF1.batch(gold_examples, prag_preds)
    model_scores = SemanticF1.batch(gold_examples, model_preds)

    return lit_scores, prag_scores, model_scores


In [18]:
import pandas as pd
import numpy as np

def compare_results(lit_scores, prag_scores, retrieved_scores, dataset=pcqa_val):
    rows = []

    for i, doc in enumerate(dataset):
        topic = doc['topic']
        qa = doc['qas'][0]
        question = qa['q']
        gold_answer = qa['a']

        rows.append({
            "Topic": topic,
            "Question": question,
            "Gold Answer": gold_answer,
            "Literal F1": lit_scores[i],
            "Pragmatic F1": prag_scores[i],
            "Retrieved F1": retrieved_scores[i]
        })

    df = pd.DataFrame(rows)

    # Print the average scores
    print("=== AVERAGE SEMANTIC F1 SCORES ===")
    print(f"Literal:   {np.mean(df['Literal F1']):.3f}")
    print(f"Pragmatic: {np.mean(df['Pragmatic F1']):.3f}")
    print(f"Retrieved: {np.mean(df['Retrieved F1']):.3f}")

    return df


In [31]:
# Test evaluate_configs_batch on a single example
test_doc = pcqa_val[0]
test_dataset = [test_doc]

# Run evaluation on the first example only
lit_scores, prag_scores, model_scores = evaluate_configs_batch(dataset=test_dataset)

# Print results
print("=== Example Evaluation ===")
print(f"Topic:        {test_doc['topic']}")
print(f"Question:     {test_doc['qas'][0]['q']}")
print(f"Gold Answer:  {test_doc['qas'][0]['a']}")
print(f"Literal F1:   {lit_scores[0]:.3f}")
print(f"Pragmatic F1: {prag_scores[0]:.3f}")
print(f"Retrieved F1: {model_scores[0]:.3f}")


Processing doc number 0 with topic: A Nightmare on Elm Street


2025/08/04 19:30:30 ERROR dspy.utils.parallelizer: Error for ([Example({'question': 'who is freddy krueger?', 'response': "Freddy Kruger is the nightmare in nighmare on Elm street. Please note, and to be very clear, the system that loads up wiki is not allowing access to Adam Prag, to the page... so I'll have to go from memory.  Normally you can paste things and back up what you are saying, but today that's not happening. alas."}) (input_keys={'question'})], Example({'question': 'who is freddy krueger?'}) (input_keys={'question'})): 'list' object is not callable. Set `provide_traceback=True` for traceback.


Processed 0 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1408.43it/s]

2025/08/04 19:30:30 ERROR dspy.utils.parallelizer: Error for ([Example({'question': 'who is freddy krueger?', 'response': "Freddy Kruger is the nightmare in nighmare on Elm street. Please note, and to be very clear, the system that loads up wiki is not allowing access to Adam Prag, to the page... so I'll have to go from memory.  Normally you can paste things and back up what you are saying, but today that's not happening. alas."}) (input_keys={'question'})], Example({'question': 'who is freddy krueger?'}) (input_keys={'question'})): 'list' object is not callable. Set `provide_traceback=True` for traceback.



Processed 0 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1615.68it/s]

2025/08/04 19:30:30 ERROR dspy.utils.parallelizer: Error for ([Example({'question': 'who is freddy krueger?', 'response': "Freddy Kruger is the nightmare in nighmare on Elm street. Please note, and to be very clear, the system that loads up wiki is not allowing access to Adam Prag, to the page... so I'll have to go from memory.  Normally you can paste things and back up what you are saying, but today that's not happening. alas."}) (input_keys={'question'})], Example({'question': 'who is freddy krueger?'}) (input_keys={'question'})): 'list' object is not callable. Set `provide_traceback=True` for traceback.



Processed 0 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 2688.66it/s]
=== Example Evaluation ===
Topic:        A Nightmare on Elm Street (2010 film)
Question:     who is freddy krueger?
Gold Answer:  Freddy Kruger is the nightmare in nighmare on Elm street. Please note, and to be very clear, the system that loads up wiki is not allowing access to Adam Prag, to the page... so I'll have to go from memory.  Normally you can paste things and back up what you are saying, but today that's not happening. alas.


TypeError: unsupported format string passed to NoneType.__format__

In [None]:
class RAG(dspy.Module):
    def __init__(self, retrieverMap = topicToRetriever):
        self.retrieverMap = retrieverMap
        self.respond = dspy.ChainOfThought('context, question -> response')

    def forward(self, topic, question):
        if topic not in self.retrieverMap:
            raise ValueError(f"Topic '{topic}' not found in retriever map.")
        
        # Gets the retriever for the specified topic 
        search = self.retrieverMap[topic]
        # Retrieves relevant passages using appropriate retriever
        context = search(question).passages
        return self.respond(context=context, question=question)
    
rag = RAG(topicToRetriever)