# PragmatiCQA With LLMs

## Dataset Analysis

Before you begin coding, read the original paper that introduces the PragmatiCQA dataset. In a few paragraphs:

- Summarize the key motivations and contributions of the paper.
- Explain in a qualitative manner what makes this dataset challenging for NLP models. What specific pragmatic phenomena does it target?
- Select a few (about 5) sample conversations from the dataset (from different topics) and explain how the pragmatic answer enriches the literal answer that would be produced by a non-cooperative teacher.

## The "Traditional" NLP Approach

### RAG Module

In [1]:
import dspy
from sentence_transformers import SentenceTransformer

# Local retrieval model (static embeddings), pinned to the CPU.
RETRIEVAL_MODEL_NAME = "sentence-transformers/static-retrieval-mrl-en-v1"
model = SentenceTransformer(RETRIEVAL_MODEL_NAME, device="cpu")

# Wrap the model's encode method so DSPy can use it as an embedder.
embedder = dspy.Embedder(model.encode)

# Traverse a directory and read html files - extract text from the html files
import os
from bs4 import BeautifulSoup
def read_html_files(directory):
    """Return the extracted text of every ``.html`` file directly inside *directory*.

    Files are visited in sorted filename order so that the returned list (and
    any index built on top of it) is reproducible across runs and platforms;
    ``os.listdir`` on its own gives no ordering guarantee. Only the entries of
    *directory* itself are considered (no recursion).

    Args:
        directory: Path of the folder to scan.

    Returns:
        A list with one string per HTML file, as produced by
        ``BeautifulSoup(...).get_text()``.
    """
    texts = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".html"):
            continue
        path = os.path.join(directory, filename)
        with open(path, 'r', encoding='utf-8') as file:
            texts.append(BeautifulSoup(file, 'html.parser').get_text())
    return texts

modules.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/226 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

0_StaticEmbedding/model.safetensors:   0%|          | 0.00/125M [00:00<?, ?B/s]

In [2]:
# Retrieval settings
max_characters = 10000     # Documents are cut to this many characters
topk_docs_to_retrieve = 5  # Number of passages per query

# One DSPy embeddings retriever per topic, keyed by the topic's folder name
topicToRetriever = {}

sources_root = "../PragmatiCQA-sources"

for topic in os.listdir(sources_root):
    topic_path = os.path.join(sources_root, topic)

    # Only sub-directories of the sources root are treated as topics.
    if os.path.isdir(topic_path):
        # Read the topic's HTML pages and truncate each to avoid very long documents.
        corpus = [doc[:max_characters] for doc in read_html_files(topic_path)]

        retriever = dspy.retrievers.Embeddings(
            embedder=embedder,
            corpus=corpus,
            k=topk_docs_to_retrieve,
        )
        topicToRetriever[topic] = retriever

In [None]:
# Make a RAG module with a given retriever.
class RAG(dspy.Module):
    """Retrieval-augmented responder bound to a single retriever.

    NOTE: a later cell defines another class named ``RAG`` (topic-aware);
    once that cell runs, the name ``RAG`` refers to that later definition.
    """

    def __init__(self, search):
        """Store the retriever and build the answer-generation step.

        Args:
            search: Callable retriever; calling it with a question must return
                an object exposing a ``passages`` attribute.
        """
        # Initialise the dspy.Module base state before attaching sub-modules.
        super().__init__()
        self.search = search
        self.respond = dspy.ChainOfThought('context, question -> response')

    def forward(self, question):
        """Retrieve passages for *question* and generate a response from them."""
        context = self.search(question).passages
        return self.respond(context=context, question=question)

In [None]:
# Keep an API key that is already exported in the environment; only fall back
# to the blank placeholder when none is set. Plain assignment would overwrite
# a valid key with an empty string.
os.environ.setdefault("XAI_API_KEY", "")

# Configure DSPy to use the xAI model as the default language model.
lm = dspy.LM('xai/grok-3-mini', api_key=os.environ['XAI_API_KEY'])
dspy.configure(lm=lm)

In [None]:
class RAG(dspy.Module):
    """Retrieval-augmented responder that picks its retriever by topic."""

    def __init__(self, retrieverMap = topicToRetriever):
        """Store the topic-to-retriever mapping and build the answer step.

        Args:
            retrieverMap: Mapping from topic name to a callable retriever.
                Defaults to the module-level ``topicToRetriever`` as it exists
                when this class is defined.
        """
        # Initialise the dspy.Module base state before attaching sub-modules.
        super().__init__()
        self.retrieverMap = retrieverMap
        self.respond = dspy.ChainOfThought('context, question -> response')

    def forward(self, topic, question):
        """Answer *question* using passages retrieved from *topic*'s corpus.

        Args:
            topic: Key into ``retrieverMap`` selecting which retriever to use.
            question: The question text, used both as the retrieval query and
                as input to the response generator.

        Returns:
            The output of the ``context, question -> response`` chain-of-thought step.

        Raises:
            ValueError: If *topic* has no retriever in ``retrieverMap``.
        """
        if topic not in self.retrieverMap:
            raise ValueError(f"Topic '{topic}' not found in retriever map.")

        # Gets the retriever for the specified topic
        search = self.retrieverMap[topic]
        # Retrieves relevant passages using appropriate retriever
        context = search(question).passages
        return self.respond(context=context, question=question)

rag = RAG(topicToRetriever)

### Prepare Dataset

In [None]:
# Load jsonl from dataset directory
import json 

def read_data(filename, dataset_dir="../PragmatiCQA/data"):
    """Load a JSON Lines file and return its records as a list.

    Args:
        filename: Name of the ``.jsonl`` file inside *dataset_dir*.
        dataset_dir: Directory that contains the dataset files.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    path = os.path.join(dataset_dir, filename)
    # Explicit UTF-8 so decoding does not depend on the platform's default
    # encoding (matches how the HTML sources are read).
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline at end of file) carry no record
        # and would make json.loads raise, so they are skipped.
        return [json.loads(line) for line in f if line.strip()]

# Parse val.jsonl from the default dataset directory into a list of records
# (one per line of the file).
pcqa_val = read_data("val.jsonl")

### Evaluate Model Performance

In [None]:
from dspy.evaluate import SemanticF1

# Instantiate the metric: DSPy's SemanticF1 in decompositional mode.
# It is shared as a module-level object by the evaluation code below.
metric = SemanticF1(decompositional=True)

def evaluate_configs_batch(rag=rag, dataset=pcqa_val):
    """Score literal, pragmatic and RAG answers against the gold answers.

    Only the first question of each conversation (``doc['qas'][0]``) is used.
    For that question three candidate answers are built:

    * literal   - the texts of ``a_meta['literal_obj']`` joined with spaces,
    * pragmatic - the texts of ``a_meta['pragmatic_obj']`` joined with spaces,
    * rag       - the output of ``rag(topic, question)``.

    Each candidate is compared with the gold answer ``qa['a']`` using the
    module-level ``metric``.

    Args:
        rag: Callable taking ``(topic, question)`` and returning an object
            with a ``response`` field.
        dataset: Iterable of conversation records with ``topic`` and ``qas`` keys.

    Returns:
        A tuple ``(lit_scores, prag_scores, rag_scores)``; each element is a
        list with one metric score per conversation, in dataset order.
    """
    gold_examples = []
    lit_preds = []
    prag_preds = []
    rag_preds = []

    for doc in dataset:
        topic = doc['topic']
        qa = doc['qas'][0]
        question = qa['q']
        a_meta = qa['a_meta']
        lit_answer = ' '.join(span['text'] for span in a_meta['literal_obj'])
        prag_answer = ' '.join(span['text'] for span in a_meta['pragmatic_obj'])

        # Gold reference for this question.
        gold_examples.append(dspy.Example(question=question, response=qa['a']))

        # Candidate answers, kept index-aligned with gold_examples.
        lit_preds.append(dspy.Example(question=question, response=lit_answer))
        prag_preds.append(dspy.Example(question=question, response=prag_answer))
        rag_preds.append(rag(topic, question))

    def _score_all(preds):
        # The metric is called as metric(example, pred). dspy.Module.batch()
        # takes a single list of examples (its second positional parameter is
        # the thread count), so it cannot pair gold examples with predictions;
        # score each aligned pair directly instead.
        return [metric(gold, pred) for gold, pred in zip(gold_examples, preds)]

    lit_scores = _score_all(lit_preds)
    prag_scores = _score_all(prag_preds)
    rag_scores = _score_all(rag_preds)

    return lit_scores, prag_scores, rag_scores


#### Literal Context

#### Pragmatic Context

#### Retrieved Context