# 1.5 DSPy RAG Optimization

This notebook uses the golden QA dataset to optimize the DSPy RAG retriever.

## Steps:
1. Load golden QA dataset
2. Set up DSPy with OpenAI model
3. Create evaluation metric
4. Load existing RAG agent
5. Run baseline evaluation
6. Optimize retriever with DSPy compiler
7. Evaluate optimized version

In [1]:
import json
import os
from pathlib import Path
import dspy
from dotenv import load_dotenv
from hack.rag_agent import create_agent
from hack.retriever import FaissRetriever

load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

  from .autonotebook import tqdm as notebook_tqdm


## 1. Load Golden QA Dataset

In [2]:
# Load golden QA dataset
with open('golden_qa_with_answers.json', 'r') as f:
    golden_qa = json.load(f)

print(f"Loaded {len(golden_qa)} QA pairs")
print(f"\nExample QA pair:")
print(f"Q: {golden_qa[0]['question']}")
print(f"A: {golden_qa[0]['answer']}")
print(f"Context: {golden_qa[0]['context'][:100]}...")

Loaded 50 QA pairs

Example QA pair:
Q: Why does the simple move described in the passage ruin Black's plan and lead to the eventual loss of his pawns?
A: Because the move prevents Black's pieces from coordinating, leaving his pawns weak and vulnerable, so he will eventually lose them.
Context: A simple move, which destroys Black's plan utterly. Black will now have no concerted action of his p...


## 2. Convert to DSPy Examples

In [3]:
# Convert golden QA to DSPy examples
# Split into train and validation sets
from sklearn.model_selection import train_test_split

train_data, val_data = train_test_split(golden_qa, test_size=0.3, random_state=42)

print(f"Train set: {len(train_data)} examples")
print(f"Validation set: {len(val_data)} examples")

# Create DSPy examples
train_examples = [
    dspy.Example(
        question=qa['question'],
        answer=qa['answer'],
        context=qa['context']
    ).with_inputs('question')
    for qa in train_data
]

val_examples = [
    dspy.Example(
        question=qa['question'],
        answer=qa['answer'],
        context=qa['context']
    ).with_inputs('question')
    for qa in val_data
]

Train set: 35 examples
Validation set: 15 examples


## 3. Set up DSPy with OpenAI

In [4]:
# Configure DSPy with OpenAI
lm = dspy.LM('openai/gpt-4o-mini', api_key=openai_api_key)
dspy.configure(lm=lm)

## 4. Load RAG Agent

In [5]:
# Create RAG agent with FAISS retriever
agent = create_agent(
    faiss_index_path="chess_pdf.faiss",
    workspace_json_path="workspace_with_embeddings.json"
)

print("Agent created successfully")

Loaded FAISS index with 1941 vectors
Loaded 1941 blocks from workspace
Agent created successfully


## 5. Define Evaluation Metric

We'll use a simple metric that checks if the answer contains key information from the golden answer.

In [6]:
def answer_correctness_metric(example, prediction, trace=None):
    """
    Evaluate answer quality by checking if key concepts from golden answer
    are present in the predicted answer.
    
    Returns a score between 0 and 1.
    """
    # Get predicted answer
    if isinstance(prediction, dict):
        pred_answer = prediction.get('answer', '')
    else:
        pred_answer = str(prediction)
    
    golden_answer = example.answer.lower()
    pred_answer = pred_answer.lower()
    
    # Simple keyword-based scoring
    # Extract important words (remove common words)
    stop_words = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
                  'in', 'on', 'at', 'to', 'for', 'of', 'and', 'or', 'but', 'it', 'as',
                  'that', 'this', 'with', 'from', 'by', 'so', 'if', 'not', 'can', 'will'}
    
    golden_words = set(w for w in golden_answer.split() if w not in stop_words and len(w) > 3)
    pred_words = set(w for w in pred_answer.split() if w not in stop_words and len(w) > 3)
    
    if not golden_words:
        return 0.0
    
    # Calculate overlap
    overlap = len(golden_words & pred_words)
    score = overlap / len(golden_words)
    
    return min(1.0, score)


def retrieval_recall_metric(example, prediction, trace=None):
    """
    Check if the golden context appears in retrieved candidates.
    This evaluates retriever quality.
    """
    if trace is None:
        return 0.0
    
    # Look for retrieved candidates in trace
    # This is a simplified check - you may need to adjust based on your trace structure
    golden_context = example.context.lower()
    
    # Check if context substring appears in any retrieved text
    # This is a proxy for checking if the right chunk was retrieved
    if hasattr(prediction, '__dict__'):
        for key, value in prediction.__dict__.items():
            if isinstance(value, str) and golden_context[:50] in value.lower():
                return 1.0
    
    return 0.0


def combined_metric(example, prediction, trace=None):
    """
    Combined metric: 50% answer correctness + 50% retrieval recall.
    """
    answer_score = answer_correctness_metric(example, prediction, trace)
    retrieval_score = retrieval_recall_metric(example, prediction, trace)
    return 0.5 * answer_score + 0.5 * retrieval_score

## 6. Baseline Evaluation

In [10]:
# Evaluate baseline performance on validation set
print("Evaluating baseline RAG agent...")

baseline_scores = []
for i, example in enumerate(val_examples[:10]):  # Start with subset
    try:
        prediction = agent.forward(example.question)
        score = answer_correctness_metric(example, prediction)
        baseline_scores.append(score)
        print(f"Example {i+1}: {score:.2f}")
    except Exception as e:
        print(f"Error on example {i+1}: {e}")
        baseline_scores.append(0.0)

baseline_avg = sum(baseline_scores) / len(baseline_scores) if baseline_scores else 0.0
print(f"\nBaseline average score: {baseline_avg:.3f}")



Evaluating baseline RAG agent...




Example 1: 0.38
Example 2: 0.60
Example 3: 0.41




Example 4: 0.54




Example 5: 0.67




Example 6: 0.63




Example 7: 0.52




Example 8: 0.38




Example 9: 0.80




Error on example 10: litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-j6MkLqiZgm6JZAieTnFaC1x4 on requests per min (RPM): Limit 3, Used 3, Requested 1. Please try again in 20s. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/account/billing.

Baseline average score: 0.493


## 7. Optimize with DSPy

We'll use DSPy's BootstrapFewShot optimizer to improve the agent.

In [None]:
from dspy.teleprompt import BootstrapFewShot

# Configure optimizer
optimizer = BootstrapFewShot(
    metric=answer_correctness_metric,
    max_bootstrapped_demos=4,
    max_labeled_demos=4,
    max_rounds=1
)

print("Starting optimization...")
print("This may take several minutes...\n")

# Compile/optimize the agent
optimized_agent = optimizer.compile(
    agent,
    trainset=train_examples[:20],  # Start with subset
    # valset=val_examples[:10]
)

print("\nOptimization complete!")

## 8. Evaluate Optimized Agent

In [None]:
# Evaluate optimized performance
print("Evaluating optimized RAG agent...")

optimized_scores = []
for i, example in enumerate(val_examples[:10]):
    try:
        prediction = optimized_agent.forward(example.question)
        score = answer_correctness_metric(example, prediction)
        optimized_scores.append(score)
        print(f"Example {i+1}: {score:.2f}")
    except Exception as e:
        print(f"Error on example {i+1}: {e}")
        optimized_scores.append(0.0)

optimized_avg = sum(optimized_scores) / len(optimized_scores) if optimized_scores else 0.0
print(f"\nOptimized average score: {optimized_avg:.3f}")
print(f"Baseline average score: {baseline_avg:.3f}")
print(f"Improvement: {(optimized_avg - baseline_avg):.3f}")

## 9. Compare Specific Examples

In [None]:
# Compare baseline vs optimized on a specific example
test_example = val_examples[0]

print("Question:", test_example.question)
print("\nGolden Answer:", test_example.answer)

baseline_pred = agent.forward(test_example.question)
print("\n--- Baseline Prediction ---")
print(baseline_pred.get('answer', 'No answer'))

optimized_pred = optimized_agent.forward(test_example.question)
print("\n--- Optimized Prediction ---")
print(optimized_pred.get('answer', 'No answer'))

## 10. Save Optimized Agent

In [None]:
# Save the optimized agent
optimized_agent.save('optimized_rag_agent.json')
print("Optimized agent saved to optimized_rag_agent.json")

## 11. Full Validation Set Evaluation

In [None]:
# Run on full validation set (optional - may be slow)
print("Running full validation set evaluation...")

full_baseline_scores = []
full_optimized_scores = []

for i, example in enumerate(val_examples):
    print(f"\rProcessing example {i+1}/{len(val_examples)}", end="")
    
    try:
        # Baseline
        baseline_pred = agent.forward(example.question)
        baseline_score = answer_correctness_metric(example, baseline_pred)
        full_baseline_scores.append(baseline_score)
        
        # Optimized
        optimized_pred = optimized_agent.forward(example.question)
        optimized_score = answer_correctness_metric(example, optimized_pred)
        full_optimized_scores.append(optimized_score)
    except Exception as e:
        print(f"\nError on example {i+1}: {e}")
        full_baseline_scores.append(0.0)
        full_optimized_scores.append(0.0)

print("\n\n=== Full Validation Results ===")
print(f"Baseline average: {sum(full_baseline_scores)/len(full_baseline_scores):.3f}")
print(f"Optimized average: {sum(full_optimized_scores)/len(full_optimized_scores):.3f}")
print(f"Improvement: {(sum(full_optimized_scores)-sum(full_baseline_scores))/len(full_baseline_scores):.3f}")