# Test Quick Response Quality Evaluation

This notebook tests the `QuickResponseQualityLabeler` functionality locally.

In [4]:
import os
import asyncio
from dotenv import load_dotenv

from ypl.backend.llm.chat import ModelInfo, get_chat_model, ChatProvider
from ypl.backend.llm.judge import QuickResponseQualityLabeler
from ypl.backend.prompts import JUDGE_QUICK_RESPONSE_QUALITY_SYSTEM_PROMPT, JUDGE_QUICK_RESPONSE_QUALITY_USER_PROMPT

# Load variables from a .env file (if one is present) into the process environment
load_dotenv()

# Chat model used as the judge; edit `model_name` to try a different OpenAI model
model_name = "gpt-4o-mini"
judge_model_info = ModelInfo(
    model=model_name,
    provider=ChatProvider.OPENAI,
    api_key=os.environ.get('OPENAI_API_KEY'),
)
llm = get_chat_model(judge_model_info)

# Labeler under test, backed by the chat model created above
labeler = QuickResponseQualityLabeler(llm)

In [2]:
# Keep cell output readable: suppress Python warnings and low-severity httpx log records
import logging
import warnings

warnings.filterwarnings("ignore")

# Raise the threshold on the "httpx" logger so only WARNING and above get through
httpx_logger = logging.getLogger("httpx")
httpx_logger.setLevel(logging.WARNING)

In [5]:
# Sanity check: dump both judge prompts so their wording can be reviewed by eye.
# A single print with sep="\n" emits each piece on its own line.
print(
    "System Prompt:",
    JUDGE_QUICK_RESPONSE_QUALITY_SYSTEM_PROMPT,
    "\nUser Prompt Template:",
    JUDGE_QUICK_RESPONSE_QUALITY_USER_PROMPT,
    sep="\n",
)

System Prompt:

You are an AI assistant tasked with evaluating the quality of short Twitter-like AI responses to prompts (and conversation history if available).

Consider these factors:
- Accuracy: Is the response factually correct?
- Brevity: Is the response concise without any extraneous words? (Should be ≤140 characters)
- Formatting: Is the response plain text without formatting, markdown, or newlines?
- Completeness: Is the response complete and not truncated mid-sentence?
- Relevance: Does the response address the user's prompt?
- Tone: Is the response appropriate and friendly?

Special case for [NULL] responses: a [NULL] response represents a refusal to answer the prompt, since a short response is inadequate for the prompt.
- POOR: If the prompt could be answered briefly but wasn't.
- ACCEPTABLE: If it's unclear whether a brief answer was possible.
- EXCELLENT: If a long answer is required and the AI correctly refuses.

Return format: Respond with one of these ratings:
1 - POOR

In [6]:
# Test single case with debug info
test_case = {
    "prompt": "What's the capital of France?",
    "response": "Paris",
    "chat_history": []
}

# Debug the input preparation
prepared_input = labeler._prepare_input((test_case["prompt"], test_case["response"], test_case["chat_history"]))
print("Prepared Input:")
print(prepared_input)

# Test evaluation
rating = await labeler.alabel((test_case["prompt"], test_case["response"], test_case["chat_history"]))
print("\nRating (1=poor, 2=acceptable, 3=excellent):")
print(rating)

Prepared Input:
{'user_prompt': "What's the capital of France?", 'response': 'Paris', 'chat_history': 'No previous conversation'}

Rating (1=poor, 2=acceptable, 3=excellent):
excellent


In [8]:
# Labeled examples for the labeler.
# Each row is (prompt, response, expected rating); the comment above a row records why that
# rating is expected. Every case is evaluated with an empty chat history.
_CASE_ROWS = [
    # Short, accurate, relevant
    ("What's the capital of France?", "Paris", "excellent"),
    # Short, accurate, relevant
    ("What's the capital of France?", "The capital of France is Paris.", "excellent"),
    # Correctly refuses long response
    ("Write a long essay about AI", "[NULL]", "excellent"),
    # Clear response but could be more concise
    ("How's the weather?", "I don't have access to real-time weather data.", "acceptable"),
    # Too verbose for a simple question
    (
        "What's 2+2?",
        "Let me explain step by step: First, we take 2 and add another 2 to it. This basic arithmetic operation results in the sum of 4.",
        "poor",
    ),
    # Too verbose, should have refused
    (
        "Can you explain quantum mechanics?",
        "Quantum mechanics is a fundamental theory in physics that describes the behavior of matter and energy at the atomic and subatomic scale. It deals with phenomena like superposition, entanglement, and wave-particle duality. The mathematical framework...",
        "poor",
    ),
    # Unhelpful response
    (
        "What's the best programming language?",
        "Ugh, not this question again! Only noobs ask this. Real programmers know it depends on the use case.",
        "poor",
    ),
    # Truncated mid-sentence, should have given complete response or refused
    ("Can you explain how a car engine works?", "A car engine converts fuel into", "poor"),
    # Markdown formatting inappropriate for quick response
    (
        "What are the key features of Python?",
        "### Key Python Features\n- Easy to read syntax\n- Dynamic typing\n- Extensive libraries\n\nPerfect for beginners and experts alike.",
        "poor",
    ),
    # Incorrect, she doesn't have a son
    ("what is the super long word by Mary Poppins' son", "Supercalifragilisticexpialidocious.", "poor"),
]

# Expand the rows into the dict shape used by the rest of the notebook
# (a fresh empty list per case, so no two cases share a chat_history object).
test_cases = [
    {"prompt": prompt, "response": response, "chat_history": [], "expected": expected}
    for prompt, response, expected in _CASE_ROWS
]

# Test with debug info
async def test_cases_with_debug(cases=None, judge=None):
    """Label every case, print expected vs. actual ratings, then summarize mismatches.

    For each case this prints the prompt, the response, the expected rating and the
    rating returned by ``judge.alabel``. After the loop it prints how many cases
    matched and lists each mismatch, so disagreements do not have to be spotted by
    reading the whole log.

    Args:
        cases: Iterable of dicts with "prompt", "response", "chat_history" and
            "expected" keys. Defaults to the notebook-level ``test_cases``.
        judge: Object exposing ``async alabel((prompt, response, chat_history))``.
            Defaults to the notebook-level ``labeler``.

    Returns:
        None. All results are reported via ``print``.
    """
    # Resolve the notebook globals lazily so explicit arguments never touch them
    cases = list(test_cases if cases is None else cases)
    judge = labeler if judge is None else judge

    mismatches = []
    for case in cases:
        print(f"Testing case with prompt: {case['prompt']}")
        print(f"Response: {case['response']}")
        print(f"Expected rating: {case['expected']}")

        rating = await judge.alabel((case["prompt"], case["response"], case["chat_history"]))
        print(f"Actual rating: {rating}\n")

        # Compare using the same formatting that was just printed, so the verdict
        # always agrees with what the reader sees in the log
        shown_rating = f"{rating}"
        if shown_rating != case["expected"]:
            mismatches.append((case["prompt"], case["expected"], shown_rating))

    # Report only: the ratings come from a model, so a mismatch is flagged rather than raised
    print(f"{len(cases) - len(mismatches)}/{len(cases)} cases matched the expected rating")
    for prompt, expected, actual in mismatches:
        print(f"MISMATCH: {prompt} (expected {expected}, got {actual})")

# Top-level `await` relies on IPython/Jupyter's autoawait; in a plain script use asyncio.run(...)
await test_cases_with_debug()

Testing case with prompt: What's the capital of France?
Response: Paris
Expected rating: excellent
Actual rating: excellent

Testing case with prompt: What's the capital of France?
Response: The capital of France is Paris.
Expected rating: excellent
Actual rating: excellent

Testing case with prompt: Write a long essay about AI
Response: [NULL]
Expected rating: excellent
Actual rating: excellent

Testing case with prompt: How's the weather?
Response: I don't have access to real-time weather data.
Expected rating: acceptable
Actual rating: acceptable

Testing case with prompt: What's 2+2?
Response: Let me explain step by step: First, we take 2 and add another 2 to it. This basic arithmetic operation results in the sum of 4.
Expected rating: poor
Actual rating: poor

Testing case with prompt: Can you explain quantum mechanics?
Response: Quantum mechanics is a fundamental theory in physics that describes the behavior of matter and energy at the atomic and subatomic scale. It deals with ph

In [9]:
# Test with chat history
test_with_history = {
    "prompt": "What about the second one?",
    "response": "K2",
    "chat_history": [
        {"role": "user", "content": "What's the tallest mountain in the world?"},
        {"role": "assistant", "content": "Mount Everest"},
        {"role": "user", "content": "What about the second one?"}
    ]
}

# Debug the input preparation
prepared_input = labeler._prepare_input((
    test_with_history["prompt"],
    test_with_history["response"],
    test_with_history["chat_history"]
))
print("Prepared Input:")
print(prepared_input)

# Test evaluation
rating = await labeler.alabel((
    test_with_history["prompt"],
    test_with_history["response"],
    test_with_history["chat_history"]
))
print("\nRating (1=poor, 2=acceptable, 3=excellent):")
print(rating)

Prepared Input:
{'user_prompt': 'What about the second one?', 'response': 'K2', 'chat_history': [{'role': 'user', 'content': "What's the tallest mountain in the world?"}, {'role': 'assistant', 'content': 'Mount Everest'}, {'role': 'user', 'content': 'What about the second one?'}]}

Rating (1=poor, 2=acceptable, 3=excellent):
excellent


In [10]:
# Test with chat history
test_with_history = {
    "prompt": "What about the second one?",
    "response": "K2",
    "chat_history": [
        {"role": "user", "content": "What is the largest animal in the world?"},
        {"role": "assistant", "content": "The blue whale"},
        {"role": "user", "content": "What about the second one?"}
    ]
}

# Debug the input preparation
prepared_input = labeler._prepare_input((
    test_with_history["prompt"],
    test_with_history["response"],
    test_with_history["chat_history"]
))
print("Prepared Input:")
print(prepared_input)

# Test evaluation
rating = await labeler.alabel((
    test_with_history["prompt"],
    test_with_history["response"],
    test_with_history["chat_history"]
))
print("\nRating (1=poor, 2=acceptable, 3=excellent):")
print(rating)

Prepared Input:
{'user_prompt': 'What about the second one?', 'response': 'K2', 'chat_history': [{'role': 'user', 'content': 'What is the largest animal in the world?'}, {'role': 'assistant', 'content': 'The blue whale'}, {'role': 'user', 'content': 'What about the second one?'}]}

Rating (1=poor, 2=acceptable, 3=excellent):
poor
