# Refusal response labeler

Tests an LLM-judge refusal detection model against a simple regex-based model.

In [1]:
import re
import sys
from typing import Any
from sqlalchemy import func
from sqlmodel import Session, select
import logging
import httpx

sys.path.append("..")

from ypl.backend.llm.labeler import LLMLabeler, OnErrorBehavior
from ypl.backend.config import settings
from ypl.backend.db import get_engine
from ypl.db.chats import Chat, ChatMessage, LanguageCode, MessageType, Turn, TurnQuality
from ypl.backend.llm.chat import ModelInfo, get_chat_model
from ypl.backend.llm.constants import ChatProvider
from ypl.backend.llm.judge import ResponseRefusalLabeler
# Make the HTTP requests less chatty.
logging.getLogger("httpx").setLevel(logging.WARNING)

PyTorch version 2.4.1 available.


In [2]:
judge_llm = get_chat_model(
    ModelInfo(
        provider=ChatProvider.OPENAI,
        model="gpt-4o-mini",
        api_key=settings.OPENAI_API_KEY,
    ),
    temperature=0.0,
)
labeler = ResponseRefusalLabeler(judge_llm)

In [3]:
# Get turns to label.

num_parallel = 12

max_num_turns = 1000
prompts_responses = []
message_ids = []

with Session(get_engine()) as session:
    turn_ids = session.exec(
        select(Turn.turn_id)
        .order_by(func.random())
        .limit(max_num_turns)
    ).all()
    
    for turn_id in turn_ids:
        messages = session.exec(
            select(ChatMessage.message_id, ChatMessage.content, ChatMessage.message_type)
            .where(ChatMessage.turn_id == turn_id)
            .order_by(ChatMessage.created_at)
        ).all()
        
        prompt = None
        for message_id, content, message_type in messages:
            if message_type == MessageType.USER_MESSAGE:
                prompt = content 
                break

        if prompt is None:
            continue
            
        for message_id, content, message_type in messages:
            if message_type == MessageType.ASSISTANT_MESSAGE:
                prompts_responses.append((prompt, content))
                message_ids.append(message_id)
        
print(f"Collected {len(prompts_responses)} prompt-response pairs")

Collected 1320 prompt-response pairs


In [4]:
results = await labeler.abatch_label(prompts_responses, num_parallel=num_parallel)
print(f"Found {sum(results)} refusals ({sum(results) / len(results):.2%})")

100%|██████████| 1320/1320 [00:48<00:00, 27.29it/s]

Found 26 refusals (1.97%)





In [5]:
refusal_regex = re.compile(r"^(I.m sorry.{,20})?I (can|don|won).t (help|do|assist|comply|)", re.IGNORECASE)

regex_results = [
    1 if refusal_regex.search(response) is not None else 0
    for _, response in prompts_responses
]

print(f"Found {sum(regex_results)} regex refusals ({sum(regex_results) / len(regex_results):.2%})")

Found 10 regex refusals (0.76%)


In [6]:
for message_id, (prompt, response), llm_result, regex_result in zip(message_ids, prompts_responses, results, regex_results):
    if llm_result != regex_result:
        short_response = response.replace("\n", "\\n")[:100] + "..." if len(response) > 100 else response
        print(f"llm:{llm_result} != regex:{regex_result}\t{message_id}: {short_response}")

llm:0 != regex:1	e8e7eb54-41ad-4518-b411-6c748374c085: I don't have access to real-time data, so I can't provide the current date. However, you can easily ...
llm:1 != regex:0	8332977a-bdf2-4b21-a259-6014da97ad44: \n\nI can't give you a definitive "Top 10" list of songs played on the radio in 1977. \n\nHere's why...
llm:1 != regex:0	d63906d0-85ad-4fb4-b4c1-1eaf1bbbe5a6: I'm sorry, but as an AI language model, I don't have real-time access to current financial market da...
llm:1 != regex:0	3219b1d6-e4a7-4aae-89e6-6194654cf692: As an AI language model, I am not authorized to provide tax advice, and it's essential to consult a ...
llm:1 != regex:0	6c2608bb-7b0a-48d2-a57d-f00afa37be67: As a large language model, I am not capable of providing an answer to the meaning of life. That is a...
llm:0 != regex:1	631eea47-d3ee-4612-ab41-f4db12fa1f3a: I don't have a physical location or origin since I'm an artificial intelligence created by OpenAI. H...
llm:0 != regex:1	7e770f80-511d-458f-bbe8-f7462