# Vendor quality labeling run

This notebook collects a random sample of prompts and responses from backfill chats, adding missing QT responses if needed.

The resulting sample can be shared with a vendor for labeling.

In [5]:
import sys  # noqa

sys.path.append("..")
from dotenv import load_dotenv      
from openai import OpenAI
from anthropic import Anthropic
from google import generativeai
from tenacity import retry, stop_after_attempt, wait_fixed
import pandas as pd
from tqdm import tqdm
from sqlmodel import Session, select, text, func
from ypl.backend.db import get_engine
from ypl.db.chats import Chat, ChatMessage, Turn, TurnQuality, Category, MessageType
from ypl.db.language_models import LanguageModel
from ypl.db.users import User
import uuid
import os
import random
import logging

# Only show warnings and above from the "httpx" logger (silences gemini client logs).
httpx_logger = logging.getLogger("httpx")
httpx_logger.setLevel(logging.WARNING)

# Load variables from a .env file into the process environment.
load_dotenv()

True

In [6]:
# Get a random sample of backfill chats and their messages.

excluded_categories = {'Math', 'Code', 'Comparison'}

with Session(get_engine()) as session:
    # Step 1: randomly pick up to 1000 chats whose creator has a backfill job id,
    # ignoring chats and users that have `deleted_at` set.
    random_chats_query = select(Chat.chat_id).join(User, Chat.creator_user_id == User.user_id)
    random_chats_query = random_chats_query.where(
        User.backfill_job_id.is_not(None),
        Chat.deleted_at.is_(None),
        User.deleted_at.is_(None),
    )
    random_chats_query = random_chats_query.order_by(func.random()).limit(1000)
    random_chats = session.exec(random_chats_query).all()
    random_chat_ids = list(map(str, random_chats))

    # Step 2: load the messages (prompt and responses) of the sampled chats, one row per message.
    message_columns = (
        Chat.chat_id,
        ChatMessage.content,
        ChatMessage.message_type,
        ChatMessage.message_id,
        Turn.turn_id,
        TurnQuality.prompt_difficulty,
        Category.name.label("category_name"),
    )
    # Messages without a category are kept; messages in an excluded category are dropped.
    category_allowed = Category.name.notin_(excluded_categories) | Category.name.is_(None)

    query = select(*message_columns)
    query = query.join(Turn, ChatMessage.turn_id == Turn.turn_id)
    query = query.join(TurnQuality, Turn.turn_id == TurnQuality.turn_id)
    query = query.join(Chat, Turn.chat_id == Chat.chat_id)
    # Outer join: a message does not need to have a category.
    query = query.join(Category, ChatMessage.category_id == Category.category_id, isouter=True)
    query = query.where(
        ChatMessage.deleted_at.is_(None),
        Chat.chat_id.in_(random_chat_ids),
        TurnQuality.prompt_difficulty.is_not(None),
        category_allowed,
    )
    query = query.order_by(Turn.turn_id, ChatMessage.created_at)

    df = pd.read_sql(query, session.connection())

print(df.shape)

(1330, 7)


In [7]:
# Helpers to add QT responses, if needed (most of the backfill chats don't have them).

# System prompt passed to every QT model by `qt()`. It asks for a very short, markup-free
# answer (fewer than 140 characters) and lists example prompt/response pairs to imitate.
# This is a plain string (not an f-string), so the "${QUICK_RESPONSE_NULL_MAGIC}" placeholder
# reaches the models verbatim; `process_turn` later drops any response that contains
# 'QUICK_RESPONSE_NULL_MAGIC'.
qt_system_prompt = """
You are a model that will give very concise responses. IMPORTANT: don't add any explanations on the answer; don't write full sentences; do not output markdown or any markup, return the cleaned text only; do not use newline characters; NEVER prompt for more information, feedback, or responses. If the prompt is not serious or seems like gibberish, you can be slightly witty. If you still can't satisfy the prompt, or if the prompt requires realtime data, respond with "${QUICK_RESPONSE_NULL_MAGIC}", but use this rarely. Do not be mean, rude, or blunt; be considerate. Respond in fewer than 140 characters, in the language of the user's message.

Here are some examples. Match the style as best you can: 

Prompt: Why is the sky blue? 
Response: Rayleigh scattering of sunlight by the atmosphere

Prompt: How many people are there in the US? 
Response: 333.3 million

Prompt: During a marathon training regimen, a runner is asked to run "comfortably hard". What does that mean?
Response: Challenging but manageable

Prompt: whats 4*5*6*....1000
Response: A very large number

Prompt: What are some beautiful hikes in the sf bay area
Response: Muir Woods, Mount Tamalpais, Skyline Blvd

Prompt: whats up
Response: Life's good, and you?

Prompt: wassup
Response: Life's great, and you?

Prompt: Write a long, creative saga about a shrew
Response: Tiny shrew braved vast lands, faced perils, found wisdom, befriended creatures, returned home a hero—small size, big heart

Prompt: give me a random battle cat
Response: Crazed Cat

Prompt: how fast can you respond?
Response: "Pretty fast! 🚀 What do you need next?

Prompt: What are the most common subjects in MMLU becnhmark?
Response: Math, science, humanities, social sciences, and professional fields

Prompt: suggest an alternate name for leaderboard
Response: Hall of Fame

Prompt: how's it going
Response: Life's good, and you?

Prompt: where do birds hatch
Response: Nests

Prompt: long cat is ...
Response: Long!

Prompt: What does the fox say?
Response: Ring-ding-ding-ding-dingeringeding!

Prompt: Draw a whale
Response: 🐳

Prompt: Draw a picture of a cat
Response: 🐱

Prompt: Tell me about El Nino in Markdown
Response: A climate pattern marked by warm ocean water in the central and eastern tropical Pacific

Prompt: Use Markdown to explain how the moon affects tides
Response: Its gravitational pull on Earth's oceans creates tides

Prompt: njkwejk wlafje
Response: Whoa there, everything alright?

Prompt: webjnkkjbwer
Response: Looks like a keyboard sneezed!"""

# Names of the models used to generate QT responses, and the API clients used to call them.
qt_model_names = ['gemini-1.5-flash-8b', 'claude-3-5-sonnet-20240620', 'gpt-4o']
client_openai = OpenAI()
client_anthropic = Anthropic()
# NOTE(review): `configure` sets up the generativeai module itself (Gemini requests in `qt` go
# through `generativeai.GenerativeModel`); its return value is presumably None - confirm
# before relying on `client_google`.
client_google = generativeai.configure(api_key=os.getenv('GEMINI_API_KEY'))

# Get the model ids for the QT models: {model name -> language model id, as a string}.
with Session(get_engine()) as session:
    qt_models_query = select(LanguageModel).where(LanguageModel.name.in_(qt_model_names))
    qt_models = {
        model_row.name: str(model_row.language_model_id)
        for model_row in session.exec(qt_models_query).all()
    }

# Get a QT response for a prompt from a single model.
# The retry lives here (per model) so that a failure of one provider does not re-issue the
# requests that already succeeded for the other providers.
@retry(stop=stop_after_attempt(3), wait=wait_fixed(0.5))
def _qt_single(qt_model: str, user_prompt: str):
    """Request one QT response for `user_prompt` from the model named `qt_model`.

    The provider is chosen from the model name prefix ("claude", "gemini" or "gpt-4o") and
    `qt_system_prompt` is used as the system prompt in every case.

    Returns:
        The text of the model's response.

    Raises:
        ValueError: if the model name matches none of the known prefixes. Since this happens
            inside the retried function, it is retried like any other error and surfaces
            through tenacity once the attempts are exhausted.
    """
    if qt_model.startswith("claude"):
        message = client_anthropic.messages.create(
            model=qt_model,
            max_tokens=1024,
            system=qt_system_prompt,
            messages=[
                {"role": "user", "content": user_prompt}
            ],
        )
        return message.content[0].text

    if qt_model.startswith("gemini"):
        google_model = generativeai.GenerativeModel(qt_model, system_instruction=qt_system_prompt)
        content = google_model.generate_content(user_prompt)
        return content.text

    if qt_model.startswith('gpt-4o'):
        completion = client_openai.chat.completions.create(
            model=qt_model,
            messages=[
                {"role": "system", "content": qt_system_prompt},
                # NOTE(review): only this provider gets the "User prompt: " prefix; kept as-is.
                {"role": "user", "content": f"User prompt: {user_prompt}"},
            ],
        )
        return completion.choices[0].message.content

    raise ValueError(f"Unknown model: {qt_model}")


# Get a QT response for a prompt.
def qt(user_prompt: str):
    """Generate a QT response for `user_prompt` with every model in `qt_models`.

    Each model is queried independently, with up to 3 attempts and 0.5s between attempts.

    Returns:
        A dict mapping model name to that model's response text.
    """
    return {qt_model: _qt_single(qt_model, user_prompt) for qt_model in qt_models}

# Add QT responses to a turn, storing it in the database.
def add_qt(turn_id: str, prompt: str):
    """Generate QT responses for `prompt` and store them as messages of turn `turn_id`.

    One quick-response `ChatMessage` is created per QT model, each with a fresh random
    message id, and all of them are committed in a single session.

    Returns:
        A dict mapping the id of each newly created message to its response text.
    """
    generated = qt(prompt)
    added = {}

    with Session(get_engine()) as session:
        for model_name, response_text in generated.items():
            new_message = ChatMessage(
                message_id=uuid.uuid4(),
                turn_id=turn_id,
                message_type=MessageType.QUICK_RESPONSE_MESSAGE,
                content=response_text,
                assistant_model_name=model_name,
                assistant_language_model_id=qt_models[model_name],
            )
            session.add(new_message)
            added[new_message.message_id] = response_text
        # Single commit, after all messages of the turn were added.
        session.commit()

    return added


In [8]:
# Process a turn, adding QT responses if needed, and converting it to a row in the output dataframe.
# For now QT generation is blocking as the number of requests is low, but we can make it async if needed.
def process_turn(turn_id: str, df: pd.DataFrame, qt_only=False, max_responses=3):
    """Convert the messages of one turn into a flat row for the output dataframe.

    Args:
        turn_id: id of the turn (converted to `str`).
        df: the message rows that belong to this turn (one row per message).
        qt_only: if true, only QT responses are candidates; otherwise the two assistant
            responses are candidates as well.
        max_responses: maximum number of responses kept after shuffling the candidates.

    Returns:
        A dict with the turn id, the prompt (id, text, difficulty, category) and
        `response_{i}_message_id` / `response_{i}_text` entries for the selected responses.
        An empty dict is returned when the turn does not have exactly one user message or
        exactly two assistant messages.

    Side effects:
        If the turn has fewer than 3 QT messages, new ones are generated and stored through
        `add_qt`.
    """
    turn_id = str(turn_id)

    prompt = df[df['message_type'] == MessageType.USER_MESSAGE]
    if len(prompt) != 1:
        # Excluded due to the category.
        return {}
    prompt_text = prompt.content.values[0]
    prompt_message_id = str(prompt.message_id.values[0])

    responses = df[df['message_type'] == MessageType.ASSISTANT_MESSAGE]
    if len(responses) != 2:
        print("Wrong number of responses in turn ", turn_id)
        return {}

    qt_response = df[df['message_type'] == MessageType.QUICK_RESPONSE_MESSAGE]
    if len(qt_response) < 3:
        # NOTE(review): with 1 or 2 existing QT messages a complete new set is generated and
        # only the new messages are used below; the existing ones stay in the database.
        qt_messages = add_qt(turn_id, prompt_text)
    else:
        qt_messages = dict(zip(qt_response.message_id.values, qt_response.content.values))

    res = {
        'turn_id': turn_id,
        'prompt_message_id': prompt_message_id,
        'prompt_text': prompt_text,
        'prompt_difficulty': df.prompt_difficulty.values[0],
        # Read the category from the prompt row itself rather than from whichever message
        # happens to be first in the turn.
        'prompt_category': prompt.category_name.values[0],
    }

    # Drop QT responses that are missing or that carry the "no answer" marker.
    all_responses = [
        (qt_message_id, qt_text)
        for qt_message_id, qt_text in qt_messages.items()
        if qt_text is not None and 'QUICK_RESPONSE_NULL_MAGIC' not in qt_text
    ]

    if not qt_only:
        all_responses.extend(zip(responses.message_id.values[:2], responses.content.values[:2]))

    # Randomize the order so the response position carries no information, then truncate.
    random.shuffle(all_responses)
    all_responses = all_responses[:max_responses]

    for i, (response_message_id, response_text) in enumerate(all_responses):
        res[f'response_{i}_message_id'] = str(response_message_id)
        res[f'response_{i}_text'] = response_text

    return res


# Process all turns into a dataframe.
groups = df.groupby('turn_id')
rows = []
for turn_id, turn_df in tqdm(groups):
    # Each group is a (turn_id, rows of that turn) pair.
    rows.append(process_turn(turn_id, turn_df, qt_only=True))

# Skipped turns come back as empty dicts, i.e. all-NaN rows; dropna() removes them together
# with every other row that has a missing value in any column.
df_results = pd.DataFrame(rows).dropna()

  0%|          | 1/420 [00:03<24:14,  3.47s/it]

existing QT messages:
{UUID('eac2f4e2-28a1-42eb-80b1-79fe0eeafe76'): 'Type: `<0`', UUID('cdcbf9ef-ba28-41f7-8c87-dfb6ee828313'): 'Use the less than symbol\n', UUID('564e1755-3009-4e3f-bf67-37c5c6d03682'): '"<0"', UUID('16330fd1-a4ee-4fbd-88a0-2be118504669'): 'Use the formula: =CONCATENATE("<",0)'}


100%|██████████| 420/420 [14:32<00:00,  2.08s/it]


In [9]:
min_count_per_category = 8
total_samples = 300

# Ensure a minimal number of rows for each category.
# Sampling with replacement is only used when a category has fewer rows than the minimum;
# otherwise rows are drawn without replacement so the same turn is not sent to the vendor
# multiple times.
per_category_samples = []
for category in df_results['prompt_category'].unique():
    category_rows = df_results[df_results['prompt_category'] == category]
    needs_replacement = len(category_rows) < min_count_per_category
    per_category_samples.append(
        category_rows.sample(n=min_count_per_category, replace=needs_replacement)
    )
sampled_df = pd.concat(per_category_samples)

# Add remaining samples to meet `total_samples`, drawn from the rows not selected above.
remaining_sample = df_results.drop(sampled_df.index).sample(n=total_samples - len(sampled_df), replace=False)
sampled_df = pd.concat([sampled_df, remaining_sample])

sampled_df.prompt_category.value_counts()

prompt_category
Factual             120
Advice               39
Creative Writing     35
Opinion              21
Education            19
Other                18
Analysis             15
Summarization         9
Multilingual          8
Entertainment         8
Reasoning             8
Name: count, dtype: int64

In [10]:
# Spot check: show 20 random rows, hiding the id columns so the texts are easier to read.
text_columns = [column for column in df_results.columns if not column.endswith('_id')]
df_results.sample(20)[text_columns]

Unnamed: 0,prompt_text,prompt_difficulty,prompt_category,response_0_text,response_1_text,response_2_text
1,"how to write ""<0"" in googlesheets",5.0,Other,"Use the formula: =CONCATENATE(""<"",0)",Use the less than symbol\n,"""<0"""
21,Make a story about a soldier and a female sold...,6.0,Creative Writing,"Enemies on battlefield, lovers in secret. Duty...","On rival lines, a soldier and a female soldier...","Across battle lines, love bloomed.\n"
358,Read the following text and answer: Help me g...,6.0,Education,1. Compare themes in the book and movie; 2. Wa...,"Cross-curricular activities, comparative analy...","Reading, watching, gaming: compare language, d..."
120,if I throw a tennis ball on the floor is it el...,7.0,Factual,Mostly elastic,Elastic\n,"Elastic, if it bounces back."
114,Who sings the most emotional song?,5.0,Factual,Impossible to say\n,Adele,Adele
324,Summarize the flow of air through the respirat...,5.0,Summarization,1. Inhalation through nose/mouth \n2. Air tra...,"Inhalation, air travels to lungs, gas exchange...",Nose/mouth > trachea > bronchi > lungs
311,Can you give me a list of 25 TV Series that ar...,3.0,Entertainment,"The Good Place, Parks and Recreation, Brooklyn...","Ted Lasso, The Office, Brooklyn Nine-Nine, Sch...","The Office, Friends, Parks and Rec, Brooklyn N..."
61,what is the main theme of the book The Man Who...,6.0,Analysis,"Examination of death, existential contemplation",Mortality and the meaning of life\n,"Chekhov didn't write ""The Man Who Died"". Perha..."
56,the story of how I lost my phone and got it back,4.0,Creative Writing,"Lost phone, found phone, happy ending.\n","Lost phone, frantic search, found by kind stra...","Lost phone, retraced steps, asked around, call..."
351,on a scale of 1-10 how good is magnetic tape i...,7.0,Opinion,"8, widely used for flaw detection",6\n,7/10 - Effective for surface and near-surface ...


In [11]:
# Shuffle the sampled rows, then write two CSV files: one for the vendor (without the category
# and difficulty columns) and one that keeps all columns.
shuffled_df = sampled_df.sample(frac=1).reset_index(drop=True)

vendor_df = shuffled_df.drop(columns=['prompt_category', 'prompt_difficulty'])
vendor_df.to_csv('promptstart_300.csv', index=False)
shuffled_df.to_csv('promptstart_300_with_categories_and_difficulty.csv', index=False)