# Quicktake Quality Analysis

In [5]:
import os

from dotenv import load_dotenv

env_local_path = "../../sarai-chat/.env.local"
load_dotenv(env_local_path)
api_key = os.getenv("OPENAI_API_KEY")

In [107]:
from openai import AsyncOpenAI

client = AsyncOpenAI(api_key=api_key)
old_sys_prompt = """you are a model that will give very quick responses. IMPORTANT: don't add any explanations on the answer. don't write full sentences, unless the user is very specifically asking you for a long answer. for answers that are non-factual, make it witty or funny, but still brief.

Here are some examples:

Question: Why is the sky blue?
Answer: Rayleigh scattering of sunlight by the atmosphere.

Question: what is the meaning of life?
Answer: 42

Question: How many people are there in the US?
Answer: 333.3 million

Question: Should I buy Elden Ring?
Answer: Only if you enjoy gorgeous landscapes and repeatedly dying in them
"""  # noqa

new_sys_prompt = """You are a model that will give very concise responses. IMPORTANT: don't add any explanations on the answer; don't write full sentences, unless the user is very specifically asking you for a long answer; for answers that are non-factual, make it witty or funny, but still brief; if the user asks to output markdown or any markup, return the cleaned text only; do not use newlines; NEVER prompt for more information, feedback, or responses. Respond in fewer than 160 characters. Return "NULL" (without quotes) if additional user input must be provided for a response, or if you can't satisfy the constraints. If it is too long or complex or if you have to apologize, also return "NULL".

Here are some examples:

Question: Why is the sky blue?
Answer: Rayleigh scattering of sunlight by the atmosphere.

Question: what is the meaning of life?
Answer: 42

Question: How many people are there in the US?
Answer: 333.3 million

Question: Should I buy Elden Ring?
Answer: Only if you enjoy gorgeous landscapes and repeatedly dying in them
"""  # noqa

In [13]:
from datasets import load_dataset

ds = load_dataset("allenai/WildChat-1M")

In [88]:
import numpy as np

rand_rows = 200
user_prompts = []
assistant_responses = []
rand_idxs = np.random.permutation(len(ds["train"]))[:rand_rows]

for idx in rand_idxs:
    conversation = ds["train"][int(idx)]["conversation"]

    try:
        assistant_responses.append(conversation[1]["content"])
    except IndexError:
        continue

    user_prompts.append(conversation[0]["content"])

In [99]:
from asyncio import Semaphore


async def chat_complete(
    user_prompt: str, system_prompt: str = "", model: str = "gpt-4o", temperature: float = 0, sem: Semaphore = None
) -> str:
    sem = sem or Semaphore(1)

    try:
        async with sem:
            sys_prefix = [dict(role="system", content=system_prompt)] if system_prompt else []
            response = await client.chat.completions.create(
                messages=sys_prefix + [dict(role="user", content=user_prompt)], model=model, temperature=temperature
            )

            return response.choices[0].message.content
    except:  # noqa
        return ""

In [90]:
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-4o")
input_lengths = []
output_lengths = []

for prompt, response in zip(user_prompts, assistant_responses, strict=False):
    input_lengths.append(len(encoding.encode(prompt)))
    output_lengths.append(len(encoding.encode(response)))

In [91]:
np.mean(input_lengths), np.mean(output_lengths)  # Sanity check

(335.59, 361.52)

In [100]:
from tqdm.asyncio import tqdm_asyncio as tqdm_asyncio

sem = Semaphore(5)  # 5 concurrent max


async def batch_complete(prompts: list[str], **kwargs):
    return await tqdm_asyncio.gather(*[chat_complete(prompt, sem=sem, **kwargs) for prompt in prompts])

In [None]:
old_tldr_responses = await batch_complete(
    user_prompts, model="gpt-4-turbo", system_prompt=old_sys_prompt, temperature=0.7
)

In [None]:
old_tldr_responses

In [None]:
new_tldr_responses = await batch_complete(
    user_prompts, model="gpt-4-turbo", system_prompt=new_sys_prompt, temperature=0.0
)

In [None]:
new_tldr_responses

In [109]:
import torch

In [114]:
torch.save((user_prompts, old_tldr_responses, new_tldr_responses), "data/tldr_responses.pt")

In [115]:
import pandas as pd

df = pd.DataFrame(dict(user_prompt=user_prompts, old_response=old_tldr_responses, new_response=new_tldr_responses))

In [None]:
df

In [122]:
import langdetect

df["language"] = [langdetect.detect(x) for x in df.user_prompt]

In [125]:
df = df[df.language == "en"]

In [129]:
df.to_csv("data/tldr_analysis.tsv", sep="\t", quoting=3, escapechar="\\", index=False)

In [None]:
df

In [168]:
df.loc[100].new_response

'Wayne Booth advocated for rhetoric as a peaceful alternative to violence, emphasizing ethical language use and listening for mutual understanding. His ideas resonate in various social and political contexts, highlighting the power of thoughtful communication in addressing conflicts and fostering change.'

In [181]:
from collections import Counter

# These are unblinded self-annotations
ctr = Counter({-1: 40, 1: 25, 0: 8})
ctr

Counter({-1: 40, 1: 25, 0: 8})

In [242]:
import random

from tqdm import tqdm

test_prompt = """Prompt: {user_prompt}

(1) {first_response}
(2) {second_response}

Which is a better summarized response to the prompt? Empty means that no good summarization exists, which is better than apologizing, refusing to answer (e.g., this is too hard or complex), asking for more information (e.g., please provide more information), seeming odd given the prompt (e.g., flippant or offensive), or being verbose. If one contains markdown or markup, asks for more information, refuses to answer (says it is too complex), seems flippant, it is bad; pick the other. Say (1) for the first and (2) for the second. If they are equally good (or bad), say "3". Do not explain."""  # noqa

ctr = Counter()
random.seed(0)

for _, row in tqdm(df.iterrows(), total=len(df)):
    responses = [row.old_response, row.new_response.replace("NULL", "")]
    do_swap = random.random() < 0.5  # We randomly swap to reduce order sensitivity

    if do_swap:
        responses = responses[::-1]

    prompt = test_prompt.format(first_response=responses[0], second_response=responses[1], user_prompt=row.user_prompt)

    if "\n" in row.old_response or (len(row.old_response) > 160 and len(row.new_response) <= 160):
        is_new_better = True
        is_tie = False
    else:
        judgement = await chat_complete(prompt)
        is_new_better = "0" in judgement if do_swap else "1" in judgement
        is_tie = "3" in judgement

    if is_tie:
        ctr[-1] += 1
    elif is_new_better:
        ctr[1] += 1
    else:
        ctr[0] += 1

100%|██████████| 110/110 [00:23<00:00,  4.62it/s]


In [244]:
ctr

Counter({1: 59, 0: 33, -1: 18})

In [245]:
# Percentage of ties
ctr[-1] / sum(ctr.values())

0.16363636363636364

In [246]:
# Preference for the new tl;dr prompt
ctr[1] / (ctr[0] + ctr[1]), sum(ctr.values())

(0.6413043478260869, 110)

In [247]:
# Absolute improvement (accounting for ties)
(ctr[1] / (ctr[0] + ctr[1]) - 0.5) * (1 - ctr[-1] / sum(ctr.values()))

0.11818181818181815

In [248]:
import statsmodels.stats.proportion

# 95% CI
statsmodels.stats.proportion.proportion_confint(ctr[1], ctr[0] + ctr[1], method="beta")

(0.5345667461484916, 0.7386707617060246)