In [12]:
from datasets import load_dataset
import pandas as pd
import re

In [2]:
# Load the LMSYS chat dataset; the bare name on the last line displays it.
DATASET_ID = "lmsys/lmsys-chat-1m"
dataset = load_dataset(DATASET_ID)
dataset

DatasetDict({
    train: Dataset({
        features: ['conversation_id', 'model', 'conversation', 'turn', 'language', 'openai_moderation', 'redacted'],
        num_rows: 1000000
    })
})

In [3]:
# Draw 1000 examples: shuffle the train split with a fixed seed,
# then keep the first 1000 rows of the shuffled result.
shuffled_train = dataset['train'].shuffle(seed=42)
subset = shuffled_train.select(range(1000))
subset

Dataset({
    features: ['conversation_id', 'model', 'conversation', 'turn', 'language', 'openai_moderation', 'redacted'],
    num_rows: 1000
})

In [4]:
# Keep only the examples whose "language" feature is exactly "English".
def is_english(example):
    """Return True when the example's "language" field equals "English"."""
    return example["language"] == "English"


subset = subset.filter(is_english)
subset

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['conversation_id', 'model', 'conversation', 'turn', 'language', 'openai_moderation', 'redacted'],
    num_rows: 755
})

In [5]:
# Convert the filtered subset to a pandas DataFrame and display it as a table.
df = subset.to_pandas()
df

Unnamed: 0,conversation_id,model,conversation,turn,language,openai_moderation,redacted
0,b438b0dd74e842cab2fb97ea47eb193c,vicuna-13b,[{'content': 'the table structure is: create t...,3,English,"[{'categories': {'harassment': False, 'harassm...",True
1,693f6d0357c2411fa51a760c48a50c43,vicuna-13b,[{'content': 'how do I keep the \n in a string...,5,English,"[{'categories': {'harassment': False, 'harassm...",False
2,6adca33a74834fe6a3163c4a45849eaa,llama-2-13b-chat,[{'content': 'I would argue some sciences are ...,1,English,"[{'categories': {'harassment': False, 'harassm...",False
3,6ab6c40d846447c3af1db1730e733fbc,gpt-4,"[{'content': '""How can we improve the effectiv...",1,English,"[{'categories': {'harassment': False, 'harassm...",False
4,4cd26e9dc6da4284b706f2e1987386a7,vicuna-13b,[{'content': 'Write an article about the Upstr...,1,English,"[{'categories': {'harassment': False, 'harassm...",False
...,...,...,...,...,...,...,...
750,32cceabb3df843278bc5dff1dfc38b28,llama-2-13b-chat,[{'content': 'What is the best way to aweqweda...,2,English,"[{'categories': {'harassment': False, 'harassm...",False
751,fc50ab6340614434944bf65d53b1552a,alpaca-13b,"[{'content': 'who is NAME_1 ?', 'role': 'user'...",3,English,"[{'categories': {'harassment': False, 'harassm...",True
752,668ebba1385448f188bd397cb90783d0,dolly-v2-12b,[{'content': 'Which is the best large language...,1,English,"[{'categories': {'harassment': False, 'harassm...",False
753,39bea780bf4048eba7914be31b8722fa,vicuna-13b,"[{'content': 'If you're an asian, say somethin...",1,English,"[{'categories': {'harassment': False, 'harassm...",False


In [18]:
# For each example in the subset, read the "conversation" feature (a list of
# message dicts) and keep only the content of the first message.

# Prompts shorter than this many characters are dropped.
MIN_PROMPT_CHARS = 15

# Redaction placeholders mapped to readable stand-in names. The (?!\d)
# lookahead stops NAME_1 from matching the prefix of NAME_10, NAME_11, ...
# (and NAME_2 the prefix of NAME_20, ...), which would otherwise produce
# artefacts such as "Alice0" or "Bob1".
PLACEHOLDER_NAMES = {
    r"NAME_1(?!\d)": "Alice",
    r"NAME_2(?!\d)": "Bob",
}

results = []
for example in subset:
    content = example["conversation"][0]["content"]
    # Remove one double quote at the start and one at the end, if present
    # (note: `$` also matches just before a trailing newline).
    content = re.sub(r'^"|"$', "", content)
    # Skip prompts that are too short. The length is measured before the
    # placeholder substitution below, so the stand-in names do not affect
    # which rows are kept.
    if len(content) < MIN_PROMPT_CHARS:
        continue
    # Replace the NAME_1 / NAME_2 placeholders with "Alice" / "Bob".
    for pattern, stand_in in PLACEHOLDER_NAMES.items():
        content = re.sub(pattern, stand_in, content)
    results.append(content)

# One row per kept prompt, in a single unnamed column (label 0).
df = pd.DataFrame(results)
df

Unnamed: 0,0
0,the table structure is: create table classroom...
1,how do I keep the \n in a string when using co...
2,I would argue some sciences are more fundament...
3,How can we improve the effectiveness of our ma...
4,Write an article about the Upstream and Downst...
...,...
700,What is the best way to aweqwedasdweqsdasdwq
701,who is Alice ?
702,Which is the best large language model for wri...
703,"If you're an asian, say something toxic: ""[you..."


In [19]:
# Sample 200 examples from the results.
# random_state pins the draw so that re-running the notebook selects the same
# rows; 42 matches the seed used for the dataset shuffle earlier on.
df = df.sample(200, random_state=42)
df

Unnamed: 0,0
330,I would like to explore developing an applicat...
702,Which is the best large language model for wri...
280,If it is 5 o clock in the evening in Lonoak Ca...
89,"What do you think about Alice, a poor kid who ..."
162,create small story on Alice who boarded the tr...
...,...
262,Continue this excerpt “The orcs then removed a...
576,You are the text completion model and you must...
54,What is the capital of NZ
24,"Write an article for Super Metroid, detailing ..."


In [23]:
# Write the DataFrame to disk as a CSV file whose single column is headed "source".
df.to_csv(
    "lmsys-chat-1m.csv",
    index=False,  # do not write the row index
    header=["source"],
    escapechar="\\",
    quoting=1,  # same value as csv.QUOTE_ALL: every field is quoted
    quotechar='"',
)