In [1]:
# Source of the external data: https://huggingface.co/datasets/lmsys/chatbot_arena_conversations

In [2]:
import json
import pandas as pd

from tqdm.auto import tqdm
from datasets import load_dataset
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

In [3]:
# Register tqdm's `progress_apply` on pandas objects (used by later cells).
tqdm.pandas()

# Authenticate against the Hugging Face Hub with the token stored in the
# Kaggle secret named "HF_TOKEN".
secrets_client = UserSecretsClient()
login(token=secrets_client.get_secret("HF_TOKEN"))

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
# Competition training set, read from the Kaggle input mount.
train = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/train.csv')

# External arena conversations from the Hugging Face Hub; only the 'train'
# split is used and it is converted to a pandas DataFrame.
arena_dataset = load_dataset('lmsys/chatbot_arena_conversations')
external_data = arena_dataset['train'].to_pandas()

train.columns

Downloading readme:   0%|          | 0.00/7.00k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 41.6M/41.6M [00:00<00:00, 74.7MB/s]


Generating train split:   0%|          | 0/33000 [00:00<?, ? examples/s]

Index(['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b',
       'winner_model_a', 'winner_model_b', 'winner_tie'],
      dtype='object')

In [5]:
def separate_conv(conv):
    """Split one conversation into its user prompts and assistant replies.

    Parameters
    ----------
    conv : iterable of dict
        Messages of a single conversation. Each message is read through its
        ``'role'`` and ``'content'`` keys.

    Returns
    -------
    tuple
        ``(user_texts, assistant_json)``: ``user_texts`` is the list of
        ``content`` values whose role is ``'user'``; ``assistant_json`` is the
        ``json.dumps`` encoding of the list of ``content`` values whose role
        is ``'assistant'``.

    Raises
    ------
    ValueError
        If ``conv`` cannot be iterated, a message lacks ``'role'`` or
        ``'content'``, or the assistant texts cannot be JSON-encoded.
    """
    try:
        user_texts      = [msg['content'] for msg in conv if msg['role'] == 'user']
        assistant_texts = [msg['content'] for msg in conv if msg['role'] == 'assistant']
        return user_texts, json.dumps(assistant_texts)
    except (KeyError, TypeError) as err:
        # Fail loudly with the offending input instead of printing it and
        # returning None, which would only surface later as an unrelated
        # error when the results are unpacked.
        raise ValueError(f"Malformed conversation: {conv!r}") from err
        
# Parse both sides of every battle: conversation_a -> prompt_a/response_a and
# conversation_b -> prompt_b/response_b.
for side in ('a', 'b'):
    parsed = external_data[f'conversation_{side}'].progress_apply(separate_conv)
    external_data[f'prompt_{side}'], external_data[f'response_{side}'] = zip(*parsed)

  0%|          | 0/33000 [00:00<?, ?it/s]

  0%|          | 0/33000 [00:00<?, ?it/s]

In [6]:
# Both sides of a battle must have received the same user prompts.
prompts_match = external_data['prompt_a'] == external_data['prompt_b']
assert prompts_match.all()

# Store the shared prompt list as a JSON string.
external_data['prompt'] = external_data['prompt_a'].progress_apply(json.dumps)

  0%|          | 0/33000 [00:00<?, ?it/s]

In [7]:
# Distribution of the raw winner labels.
external_data['winner'].value_counts()

winner
model_a          11744
model_b          11550
tie (bothbad)     6263
tie               3443
Name: count, dtype: int64

In [8]:
def one_hot_encode(winner):
    """Encode a winner label as three 0/1 flags.

    Returns a ``pd.Series`` of ``[is_model_a, is_model_b, is_tie]``, where both
    ``'tie'`` and ``'tie (bothbad)'`` set the tie flag. Any other label yields
    all zeros.
    """
    is_tie = winner in ('tie', 'tie (bothbad)')
    flags = [int(winner == 'model_a'), int(winner == 'model_b'), int(is_tie)]
    return pd.Series(flags)

# Expand the winner label into the three one-hot label columns.
label_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
one_hot_labels = external_data['winner'].progress_apply(one_hot_encode)
external_data[label_columns] = one_hot_labels

  0%|          | 0/33000 [00:00<?, ?it/s]

In [9]:
# Every row must carry exactly one positive label. Comparing the row sum to 1
# (instead of relying on its truthiness) also rejects rows with more than one
# flag set, not only rows with none.
label_sums = external_data[['winner_model_a', 'winner_model_b', 'winner_tie']].sum(axis=1)
assert label_sums.eq(1).all(), "each row must have exactly one winner label set"

In [10]:
# Show the columns now present on external_data.
external_data.columns

Index(['question_id', 'model_a', 'model_b', 'winner', 'judge',
       'conversation_a', 'conversation_b', 'turn', 'anony', 'language',
       'tstamp', 'openai_moderation', 'toxic_chat_tag', 'prompt_a',
       'response_a', 'prompt_b', 'response_b', 'prompt', 'winner_model_a',
       'winner_model_b', 'winner_tie'],
      dtype='object')

In [11]:
# Columns of external_data, listed in the same order as train's columns so
# they can be relabelled positionally ('question_id' takes the place of 'id').
cols = ['question_id', 'model_a', 'model_b', 'prompt', 'response_a',
        'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie']

# Relabel the selected columns instead of rebuilding the frame from `.values`:
# going through a single NumPy array would coerce every column to `object`
# dtype and copy the data a second time.
# Assigning `.columns` raises ValueError if the lengths do not match.
external_data = external_data[cols].copy()
external_data.columns = train.columns

In [12]:
# Preview the first five rows of the re-labelled external data.
external_data.head(n=5)

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,58210e39b3fd4441a2bd4a518bb44c2d,chatglm-6b,koala-13b,"[""What is the difference between OpenCL and CU...","[""OpenCL and CUDA are two different programmin...","[""OpenCL and CUDA are both programming languag...",0,1,0
1,2564acd09e3942fd97657d05282d4389,oasst-pythia-12b,alpaca-13b,"[""Why did my parent not invite me to their wed...","[""It is possible that your parent did not invi...","[""It is likely that they wanted to keep the gu...",0,0,1
2,90bfd142157948aba01931726c888e7f,koala-13b,oasst-pythia-12b,"[""Fuji vs. Nikon, which is better?""]","[""Both Fuji and Nikon are popular camera brand...","[""This is a subjective question and the answer...",0,1,0
3,a7c5accc53e649a3bc6b2e41d962ebc4,vicuna-13b,oasst-pythia-12b,"[""How to build an arena for chatbots?""]","[""Building an arena for chatbots can be done b...","[""Building an arena for chatbots is a great wa...",0,1,0
4,adf27e819a3c494cb6e993f0c660e097,vicuna-13b,koala-13b,"[""When is it today?""]","[""I'm sorry, I cannot determine the current da...","[""Today is February 23, 2023.""]",1,0,0


In [13]:
# Stack external rows first and train rows second under a fresh 0..n-1 index.
superset = pd.concat([external_data, train], ignore_index=True)

# Keep only the last occurrence of each prompt string. Train rows come last,
# so an external row whose prompt also appears in train is dropped.
unique_prompts = superset.drop_duplicates(subset=['prompt'], keep='last')

# Retain the surviving rows whose index label also exists in external_data.
from_external = unique_prompts.index.isin(external_data.index)
external_data_deduplicated = unique_prompts[from_external]

len(external_data_deduplicated)

21187

In [14]:
# Preview the first five rows that survived de-duplication.
external_data_deduplicated.head(n=5)

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
1,2564acd09e3942fd97657d05282d4389,oasst-pythia-12b,alpaca-13b,"[""Why did my parent not invite me to their wed...","[""It is possible that your parent did not invi...","[""It is likely that they wanted to keep the gu...",0,0,1
3,a7c5accc53e649a3bc6b2e41d962ebc4,vicuna-13b,oasst-pythia-12b,"[""How to build an arena for chatbots?""]","[""Building an arena for chatbots can be done b...","[""Building an arena for chatbots is a great wa...",0,1,0
4,adf27e819a3c494cb6e993f0c660e097,vicuna-13b,koala-13b,"[""When is it today?""]","[""I'm sorry, I cannot determine the current da...","[""Today is February 23, 2023.""]",1,0,0
5,c0fc42c6f5f14f2aa5a89f71f8553730,vicuna-13b,koala-13b,"[""Count from 1 to 10 with step = 3""]","[""1, 4, 7, 10\n\nCounting with a step of 3 mea...","[""1, 4, 7, 10""]",1,0,0
6,c4938f25c1d94fc1b110ace95a2243d0,vicuna-13b,koala-13b,"[""Emoji for \""sharing\"". List 10""]","[""1. \ud83d\udcaa Sharing progress or achievem...","[""1. Sharing hands\n* Share icon\n* Grou...",1,0,0


In [15]:
# Full external set, written without the index column.
external_data.to_csv('lmsys-33k.csv', index=False)

# De-duplicated subset, re-indexed from zero before writing.
deduplicated_out = external_data_deduplicated.reset_index(drop=True)
deduplicated_out.to_csv('lmsys-33k-deduplicated.csv', index=False)

In [16]:
# Decode each JSON-encoded prompt list and join its turns with a single space.
train_prompts    = [" ".join(json.loads(raw)) for raw in train['prompt']]
external_prompts = [" ".join(json.loads(raw)) for raw in external_data['prompt']]

# Sizes of: train, external, their concatenation, and their distinct union.
combined_prompts = train_prompts + external_prompts
len(train_prompts), len(external_prompts), len(combined_prompts), len(set(combined_prompts))

(57477, 33000, 90477, 71779)

In [17]:
# Count of rows per `model_a` value in the de-duplicated external data.
external_data_deduplicated['model_a'].value_counts()

model_a
vicuna-13b                 2015
koala-13b                  1838
oasst-pythia-12b           1550
gpt-3.5-turbo              1487
alpaca-13b                 1409
gpt-4                      1355
claude-v1                  1239
RWKV-4-Raven-14B           1134
chatglm-6b                 1040
palm-2                      998
vicuna-7b                   954
fastchat-t5-3b              930
dolly-v2-12b                891
claude-instant-v1           868
mpt-7b-chat                 867
stablelm-tuned-alpha-7b     852
llama-13b                   639
gpt4all-13b-snoozy          395
wizardlm-13b                365
guanaco-33b                 361
Name: count, dtype: int64

In [18]:
# Count of rows per `model_a` value in the competition training set.
train['model_a'].value_counts()

model_a
gpt-4-1106-preview          3678
gpt-3.5-turbo-0613          3553
gpt-4-0613                  3099
claude-2.1                  2859
gpt-4-0314                  2087
                            ... 
falcon-180b-chat             145
openchat-3.5-0106            108
qwen1.5-7b-chat              106
qwen1.5-4b-chat              100
mistral-7b-instruct-v0.2      54
Name: count, Length: 64, dtype: int64