### v6: `lmsys/chatbot_arena_conversations`, de-duplicated against the 55k training set

In [None]:
import pandas as pd
from datasets import load_dataset
import re

def keep_only_letters(text):
    """Return `text` with every character that is not an ASCII letter removed.

    Digits, whitespace, punctuation and non-ASCII characters are all dropped;
    only a-z and A-Z survive, in their original order.
    """
    return re.sub('[^a-zA-Z]', '', text)


def clean(text):
    """Collapse `text` into one lowercase string made only of ASCII letters.

    `text` is fed to ``''.join``, so it may be a single string or an iterable
    of strings; the pieces are concatenated with no separator.
    """
    joined = ''.join(text)
    # Lowercase first, then strip: the order is kept because lowercasing can
    # change which characters the letter filter sees.
    return keep_only_letters(joined.lower())


def make_temp_cols(dataframe):
    """Return a copy of `dataframe` with cleaned `<col>_temp` companion columns.

    For each of 'prompt', 'response_a' and 'response_b', the new column
    `<col>_temp` holds `clean` applied to every cell. The input frame is not
    modified.
    """
    cleaned = {
        name + '_temp': dataframe[name].apply(clean)
        for name in ('prompt', 'response_a', 'response_b')
    }
    return dataframe.assign(**cleaned)

In [None]:
import json

# External comparison data (train split) as a DataFrame.
ds = load_dataset('lmsys/chatbot_arena_conversations')
ds = ds['train']
external = ds.to_pandas()

# Reference training set, used later to filter overlapping rows out of `external`.
train_ds = load_dataset('lmsys/lmsys-arena-human-preference-55k')
train_ds = train_ds['train']
train_ds = train_ds.to_pandas()

# Drop rows where either response is exactly a single null entry.
# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a slice of the unfiltered frame.
train_ds = train_ds[
    (train_ds['response_a'] != '[null]') & (train_ds['response_b'] != '[null]')
].copy()

# The three text columns are decoded as JSON lists (assumption: they are
# JSON-encoded, as the original handling of `null` implies). json.loads is
# used instead of eval(x.replace("null", "''")) because that textual replace
# also rewrote the word "null" inside the responses themselves, and eval
# executes dataset text as Python code. JSON null entries are mapped to ''
# so each list still contains only strings.
for col in ('prompt', 'response_a', 'response_b'):
    train_ds[col] = train_ds[col].map(
        lambda raw: ['' if turn is None else turn for turn in json.loads(raw)]
    )

In [None]:
from tqdm.notebook import tqdm


def _turns_by_role(conversation, role):
    """Return the 'content' of every turn in `conversation` whose 'role' equals `role`."""
    return [turn['content'] for turn in conversation if turn['role'] == role]


def _to_arena_record(row):
    """Flatten one external row (as a dict) into prompt/response lists and winner flags.

    User turns are read from 'conversation_a' only; assistant turns are read
    from 'conversation_a' and 'conversation_b' respectively. Both 'tie' and
    'tie (bothbad)' count as a tie.
    """
    verdict = row['winner']
    return {
        'model_a': row['model_a'],
        'model_b': row['model_b'],
        'prompt': _turns_by_role(row['conversation_a'], 'user'),
        'response_a': _turns_by_role(row['conversation_a'], 'assistant'),
        'response_b': _turns_by_role(row['conversation_b'], 'assistant'),
        'winner_model_a': int(verdict == 'model_a'),
        'winner_model_b': int(verdict == 'model_b'),
        'winner_tie': int(verdict in ('tie', 'tie (bothbad)')),
    }


# Build every record first, then construct the DataFrame once.
results = [
    _to_arena_record(external.iloc[idx].to_dict())
    for idx in tqdm(range(len(external)))
]

external = pd.DataFrame(results)

In [None]:
# Add the letters-only *_temp columns to both frames.
external = make_temp_cols(external)
train_ds = make_temp_cols(train_ds)

# 'text' is a comparison key: the cleaned prompt and both cleaned responses are
# concatenated and the characters sorted, so the key does not depend on the
# order in which the pieces appear.
for frame in (external, train_ds):
    combined = frame['prompt_temp'] + frame['response_a_temp'] + frame['response_b_temp']
    frame['text'] = combined.apply(lambda letters: ''.join(sorted(letters)))

# Keep only the external rows whose key does not occur in the training set.
already_in_train = external['text'].isin(train_ds['text'].values)
external = external[~already_in_train]

In [None]:
from pathlib import Path

# Create the output directory first so the write does not fail on a fresh
# checkout where data/additional/ does not exist yet.
out_path = Path('data/additional/additional_data_v6.parquet')
out_path.parent.mkdir(parents=True, exist_ok=True)
external.to_parquet(out_path, index=False)

### v8: `openai/webgpt_comparisons`, reshaped to the prompt / response_a / response_b / winner columns

In [None]:
import json

import pandas as pd
from datasets import load_dataset

# External comparison data (train split); the quotes/tokens columns are
# dropped straight away.
ds = load_dataset('openai/webgpt_comparisons')
ds = ds['train']
external = ds.to_pandas().drop(columns=['quotes_0', 'tokens_0', 'quotes_1', 'tokens_1'])

train_ds = load_dataset('lmsys/lmsys-arena-human-preference-55k')
train_ds = train_ds['train']
train_ds = train_ds.to_pandas()

# Drop rows where either response is exactly a single null entry.
# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a slice of the unfiltered frame.
train_ds = train_ds[
    (train_ds['response_a'] != '[null]') & (train_ds['response_b'] != '[null]')
].copy()

# The three text columns are decoded as JSON lists (assumption: they are
# JSON-encoded, as the original handling of `null` implies). json.loads is
# used instead of eval(x.replace("null", "''")) because that textual replace
# also rewrote the word "null" inside the responses themselves, and eval
# executes dataset text as Python code. JSON null entries are mapped to ''
# so each list still contains only strings.
for col in ('prompt', 'response_a', 'response_b'):
    train_ds[col] = train_ds[col].map(
        lambda raw: ['' if turn is None else turn for turn in json.loads(raw)]
    )

In [None]:
# Reshape into prompt / response_a / response_b columns. The prompt is the
# 'full_text' entry of each 'question' value.
external = (
    external
    .assign(prompt=external['question'].apply(lambda x: x['full_text']))
    .drop(columns='question')
    .rename(columns={'answer_0': 'response_a', 'answer_1': 'response_b'})
)

# Drop comparisons where either answer is empty. .copy() gives an independent
# frame, so the assignments below do not write into a slice of the previous
# frame (which triggers SettingWithCopyWarning).
external = external[(external['response_a'] != '') & (external['response_b'] != '')].copy()

# Winner flags from the two scores: higher score wins, equal scores are a tie.
external['winner_model_a'] = (external['score_0'] > external['score_1']).astype('int64')
external['winner_model_b'] = (external['score_0'] < external['score_1']).astype('int64')
external['winner_tie'] = (external['score_0'] == external['score_1']).astype('int64')

external = external.drop(columns=['score_0', 'score_1'])

# Wrap each text in a one-element list.
for col in ('prompt', 'response_a', 'response_b'):
    external[col] = external[col].apply(lambda x: [x])

In [None]:
from pathlib import Path

# Create the output directory first so the write does not fail on a fresh
# checkout where data/additional/ does not exist yet.
out_path = Path('data/additional/additional_data_v8.parquet')
out_path.parent.mkdir(parents=True, exist_ok=True)
external.to_parquet(out_path, index=False)