# Arena Explorer

## Setup

In [None]:
# Create the output directory; it must match save_path ("./out") defined in the next cell.
!mkdir -p out

In [None]:
import json
import numpy as np
import pandas as pd
import re

save_path = "./out"

Install BERTopic and OpenAI.

In [None]:
%%capture
!pip install bertopic
!pip install openai

In [None]:
import getpass
import openai
import os

# Never paste the key into the notebook (it would be saved and shared with the file).
# Reuse OPENAI_API_KEY when it is already set in the environment; otherwise prompt for it
# without echoing. This also avoids overwriting a valid key with a placeholder.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")
openai.api_key = os.environ["OPENAI_API_KEY"]

from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.backend import OpenAIBackend

Set up Hugging Face
- Log in to load the dataset from Hugging Face

In [None]:
%%capture
!pip install datasets huggingface_hub

In [None]:
from huggingface_hub import notebook_login

# Shows an interactive login widget; the login is needed to load the dataset from Hugging Face.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Narrow Category

We began by summarizing the English prompts from the 06/2024 - 08/2024 leaderboard dataset into specific categories.

### Data Processing

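From the conversations, we selected those tagged as English, concatenated the user turns of each conversation into a single prompt, removed duplicate prompts, and dropped prompts of 8,000 characters or more.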

In [None]:
# Read the preference dataset straight from the Hugging Face Hub through the hf:// path.
df = pd.read_parquet("hf://datasets/lmarena-ai/arena-explorer-preference-100k/data/arena-explorer-preference-100k.parquet")

In [None]:
# save the df file
# (the original referenced the method without calling it, so nothing was written)
df.to_parquet(f"{save_path}/arena-explorer-preference-100k.parquet")

In [None]:
df.head(6)

Unnamed: 0,question_id,model_a,model_b,winner,conversation_a,conversation_b,turn,anony,language,tstamp,conv_metadata,is_code,is_refusal,dedup_tag,category_tag,judge_hash
0,4c6978dfa56b4ffea9d3a47e3c84181a,claude-3-5-sonnet-20240620,gpt-3.5-turbo-0125,tie (bothbad),[{'content': 'В моем портфеле сейчас 4 акции Г...,[{'content': 'В моем портфеле сейчас 4 акции Г...,1,True,Russian,1719064000.0,"{'bold_count_a': {'**': 0, '__': 0}, 'bold_cou...",False,True,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': True, 'creati...",09c5207c50f076d704baee96729d64f1698268aa1b21a7...
1,76ce56f8ba474768bc66128c7993ccb8,mistral-large-2407,athene-70b-0725,model_b,"[{'content': 'php, handle tab in text as html,...","[{'content': 'php, handle tab in text as html,...",2,True,English,1722726000.0,"{'bold_count_a': {'**': 8, '__': 0}, 'bold_cou...",True,False,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': True, 'creati...",881bbc801c1e6eb979301eec3b3c401b407a73f70d9a6a...
2,385420904ba646e7a4df90c6ffae1afa,claude-3-opus-20240229,gemini-1.5-flash-api-0514,tie (bothbad),[{'content': '普通人在愿意付出一定资源的情况下，怎么找到一个半径10km以内只...,[{'content': '普通人在愿意付出一定资源的情况下，怎么找到一个半径10km以内只...,1,True,Chinese,1723119000.0,"{'bold_count_a': {'**': 0, '__': 0}, 'bold_cou...",False,True,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': False, 'creat...",3b470f3d940dcff46e22a97f937836ac15d28869a4c11c...
3,e8fe7c9f75ab4e528367cc7de625c475,gemma-2-9b-it,qwen2-72b-instruct,model_b,[{'content': 'Is there any Artificial Superint...,[{'content': 'Is there any Artificial Superint...,2,True,English,1721643000.0,"{'bold_count_a': {'**': 5, '__': 0}, 'bold_cou...",False,False,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': False, 'creat...",66f029e5cb9cdb035e859955557fbbeba0b8419ca64ebc...
4,772d53e5c51c487e8a293eadcd9d4855,mixtral-8x22b-instruct-v0.1,llama-3.1-70b-instruct,tie (bothbad),[{'content': 'Which number id bigger 9.11 or 9...,[{'content': 'Which number id bigger 9.11 or 9...,1,True,English,1721899000.0,"{'bold_count_a': {'**': 0, '__': 0}, 'bold_cou...",False,False,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': True, 'creati...",b4f8e2d271c6c9e6fb08dcabf6ee8a79631e9f2aec6381...
5,71279fb05fec48a4b985c691dd4a6ed2,gpt-4o-2024-08-06,gpt-4o-mini-2024-07-18,model_b,[{'content': '有没有一些故事充斥着相互的矛盾和冲突，每个人都做出了自己认为正确...,[{'content': '有没有一些故事充斥着相互的矛盾和冲突，每个人都做出了自己认为正确...,1,True,Chinese,1723050000.0,"{'bold_count_a': {'**': 8, '__': 0}, 'bold_cou...",False,False,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': False, 'creat...",99a61697795b13a2712effdf91b1ed1f07562b023e7aac...


In [None]:
def _join_user_turns(conversation):
    """Concatenate the content of every user turn of a conversation, separated by spaces."""
    return ' '.join(turn['content'] for turn in conversation if turn['role'] == 'user')


# Keep English conversations only and work on a copy so `df` stays untouched.
is_english = df['language'] == 'English'
english_df = df.loc[is_english].copy()

# One prompt string per conversation, built from the user turns of side A.
english_df['Prompt'] = english_df['conversation_a'].apply(_join_user_turns)

# Drop repeated prompts, then prompts of 8000 characters or more.
english_df = english_df.drop_duplicates(subset='Prompt')
is_short_enough = english_df['Prompt'].str.len() < 8000
english_df = english_df[is_short_enough]

doc = english_df['Prompt']

In [None]:
len(doc)

48586

### Create Embedding

Computing embeddings is resource-intensive, so we recommend precomputing and saving them.

In [None]:
client = openai.OpenAI()
embedding_model = OpenAIBackend(client, "text-embedding-3-large", batch_size=1000)

# Embedding ~50k prompts takes minutes and costs API credits, so reuse a previously saved
# file when it exists and has one row per prompt; otherwise compute and save.
embeddings_path = f"{save_path}/embeddings.npy"
embeddings = np.load(embeddings_path) if os.path.exists(embeddings_path) else None
if embeddings is None or len(embeddings) != len(doc):
    embeddings = embedding_model.embed(doc, verbose=True)

    # save embeddings
    np.save(embeddings_path, embeddings)

49it [08:09,  9.98s/it]


We saved the embeddings used to create Arena Explorer, which can be quickly loaded here for demonstration purposes.

In [None]:
# load saved embeddings
from huggingface_hub import hf_hub_download

# Location of the precomputed embeddings inside the dataset repository.
embeddings_source = dict(
    repo_id="lmarena-ai/arena-explorer-preference-100k",
    filename="data/embeddings.npy",
    repo_type="dataset",
)
file_path = hf_hub_download(**embeddings_source)

embeddings = np.load(file_path)
# Number of embedding rows; compare with len(doc).
len(embeddings)

embeddings.npy:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

48586

### BERTopic Topic Clustering

We performed topic clustering on the English conversation dataset using BERTopic.

In [None]:
# Hyperparameters of each pipeline stage, gathered in one place.
umap_params = dict(n_neighbors=20, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_params = dict(min_cluster_size=20, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_params = dict(stop_words="english", min_df=2, ngram_range=(1, 3))

client = openai.OpenAI()
embedding_model = OpenAIBackend(client, "text-embedding-3-large", batch_size=1000)
umap_model = UMAP(**umap_params)
hdbscan_model = HDBSCAN(**hdbscan_params)
vectorizer_model = CountVectorizer(**vectorizer_params)

topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    top_n_words=10,
    verbose=True,
    calculate_probabilities=True,
)

# The embeddings computed/loaded above are passed in explicitly.
topics, probs = topic_model.fit_transform(doc, embeddings=embeddings)

2025-02-06 23:11:28,412 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-02-06 23:14:06,686 - BERTopic - Dimensionality - Completed ✓
2025-02-06 23:14:06,690 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-02-06 23:18:43,237 - BERTopic - Cluster - Completed ✓
2025-02-06 23:18:43,286 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-02-06 23:19:06,410 - BERTopic - Representation - Completed ✓


In [None]:
# number of clusters
len(topic_model.get_topic_info())

295

In [None]:
topic_model.get_topic_info().head()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,25699,-1_self_data_use_return,"[self, data, use, return, new, like, time, 10,...",[I need you to create a conversation between t...
1,0,855,0_song_chorus_verse_lyrics,"[song, chorus, verse, lyrics, oh, dub, love, p...",[\nAnalyze my pre chorus below. If its ok as i...
2,1,501,1_email_thank_let know_yeah,"[email, thank, let know, yeah, team, parks, de...",[See the following email. It's cordial and fri...
3,2,488,2_strawberry_word strawberry_strawberry word_s...,"[strawberry, word strawberry, strawberry word,...","[How many ""r""s are there in word strawberry?, ..."
4,3,410,3_bigger_11_bigger 11_11 bigger,"[bigger, 11, bigger 11, 11 bigger, larger, 11 ...","[which is bigger: 9.9 or 9.11?, is 9.9 bigger ..."


Before reducing outliers, we selected up to 20 example prompts from each identified cluster. These prompts were chosen from those whose probability, calculated by HDBSCAN clustering and representing the likelihood that they belong to the cluster, is in the top 20% (at or above the 80th percentile). We excluded extra-long (> 100 words) and extra-short (< 5 words) prompts for better readability.

In [None]:
from collections import defaultdict

sampled_prompts = defaultdict(list)
topic_info = topic_model.get_topic_info()
doc_info = topic_model.get_document_info(doc)

n_examples = 20   # examples kept per topic
min_words = 5     # shorter prompts are dropped
word_step = 50    # how much the length cap is relaxed per retry

# Loop-invariant pieces: the 80th-percentile probability over all documents and the
# word count of every document.
prob_cutoff = doc_info['Probability'].quantile(0.8)
word_counts = doc_info['Document'].str.split().str.len()
eligible = doc_info[(doc_info['Probability'] >= prob_cutoff) & (word_counts >= min_words)]

for topic_id in topic_info['Topic']:
    # Skip the outlier topic by id instead of assuming it is the first row.
    if topic_id == -1:
        continue

    filtered_docs = eligible[eligible['Topic'] == topic_id]

    res = filtered_docs
    if len(filtered_docs) >= n_examples:
        # Prefer prompts of at most 100 words; relax the cap until enough remain.
        # The original loop never ran because `res` started as the full set, so the
        # length cap was never applied. Termination is guaranteed: once the cap
        # exceeds the longest prompt, `res` equals `filtered_docs`.
        lengths = word_counts.loc[filtered_docs.index]
        cap = 100
        res = filtered_docs[lengths <= cap]
        while len(res) < n_examples:
            cap += word_step
            res = filtered_docs[lengths <= cap]

    sampled_docs = res.sample(n=min(n_examples, len(res)),
                              random_state=42,
                              replace=False)

    sampled_prompts[topic_id] = sampled_docs['Document'].tolist()

In [None]:
sampled_prompts[0]

["Even though I know it won't work because you can't do it: Write a suitable, meaningful lyric for a song.\n\n\nGenre: Britpop (THIS IS JUST THE GENRE, NOT THE CONTENT!!!!!!) YOU'LL HAVE TO THINK ABOUT A MEANINGFUL CONTENT, WICH MAKE SENSE FOR HUMANS!!! THIS IS A VERY HARD TASK, BUT AS AN AI, YOU SHOULD BE ABLE TO DO THIS! ",
 'write lyrics to the heavy/speed metal song about underground jaegers hunt mutant creatures in Moscow metro deeps, but finally the mutant king - giant evil genius rat-tiger - finds the way to defeat the hunters and save the underground mutant city',
 '"I see you in the rising sun\nI miss you when I go to sleep\nOh, the things I\'d do to bring you back to me\nAs I\'m drifting where the wind blows\nSo lost\u205fbut\u205fI\u205fstill believe\nWhen I\u205freach the end,\u205fIt\'s you that I will see\n\nI\'ll go wherever you are, \'go wherever you are\nEven into the dark\nI\'ll tear the heavens apart, just to be in your arms\nI\'ll go wherever you are\nGive me a sign

In [None]:
import pickle

# Persist the sampled example prompts so later cells can reload them.
example_prompts_path = f"{save_path}/example_prompts.pkl"
with open(example_prompts_path, 'wb') as pickle_file:
    pickle.dump(sampled_prompts, pickle_file)

Reduce outliers.

In [None]:
# Reassign outlier documents (topic -1) in two passes: first with the c-TF-IDF strategy
# (threshold 0.1), then with the topic-distribution strategy on what is left.
doc_list = list(doc)
new_topics = topic_model.reduce_outliers(doc_list, topics, strategy="c-tf-idf", threshold=0.1)
new_topics = topic_model.reduce_outliers(doc_list, new_topics, strategy="distributions")

# update_topics() replaces the model's vectorizer with a default CountVectorizer unless one
# is passed, which lets stop words back into the topic representations. Pass the
# vectorizer used for fitting to keep them consistent.
topic_model.update_topics(doc_list, topics=new_topics, vectorizer_model=vectorizer_model)

100%|██████████| 20/20 [00:22<00:00,  1.13s/it]


In [None]:
# Refresh both tables so that counts and assignments reflect the outlier-reduced topics.
topic_info = topic_model.get_topic_info()
# BERTopic has no `get_doc_info`; the method is `get_document_info` and needs the documents.
doc_info = topic_model.get_document_info(doc)
topic_info.head()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,187,-1_what_autoarchiveduration_ratelimitperuser_t...,"[what, autoarchiveduration, ratelimitperuser, ...",[I need you to create a conversation between t...
1,0,1040,0_song_chorus_verse_lyrics,"[song, chorus, verse, lyrics, oh, love, my, du...",[\nAnalyze my pre chorus below. If its ok as i...
2,1,1003,1_email_we_you_our,"[email, we, you, our, to, know, my, your, than...",[See the following email. It's cordial and fri...
3,2,512,2_strawberry_many_word_how,"[strawberry, many, word, how, letter, count, a...","[How many ""r""s are there in word strawberry?, ..."
4,3,455,3_bigger_11_larger_which,"[bigger, 11, larger, which, or, greater, numbe...","[which is bigger: 9.9 or 9.11?, is 9.9 bigger ..."


In [None]:
# save the model for future analysis
# (safetensors serialization, with the c-TF-IDF representation included)
model_dir = f"{save_path}/model"
topic_model.save(model_dir, serialization="safetensors", save_ctfidf=True)

### Summarize Category Names

For each cluster, we used GPT-4o to assign a category name and a short description based on the selected example prompts.

In [None]:
def summarize_topic(prompts):
    """Ask gpt-4o for a topic label and a two-sentence description of a group of prompts.

    Args:
        prompts: iterable of prompt strings; they are joined with blank lines and
            appended to the instruction text.

    Returns:
        The text content of the first choice in the chat completion response.
    """
    # The backslash continuations are inside the string literals, so the indentation of the
    # continuation lines is part of the text sent to the model; it is kept exactly as is.
    system_message = "You help summarize the category of the given prompts. \
              Make sure it is in the following format: The topic of doc is: '...'. Description: '...'."
    instruction = "Based on the sampled prompts below, extract a short but highly descriptive \
                  topic label of at most 5 words and a short description of this category in \
                  two sentences:\n\n"
    user_message = instruction + "\n\n".join(prompts)

    chat_messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]

    # temperature=0 is requested for the labelling call.
    completion = openai.OpenAI().chat.completions.create(
        model="gpt-4o",
        messages=chat_messages,
        temperature=0,
    )
    return completion.choices[0].message.content

# One model summary per sampled topic, keyed by topic id (same order as sampled_prompts).
summaries = {
    topic_id: summarize_topic(prompts)
    for topic_id, prompts in sampled_prompts.items()
}

In [None]:
def extract_category(summary):
    """Extract the quoted topic label from a model summary.

    The expected shape is "The topic of doc is: '<label>'. Description: '<text>'.".
    Patterns are tried from most to least specific:
      1. a label that is followed by "Description:", which keeps labels containing
         apostrophes (e.g. "Children's Books") intact;
      2. the first quoted text after "is: ";
      3. the first quoted text that is followed by a character and a space.

    Args:
        summary: the summary text; anything that is not a string is treated as a failure.

    Returns:
        The label, or None (after printing the offending summary) when nothing matches.
    """
    if isinstance(summary, str):
        patterns = (
            r"is:?\s*'(.*?)'\.?\s*Description:",
            r"is: '(.*?)'",
            r"'(.*?)'. ",
        )
        for pattern in patterns:
            match = re.search(pattern, summary)
            if match:
                return match.group(1)

    # Report the summary itself rather than looking it up in the global `summaries` dict,
    # which is reassigned later in the notebook and may not contain this text.
    print(f"Regex failed for: {summary}")
    return None
def extract_description(summary):
    """Extract the description part of a model summary.

    The expected shape is "... Description: '<text>'.". A quoted description is matched
    greedily up to the last quote so that apostrophes inside the text do not truncate it.
    When the description is not quoted, everything after "Description:" is returned,
    stripped of surrounding whitespace (the original fallback pattern could only ever
    capture the empty string).

    Args:
        summary: the summary text; anything that is not a string is treated as a failure.

    Returns:
        The description, or None (after printing the offending summary) when the text
        contains no "Description:" part.
    """
    if isinstance(summary, str):
        quoted = re.search(r"Description:\s*'(.*)'", summary, flags=re.DOTALL)
        if quoted:
            return quoted.group(1)
        unquoted = re.search(r"Description:\s*(.*)", summary, flags=re.DOTALL)
        if unquoted:
            return unquoted.group(1).strip()

    print(f"Regex failed for: {summary}")
    return None

In [None]:
# The outlier topic (-1) gets a fixed, hand-written summary; it is added last.
summaries[-1] = "The topic of doc is 'Miscellaneous Categories'. Description: 'They are outliers in the topic modeling process'."

# Parse every summary into a category name and a description.
summaries_df = pd.DataFrame(list(summaries.items()), columns=['Topic', 'Summary'])
summaries_df = summaries_df.assign(
    Category=summaries_df['Summary'].apply(extract_category),
    Description=summaries_df['Summary'].apply(extract_description),
)

# Attach the per-topic document counts and their share of all documents.
topic_info_modified = topic_info[['Topic', 'Count']]
summaries_df = summaries_df.merge(topic_info_modified, on='Topic')
summaries_df = summaries_df[['Topic', 'Category', 'Description', 'Count']]
summaries_df['Percentage'] = summaries_df['Count'] / summaries_df['Count'].sum()

# Example prompts as one '|||'-separated string. Indexing the defaultdict (rather than
# using .get) also creates an empty entry for topics without samples, e.g. -1.
summaries_df['Example Prompt'] = summaries_df['Topic'].map(
    lambda topic_id: '|||'.join(sampled_prompts[topic_id])
)

In [None]:
summaries_df.head()

Unnamed: 0,Topic,Category,Description,Count,Percentage,Example Prompt
0,0,Song Lyric Writing and Analysis,The document contains prompts for creating and...,855,0.017598,Even though I know it won't work because you c...
1,1,Professional Email Communication,"This category involves crafting, revising, and...",501,0.010312,Draft an e-mail from Sergey for the chain belo...
2,2,Counting Letters in Words,The prompts focus on determining the number of...,488,0.010044,How many r are there in strawberry|||how many ...
3,3,Comparing Decimal Numbers,This category involves determining which of tw...,410,0.008439,"9,11 and 9,9 - which is bigger? and please sub..."
4,4,Recipe Requests and Cooking Advice,This category encompasses a variety of request...,397,0.008171,give me the best recipe for white bread so it ...


In [None]:
# save if needed
summaries_df.to_csv(f"{save_path}/narrow_categories.csv", index=False)

## Broad Category

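We performed topic clustering again on the names and descriptions of the 193 specific categories, summarizing them into 12 broad categories (the run recorded in this notebook produced 294 narrow categories and 13 broad categories). The summarization process followed an almost identical approach to the one used before.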

In [None]:
from bertopic.backend import OpenAIBackend

# One document per narrow category: "<name>: <description>".
# The outlier topic (-1) is excluded by id. The original dropped the last row with pop(),
# which removes a real category whenever no outlier row is present. The later merge
# aligns rows by position, so the outlier row is expected to be the last one.
narrow_categories = summaries_df[summaries_df['Topic'] != -1]
broad_doc = list(narrow_categories['Category'] + ': ' + narrow_categories['Description'])

# Create embeddings
client = openai.OpenAI()
embedding_model = OpenAIBackend(client, "text-embedding-3-large")
embeddings = embedding_model.embed(broad_doc)

# BERTopic
umap_model = UMAP(n_neighbors=13, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 3))
broad_topic_model= BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,

        top_n_words=3,
        verbose=True
)

topics, probs = broad_topic_model.fit_transform(broad_doc, embeddings=embeddings)

# Reduce all outliers
new_topics = broad_topic_model.reduce_outliers(broad_doc, topics , strategy="c-tf-idf", threshold=0.1)
new_topics = broad_topic_model.reduce_outliers(broad_doc, new_topics, strategy="distributions")
# update_topics() replaces the model's vectorizer with a default CountVectorizer unless one
# is passed; reuse the fitted one so stop words stay out of the topic representations.
broad_topic_model.update_topics(broad_doc, topics=new_topics, vectorizer_model=vectorizer_model)

2025-02-06 23:38:44,025 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-02-06 23:38:49,943 - BERTopic - Dimensionality - Completed ✓
2025-02-06 23:38:49,946 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-02-06 23:38:49,969 - BERTopic - Cluster - Completed ✓
2025-02-06 23:38:49,975 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-02-06 23:38:50,089 - BERTopic - Representation - Completed ✓
100%|██████████| 1/1 [00:00<00:00, 186.75it/s]


In [None]:
len(broad_topic_model.get_topic_info())

13

In [None]:
broad_topic_model.get_topic_info().head()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,50,0_and_category_about_this,"[and, category, about, this, it, inquiries, of...",[Structural and Material Mechanics: This categ...
1,1,33,1_puzzles_the_to_logic,"[puzzles, the, to, logic, involves, solving, o...",[Logic Puzzles with Sisters: This category inv...
2,2,32,2_and_the_of_for,"[and, the, of, for, document, strategies, it, ...",[Educational Strategies and Curriculum Design:...
3,3,27,3_and_scenarios_category_this,"[and, scenarios, category, this, or, involves,...",[Harry Potter Series Analysis: This category i...
4,4,36,4_and_document_contains_handling,"[and, document, contains, handling, code, rela...",[TypeScript and SQL Code Issues: This document...


In [None]:
# Summarize category names
def summarize_topic(prompts):
    """Ask gpt-4o for a label of at most two words for a group of narrow category names.

    NOTE: this redefines the `summarize_topic` defined earlier in the notebook for the
    narrow categories; after this cell runs, the name refers to this version.

    Args:
        prompts: either one string (used as is) or an iterable of strings (joined with
            blank lines).

    Returns:
        The text content of the first choice in the chat completion response.
    """
    # str.join over a bare string iterates its characters, which would put a blank line
    # between every single character of the prompt; wrap a string into a one-item list.
    if isinstance(prompts, str):
        prompts = [prompts]

    input_text = "Based on the topic names, extract a short but highly descriptive and concrete \
                  label of at most 2 words:\n\n" + "\n\n".join(prompts)
    client = openai.OpenAI()

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You help summarize the topic of the given fine grained \
             categories in the following format: The topic is '...'."},
            {"role": "user", "content": input_text}
        ],
        temperature=1
    )

    return response.choices[0].message.content

broad_topic_info = broad_topic_model.get_topic_info()
broad_doc_info = broad_topic_model.get_document_info(broad_doc)
# NOTE: this replaces the narrow-category `summaries` dict with the broad-category one.
summaries = {}

for topic_id in broad_topic_info['Topic']:
    docs = list(broad_doc_info[broad_doc_info['Topic'] == topic_id]['Document'])
    # Each document is "<name>: <description>"; keep the part before the first ": ".
    # partition() returns the whole text when the separator is missing instead of raising.
    names = [x.partition(': ')[0] for x in docs]
    cat = ', '.join(names)
    summary = summarize_topic(cat)
    summaries[topic_id] = summary

In [None]:
# Combine results
def _quoted_label(summary):
    """Return the first single-quoted text in `summary`, or None when there is none."""
    match = re.search(r"'(.*?)'", summary) if isinstance(summary, str) else None
    return (match.group(1) or None) if match else None


broad_summaries_df = pd.DataFrame(list(summaries.items()), columns=['Topic', 'Summary'])
# A reply without a quoted label yields None here instead of raising, so the 'Other'
# fallback below can actually apply.
broad_summaries_df['Category'] = broad_summaries_df['Summary'].apply(_quoted_label)
topic_info_modified = broad_topic_info[['Topic', 'Count']]
broad_summaries_df = broad_summaries_df.merge(topic_info_modified, on='Topic')[['Topic', 'Category', 'Count']]
broad_summaries_df['Percentage'] = broad_summaries_df['Count'] / broad_summaries_df['Count'].sum()
# Only the label column gets the textual fallback; numeric columns are left untouched.
broad_summaries_df = broad_summaries_df.fillna({'Category': 'Other'})

In [None]:
broad_summaries_df.head()

Unnamed: 0,Topic,Category,Count,Percentage
0,0,Diverse Interests,50,0.170068
1,1,Logic Puzzles,33,0.112245
2,2,Diverse Strategies,32,0.108844
3,3,Creative Analysis,27,0.091837
4,4,Technical Concepts,36,0.122449


In [None]:
# save if needed
broad_summaries_df.to_csv(f"{save_path}/broad_categories.csv", index=False)

## Data Processing

The clustering results are stored in JSON format to facilitate future visualizations.

### Combine broad, narrow category, and examples

In [None]:
# Merge categories
# Columns are renamed to their final names before merging, so no _x/_y suffixes appear.
broad_ids = broad_doc_info[['Topic']].rename(columns={'Topic': 'broad_category_id'})
narrow_info = summaries_df.rename(columns={
    'Topic': 'narrower_category_id',
    'Category': 'narrower_category',
    'Count': 'prompt_count',
    'Percentage': 'prompt_percentage',
    'Example Prompt': 'example_prompt'})
broad_names = broad_summaries_df[['Topic', 'Category']].rename(columns={
    'Topic': 'broad_category_id',
    'Category': 'broad_category'})

# Row i of broad_doc_info belongs to row i of summaries_df (joined on the index);
# the broad category name is then looked up by broad topic id.
merged = broad_ids.merge(narrow_info, left_index=True, right_index=True)
merged = merged.merge(broad_names, on='broad_category_id')
merged = merged[[
    'broad_category_id',
    'broad_category',
    'narrower_category_id',
    'narrower_category',
    'prompt_count',
    'prompt_percentage',
    'example_prompt']]

In [None]:
merged.head()

Unnamed: 0,broad_category_id,broad_category,narrower_category_id,narrower_category,prompt_count,prompt_percentage,example_prompt
0,3,Creative Analysis,0,Song Lyric Writing and Analysis,855,0.017598,Even though I know it won't work because you c...
1,0,Diverse Interests,1,Professional Email Communication,501,0.010312,Draft an e-mail from Sergey for the chain belo...
2,6,Counting,2,Counting Letters in Words,488,0.010044,How many r are there in strawberry|||how many ...
3,6,Counting,3,Comparing Decimal Numbers,410,0.008439,"9,11 and 9,9 - which is bigger? and please sub..."
4,0,Diverse Interests,4,Recipe Requests and Cooking Advice,397,0.008171,give me the best recipe for white bread so it ...


In [None]:
# save if needed
merged.to_csv(f"{save_path}/category_summary.csv", index=False)

### Label conversations with broad, narrow category
For each conversation in the original dataset, assign the corresponding broad and narrow category.

In [None]:
# Uncomment to reload the model saved earlier instead of using the one in memory.
# topic_model = BERTopic.load(f"{save_path}/model")
# Per-document topic assignments of the narrow-category model.
doc_info = topic_model.get_document_info(doc)
# Reload the category summary from the CSV written in the previous section.
merged = pd.read_csv(f"{save_path}/category_summary.csv")



In [None]:
# Align conversations with doc_info by position. A re-indexed copy is used instead of
# reset_index(inplace=True), which mutates english_df and fails when the cell is re-run.
english_reset = english_df.reset_index(drop=True)
# The index merge below would silently drop rows if the two frames differed in length.
assert len(english_reset) == len(doc_info), "doc_info must have one row per prompt in english_df"

llm_df = english_reset.merge(doc_info[['Topic']], left_index=True, right_index=True)
# Left merge: conversations whose topic is not in `merged` keep empty category columns.
llm_df = llm_df.merge(merged, how='left', left_on='Topic', right_on='narrower_category_id')
llm_df = llm_df[['question_id', 'broad_category_id', 'broad_category',
    'narrower_category_id', 'narrower_category', 'model_a', 'model_b', 'winner']]

In [None]:
llm_df.shape

(48586, 8)

In [None]:
llm_df.head()

Unnamed: 0,question_id,broad_category_id,broad_category,narrower_category_id,narrower_category,model_a,model_b,winner
0,76ce56f8ba474768bc66128c7993ccb8,2.0,Technical Programming,0.0,Web Development and Programming,mistral-large-2407,athene-70b-0725,model_b
1,e8fe7c9f75ab4e528367cc7de625c475,7.0,Interest Categories,122.0,Knowledge Cutoff Date,gemma-2-9b-it,qwen2-72b-instruct,model_b
2,772d53e5c51c487e8a293eadcd9d4855,0.0,Logic Puzzles,7.0,Comparing Decimal Numbers,mixtral-8x22b-instruct-v0.1,llama-3.1-70b-instruct,tie (bothbad)
3,6ccd7a51825249d5881ee501e06bb9ab,0.0,Logic Puzzles,80.0,Algebraic Equation Solving,mixtral-8x22b-instruct-v0.1,gemma-2-2b-it,model_a
4,463aa4efacf34f27b6a5c3f1f7417e86,3.0,Business Strategies,16.0,Business and Marketing Strategies,gemini-1.5-pro-api-0514,reka-flash-preview-20240611,model_a


In [None]:
# save if needed
llm_df.to_csv(f"{save_path}/conversations_and_category.csv", index=False)

### Create visualization

Instructions for generating the explorer visualization:
1. Run the pipeline and the following cells to produce two output files: data.json and examples.json.
2. Clone the [arena-catalog](https://github.com/lmarena/arena-catalog/tree/data-explorer) repository, which contains the necessary HTML, CSS, and JavaScript files for the explorer.
3. In [explorer/index.html](https://github.com/lmarena/arena-catalog/blob/data-explorer/explorer/index.html), replace the file paths on lines 44 & 45 with the correct paths to your generated data.json and examples.json files.

In [None]:
# Export results in JSON format
def _narrow_node(row):
    """Build the JSON node of one narrow category from its row in `merged`."""
    return {
        "id": row["narrower_category_id"],
        "name": row["narrower_category"],
        "count": row["prompt_count"],
        "percent": row['prompt_percentage'],
    }


# Two-level tree: broad categories at the top, their narrow categories as children.
broad_groups = merged.groupby(["broad_category_id", "broad_category"])
root = {
    "name": "categories",
    "children": [
        {
            "id": int(broad_id),
            "name": broad_name,
            "children": [_narrow_node(row) for _, row in group.iterrows()],
        }
        for (broad_id, broad_name), group in broad_groups
    ],
}

json_output = json.dumps(root, indent=4)

with open(f"{save_path}/data.json", "w") as f:
    f.write(json_output)

In [None]:
# json file for example prompts
import pickle

# Optionally reload the sampled prompts saved earlier.
# NOTE: only unpickle files you created yourself; pickle can execute arbitrary code.
# with open(f"{save_path}/example_prompts.pkl", 'rb') as f:
#     sampled_prompts = pickle.load(f)

# Look names up by category id rather than by row label, so the result does not depend on
# the row order or index of `merged`.
category_names = merged.set_index('narrower_category_id')['narrower_category']

# One entry per narrow category. The outlier topic (-1) is skipped without deleting it from
# sampled_prompts, so the cell can be re-run and also works when the key is absent
# (e.g. after loading the pickle).
root = [
    {
        "id": int(topic_id),
        "name": category_names.loc[topic_id],
        "examples": list(prompts),
    }
    for topic_id, prompts in sampled_prompts.items()
    if topic_id != -1
]

json_output = json.dumps(root, indent=4)
with open(f"{save_path}/examples.json", "w") as f:
    f.write(json_output)