In [12]:
from pathlib import Path
import pandas as pd
from IPython.display import display, HTML

## Load Data

In [11]:
DATA_PATH = Path("/kaggle/input/lmsys-chatbot-arena")

TARGETS = ["winner_model_a", "winner_model_b", "winner_tie"]

In [5]:
train = pd.read_csv(DATA_PATH / "train.csv")
test = pd.read_csv(DATA_PATH / "test.csv")
sub = pd.read_csv(DATA_PATH / "sample_submission.csv")

<a id="data_appearance"></a>
## 1. Data Appearance
[**<span style="color:#FEF1FE; background-color:#535d70;border-radius: 5px; padding: 2px">Go to Table of Content</span>**](#toc)

After data is loaded, let's see what information is provided in training and test sets. Following is the file description of `train.csv`,
* `id` - A unique identifier for the row.
* `model_[a/b]` - The identity of model_[a/b]. Included in `train.csv` but not `test.csv`.
* `prompt` - The prompt that was given as an input (to both models).
* `response_[a/b]` - The response from model_[a/b] to the given prompt.
* `winner_model_[a/b/tie]` - Binary columns marking the judge's selection. The ground truth target column.

<a id="quick_sum"></a>
### *Quick Summary*
[**<span style="color:#FEF1FE; background-color:#535d70;border-radius: 5px; padding: 2px">Go to Table of Content</span>**](#toc)
1. There are 57477 training rows and 3 test rows.
    * **Note:** Test data will be replaced with **the full test set (~25k rows, 70% for private LB)** during scoring phase.
2. The column `id` has no duplicated values.
3. Model identities aren't revealed in the test set.
    * **Question:** Can we try to predict model identities first and then use this auxiliary information for predicting preferences?
4. Strings in columns `prompt`, `response_a`, and `response_a` are wrapped in a list. The reason is that each chat can **contains more than one prompt/response pairs**.
5. There exist 14 duplicated rows (7 groups).

In [6]:
print(f"Data Shape | train {train.shape} | test {test.shape}")
print(f"-" * 50)
print(f">>> The First 3 Training Rows <<<")
display(train.head(3))
print(f"-" * 50)
print(f">>> The First 3 Test Rows <<<")
display(test.head(3))
assert train["id"].nunique() == len(train)

Data Shape | train (57477, 9) | test (3, 4)
--------------------------------------------------
>>> The First 3 Training Rows <<<


Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,53567,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0
2,65089,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1


--------------------------------------------------
>>> The First 3 Test Rows <<<


Unnamed: 0,id,prompt,response_a,response_b
0,136060,"[""I have three oranges today, I ate an orange ...","[""You have two oranges today.""]","[""You still have three oranges. Eating an oran..."
1,211333,"[""You are a mediator in a heated political deb...","[""Thank you for sharing the details of the sit...","[""Mr Reddy and Ms Blue both have valid points ..."
2,1233961,"[""How to initialize the classification head wh...","[""When you want to initialize the classificati...","[""To initialize the classification head when p..."


We decide to drop `id` column directly because it has nothing to do with our analysis task. But, we leave `id` in `test` untouched for submission.

In [7]:
train = train.drop("id", axis=1)
print(f"After dropping 'id' column, shape of the training DataFrame becomes {train.shape}.")
print(f"-" * 50)
print(f">>> The First 3 Training Rows <<<")
train.head(3)

After dropping 'id' column, shape of the training DataFrame becomes (57477, 8).
--------------------------------------------------
>>> The First 3 Training Rows <<<


Unnamed: 0,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0
2,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1


There exist 14 duplicated rows forming 7 groups, we just keep one row per group.

In [8]:
n_dups = train.duplicated(keep=False).sum()
print(f"There exist {n_dups} duplicated rows.")

n_rows1 = len(train)
train = train.drop_duplicates(keep="first", ignore_index=True)
n_rows2 = len(train)
print(f"After removing duplicates, #samples drops from {n_rows1} to {n_rows2}.")

There exist 14 duplicated rows.
After removing duplicates, #samples drops from 57477 to 57470.


<a id="one_chat"></a>
### *One Chat*

Then, let's take the first row (*i.e.,* the first chat) as an example to see what information is contained in one sample. Through displaying the raw text, we observe,
1. The prompt and both reponses are of **string** type and also **wrapped in a list**.
    * One chat can **contain more than one prompt/response pairs**. 
2. There exist escape characters (*e.g.,* a new line `\n`).
3. Markdown syntax is used (*e.g.,* double asterisks `****` for strong words).
4. There exist ordered lists.
    * Unordered lists are also found in othere examples.
5. There exist unicode characters (*e.g.,* emoji `\ud83c\udf4d`).

In [9]:
demo_chat = train.iloc[0]
for col in ["prompt", "response_a", "response_b"]:
    assert isinstance(demo_chat[col], str)
    
print(f"=== Prompt ===")
print(demo_chat["prompt"])
print()
print(f"=== Response of {demo_chat['model_a']} ===") 
print(demo_chat["response_a"])
print()
print(f"=== Response of {demo_chat['model_b']} ===") 
print(demo_chat["response_b"])
print("-" * 50)
if demo_chat["winner_model_a"]:
    print(f">>> {demo_chat['model_a']} is the winner!!")
elif demo_chat["winner_model_b"]:
    print(f">>> {demo_chat['model_b']} is the winner!!")
else:
    print(f">>> Winner tie!!")

=== Prompt ===
["Is it morally right to try to have a certain percentage of females on managerial positions?","OK, does pineapple belong on a pizza? Relax and give me fun answer."]

=== Response of gpt-4-1106-preview ===
["The question of whether it is morally right to aim for a certain percentage of females in managerial positions is a complex ethical issue that involves considerations of fairness, equality, diversity, and discrimination.\n\nHere are some arguments in favor of and against such policies:\n\n**Arguments in favor:**\n\n1. **Correcting Historical Inequities:** Women have historically been underrepresented in leadership roles due to various cultural, institutional, and social barriers. Aiming for a specific percentage can be seen as a corrective measure to address past and ongoing discrimination.\n\n2. **Promoting Diversity:** Diverse leadership teams can enhance decision-making and represent a broader range of perspectives. This can lead to better outcomes for organizatio

To facilitate data exploration and pair comparison between two models, we implement a simple chat renderer which can be used to display one chat with Chatbot Arena-like UI. Following shows the same chat as above. Now, we can compare and analyze responses from two models side by side, and the winning status will be shown at the bottom!

In [13]:
class ChatRenderer:
    
    CSS: str = """
        <style>
            lm-chat-body {
                display: flex;
                justify-content: center;
                align-items: flex-start;
                margin: 0;
                padding: 20px;
                font-family: Arial, sans-serif;
                background-color: #f5f5f5;
            }
            .lm-chat-container {
                display: flex;
                flex-direction: column;
                width: 100%;
                max-width: 1200px;
                border: 1px solid #ddd;
                box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
                background-color: #fff;
            }
            .lm-chat-panels {
                display: flex;
                width: 100%;
            }
            .lm-chat-panel {
                width: 50%;
                padding: 20px;
                box-sizing: border-box;
                border-right: 1px solid #ddd;
                position: relative;
            }
            .lm-chat-panel:last-child {
                border-right: none;
            }
            .lm-chat-model-header {
                font-weight: bold;
                margin-bottom: 10px;
                font-size: 14px;
                position: absolute;
                top: 10px;
                left: 20px;
                background-color: white;
                padding: 0 5px;
            }
            .lm-chat-prompt {
                background-color: #fdf5e6;
                padding: 10px;
                margin-top: 30px;
                margin-bottom: 20px;
                border-radius: 5px;
                border: 1px solid #ddd;
            }
            .lm-chat-response {
                background-color: #fff;
                padding: 10px;
                margin-bottom: 20px;
                border-radius: 5px;
                border: 1px solid #ddd;
            }
            .lm-chat-winner {
                text-align: center;
                padding: 10px;
                border-top: 1px solid #ddd;
                background-color: #f5f5f5;
                font-size: 16px;
            }
        </style>
    """
    
    def __init__(self, chat: pd.Series) -> None:
        self.model_a, self.model_b = chat["model_a"], chat["model_b"]
        self.prompt = chat["prompt"] if isinstance(chat["prompt"], list) else eval(chat["prompt"])
        self.res_a = chat["response_a"] if isinstance(chat["response_a"], list) else eval(chat["response_a"])
        self.res_b = chat["response_b"] if isinstance(chat["response_b"], list) else eval(chat["response_b"])
        self.targets = chat[TARGETS]
    
    def display(self, suppress_output: bool = False) -> None:
        if suppress_output:
            with io.capture_output() as captured:
                display(self._gen_html())
        else:
            display(self._gen_html())
    
    def _gen_html(self) -> HTML:
        html_content = f"""
            <script type="module" src="https://md-block.verou.me/md-block.js"></script>
            <html>
                <lm-chat-body>
                    <div class="lm-chat-container">
                        <div class="lm-chat-panels">
                            {self._gen_panel("a")}
                            {self._gen_panel("b")}
                        </div>
                        <div class="lm-chat-winner">
                            {self._get_winner()}
                        </div>
                    </div>
                </lm-chat-body>
            </html>
        """.encode("utf-16", "surrogatepass").decode("utf-16")
        
        html = HTML(self.CSS + html_content)
        
        return html
            
    def _gen_panel(self, model: str) -> str:
        res = self.res_a if model == "a" else self.res_b
        model_name = self.model_a if model == "a" else self.model_b
        panel = ""
        for p, r in zip(self.prompt, res):
            panel += f"""
                <div class="lm-chat-prompt">
                    <md-block>{p}</md-block>
                </div>
                <div class="lm-chat-response">
                    <md-block>{r}</md-block>
                </div>
            """
        panel = f"""
            <div class="lm-chat-panel">
                <div class="lm-chat-model-header">Model {model.upper()} - {model_name}</div>
                {panel}
            </div>
        """

        return panel
    
    def _get_winner(self) -> str:
        if self.targets["winner_model_a"] == 1:
            winner = f"Model A - <strong>{self.model_a}</strong> Wins!"
        elif self.targets["winner_model_b"] == 1:
            winner = f"Model B - <strong>{self.model_b}</strong> Wins!"
        else:
            winner = "Tie!"
        
        return winner

In [16]:
chat_renderer = ChatRenderer(train.iloc[0])
chat_renderer.display()

## <a id="feat_and_tgt"></a>
## 2. Feature and Target Exploration
We've already known what information is included in one sample. Next, we're going to take a deep dive into each feature!

<a id="model_pair"></a>
### *Model Pair - (`model_a`, `model_b`)*

Let's focus on LLM compositions for both `model_a` and `model_b`. Observations are summarized as follows,
1. All model identities in the training set are provided (*i.e.,* no missing or empty entry exists).
2. Model compositions of `model_a` and `model_b` are slightly different.
    * Both have the same 64 unique models.
    * Both have the same top-4 most frequent model identities, `gpt-4-1106-preview --> gpt-3.5-turbo-0613 --> gpt-4-0613 --> claude-2.1`.
    * The 5-th most frequent model is `gpt-4-0314` for `model_a` and `claude-instant-1` for `model_b`.
3. Most models account for less than 2%, which are combined into a single `Others` class for better visualization.

In [17]:
unknown_model_a = train["model_a"].isna().sum() + (train["model_a"] == "").sum()
unknown_model_b = train["model_b"].isna().sum() + (train["model_b"] == "").sum()
print(f"There exist {unknown_model_a} unknown model_a and {unknown_model_b} unknown model_b identities.")

There exist 0 unknown model_a and 0 unknown model_b identities.


In [18]:
def _plot_pie(
    data: np.ndarray,
    labels: List[str], 
    title: str,
    combine_minority: bool = False,
    thres: float = 0.01, 
    max_labels: int = 10,
    ax: Optional[Axes] = None,
) -> None:
    tot = sum(data)
    max_idx = np.argmax(data)
    if combine_minority:
        major_data = [(d, l) for d, l in zip(data, labels) if d / tot >= thres]
        minor_data = [(d, l) for d, l in zip(data, labels) if d / tot < thres]
        
        if len(minor_data) != 0:
            # Add one "Others" class to store minority classes
            minor_tot = sum(d for d, l in minor_data)
            major_data.append((minor_tot, "Others"))
        
        data, labels = map(list, zip(*major_data))
        max_idx = np.argmax(data[:-1])
        
    # Sort data/labels
    # ...
    explode = [0.1 if i == max_idx else 0 for i in range(len(data))]    

    if ax is None:
        fig, ax = plt.subplots(figsize=(12, 8))
    if len(data) > max_labels:
        visible_labels = labels[:max_labels] + [""] * (len(data) - max_labels)
    else:
        visible_labels = labels
    patches, texts = ax.pie(data, startangle=140, colors=colors, explode=explode, labels=visible_labels)
    ax.legend(patches, labels, title="Model Identity", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))
    ax.set_title(title, fontsize=20)
    if ax is None:
        plt.show()

NameError: name 'np' is not defined