In [None]:
## IMPORTANT: On Colab, we expect your homework to be in the cs189 folder
## Please contact staff if you encounter any problems with installing dependencies
import sys
IS_COLAB = 'google.colab' in sys.modules
if IS_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    %cd /content/drive/MyDrive/cs189/hw/hw2
    %pip install -r ./requirements.txt
    !pip install -U kaleido plotly
    import kaleido
    kaleido.get_chrome_sync()

import plotly.io as pio
pio.renderers.default = pio.renderers.default + "+png"


In [None]:
# Initialize Otter
import otter
# Autograder handle for this notebook; the `grader.check(...)` cells below call into it.
grader = otter.Notebook("arena_style_control.ipynb")

<link rel="stylesheet" href="berkeley.css">

<h1 class="cal cal-h1">Homework 02 – Welcome to the Arena (Style Control)</h1>

CS 189, Fall 2025

In this homework you will get more experience with logistic regression in two very different settings: creating leaderboards and predicting model responses.

We will be taking real data from [LMArena](https://lmarena.ai/), a popular platform for crowdsourcing evaluations of large language models, and recreating their leaderboards, with a few fun extra steps along the way.

The chats can be viewed interactively by accessing [ChatBot-Arena-Viewer](https://huggingface.co/spaces/BerkeleyML/Chatbot-Arena-Viewer) through hugging face. Much of the first half of this homework was first written by Prof Gonzalez back when his students first started the project, and now LMArena is a standard evaluation for large language models and turned into a company! Don't let anyone tell you logistic regression isn't valuable, it's worth at least $600 Million.
    
---


## Due Date: Friday, October 17, 11:59 PM

This assignment is due on **Friday, October 17, 11:59 PM**. You must submit your work to Gradescope by this deadline. Please refer to the syllabus for the [Slip Day policy](https://eecs189.org/fa25/syllabus/#slip-days). No late submissions will be accepted beyond the details outlined in the Slip Day policy.

### Submission Tips
- **Plan ahead**: We strongly encourage you to submit your work several hours before the deadline. This will give you ample time to address any submission issues.
- **Reach out for help early**: If you encounter difficulties, contact course staff well before the deadline. While we are happy to assist with submission issues, we cannot guarantee responses to last-minute requests.
      
<!-- --- -->

### Key Learning Objectives

In this homework you will build on the previous warmup section, implementing the Bradley-Terry ranking used in the actual Arena and taking account of style controls for model rank. In particular, you will:
1. Apply the Bradley–Terry model to build leaderboards
2. Practice analyzing conversational data and extracting stylistic features  
3. Build custom features (length, punctuation, phrase presence, etc.) and integrate them into ranking models  
4. Explore confounding stylistic variables in LLM evaluation (style vs. content)  
5. Apply pairwise evaluation methods to understand how style affects outcomes  
  
---

### Collaboration Policy
You are encouraged to discuss high-level concepts with your peers. However:
- All submitted work must be written in your own words and code.
- Do not share or copy solutions directly.
- List any collaborators (students you worked with) in the line below:

**Your Collaborators**: **TODO**

### AI Tools Usage Disclosure
We allow the use of AI tools (e.g., ChatGPT, Copilot) **only as support**, not as a replacement for your own reasoning. To ensure transparency, you must acknowledge any use of AI tools.

Please complete one of the following:
- **A) I did not use any AI tools for this homework.**
- **B) I used AI tools in the following way(s):**  
  (describe briefly, e.g., “Used ChatGPT to get hints for debugging a NumPy indexing error”)


**Your Answer**: **TODO**
    
---

### Grading Breakdown

| Question | Manual Grading? | Points |
|----------|-----------------|--------|
| q4a      | No              | 2      |
| q4b      | No              | 2      |
| q4c      | Yes             | 2      |
| q5a      | No              | 2      |
| q5b      | No              | 2      |
| q6a      | No              | 2      |
| q6b      | No              | 2      |
| q6c      | No              | 2      |
| q6d      | No              | 2      |
| q7a      | No              | 2      |
| q7b      | No              | 2      |
| q8a      | No              | 2      |
| q8afrq   | Yes             | 2      |
| q8b      | No              | 2      |
| q8c      | No              | 2      |
| q8d      | No              | 2      |
| q9a      | No              | 2      |
| q9b      | Yes             | 4      |
| q9c      | Yes             | 6      |
| **Total**|                 | **44** |


In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotting_utils import plot_rank_heatmap, plot_style_features
# Seed NumPy's global random number generator with 189 so random draws repeat across runs.
np.random.seed(189)

In [None]:
# ! pip install ipywidgets
# from huggingface_hub import notebook_login
# notebook_login()

In [None]:
from datasets import load_dataset
# Login using e.g. `huggingface-cli login` to access this dataset (this will take a few minutes to download)
ds = load_dataset("lmarena-ai/arena-human-preference-100k")
# Convert the 'train' split to a pandas DataFrame; later cells read its
# `model_a`, `model_b`, `winner` and `dedup_tag` columns.
battles = ds['train'].to_pandas()

In [None]:
# Keep only the rows whose `dedup_tag` dict has a truthy "sampled" entry;
# rows where the key is missing default to False and are dropped.
print("Before dedup: ", len(battles))
battles = battles[battles["dedup_tag"].apply(lambda x: x.get("sampled", False))]
print("After dedup: ", len(battles))

## Initialize with HW2 Warmup

Fill in the cells in this section with your implementations from part 1 of the homework.

#### FILL IN: HW2 Part 1 Question 1B Block

In [None]:
# #TODO: FILL IN from your solution for Question 1B
# models = ...
# selected_models = ...
# selected_battles = ...
# selected_battles_no_ties = ...

##  **Question 4: Model Strengths**

In the earlier part of the homework, we calculated the Average Model Win-Rate.

However, this method is not ideal for our use case where battle counts per model are not equal. For instance, if ChatGPT-4o-latest battled more often with weaker models, it would have a high win rate without being an actually stronger model. Now let's explore how we can instead *learn* these model strengths.

**To recap,** we want to construct a leaderboard by assigning a strength score $S_m$ to each model $m \in \{1,...,M\}$, such that:
- The ranking reflects the probability of one model winning against another.
- For any pair of models A and B, the probability that A beats B, should depend on the *difference* in their strengths: $S_A - S_B$. Why the difference? Since we are measuring pairwise preference, there is no absolute measure of strength but rather a model's strength *relative* to other models.

**Formally, we want a function $f$ such that**
- For models A and B with scores $S_A$ and $S_B$, we want:
  $$ P(\text{A beats B}) = f(S_A - S_B) $$
- The function $f$ should be increasing (bigger skill gap, higher win chance), and always output a probability between 0 and 1.

At this point, a natural question is: what should we choose for the function $f$? A standard and effective choice is the logistic (sigmoid) function:

$$ P(\text{A beats B}) = \sigma(S_A - S_B) = \frac{1}{1 + e^{-(S_A - S_B)}} $$

Notice that this is exactly the same form as logistic regression, where the model scores are the parameters to be learned. In other words, learning model strengths from pairwise outcomes is equivalent to fitting a logistic regression model to the data.

So, we can use logistic regression to learn the model strengths that best explain the observed battle outcomes. The higher a model's score, the more likely it is to win against others. The methodology is called the [Bradley-Terry](https://en.wikipedia.org/wiki/Bradley%E2%80%93Terry_model) model and is the underlying theory to other common scoring systems like ELO ratings.

#### Step 1: Understanding the formulation of the problem as features

To learn these model strengths, we need to prepare our data in a form suitable for logistic regression. Recall that each battle involves two models: A and B. One of them wins (for simplicity we still start by removing any battles that end in ties).

We want to convert this into:

1. A feature vector indicating which two models were involved.
2. A label representing the winner.

Each row produces two training examples:

1. One with model A as +1 and model B as -1, and a label denoting if model A wins
2. Another with model B as +1 and model A as -1, and a label denoting if model B wins

Let's take a look at an example:

`row = {'model_a': 'gpt-4o-2024-05-13', 'model_b': 'claude-3-opus-20240229', 'winner': 'model_a'}`

This generates two features, which are...

**Feature 1:** [1, -1]
* +1 at index 0 (gpt-4o)
* -1 at index 1 (claude-3-opus)
* Label: 1 because model A (gpt-4o) won

**Feature 2:** [-1, 1]
* -1 at index 0 (gpt-4o)
* +1 at index 1 (claude-3-opus)
* Label: 0 because model B (claude-3-opus) lost

Why do we have to do this?
This lets the model account for both directions of the comparison:


$$ P(\text{GPT-4o beats Claude-3-opus}) = \sigma(S_{\text{GPT-4o}}  - S_{\text{Claude}})$$



$$ P(\text{Claude-3-opus beats GPT-4o}) = \sigma(S_{\text{Claude}} - S_{\text{GPT-4o}}  )$$


**Stop and Think:** When we have more than two models, how should we handle the models that were not involved in the battle?

---

#### Step 2: Constructing generalized features and labels

Cool, now we want to generalize this formulation to the pair of not only GPT-4o and Claude-3-opus, but all the models.

Once we have turned all battle outcomes into feature vectors, we can organize them into a **feature matrix** $\mathbf{X}$ and a **label vector** $\mathbf{y}$.

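We have the model strengths we want to learn:

$$
\mathbf{S} =
\begin{bmatrix}
S_A \\
S_B \\
S_C
\end{bmatrix}
$$

And we want our model to predict:

$$
\sigma(\mathbf{X} \cdot \mathbf{S}) =
\sigma\left(
\begin{bmatrix}
1 & 0 & -1 \\
-1 & 1 & 0 \\
0 & 1 & -1
\end{bmatrix}
\begin{bmatrix}
S_A \\
S_B \\
S_C
\end{bmatrix}
\right) =
\begin{bmatrix}
\sigma(S_A - S_C) \\
\sigma(S_B - S_A) \\
\sigma(S_B - S_C)
\end{bmatrix}
$$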



As a recap...

- $\mathbf{X}$ encodes **who played whom** and in what direction.
- $\mathbf{S}$ are the model strengths we are trying to learn.
- $\hat{\mathbf{y}} = \sigma(\mathbf{X} \cdot \mathbf{S})$ gives us the predicted win probabilities, which we fit to the observed labels $\mathbf{y}$.

Phew, that was a lot. Now let's try to actually featurize these battles and labels!

However, before analysis, let's deduplicate common prompts like "hi" and "hello" to ensure they don't overly influence the leaderboard.

In [None]:
# Keep only the rows whose `dedup_tag` dict has a truthy "sampled" entry.
# This repeats the filter applied right after loading the data; on an
# already-filtered frame every row passes, so both counts print the same value.
print("Before dedup: ", len(battles))
battles = battles[battles["dedup_tag"].apply(lambda x: x.get("sampled", False))]
print("After dedup: ", len(battles))

Also, let's focus on the top 20 models like we have done for 1a

In [None]:
# Model names ordered by how often each appears as `model_a` (value_counts lists the most frequent first).
models = battles.model_a.value_counts().index.tolist()
# Keep the 20 most frequent models.
selected_models = models[:20]
# Battles in which BOTH sides are among the selected models.
selected_battles = battles[battles['model_a'].isin(selected_models) & battles['model_b'].isin(selected_models)]
# Drop every battle whose `winner` string contains "tie".
selected_battles_no_ties = selected_battles[~selected_battles["winner"].str.contains("tie")]

In [None]:
# Display the 20 selected model names.
selected_models

## **Question 4a**

In order to train our model, we should first featurize our battles as discussed before.

**Task:** Implement the function below to transform `selected_battles_no_ties` and `selected_models` into feature vectors and labels. This will allow us to represent each battle as input-output pairs for training.

In [None]:
def turn_into_features(df, models):
    '''
    Convert pairwise battle results into feature matrix X and label vector y
    suitable for logistic regression based on the Bradley-Terry model

    Args:
        df: DataFrame of battles; each row names the two competitors in
            'model_a' and 'model_b' and records which side won.
        models: List of model names. Presumably a model's position in this
            list is its column index in X (verify against the autograder).

    Returns:
        (X, y) as numpy arrays, built as described in the TODO below: one
        +1 / -1 / 0 feature row and one 0/1 label per battle.
    '''
    # TODO:
    # 1. Iterate through each row in the DataFrame.
    # 2. For each battle, create a feature vector:
    #    - Assign +1 to the column corresponding to 'model_a'.
    #    - Assign -1 to the column corresponding to 'model_b'.
    #    - All other columns should be 0.
    # 3. Append the label:
    #    - 1 if 'model_a' is the winner.
    #    - 0 if 'model_b' is the winner.
    # 4. Return the feature matrix X and label vector y as numpy arrays.
    ...
    return np.array(X), np.array(y)

# Featurize the tie-free battles between the selected models and show the resulting shapes.
X, y = turn_into_features(selected_battles_no_ties, selected_models)
X.shape, y.shape

In [None]:
grader.check("q4a")

## **Question 4b**
Now that we have extracted out the features from the previous question, let's now dive into actually building the model. 

**Task:** 
Train the model with the features and labels created in Question 4a, and store the strengths, sorted in the order of scores in `results_df`. 

In [None]:
from sklearn.linear_model import LogisticRegression


model = ...
scores = ...

results = {"Model": selected_models, "Score": scores}
results_df = pd.DataFrame(results).sort_values("Score", ascending=False).reset_index(drop=True)
results_df

In [None]:
grader.check("q4b")

<!-- BEGIN QUESTION -->

## **Question 4c**

Let's think about an important aspect of our formulation.

**Task:** Answer the following question: Why don't we need an intercept for the logistic regression formulation above?

<!-- END QUESTION -->

# **Question 5: Confidence Intervals**

From the previous question, we were able to train the model and obtain the scores of the models. 

However, when comparing model scores, it's important to understand not just the average performance, but also how much uncertainty there is in our estimates. Our rankings are based on a finite sample of battles, and if we had collected a different set of match-ups, the resulting scores could be different. This sampling variability means that our estimated model strengths are subject to noise.

Bootstrapping is a powerful, intuitive way to assess this uncertainty without making strong assumptions about the underlying data. By repeatedly resampling our observed battles (with replacement) and retraining the model on each resampled dataset, we simulate what might have happened if we had observed a slightly different set of battles. For each resample, we get a new set of model scores. By looking at the distribution of these bootstrapped scores, we can estimate confidence intervals for each model's strength.

In short, bootstrapping helps us answer: "If we repeated this evaluation process many times, how much could each model's score vary just due to random chance in which battles we happened to observe?" This gives us a more honest sense of which differences in model scores are robust, and which might just be due to luck.

## **Question 5a**

Let's implement a function that returns these scores and confidence intervals after bootstrapping.

**Task:** 
* Bootstrap the samples to train a new logistic regression model.
* Store each set of coefficients (or learned model strengths).
* Compute the mean and percentiles (2.5th and 97.5th) to obtain the 95% confidence intervals.
* Return i) results_df, ii) mean_scores, iii) confidence_intervals.

An example output of results_df is below.

<div style="text-align: center;">
  <img src="https://imgur.com/ZmC1PhW.png" alt="WarmupPairwisePlot" style="display: block; margin-left: auto; margin-right: auto; width: 80%;">
</div>

In [None]:
def get_bootstrapped_score(X, y, models, category_name="Overall", n_bootstrap=25):
    """
    Bootstraps logistic regression model scores to estimate confidence intervals.
    Args:
        X: Feature matrix
        y: Labels
        models: List of model names (order matches columns of X)
        category_name: Name of the leaderboard category being scored
            (default "Overall"). NOTE(review): presumably meant to be recorded
            in a 'Category' column of results_df -- confirm against the autograder.
        n_bootstrap: Number of bootstrap samples
    Returns:
        results_df: DataFrame with Model, Average Score, Lower Bound, Upper Bound
        mean_scores: Mean of bootstrapped scores (np.array)
        confidence_intervals: 2.5 and 97.5 percentiles (np.array shape [2, n_models])
    """
    #TODO

    np.random.seed(189)  # for reproducibility
    bootstrap_scores = []
    for i in range(n_bootstrap):
        # Draw as many row indices as there are rows, with replacement.
        indices = np.random.choice(len(X), size=len(X), replace=True)
        ...
        model = ...
        model.fit(...)
        ...
    ...
    mean_scores = ...
    ...
    # Entries are comma-separated so the cell parses once each `...` is replaced.
    results = {
        "Model": ...,
        "Average Score": ...,
        "Lower Bound": ...,
        "Upper Bound": ...,
    }
    results_df = pd.DataFrame(results)
    return results_df, mean_scores, confidence_intervals
results_df, mean_scores, confidence_intervals = get_bootstrapped_score(X, y, selected_models, n_bootstrap=25)

# Test that confidence intervals make sense
assert (confidence_intervals[0] <= confidence_intervals[1]).all(), "Every lower bound must be <= upper bound."
assert ((confidence_intervals[0] <= mean_scores) & (mean_scores <= confidence_intervals[1])).all(), "Each mean score should lie within its CI."

In [None]:
grader.check("q5a")

### Now let's visualize the intervals! *🧙*

In [None]:
# Recompute the bootstrapped scores so this cell plots freshly computed values.
results_df, mean_scores, confidence_intervals = get_bootstrapped_score(X, y, selected_models, n_bootstrap=25)
fig = go.Figure()

# Use the sorted values from results_df for plotting
fig.add_trace(go.Scatter(
    x=results_df["Model"],
    y=results_df["Average Score"],
    mode='markers',
    name='Model Scores',
    marker=dict(size=5, color='blue'),
    # Asymmetric error bars: the distance from the average up to the upper
    # bound and down to the lower bound.
    error_y=dict(
        type='data',
        array=results_df["Upper Bound"] - results_df["Average Score"],   # Upper error
        arrayminus=results_df["Average Score"] - results_df["Lower Bound"],  # Lower error
        visible=True
    )
))

fig.update_layout(
    title='Model Performance Scores with 95% Confidence Intervals (Sorted by Mean Score)',
    xaxis_title='Models',
    yaxis_title='Score',
    xaxis=dict(tickangle=45),
    height=500
)

fig.show()

## **Question 5b**

Now that we have confidence intervals, we can assign a rank to each model. We want the rank of model $i$ to be one plus the number of models that are **confidently better** than model $i$, so a model that no other model confidently beats has rank 1.

When we say model A is **confidently better** than model B, it will mean that model A's lower bound is still greater than model B's upper bound. Remember that greater rank means that there are more models that perform better than the current model.

**Task:**
Implement the `assign_rank` function below that assigns rank to the model.

In [None]:
# NOTE(review): the default `df=results_df` is evaluated once, when this `def`
# runs, so it keeps pointing at the `results_df` object that existed at that
# moment. Pass `df` explicitly (as the call below does) to rank against a
# different leaderboard.
def assign_rank(row, df=results_df):
    """
    Input:
        row : pd.Series
            A row of the DataFrame (representing a model's metrics).
        df : pd.DataFrame (default = results_df)
            DataFrame containing model performance with 'Lower Bound' and 'Upper Bound'.

    Output:
        int : The rank of the model, defined as (# of models confidently better) + 1.
    """

    count = ...
    return ...


# Rank every model against the full leaderboard, then list the best (lowest) ranks first.
results_df['Rank'] = results_df.apply(lambda r: assign_rank(r, results_df), axis=1)
results_df = results_df.sort_values(by="Rank", ascending=True)
results_df

In [None]:
grader.check("q5b")

In [None]:
# Draw the rank heatmap with the helper imported from plotting_utils.
fig = plot_rank_heatmap(results_df)
fig.show()

<!-- END QUESTION -->

> **NOTICE BEFORE YOU PROGRESS (Q6 Data)**
> - If you accidentally modify `selected_battles_no_ties` in a way that breaks later parts, double check and **reset it** using the initial block of code you have placed.


# **Question 6: Category Leaderboards**

So far, we have computed overall model rankings using all available battles.
However, models may perform differently in specific categories, such as creativity, technical_accuracy, instruction_following, or math.
Now that we know how to get rankings, let's see what the leaderboards look like for certain categories.

Breaking this down, we want to do the following:

1. For each category we are interested in, filter the battles to only those belonging to a given category.
2. Compute bootstrapped confidence intervals for model strengths in that category.
3. Rank the models within that category.
4. Combine category-specific ranks with the overall leaderboard into a single DataFrame.

### Function Reference For Q6

Below is a summary of the functions you have already implemented that might be helpful. Remember that the **overall leaderboard** (used in Q6d) should be a DataFrame named `results_df` with a unique **`Model`** column and an overall **`Rank`**.

| Function | Inputs (types) | Output | One-liner purpose | Where you’ll use it |
|---|---|---|---|---|
| `turn_into_features` | `df_filtered: pd.DataFrame`, `selected_models: list[str]` | `X, y` | Build model-vs-model feature matrix `X` and labels `y` from filtered battles. | Q6a, Q6d |
| `get_bootstrapped_score` | `X`, `y`, `selected_models: list[str]`, `category_name: str`, `n_bootstrap: int` | `results_df, mean_scores, confidence_intervals` | Bootstrap model strengths; returns per-model scores + confidence intervals. | Q6a, Q6d |
| `assign_rank` | `row: pd.Series` (row of `results_df`), `df: pd.DataFrame` | `int` | Compute a row's rank from the confidence bounds in `df`. | Q6a, Q6d |

## **Question 6a**

Let's start with the first step described above! 

Specifically, we will implement a general function that returns a DataFrame filtered to the battles belonging to a given mask, computes model scores using bootstrapping, assigns ranks, and finally returns the results sorted by rank. This function will allow us to conveniently obtain the score for any specific category.

---

**Task:**
Implement a function `get_category_results` that does the following:

- Takes in a DataFrame of battles, a boolean filter mask for a category, and a list of models.  
- Filters the battles to those in the category.
- Turns the filtered battles into features using `turn_into_features`
- Computes bootstrapped scores for the selected models using `get_bootstrapped_score`
- Assigns ranks based on model performance.  
- Returns a DataFrame sorted by ascending rank (best model first).  


**Parameters:**
- **`df` (pd.DataFrame)**  
  The full battles DataFrame. Each row corresponds to a single head-to-head battle between two models, along with metadata such as the category of the prompt (e.g., `"math"`, `"coding"`, `"writing"`).  

- **`filter_mask` (pd.Series[bool])**  
  A boolean array (same length as `df`) that indicates which rows to keep.  
  - Example: `filter_mask = (df["category"] == "math")` produces a Series of `True`/`False` values.  
  - When applied as `df.loc[filter_mask]`, only rows where the mask is `True` are kept.  
  - This lets us focus only on battles from a specific category.  

- **`selected_models` (list[str])**  
  A list of model names (strings) to evaluate and compare. The function will restrict bootstrapped scoring and ranking to this set.  
  - Example: `["gpt-4", "llama-2", "claude-3"]`.  

- **`n_bootstrap` (int, optional, default = 25)**  
  The number of bootstrap resamples to use when estimating model scores. Larger values give more stable estimates but take longer to compute.  


In [None]:
def get_category_results(df, filter_mask, selected_models, category_name="Overall", n_bootstrap=25):
    """
    Build the ranked leaderboard for one category of battles.

    Given:
      - df (pd.DataFrame): The full DataFrame of battles.
      - filter_mask (pd.Series[bool]): A boolean mask selecting rows belonging 
        to a specific category (e.g., df["category"] == "math").
      - selected_models (list[str]): A list of model names to include.
      - category_name (str, optional): Name of the category (default = "Overall").
        NOTE(review): presumably forwarded to get_bootstrapped_score and/or
        stored in a 'Category' column -- confirm against the autograder.
      - n_bootstrap (int, optional): Number of bootstrap resamples (default = 25).

    Returns:
      pd.DataFrame: A ranked DataFrame of models with bootstrapped scores,
      sorted by ascending rank (best model first).
    """

    filtered_df = ...
    ...
    ...
    results_df['Rank'] = ...
    return ...
    

In [None]:
grader.check("q6a")

## **Question 6b**

We computed the score for each of the categories in the previous question. Ultimately, we want to compute the intervals for model strengths in these new categories like we originally did before. To achieve this, we first need to extract out the relevant characteristic of each of the battles (whether it is creative, has certain technical accuracy, etc.).

**Task:**
Using the `selected_battles_no_ties` DataFrame, create four new boolean columns that indicate whether each battle belongs to a given category:
* creative
* technical_accuracy
* instruction_following
* math

These columns should be derived from the nested dictionary in the `category_tag` column.
We are going to make a copy first to avoid pandas SettingWithCopyWarning.

**Hint:** Each of these categories is stored inside a specific subkey (e.g., "criteria_v0.1" or "math_v0.1") within `category_tag`.

In [None]:
# extract category columns

#TODO
# We are going to make a copy first to avoid pandas SettingWithCopyWarning.
selected_battles_no_ties = selected_battles_no_ties.copy()
# One boolean column per category: 'creative' here, then 'technical_accuracy',
# 'instruction_following' and 'math' in the placeholders below.
selected_battles_no_ties.loc[:, 'creative'] = ...
...
...
...

In [None]:
# Memory cleanup: drop large dataframes no longer needed.
# The names are popped from globals() instead of using `del` so that this cell
# is safe to re-run: a second run finds them already gone instead of raising
# NameError.
import gc
globals().pop("battles", None)
globals().pop("selected_battles", None)
gc.collect()

In [None]:
grader.check("q6b")

## **Question 6c**

Now that we have obtained the relevant characteristics of each battle, let's define a filter that extracts the battles we want for each category. 

**Task:**
Define the category filters for each of the categories.
Specifically, using the `selected_battles_no_ties` DataFrame, create a dictionary called `category_filters` that maps each category name to a boolean mask selecting only the battles in that category.

Your dictionary should include filters for:
*  'english' (battles where language is "English")
*  'coding' (battles where is_code is True)
*  'creative' (battles where creative is True)
*  'instruction_following' (battles where instruction_following is True)
*  'math' (battles where math is True)
*  'technical_accuracy' (battles where technical_accuracy is True)

In [None]:
# Maps each category name to a boolean mask over the rows of selected_battles_no_ties.
# Replace the `...` placeholders with the remaining "name: mask" entries.
category_filters = {
    'english': selected_battles_no_ties['language'] == 'English',
    ...
    ...
    }

In [None]:
grader.check("q6c")

## **Question 6d**


Now that we have all the filters defined, let's compute the **per-category leaderboards** and combine them with the **overall leaderboard** in tidy data format.

**Task:**
Compute per-category leaderboards using `get_category_results` and combine them into a single tidy DataFrame.

1. **Overall leaderboard:** Use `get_category_results` with a mask that includes all battles (all `True` values) to generate the overall leaderboard with category name "Overall".

2. **Per-category leaderboards:** For each category in `category_filters`, use `get_category_results` to build a DataFrame containing `Model`, `Category`, `Average Score`, `Lower Bound`, `Upper Bound`, and `Rank`.

3. **Tidy format combination:** Concatenate all category results into a single DataFrame with columns `['Model', 'Rank', 'Category']` where:
   - Each row represents one model's performance in one category
   - The `Category` column identifies which category the rank belongs to (e.g., "Overall", "english", "coding", etc.)

4. **Sort final table:** Sort each category dataframe by Rank then sort the entire dataframe by Category (both ascending).

**Example Output:**

| Model               | Rank | Category              |
|---------------------|-----:|----------------------:|
| chatgpt-4o-latest   | 1    | Overall               |
| gemini-1.5-pro-exp-0801 | 2    | Overall               |
| gpt-4o-2024-05-13   | 3    | Overall               |
| chatgpt-4o-latest   | 1    | english               |
| gemini-1.5-pro-exp-0801 | 2    | english               |
| gpt-4o-2024-05-13   | 4    | english               |
| chatgpt-4o-latest   | 1    | coding                |
| gpt-4o-2024-05-13   | 2    | coding                |

This is in the same tidy format as the previous part of the homework.

In [None]:
#TODO
# Create tidy format by concatenating all category results (including overall)
# We have provided the overall mask for you

# All-True mask aligned to the battles' index, so it selects every battle.
overall_mask = pd.Series([True] * len(selected_battles_no_ties), index=selected_battles_no_ties.index)

category_results_df = ...

In [None]:
grader.check("q6d")

Amazing! Now let's plot the heatmap showing rank across the categories 🧙. 

In [None]:
# Draw the per-category rank heatmap with the helper imported from plotting_utils.
fig = plot_rank_heatmap(category_results_df, title="Model Rankings by Category")
fig.show()

**Something to ponder upon:** We see model rankings can change a lot depending on the type of question being asked. Sometimes these make sense, like how deepseek coder gets much higher rankings on coding problems, but sometimes it isn't clear why one model does better than another. Especially for things like creative tasks, why do people like Gemini 1.5 so much more than Claude 3.5?

# **Question 7: Ranking Influences**

One thing that has been known to affect user preference is response length: people (and LLM's) tend to prefer longer answers. A recurring observation in human grading and UX is that **longer responses are often preferred**. For example, analyses from the SAT essay reported that **essay length strongly correlated with higher scores—even when errors were present** ([New York Times, 2005](https://www.nytimes.com/2005/05/04/education/sat-essay-test-rewards-length-and-ignores-errors.html)). 

In the context of LLM evaluations, this motivates a core question: **does response length systematically tilt battle outcomes and model rankings?**


Let's investigate whether length plays a role in model rankings. First let's do some quick analysis on the response length per model

## **Question 7a**

We want to analyze whether **response length (in tokens)** is related to model rankings.

In `per_model_battles` (which is what you would implement in Q7b), the **`conversation`** column contains, for each row, a *single exchange* between a user and a model (one battle). It is represented as a **list of message dictionaries**. These dictionaries are representing a full exchange between a user and a model in a single battle. The number of turns is `len(row['conversation']) / 2`.

**Each message dictionary contains (as provided):**
1. `"content"` – the text of the message  
2. `"role"` – either `"user"` or `"assistant"`. In our question we will be focusing on `"assistant"`

---

#### What exactly is `conv`?

For this question, assume your function will receive **`conv`**, which is a **dictionary** with a single key `"conversation"` mapping to a **list of message dictionaries**:

```python
conv_example = {
    "conversation": [
        {"role": "user", "content": "How do I sum a list in Python?"},
        {"role": "assistant", "content": "Use the built-in function: sum(your_list)."}
    ]
}
# conv_example["conversation"]  -> list of message dicts, ordered by turns
# Each dict has:
#   - "role": "user" or "assistant"
#   - "content": str (message text)
```


**Task:**
Implement a function `calculate_response_length` that, given a conversation `conv`, returns the total number of GPT-2 tokens in the concatenation of all **assistant messages** in that conversation.

**Requirements:**

1. Use the tiktoken library with the "gpt2" encoding. Import tiktoken.

2. Concatenate only the assistant messages and count tokens.

3. Call `enc.encode(..., disallowed_special=())` to allow all special tokens (avoids ValueError, e.g., for <|endoftext|>).

4. Concatenate all assistant role messages separated by two newlines ("\n\n") before counting tokens (i.e., join them with this separator).

In [None]:
import tiktoken
def calculate_response_length(conv):
    """
    Count the GPT-2 tokens in the assistant side of one conversation.

    TODO (student): implement according to the Question 7a requirements.

    Args:
        conv: dict whose "conversation" key maps to a list of message dicts,
            each with a "role" ("user" or "assistant") and a "content" string.

    Returns:
        int: number of tokens, under tiktoken's "gpt2" encoding, in all
        assistant messages joined by two newline characters.
    """
    ...
    ...
    ...

In [None]:
grader.check("q7a")

## **Question 7b**

In the previous question (Q7a), you wrote a function to compute the **token length** of a model's reply from a conversation. We’ll now **reshape** the battle-level data so that each row corresponds to a **single model’s response in a single battle** (instead of one row per battle).

In the `selected_battles_no_ties` DataFrame, each row represents a battle between two models, with:

*   conversation_a = the conversation for model_a in that battle
*   conversation_b = the conversation for model_b in that battle


To analyze response length per model, we would want a table where each row corresponds to a single model's response in a single battle (rather than one row per battle).

Specifically, our goal is to turn each battle row into **two rows** (tidy format):
1. one for `model_a` using `conversation_a`
2. one for `model_b` using `conversation_b`

**Task:**
Using the function defined in Question 7a, create a DataFrame named `per_model_battles` with columns:

1. **`conversation`** — the list of message dicts for that model’s side of the battle  
2. **`model`** — the model name  
3. **`response_length`** — integer token count of all assistant messages concatenated (computed via **`calculate_response_length`** from Q7a)
> **Hint:** `pd.concat` might be handy for stacking the A-side and B-side tables into one.  
> Docs: https://pandas.pydata.org/docs/reference/api/pandas.concat.html

In [None]:
# TODO:
# Do NOT use .copy() on large DataFrames here; it will cause RAM errors in the autograder.
# battles_a / battles_b: the A-side and B-side tables (one row per battle each).
battles_a = ...
battles_b = ...
...
...
...
...
...
# Add the token-length column computed with calculate_response_length from Q7a.
per_model_battles[...] = ...
per_model_battles.head()

In [None]:
grader.check("q7b")

In [None]:
# Let's take a look at the structure
# print(per_model_battles['conversation'].iloc[0])

Let's plot the response length for each model ordered by their rank and fit a trendline to see if there is any relation between rank and length.

In [None]:
# Model names ordered by the "Rank" column of results_df (ascending)
model_lineup = results_df.sort_values("Rank")['Model'].tolist()
# Mean response_length per model
avg_lengths = per_model_battles.groupby("model")["response_length"].mean().reset_index()
# An ordered categorical makes sort_values follow the leaderboard order rather than alphabetical order
avg_lengths["model"] = pd.Categorical(avg_lengths["model"], categories=model_lineup, ordered=True)
avg_lengths = avg_lengths.sort_values("model").reset_index(drop=True)

# Add a numeric rank column for trendline fitting
# (position in the sorted table, not the original "Rank" value)
avg_lengths["rank"] = avg_lengths.index + 1  # 1 = best, etc.

# Fit a linear trendline (polyfit) to the response length vs. rank
z = np.polyfit(avg_lengths["rank"], avg_lengths["response_length"], 1)
p = np.poly1d(z)
# Evaluate the fitted line at every rank so it can be drawn on the same x positions
trendline = p(avg_lengths["rank"])

fig = go.Figure()

# Observed average response length per model
fig.add_trace(go.Scatter(
    x=avg_lengths["model"],
    y=avg_lengths["response_length"],
    mode='lines+markers',
    name='Avg Response Length'
))

# Fitted linear trend, drawn as a dashed red line
fig.add_trace(go.Scatter(
    x=avg_lengths["model"],
    y=trendline,
    mode='lines',
    name='Trendline',
    line=dict(dash='dash', color='red')
))

fig.update_layout(
    title="Average Response Length of Models (with Trendline)",
    xaxis_title="Model (sorted by performance)",
    yaxis_title="Average Response Length",
    xaxis_tickangle=45,
    # Upper limit leaves 5% headroom above the larger of the data and the trendline
    yaxis=dict(range=[0, max(avg_lengths["response_length"].max(), trendline.max()) * 1.05])  # y-axis starts at 0
)

fig.show()

# **Question 8: Style Control**

It looks like there is a trend: models with shorter responses tend to be ranked lower. While this is not a perfect analysis, if people really do prefer models that generate longer responses regardless of their other capabilities, then it would be useful to create a leaderboard that is *length agnostic*, i.e., one whose model scores control for certain stylistic properties of the responses.

So how can we control for these stylistic factors in our model rankings?

In this question, you will implement a style feature ranking pipeline, starting with length as the only style feature.


1. Each row in `selected_battles_no_ties` represents a battle between model_a and model_b.

2. Each battle contains `conv_metadata` with pre-computed style metrics, such as bold text counts, header counts, list counts, and token counts for each side.

3. In an earlier question, each battle was converted to a pairwise feature vector, with 1 marking the selected model and -1 marking the model it was battling against. Now, our goal is to build on that representation by including both the model identity indicators and the chosen style features.


## **Question 8a**

We want to add style features other than length that will give us style feature aware ranks.


**Task:**
Implement the function `add_style_features` that reads stylistic metrics from `conv_metadata` between each model (model_a, model_b) for count of bold, header, list, and assistant tokens. Then, store the normalized differences in columns (with the designated names):

1. style_bold_count
2. style_header_count
3. style_list_count
4. style_sum_assistant_tokens

The normalized differences would be following the formulation below:

$$
\text{normdiff}(a, b) =
\begin{cases}
0 & \text{if } a + b = 0 \\[6pt]
\dfrac{a - b}{a + b} & \text{otherwise}
\end{cases}
$$


⚠️ **Important note**: Make sure you are not mutating the original DataFrame passed into your function. Work on a copy (df.copy()) and return that new DataFrame with the added columns.

Let's take a look at `conv_metadata`. Essentially, we will be collecting these elements for each of the style counts.

In [None]:
selected_battles_no_ties['conv_metadata'].iloc[0]

In [None]:
def add_style_features(df):
    """
    Adds normalized style feature difference columns to the DataFrame.
    The columns added are:
      - style_bold_count
      - style_header_count
      - style_list_count
      - style_sum_assistant_tokens

    Each column holds normdiff(a, b) for the corresponding model_a / model_b
    metric read from `conv_metadata`: 0 when a + b == 0, else (a - b) / (a + b).

    NOTE(review): the assignment requires that the caller's DataFrame is not
    mutated. The column assignments at the end of this function write into
    whatever `df` is bound to, so the TODO section must rebind `df` to a copy
    before they run.

    Returns:
        The DataFrame with the four style columns added.
    """
    def normdiff(a, b):
        # Normalized difference; the zero-denominator guard avoids division by zero
        denom = a + b
        return 0 if denom == 0 else (a - b) / denom

    ...
    # One value per row of df, appended in iteration order
    style_bold = []
    style_header = []
    style_list = []
    style_tokens = []
    for idx, row in df.iterrows():
        ...

    df["style_bold_count"] = style_bold
    df["style_header_count"] = style_header
    df["style_list_count"] = style_list
    df["style_sum_assistant_tokens"] = style_tokens
    return df

# Example usage:
# Rebinds the global name to the returned DataFrame that carries the style columns
selected_battles_no_ties = add_style_features(selected_battles_no_ties)

In [None]:
grader.check("q8a")

<!-- BEGIN QUESTION -->

## **Question 8a Free Response Question**

We’ve now added stylistic features to each model comparison.  

**Answer the following question**

```otter
YOUR ANSWER: How can integrating these features into the ranking pipeline create the effect of a “length-controlled” leaderboard, and why might this adjustment be useful?  

Think about whether raw win/loss outcomes fully capture model quality, or whether stylistic inflation (e.g., longer answers, formatting tricks) can bias rankings.
```

```otter
YOUR ANSWER:
```

<!-- END QUESTION -->

## **Question 8b**

Let's try to visualize and formulate the features we defined in the previous question in a neat way that we can see the direct relationship between the model battles and the style features.

We now want a training table where each battle produces two rows, one for each ordering of the competitors (A→B and B→A).
In other words, each row in the dataframe creates two entries in the new table.

**Task:**
Implement a function that creates the table described above. Each row should encode:
1. The model identity vector **X** (+1 at the selected model in the row, −1 at its opponent, 0 elsewhere)
2. The outcome y (win, lose)
3. Set of style covariates capturing A vs B normalized differences (e.g., length)

**NOTE:** Ensure that the features are also antisymmetric, meaning that flipping the order of the models should also flip the sign of each style feature.

Below is an example of the desired table.

| question_id                         | X                                           | y | direction | style_bold_count | style_header_count | style_list_count | style_sum_assistant_tokens |
|--------------------------------------|----------------------------------------------|---|-----------|------------------|--------------------|------------------|----------------------------|
| e8fe7c9f75ab4e528367cc7de625c475     | [0, 0, 0, 0, 0, 1, ...]  | 0 | A->B      | 1.0              | 0.0                | 1.0              | 0.07717                    |
| e8fe7c9f75ab4e528367cc7de625c475     | [0, 0, 0, 0, 0, -1 ...] | 1 | B->A      | -1.0             | -0.0               | -1.0             | -0.07717                   |


In [None]:
def make_pairwise_feature_df(df, models, style_feature_cols):
    """
    For each row in df, create two rows in the output:
      - One for A->B (original direction)
      - One for B->A (flipped direction)
    Each row contains:
      - question_id
      - X: model indicator vector (1 for model_a, -1 for model_b, 0 otherwise)
      - y: 1 if model_a wins, 0 if model_b wins
      - style features (from style_feature_cols)
      - direction: "A->B" or "B->A"
    Returns a new DataFrame with only these columns.

    For the flipped (B->A) row, X, y and every style feature are mirrored:
    the signs of X and of the style features are negated and the outcome is
    swapped, so the features stay antisymmetric.

    Args:
        df: battle-level DataFrame that already has the style feature columns.
        models: model names; presumably their order fixes the positions in X
            -- confirm against the earlier pairwise-feature question.
        style_feature_cols: names of the style columns to carry over.
    """
    ...
    # One dict per output row; turned into a DataFrame once at the end
    records = []
    for idx, row in df.iterrows():
        # X vector for A->B and B->A
        ...
        ...
        # y for A->B and B->A
        ...
        ...
        # Style features for A->B, B->A


        # Add A->B, B->A



    return pd.DataFrame(records)

style_feature_cols = [
    "style_bold_count",
    "style_header_count",
    "style_list_count",
    "style_sum_assistant_tokens"
]

pairwise_feature_df = make_pairwise_feature_df(selected_battles_no_ties, selected_models, style_feature_cols)
pairwise_feature_df.head(2)

In [None]:
grader.check("q8b")

## **Question 8c**

Amazing! 🎉 Now that we've built our pairwise identity matrix X (which model is battling which) and our style feature matrix X_style (how A and B differ stylistically), let's combine these two so that our logistic model can learn:

1. The intrinsic strength of each model (controlling for style)
2. The influence of each style feature on the outcome

**Task:**
Now, implement the function `get_sc_category_results` that:
1. Stacks these two matrices into one design matrix X_with_style, so the model can learn both intrinsic model strengths and style effects simultaneously.
2. Uses `get_bootstrapped_score` to get the ranking results.
3. Returns the results sorted by rank in ascending order. All style features should be assigned a rank of -1

In [None]:
def get_sc_category_results(df, selected_models, filter_mask = None, category_name="Overall w/ Style Control", n_bootstrap=25, style_features=style_feature_cols):
    """
    Compute a style-controlled leaderboard.

    Intended behavior (see Question 8c): stack the model identity matrix and
    the style feature matrix into one design matrix, score it with
    `get_bootstrapped_score`, and return the results sorted by rank, with
    every style feature assigned a rank of -1.

    Args:
        df: battle-level DataFrame containing the style feature columns.
        selected_models: list of model names to rank.
        filter_mask: optional filter; presumably a boolean mask selecting the
            battles of one category (see `category_filters`) -- confirm.
        category_name: label for this leaderboard.
        n_bootstrap: number of bootstrap rounds; presumably forwarded to
            `get_bootstrapped_score` -- confirm.
        style_features: names of the style columns to control for. The default
            is the list `style_feature_cols` referred to when this cell was
            run; rebinding that global later does not change the default.

    Returns:
        The results DataFrame sorted by "Rank" ascending.
    """
    # Labels of the stacked design-matrix columns: one per model, then one per style feature
    feature_labels = selected_models + style_features
    ...

    #TODO
    ...

    # rerank so the models are ranked and the style features are given a rank of -1
    return results_df.sort_values(by="Rank", ascending=True)


results_df_style_control = get_sc_category_results(selected_battles_no_ties, selected_models, category_name="Overall w/ Style Control", n_bootstrap=25)
# Stack the baseline and style-controlled leaderboards to compare them in one heatmap
combined_results_df = pd.concat([results_df, results_df_style_control])
fig = plot_rank_heatmap(combined_results_df, title="With and Without Style Control")
fig.show()

In [None]:
grader.check("q8c")

### Look at the impact of style

Now let's visualize the style feature scores with confidence intervals using our premade `plot_style_features` in `plotting_utils.py`.

In [None]:
# plot style feature coefficients
fig = plot_style_features(results_df_style_control, selected_models)
fig.show()

Here we see that length matters a LOT (in fact this coefficient is higher than the actual model coefficients), while things like bold and lists don't matter as much. 

## **Question 8d**

Let's add the stylistic features to our computation of per-category leaderboards.

**Goal:** Extend your category leaderboards from Q7d to the style-controlled setting.

**Note:** This mirrors the merge pattern you used in Q7d. Reuse that approach, but start from the style-controlled baseline (not the plain baseline).

**Task:**
Now, re-using the function `get_sc_category_results` and `category_filters` that was previously defined, define `category_style_control_results_df` that has all the bound scores for the other categories (english, coding, creative, instruction_following, math, technical_accuracy). Specifically,

1. Using `get_sc_category_results` and the provided category_filters, compute per-category results. 

2. Name the final DataFrame `category_style_control_results_df` and sort by "Rank" ascending. This should have the same structure as the previous result `combined_results_df`.

In [None]:
# 1) Use a dictionary to store the DataFrame returned by get_sc_category_results for each category
...
# 2) Combine the per-category results into one DataFrame sorted by "Rank" ascending
category_style_control_results_df = ...

In [None]:
grader.check("q8d")

In [None]:
fig = plot_rank_heatmap(category_style_control_results_df, title="With Style Control")
fig.show()

Now let's look at the delta in rankings (shift of ranking) when we apply style control across all these categories.

In [None]:
# Keep only rows whose 'Model' value is one of the selected models
# (presumably this drops the style-feature rows of the style-control results -- confirm).
style_control_models = category_style_control_results_df[
    category_style_control_results_df['Model'].isin(selected_models)
].copy()

baseline_models = category_results_df[
    category_results_df['Model'].isin(selected_models)
].copy()
# Reshape both tables to Model x Category grids of ranks
style_pivot = style_control_models.pivot(index='Model', columns='Category', values='Rank')
baseline_pivot = baseline_models.pivot(index='Model', columns='Category', values='Rank')

# Create mapping between style control and baseline categories
category_mapping = {}
for style_cat in style_pivot.columns:
    # A style-control category matches the baseline category with the same name minus the suffix
    baseline_cat = style_cat.replace(" w/ Style Control", "")
    if baseline_cat in baseline_pivot.columns:
        category_mapping[style_cat] = baseline_cat

print(f"Category mapping: {category_mapping}")

if not category_mapping:
    print("No matching categories found between style control and baseline results")
else:
    # Only models present in both tables can be compared
    common_models = list(set(style_pivot.index) & set(baseline_pivot.index))
    print(f"Comparing {len(common_models)} models across {len(category_mapping)} categories")
    # Both aligned tables use the baseline category names as columns so they can be subtracted
    style_aligned = pd.DataFrame(index=common_models)
    baseline_aligned = pd.DataFrame(index=common_models)
    for style_cat, baseline_cat in category_mapping.items():
        style_aligned[baseline_cat] = style_pivot.loc[common_models, style_cat]
        baseline_aligned[baseline_cat] = baseline_pivot.loc[common_models, baseline_cat]

    # Compute rank deltas (baseline - style_control)
    # A positive delta means the rank number is smaller under style control
    delta_data = baseline_aligned - style_aligned
    if 'Overall' in delta_data.columns:
        delta_data = delta_data.drop(columns=['Overall'])

    heatmap_z = delta_data.values
    heatmap_x = delta_data.columns.tolist()  # Categories
    heatmap_y = delta_data.index.tolist()    # Models
    # Order models by their mean delta across categories (largest first);
    # heatmap_z and heatmap_y are overwritten with the sorted versions.
    avg_delta = delta_data.mean(axis=1).sort_values(ascending=False)
    delta_data_sorted = delta_data.loc[avg_delta.index]
    heatmap_z = delta_data_sorted.values
    heatmap_y = delta_data_sorted.index.tolist()

    # One text annotation per non-missing cell; int() truncates the delta toward zero for display
    annotations = []
    for i, model in enumerate(heatmap_y):
        for j, category in enumerate(heatmap_x):
            value = heatmap_z[i][j]
            if not pd.isna(value):
                annotations.append(dict(x=category,y=model,text=f"{int(value)}",showarrow=False,font=dict(color="black" if abs(int(value)) < 2 else "white", size=12)))
    n_models = len(heatmap_y)
    # 30 px per model row, with a 400 px minimum
    height = max(400, n_models * 30)

    # zmid=0 centers the diverging colorscale on "no rank change"
    fig = go.Figure(data=go.Heatmap(z=heatmap_z,x=heatmap_x,y=heatmap_y,colorscale="RdBu",colorbar=dict(title="Rank Delta (Baseline - Style Control)"),zmid=0))
    fig.update_layout(
        title="Delta in Model Rankings With Style Control (Category-Specific)",
        xaxis_title="Category",
        yaxis_title="Model",
        yaxis_autorange="reversed",
        annotations=annotations,
        height=height
    )
    fig.show()

Here we can quickly see which models are "style hacking" - formatting their responses nicely but not necessarily being more capable models. It looks like gpt-4o-mini, llama-3.1-70b-instruct, and llama-3.1-8b-instruct see a consistent drop in rankings, while claude 3.5 sonnet, gemma-2-27b, and claude-3-haiku see a consistent rise in rankings.

# **Question 9: Finding New Style Influences**


Earlier, we saw how model preference differs by looking at structural style features in model outputs (e.g., bold text count, header count, list count, token length).
Now let's see if we can find new style features by inspecting the model responses to understand differences in models.
Let's inspect 🔍 the text ourselves for stylistic signals associated with wins. Your goal is to analyze assistant responses and identify phrases that differentiate winning from losing replies. This helps surface style features we might add to our ranking model later.

### Helper Functions
Functions that turn a conversation into plain text:
*   `convert_conversation_to_string`
*   `convert_asst_conversation_to_string`

Function that compares two sets of texts with n-gram TF-IDF:
*   `tfidf_phrase_diff`

In [None]:
def convert_conversation_to_string(conv):
  """
  Render a whole conversation as plain text.

  Each message becomes "<Speaker>: <content>" followed by a blank line, where
  the speaker is "User" for messages whose role is 'user' and "Assistant" for
  every other role.
  """
  rendered = []
  for message in conv:
    speaker = "User: " if message['role'] == 'user' else "Assistant: "
    rendered.append(speaker + message['content'] + "\n\n")
  return "".join(rendered)

def convert_asst_conversation_to_string(conv):
  """
  Concatenate only the assistant messages of a conversation.

  Every assistant message is followed by a blank line ("\\n\\n"); messages
  with any other role are skipped.
  """
  return "".join(
    message['content'] + "\n\n"
    for message in conv
    if message['role'] == 'assistant'
  )

In [None]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer

def is_number_phrase(phrase):
    """
    Return True for phrases that should be filtered out as "numeric".

    A phrase is numeric when either
      - it consists solely of digits, whitespace and non-word characters, or
      - it contains a standalone run of digits anywhere (digits delimited by
        word boundaries, e.g. "step 2 here"; "abc123" does not count).
    """
    if re.fullmatch(r"[\d\s\W]+", phrase) is not None:
        return True
    return re.search(r"\b\d+\b", phrase) is not None

def tfidf_phrase_diff(str_list_a, str_list_b, name_a="A", name_b="B", top_n=30, max_features=1000):
    """
    Compute distinguishing ngram tfidf phrases between two sets of strings.
    Returns two DataFrames: one for phrases more common in A, one for B.

    A single TF-IDF vectorizer (2- to 4-grams, English stop words removed,
    at most `max_features` phrases) is fitted on both lists together. Phrases
    are then ranked by the difference between the two groups' mean TF-IDF
    scores, and phrases rejected by `is_number_phrase` are skipped.

    Args:
        str_list_a: list of documents for group A.
        str_list_b: list of documents for group B.
        name_a: label for group A, used in the output column names.
        name_b: label for group B, used in the output column names.
        top_n: maximum number of phrases to return per group.
        max_features: vocabulary size limit passed to the vectorizer.

    Returns:
        (df_a, df_b): df_a has columns ["phrase", f"{name_a}_tfidf",
        f"{name_b}_tfidf"] with the phrases most over-represented in A;
        df_b has columns ["phrase", f"{name_b}_tfidf", f"{name_a}_tfidf"]
        with the phrases most over-represented in B.
    """
    vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english', ngram_range=(2,4))
    # Fit on the union so both groups share one vocabulary and one IDF weighting
    tfidf_matrix = vectorizer.fit_transform(str_list_a + str_list_b)
    feature_names = vectorizer.get_feature_names_out()

    # The first len(str_list_a) rows belong to group A, the rest to group B
    n = len(str_list_a)
    mean_a = np.asarray(tfidf_matrix[:n].mean(axis=0)).flatten()
    mean_b = np.asarray(tfidf_matrix[n:].mean(axis=0)).flatten()

    def top_phrases(mean_own, mean_other):
        # Collect up to top_n non-numeric phrases, largest (own - other) gap first
        rows = []
        for i in np.argsort(mean_own - mean_other)[::-1]:
            # Check the limit before appending so top_n=0 yields no rows
            if len(rows) >= top_n:
                break
            phrase = feature_names[i]
            if not is_number_phrase(phrase):
                rows.append((phrase, mean_own[i], mean_other[i]))
        return rows

    df_a = pd.DataFrame(top_phrases(mean_a, mean_b), columns=["phrase", f"{name_a}_tfidf", f"{name_b}_tfidf"])
    df_b = pd.DataFrame(top_phrases(mean_b, mean_a), columns=["phrase", f"{name_b}_tfidf", f"{name_a}_tfidf"])
    return df_a, df_b

In [None]:
# Example usage:
# Compare one model's responses against its opponents' responses.
model = "llama-3.1-70b-instruct"
# Battles in which the model took part on either side; .copy() so the new columns
# below are added to an independent frame rather than to a slice of the original
llama_battles = selected_battles_no_ties[
    (selected_battles_no_ties['model_a'] == model) | (selected_battles_no_ties['model_b'] == model)
].copy()

# Assistant-only text for each side of every battle
llama_battles.loc[:, "model_a_conversation_string"] = llama_battles["conversation_a"].apply(convert_asst_conversation_to_string)
llama_battles.loc[:, "model_b_conversation_string"] = llama_battles["conversation_b"].apply(convert_asst_conversation_to_string)
llama_battles = llama_battles[llama_battles['language'] == 'English']

# str_list_a: the chosen model's own response in each battle (whichever side it was on)
str_list_a = llama_battles.apply(lambda x: x["model_a_conversation_string"] if x["model_a"] == model else x["model_b_conversation_string"], axis=1).tolist()
# str_list_b: the opponent's response in the same battle
str_list_b = llama_battles.apply(lambda x: x["model_b_conversation_string"] if x["model_a"] == model else x["model_a_conversation_string"], axis=1).tolist()
# print(str_list_a)

df_a, df_b = tfidf_phrase_diff(str_list_a, str_list_b, model, "others")

# Uncomment the display(...) lines to show the tables under each heading
print(f"Top {model} phrases:")
# display(df_a)
print(f"Top others phrases:")
# display(df_b)

# **Question 9a: Key Phrases**

Let's analyze which phrases in the text might be related to winning or losing a battle. This is a similar idea to VibeCheck, except (1) instead of comparing model pairs we are comparing winning vs. losing models, and (2) instead of using LLMs to propose and validate vibes, we are going to rely on keyword matching. 

Using the helper functions provided (`convert_asst_conversation_to_string`, `tfidf_phrase_diff`) and the reference example as guidance, implement a winning-vs-losing phrase analysis for assistant responses.




**Task:** 

1. From selected_battles_no_ties, keep only rows in English.

2. For each battle, extract assistant-only text using convert_asst_conversation_to_string.

3. Build winning_responses: one assistant-only string for the winning side of each battle.

4. Build losing_responses: one assistant-only string for the losing side of each battle.

5. Use tfidf_phrase_diff to compare the two lists and construct df_win and df_lose. (we have set this up for you)

6. Display the results to see the phrases most associated with winning vs. losing. (we have set this up for you)

For your reference, a sample subset of outputs is shown below.

### Example: Top phrases in *winning* responses
| phrase              | winning_tfidf | losing_tfidf |
|---------------------|---------------|--------------|
| let break           | 0.0127        | 0.0106       |
| step step           | 0.0146        | 0.0126       |
| ... |...       | ...     |

### Example: Top phrases in *losing* responses
| phrase                 | losing_tfidf | winning_tfidf |
|------------------------|--------------|----------------|
| let know               | 0.0278       | 0.0190         |
| sorry assist           | 0.0054       | 0.0003         |
| ...     | ...      | ...  |


In [None]:
# TFIDF comparing winning responses to losing responses
# TODO (student): replace each `...` placeholder below.
...
# 1) Restrict to English Only and prepare assistant-only strings for A/B
# Use 'convert_asst_conversation_to_string' helper function
# Create a working copy to avoid modifying the main dataframe
selected_battles_english = selected_battles_no_ties.copy()
...

# 2) Build lists of assistant-only winning/losing responses
# One string per battle in each list: the winning side's text and the losing side's text
winning_responses = selected_battles_english.apply(...).tolist()

losing_responses = selected_battles_english.apply(...).tolist()

# 3) Compare phrases
df_win, df_lose = tfidf_phrase_diff(winning_responses, losing_responses, "winning", "losing")
# 4) Show results
# Uncomment the display(...) lines to show the tables under each heading
print("Top winning response phrases:")
# display(df_win)
print("Top losing response phrases:")
# display(df_lose)

In [None]:
grader.check("q9a")

## Feature Exploration


One thing we see from the TF-IDF results is that "i'm sorry" or "i apologize" appear often in losing models - when looking through these conversations you will see that these are often instances of **refusal**: where the model refuses to answer the question because it violates ethical guidelines or is out of its domain of knowledge. Now let's turn this into a style feature to measure its impact on the rankings. 


Let's capture whether the assistant on side A apologizes more than the assistant on side B in a battle by computing a `refusal_count` difference. Here we will just use a binary 1/0 for each conversation indicating whether or not it contains the word "sorry" or "apologize". 

In [None]:
def count_phrase_diff(row, phrase=["step by step"]):
    """
    Compare the two sides of a battle on whether they use any of the phrases.

    A side "uses" a phrase when at least one of its assistant messages
    contains it, after lowercasing the message and replacing hyphens with
    spaces (so "Step-by-Step" matches "step by step").

    Args:
        row: mapping with 'conversation_a' and 'conversation_b', each an
            iterable of {"role": ..., "content": ...} message dicts.
        phrase: list of substrings to look for.

    Returns:
        1 if only side A uses a phrase, -1 if only side B does, and 0 if
        both or neither do.
    """
    def side_uses_phrase(conversation):
        found = False
        for message in conversation:
            if message["role"] != "assistant":
                continue
            hits = [p in message["content"].lower().replace("-", " ") for p in phrase]
            if any(hits):
                found = True
        return found

    uses_a = side_uses_phrase(row["conversation_a"])
    uses_b = side_uses_phrase(row["conversation_b"])
    return int(uses_a) - int(uses_b)


# Per-battle refusal feature: +1 if only side A's assistant says "sorry"/"apologize",
# -1 if only side B's does, 0 if both or neither do.
selected_battles_no_ties.loc[:, "refusal_count"] = selected_battles_no_ties.apply(
    lambda row: count_phrase_diff(row, ["sorry", "apologize"]),
    axis=1
)

<!-- BEGIN QUESTION -->

# **Question 9b: Discover Some Immaculate Vibes**

**Task:** 
Now, just like the `refusal_count` feature we created above, implement new functions that can extract any stylistic features from the conversations.  
- Define a function that computes the normalized difference for a feature of your choice (e.g., presence of certain phrases, punctuation, formatting). You can check multiple different phrases if you want; they just need to have a common "theme" - similar to the last problem of the previous part of this homework.   
- Apply this function to each row in the dataset.  
- Store the results in a new column of `selected_battles_no_ties["YOUR_FEATURE"]`. 
- Plot the change in ranking and the style coefficients of the existing style features along with your custom feature. **To get full points, your feature needs to get a higher coefficient (Average Score) than `style_header_count`**. It is okay if the confidence intervals overlap. 
- You cannot use a feature already explored or anything similar (e.g. you can't have an "I refuse" style feature or a word count style feature). 

This is open ended, you don't need to use the features you found above, get creative with it! Heck, you can even throw response pairs into your LLM of choice and ask it to come up with differences just like VibeCheck! 

In [None]:
# Design your own stylistic feature(s) that compare model A vs B on each row.
# Keep it simple (boolean presence, counts, or normalized differences) or get creative.

# Remember: you might want to normalize the difference if you compute counts.

# TODO: Define your feature function
def YOUR_FUNC(row):
    #Input: a row with 'conversation_a' and 'conversation_b' (each is a list of {role, content} dicts).
    #Output: a single numeric feature comparing A vs B (e.g., -1/0/1, count diff, or normalized diff).
    #Hint: You probably want to inspect only assistant messages.
    # Example (placeholder): return 0
    return ...

# Avoid warnings by making copy
selected_battles_no_ties = selected_battles_no_ties.copy()

# TODO: Apply your function to create a new column
# selected_battles_no_ties.loc[:, "YOUR_FEATURE"] = selected_battles_no_ties.apply(YOUR_FUNC, axis=1)

# Choose which style features to control for in ranking.
# Start from this list and add yours below.
style_feature_cols = [
    "style_bold_count",
    "style_header_count",
    "style_list_count",
    "style_sum_assistant_tokens",
    "refusal_count",
    # "YOUR_FEATURE",   # <- uncomment after you create it
]

# style_features is passed explicitly: the function's default was bound when it was
# defined, so it does not pick up the list rebound above.
combined_results_df_new = get_sc_category_results(selected_battles_no_ties,
                                 selected_models, 
                                 category_name="Overall w/ Style Control", 
                                 n_bootstrap=25, 
                                 style_features=style_feature_cols)

fig = plot_rank_heatmap(combined_results_df_new, title="With and Without Style Control (New Features)", selected_models=selected_models)
fig.show()

# Create and display the plot
fig = plot_style_features(combined_results_df_new, selected_models)
fig.show()

In [None]:
# Memory cleanup: drop conversation columns (saves ~400-600 MB)
# All features extracted from conversations have been computed
# errors="ignore" keeps this cell safe to re-run once the columns are already gone
selected_battles_no_ties = selected_battles_no_ties.drop(
    columns=['conversation_a', 'conversation_b'], errors="ignore"
)
import gc
gc.collect()

<!-- END QUESTION -->

<!-- BEGIN QUESTION -->

# **Question 9c: Reflection**

**Answer the following questions**

```otter
9c-1. Why did you decide to use the stylistic feature that you have implemented for Q9b? What ranking changes across models did you see when looking at the new style features you created? Why do you think that is the case?

9c-2. Why might some models overuse or underuse stylistic markers (e.g., exclamation points, apologies, or explicit reasoning phrases), and how could that influence rankings?

9c-3. How does including these stylistic features help control for length or formatting effects when building leaderboards?
```

```otter
YOUR ANSWER 9c-1: Replace This
YOUR ANSWER 9c-2: Replace This
YOUR ANSWER 9c-3: Replace This
```

<!-- END QUESTION -->



## Submission

Make sure you have run all cells in your notebook in order before running the cell below, so that all images/graphs appear in the output. The cell below will generate a zip file for you to submit. **Please save before exporting!**

In [None]:
## Use this cell if you are running the notebook in Google Colab to install the necessary dependencies, this may take a few minutes
if IS_COLAB:
    !apt-get install -y texlive texlive-xetex pandoc


In [None]:
# Save your notebook first, then run this cell to export your submission.
grader.export(pdf=False, run_tests=True)