In [62]:
import numpy as np
import pandas as pd
from tabulate import tabulate


In [79]:
# Load the arena vote log (JSON Lines: one JSON record per line) and preview the first rows.
votes_path = "arena_votes/arena_votes.jsonl"
df = pd.read_json(votes_path, lines=True)
df.head(3)

Unnamed: 0,session_id,paper_id,reviewer_a,reviewer_b,technical_quality,constructiveness,clarity,overall_quality,review_a,review_b,vote_time
0,7tppsuo8kyq,acl_2024_s59,barebones,multi_agent_with_knowledge,👉 B is better,👉 B is better,👉 B is better,👉 B is better,Here are my main feedback comments as a peer r...,"Critical Review of ""Zero-Shot Cross-Lingual Re...",2024-09-08 19:41:00.624620
1,7tppsuo8kyq,acl_2024_s59,multi_agent_with_knowledge,liang_etal,👈 A is better,👈 A is better,👈 A is better,👈 A is better,"Critical Review of ""Zero-Shot Cross-Lingual Re...",Review outline:\n\n1. Significance and novelty...,2024-09-08 19:41:16.496030
2,7tppsuo8kyq,acl_2024_s59,multi_agent_with_knowledge,multi_agent_without_knowledge,👈 A is better,👈 A is better,👈 A is better,👈 A is better,"Critical Review of ""Zero-Shot Cross-Lingual Re...","Critical Review of ""Zero-Shot Cross-Lingual Re...",2024-09-08 19:41:48.048705


In [9]:
# Row count of the loaded frame.
# NOTE(review): each row appears to be one pairwise vote (reviewer_a vs reviewer_b), so
# "reviews" in the label below presumably means votes -- confirm before quoting the number.
num_reviews = len(df)
print(f"Total number of reviews: {num_reviews}")

Total number of reviews: 140


In [46]:
# Overall-quality win statistics for each review system.
#
# A system wins a row when it is reviewer_a and the vote is "A is better", or it is
# reviewer_b and the vote is "B is better". The win rate is wins / appearances, so ties
# and "both are bad" votes stay in the denominator.

def _count_overall_wins(votes, model):
    """Return the number of rows in `votes` that `model` won on `overall_quality`."""
    won_as_a = (votes['reviewer_a'] == model) & (votes['overall_quality'] == '👈  A is better')
    won_as_b = (votes['reviewer_b'] == model) & (votes['overall_quality'] == '👉  B is better')
    return (won_as_a | won_as_b).sum()


def _count_appearances(votes, model):
    """Return the number of rows in `votes` where `model` is on either side."""
    return ((votes['reviewer_a'] == model) | (votes['reviewer_b'] == model)).sum()


def _safe_rate(win_count, appearance_count):
    """Return win_count / appearance_count, or NaN when the model never appears."""
    if appearance_count == 0:
        return np.float64('nan')
    return win_count / appearance_count


# Total wins for each model:
total_wins_multi_with_knowledge = _count_overall_wins(df, 'multi_agent_with_knowledge')
total_wins_multi_no_knowledge = _count_overall_wins(df, 'multi_agent_without_knowledge')
total_wins_liang_etal = _count_overall_wins(df, 'liang_etal')
total_wins_barebones = _count_overall_wins(df, 'barebones')
total_wins_human = _count_overall_wins(df, 'human_reviewer')

# Total Appearances
appearances_multi_agent_with_knowledge = _count_appearances(df, 'multi_agent_with_knowledge')
appearances_multi_agent_without_knowledge = _count_appearances(df, 'multi_agent_without_knowledge')
appearances_liang_et_al = _count_appearances(df, 'liang_etal')
appearances_barebones = _count_appearances(df, 'barebones')
appearances_human = _count_appearances(df, 'human_reviewer')

# Win Frequency
relative_wins_multi_agent_with_knowledge = _safe_rate(total_wins_multi_with_knowledge, appearances_multi_agent_with_knowledge)
relative_wins_multi_agent_without_knowledge = _safe_rate(total_wins_multi_no_knowledge, appearances_multi_agent_without_knowledge)
relative_wins_liang_etal = _safe_rate(total_wins_liang_etal, appearances_liang_et_al)
relative_wins_barebones = _safe_rate(total_wins_barebones, appearances_barebones)
relative_wins_human = _safe_rate(total_wins_human, appearances_human)

# Raw win count for the with-knowledge system, printed without a label.
print(total_wins_multi_with_knowledge)
print(f"Win rate for multi-agent with knowledge: {relative_wins_multi_agent_with_knowledge}")
print(f"Win rate for multi-agent without knowledge: {relative_wins_multi_agent_without_knowledge}")
print(f"Win rate for Liang et al.: {relative_wins_liang_etal}")
print(f"Win rate for Barebones: {relative_wins_barebones}")
print(f"Win rate for Human Reviewers: {relative_wins_human}")

23
Win rate for multi-agent with knowledge: 0.46
Win rate for multi-agent without knowledge: 0.352112676056338
Win rate for Liang et al.: 0.2641509433962264
Win rate for Barebones: 0.44
Win rate for Human Reviewers: 0.2903225806451613


In [45]:
# Per-category tallies for every review system: wins, appearances, and non-win outcomes
# (tie or "both are bad"). Win rate = wins / appearances, so non-win outcomes stay in the
# denominator.
categories = ['technical_quality', 'constructiveness', 'clarity', 'overall_quality']
possible_outcomes = ['👈  A is better', '👉  B is better', '👎  Both are bad', '🤝  Tie']
models = pd.unique(df[['reviewer_a', 'reviewer_b']].values.ravel())

# Name the four vote labels once so the counting below cannot drift from the list above.
a_is_better, b_is_better, both_are_bad, tie = possible_outcomes

wins = {model: {category: 0 for category in categories} for model in models}
appearances = {model: {category: 0 for category in categories} for model in models}
non_wins = {model: {category: 0 for category in categories} for model in models}

# For each model and category, count appearances, wins and non-win outcomes
for model in models:
    # The side masks and the appearance count do not depend on the category: compute once.
    is_reviewer_a = df['reviewer_a'] == model
    is_reviewer_b = df['reviewer_b'] == model
    took_part = is_reviewer_a | is_reviewer_b
    appearance_count = took_part.sum()  # a row counts once even if the model is on both sides

    for category in categories:
        votes = df[category]

        appearances[model][category] = appearance_count

        # A win is "A is better" while on side A, or "B is better" while on side B.
        won = (is_reviewer_a & (votes == a_is_better)) | (is_reviewer_b & (votes == b_is_better))
        wins[model][category] = won.sum()

        # Non-win outcomes (Both are bad or Tie), counted once per row like appearances.
        non_wins[model][category] = (took_part & votes.isin([both_are_bad, tie])).sum()

# Example to print total wins, appearances, and non-wins by category for each model
for model in models:
    for category in categories:
        total_wins = wins[model][category]
        total_appearances = appearances[model][category]
        total_non_wins = non_wins[model][category]

        print(f'{model} in {category}:')
        print(f' - Wins: {total_wins}')
        print(f' - Appearances: {total_appearances}')
        print(f' - Non-win outcomes (Both are bad or Tie): {total_non_wins}')
        print(f' - Win rate: {(total_wins / total_appearances * 100) if total_appearances > 0 else 0:.2f}%')
        print()

barebones in technical_quality:
 - Wins: 32
 - Appearances: 75
 - Non-win outcomes (Both are bad or Tie): 12
 - Win rate: 42.67%

barebones in constructiveness:
 - Wins: 29
 - Appearances: 75
 - Non-win outcomes (Both are bad or Tie): 16
 - Win rate: 38.67%

barebones in clarity:
 - Wins: 29
 - Appearances: 75
 - Non-win outcomes (Both are bad or Tie): 16
 - Win rate: 38.67%

barebones in overall_quality:
 - Wins: 33
 - Appearances: 75
 - Non-win outcomes (Both are bad or Tie): 12
 - Win rate: 44.00%

multi_agent_with_knowledge in technical_quality:
 - Wins: 22
 - Appearances: 50
 - Non-win outcomes (Both are bad or Tie): 23
 - Win rate: 44.00%

multi_agent_with_knowledge in constructiveness:
 - Wins: 23
 - Appearances: 50
 - Non-win outcomes (Both are bad or Tie): 25
 - Win rate: 46.00%

multi_agent_with_knowledge in clarity:
 - Wins: 22
 - Appearances: 50
 - Non-win outcomes (Both are bad or Tie): 24
 - Win rate: 44.00%

multi_agent_with_knowledge in overall_quality:
 - Wins: 23
 - A

In [65]:
# Pairwise overall-quality win rates: entry [row, column] is the share of row-vs-column
# comparisons that the row model won. Ties and "both are bad" votes count as comparisons
# but as a win for neither side.
matrix_wins = pd.DataFrame(0, index = models, columns = models)
matrix_comparisons = pd.DataFrame(0, index = models, columns = models)

# Walk only the three needed columns in lockstep; iterrows() would build a Series per row.
for reviewer_a, reviewer_b, outcome in zip(df['reviewer_a'], df['reviewer_b'], df['overall_quality']):
    # Every vote is one comparison for the pair, recorded in both directions.
    matrix_comparisons.at[reviewer_a, reviewer_b] += 1
    matrix_comparisons.at[reviewer_b, reviewer_a] += 1

    if outcome == '👈  A is better':
        matrix_wins.at[reviewer_a, reviewer_b] += 1
    elif outcome == '👉  B is better':
        matrix_wins.at[reviewer_b, reviewer_a] += 1

matrix_win_rate = matrix_wins / matrix_comparisons
# Pairs that were never compared have no defined rate.
matrix_win_rate[matrix_comparisons == 0] = np.nan

matrix_win_rate = matrix_win_rate.round(2)
# Copy with 'N/A' in place of NaN. It is not used by the print below, which shows the
# numeric frame; the name is kept so it stays defined for any later cell.
matrix_win_rate_display = matrix_win_rate.fillna('N/A')

print("Pairwise Win Rates: ")
print(matrix_win_rate)

Pairwise Win Rates: 
                               barebones  multi_agent_with_knowledge  \
barebones                            NaN                        0.00   
multi_agent_with_knowledge          1.00                         NaN   
liang_etal                          0.15                        0.00   
multi_agent_without_knowledge       0.80                        0.09   
human_reviewer                      0.38                         NaN   

                               liang_etal  multi_agent_without_knowledge  \
barebones                            0.67                           0.05   
multi_agent_with_knowledge           1.00                           0.41   
liang_etal                            NaN                            NaN   
multi_agent_without_knowledge         NaN                            NaN   
human_reviewer                       0.31                           0.00   

                               human_reviewer  
barebones                                

In [67]:
# Print the rounded win-rate matrix as a boxed grid; headers='keys' takes the column
# labels of the frame as the header row.
print("Pairwise Win Rates (row=winner, column=loser):")
win_rate_table = tabulate(matrix_win_rate, headers='keys', tablefmt='fancy_grid')
print(win_rate_table)

Pairwise Win Rates (row=winner, column=loser):
╒═══════════════════════════════╤═════════════╤══════════════════════════════╤══════════════╤═════════════════════════════════╤══════════════════╕
│                               │   barebones │   multi_agent_with_knowledge │   liang_etal │   multi_agent_without_knowledge │   human_reviewer │
╞═══════════════════════════════╪═════════════╪══════════════════════════════╪══════════════╪═════════════════════════════════╪══════════════════╡
│ barebones                     │      nan    │                         0    │         0.67 │                            0.05 │             0.46 │
├───────────────────────────────┼─────────────┼──────────────────────────────┼──────────────┼─────────────────────────────────┼──────────────────┤
│ multi_agent_with_knowledge    │        1    │                       nan    │         1    │                            0.41 │           nan    │
├───────────────────────────────┼─────────────┼────────────────────────

In [74]:


# One models x models count table per outcome type, plus a table of total comparisons.
# Each frame is built separately so that no two names share the same underlying data.
matrix_wins, matrix_losses, matrix_ties, matrix_both_bad, matrix_comparisons = (
    pd.DataFrame(0, index=models, columns=models) for _ in range(5)
)

# Tally every vote on overall quality into the tables above.
for reviewer_a, reviewer_b, outcome in zip(df['reviewer_a'], df['reviewer_b'], df['overall_quality']):
    a_vs_b = (reviewer_a, reviewer_b)
    b_vs_a = (reviewer_b, reviewer_a)

    # A vote is one comparison for the pair, seen from either side.
    matrix_comparisons.loc[a_vs_b] += 1
    matrix_comparisons.loc[b_vs_a] += 1

    if outcome == '👈  A is better':
        # A's win over B is B's loss to A.
        matrix_wins.loc[a_vs_b] += 1
        matrix_losses.loc[b_vs_a] += 1
    elif outcome == '👉  B is better':
        matrix_wins.loc[b_vs_a] += 1
        matrix_losses.loc[a_vs_b] += 1
    elif outcome == '🤝  Tie':
        # Ties are symmetric: both directions get the count.
        matrix_ties.loc[a_vs_b] += 1
        matrix_ties.loc[b_vs_a] += 1
    elif outcome == '👎  Both are bad':
        # So are "both are bad" votes.
        matrix_both_bad.loc[a_vs_b] += 1
        matrix_both_bad.loc[b_vs_a] += 1

# Text summary per pair, written only above the diagonal; all other cells stay ''.
upper_triangular_matrix = pd.DataFrame('', index=models, columns=models)

for i, row_model in enumerate(models):
    # Columns strictly to the right of the diagonal for this row.
    for col_model in models[i + 1:]:
        if matrix_comparisons.loc[row_model, col_model] > 0:
            win_count = matrix_wins.loc[row_model, col_model]
            loss_count = matrix_losses.loc[row_model, col_model]
            tie_count = matrix_ties.loc[row_model, col_model]
            both_bad_count = matrix_both_bad.loc[row_model, col_model]
            summary = f"W: {win_count}, L: {loss_count}, T: {tie_count}, BB: {both_bad_count}"
        else:
            # The pair never met.
            summary = "N/A"
        upper_triangular_matrix.loc[row_model, col_model] = summary

# Show the summary table.
print("Upper Triangular Pairwise Comparison Matrix:")
print(upper_triangular_matrix)

Upper Triangular Pairwise Comparison Matrix:
                              barebones multi_agent_with_knowledge  \
barebones                                  W: 0, L: 3, T: 0, BB: 0   
multi_agent_with_knowledge                                           
liang_etal                                                           
multi_agent_without_knowledge                                        
human_reviewer                                                       

                                             liang_etal  \
barebones                      W: 26, L: 6, T: 1, BB: 6   
multi_agent_with_knowledge      W: 1, L: 0, T: 0, BB: 0   
liang_etal                                                
multi_agent_without_knowledge                             
human_reviewer                                            

                              multi_agent_without_knowledge  \
barebones                          W: 1, L: 16, T: 3, BB: 0   
multi_agent_with_knowledge        W: 19, L: 4, T: 23,

In [78]:
# Print the upper-triangular summary table as a boxed grid, followed by a legend.
# NOTE(review): the heading says "Win Rates", but the table being printed is
# upper_triangular_matrix, not matrix_win_rate -- consider renaming the heading.
print("Pairwise Win Rates:", "(Row vs. Column)", sep="\n")
outcome_table = tabulate(upper_triangular_matrix, headers='keys', tablefmt='fancy_grid')
print(outcome_table)
print("Key: W = Win, L = Loss, T = Tie, BB = Both Bad")

Pairwise Win Rates:
(Row vs. Column)
╒═══════════════════════════════╤═════════════╤══════════════════════════════╤══════════════════════════╤═════════════════════════════════╤═════════════════════════╕
│                               │ barebones   │ multi_agent_with_knowledge   │ liang_etal               │ multi_agent_without_knowledge   │ human_reviewer          │
╞═══════════════════════════════╪═════════════╪══════════════════════════════╪══════════════════════════╪═════════════════════════════════╪═════════════════════════╡
│ barebones                     │             │ W: 0, L: 3, T: 0, BB: 0      │ W: 26, L: 6, T: 1, BB: 6 │ W: 1, L: 16, T: 3, BB: 0        │ W: 6, L: 5, T: 2, BB: 0 │
├───────────────────────────────┼─────────────┼──────────────────────────────┼──────────────────────────┼─────────────────────────────────┼─────────────────────────┤
│ multi_agent_with_knowledge    │             │                              │ W: 1, L: 0, T: 0, BB: 0  │ W: 19, L: 4, T: 23, BB: 0  