# Updated ELO Calculation for Final Data


In [38]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import re 

In [130]:
df = pd.read_json("arena_votes/arena_votes.jsonl", lines=True)
df.head(3)

Unnamed: 0,session_id,paper_id,reviewer_a,reviewer_b,technical_quality,constructiveness,clarity,overall_quality,review_a,review_b,vote_time
0,7tppsuo8kyq,acl_2024_s59,barebones,multi_agent_with_knowledge,👉 B is better,👉 B is better,👉 B is better,👉 B is better,Here are my main feedback comments as a peer r...,"Critical Review of ""Zero-Shot Cross-Lingual Re...",2024-09-08 19:41:00.624620
1,7tppsuo8kyq,acl_2024_s59,multi_agent_with_knowledge,liang_etal,👈 A is better,👈 A is better,👈 A is better,👈 A is better,"Critical Review of ""Zero-Shot Cross-Lingual Re...",Review outline:\n\n1. Significance and novelty...,2024-09-08 19:41:16.496030
2,7tppsuo8kyq,acl_2024_s59,multi_agent_with_knowledge,multi_agent_without_knowledge,👈 A is better,👈 A is better,👈 A is better,👈 A is better,"Critical Review of ""Zero-Shot Cross-Lingual Re...","Critical Review of ""Zero-Shot Cross-Lingual Re...",2024-09-08 19:41:48.048705


In [94]:
df['review_a'][15]

'Review outline:\n\n1. Significance and novelty\n\n2. Potential reasons for acceptance\n\n3. Potential reasons for rejection\n   a. Lack of theoretical contribution\n      - No new modeling approach or algorithm proposed\n      - Limited innovation beyond dataset cleaning and simple post-processing\n   \n   b. Insufficient experimental evaluation\n      - Only four existing models evaluated\n      - No comparison to very recent state-of-the-art approaches\n   \n   c. Questionable generalizability of the refinement solution\n      - Rule-based approach may not generalize well to other datasets/domains\n      - Relies on specific properties of Chinese characters and existing tools\n\n   d. Inadequate analysis of dataset issues and fixes\n      - No rigorous quantification of dataset quality improvements\n      - Potential for introducing new biases during manual correction process\n\n4. Suggestions for improvement'

In [23]:
models = ['multi_agent_with_knowledge', 'multi_agent_without_knowledge', 'liang_etal', 'barebones', 'human_reviewer']
elo_ratings = {model: 1500 for model in models}

K = 32 # Common value, adjustable

def expected_score(rating_a, rating_b):
    return 1 / (1 + 10** ((rating_b-rating_a) / 400))

def update_elo(rating_a, rating_b, outcome_a):
    expected_a = expected_score(rating_a, rating_b)
    rating_a_new = rating_a + K * (outcome_a - expected_a)
    rating_b_new = rating_b + K * ((1-outcome_a)-(1-expected_a))
    return rating_a_new, rating_b_new

for i, row in df.iterrows():
    reviewer_a = row['reviewer_a']
    reviewer_b = row['reviewer_b']
    outcome = row['overall_quality']
    rating_a = elo_ratings[reviewer_a]
    rating_b = elo_ratings[reviewer_b]
    
    if outcome == '👈  A is better':
        outcome_a = 1
    elif outcome == '👉  B is better':
        outcome_a = 0
    elif outcome == '🤝  Tie':
        outcome_a = 0.5
    elif outcome == '👎  Both are bad':
        #continue  # No change in ratings if both are bad
        outcome_a = 0.5
    new_rating_a, new_rating_b = update_elo(rating_a, rating_b, outcome_a)

    elo_ratings[reviewer_a] = new_rating_a
    elo_ratings[reviewer_b] = new_rating_b
    
for model, rating in elo_ratings.items():
    print(f"{model}: {rating:.2f}")

multi_agent_with_knowledge: 1675.48
multi_agent_without_knowledge: 1629.84
liang_etal: 1408.92
barebones: 1463.79
human_reviewer: 1321.97


In [167]:
import math
from typing import Tuple

class EloSystem:
    def __init__(self, k_factor=32):
        self.k_factor = k_factor
        self.elo_ratings = {}
        self.matches_played = {}
        self.model = LogisticRegression()

    def expected_score(self, rating_A: float, rating_B: float) -> float:
        # Standard ELO Caclulation
        return 1 / (1 + 10 ** ((rating_B - rating_A) / 400))
    
    def extract_features(self, text):
        '''
        Get feature counts:
            # of words
            # of sections
            # of lists (dashed, numbered, or lettered)
        '''
        length = len(text.split())
        sections = text.count('\n')
        
        num_lists = len(re.findall(r'\d+\.', text))
        dashed_lists = len(re.findall(r'- ', text))
        lettered_lists = len(re.findall(r'(?<!\w)[a-zA-Z]\. ', text))
        
        lists = num_lists + dashed_lists + lettered_lists
        #print(f"Lists: {lists}, Headers: {headers}, Length: {length}")
        return {
            'length': length,
            'sections': sections,
            'lists': lists,
        }
    
    def fit_bt_model(self, df):
        '''
        Fit Bradley-Terry Linear Regression Model on data
        explanation: predict p(review a is better than review b) based on feature diffs
        returns: model coefficients [length, sections, lists]
        '''
        
        # Extract features for review a and review b
        df['a_features'] = df['review_a'].apply(self.extract_features)
        df['b_features'] = df['review_b'].apply(self.extract_features)
        df['outcome'] = df['overall_quality'].apply(lambda x: 1 if '👈  A is better' in x else 0)
        
        # This represents how many MORE of each feature (length(a) - length(b))
        df['length_diff'] = df['a_features'].apply(lambda x: x['length']) - df['b_features'].apply(lambda x: x['length'])
        df['sections_diff'] = df['a_features'].apply(lambda x: x['sections']) - df['b_features'].apply(lambda x: x['sections'])
        df['lists_diff'] = df['a_features'].apply(lambda x: x['lists']) - df['b_features'].apply(lambda x: x['lists'])
        
        X = df[['length_diff', 'sections_diff', 'lists_diff']]
        y = df['outcome']
        
        # Training model
        self.model.fit(X, y)
        print(f"Model Coefficients []: {self.model.coef_}")
    
    def predict_bradley_terry(self, response_A_features, response_B_features):
        # Take actual feature difference
        feature_diff = {
            'length_diff': response_A_features['length'] - response_B_features['length'],
            'sections_diff': response_A_features['sections'] - response_B_features['sections'],
            'lists_diff': response_A_features['lists'] - response_B_features['lists'],
        }
        
        # format input for regression
        feature_diff_df = pd.DataFrame([feature_diff])
        feature_diff_df = feature_diff_df[['length_diff', 'sections_diff', 'lists_diff']]
        
        # Predicts classs probabilities [p(review b better), p(review a better)]
        # prob_a_better is the expected_outcome
        prob_A_better = self.model.predict_proba(feature_diff_df)[0][1]  
        return prob_A_better

    def update_elo(self, rating_A: float, rating_B: float, outcome_A: float, expected_A) -> Tuple[float, float]:
        '''
            Params: current Elo for A & B, Actual outcome, Expected outcome
        '''
        print(f"Current A: {rating_A}, Current B: {rating_B}, Expected Outcome A: {expected_A}, Actual Outcome A: {outcome_A}")
        new_rating_A = rating_A + self.k_factor * (outcome_A - expected_A)
        new_rating_B = rating_B + self.k_factor * ((1 - outcome_A) - (1 - expected_A))

        return new_rating_A, new_rating_B

    
    
    def process_match(self, reviewer_a, reviewer_b, outcome, review_a, review_b):
        '''
        ELO Workflow
        '''
        # Grab current ELO ratings for both reviewers, if not assigned, start w/ 1500
        rating_a = self.elo_ratings.get(reviewer_a, 1500)
        rating_b = self.elo_ratings.get(reviewer_b, 1500)
        
        # Caclculate ELO update for expected score (traditional ELO):
        expected_score_a = self.expected_score(rating_a, rating_b)
        new_rating_a_step1, new_rating_b_step1 = self.update_elo(rating_a, rating_b, outcome, expected_score_a)
        
        elo_change_a_step1 = new_rating_a_step1 - rating_a
        elo_change_b_step1 = new_rating_b_step1 - rating_b
        print(f"- Traditional ELO -- Change for A: {elo_change_a_step1}, Change for B: {elo_change_b_step1}")
        
        # Use LR model to adjust for features
        review_a_features = self.extract_features(review_a)
        review_b_features = self.extract_features(review_b)
        prob_a_better = self.predict_bradley_terry(review_a_features, review_b_features)
        new_rating_a_step2, new_rating_b_step2 = self.update_elo(new_rating_a_step1, new_rating_b_step1, outcome, prob_a_better)
        
        elo_change_a_step2 = new_rating_a_step2 - new_rating_a_step1
        elo_change_b_step2 = new_rating_b_step2 - new_rating_b_step1
        print(f"- Logistic Regression - Reviewera: {reviewer_a}, {reviewer_b}:  {prob_a_better} - Change for A: {elo_change_a_step2}, Change for B: {elo_change_b_step2}")

        
        self.elo_ratings[reviewer_a] = new_rating_a_step2
        self.elo_ratings[reviewer_b] = new_rating_b_step2

        self.matches_played[reviewer_a] = self.matches_played.get(reviewer_a, 0) + 1
        self.matches_played[reviewer_b] = self.matches_played.get(reviewer_b, 0) + 1
    
    def _calculate_confidence_interval(self, rating: float, vote_count: float) -> Tuple[float, float]:
        if vote_count == 0:
            return (rating, rating)

        # Standard deviation of the rating
        std_dev = self.k_factor / math.sqrt(vote_count)

        # 95% CI is approximately 1.96 standard deviations from the mean
        margin = 1.96 * std_dev

        lower_bound = rating - margin
        upper_bound = rating + margin

        return (lower_bound, upper_bound)
    
    def get_ratings_with_confidence_intervals(self):

        ratings_with_ci = {}
        for model, rating in self.elo_ratings.items():
            match_count = self.matches_played.get(model, 0)
            lower_bound, upper_bound = self._calculate_confidence_interval(rating, match_count)
            ratings_with_ci[model] = {
                'rating': rating,
                'confidence_interval': (lower_bound, upper_bound)
            }
        return ratings_with_ci
    
    def run_model_on_df(self, df):
        '''
        Process the whole DF
        '''
        if 'outcome' not in df.columns:
            df['outcome'] = df['overall_quality'].apply(lambda x: 1 if '👈  A is better' in x else 0)
        for idx, row in df.iterrows():
            review_a = row['review_a']
            review_b = row['review_b']
            reviewer_a = row['reviewer_a']
            reviewer_b = row['reviewer_b']

            actual_outcome = row['outcome']

            self.process_match(reviewer_a, reviewer_b, actual_outcome, review_a, review_b)

        return self.elo_ratings


    

In [168]:
'''
    Notes on outputs (print statements can be uncommented to look at this)
        - ELO changes relatively small when expected is 50/50
        - More signifficant change when elo/regression are in disagreement
'''

df = pd.read_json("arena_votes/arena_votes.jsonl", lines=True)
elo = EloSystem()
elo.fit_bt_model(df)
updated_ratings = elo.run_model_on_df(df)
print(updated_ratings)


Model Coefficients []: [[0.00452492 0.01901784 0.00314602]]
Current A: 1500, Current B: 1500, Expected Outcome A: 0.5, Actual Outcome A: 0
- Traditional ELO -- Change for A: -16.0, Change for B: 16.0
Current A: 1484.0, Current B: 1516.0, Expected Outcome A: 0.023929023226906224, Actual Outcome A: 0
- Logistic Regression - Reviewera: barebones, multi_agent_with_knowledge:  0.023929023226906224 - Change for A: -0.7657287432609792, Change for B: 0.7657287432609792
Current A: 1516.765728743261, Current B: 1500, Expected Outcome A: 0.524109112525195, Actual Outcome A: 1
- Traditional ELO -- Change for A: 15.228508399193743, Change for B: -15.228508399193743
Current A: 1531.9942371424547, Current B: 1484.7714916008063, Expected Outcome A: 0.9565882488260441, Actual Outcome A: 1
- Logistic Regression - Reviewera: multi_agent_with_knowledge, liang_etal:  0.9565882488260441 - Change for A: 1.3891760375665854, Change for B: -1.3891760375665854
Current A: 1533.3834131800213, Current B: 1500, Expe

In [153]:
'''
** NOT IN USE **

Testing the model for each ELO
'''

# Initialize the Elo system
elo_system = EloSystem(k_factor=32)
print("_______________________________")
print("-----Overall Quality-----------")
print("_______________________________")
# Process matches (example data, substitute with your match data)
# Assuming df is a DataFrame where you have columns 'reviewer_a', 'reviewer_b', and 'overall_quality'
for i, row in df.iterrows():
    reviewer_a = row['reviewer_a']
    reviewer_b = row['reviewer_b']
    outcome = row['overall_quality']

    elo_system.process_match(reviewer_a, reviewer_b, outcome)

# Get final ratings with confidence intervals
final_ratings_with_ci = elo_system.get_ratings_with_confidence_intervals()

# Display the results
for model, data in final_ratings_with_ci.items():
    rating = data['rating']
    lower_bound, upper_bound = data['confidence_interval']
    print(f"{model}: Rating = {rating:.2f}, 95% CI = ({lower_bound:.2f}, {upper_bound:.2f})")

print("_______________________________")
print("-----Clarity-----------")
print("_______________________________")
# Process matches (example data, substitute with your match data)
# Assuming df is a DataFrame where you have columns 'reviewer_a', 'reviewer_b', and 'overall_quality'
for i, row in df.iterrows():
    reviewer_a = row['reviewer_a']
    reviewer_b = row['reviewer_b']
    outcome = row['clarity']

    elo_system.process_match(reviewer_a, reviewer_b, outcome)

# Get final ratings with confidence intervals
final_ratings_with_ci = elo_system.get_ratings_with_confidence_intervals()

# Display the results
for model, data in final_ratings_with_ci.items():
    rating = data['rating']
    lower_bound, upper_bound = data['confidence_interval']
    print(f"{model}: Rating = {rating:.2f}, 95% CI = ({lower_bound:.2f}, {upper_bound:.2f})")
    

print("_______________________________")
print("-----Constructiveness-----------")
print("_______________________________")
# Process matches (example data, substitute with your match data)
# Assuming df is a DataFrame where you have columns 'reviewer_a', 'reviewer_b', and 'overall_quality'
for i, row in df.iterrows():
    reviewer_a = row['reviewer_a']
    reviewer_b = row['reviewer_b']
    outcome = row['constructiveness']

    elo_system.process_match(reviewer_a, reviewer_b, outcome)

# Get final ratings with confidence intervals
final_ratings_with_ci = elo_system.get_ratings_with_confidence_intervals()

# Display the results
for model, data in final_ratings_with_ci.items():
    rating = data['rating']
    lower_bound, upper_bound = data['confidence_interval']
    print(f"{model}: Rating = {rating:.2f}, 95% CI = ({lower_bound:.2f}, {upper_bound:.2f})")
    
print("_______________________________")
print("-----Technical Quality-----------")
print("_______________________________")
# Process matches (example data, substitute with your match data)
# Assuming df is a DataFrame where you have columns 'reviewer_a', 'reviewer_b', and 'overall_quality'
for i, row in df.iterrows():
    reviewer_a = row['reviewer_a']
    reviewer_b = row['reviewer_b']
    outcome = row['technical_quality']

    elo_system.process_match(reviewer_a, reviewer_b, outcome)

# Get final ratings with confidence intervals
final_ratings_with_ci = elo_system.get_ratings_with_confidence_intervals()

# Display the results
for model, data in final_ratings_with_ci.items():
    rating = data['rating']
    lower_bound, upper_bound = data['confidence_interval']
    print(f"{model}: Rating = {rating:.2f}, 95% CI = ({lower_bound:.2f}, {upper_bound:.2f})")

_______________________________
-----Overall Quality-----------
_______________________________


TypeError: EloSystem.process_match() missing 2 required positional arguments: 'review_a' and 'review_b'

In [30]:
def calculate_win_prob(a, b):
    return 1 / (1 + 10 ** ((b - a) / 400))

mammorx = 1676.55
human = 1322.88

win_prob = calculate_win_prob(mammorx, human)

print(f"Probability of winning: {win_prob * 100:.2f}%")

Probability of winning: 88.45%
