In [27]:
import sys, os
sys.path.append('/elo_bench')
import plotly.express as px

import gradio as gr
from pathlib import Path
import pandas as pd

from data import ARENA_K
from elo_rating.rating_helper import get_elo_results_from_battles_data, get_bootstrap_medium_elo, get_bootstrap_result

from datamodel.elo_rating_history import EloRatingHistory, BattleOutcomes
import numpy as np
from collections import defaultdict
from elo_rating.rating_evaluator import evaluate_rank_consistency, evaluate_winrate_at_historypoint
from elo_rating import RatingEntity

from tqdm import tqdm

In [28]:
# --- Run configuration -------------------------------------------------------
# Limit on how many battle rows to load; None reads every row.
FIRST_N_BATTLES = None

# Bootstrap switches.
# NOTE(review): neither flag is referenced by the cells visible in this
# notebook -- confirm whether they are still needed or should be wired in.
USE_BOOTSTRAP_ON_ELO = False
USE_BOOTSTRAP_ON_HISTORY = False

# Directory containing the result files of one benchmark run.
result_dir = r'/elo_bench/results/google_quora_alpaca_10629_test3'
record_file = Path(result_dir, 'battle_records.csv')

# Raw battle records, truncated to FIRST_N_BATTLES rows when that is set.
records_df = pd.read_csv(record_file, nrows=FIRST_N_BATTLES)

In [29]:
# Rebuild the Elo rating history from the files in `result_dir`.
# The bootstrap switch now comes from the config flag instead of a hard-coded
# literal, so flipping USE_BOOTSTRAP_ON_HISTORY in the config cell actually
# takes effect (it is False today, so the result is unchanged).
# NOTE(review): `step=1` presumably records a history point after every
# battle -- confirm against EloRatingHistory.gen_history.
# This cell is slow: the recorded run took ~4 minutes for 12124 iterations.
history = EloRatingHistory.gen_history(
    result_dir,
    use_bootstrap=USE_BOOTSTRAP_ON_HISTORY,
    nrows=FIRST_N_BATTLES,
    step=1,
)
elo_rating_history_df = history.to_df()


100%|██████████| 12124/12124 [04:06<00:00, 49.25it/s]


In [34]:
def compute_predict_winrate_awinb(elo_rating_data: pd.DataFrame, model_a: str, model_b: str):
    """
    Predict the probability that model_a beats model_b from their Elo ratings.

    Uses the standard Elo expected-score formula
    ``E_a = 1 / (1 + BASE ** ((R_b - R_a) / SCALE))`` with BASE=10 and SCALE=400.
    The probability that model_b beats model_a is ``1 - E_a``.

    Args:
        elo_rating_data (pd.DataFrame): Ratings table with a ``model`` column
            (model names) and an ``elo_rating`` column (numeric ratings).
            If a model appears more than once, its last row is used.
        model_a (str): Name of model_a.
        model_b (str): Name of model_b.

    Returns:
        float: Predicted win rate of model_a over model_b, or ``np.nan`` when
        either model has no rating in ``elo_rating_data``.
    """
    SCALE = 400
    BASE = 10

    # Build the name -> rating lookup in one pass. This function is called once
    # per recorded battle, so avoiding a per-call sort and DataFrame.iterrows()
    # matters for the total runtime.
    ratings = dict(zip(elo_rating_data['model'], elo_rating_data['elo_rating']))

    if model_a not in ratings or model_b not in ratings:
        return np.nan

    # Cast to plain Python floats so the result type does not depend on the
    # column dtype.
    rating_a = float(ratings[model_a])
    rating_b = float(ratings[model_b])
    return 1 / (1 + BASE ** ((rating_b - rating_a) / SCALE))

In [35]:
# Imported once here rather than on every loop iteration.
from elo_rating.rating_evaluator import compute_actual_winrate_awinb

# Winner labels that count as a scored battle; any other label is skipped.
valid_winner = frozenset(['model_a', 'model_b', 'tie', 'tie(all bad)'])

# The same CSV is loaded twice on purpose: as a DataFrame (for the actual
# win-rate computation) and as ordered battle-pair objects (for names/winner).
battled_pairs_path = Path(result_dir) / 'battled_pairs.csv'
battled_pairs_df = pd.read_csv(battled_pairs_path)
battled_pairs = BattleOutcomes.read_csv(battled_pairs_path).battled_pairs_in_order
# TODO: handle the invalid winner when dump files and loading
# `battled_pairs` is indexed by battle number below, so rows with an invalid
# winner are skipped inside the loop instead of being filtered out of the list
# (filtering would shift the indices).

rating_history = []
recorded_battle_nums = history.recorded_battle_num

# tqdm wraps the sequence itself (not the enumerate object) so it can read the
# length and show a percentage / ETA instead of a bare iteration counter.
for idx, battle_num in enumerate(tqdm(recorded_battle_nums, desc='calculate rating delta')):
    if idx == 0:
        # No previous recorded point to take the battled pair from.
        continue

    # The pair under evaluation is the battle at the previous recorded point.
    point_prev_battled_pair = battled_pairs[recorded_battle_nums[idx - 1]]
    if point_prev_battled_pair.winner not in valid_winner:
        continue

    # Ratings as of the current recorded point.
    point_cur = history.get_point(battle_num)

    # Sort the two names so a pair always gets the same label and the win rate
    # is always expressed from the same model's perspective.
    model_ab_names = sorted([point_prev_battled_pair.model_a, point_prev_battled_pair.model_b])

    # Actual win rate over the first `battle_num` battles vs. the win rate the
    # current Elo ratings predict for the same pair.
    actual_winrate_awinb = compute_actual_winrate_awinb(
        battled_pairs_df.head(battle_num), model_ab_names[0], model_ab_names[1]
    )
    predict_winrate_awinb = compute_predict_winrate_awinb(point_cur, model_ab_names[0], model_ab_names[1])

    rating_history.append({
        'models': f'{model_ab_names[0]} vs {model_ab_names[1]}',
        'actual_winrate': actual_winrate_awinb,
        'predict_winrate': predict_winrate_awinb,
        # Absolute error of this single point (the key name is kept because
        # the CSV export and the plots read it).
        'mae_winrate': abs(actual_winrate_awinb - predict_winrate_awinb),
        'num_battle': battle_num,
    })

rating_history_pd = pd.DataFrame.from_dict(rating_history)

calculate rating delta: 12124it [13:49, 14.61it/s]


In [36]:
# Save the actual-vs-predicted win-rate table alongside the other result files
# of this run; the DataFrame index is not written.
rating_history_pd.to_csv(Path(result_dir).joinpath('elo_history_winrate.csv'), index=False)

In [40]:
import math

import plotly.graph_objects as go
from plotly.subplots import make_subplots

# One subplot per battled pair label ("<model> vs <model>").
# Converted to a plain list because make_subplots documents `subplot_titles`
# as a list of strings.
unique_models = rating_history_pd['models'].unique().tolist()
num_models = len(unique_models)

# Grid layout of the subplots (a single column: one pair per row).
num_columns = 1
num_rows = math.ceil(num_models / num_columns)

# (column to plot, colour) for the three curves drawn in every subplot.
trace_styles = (
    ('actual_winrate', 'red'),
    ('predict_winrate', 'blue'),
    ('mae_winrate', 'green'),
)

rating_history_fig3 = make_subplots(rows=num_rows, cols=num_columns, subplot_titles=unique_models)

for i, model in enumerate(unique_models, start=1):
    filtered_df = rating_history_pd[rating_history_pd['models'] == model]
    # Map the 1-based subplot index to its (row, col) cell in the grid.
    row = math.ceil(i / num_columns)
    col = i - (row - 1) * num_columns

    for metric, color in trace_styles:
        rating_history_fig3.add_trace(
            go.Scatter(
                x=filtered_df['num_battle'],
                y=filtered_df[metric],
                mode='lines+markers',
                # Name each curve after its metric (the pair is already in the
                # subplot title) and group by metric so the legend has exactly
                # three entries, each toggling that metric in all subplots.
                name=metric,
                legendgroup=metric,
                showlegend=(i == 1),
                line=dict(color=color),
                marker=dict(color=color),
            ),
            row=row, col=col,
        )

# Figure size scales with the grid so each subplot keeps a fixed footprint.
rating_history_fig3.update_layout(height=300*num_rows, width=3000*num_columns, title_text="Model Comparisons")
rating_history_fig3.update_traces(marker=dict(size=2, line=dict(width=2, color='DarkSlateGrey')))
# Every plotted metric is a rate or an absolute difference of rates.
# Kept as the last expression so the figure is displayed as the cell output.
rating_history_fig3.update_yaxes(range=[0, 1])

In [None]:
# Export the comparison figure as a PDF.
# The target directory is created first so the export does not fail on a
# machine where the plots folder does not exist yet.
# NOTE(review): static image export needs plotly's image-export backend
# (e.g. kaleido) installed in the kernel environment -- confirm.
plot_path = Path(r'/elo_bench/plots/elo_history_winrate.pdf')
plot_path.parent.mkdir(parents=True, exist_ok=True)
rating_history_fig3.write_image(str(plot_path))