In [1]:
import json
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go


# Ordered category labels. The loaders in this cell index this list with
# (question_id - 81) // 10, so each consecutive run of ten question ids
# starting at 81 maps to one label, in the order given here.
CATEGORIES = ["Writing", "Roleplay", "Reasoning", "Math", "Coding", "Extraction", "STEM", "Humanities"]


def get_model_df(path="data/mt_bench/model_judgment/gpt-4_single.jsonl"):
    """Load single-answer judgments from a JSONL file into a DataFrame.

    Every non-blank line is parsed as one JSON object. Each object gets an
    extra "category" key, looked up in CATEGORIES from its "question_id"
    via (question_id - 81) // 10.

    Args:
        path: Location of the JSONL judgment file. Defaults to the path the
            notebook has always read from, so existing calls are unaffected.

    Returns:
        pandas.DataFrame with one row per parsed record; columns are the
        keys found in the file plus "category".

    Raises:
        IndexError: if a question id maps past the end of CATEGORIES.
    """
    records = []
    # The context manager closes the handle even when a line fails to parse;
    # the previous bare open() left it to the garbage collector.
    with open(path, "r") as fin:
        for line in fin:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            obj = json.loads(line)
            obj["category"] = CATEGORIES[(obj["question_id"] - 81) // 10]
            records.append(obj)
    return pd.DataFrame(records)

def toggle(res_str):
    """Return the outcome seen from the opposing side.

    "win" becomes "loss", "loss" becomes "win"; every other value
    (including "tie") yields "tie".
    """
    # Scan a small table of (outcome, opposite) pairs using plain equality.
    for outcome, opposite in (("win", "loss"), ("loss", "win")):
        if res_str == outcome:
            return opposite
    return "tie"

def get_model_df_pair(path="data/mt_bench/model_judgment/gpt-4_pair.jsonl"):
    """Load pairwise judgments from a JSONL file into a DataFrame.

    Every non-blank line is parsed as one JSON object and reduced to a row
    with these keys:

    * "qid"      - str(question_id)
    * "turn"     - str(turn)
    * "result"   - "win" when both "g1_winner" and "g2_winner" are
                   "model_1", "loss" when both are "model_2", otherwise
                   "tie" (any disagreement or other value counts as a tie)
    * "category" - CATEGORIES[(question_id - 81) // 10]
    * "model"    - the record's "model_1" value; the result is therefore
                   expressed from model_1's point of view

    Args:
        path: Location of the JSONL judgment file. Defaults to the path the
            notebook has always read from, so existing calls are unaffected.

    Returns:
        pandas.DataFrame with one row per parsed record.

    Raises:
        IndexError: if a question id maps past the end of CATEGORIES.
    """
    rows = []
    # The context manager closes the handle even when a line fails to parse;
    # the previous bare open() left it to the garbage collector.
    with open(path, "r") as fin:
        for line in fin:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            obj = json.loads(line)

            # Only a unanimous verdict across both judgments counts as a
            # win or a loss for model_1.
            if obj["g1_winner"] == "model_1" and obj["g2_winner"] == "model_1":
                outcome = "win"
            elif obj["g1_winner"] == "model_2" and obj["g2_winner"] == "model_2":
                outcome = "loss"
            else:
                outcome = "tie"

            rows.append({
                "qid": str(obj["question_id"]),
                "turn": str(obj["turn"]),
                "result": outcome,
                "category": CATEGORIES[(obj["question_id"] - 81) // 10],
                "model": obj["model_1"],
            })

    return pd.DataFrame(rows)

# Load both judgment files once. Later cells read `df` (single-answer
# scores) and `df_pair` (pairwise win/loss/tie outcomes).
df = get_model_df()
df_pair = get_model_df_pair()

In [2]:
# Preview the pairwise-judgment frame (one row per question/turn/model).
df_pair

Unnamed: 0,qid,turn,result,category,model
0,81,1,loss,Writing,alpaca-13b
1,81,2,loss,Writing,alpaca-13b
2,82,1,loss,Writing,alpaca-13b
3,82,2,loss,Writing,alpaca-13b
4,83,1,loss,Writing,alpaca-13b
...,...,...,...,...,...
4795,158,2,tie,Humanities,wizardlm-30b
4796,159,1,loss,Humanities,wizardlm-30b
4797,159,2,win,Humanities,wizardlm-30b
4798,160,1,loss,Humanities,wizardlm-30b


In [3]:
# Build `scores_all`: one {"model", "category", "score"} record per
# (model, category) pair, where "score" is the mean judge score.
all_models = df["model"].unique()
print(all_models)

# Rows with a negative score are score-format errors (<1% of cases);
# discard them once up front instead of inside every filter.
df_valid = df[df["score"] >= 0]

scores_all = []
for model in all_models:
    is_model = df_valid["model"] == model
    for cat in CATEGORIES:
        is_cat = df_valid["category"] == cat
        res = df_valid[is_cat & is_model]
        score = res["score"].mean()

        # Disabled pairwise statistics, kept for reference:
        # res_pair = df_pair[(df_pair["category"]==cat) & (df_pair["model"]==model)]["result"].value_counts()
        # wincnt = res_pair["win"] if "win" in res_pair.index else 0
        # tiecnt = res_pair["tie"] if "tie" in res_pair.index else 0
        # winrate = wincnt/res_pair.sum()
        # winrate_adjusted = (wincnt + tiecnt)/res_pair.sum()
        # scores_all.append({"model": model, "category": cat, "score": score, "winrate": winrate, "wtrate": winrate_adjusted})

        scores_all.append({"model": model, "category": cat, "score": score})

['alpaca-13b' 'baize-v2-13b' 'chatglm-6b' 'claude-instant-v1' 'claude-v1'
 'dolly-v2-12b' 'falcon-40b-instruct' 'fastchat-t5-3b' 'gpt-3.5-turbo'
 'gpt-4' 'gpt4all-13b-snoozy' 'guanaco-33b' 'guanaco-65b'
 'h2ogpt-oasst-open-llama-13b' 'koala-13b' 'llama-13b' 'mpt-30b-chat'
 'mpt-30b-instruct' 'mpt-7b-chat' 'nous-hermes-13b'
 'oasst-sft-4-pythia-12b' 'oasst-sft-7-llama-30b' 'palm-2-chat-bison-001'
 'rwkv-4-raven-14b' 'stablelm-tuned-alpha-7b' 'tulu-30b' 'vicuna-13b-v1.3'
 'vicuna-33b-v1.3' 'vicuna-7b-v1.3' 'wizardlm-13b' 'wizardlm-30b'
 'Llama-2-7b-chat' 'Llama-2-13b-chat' 'Llama-2-70b-chat'
 'mistral-v01-openhermes-1k-longest-15-epoch-neftune10-lr2e-6'
 'mistral-v01-openhermes-1k-longest-curated-15-epoch-neftune10-lr2e-6'
 'mistral-v01-openhermes-1k-longest-curated-15-epoch-neftune3-lr2e-6'
 'mistral-v01-openhermes-1k-longest-raw-15-epoch-neftune10-lr2e-6'
 'mistral-v01-openhermes-1k-longest-curated-15-epoch-neftune10-lr3e-6'
 'mistral-v01-openhermes-1k-longest-curated-15-epoch-neftune1

In [10]:
# Polar line chart of per-category mean scores for a chosen set of models.

# Alternative selection that also includes the smaller Llama-2 chat models:
# target_models = ["Llama-2-7b-chat", "Llama-2-13b-chat", "Llama-2-70b-chat", "gpt-3.5-turbo", "claude-v1", "gpt-4", "OpenHermes-2.5-Mistral-7B", "mistral-v01-openhermes-1k-longest-raw-15-epoch-neftune10-lr2e-6"]
target_models = [
    "Llama-2-70b-chat",
    "gpt-3.5-turbo",
    "claude-v1",
    "gpt-4",
    "OpenHermes-2.5-Mistral-7B",
    "mistral-v01-openhermes-1k-longest-raw-15-epoch-neftune10-lr2e-6",
    "mistral-v01-openhermes-6k-longest-stratified-10-epoch-neftune10-lr15e-6",
]

# Keep only the records of the selected models. Names in target_models that
# have no record in scores_all are silently absent from the chart.
scores_target = [entry for entry in scores_all if entry["model"] in target_models]

# Order records by reverse position in target_models. sorted() is stable, so
# the per-model category order coming from scores_all is preserved.
scores_target = sorted(scores_target, key=lambda x: target_models.index(x["model"]), reverse=True)

# No second isin() filter is needed: scores_target is already restricted to
# target_models, and skipping it avoids mutating a filtered slice afterwards.
df_score = pd.DataFrame(scores_target)

# Display names for the legend; models without an entry keep their raw name.
rename_map = {"llama-13b": "LLaMA-13B",
              "alpaca-13b": "Alpaca-13B",
              "vicuna-33b-v1.3": "Vicuna-33B",
              "vicuna-13b-v1.3": "Vicuna-13B",
              "gpt-3.5-turbo": "GPT-3.5-turbo",
              "claude-v1": "Claude-v1",
              "gpt-4": "GPT-4",
              "mistral-v01-openhermes-1k-longest-raw-15-epoch-neftune10-lr2e-6": "mistral-v01-openhermes-1k-longest",
              "mistral-v01-openhermes-6k-longest-stratified-10-epoch-neftune10-lr15e-6": "mistral-v01-openhermes-6k-longest"
              }

# Rename within the "model" column only, in a single non-in-place call,
# rather than looping an in-place replace over every column of the frame.
df_score["model"] = df_score["model"].replace(rename_map)

fig = px.line_polar(
    df_score,
    r="score",
    theta="category",
    line_close=True,
    category_orders={"category": CATEGORIES},
    color="model",
    markers=True,
    color_discrete_sequence=px.colors.qualitative.Pastel,
)

fig.show()