In [None]:
import json
import os
from typing import Any
import numpy as np
import pandas as pd

In [None]:
def get_rule_eval_jsonl(model_name:str, audio_task:str, response_task:str, IF_task:str, data_dir:str="model_responses", expected_n_shot_levels:int=9) -> dict[int, list[dict[str, Any]]]:
    """Load the rule-evaluation reports of one model / task combination.

    Looks in ``<data_dir>/<model_name>/<audio_task>/<response_task>/<IF_task>/reports``
    for files named ``rule_eval@output_<k>-shot.jsonl`` and parses every
    non-blank line of each file as one JSON record.

    Args:
        model_name: Name of the model sub-directory.
        audio_task: Name of the audio-task sub-directory.
        response_task: Name of the response-task sub-directory.
        IF_task: Instruction-following task id; any ``:`` is replaced by ``_``
            to obtain the directory name.
        data_dir: Root directory that holds the model responses.
        expected_n_shot_levels: Number of shot levels expected on disk; a
            warning is printed when a different number is found.

    Returns:
        A dict mapping the integer shot level ``k`` to the list of records
        read from the matching file, ordered by ascending shot level.

    Raises:
        FileNotFoundError: If the ``reports`` directory does not exist.
        ValueError: If the shot level in a file name is not an integer, or a
            non-blank line is not valid JSON (``json.JSONDecodeError``).
    """
    IF_task = IF_task.replace(":", "_")
    file_dir = os.path.join(data_dir, model_name, audio_task, response_task, IF_task, "reports")

    # Map shot level -> file name first so the files can be read in a
    # deterministic (ascending shot level) order; os.listdir order is arbitrary.
    shot_files = {}
    for file_name in os.listdir(file_dir):
        if not (file_name.startswith("rule_eval@output_") and file_name.endswith(".jsonl")):
            continue
        k = int(file_name.split("@output_")[-1].split("-shot")[0])
        shot_files[k] = file_name

    results = {}
    for k in sorted(shot_files):
        file_path = os.path.join(file_dir, shot_files[k])
        with open(file_path, 'r', encoding='utf-8') as f:
            # Skip blank lines (e.g. a trailing newline at the end of the file),
            # which json.loads would otherwise reject.
            results[k] = [json.loads(line) for line in f if line.strip()]

    if len(results) != expected_n_shot_levels:
        print(f"Warning: Expected {expected_n_shot_levels} shot levels, but got {len(results)} in {file_dir}")
    return results

In [43]:
def createDFfromRuleEvalResults(results:dict[str, dict[int, list[dict[str, Any]]]], performance_metric:str) -> pd.DataFrame:
    """Summarise rule-evaluation records into one row per (IF task, shot level).

    Args:
        results: Mapping ``IF_task -> shot_level -> list of evaluation records``.
            Every record must contain the keys
            ``strict_follow_all_instructions`` and ``loose_follow_all_instructions``.
        performance_metric: Key of the task-performance score inside each
            record (for example ``"wer"``).

    Returns:
        A DataFrame with the columns ``IF_task``, ``shot_level``,
        ``if_rate_strict``, ``if_rate_loose``, ``mean_performance`` and ``n``.
        ``mean_performance`` is ``None`` when the first record of a group does
        not contain ``performance_metric``. A group without any record yields
        ``n == 0``, NaN rates and ``None`` performance instead of raising.

    Raises:
        KeyError: If a record lacks one of the follow-flag keys, or lacks
            ``performance_metric`` although the first record of its group has it.
    """
    d = {
        "IF_task": [],
        "shot_level": [],
        "if_rate_strict": [],
        "if_rate_loose": [],
        "mean_performance": [],
        "n": []
    }

    for if_task, per_shot in results.items():
        for shot_level, evaluations in per_shot.items():
            n = len(evaluations)
            if n == 0:
                # An empty report file: nothing to average and no first record
                # to probe for the metric.
                if_rate_strict = float("nan")
                if_rate_loose = float("nan")
                mean_performance = None
            else:
                if_rate_strict = np.mean([record['strict_follow_all_instructions'] for record in evaluations])
                if_rate_loose = np.mean([record['loose_follow_all_instructions'] for record in evaluations])
                # The first record decides whether this group carries the metric.
                if performance_metric in evaluations[0]:
                    mean_performance = np.mean([record[performance_metric] for record in evaluations])
                else:
                    mean_performance = None

            d["IF_task"].append(if_task)
            d["shot_level"].append(shot_level)
            d["if_rate_strict"].append(if_rate_strict)
            d["if_rate_loose"].append(if_rate_loose)
            d["mean_performance"].append(mean_performance)
            d["n"].append(n)

    return pd.DataFrame(d)

In [63]:
def eval(model_name:str, to_csv:bool=True, data_dir:str="model_responses", out_dir:str="./analysis"):
    """Build the closed-ended-question summaries of one model.

    For each audio task (``ASR``, ``GR``, ``SER``) every instruction-following
    task directory found under
    ``<data_dir>/<model_name>/<audio_task>/closed_ended_questions`` is loaded
    with ``get_rule_eval_jsonl`` and summarised with
    ``createDFfromRuleEvalResults``.

    Note: this function's name shadows the builtin ``eval`` inside the notebook.

    Args:
        model_name: Name of the model sub-directory.
        to_csv: When true, write one summary CSV per audio task into ``out_dir``.
        data_dir: Root directory that holds the model responses.
        out_dir: Directory receiving the CSV files; created if missing.

    Returns:
        A dict mapping audio task name to its summary DataFrame.
    """
    response_task = "closed_ended_questions"
    # Performance metric stored in the evaluation records of each audio task.
    metric_by_audio_task = {"ASR": "wer", "GR": "answer_correct", "SER": "answer_correct"}

    d_df = {}
    for audio_task, performance_metric in metric_by_audio_task.items():
        task_root = os.path.join(data_dir, model_name, audio_task, response_task)
        # Keep directories only (stray files such as .DS_Store would make the
        # loader fail) and sort them so the row order is reproducible.
        IF_tasks = sorted(
            entry for entry in os.listdir(task_root)
            if os.path.isdir(os.path.join(task_root, entry))
        )
        results = {
            IF_task: get_rule_eval_jsonl(model_name, audio_task, response_task, IF_task, data_dir=data_dir)
            for IF_task in IF_tasks
        }
        d_df[audio_task] = createDFfromRuleEvalResults(results, performance_metric)

    if to_csv:
        # to_csv does not create missing parent directories.
        os.makedirs(out_dir, exist_ok=True)
        for audio_task, df in d_df.items():
            fn = os.path.join(out_dir, f"{model_name}_{audio_task}_{response_task}_summary.csv")
            print(f"Saving summary to {fn}")
            df.to_csv(fn, index=False)

    return d_df

In [81]:
# Summarise every model in turn; the results are keyed by model name.
d_df = {}
for model_name in ("qwen", "qwen2", "desta2_5", "blsp-emo"):
    # Banner separating the per-model "Saving summary ..." messages.
    print(f"------------ {model_name} ------------")
    d_df[model_name] = eval(model_name)

------------ qwen ------------
Saving summary to ./analysis/qwen_ASR_closed_ended_questions_summary.csv
Saving summary to ./analysis/qwen_GR_closed_ended_questions_summary.csv
Saving summary to ./analysis/qwen_SER_closed_ended_questions_summary.csv
------------ qwen2 ------------
Saving summary to ./analysis/qwen2_ASR_closed_ended_questions_summary.csv
Saving summary to ./analysis/qwen2_GR_closed_ended_questions_summary.csv
Saving summary to ./analysis/qwen2_SER_closed_ended_questions_summary.csv
------------ desta2_5 ------------
Saving summary to ./analysis/desta2_5_ASR_closed_ended_questions_summary.csv
Saving summary to ./analysis/desta2_5_GR_closed_ended_questions_summary.csv
Saving summary to ./analysis/desta2_5_SER_closed_ended_questions_summary.csv
------------ blsp-emo ------------
Saving summary to ./analysis/blsp-emo_ASR_closed_ended_questions_summary.csv
Saving summary to ./analysis/blsp-emo_GR_closed_ended_questions_summary.csv
Saving summary to ./analysis/blsp-emo_SER_closed_ended_questions_summary.csv

In [None]:
# Instruction-following task id -> task group. The ids are written with the
# "<category>:<name>" spelling and converted to the "<category>_<name>"
# spelling (":" replaced by "_") while the final dicts are built.
group_map = {
    task_id.replace(':', '_'): group
    for task_id, group in {
        "detectable_format:number_bullet_lists": "bullet_lists",
        "length_constraints:number_words": "length_constraints",
        "length_constraints:number_sentences": "length_constraints",
        "length_constraints:number_paragraphs": "length_constraints",
        "keywords:forbidden_words": "keywords",
        "keywords:existence": "keywords",
        "change_case:english_capital": "change_case",
        "change_case:english_lowercase": "change_case",
        "detectable_format:json_format": "json_format",
        "startend:quotation": "wrapping",
        "detectable_format:title": "wrapping",
        "combination:repeat_prompt": "startend",
        "startend:end_checker": "startend",
    }.items()
}

# Same mapping restricted to a smaller set of tasks; the insertion order of
# this dict is preserved.
group_map_ceq = {
    task_id.replace(':', '_'): group
    for task_id, group in {
        "change_case:english_capital": "change_case",
        "change_case:english_lowercase": "change_case",
        "detectable_format:json_format": "json_format",
        "startend:quotation": "wrapping",
        "detectable_format:title": "wrapping",
        "combination:repeat_prompt": "startend",
        "startend:end_checker": "startend",
    }.items()
}



In [145]:
model_order = ["qwen", "qwen2", "desta2_5", "blsp-emo"]

# Groups in order of first appearance in group_map_ceq. "other" is left out
# here; it is appended as the last category further down, and only when some
# task really falls into it.
group_order = []
for v in group_map_ceq.values():
    if v not in group_order and v != "other":
        group_order.append(v)

df_audio_task = {}
# ExcelWriter does not create missing parent directories.
os.makedirs("./analysis", exist_ok=True)
fn = os.path.join("./analysis", "summary_ceq.xlsx")

with pd.ExcelWriter(fn, engine="openpyxl") as writer:
    for audio_task in ["ASR", "GR", "SER"]:
        dfs = []
        for model_name in model_order:
            df = d_df[model_name][audio_task].copy()

            df["model"] = model_name
            # Tasks missing from group_map_ceq map to NaN; label them "other"
            # so they are not silently dropped by the groupby below.
            df["IF_task_group"] = df["IF_task"].map(group_map_ceq).fillna("other")

            df = df[[
                "IF_task_group", "IF_task", "n", "model",
                "shot_level", "if_rate_strict", "if_rate_loose", "mean_performance"
            ]]
            dfs.append(df)

        df_all = pd.concat(dfs, ignore_index=True)

        # --- ordered categoricals for deterministic sorting ---
        # shot_level holds the integers parsed by get_rule_eval_jsonl, so a
        # plain sort gives the numeric order.
        shot_order = sorted(df_all["shot_level"].dropna().unique())
        df_all["shot_level"] = pd.Categorical(df_all["shot_level"], categories=shot_order, ordered=True)

        # Add "other" as the last category only when it occurs; with
        # observed=False an unused category would add all-NaN rows to the
        # grouped sheet.
        group_categories = list(group_order)
        if df_all["IF_task_group"].eq("other").any():
            group_categories.append("other")
        df_all["IF_task_group"] = pd.Categorical(df_all["IF_task_group"], categories=group_categories, ordered=True)
        df_all["model"] = pd.Categorical(df_all["model"], categories=model_order, ordered=True)

        # stable sort so ties keep predictable order
        df_all = df_all.sort_values(
            by=["shot_level", "IF_task_group", "IF_task", "model"],
            ascending=[True, True, True, True],
            kind="mergesort",
        )

        df_all.to_excel(writer, sheet_name=audio_task, index=False)
        df_audio_task[audio_task] = df_all

        # Unweighted mean of every numeric column (including n) per
        # (shot level, task group, model).
        df_audio_task[f"{audio_task}_grouped"] = df_all.groupby(["shot_level", "IF_task_group", "model"], observed=False).mean(numeric_only=True).reset_index()
        df_audio_task[f"{audio_task}_grouped"].to_excel(writer, sheet_name=f"{audio_task}_grouped", index=False)

print(f"Saving combined summary to {fn}")


Saving combined summary to ./analysis/summary_ceq.xlsx


In [146]:
# Show which per-task and "<task>_grouped" summary frames were built above.
df_audio_task.keys()

dict_keys(['ASR', 'ASR_grouped', 'GR', 'GR_grouped', 'SER', 'SER_grouped'])