In [45]:
import pandas as pd
from collections import Counter

# Directory holding the results of the run under evaluation.
result_dir = "results/HW2_lr5e-6"
# Directory holding the results that the run is compared against.
base_result_dir = "results/SmolLM2-135M"


def get_stat_df(result_dir: str):
    """Compute per-instruction accuracy from a strict evaluation results file.

    Reads ``{result_dir}/eval_results_strict.jsonl`` as JSON Lines. Each record
    must provide ``instruction_id_list`` and ``follow_instruction_list``; the
    two lists are paired element by element within the record, and a truthy
    flag counts as the instruction having been followed.

    Args:
        result_dir: Directory that contains ``eval_results_strict.jsonl``.

    Returns:
        A DataFrame with columns ``Instruction`` and ``Accuracy`` (followed
        count divided by total count), one row per distinct instruction id, in
        order of first appearance in the file.

    Raises:
        ValueError: If a record's two lists differ in length, since the flags
            could then not be attributed to instructions reliably.
    """
    df = pd.read_json(f"{result_dir}/eval_results_strict.jsonl", lines=True)

    # Counter is a dict subclass, so iteration follows first-insertion order;
    # this keeps the row order reproducible across runs (a set would not).
    total_counter = Counter()
    correct_counter = Counter()

    for r in df.to_dict(orient="records"):
        instruction_ids = r["instruction_id_list"]
        followed_flags = r["follow_instruction_list"]
        if len(instruction_ids) != len(followed_flags):
            raise ValueError(
                "instruction_id_list and follow_instruction_list differ in length "
                f"({len(instruction_ids)} vs {len(followed_flags)})"
            )
        for instruction_id, followed in zip(instruction_ids, followed_flags):
            total_counter[instruction_id] += 1
            if followed:
                correct_counter[instruction_id] += 1

    # Every key in total_counter has a count of at least 1, so the division is safe.
    records = [(k, correct_counter[k] / total) for k, total in total_counter.items()]

    acc_df = pd.DataFrame(records, columns=['Instruction', 'Accuracy'])

    return acc_df

def get_compare_df(result_dir: str, base_result_dir: str):
    """Compare per-instruction accuracy of a run against a baseline run.

    Both directories are loaded with ``get_stat_df``. The baseline accuracy is
    joined to the run's rows on the ``Instruction`` value, so every row
    compares the same instruction no matter how either table is ordered or
    whether the two tables contain the same set of instructions.

    Args:
        result_dir: Directory with the results of the run being evaluated.
        base_result_dir: Directory with the baseline results.

    Returns:
        A DataFrame with one row per instruction found in ``result_dir`` and
        the columns ``Instruction``, ``Accuracy``, ``Accuracy Bef.``,
        ``Increase`` and ``Improved``. Accuracies and the increase are rounded
        to two decimals. ``Improved`` is ``"Yes"``, ``"Unchanged"`` or ``"No"``;
        instructions without a baseline row get NaN in ``Accuracy Bef.`` and
        ``Increase`` and are labelled ``"N/A"``.
    """
    base_acc_df = get_stat_df(base_result_dir)
    acc_df = get_stat_df(result_dir)

    acc_df["Accuracy"] = acc_df["Accuracy"].astype(float).round(2)

    # rename() returns a new frame, so the baseline table itself is not modified.
    baseline = base_acc_df[["Instruction", "Accuracy"]].rename(columns={"Accuracy": "Accuracy Bef."})
    baseline["Accuracy Bef."] = baseline["Accuracy Bef."].astype(float).round(2)

    # Join on the instruction id rather than relying on row position: plain
    # column assignment would align on the integer index and could attach a
    # baseline value to a different instruction.
    acc_df = acc_df.merge(baseline, on="Instruction", how="left")

    # Round the difference as well so float artifacts (e.g. 1e-17) cannot turn
    # an unchanged accuracy into "Yes" or "No".
    acc_df["Increase"] = (acc_df["Accuracy"] - acc_df["Accuracy Bef."]).round(2)

    def label(delta):
        # NaN means there was no baseline row to compare against.
        if pd.isna(delta):
            return "N/A"
        if delta > 0:
            return "Yes"
        if delta == 0:
            return "Unchanged"
        return "No"

    acc_df["Improved"] = acc_df["Increase"].apply(label)

    return acc_df


# Build the comparison table of the evaluated run against the baseline run.
acc_df = get_compare_df(result_dir=result_dir, base_result_dir=base_result_dir)

In [46]:
# Show the instructions ordered from largest to smallest accuracy increase.
acc_df.sort_values(by='Increase', ascending=False)

Unnamed: 0,Instruction,Accuracy,Accuracy Bef.,Increase,Improved
1,keywords:existence,1.0,0.0,1.0,Yes
22,detectable_format:constrained_response,1.0,0.0,1.0,Yes
19,combination:repeat_prompt,0.78,0.0,0.78,Yes
15,detectable_content:postscript,0.83,0.17,0.66,Yes
10,detectable_format:title,0.67,0.17,0.5,Yes
17,combination:two_responses,0.5,0.0,0.5,Yes
14,change_case:capital_word_frequency,0.57,0.14,0.43,Yes
8,length_constraints:number_paragraphs,0.4,0.0,0.4,Yes
6,length_constraints:number_words,0.43,0.14,0.29,Yes
12,keywords:letter_frequency,0.62,0.38,0.24,Yes
