In [6]:
import os
import pickle
import dotenv
from tqdm import tqdm
dotenv.load_dotenv()
import numpy as np
from collections import defaultdict

def aggregate_scores(eval_dir, score_key="qwen_scores"):
    """Collect the scores stored in every ``.pkl`` file of a directory.

    Each entry of ``eval_dir`` whose name ends in ``.pkl`` is unpickled and
    the sequence found under ``score_key`` is appended to one flat list.
    All other entries are ignored.

    Args:
        eval_dir: Directory containing the pickled evaluation results.
        score_key: Key of the score sequence inside each unpickled result.
            Defaults to ``"qwen_scores"``, the key this function has always
            read, so existing callers are unaffected.

    Returns:
        A flat list holding the scores of all files, concatenated in sorted
        file-name order. Empty when the directory holds no ``.pkl`` files.

    Raises:
        OSError: If ``eval_dir`` cannot be listed or a file cannot be opened.
        KeyError: If a result has no ``score_key`` entry (assuming results
            are dict-like).
    """
    scores = []
    # os.listdir returns entries in arbitrary order; sorting makes the
    # concatenated list reproducible across runs and machines.
    for file in tqdm(sorted(os.listdir(eval_dir))):
        if not file.endswith(".pkl"):
            continue
        with open(os.path.join(eval_dir, file), "rb") as f:
            # NOTE: pickle.load can execute arbitrary code. Only point this
            # at evaluation outputs that you produced yourself.
            result = pickle.load(f)
        scores.extend(result[score_key])
    return scores

# eval_dir = "/home/song/code/frag4/output/train/evaluation/Qwen/Qwen2.5-7B-Instruct/squad/greedy_golden"
# scores = aggregate_scores(eval_dir)
# print(np.mean(scores), np.std(scores))

In [17]:
import json

# Evaluation grid: every (split, model, sample) combination is scored on
# every dataset.
datasets = ["squad", "triviaqa"]
models = ["Qwen/Qwen2.5-7B-Instruct", "meta-llama/Llama-3.1-8B-Instruct"]
samples = ["greedy_golden", "greedy_without", "greedy_irrelevant"]
# splits = ["train", "test", "validation"]
splits = ["train"]

# Root of the evaluation outputs and name of the summary file, kept in one
# place so the notebook can be pointed at another checkout with one edit.
OUTPUT_ROOT = "/home/song/code/frag4/output"
SCORES_JSON = "all_scores.json"

# Maps "{split}-{model}-{sample}" -> {dataset: mean score as a 2-decimal string}.
all_scores = defaultdict(dict)
for split in splits:
    for model in models:
        for sample in samples:
            # NOTE(review): this header advertises "mean std", but no values
            # are printed after it and only the mean is stored below.
            print(f"{split} {model} {sample} mean std")
            for dataset in datasets:
                eval_dir = f"{OUTPUT_ROOT}/{split}/evaluation/{model}/{dataset}/{sample}"
                scores = aggregate_scores(eval_dir)
                # An empty list would make np.mean return nan and write "nan"
                # into the summary; fail with the offending directory instead.
                assert scores, f"no scores found in {eval_dir}"
                # Scores are expected to be binary correct/incorrect flags.
                assert all(x in (0, 1) for x in scores), f"non-binary score in {eval_dir}"
                all_scores[f"{split}-{model}-{sample}"][dataset] = f"{np.mean(scores):.2f}"

# Use a context manager so the file is flushed and closed deterministically.
with open(SCORES_JSON, "w") as f:
    json.dump(all_scores, f, indent=4)

train Qwen/Qwen2.5-7B-Instruct greedy_golden mean std


100%|██████████| 2000/2000 [00:00<00:00, 31109.94it/s]
100%|██████████| 2000/2000 [00:00<00:00, 41213.96it/s]


train Qwen/Qwen2.5-7B-Instruct greedy_without mean std


100%|██████████| 2000/2000 [00:00<00:00, 50776.96it/s]
100%|██████████| 2000/2000 [00:00<00:00, 57578.87it/s]


train Qwen/Qwen2.5-7B-Instruct greedy_irrelevant mean std


100%|██████████| 2000/2000 [00:00<00:00, 62418.95it/s]
100%|██████████| 2000/2000 [00:00<00:00, 61579.06it/s]


train meta-llama/Llama-3.1-8B-Instruct greedy_golden mean std


100%|██████████| 2000/2000 [00:00<00:00, 64603.79it/s]
100%|██████████| 2000/2000 [00:00<00:00, 65037.04it/s]


train meta-llama/Llama-3.1-8B-Instruct greedy_without mean std


100%|██████████| 2000/2000 [00:00<00:00, 62779.11it/s]
100%|██████████| 2000/2000 [00:00<00:00, 63764.06it/s]


train meta-llama/Llama-3.1-8B-Instruct greedy_irrelevant mean std


100%|██████████| 2000/2000 [00:00<00:00, 63766.96it/s]
100%|██████████| 2000/2000 [00:00<00:00, 63711.75it/s]


In [None]:
"""
"all_scores.json" looks like this:
{
    "train-Qwen/Qwen2.5-7B-Instruct-greedy_golden": {
        "squad": "0.88",
        "triviaqa": "0.82"
    },
    "train-Qwen/Qwen2.5-7B-Instruct-greedy_without": {
        "squad": "0.26",
        "triviaqa": "0.62"
    },
    "train-Qwen/Qwen2.5-7B-Instruct-greedy_irrelevant": {
        "squad": "0.11",
        "triviaqa": "0.42"
    },
    "train-meta-llama/Llama-3.1-8B-Instruct-greedy_golden": {
        "squad": "0.76",
        "triviaqa": "0.69"
    },
    "train-meta-llama/Llama-3.1-8B-Instruct-greedy_without": {
        "squad": "0.28",
        "triviaqa": "0.74"
    },
    "train-meta-llama/Llama-3.1-8B-Instruct-greedy_irrelevant": {
        "squad": "0.24",
        "triviaqa": "0.60"
    }
}
"""