In [4]:
import hashlib
import json
import os
import sys

import pandas as pd

import factscore
import factscore.factscorer

sys.path.append("benchmark/aspect_prediction")
from utils import TASK2GT, TASK_NAMES, postprocess_cot
from massw.models import gpt_azure, mixtral_azure


def evaluate_factscore(generations, contexts):
    """Score ``generations`` against ``contexts`` with a FActScore scorer.

    Args:
        generations (list[str]): Generated texts to be scored.
        contexts (list[str]): Texts passed to the scorer as the third
            positional argument of ``get_score``, in the same order as
            ``generations``.

    Returns:
        Whatever ``FactScorer.get_score`` returns; elsewhere in this notebook
        the result is JSON-serialised and its ``"score"`` entry is read.
    """
    # Every distinct set of inputs gets its own cache directory, named after
    # the MD5 digest of all generations followed by all contexts.
    joined_text = "\n".join(generations + contexts)
    digest = hashlib.md5(joined_text.encode()).hexdigest()
    scorer = factscore.factscorer.FactScorer(
        cache_dir=f"./cache/factscore/{digest}"
    )
    # No topic is associated with a generation here, so pass one None each.
    topics = [None for _ in generations]
    # NOTE(review): gamma=0 presumably disables FActScore's length penalty —
    # confirm against the factscore documentation.
    return scorer.get_score(topics, generations, contexts, verbose=False, gamma=0)


# Folder holding the benchmark data files, relative to the notebook's working
# directory; the reference JSONL path is built from it.
DATA_FOLDER = "../massw/data"

In [2]:
"""Evaluate experiment results based on the model generated output (file)."""
# Directory whose entries are the per-model output folders to evaluate.
model_output_base_dir = "../massw/benchmark/aspect_prediction/outputs"
# JSON-lines file with one reference record per line (each has a "pid").
reference_path = f"{DATA_FOLDER}/benchmark_0531.jsonl"

# Number of leading reference rows whose pids form the evaluation subset.
N_SAMPLES = 100

# Read reference data and keep the pids of the first N_SAMPLES rows.
df_ref = pd.read_json(reference_path, lines=True)
sample_ids = df_ref["pid"].head(N_SAMPLES).tolist()


def postprocess_output(
    model_output_dir, reference_path, used_cot=False, model_type="gpt"
):
    """
    Process model output files to match predictions with references.

    For every task in ``TASK_NAMES`` the raw ``{task_name}.tsv`` file in
    ``model_output_dir`` is parsed into a pid -> prediction mapping, optionally
    run through ``postprocess_cot``, restricted to the pids in the module-level
    ``sample_ids``, and paired with the ``TASK2GT[task_name]`` field of the
    reference record that has the same pid.

    Args:
        model_output_dir (str): Directory containing the output files.
        reference_path (str): Path to the file containing reference data
            (one JSON object per line, each with a ``"pid"`` key).
        used_cot (bool): Flag to determine if COT processing is needed.
        model_type (str): Type of model used to adjust processing logic;
            either ``"gpt"`` or ``"mixtral"``.

    Returns:
        dict: ``{task_name: {"predictions": [...], "references": [...]}}``.
        Both lists have the same length and position ``i`` in each refers to
        the same pid, in the iteration order of the parsed prediction mapping.

    Raises:
        ValueError: If ``model_type`` is not supported.
        KeyError: If a sampled prediction has no matching reference record.
    """
    results = {}
    with open(reference_path, "r", encoding="utf-8") as f:
        # Ignore blank lines (e.g. a trailing empty line) that json.loads
        # would otherwise reject.
        references = [json.loads(line) for line in f if line.strip()]
    id2ref = {r["pid"]: r for r in references}
    # A set makes the per-pid sample filter a constant-time lookup.
    sampled_pids = set(sample_ids)

    for task_name in TASK_NAMES:
        gt_name = TASK2GT[task_name]
        model_path = f"{model_output_dir}/{task_name}.tsv"

        if model_type == "gpt":
            id2predictions = gpt_azure.raw_output_to_dict(model_path)
        elif model_type == "mixtral":
            id2predictions = mixtral_azure.raw_output_to_dict(model_path)
        else:
            raise ValueError(f"Model type {model_type} not supported.")

        if used_cot:
            for pid in id2predictions:
                try:
                    id2predictions[pid] = postprocess_cot(id2predictions[pid])
                except Exception as e:
                    # Best effort: report the failure and keep the raw
                    # prediction for this pid.
                    print(f"Error processing COT for {pid}: {e}")

        # Derive both lists from one pid sequence so that predictions[i] and
        # references[i] always belong to the same pid, regardless of the order
        # of records in the reference file.
        kept_pids = [pid for pid in id2predictions if pid in sampled_pids]
        missing = [pid for pid in kept_pids if pid not in id2ref]
        if missing:
            raise KeyError(
                f"No reference found for pids {missing} in task {task_name}"
            )

        results[task_name] = {
            "predictions": [id2predictions[pid] for pid in kept_pids],
            "references": [id2ref[pid][gt_name] for pid in kept_pids],
        }

    return results

In [None]:
# Score every model's predictions with FActScore and dump the raw result of
# each (model, task) pair as a JSON file under <dump_folder>/benchmark.
dump_folder = "./results/"
benchmark_dump_dir = os.path.join(dump_folder, "benchmark")
# open(..., "w") does not create parent directories, so make sure the target
# exists before the first dump.
os.makedirs(benchmark_dump_dir, exist_ok=True)

# List the model output directories. os.listdir also returns plain files and
# hidden entries (e.g. .ipynb_checkpoints), which are not model outputs, and
# its order is arbitrary, so filter and sort for a reproducible run.
model_output_dirs = sorted(
    entry
    for entry in os.listdir(model_output_base_dir)
    if not entry.startswith(".")
    and os.path.isdir(os.path.join(model_output_base_dir, entry))
)
for d in model_output_dirs:
    model_output_path = f"{model_output_base_dir}/{d}"
    print(f"Processing {model_output_path}")
    # The directory name encodes the prompting style and the model family.
    used_cot = "chain" in d
    model_type = "gpt" if "gpt" in d else "mixtral"
    results = postprocess_output(model_output_path, reference_path, used_cot, model_type)
    print(f"Model: {d}")
    for task_name in results:
        predictions = results[task_name]["predictions"]
        references = results[task_name]["references"]
        print(f"Task: {task_name}, predictions: {len(predictions)}, references: {len(references)}")
        scores = evaluate_factscore(predictions, references)
        dump_path = os.path.join(benchmark_dump_dir, f"{d}_{task_name}_fs.json")
        with open(dump_path, "w", encoding="utf-8") as f:
            json.dump(scores, f)


In [None]:
# Folder containing the per-(model, task) FActScore dumps.
benchmark_factscore_dir = "results/benchmark"

# Nested mapping: model output directory name -> task name -> score.
benchmark_factscore = {}
for name in model_output_dirs:
    per_task = benchmark_factscore[name] = {}
    for task_name in TASK_NAMES:
        dump_path = f"{benchmark_factscore_dir}/{name}_{task_name}_fs.json"
        with open(dump_path, "r") as f:
            scores = json.load(f)
        # Report the score as it is loaded, then record it for this model.
        print(f"{name} {task_name}: {scores['score']:0.3f}")
        per_task[task_name] = scores["score"]