In [1]:
import gc
import sys
from pathlib import Path

import pandas as pd
import torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer

from oumi.datasets import ChatRAGBenchDataset

## Step 0: Setup

### Utils

In [2]:
def write_lines(filename, lines):
    """Save each string in ``lines`` to ``filename``, one per line.

    The file is opened in write mode (any existing content is replaced) with
    UTF-8 encoding, and a newline is appended after every entry.

    Args:
        filename: Path of the file to write.
        lines: Iterable of strings to write, in order.
    """
    with open(filename, "w", encoding="utf8") as handle:
        handle.writelines(line + "\n" for line in lines)

### Setup

In [None]:
# Clone the ChatRAG-Bench dataset and its evaluation scripts locally.
# One-time setup: uncomment the line below to run it.
# !git clone git@hf.co:datasets/nvidia/ChatRAG-Bench

In [3]:
# Location of the cloned ChatRAG-Bench repo (relative to the working directory)
base_chatqa_folder = "ChatRAG-Bench"

# Make the repo's evaluation scripts importable. The membership check keeps
# the cell idempotent: re-running it does not add duplicate sys.path entries.
evaluation_scripts_folder = str(Path(base_chatqa_folder) / "evaluation")
if evaluation_scripts_folder not in sys.path:
    sys.path.append(evaluation_scripts_folder)

### Config

In [10]:
# Models to evaluate. Each entry may point at an Oumi checkpoint (local path)
# or at a Hugging Face model id.
# Oumi checkpoints are stored on GCS (gs://; exact bucket path not recorded here).
models = [
    # (display-friendly name, model name or checkpoint path)
    # The display name doubles as the per-model sub-folder for predictions.
    ("phi3-original", "microsoft/Phi-3-mini-4k-instruct"),
    ("chatqa-8b", "nvidia/ChatQA-1.5-8B"),
    ("phi3-finetune-stage2-last-ckpt", "../../stage2/checkpoint-14961"),
    ("phi3-finetune-stage1-last-ckpt", "../../stage1/checkpoint-23000"),
]

# ChatRAG-Bench subsets to evaluate on.
# Currently excluded:
#   - 'qrecc': there is a bug with this dataset
#   - 'doqa_cooking', 'doqa_movies', 'doqa_travel': not supported yet
benchmarks = [
    "coqa",
    "inscit",
    "topiocqa",
    "hybridial",
    "doc2dial",
    "quac",
    "sqa",
]

# Root folder for generated predictions; created up front if it is missing.
root_predictions_folder = "predictions"
Path(root_predictions_folder).mkdir(exist_ok=True, parents=True)

## Step 1: Generate Responses

In [11]:
from vllm import LLM, SamplingParams

In [None]:
# Decoding hyperparameters, same as the paper: temperature 0 with top_k=1 and
# at most 64 generated tokens. They do not depend on the model, so they are
# built once, outside the loop.
sampling_params = SamplingParams(temperature=0, top_k=1, max_tokens=64)

for model_id, model_name_or_path in tqdm(models, desc="Models"):
    # Predictions for each model go in their own sub-folder, named after the
    # model's display name.
    model_predictions_folder = Path(root_predictions_folder) / model_id
    model_predictions_folder.mkdir(parents=True, exist_ok=True)

    # Load model. Handles everything by default (kv caching, GPU)
    # Note: only tested with a single GPU
    llm = LLM(model=model_name_or_path)

    # Manually create a tokenizer to be able to generate prompts
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

    for subset in tqdm(benchmarks, desc="Benchmarks"):
        # Load dataset and generate prompts
        dataset = ChatRAGBenchDataset(tokenizer=tokenizer, subset=subset)
        prompts = [dataset.prompt(idx) for idx in range(len(dataset))]
        print(f"Loading {len(dataset)} examples from {subset}...")

        # Generate outputs. The sampling params must be passed explicitly:
        # without them, generate() falls back to the library defaults rather
        # than the settings defined above.
        outputs = llm.generate(prompts, sampling_params=sampling_params)

        # Same pre-processing as the paper: keep the first completion of each
        # prompt, strip surrounding whitespace and flatten newlines so that
        # every prediction occupies exactly one line of the output file.
        processed_outputs = [
            output.outputs[0].text.strip().replace("\n", " ") for output in outputs
        ]

        # Write outputs to file, one prediction per line
        output_filename = model_predictions_folder / f"{subset}.txt"
        write_lines(output_filename, processed_outputs)

    # Free up gpu memory to allow loading the next model
    del llm
    gc.collect()
    torch.cuda.empty_cache()

## Step 2: Process Results

In [None]:
def _get_ground_truth_filename(benchmark):
    """Locate the ground-truth JSON file for a benchmark.

    Looks inside ``<base_chatqa_folder>/data/<benchmark>`` for ``dev.json``
    first and ``test.json`` second, returning the first one that exists.

    Args:
        benchmark: Name of the benchmark's sub-folder under ``data``.

    Returns:
        The path of the ground-truth file, as a string.

    Raises:
        ValueError: If neither candidate file exists.
    """
    benchmark_dir = Path(base_chatqa_folder) / "data" / benchmark

    # Order matters: dev.json takes precedence over test.json.
    for candidate_name in ("dev.json", "test.json"):
        candidate = benchmark_dir / candidate_name
        if candidate.is_file():
            return str(candidate)

    raise ValueError(f"Could not find ground truth for {benchmark}")

In [None]:
# Same script used by the ChatRAG-Bench authors
from get_scores import evaluate_f1

In [None]:
# Score every (model, benchmark) pair with the ChatRAG-Bench F1 script and
# collect one record per pair. The records are accumulated in a plain list and
# converted to a DataFrame once at the end, so `results` only ever refers to
# the final DataFrame.
records = []

# Only the display name is needed here; the model path is unused at this stage.
for model_id, _model_name_or_path in tqdm(models, desc="Models"):
    model_predictions_folder = Path(root_predictions_folder) / model_id

    for benchmark in tqdm(benchmarks, desc="Benchmarks"):
        # Visual separator between the per-benchmark blocks of output
        print("-" * 80)

        # Annoyingly, the ground truth files are named differently depending
        # on the benchmark
        ground_truth_file = _get_ground_truth_filename(benchmark)
        # Predictions written in Step 1: one file per benchmark
        prediction_file = model_predictions_folder / f"{benchmark}.txt"

        # evaluate_f1 is called with plain string paths
        precision, recall, f1 = evaluate_f1(
            str(ground_truth_file), str(prediction_file)
        )

        records.append(
            {
                "benchmark": benchmark,
                "precision": precision,
                "recall": recall,
                "f1": f1,
                "model": model_id,
            }
        )


results = pd.DataFrame(records)

## Step 3: Analyse metrics

In [31]:
# Show the scores grouped by benchmark with the best F1 first in each group.
# Both sort keys are descending, so benchmarks appear in reverse alphabetical
# order. Optionally chain `.style.highlight_max()` to highlight the maxima.
results.sort_values(["benchmark", "f1"], ascending=False)  # .style.highlight_max()

Unnamed: 0,benchmark,precision,recall,f1,model
9,topiocqa,0.283519,0.288733,0.3535,chatqa_eval/nvidia_chatqa_outputs
17,topiocqa,0.270855,0.251446,0.328955,chatqa_eval/finetuned_phi3
2,topiocqa,0.240453,0.278561,0.319551,chatqa_eval/base_phi3_outputs
14,sqa,0.732992,0.510911,0.518191,chatqa_eval/nvidia_chatqa_outputs
21,sqa,0.583139,0.425426,0.429939,chatqa_eval/finetuned_phi3
6,sqa,0.354951,0.366099,0.294228,chatqa_eval/base_phi3_outputs
12,quac,0.31162,0.268384,0.271534,chatqa_eval/nvidia_chatqa_outputs
20,quac,0.199881,0.161826,0.166448,chatqa_eval/finetuned_phi3
13,qrecc,0.446039,0.281528,0.326638,chatqa_eval/nvidia_chatqa_outputs
5,qrecc,0.39123,0.256864,0.293054,chatqa_eval/base_phi3_outputs


## Next Steps
- Add the missing dataset subsets (`qrecc`, `doqa_cooking`, `doqa_movies`, `doqa_travel`)
- Add NVIDIA's prompt template so that their model (`nvidia/ChatQA-1.5-8B`) is evaluated more accurately