# Evaluation for RAG Pipeline

## Setup

### Libraries

In [None]:
import os
import requests

import torch
from PIL import Image
from transformers import (
    MllamaForConditionalGeneration,
    AutoProcessor,
    ColPaliForRetrieval,
    ColPaliProcessor,
)
from datasets import load_dataset

from vidore_benchmark.retrievers.colpali_retriever import ColPaliRetriever
from vidore_benchmark.evaluation.vidore_evaluators import (
    ViDoReEvaluatorQA,
    ViDoReEvaluatorBEIR,
)
from vidore_benchmark.retrievers import VisionRetriever

### Machine

In [None]:
def detect_device():
    """
    Detects the best available device (CUDA, MPS, or CPU).
    """
    if torch.cuda.is_available():
        return "cuda"
    elif torch.backends.mps.is_available():
        return "mps"
    else:
        return "cpu"


device = detect_device()

### Datasets

In [16]:
# ViDoRe ColPali dataset from https://huggingface.co/datasets/vidore/colpali_train_set
vidore_datasets = load_dataset("vidore/colpali_train_set")

README.md:   0%|          | 0.00/3.49k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/82 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/82 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/82 [00:00<?, ?files/s]

train-00000-of-00082.parquet:   0%|          | 0.00/638M [00:00<?, ?B/s]

train-00001-of-00082.parquet:   0%|          | 0.00/652M [00:00<?, ?B/s]

train-00002-of-00082.parquet:   0%|          | 0.00/632M [00:00<?, ?B/s]

train-00003-of-00082.parquet:   0%|          | 0.00/637M [00:00<?, ?B/s]

train-00004-of-00082.parquet:   0%|          | 0.00/619M [00:00<?, ?B/s]

train-00005-of-00082.parquet:   0%|          | 0.00/657M [00:00<?, ?B/s]

train-00006-of-00082.parquet:   0%|          | 0.00/625M [00:00<?, ?B/s]

train-00007-of-00082.parquet:   0%|          | 0.00/621M [00:00<?, ?B/s]

train-00008-of-00082.parquet:   0%|          | 0.00/642M [00:00<?, ?B/s]

train-00009-of-00082.parquet:   0%|          | 0.00/682M [00:00<?, ?B/s]

train-00010-of-00082.parquet:   0%|          | 0.00/643M [00:00<?, ?B/s]

train-00011-of-00082.parquet:   0%|          | 0.00/655M [00:00<?, ?B/s]

train-00012-of-00082.parquet:   0%|          | 0.00/642M [00:00<?, ?B/s]

train-00013-of-00082.parquet:   0%|          | 0.00/620M [00:00<?, ?B/s]

train-00014-of-00082.parquet:   0%|          | 0.00/647M [00:00<?, ?B/s]

train-00015-of-00082.parquet:   0%|          | 0.00/616M [00:00<?, ?B/s]

train-00016-of-00082.parquet:   0%|          | 0.00/638M [00:00<?, ?B/s]

train-00017-of-00082.parquet:   0%|          | 0.00/624M [00:00<?, ?B/s]

train-00018-of-00082.parquet:   0%|          | 0.00/626M [00:00<?, ?B/s]

train-00019-of-00082.parquet:   0%|          | 0.00/646M [00:00<?, ?B/s]

train-00020-of-00082.parquet:   0%|          | 0.00/656M [00:00<?, ?B/s]

train-00021-of-00082.parquet:   0%|          | 0.00/638M [00:00<?, ?B/s]

train-00022-of-00082.parquet:   0%|          | 0.00/639M [00:00<?, ?B/s]

train-00023-of-00082.parquet:   0%|          | 0.00/624M [00:00<?, ?B/s]

train-00024-of-00082.parquet:   0%|          | 0.00/645M [00:00<?, ?B/s]

train-00025-of-00082.parquet:   0%|          | 0.00/602M [00:00<?, ?B/s]

train-00026-of-00082.parquet:   0%|          | 0.00/614M [00:00<?, ?B/s]

train-00027-of-00082.parquet:   0%|          | 0.00/629M [00:00<?, ?B/s]

train-00028-of-00082.parquet:   0%|          | 0.00/669M [00:00<?, ?B/s]

train-00029-of-00082.parquet:   0%|          | 0.00/631M [00:00<?, ?B/s]

train-00030-of-00082.parquet:   0%|          | 0.00/649M [00:00<?, ?B/s]

train-00031-of-00082.parquet:   0%|          | 0.00/644M [00:00<?, ?B/s]

train-00032-of-00082.parquet:   0%|          | 0.00/654M [00:00<?, ?B/s]

train-00033-of-00082.parquet:   0%|          | 0.00/632M [00:00<?, ?B/s]

train-00034-of-00082.parquet:   0%|          | 0.00/654M [00:00<?, ?B/s]

train-00035-of-00082.parquet:   0%|          | 0.00/615M [00:00<?, ?B/s]

train-00036-of-00082.parquet:   0%|          | 0.00/628M [00:00<?, ?B/s]

train-00037-of-00082.parquet:   0%|          | 0.00/648M [00:00<?, ?B/s]

train-00038-of-00082.parquet:   0%|          | 0.00/644M [00:00<?, ?B/s]

train-00039-of-00082.parquet:   0%|          | 0.00/638M [00:00<?, ?B/s]

train-00040-of-00082.parquet:   0%|          | 0.00/623M [00:00<?, ?B/s]

train-00041-of-00082.parquet:   0%|          | 0.00/655M [00:00<?, ?B/s]

train-00042-of-00082.parquet:   0%|          | 0.00/673M [00:00<?, ?B/s]

train-00043-of-00082.parquet:   0%|          | 0.00/649M [00:00<?, ?B/s]

train-00044-of-00082.parquet:   0%|          | 0.00/654M [00:00<?, ?B/s]

train-00045-of-00082.parquet:   0%|          | 0.00/676M [00:00<?, ?B/s]

train-00046-of-00082.parquet:   0%|          | 0.00/654M [00:00<?, ?B/s]

train-00047-of-00082.parquet:   0%|          | 0.00/630M [00:00<?, ?B/s]

train-00048-of-00082.parquet:   0%|          | 0.00/616M [00:00<?, ?B/s]

train-00049-of-00082.parquet:   0%|          | 0.00/634M [00:00<?, ?B/s]

train-00050-of-00082.parquet:   0%|          | 0.00/638M [00:00<?, ?B/s]

train-00051-of-00082.parquet:   0%|          | 0.00/658M [00:00<?, ?B/s]

train-00052-of-00082.parquet:   0%|          | 0.00/674M [00:00<?, ?B/s]

train-00053-of-00082.parquet:   0%|          | 0.00/649M [00:00<?, ?B/s]

train-00054-of-00082.parquet:   0%|          | 0.00/623M [00:00<?, ?B/s]

train-00055-of-00082.parquet:   0%|          | 0.00/656M [00:00<?, ?B/s]

train-00056-of-00082.parquet:   0%|          | 0.00/662M [00:00<?, ?B/s]

train-00057-of-00082.parquet:   0%|          | 0.00/640M [00:00<?, ?B/s]

train-00058-of-00082.parquet:   0%|          | 0.00/647M [00:00<?, ?B/s]

train-00059-of-00082.parquet:   0%|          | 0.00/617M [00:00<?, ?B/s]

train-00060-of-00082.parquet:   0%|          | 0.00/635M [00:00<?, ?B/s]

train-00061-of-00082.parquet:   0%|          | 0.00/659M [00:00<?, ?B/s]

train-00062-of-00082.parquet:   0%|          | 0.00/640M [00:00<?, ?B/s]

train-00063-of-00082.parquet:   0%|          | 0.00/631M [00:00<?, ?B/s]

train-00064-of-00082.parquet:   0%|          | 0.00/627M [00:00<?, ?B/s]

train-00065-of-00082.parquet:   0%|          | 0.00/650M [00:00<?, ?B/s]

train-00066-of-00082.parquet:   0%|          | 0.00/627M [00:00<?, ?B/s]

train-00067-of-00082.parquet:   0%|          | 0.00/629M [00:00<?, ?B/s]

train-00068-of-00082.parquet:   0%|          | 0.00/628M [00:00<?, ?B/s]

train-00069-of-00082.parquet:   0%|          | 0.00/629M [00:00<?, ?B/s]

train-00070-of-00082.parquet:   0%|          | 0.00/638M [00:00<?, ?B/s]

train-00071-of-00082.parquet:   0%|          | 0.00/627M [00:00<?, ?B/s]

train-00072-of-00082.parquet:   0%|          | 0.00/646M [00:00<?, ?B/s]

train-00073-of-00082.parquet:   0%|          | 0.00/674M [00:00<?, ?B/s]

train-00074-of-00082.parquet:   0%|          | 0.00/638M [00:00<?, ?B/s]

train-00075-of-00082.parquet:   0%|          | 0.00/644M [00:00<?, ?B/s]

train-00076-of-00082.parquet:   0%|          | 0.00/633M [00:00<?, ?B/s]

train-00077-of-00082.parquet:   0%|          | 0.00/618M [00:00<?, ?B/s]

train-00078-of-00082.parquet:   0%|          | 0.00/655M [00:00<?, ?B/s]

train-00079-of-00082.parquet:   0%|          | 0.00/629M [00:00<?, ?B/s]

train-00080-of-00082.parquet:   0%|          | 0.00/652M [00:00<?, ?B/s]

train-00081-of-00082.parquet:   0%|          | 0.00/633M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/118195 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/104 [00:00<?, ?it/s]

### Models

#### Index

In [None]:
# ViDoRe ColPali wrapped in byaldi from https://github.com/AnswerDotAI/byaldi
indexing_model = ColPaliForRetrieval.from_pretrained("vidore/colpali-v1.3-hf").to(
    device
)
indexing_processor = ColPaliProcessor.from_pretrained("vidore/colpali-v1.3-hf").to(
    device
)

Verbosity is set to 1 (active). Pass verbose=0 to make quieter.


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

#### Generate

In [None]:
# Llama vision from https://huggingface.co/alpindale/Llama-3.2-11B-Vision-Instruct
generation_model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

generation_model = MllamaForConditionalGeneration.from_pretrained(
    generation_model_id,
    torch_dtype=torch.float16 if device != "cpu" else torch.float32,
    device_map="auto",
)

generation_processor = AutoProcessor.from_pretrained(generation_model_id)

generation_model.to(device)

config.json:   0%|          | 0.00/5.07k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/89.4k [00:00<?, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

Some parameters are on the meta device because they were offloaded to the disk.


preprocessor_config.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/5.09k [00:00<?, ?B/s]

You shouldn't move a model that is dispatched using accelerate hooks.


RuntimeError: You can't move a model that has some modules offloaded to cpu or disk.

## ColPali RAG System

### Ingestion

#### Extract

In [None]:
print(vidore_datasets)
vidore_train_set = vidore_datasets["train"]
vidore_test_set = vidore_datasets["test"]

DatasetDict({
    train: Dataset({
        features: ['image', 'image_filename', 'query', 'answer', 'source', 'options', 'page', 'model', 'prompt', 'answer_type'],
        num_rows: 118195
    })
    test: Dataset({
        features: ['image', 'image_filename', 'query', 'answer', 'source', 'options', 'page', 'model', 'prompt', 'answer_type'],
        num_rows: 500
    })
})


#### Transform

In [None]:
# TODO

#### Index

In [None]:
# Index the documents in the folder
folder_path = "./test"
index_name = "document_index"

if not os.path.exists(index_name):
    indexing_model.index(
        input_path=folder_path,
        index_name=index_name,
        store_collection_with_index=True,
        overwrite=True,
    )

Index exported to .byaldi/document_index


### Retrieval

#### Retrieve

In [None]:
top_k = 5
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
query_image = Image.open(requests.get(url, stream=True).raw)
query_text = "What is the name of the animal in the image?"

# Prepare query payload
query = {"image": query_image, "text": query_text}
results = indexing_model.retrieve(query=query, index_name=index_name, top_k=top_k)

#### Generate

In [None]:
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {
                "type": "text",
                "text": "If I had to write a haiku for this one, it would be: ",
            },
        ],
    }
]
input_text = generation_processor.apply_chat_template(
    messages, add_generation_prompt=True
)
inputs = generation_processor(query_image, input_text, return_tensors="pt").to(
    generation_model.device
)

output = generation_model.generate(**inputs, max_new_tokens=512)
print(generation_processor.decode(output[0], skip_special_tokens=True))

## Evaluation with ViDoRe Datasets

In [None]:
# Load the ViDoRe ColPali training set
ds = {
    "corpus": load_dataset("vidore/colpali_train_set", name="corpus", split="test"),
    "queries": load_dataset("vidore/colpali_train_set", name="queries", split="test"),
    "qrels": load_dataset("vidore/colpali_train_set", name="qrels", split="test"),
}

# Indexing Model and Processor
indexing_model = ColPaliForRetrieval.from_pretrained("vidore/colpali-v1.3-hf").to(
    device
)

# Initialize the retirever
retriever = ColPaliRetriever(
    use_visual_embedding=True,
    pretrained_model_name_or_path=indexing_model,
    device=device,
)

# Initialize the Vision Retriever
vision_retriever = VisionRetriever()

# Initialize the evaluator QA
evaluator_qa = ViDoReEvaluatorQA(vision_retriever=retriever)

# Initialize the evaluator BEIR
evaluator_beir = ViDoReEvaluatorBEIR(vision_retriever=retriever)

# Evaluate
# QA Dasataset
metrics_dataset_qa = evaluator_qa.evaluate_dataset(
    ds=ds,
    batch_query=4,
    batch_passage=4,
)
print(metrics_dataset_qa)

# BEIR Dataset
metrics_dataset_beir = evaluator_beir.evaluate_dataset(
    ds=ds,
    batch_query=4,
    batch_passage=4,
)
print(metrics_dataset_beir)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk.


In [None]:
from vidore_benchmark import evaluate_retriever

# Define evaluation parameters
model_class = "colpali"
model_name = "vidore/colpali-v1.3"
collection_name = "vidore/docvqa_test_subsampled"
dataset_format = "qa"
split = "test"

# Execute evaluation
metrics = evaluate_retriever(
    model_class=model_class,
    model_name=model_name,
    collection_name=collection_name,
    dataset_format=dataset_format,
    split=split,
)

# Display evaluation metrics
print(metrics)


In [3]:
from datasets import load_dataset

ds = load_dataset("vidore/docvqa_test_subsampled")

In [None]:
import json
import torch
from datasets import load_dataset
from tqdm import tqdm

from vidore_benchmark.retrievers import ColPaliRetriever
from vidore_benchmark.evaluation.vidore_evaluators import ViDoReEvaluatorQA
from vidore_benchmark.utils.data_utils import get_datasets_from_collection


def test():
    # Configuration
    MODEL_NAME = "vidore/colpali-v1.3"
    HF_COLLECTION = "vidore/vidore-benchmark-667173f98e70a1c0fa4db00d"  # ViDoRe v1 QA
    SPLIT_NAME = "test"
    BATCH_QUERY = 4
    BATCH_PASSAGE = 4
    # optional: you can tweak these for faster dataloader pre-batching:
    PREBATCH_Q = BATCH_QUERY * 10
    PREBATCH_P = BATCH_PASSAGE * 10

    # Instantiate the ColPali retriever
    retriever = ColPaliRetriever(
        pretrained_model_name_or_path=MODEL_NAME,
        device="auto",
        num_workers=4,
    )

    # Instantiate the ViDoRe QA evaluator
    evaluator = ViDoReEvaluatorQA(vision_retriever=retriever)

    # Fetch all dataset names in the HF collection
    dataset_names = get_datasets_from_collection(HF_COLLECTION)

    all_metrics = {}
    for ds_name in tqdm(dataset_names, desc="Datasets"):
        # load the QA dataset (each row has `query`, `image`, `image_filename`, …)
        ds = load_dataset(ds_name, split=SPLIT_NAME)

        # run the full retrieve‐and‐score pipeline
        metrics = evaluator.evaluate_dataset(
            ds=ds,
            batch_query=BATCH_QUERY,
            batch_passage=BATCH_PASSAGE,
            dataloader_prebatch_query=PREBATCH_Q,
            dataloader_prebatch_passage=PREBATCH_P,
        )
        all_metrics[ds_name] = metrics

    # ——— 5) Save and report —————————————————————————————————————
    with open("colpali_vidore_v1_metrics.json", "w") as f:
        json.dump(all_metrics, f, indent=2)

    print(json.dumps(all_metrics, indent=2))


test()


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
import torch
from PIL import Image

from transformers import ColPaliForRetrieval, ColPaliProcessor

model_name = "vidore/colpali-v1.3-hf"

model = ColPaliForRetrieval.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device != "cpu" else torch.float32,
    device_map="mps",
).eval()

processor = ColPaliProcessor.from_pretrained(model_name)

# Your inputs
images = [
    Image.new("RGB", (32, 32), color="white"),
    Image.new("RGB", (16, 16), color="black"),
]
queries = [
    "What is the organizational structure for our R&D department?",
    "Can you provide a breakdown of last year’s financial performance?",
]

# Process the inputs
batch_images = processor(images=images).to(model.device)
batch_queries = processor(text=queries).to(model.device)

# Forward pass
with torch.no_grad():
    image_embeddings = model(**batch_images)
    query_embeddings = model(**batch_queries)

# Score the queries against the images
scores = processor.score_retrieval(
    query_embeddings.embeddings, image_embeddings.embeddings
)


NameError: name 'device' is not defined

In [None]:
evaluator = ViDoReEvaluatorQA(vision_retriever=retriever)

In [None]:
import torch
from colpali_engine.models import ColIdefics3, ColIdefics3Processor
from datasets import load_dataset

from vidore_benchmark.evaluation.vidore_evaluators import ViDoReEvaluatorQA
from vidore_benchmark.retrievers import VisionRetriever

torch.set_default_dtype(torch.float32)

model_name = "vidore/colSmol-256M"
processor = ColIdefics3Processor.from_pretrained(model_name)
model = ColIdefics3.from_pretrained(
    model_name,
    torch_dtype=torch.float32,  # mps doesn't support float64
    device_map="mps",
).eval()

model.to(torch.float32).to("mps")
# tensor = tensor.to(torch.float32).to("mps")

# Get retriever instance
vision_retriever = VisionRetriever(model=model, processor=processor)

# Evaluate on a single QA format dataset
vidore_evaluator_qa = ViDoReEvaluatorQA(vision_retriever)
ds = load_dataset("vidore/tabfquad_test_subsampled", split="test")
# metrics_dataset_qa = vidore_evaluator_qa.evaluate_dataset(
#     ds=ds,
#     batch_query=4,
#     batch_passage=4,
# )
# print(metrics_dataset_qa)

In [28]:
print(ds)
# print(ds.features)
# print(ds.features['query'])

Dataset({
    features: ['query', 'image_filename', 'generated_by', 'GPT4 caption', 'image', 'source'],
    num_rows: 280
})
