In [1]:
# Let's start by creating a sample dataset.

from zbench.common_types import QueryDocuments, Query, Document
from zbench.utils import save_pydantic_jsonl

# A single query paired with its candidate documents, listed as (id, content) pairs.
dataset: list[QueryDocuments] = [
    QueryDocuments(
        query=Query(id="Q1", query="What is the capital of France?"),
        documents=[
            Document(id=doc_id, content=text)
            for doc_id, text in (
                ("d1", "Paris is the capital of France."),
                ("d2", "Rome is the capital of Italy."),
            )
        ],
    )
]

# Save the dataset under the file name that the following cells refer to.
save_pydantic_jsonl("my_samle_dataset.jsonl", dataset)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# First, we need to annotate the dataset.
# We can do this by running the following code:

from zbench.annotation import Annotator

# Point the annotator at the sample dataset file saved in the first cell.
annotator = Annotator("my_samle_dataset.jsonl")
# NOTE(review): annotate() is called without `await`, unlike the benchmark calls
# in later cells — confirm that it is a synchronous method.
annotator.annotate()

# Now, we have an annotated dataset called "my_samle_dataset_annotated.jsonl" in the working directory.

In [None]:
# Having an annotated dataset, we can define a reranker based on it.
# We can also define any other reranker by inheriting from the BaseReranker class.

from zbench.rerankers import EnsembleReranker, BaseReranker, RerankerInput

# Reranker built from the annotated dataset file; the benchmark cells below pass it
# as the argument following the reranker being benchmarked.
ensemble_reranker = EnsembleReranker("my_samle_dataset_annotated.jsonl")

class MyReranker(BaseReranker):
    """Minimal custom reranker that assigns the same score to every document."""

    def __init__(self):
        # Example attribute showing that a custom reranker can carry its own state.
        # NOTE(review): BaseReranker.__init__ is not invoked here — confirm the
        # base class needs no initialisation of its own.
        self.my_variable = 1

    async def score(self, input: RerankerInput) -> list[float]:
        """Return a constant 0.5 for each entry in ``input.documents``."""
        document_count = len(input.documents)
        return [0.5 for _ in range(document_count)]


my_reranker = MyReranker()

In [None]:
# Finally, we can run three benchmarks:

from zbench.benchmark import benchmark_ndcg, benchmark_accuracy, recall_at_k

# Each benchmark receives the dataset path, then my_reranker and ensemble_reranker, in that order.
# The calls are awaited directly at cell level, so they must run in a session that
# supports top-level await (e.g. a Jupyter/IPython kernel).
print(await benchmark_ndcg("my_samle_dataset.jsonl", my_reranker, ensemble_reranker, visualize=True)) # visualize=True lets you visualise the NDCG score distribution
print(await benchmark_accuracy("my_samle_dataset.jsonl", my_reranker, ensemble_reranker))
# The positional 1 is presumably k — TODO confirm against recall_at_k's signature.
print(await recall_at_k("my_samle_dataset.jsonl", my_reranker, ensemble_reranker, 1, k_gt=1))

# As for recall at k, we also introduce a parameter k_gt, which is the number of ground-truth documents to consider, since the annotation does not separate out the ground truth and only provides the order. For the best results, you can tune k_gt depending on the annotation score distribution.

In [None]:
# You can also use the built-in zerank reranker.

from zbench.rerankers import Zerank

# Construct the built-in reranker with the identifier string "zerank-1"
# (presumably the model name — confirm against the Zerank class).
zerank = Zerank("zerank-1")

# Now, you can run the same three benchmarks with zerank in place of my_reranker.
# The benchmark functions and ensemble_reranker come from the earlier cells, so
# those cells must have been executed first.

print(await benchmark_ndcg("my_samle_dataset.jsonl", zerank, ensemble_reranker, visualize=True))
print(await benchmark_accuracy("my_samle_dataset.jsonl", zerank, ensemble_reranker))
print(await recall_at_k("my_samle_dataset.jsonl", zerank, ensemble_reranker, 1, k_gt=1))