## Prepare corpus

In [None]:
import json
from tqdm import tqdm
from datasets import Dataset, load_dataset

# Load the "corpus" split of the BeIR/nfcorpus dataset.
ds = load_dataset("BeIR/nfcorpus", "corpus")["corpus"]

# Export the corpus as JSON Lines: one {"id", "text"} object per document,
# where "text" is the document title and body joined by a single space.
with open("nfcorpus.jsonl", "w") as f:
    # Iterate over the rows directly so each document is fetched once,
    # instead of indexing ds[i] three separate times per document.
    for row in tqdm(ds):
        doc = {"id": row["_id"], "text": row["title"] + " " + row["text"]}
        f.write(json.dumps(doc) + "\n")

## Prepare index & ingest pipeline

In [None]:
from utils import get_os_client

# Names shared by the index settings and the pipeline registration below.
# Defining each once guarantees the two requests refer to the same objects.
INDEX_NAME = "test-index"
PIPELINE_NAME = "nlp-ingest-pipeline-sparse"

client = get_os_client(use_aws_auth=False)

# Register the ingest pipeline BEFORE touching the index: if this request
# fails, the existing index has not been deleted yet, and the index is never
# created pointing at a default pipeline that does not exist.
client.transport.perform_request(
    method="PUT",
    url=f"/_ingest/pipeline/{PIPELINE_NAME}",
    body={
        "description": "An sparse encoding ingest pipeline",
        "processors": [
            {
                "sparse_encoding": {
                    # NOTE(review): hard-coded model ID; presumably specific
                    # to one cluster -- confirm before running elsewhere.
                    "model_id": "G3HGgZcBIhjhFXqCQEjZ",
                    # Source field "text" -> target field "embedding".
                    "field_map": {"text": "embedding"},
                }
            }
        ],
    },
)

# Recreate the index from scratch so repeated runs start from an empty index.
if client.indices.exists(index=INDEX_NAME):
    client.indices.delete(index=INDEX_NAME)
client.indices.create(
    index=INDEX_NAME,
    body={
        # Documents indexed without an explicit pipeline go through the
        # pipeline registered above.
        "settings": {"default_pipeline": PIPELINE_NAME},
        "mappings": {
            "properties": {
                "id": {"type": "text"},
                "embedding": {"type": "rank_features"},
                "text": {"type": "text"},
            }
        },
    },
)

## Prepare qrels and queries

In [None]:
import datasets
import json

dataset_name = "nfcorpus"

# Raw query texts and the test-split relevance judgments for the dataset.
queries_raw = datasets.load_dataset(f"BeIR/{dataset_name}", "queries")["queries"]
qrels_raw = datasets.load_dataset(f"BeIR/{dataset_name}-qrels")["test"]

# Nested mapping: query id -> {corpus id -> relevance score}.
qrels = {}
for judgment in qrels_raw:
    per_query = qrels.setdefault(judgment["query-id"], {})
    per_query[judgment["corpus-id"]] = judgment["score"]

# Keep only the queries that have at least one judgment in the test split.
queries_raw = queries_raw.filter(lambda row: row["_id"] in qrels)

# Flat mapping: query id -> query text.
queries = {entry["_id"]: entry["text"] for entry in queries_raw}

# Persist both mappings as JSON files named after the dataset.
for suffix, payload in (("queries", queries), ("qrels", qrels)):
    with open(f"{dataset_name}-{suffix}.json", "w") as f:
        json.dump(payload, f)