In [None]:
pip install python-terrier==0.10.0 fast-forward-indexes==0.2.0

Start Terrier

In [None]:
import pyterrier as pt

# Options passed to pt.init(): notebook-style progress bars plus the terrier-prf
# boot package (presumably needed by the RM3 rewriter used later — confirm).
terrier_init_options = {
    "tqdm": "notebook",
    "boot_packages": ["com.github.terrierteam:terrier-prf:-SNAPSHOT"],
}

# Only initialise when PyTerrier reports it has not been started yet,
# so re-running this cell in a live kernel is a no-op.
if not pt.started():
    pt.init(**terrier_init_options)

Set required dataset and index parameters

In [None]:
# --- Datasets (ir_datasets identifiers) ---
# DATASET_NAME and TESTSET_NAME carry no "irds:" prefix; the cells that load them add it.
DATASET_NAME = "beir/nfcorpus"
TESTSET_NAME = "beir/nfcorpus/test"
# DEVSET_NAME already includes the "irds:" prefix and is passed to pt.get_dataset() unchanged.
DEVSET_NAME = "irds:beir/nfcorpus/train"

# --- Index locations ---
BM25_INDEX_PATH = "indices/nfcorpus"  # directory of the sparse (Terrier) index
INDEX_PATH = "indexes/ffindex_nfcorpus_tct_colbert_msmarco.h5"  # Fast-Forward index file

# Document fields handed to the sparse indexer.
FIELDS = ["text"]

# Toggle for the RM3 parameter grid search cell.
SHOULD_RUN_GRID = True

Load datasets

In [None]:
import ir_datasets
# The same corpus is opened through two APIs:
#   ir_ds   - the raw ir_datasets handle (its docs metadata is read when building the index)
#   dataset - the PyTerrier wrapper (supplies the corpus iterator for indexing)
ir_ds = ir_datasets.load(DATASET_NAME)
dataset = pt.get_dataset(f"irds:{DATASET_NAME}")

Load or Create the sparse index

In [None]:
from pathlib import Path

idx_path = Path(BM25_INDEX_PATH).absolute()
# A Terrier index directory is identified by its data.properties file.
index_properties = idx_path / "data.properties"

if index_properties.exists():
    # Load: reuse the index built by an earlier run. Re-indexing into an existing
    # directory would raise, because the indexer does not overwrite by default.
    index_ref = pt.IndexRef.of(str(index_properties))
else:
    # Create: index the corpus with positional blocks; the docno metadata field is
    # sized to the longest doc_id reported by ir_datasets.
    index_ref = pt.index.IterDictIndexer(
        str(idx_path),
        blocks=True,
        meta={'docno': ir_ds.docs_metadata()['fields']['doc_id']['max_len']},
    ).index(dataset.get_corpus_iter(), fields=FIELDS)

# NOTE(review): kept from the original cell. Confirm that the object returned above
# actually provides to_memory(); the later cells open the index from idx_path directly.
index_ref = index_ref.to_memory()

Initialise BM25 and RM3

In [None]:
from pyterrier.measures import RR, nDCG, MAP

# Open the sparse index from disk; both BM25 retrieval and RM3 are built on it.
sparse_index_location = str(idx_path)
index = pt.IndexFactory.of(sparse_index_location)

# First-stage lexical retriever.
bm25 = pt.BatchRetrieve(index, wmodel="BM25")
# RM3 query rewriter with its default settings (the grid search cell may tune it later).
rm3 = pt.rewrite.RM3(index)

# Test split that supplies the topics and qrels for the experiments.
testset = pt.get_dataset(f"irds:{TESTSET_NAME}")

Initialise TCT-ColBERT Encoder

In [None]:
from fast_forward.encoder import TCTColBERTQueryEncoder, TCTColBERTDocumentEncoder
import torch

# Both encoders are built from the same pretrained checkpoint.
tct_colbert_checkpoint = "castorini/tct_colbert-msmarco"
# The document encoder runs on the first CUDA device when one is available.
encoder_device = "cuda:0" if torch.cuda.is_available() else "cpu"

# The query encoder is constructed without an explicit device argument.
q_encoder = TCTColBERTQueryEncoder(tct_colbert_checkpoint)
d_encoder = TCTColBERTDocumentEncoder(tct_colbert_checkpoint, device=encoder_device)

# Smoke test: encode two dummy queries and display the result as the cell output.
q_encoder(["Test query 1", "Test query 2"])

Load the FF-index

In [None]:
from fast_forward import OnDiskIndex, Mode

# Open the on-disk Fast-Forward index (MAXP mode, queries encoded by q_encoder)
# and immediately convert it to an in-memory index.
ff_index = OnDiskIndex.load(
    Path(INDEX_PATH),
    query_encoder=q_encoder,
    mode=Mode.MAXP,
).to_memory()

Create re-ranking stage


In [None]:
from fast_forward.util.pyterrier import FFScore
from fast_forward.util.pyterrier import FFInterpolate

# Transformers reused by every pipeline below.
ff_score = FFScore(ff_index)
ff_int = FFInterpolate(alpha=0.05)

# Demo run on the test topics: take the top-5 BM25 candidates, ...
test_topics = testset.get_topics('text')
top5_bm25 = bm25 % 5
candidates = top5_bm25(test_topics)

# ... score them with the Fast-Forward index, ...
re_ranked = ff_score(candidates)

# ... and display the interpolated result as the cell output.
ff_int(re_ranked)

Run exhaustive search for the parameters

In [None]:
if SHOULD_RUN_GRID:
    devset = pt.get_dataset(DEVSET_NAME)

    # Pipeline being tuned: top-5 BM25 feedback docs -> RM3 rewrite -> BM25 top-100.
    rm3_pipeline = bm25 % 5 >> rm3 >> bm25 % 100
    # Candidate values for the two RM3 parameters (4 x 3 = 12 settings).
    rm3_param_grid = {
        rm3: {
            "fb_docs": [3, 5, 7, 10],
            "fb_terms": [3, 10, 15],
        }
    }

    # NOTE(review): the return value is discarded. The following cell prints
    # rm3.fb_docs / rm3.fb_terms, so this presumably relies on GridSearch applying
    # the best setting to `rm3` in place — confirm against the PyTerrier docs.
    pt.GridSearch(
        rm3_pipeline,
        rm3_param_grid,
        devset.get_topics("text"),
        devset.get_qrels(),
        metric="recip_rank",
        verbose=True,
    )

In [None]:
# Show the current RM3 settings, one per line: feedback documents, then feedback terms.
print(rm3.fb_docs, rm3.fb_terms, sep="\n")

Output the experiments' results

In [None]:
# First-stage retrieval depths to compare (kept as strings; they double as name suffixes).
steps = ['10', '50', '100', '500', '1000', '5000', '25000']

# Build the systems and their names together from `steps` so the two lists cannot
# drift apart. The hand-written version listed the depth-1000 pair twice, giving
# 16 systems for 14 names, which misaligns the labels and is rejected by pt.Experiment.
experiment_systems = []
experiment_names = []
for step in steps:
    depth = int(step)

    # Plain pipeline: BM25 top-`depth` -> Fast-Forward scoring -> interpolation.
    experiment_systems.append(bm25 % depth >> ff_score >> ff_int)
    experiment_names.append(f"TCT-ColBERT_{step}")

    # RM3 pipeline: the query is rewritten from the top-5 BM25 documents, BM25 is
    # re-run to `depth`, and pt.rewrite.reset() is applied before Fast-Forward scoring.
    experiment_systems.append(
        bm25 % 5 >> rm3 >> bm25 % depth >> pt.rewrite.reset() >> ff_score >> ff_int
    )
    experiment_names.append(f"RM3+TCT-ColBERT_{step}")

result = pt.Experiment(
    experiment_systems,
    testset.get_topics('text'),
    testset.get_qrels(),
    eval_metrics=[RR @ 10],
    names=experiment_names,
)
result

Statistical T-test comparing the two models

In [None]:
# System 0: RM3 with a freshly constructed (default-parameter) rewriter.
rm3_default_pipeline = (
    bm25 % 5 >> pt.rewrite.RM3(index) >> bm25 % 1000 >> pt.rewrite.reset() >> ff_score >> ff_int
)
# System 1: the shared `rm3` object, which carries whatever parameters it currently holds
# (presumably the grid-search optimum when that cell was run — confirm).
rm3_tuned_pipeline = (
    bm25 % 5 >> rm3 >> bm25 % 1000 >> pt.rewrite.reset() >> ff_score >> ff_int
)

compared_systems = [rm3_default_pipeline, rm3_tuned_pipeline]
compared_names = ["RM3+TCT-ColBERT", "RM3+TCT-ColBERT_tuned"]

# baseline=0 marks the first system as the reference the other is compared against;
# per the PyTerrier docs this is what enables the significance columns (verify the test used).
result = pt.Experiment(
    compared_systems,
    testset.get_topics('text'),
    testset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
    names=compared_names,
    baseline=0,
)
result