Install python-terrier and other libs

In [1]:
pip install python-terrier==0.10.0 fast-forward-indexes==0.2.0

Note: you may need to restart the kernel to use updated packages.


Configure PyTerrier

In [2]:
import pyterrier as pt

if not pt.started():
    pt.init(
        tqdm="notebook",
        boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"]
    )

PyTerrier 0.10.0 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


Import Dataset

In [3]:
dataset = pt.get_dataset("irds:beir/fiqa")

Create a lexical index (for `BM25` and `RM3`)

In [37]:
from pathlib import Path

indexer = pt.IterDictIndexer(
    str(Path.cwd()),  # this will be ignored
    type=pt.index.IndexingType.MEMORY,
)
index_ref = indexer.index(dataset.get_corpus_iter(), fields=["text"])

beir/fiqa documents:   0%|          | 0/57638 [00:00<?, ?it/s]

Create a baseline (BM25 performance)

In [5]:
from pyterrier.measures import RR, nDCG, MAP

bm25 = pt.BatchRetrieve(index_ref, wmodel="BM25")
rm3 = pt.rewrite.RM3(index_ref)
testset = pt.get_dataset("irds:beir/fiqa/test")
pt.Experiment(
    [~bm25],
    testset.get_topics(),
    testset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
)

Unnamed: 0,name,RR@10,nDCG@10,AP@100
0,Cache(BR(BM25)),0.310271,0.252589,0.20864


# Create the Fast-Forward Indices for TCT-ColBERT

### Create the Encoder

In [6]:
from fast_forward.encoder import TCTColBERTQueryEncoder, TCTColBERTDocumentEncoder
import torch

q_encoder = TCTColBERTQueryEncoder("castorini/tct_colbert-msmarco")
d_encoder = TCTColBERTDocumentEncoder(
    "castorini/tct_colbert-msmarco",
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)

Test the Encoder

In [9]:
q_encoder(["Test query 1", "Test query 2"])

array([[-0.0380525 ,  0.01848466,  0.05137944, ..., -0.04796502,
         0.00918062, -0.03880693],
       [-0.06809073,  0.02582865,  0.09803923, ..., -0.09031374,
         0.00014139, -0.06282968]], dtype=float32)

### Create the Index
*(Warning)* This operation takes a long time to complete!

In [None]:
from fast_forward import OnDiskIndex, Mode, Indexer

ff_index = OnDiskIndex(
    Path("ffindex_fiqa_tct.h5"), dim=768, query_encoder=q_encoder, mode=Mode.MAXP
)

def docs_iter():
    for d in dataset.get_corpus_iter():
        yield {"doc_id": d["docno"], "text": d["text"]}

ff_indexer = Indexer(ff_index, d_encoder, batch_size=1)
ff_indexer.index_dicts(docs_iter())

If the index is present on disk, we can load it directly in Memory (but requires some RAM)

In [7]:
from fast_forward import OnDiskIndex, Mode

ff_index = OnDiskIndex.load(
    Path("ffindex_fiqa_tct.h5"), query_encoder=q_encoder, mode=Mode.MAXP
)
ff_index = ff_index.to_memory()

100%|██████████| 57638/57638 [00:00<00:00, 2135197.17it/s]


# Re-ranking BM25 Results

In [8]:
from fast_forward.util.pyterrier import FFScore

ff_score = FFScore(ff_index)

In [9]:
candidates = (~bm25 % 5)(testset.get_topics()) # Get the candidates
candidates

Unnamed: 0,qid,docid,docno,rank,score,query
0,4641,36224,376148,0,41.677305,where should i park my rainy day emergency fund
1,4641,47916,497993,1,29.149791,where should i park my rainy day emergency fund
2,4641,55690,580025,2,26.773005,where should i park my rainy day emergency fund
3,4641,24501,253614,3,26.640181,where should i park my rainy day emergency fund
4,4641,3157,32833,4,24.265187,where should i park my rainy day emergency fund
...,...,...,...,...,...,...
644153,2399,33136,343489,0,33.280064,where do web sites get foreign exchange curren...
644154,2399,6704,69171,1,31.335596,where do web sites get foreign exchange curren...
644155,2399,4148,43046,2,31.265964,where do web sites get foreign exchange curren...
644156,2399,46670,484891,3,29.869008,where do web sites get foreign exchange curren...


In [10]:
re_ranked = ff_score(candidates)
re_ranked

Unnamed: 0,qid,docno,score_0,score,query
0,4641,376148,41.677305,67.637367,where should i park my rainy day emergency fund
1,4641,497993,29.149791,68.335365,where should i park my rainy day emergency fund
2,4641,580025,26.773005,67.524490,where should i park my rainy day emergency fund
3,4641,253614,26.640181,67.098091,where should i park my rainy day emergency fund
4,4641,32833,24.265187,68.345680,where should i park my rainy day emergency fund
...,...,...,...,...,...
3235,2399,343489,33.280064,69.723114,where do web sites get foreign exchange curren...
3236,2399,69171,31.335596,67.643097,where do web sites get foreign exchange curren...
3237,2399,43046,31.265964,67.293716,where do web sites get foreign exchange curren...
3238,2399,484891,29.869008,68.099777,where do web sites get foreign exchange curren...


In [11]:
from fast_forward.util.pyterrier import FFInterpolate

ff_int = FFInterpolate(alpha=0.5)
ff_int(re_ranked)

Unnamed: 0,qid,docno,query,score
0,4641,376148,where should i park my rainy day emergency fund,54.657336
1,4641,497993,where should i park my rainy day emergency fund,48.742578
2,4641,580025,where should i park my rainy day emergency fund,47.148748
3,4641,253614,where should i park my rainy day emergency fund,46.869136
4,4641,32833,where should i park my rainy day emergency fund,46.305434
...,...,...,...,...
3235,2399,343489,where do web sites get foreign exchange curren...,51.501589
3236,2399,69171,where do web sites get foreign exchange curren...,49.489347
3237,2399,43046,where do web sites get foreign exchange curren...,49.279840
3238,2399,484891,where do web sites get foreign exchange curren...,48.984392


In [12]:
# Find best Alpha

devset = pt.get_dataset("irds:beir/fiqa/dev")
pt.GridSearch(
    ~bm25 % 100 >> ff_score >> ff_int,
    {ff_int: {"alpha": [0.05, 0.1, 0.5, 0.9]}},
    devset.get_topics(),
    devset.get_qrels(),
    "map",
    verbose=True,
)
ff_int.alpha

GridScan:   0%|          | 0/4 [00:00<?, ?it/s]

Best map is 0.282656
Best setting is ['<fast_forward.util.pyterrier.FFInterpolate object at 0x339c03ce0> alpha=0.1']


0.1

# Results

In [41]:
testset.get_topics()

Unnamed: 0,qid,query
0,4641,where should i park my rainy day emergency fund
1,5503,tax considerations for selling a property belo...
2,7803,can the delta be used to calculate the option ...
3,7017,basic algorithmic trading strategy
4,10152,what does a high operating margin but a small ...
...,...,...
643,4102,how can i determine if my rate of return is go...
644,3566,where can i buy stocks if i only want to inves...
645,94,using credit card points to pay for tax deduct...
646,2551,how to find cheaper alternatives to a traditio...


In [49]:
pipeline = bm25 >> rm3 >> bm25

pipeline(testset.get_topics())

JavaException: JVM exception occurred: Failed to process qid 10034 ' nyse^nan stream^nan rule^nan trust^nan etf^0.085714296 tax^0.085714296 uk^0.085714296 research^nan www^nan product^nan implic^0.085714296 fund^nan hold^0.085714296 citizen^0.085714296 http^nan adjust^nan ewu^0.085714296' -- Lexical error at line 1, column 7.  Encountered: "n" (110), after : "" org.terrier.querying.parser.QueryParserException

In [16]:
pt.Experiment(
    [~bm25, ~bm25 % 1000 >> ff_score >> ff_int, ~bm25 % 1000 >> rm3 >> ff_score >> ff_int],
    testset.get_topics(),
    testset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
    names=["BM25", "BM25 >> FF", "BM25 >> RM3 >> FF"],
)



KeyError: "['id', 'score'] not in index"