Install python-terrier and other libs

In [36]:
pip install python-terrier==0.10.0 fast-forward-indexes==0.2.0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


Configure PyTerrier

In [1]:
import pyterrier as pt

if not pt.started():
    pt.init(
        tqdm="notebook",
        boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"]
    )

PyTerrier 0.10.0 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


Import Dataset

In [18]:
dataset = pt.get_dataset("irds:wikir/en1k")

Create a lexical index (for `BM25` and `RM3`)

In [21]:
from pathlib import Path

idx_path = Path("cqadupstack_idx_blocks").absolute()

index_ref = pt.index.IterDictIndexer(
    str(idx_path),
    blocks=True,
).index(dataset.get_corpus_iter(), fields=["text"])

wikir/en1k documents:   0%|          | 0/369721 [00:00<?, ?it/s]

Create a baseline (BM25 performance)

In [22]:
from pyterrier.measures import RR, nDCG, MAP

index = pt.IndexFactory.of(str(idx_path))

bm25 = pt.BatchRetrieve(index, wmodel="BM25")
rm3 = pt.rewrite.RM3(index)
testset = pt.get_dataset("irds:wikir/en1k/test")
pt.Experiment(
    [~bm25],
    testset.get_topics(),
    testset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
)

Unnamed: 0,name,RR@10,nDCG@10,AP@100
0,Cache(BR(BM25)),0.662806,0.360993,0.175304


# Create the Fast-Forward Indices for TCT-ColBERT

In [23]:
(bm25 % 5)(testset.get_topics('text')) #candidates

Unnamed: 0,qid,docid,docno,rank,score,query
0,158491,218567,607552,0,24.191590,southern methodist university
1,158491,341393,1880296,1,23.999622,southern methodist university
2,158491,234967,2261272,2,23.539333,southern methodist university
3,158491,112984,1957435,3,23.395950,southern methodist university
4,158491,80536,1774491,4,23.361582,southern methodist university
...,...,...,...,...,...,...
69227,712704,353470,2156586,0,20.642111,west indies
69228,712704,168527,498185,1,19.879473,west indies
69229,712704,336532,2066479,2,19.157909,west indies
69230,712704,349922,2004825,3,18.887363,west indies


### Create the Encoder

In [24]:
from fast_forward.encoder import TCTColBERTQueryEncoder, TCTColBERTDocumentEncoder
import torch

q_encoder = TCTColBERTQueryEncoder("castorini/tct_colbert-msmarco")
d_encoder = TCTColBERTDocumentEncoder(
    "castorini/tct_colbert-msmarco",
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)

Test the Encoder

In [25]:
q_encoder(["Test query 1", "Test query 2"])

array([[-0.0380525 ,  0.01848466,  0.05137944, ..., -0.04796502,
         0.00918062, -0.03880693],
       [-0.06809073,  0.02582865,  0.09803923, ..., -0.09031374,
         0.00014139, -0.06282968]], dtype=float32)

### Create the Index
*(Warning)* This operation takes a long time to complete!

In [None]:
from fast_forward import OnDiskIndex, Mode, Indexer

ff_index = OnDiskIndex(
    Path("ffindex_fiqa_tct.h5"), dim=768, query_encoder=q_encoder, mode=Mode.MAXP
)

def docs_iter():
    for d in dataset.get_corpus_iter():
        yield {"doc_id": d["docno"], "text": d["text"]}

ff_indexer = Indexer(ff_index, d_encoder, batch_size=1)
ff_indexer.index_dicts(docs_iter())

If the index is present on disk, we can load it directly in Memory (but requires some RAM)

In [26]:
from fast_forward import OnDiskIndex, Mode

ff_index = OnDiskIndex.load(
    Path("ffindex_fiqa_tct.h5"), query_encoder=q_encoder, mode=Mode.MAXP
)
ff_index = ff_index.to_memory()


  0%|          | 0/57638 [00:00<?, ?it/s][A
100%|██████████| 57638/57638 [00:01<00:00, 46191.55it/s][A


# Re-ranking BM25 Results

In [27]:
from fast_forward.util.pyterrier import FFScore

ff_score = FFScore(ff_index)

In [28]:
candidates = (bm25 % 5)(testset.get_topics('text')) # Get the candidates
candidates

Unnamed: 0,qid,docid,docno,rank,score,query
0,158491,218567,607552,0,24.191590,southern methodist university
1,158491,341393,1880296,1,23.999622,southern methodist university
2,158491,234967,2261272,2,23.539333,southern methodist university
3,158491,112984,1957435,3,23.395950,southern methodist university
4,158491,80536,1774491,4,23.361582,southern methodist university
...,...,...,...,...,...,...
69227,712704,353470,2156586,0,20.642111,west indies
69228,712704,168527,498185,1,19.879473,west indies
69229,712704,336532,2066479,2,19.157909,west indies
69230,712704,349922,2004825,3,18.887363,west indies


In [29]:
re_ranked = ff_score(candidates)
re_ranked

no vectors for 607552
no vectors for 1880296
no vectors for 2261272
no vectors for 1957435
no vectors for 1774491
no vectors for 1890681
no vectors for 283099
no vectors for 98452
no vectors for 592647
no vectors for 13554
no vectors for 1354099
no vectors for 2164978
no vectors for 166298
no vectors for 1070
no vectors for 1317839
no vectors for 1378846
no vectors for 834604
no vectors for 784701
no vectors for 671380
no vectors for 2186467
no vectors for 2328959
no vectors for 855680
no vectors for 1325717
no vectors for 40342
no vectors for 5115
no vectors for 1384004
no vectors for 897966
no vectors for 481955
no vectors for 2270937
no vectors for 2392139
no vectors for 15469
no vectors for 1334817
no vectors for 2149594
no vectors for 975904
no vectors for 62953
no vectors for 252433
no vectors for 62956
no vectors for 1366881
no vectors for 210440
no vectors for 152444
no vectors for 242736
no vectors for 1120216
no vectors for 247573
no vectors for 128064
no vectors for 104086
n

Unnamed: 0,qid,docno,score_0,score,query
0,5728,5728,25.289616,64.469879,halakha
1,1313611,4397,21.759257,63.294655,polynesia
2,107590,443002,25.61955,63.321739,west bromwich
3,113000,525590,22.973771,64.26152,sulfide
4,11919,458377,26.717718,64.269073,spider man
5,104453,104453,20.44271,63.383644,uniting church in australia
6,73673,160275,18.706745,64.399902,drava
7,73673,529546,18.516112,64.303322,drava
8,371437,336506,27.738756,64.017334,internal revenue code
9,79032,444531,22.472525,64.386093,elijah wood


In [30]:
from fast_forward.util.pyterrier import FFInterpolate

ff_int = FFInterpolate(alpha=0.5)
ff_int(re_ranked)

Unnamed: 0,qid,docno,query,score
0,5728,5728,halakha,44.879748
1,1313611,4397,polynesia,42.526956
2,107590,443002,west bromwich,44.470645
3,113000,525590,sulfide,43.617646
4,11919,458377,spider man,45.493396
5,104453,104453,uniting church in australia,41.913177
6,73673,160275,drava,41.553324
7,73673,529546,drava,41.409717
8,371437,336506,internal revenue code,45.878045
9,79032,444531,elijah wood,43.429309


In [31]:
# Find best Alpha

devset = pt.get_dataset("irds:beir/fiqa/dev")
pt.GridSearch(
    ~bm25 % 100 >> ff_score >> ff_int,
    {ff_int: {"alpha": [0.05, 0.1, 0.5, 0.9]}},
    devset.get_topics(),
    devset.get_qrels(),
    "map",
    verbose=True,
)
ff_int.alpha

GridScan:   0%|          | 0/4 [00:00<?, ?it/s]

no vectors for 2591
no vectors for 892874
no vectors for 1105314
no vectors for 605908
no vectors for 1694021
no vectors for 1784234
no vectors for 436662
no vectors for 572710
no vectors for 561734
no vectors for 975714
no vectors for 1437418
no vectors for 1066903
no vectors for 582159
no vectors for 2078706
no vectors for 1381497
no vectors for 892651
no vectors for 2016001
no vectors for 196111
no vectors for 2016023
no vectors for 1785142
no vectors for 2424654
no vectors for 2038585
no vectors for 1879480
no vectors for 473522
no vectors for 943056
no vectors for 2332075
no vectors for 1202871
no vectors for 2264589
no vectors for 2335214
no vectors for 2018088
no vectors for 997476
no vectors for 70007
no vectors for 756322
no vectors for 505641
no vectors for 942692
no vectors for 360885
no vectors for 1920459
no vectors for 2073121
no vectors for 1669116
no vectors for 1605687
no vectors for 562248
no vectors for 1953184
no vectors for 1373435
no vectors for 472619
no vectors 

# Results

In [82]:
testset.get_topics()

Unnamed: 0,qid,query
0,4641,where should i park my rainy day emergency fund
1,5503,tax considerations for selling a property belo...
2,7803,can the delta be used to calculate the option ...
3,7017,basic algorithmic trading strategy
4,10152,what does a high operating margin but a small ...
...,...,...
643,4102,how can i determine if my rate of return is go...
644,3566,where can i buy stocks if i only want to inves...
645,94,using credit card points to pay for tax deduct...
646,2551,how to find cheaper alternatives to a traditio...


In [87]:
qe_pipeline = ~bm25 >> rm3
qe_pipeline(testset.get_topics())

Unnamed: 0,qid,query_0,query
0,10034,tax implications of holding ewu or other such ...,applypipeline:off tax^0.085714296 ewu^0.085714...
1,10039,do individual investors use google to obtain s...,applypipeline:off obtain^0.100000009 googl^0.1...
2,10109,why does charles schwab have a mandatory settl...,applypipeline:off transact^0.025715945 sell^0....
3,10122,why diversify stocks investments,applypipeline:off portfolio^0.040961910 higher...
4,10136,how to minimise the risk of a reduction in pur...,applypipeline:off account^0.054545458 loss^0.0...
...,...,...,...
643,9882,money market or cash type etfs for foreigners ...,applypipeline:off cash^0.075000003 account^0.0...
644,9925,what does chapter 11 bankruptcy mean to an inv...,applypipeline:off mean^0.060000002 creditor^0....
645,9929,investing in commodities pros and cons,applypipeline:off excel^0.050314460 wrong^0.00...
646,9961,employer rollover from 403b to 401k,applypipeline:off 403b^0.199246123 rollov^0.19...


In [88]:
pipeline = qe_pipeline >> ~bm25

pipeline(testset.get_topics())

Unnamed: 0,qid,docid,docno,rank,score,query_0,query
0,10034,19011,197478,0,32.136496,tax implications of holding ewu or other such ...,applypipeline:off tax^0.085714296 ewu^0.085714...
1,10034,4347,44955,1,27.074305,tax implications of holding ewu or other such ...,applypipeline:off tax^0.085714296 ewu^0.085714...
2,10034,31257,322838,2,19.781771,tax implications of holding ewu or other such ...,applypipeline:off tax^0.085714296 ewu^0.085714...
3,10034,7335,75568,3,19.312583,tax implications of holding ewu or other such ...,applypipeline:off tax^0.085714296 ewu^0.085714...
4,10034,10817,111274,4,18.207875,tax implications of holding ewu or other such ...,applypipeline:off tax^0.085714296 ewu^0.085714...
...,...,...,...,...,...,...,...
647995,9979,37166,386390,995,4.740691,what is the best way to invest in gold as a he...,applypipeline:off gold^0.273850530 store^0.016...
647996,9979,40310,419796,996,4.740594,what is the best way to invest in gold as a he...,applypipeline:off gold^0.273850530 store^0.016...
647997,9979,10118,104389,997,4.736415,what is the best way to invest in gold as a he...,applypipeline:off gold^0.273850530 store^0.016...
647998,9979,21638,224668,998,4.735540,what is the best way to invest in gold as a he...,applypipeline:off gold^0.273850530 store^0.016...


In [89]:
(~bm25 % 1000)(testset.get_topics())

Unnamed: 0,qid,docid,docno,rank,score,query
0,4641,36224,376148,0,41.677305,where should i park my rainy day emergency fund
1,4641,47916,497993,1,29.149791,where should i park my rainy day emergency fund
2,4641,55690,580025,2,26.773005,where should i park my rainy day emergency fund
3,4641,24501,253614,3,26.640181,where should i park my rainy day emergency fund
4,4641,3157,32833,4,24.265187,where should i park my rainy day emergency fund
...,...,...,...,...,...,...
645148,2399,25037,259223,995,10.230066,where do web sites get foreign exchange curren...
645149,2399,33542,348029,996,10.229760,where do web sites get foreign exchange curren...
645150,2399,13368,137444,997,10.221526,where do web sites get foreign exchange curren...
645151,2399,22944,237573,998,10.219489,where do web sites get foreign exchange curren...


In [32]:
pt.Experiment(
    [bm25, bm25 % 1000 >> ff_score >> ff_int, bm25 % 1000 >> rm3 >> bm25 >> ff_score >> ff_int],
    testset.get_topics(),
    testset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
    names=["BM25", "BM25 >> FF", "BM25 >> RM3 >> FF"],
)

no vectors for 607552
no vectors for 1880296
no vectors for 2261272
no vectors for 1957435
no vectors for 1774491
no vectors for 625257
no vectors for 663828
no vectors for 635537
no vectors for 158491
no vectors for 589549
no vectors for 1956922
no vectors for 1093529
no vectors for 685181
no vectors for 360918
no vectors for 945068
no vectors for 1170039
no vectors for 2411344
no vectors for 1158969
no vectors for 2337647
no vectors for 1180246
no vectors for 967619
no vectors for 742912
no vectors for 637819
no vectors for 1059585
no vectors for 1397771
no vectors for 1422090
no vectors for 621578
no vectors for 1490799
no vectors for 13801
no vectors for 1094293
no vectors for 2390322
no vectors for 1079407
no vectors for 1485043
no vectors for 289756
no vectors for 2225325
no vectors for 1042344
no vectors for 1454621
no vectors for 345165
no vectors for 182202
no vectors for 547150
no vectors for 1902205
no vectors for 672481
no vectors for 899723
no vectors for 1136740
no vector