Install python-terrier and other libs

In [5]:
pip install python-terrier==0.10.0 fast-forward-indexes==0.2.0

Note: you may need to restart the kernel to use updated packages.


Configure PyTerrier

In [6]:
import pyterrier as pt

if not pt.started():
    pt.init(
        tqdm="notebook",
        boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"]
    )

Import Dataset

In [50]:
dataset = pt.get_dataset("irds:beir/fiqa")

Create a lexical index (for `BM25` and `RM3`)

In [51]:
from pathlib import Path

idx_path = Path("beir_idx_blocks").absolute()

index_ref = pt.index.IterDictIndexer(
    str(idx_path),
    blocks=True,
    # stopwords=None,
    # stemmer=None,
).index(dataset.get_corpus_iter(), fields=["text"])

beir/fiqa documents:   0%|          | 0/57638 [00:00<?, ?it/s]

ValueError: Index already exists at /Users/tomighita/Scoala/Facultate/University-Courses/RP/beir_idx_blocks/data.properties

Create a baseline (BM25 performance)

In [52]:
from pyterrier.measures import RR, nDCG, MAP

index = pt.IndexFactory.of(str(idx_path))

bm25 = pt.BatchRetrieve(index, wmodel="BM25")
rm3 = pt.rewrite.RM3(index)
testset = pt.get_dataset("irds:beir/fiqa/test")
pt.Experiment(
    [bm25],
    testset.get_topics(),
    testset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
)

Unnamed: 0,name,RR@10,nDCG@10,AP@100
0,BR(BM25),0.310271,0.252589,0.20864


# Create the Fast-Forward Indices for TCT-ColBERT

### Create the Encoder

In [53]:
from fast_forward.encoder import TCTColBERTQueryEncoder, TCTColBERTDocumentEncoder
import torch

q_encoder = TCTColBERTQueryEncoder("castorini/tct_colbert-msmarco")
d_encoder = TCTColBERTDocumentEncoder(
    "castorini/tct_colbert-msmarco",
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)

Test the Encoder

In [54]:
q_encoder(["Test query 1", "Test query 2"])

array([[-0.0380525 ,  0.01848466,  0.05137944, ..., -0.04796502,
         0.00918062, -0.03880693],
       [-0.06809073,  0.02582865,  0.09803923, ..., -0.09031374,
         0.00014139, -0.06282968]], dtype=float32)

### Create the Index
*(Warning)* This operation takes a long time to complete!

In [None]:
from fast_forward import OnDiskIndex, Mode, Indexer
should_reindex = False

ff_index = OnDiskIndex(
    Path("ffindex_fiqa_tct.h5"), dim=768, query_encoder=q_encoder, mode=Mode.MAXP
)

def docs_iter():
    for d in dataset.get_corpus_iter():
        yield {"doc_id": d["docno"], "text": d["text"]}

if should_reindex:
    ff_indexer = Indexer(ff_index, d_encoder, batch_size=1)
    ff_indexer.index_dicts(docs_iter())

If the index is present on disk, we can load it directly in Memory (but requires some RAM)

In [55]:
from fast_forward import OnDiskIndex, Mode

ff_index = OnDiskIndex.load(
    Path("ffindex_fiqa_tct.h5"), query_encoder=q_encoder, mode=Mode.MAXP
)
ff_index = ff_index.to_memory()


100%|██████████| 57638/57638 [00:00<00:00, 2373018.84it/s]


# Re-ranking BM25 Results

In [56]:
from fast_forward.util.pyterrier import FFScore

ff_score = FFScore(ff_index)

In [57]:
candidates = (bm25 % 5)(testset.get_topics()) # Get the candidates
candidates

Unnamed: 0,qid,docid,docno,rank,score,query
0,4641,36224,376148,0,41.677305,where should i park my rainy day emergency fund
1,4641,47916,497993,1,29.149791,where should i park my rainy day emergency fund
2,4641,55690,580025,2,26.773005,where should i park my rainy day emergency fund
3,4641,24501,253614,3,26.640181,where should i park my rainy day emergency fund
4,4641,3157,32833,4,24.265187,where should i park my rainy day emergency fund
...,...,...,...,...,...,...
644153,2399,33136,343489,0,33.280064,where do web sites get foreign exchange currency rate quote information
644154,2399,6704,69171,1,31.335596,where do web sites get foreign exchange currency rate quote information
644155,2399,4148,43046,2,31.265964,where do web sites get foreign exchange currency rate quote information
644156,2399,46670,484891,3,29.869008,where do web sites get foreign exchange currency rate quote information


In [58]:
re_ranked = ff_score(candidates)
re_ranked

Unnamed: 0,qid,docno,score_0,score,query
0,4641,376148,41.677305,67.637367,where should i park my rainy day emergency fund
1,4641,497993,29.149791,68.335365,where should i park my rainy day emergency fund
2,4641,580025,26.773005,67.524490,where should i park my rainy day emergency fund
3,4641,253614,26.640181,67.098091,where should i park my rainy day emergency fund
4,4641,32833,24.265187,68.345680,where should i park my rainy day emergency fund
...,...,...,...,...,...
3235,2399,343489,33.280064,69.723114,where do web sites get foreign exchange currency rate quote information
3236,2399,69171,31.335596,67.643097,where do web sites get foreign exchange currency rate quote information
3237,2399,43046,31.265964,67.293716,where do web sites get foreign exchange currency rate quote information
3238,2399,484891,29.869008,68.099777,where do web sites get foreign exchange currency rate quote information


In [59]:
from fast_forward.util.pyterrier import FFInterpolate

ff_int = FFInterpolate(alpha=0.1)
ff_int(re_ranked)

Unnamed: 0,qid,docno,query,score
0,4641,376148,where should i park my rainy day emergency fund,65.041361
1,4641,497993,where should i park my rainy day emergency fund,64.416806
2,4641,580025,where should i park my rainy day emergency fund,63.449342
3,4641,253614,where should i park my rainy day emergency fund,63.052297
4,4641,32833,where should i park my rainy day emergency fund,63.937631
...,...,...,...,...
3235,2399,343489,where do web sites get foreign exchange currency rate quote information,66.078807
3236,2399,69171,where do web sites get foreign exchange currency rate quote information,64.012344
3237,2399,43046,where do web sites get foreign exchange currency rate quote information,63.690939
3238,2399,484891,where do web sites get foreign exchange currency rate quote information,64.276700


In [28]:
# Find best Alpha

devset = pt.get_dataset("irds:beir/fiqa/dev")
pt.GridSearch(
    ~bm25 % 100 >> ff_score >> ff_int,
    {ff_int: {"alpha": [0.05, 0.1, 0.5, 0.9]}},
    devset.get_topics(),
    devset.get_qrels(),
    "map",
    verbose=True,
)
ff_int.alpha

GridScan:   0%|          | 0/4 [00:00<?, ?it/s]

Best map is 0.282656
Best setting is ['<fast_forward.util.pyterrier.FFInterpolate object at 0x370bb53d0> alpha=0.1']


0.1

# Results

In [29]:
print(ff_int.alpha)
testset.get_topics()

0.1


Unnamed: 0,qid,query
0,4641,where should i park my rainy day emergency fund
1,5503,tax considerations for selling a property belo...
2,7803,can the delta be used to calculate the option ...
3,7017,basic algorithmic trading strategy
4,10152,what does a high operating margin but a small ...
...,...,...
643,4102,how can i determine if my rate of return is go...
644,3566,where can i buy stocks if i only want to inves...
645,94,using credit card points to pay for tax deduct...
646,2551,how to find cheaper alternatives to a traditio...


In [18]:
qe_pipeline = bm25 >> rm3
qe_pipeline(testset.get_topics())

Unnamed: 0,qid,query_0,query
0,10034,tax implications of holding ewu or other such ...,applypipeline:off tax^0.085714296 ewu^0.085714...
1,10039,do individual investors use google to obtain s...,applypipeline:off obtain^0.100000009 googl^0.1...
2,10109,why does charles schwab have a mandatory settl...,applypipeline:off transact^0.025715945 sell^0....
3,10122,why diversify stocks investments,applypipeline:off portfolio^0.040961910 higher...
4,10136,how to minimise the risk of a reduction in pur...,applypipeline:off account^0.054545458 loss^0.0...
...,...,...,...
643,9882,money market or cash type etfs for foreigners ...,applypipeline:off cash^0.075000003 account^0.0...
644,9925,what does chapter 11 bankruptcy mean to an inv...,applypipeline:off mean^0.060000002 creditor^0....
645,9929,investing in commodities pros and cons,applypipeline:off excel^0.050314460 wrong^0.00...
646,9961,employer rollover from 403b to 401k,applypipeline:off 403b^0.199246123 rollov^0.19...


In [60]:
import re

def _remove_pollution(q) -> str:
    q_old = q["query"].replace('applypipeline:off', '')
    return q["query_1"] + " " + re.sub(r'\^(\d)+\.(\d)+', '', q_old)

# (qe_pipeline >> pt.apply.query(_remove_pollution))(testset.get_topics())

pipeline = qe_pipeline >> pt.apply.query(_remove_pollution) >> bm25

pipeline(testset.get_topics())

Unnamed: 0,qid,docid,docno,rank,score,query_1,query_0,query
0,10034,19011,197478,0,56.546712,tax implications of holding ewu or other such uk etfs as a us citizen,applypipeline:off tax^0.085714296 ewu^0.085714296 etf^0.085714296 rule^0.029999999 requir^0.040000003 gener^0.029999999 uk^0.085714296 research^0.029999999 foreign^0.029999999 carefulli^0.029999999 unattract^0.029999999 implic^0.175714284 hold^0.085714296 citizen^0.135714293 includ^0.040000003,tax implications of holding ewu or other such uk etfs as a us citizen tax ewu etf rule requir gener uk research foreign carefulli unattract implic hold citizen includ
1,10034,4347,44955,1,35.858072,tax implications of holding ewu or other such uk etfs as a us citizen,applypipeline:off tax^0.085714296 ewu^0.085714296 etf^0.085714296 rule^0.029999999 requir^0.040000003 gener^0.029999999 uk^0.085714296 research^0.029999999 foreign^0.029999999 carefulli^0.029999999 unattract^0.029999999 implic^0.175714284 hold^0.085714296 citizen^0.135714293 includ^0.040000003,tax implications of holding ewu or other such uk etfs as a us citizen tax ewu etf rule requir gener uk research foreign carefulli unattract implic hold citizen includ
2,10034,36714,381884,2,28.737805,tax implications of holding ewu or other such uk etfs as a us citizen,applypipeline:off tax^0.085714296 ewu^0.085714296 etf^0.085714296 rule^0.029999999 requir^0.040000003 gener^0.029999999 uk^0.085714296 research^0.029999999 foreign^0.029999999 carefulli^0.029999999 unattract^0.029999999 implic^0.175714284 hold^0.085714296 citizen^0.135714293 includ^0.040000003,tax implications of holding ewu or other such uk etfs as a us citizen tax ewu etf rule requir gener uk research foreign carefulli unattract implic hold citizen includ
3,10034,7335,75568,3,27.248455,tax implications of holding ewu or other such uk etfs as a us citizen,applypipeline:off tax^0.085714296 ewu^0.085714296 etf^0.085714296 rule^0.029999999 requir^0.040000003 gener^0.029999999 uk^0.085714296 research^0.029999999 foreign^0.029999999 carefulli^0.029999999 unattract^0.029999999 implic^0.175714284 hold^0.085714296 citizen^0.135714293 includ^0.040000003,tax implications of holding ewu or other such uk etfs as a us citizen tax ewu etf rule requir gener uk research foreign carefulli unattract implic hold citizen includ
4,10034,50877,528880,4,26.221501,tax implications of holding ewu or other such uk etfs as a us citizen,applypipeline:off tax^0.085714296 ewu^0.085714296 etf^0.085714296 rule^0.029999999 requir^0.040000003 gener^0.029999999 uk^0.085714296 research^0.029999999 foreign^0.029999999 carefulli^0.029999999 unattract^0.029999999 implic^0.175714284 hold^0.085714296 citizen^0.135714293 includ^0.040000003,tax implications of holding ewu or other such uk etfs as a us citizen tax ewu etf rule requir gener uk research foreign carefulli unattract implic hold citizen includ
...,...,...,...,...,...,...,...,...
647995,9979,15061,155245,995,8.637931,what is the best way to invest in gold as a hedge against inflation without having to hold physical gold,applypipeline:off gold^0.273850530 store^0.016868457 valu^0.044504233 gld^0.029231133 physic^0.111028001 best^0.085273385 hedg^0.088927276 wai^0.066666670 hold^0.066666670 bullion^0.016176451 inflat^0.115533784 invest^0.066666670 consid^0.018606717,what is the best way to invest in gold as a hedge against inflation without having to hold physical gold gold store valu gld physic best hedg wai hold bullion inflat invest consid
647996,9979,4728,49069,996,8.634448,what is the best way to invest in gold as a hedge against inflation without having to hold physical gold,applypipeline:off gold^0.273850530 store^0.016868457 valu^0.044504233 gld^0.029231133 physic^0.111028001 best^0.085273385 hedg^0.088927276 wai^0.066666670 hold^0.066666670 bullion^0.016176451 inflat^0.115533784 invest^0.066666670 consid^0.018606717,what is the best way to invest in gold as a hedge against inflation without having to hold physical gold gold store valu gld physic best hedg wai hold bullion inflat invest consid
647997,9979,6525,67327,997,8.632415,what is the best way to invest in gold as a hedge against inflation without having to hold physical gold,applypipeline:off gold^0.273850530 store^0.016868457 valu^0.044504233 gld^0.029231133 physic^0.111028001 best^0.085273385 hedg^0.088927276 wai^0.066666670 hold^0.066666670 bullion^0.016176451 inflat^0.115533784 invest^0.066666670 consid^0.018606717,what is the best way to invest in gold as a hedge against inflation without having to hold physical gold gold store valu gld physic best hedg wai hold bullion inflat invest consid
647998,9979,41999,437383,998,8.629889,what is the best way to invest in gold as a hedge against inflation without having to hold physical gold,applypipeline:off gold^0.273850530 store^0.016868457 valu^0.044504233 gld^0.029231133 physic^0.111028001 best^0.085273385 hedg^0.088927276 wai^0.066666670 hold^0.066666670 bullion^0.016176451 inflat^0.115533784 invest^0.066666670 consid^0.018606717,what is the best way to invest in gold as a hedge against inflation without having to hold physical gold gold store valu gld physic best hedg wai hold bullion inflat invest consid


In [None]:
(bm25 % 1000)(testset.get_topics())

In [63]:
pt.Experiment(
    [
        bm25,
        bm25 >> rm3 >> bm25,
        bm25 % 1 >> rm3 >> bm25,
        bm25 % 1000 >> ff_score >> ff_int,
        pipeline >> ff_score >> ff_int
    ],
    testset.get_topics(),
    testset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
    names=[
        "BM25",
        "RM3",
        "RM3 % 1",
        "BM25 >> FF",
        "BM25 >> RM3 >> FF"
    ],
)

Unnamed: 0,name,RR@10,nDCG@10,AP@100
0,BM25,0.310271,0.252589,0.20864
1,RM3,0.264714,0.228014,0.183207
2,RM3 % 1,0.300119,0.247575,0.208084
3,BM25 >> FF,0.384861,0.315708,0.26389
4,BM25 >> RM3 >> FF,0.295702,0.256056,0.207307
