Install python-terrier and other libs

In [1]:
pip install python-terrier==0.10.0 fast-forward-indexes==0.2.0

Note: you may need to restart the kernel to use updated packages.


Configure PyTerrier

In [2]:
import pyterrier as pt

if not pt.started():
    pt.init(
        tqdm="notebook",
        boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"]
    )

PyTerrier 0.10.0 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


Import Dataset

In [3]:
dataset = pt.get_dataset("irds:beir/webis-touche2020/v2")

Create a lexical index (for `BM25` and `RM3`)

In [14]:
from pathlib import Path

idx_path = Path("indices/webis_idx_blocks").absolute()

index_ref = pt.index.IterDictIndexer(
    str(idx_path),
    meta={'docno': 40},
    blocks=True,
    # stopwords=None,
    # stemmer=None,
).index(dataset.get_corpus_iter(), fields=["text"])

beir/webis-touche2020/v2 documents:   0%|          | 0/382545 [00:00<?, ?it/s]

12:53:50.952 [ForkJoinPool-1-worker-1] WARN org.terrier.structures.indexing.Indexer - Indexed 1843 empty documents


Create a baseline (BM25 performance)

In [16]:
from pyterrier.measures import RR, nDCG, MAP

index = pt.IndexFactory.of(str(idx_path))

bm25 = pt.BatchRetrieve(index, wmodel="BM25")
rm3 = pt.rewrite.RM3(index)
testset = pt.get_dataset("irds:beir/webis-touche2020/v2")
pt.Experiment(
    [bm25],
    testset.get_topics('text'),
    testset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
)

Unnamed: 0,name,RR@10,nDCG@10,AP@100
0,BR(BM25),0.622846,0.342774,0.209593


# Create the Fast-Forward Indices for TCT-ColBERT

In [17]:
(bm25 % 5)(testset.get_topics('text')) #candidates

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,143806,51530f3f-2019-04-18T18:15:02Z-00004-000,0,31.770993,should teachers get tenure
1,1,164415,b0680508-2019-04-18T13:48:51Z-00002-000,1,31.583122,should teachers get tenure
2,1,4619,c065954f-2019-04-18T14:32:52Z-00003-000,2,31.521730,should teachers get tenure
3,1,4617,c065954f-2019-04-18T14:32:52Z-00001-000,3,31.476861,should teachers get tenure
4,1,163479,ff0947ec-2019-04-18T12:23:12Z-00000-000,4,31.385355,should teachers get tenure
...,...,...,...,...,...,...
48000,50,172150,ffdf2e2e-2019-04-18T11:43:09Z-00005-000,0,22.774540,should everyone get a universal basic income
48001,50,201026,b7051d6f-2019-04-18T11:25:14Z-00001-000,1,22.677757,should everyone get a universal basic income
48002,50,16511,4d103793-2019-04-18T11:35:54Z-00007-000,2,22.232733,should everyone get a universal basic income
48003,50,108425,4d103774-2019-04-18T13:49:55Z-00005-000,3,20.868402,should everyone get a universal basic income


### Create the Encoder

In [18]:
from fast_forward.encoder import TCTColBERTQueryEncoder, TCTColBERTDocumentEncoder
import torch

q_encoder = TCTColBERTQueryEncoder("castorini/tct_colbert-msmarco")
d_encoder = TCTColBERTDocumentEncoder(
    "castorini/tct_colbert-msmarco",
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)

Test the Encoder

In [19]:
q_encoder(["Test query 1", "Test query 2"])

array([[-0.0380525 ,  0.01848466,  0.05137944, ..., -0.04796502,
         0.00918062, -0.03880693],
       [-0.06809073,  0.02582865,  0.09803923, ..., -0.09031374,
         0.00014139, -0.06282968]], dtype=float32)

### Create the Index
*(Warning)* This operation takes a long time to complete!

In [20]:
from fast_forward import OnDiskIndex, Mode, Indexer

ff_index = OnDiskIndex(
    Path("indices/irds:beir_webis-touche2020_v2.h5"), dim=768, query_encoder=q_encoder, mode=Mode.MAXP
)

def docs_iter():
    for d in dataset.get_corpus_iter():
        yield {"doc_id": d["docno"], "text": d["text"]}

ff_indexer = Indexer(ff_index, d_encoder, batch_size=1)
ff_indexer.index_dicts(docs_iter())

ValueError: File irds:beir_webis-touche2020_v2.h5 exists.

If the index is present on disk, we can load it directly in Memory (but requires some RAM)

In [21]:
from fast_forward import OnDiskIndex, Mode

ff_index = OnDiskIndex.load(
    Path("indices/irds:beir_webis-touche2020_v2.h5"), query_encoder=q_encoder, mode=Mode.MAXP
)
ff_index = ff_index.to_memory()

100%|██████████| 382545/382545 [00:00<00:00, 1003050.09it/s]


# Re-ranking BM25 Results

In [22]:
from fast_forward.util.pyterrier import FFScore

ff_score = FFScore(ff_index)

In [25]:
candidates = (bm25 % 5)(testset.get_topics('text')) # Get the candidates
candidates

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,143806,51530f3f-2019-04-18T18:15:02Z-00004-000,0,31.770993,should teachers get tenure
1,1,164415,b0680508-2019-04-18T13:48:51Z-00002-000,1,31.583122,should teachers get tenure
2,1,4619,c065954f-2019-04-18T14:32:52Z-00003-000,2,31.521730,should teachers get tenure
3,1,4617,c065954f-2019-04-18T14:32:52Z-00001-000,3,31.476861,should teachers get tenure
4,1,163479,ff0947ec-2019-04-18T12:23:12Z-00000-000,4,31.385355,should teachers get tenure
...,...,...,...,...,...,...
48000,50,172150,ffdf2e2e-2019-04-18T11:43:09Z-00005-000,0,22.774540,should everyone get a universal basic income
48001,50,201026,b7051d6f-2019-04-18T11:25:14Z-00001-000,1,22.677757,should everyone get a universal basic income
48002,50,16511,4d103793-2019-04-18T11:35:54Z-00007-000,2,22.232733,should everyone get a universal basic income
48003,50,108425,4d103774-2019-04-18T13:49:55Z-00005-000,3,20.868402,should everyone get a universal basic income


In [26]:
re_ranked = ff_score(candidates)
re_ranked

Unnamed: 0,qid,docno,score_0,score,query
0,1,51530f3f-2019-04-18T18:15:02Z-00004-000,31.770993,70.872704,should teachers get tenure
1,1,b0680508-2019-04-18T13:48:51Z-00002-000,31.583122,70.643105,should teachers get tenure
2,1,c065954f-2019-04-18T14:32:52Z-00003-000,31.521730,71.048004,should teachers get tenure
3,1,c065954f-2019-04-18T14:32:52Z-00001-000,31.476861,70.674820,should teachers get tenure
4,1,ff0947ec-2019-04-18T12:23:12Z-00000-000,31.385355,70.853844,should teachers get tenure
...,...,...,...,...,...
240,50,ffdf2e2e-2019-04-18T11:43:09Z-00005-000,22.774540,69.927528,should everyone get a universal basic income
241,50,b7051d6f-2019-04-18T11:25:14Z-00001-000,22.677757,69.021179,should everyone get a universal basic income
242,50,4d103793-2019-04-18T11:35:54Z-00007-000,22.232733,70.329308,should everyone get a universal basic income
243,50,4d103774-2019-04-18T13:49:55Z-00005-000,20.868402,69.252075,should everyone get a universal basic income


In [27]:
from fast_forward.util.pyterrier import FFInterpolate

ff_int = FFInterpolate(alpha=0.5)
ff_int(re_ranked)

Unnamed: 0,qid,docno,query,score
0,1,51530f3f-2019-04-18T18:15:02Z-00004-000,should teachers get tenure,51.321848
1,1,b0680508-2019-04-18T13:48:51Z-00002-000,should teachers get tenure,51.113113
2,1,c065954f-2019-04-18T14:32:52Z-00003-000,should teachers get tenure,51.284867
3,1,c065954f-2019-04-18T14:32:52Z-00001-000,should teachers get tenure,51.075841
4,1,ff0947ec-2019-04-18T12:23:12Z-00000-000,should teachers get tenure,51.119600
...,...,...,...,...
240,50,ffdf2e2e-2019-04-18T11:43:09Z-00005-000,should everyone get a universal basic income,46.351034
241,50,b7051d6f-2019-04-18T11:25:14Z-00001-000,should everyone get a universal basic income,45.849468
242,50,4d103793-2019-04-18T11:35:54Z-00007-000,should everyone get a universal basic income,46.281020
243,50,4d103774-2019-04-18T13:49:55Z-00005-000,should everyone get a universal basic income,45.060239


In [28]:
# Find best Alpha

devset = pt.get_dataset("irds:beir/fiqa/dev")
pt.GridSearch(
    ~bm25 % 100 >> ff_score >> ff_int,
    {ff_int: {"alpha": [0.05, 0.1, 0.5, 0.9]}},
    devset.get_topics(),
    devset.get_qrels(),
    "map",
    verbose=True,
)
print(ff_int.alpha)

GridScan:   0%|          | 0/4 [00:00<?, ?it/s]

Best map is 0.000000
Best setting is ['<fast_forward.util.pyterrier.FFInterpolate object at 0x17ffa5af0> alpha=0.05']
0.05


# Results

In [33]:
testset.get_topics('text')

Unnamed: 0,qid,query
0,1,should teachers get tenure
1,2,is vaping with e cigarettes safe
2,3,should insider trading be allowed
3,4,should corporal punishment be used in schools
4,5,should social security be privatized
5,6,is a college education worth it
6,7,should felons who have completed their sentence be allowed to vote
7,8,should abortion be legal
8,9,should students have to wear school uniforms
9,10,should any vaccines be required for children


In [42]:
qe_pipeline = bm25 % 5 >> rm3
qe_pipeline(testset.get_topics('text'))

Unnamed: 0,qid,query_0,query
0,1,should teachers get tenure,applypipeline:off school^0.032349806 teacher^0.440612972 fire^0.020328300 offer^0.011770617 job^0.013183307 tenur^0.426612645 educ^0.014762216 teach^0.017655928 posit^0.011611005 remov^0.011113217
1,10,should any vaccines be required for children,applypipeline:off school^0.038674839 vaccin^0.314297915 mandatori^0.022606660 requir^0.238039866 visa^0.022606660 children^0.266453862 risk^0.023241648 child^0.022966055 diseas^0.025329279 parent^0.025783228
2,11,should performance enhancing drugs be accepted in sports,applypipeline:off steroid^0.038802594 perform^0.173687547 drug^0.182416618 hall^0.016850894 fame^0.016850894 sport^0.195734143 athlet^0.028107595 enhanc^0.180147067 profession^0.029944489 accept^0.120000005 appli^0.017458132
3,12,should birth control pills be available over the counter,applypipeline:off sex^0.023842813 school^0.018721025 teenag^0.051245067 birth^0.231887504 peer^0.014482302 pressur^0.014482302 teen^0.019162556 pill^0.227687269 control^0.231887504 counter^0.150000006 parent^0.016601665
4,13,can alternative energy effectively replace fossil fuels,applypipeline:off energi^0.152789652 fuel^0.147528097 save^0.016811328 resourc^0.021018496 effect^0.085714296 altern^0.159448981 run^0.044489149 can^0.085714296 fossil^0.141753212 replac^0.111109912 cost^0.016811328 planet^0.016811328
5,14,is sexual orientation determined at birth,applypipeline:off orient^0.213993460 genit^0.028164396 birth^0.191272587 short^0.020681806 femal^0.037394743 male^0.037394743 appar^0.020681806 sexual^0.213993460 determin^0.195059389 born^0.041363612
6,15,should animals be used for scientific or commercial testing,applypipeline:off scientif^0.204585403 result^0.019330038 commerci^0.204585403 purpos^0.031630971 presum^0.019330038 topic^0.019330038 anim^0.254338264 signific^0.019330038 ethic^0.019330038 test^0.208209783
7,16,should prescription drugs be advertised directly to consumers,applypipeline:off 08^0.019211052 directli^0.120000005 health^0.016326081 2011^0.023155801 drug^0.207209975 advertis^0.209443867 access^0.019211052 direct^0.030993305 consum^0.163783342 patient^0.038612481 prescript^0.152053088
8,17,should recreational marijuana be legal,applypipeline:off teacher^0.010206084 drug^0.010206084 benefit^0.020412168 danger^0.026692834 legal^0.282028139 recreat^0.292011797 illeg^0.017795224 marijuana^0.303748786 tell^0.010206084 harm^0.026692834
9,18,should churches remain tax exempt,applypipeline:off remain^0.150000006 rich^0.017990451 sale^0.011732902 properti^0.014666129 church^0.260852933 commun^0.018301027 poor^0.022488067 tax^0.255366087 exempt^0.214162886 organ^0.015471329 remov^0.018968193


In [43]:
import re

def _remove_pollution(q) -> str:
    q_old = q["query"].replace('applypipeline:off', '')
    return q["query_1"] + " " + re.sub(r'\^(\d)+\.(\d)+', '', q_old)

# (qe_pipeline >> pt.apply.query(_remove_pollution))(testset.get_topics())

pipeline = qe_pipeline >> pt.apply.query(_remove_pollution) >> bm25

pipeline(testset.get_topics('text'))

Unnamed: 0,qid,docid,docno,rank,score,query_1,query_0,query
0,1,143806,51530f3f-2019-04-18T18:15:02Z-00004-000,0,102.532028,should teachers get tenure,applypipeline:off school^0.032349806 teacher^0.440612972 fire^0.020328300 offer^0.011770617 job^0.013183307 tenur^0.426612645 educ^0.014762216 teach^0.017655928 posit^0.011611005 remov^0.011113217,should teachers get tenure school teacher fire offer job tenur educ teach posit remov
1,1,164415,b0680508-2019-04-18T13:48:51Z-00002-000,1,96.717591,should teachers get tenure,applypipeline:off school^0.032349806 teacher^0.440612972 fire^0.020328300 offer^0.011770617 job^0.013183307 tenur^0.426612645 educ^0.014762216 teach^0.017655928 posit^0.011611005 remov^0.011113217,should teachers get tenure school teacher fire offer job tenur educ teach posit remov
2,1,4619,c065954f-2019-04-18T14:32:52Z-00003-000,2,94.407503,should teachers get tenure,applypipeline:off school^0.032349806 teacher^0.440612972 fire^0.020328300 offer^0.011770617 job^0.013183307 tenur^0.426612645 educ^0.014762216 teach^0.017655928 posit^0.011611005 remov^0.011113217,should teachers get tenure school teacher fire offer job tenur educ teach posit remov
3,1,4617,c065954f-2019-04-18T14:32:52Z-00001-000,3,93.737018,should teachers get tenure,applypipeline:off school^0.032349806 teacher^0.440612972 fire^0.020328300 offer^0.011770617 job^0.013183307 tenur^0.426612645 educ^0.014762216 teach^0.017655928 posit^0.011611005 remov^0.011113217,should teachers get tenure school teacher fire offer job tenur educ teach posit remov
4,1,163479,ff0947ec-2019-04-18T12:23:12Z-00000-000,4,91.611891,should teachers get tenure,applypipeline:off school^0.032349806 teacher^0.440612972 fire^0.020328300 offer^0.011770617 job^0.013183307 tenur^0.426612645 educ^0.014762216 teach^0.017655928 posit^0.011611005 remov^0.011113217,should teachers get tenure school teacher fire offer job tenur educ teach posit remov
...,...,...,...,...,...,...,...,...
48995,9,128896,f08e9f1b-2019-04-18T19:55:01Z-00000-000,995,22.481862,should students have to wear school uniforms,applypipeline:off school^0.237351269 troubl^0.007555717 won^0.010074289 student^0.222298265 bulli^0.010953694 wear^0.223566383 choos^0.010074289 uniform^0.241878211 design^0.006956442 cloth^0.029291473,should students have to wear school uniforms school troubl won student bulli wear choos uniform design cloth
48996,9,63640,e855b5e5-2019-04-18T18:40:04Z-00000-000,996,22.461359,should students have to wear school uniforms,applypipeline:off school^0.237351269 troubl^0.007555717 won^0.010074289 student^0.222298265 bulli^0.010953694 wear^0.223566383 choos^0.010074289 uniform^0.241878211 design^0.006956442 cloth^0.029291473,should students have to wear school uniforms school troubl won student bulli wear choos uniform design cloth
48997,9,110519,cf842d89-2019-04-18T17:28:01Z-00002-000,997,22.393708,should students have to wear school uniforms,applypipeline:off school^0.237351269 troubl^0.007555717 won^0.010074289 student^0.222298265 bulli^0.010953694 wear^0.223566383 choos^0.010074289 uniform^0.241878211 design^0.006956442 cloth^0.029291473,should students have to wear school uniforms school troubl won student bulli wear choos uniform design cloth
48998,9,353591,4b51d325-2019-04-19T12:44:49Z-00004-000,998,22.393708,should students have to wear school uniforms,applypipeline:off school^0.237351269 troubl^0.007555717 won^0.010074289 student^0.222298265 bulli^0.010953694 wear^0.223566383 choos^0.010074289 uniform^0.241878211 design^0.006956442 cloth^0.029291473,should students have to wear school uniforms school troubl won student bulli wear choos uniform design cloth


In [44]:
(~bm25 % 1000)(testset.get_topics())

There are multiple query fields available: ('text', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.


Unnamed: 0,qid,docid,docno,rank,score,query
0,1,143806,51530f3f-2019-04-18T18:15:02Z-00004-000,0,31.770993,should teachers get tenure
1,1,164415,b0680508-2019-04-18T13:48:51Z-00002-000,1,31.583122,should teachers get tenure
2,1,4619,c065954f-2019-04-18T14:32:52Z-00003-000,2,31.521730,should teachers get tenure
3,1,4617,c065954f-2019-04-18T14:32:52Z-00001-000,3,31.476861,should teachers get tenure
4,1,163479,ff0947ec-2019-04-18T12:23:12Z-00000-000,4,31.385355,should teachers get tenure
...,...,...,...,...,...,...
48995,50,24045,630f7c6f-2019-04-18T12:52:49Z-00001-000,995,9.781759,should everyone get a universal basic income
48996,50,284651,1d684498-2019-04-18T17:05:49Z-00001-000,996,9.781667,should everyone get a universal basic income
48997,50,181658,b4c02573-2019-04-18T11:34:49Z-00000-000,997,9.780855,should everyone get a universal basic income
48998,50,320637,7539ed46-2019-04-18T11:46:02Z-00004-000,998,9.780264,should everyone get a universal basic income


In [45]:
pt.Experiment(
    [
        bm25,
        bm25 >> rm3 >> bm25,
        bm25 % 1 >> rm3 >> bm25,
        bm25 % 1000 >> ff_score >> ff_int,
        pipeline >> ff_score >> ff_int
    ],
    testset.get_topics('text'),
    testset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
    names=[
        "BM25",
        "RM3",
        "RM3 % 1",
        "BM25 >> FF",
        "BM25 >> RM3 >> FF"
    ],
)

Unnamed: 0,name,RR@10,nDCG@10,AP@100
0,BM25,0.622846,0.342774,0.209593
1,RM3,0.567282,0.346563,0.219955
2,RM3 % 1,0.635374,0.37232,0.225382
3,BM25 >> FF,0.630977,0.35666,0.224018
4,BM25 >> RM3 >> FF,0.487156,0.291341,0.195742
