In [45]:
pip install python-terrier==0.10.0 fast-forward-indexes==0.2.0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [1]:
import pyterrier as pt

if not pt.started():
    pt.init(
        tqdm="notebook",
        boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"]
    )

PyTerrier 0.10.0 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [69]:
DATASET_NAME = "beir/webis-touche2020/v2"
TESTSET_NAME = "beir/webis-touche2020/v2"
BM25_INDEX_PATH = 'indices/webis_idx_blocks'
INDEX_PATH = 'indexes/irds:beir_webis-touche2020_v2.h5'
FIELDS = ["text"]

In [70]:
import ir_datasets
dataset = pt.get_dataset('irds:' + DATASET_NAME)
ir_ds = ir_datasets.load(DATASET_NAME)

In [71]:
from pathlib import Path

idx_path = Path(BM25_INDEX_PATH).absolute()

index_ref = pt.index.IterDictIndexer(
    str(idx_path),
    blocks=True,
    meta={'docno': ir_ds.docs_metadata()['fields']['doc_id']['max_len'] },
    # stopwords=None,
    # stemmer=None,
).index(dataset.get_corpus_iter(), fields=FIELDS)
index_ref = index_ref.to_memory()

beir/webis-touche2020/v2 documents:   0%|          | 0/382545 [00:00<?, ?it/s]

ValueError: Index already exists at /Users/tomighita/Scoala/Facultate/University-Courses/RP/indices/webis_idx_blocks/data.properties

In [72]:
from pyterrier.measures import RR, nDCG, MAP

index = pt.IndexFactory.of(str(idx_path))

bm25 = pt.BatchRetrieve(index, wmodel="BM25")
rm3 = pt.rewrite.RM3(index)
testset = pt.get_dataset('irds:' + TESTSET_NAME)

In [73]:
from fast_forward.encoder import TCTColBERTQueryEncoder, TCTColBERTDocumentEncoder
import torch

q_encoder = TCTColBERTQueryEncoder("castorini/tct_colbert-msmarco")
d_encoder = TCTColBERTDocumentEncoder(
    "castorini/tct_colbert-msmarco",
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)
q_encoder(["Test query 1", "Test query 2"])

array([[-0.0380525 ,  0.01848466,  0.05137944, ..., -0.04796502,
         0.00918062, -0.03880693],
       [-0.06809073,  0.02582865,  0.09803923, ..., -0.09031374,
         0.00014139, -0.06282968]], dtype=float32)

In [74]:
from fast_forward import OnDiskIndex, Mode

ff_index = OnDiskIndex.load(
    Path(INDEX_PATH), query_encoder=q_encoder, mode=Mode.MAXP
)
ff_index = ff_index.to_memory()


  0%|          | 0/382545 [00:00<?, ?it/s][A
100%|██████████| 382545/382545 [00:00<00:00, 1902895.45it/s][A


In [75]:
from fast_forward.util.pyterrier import FFScore
from fast_forward.util.pyterrier import FFInterpolate

ff_score = FFScore(ff_index)
candidates = (bm25 % 5)(testset.get_topics('text')) # Get the candidates
re_ranked = ff_score(candidates)
ff_int = FFInterpolate(alpha=0.05)
ff_int(re_ranked)

Unnamed: 0,qid,docno,query,score
0,1,51530f3f-2019-04-18T18:15:02Z-00004-000,should teachers get tenure,68.917621
1,1,b0680508-2019-04-18T13:48:51Z-00002-000,should teachers get tenure,68.690103
2,1,c065954f-2019-04-18T14:32:52Z-00003-000,should teachers get tenure,69.071692
3,1,c065954f-2019-04-18T14:32:52Z-00001-000,should teachers get tenure,68.714918
4,1,ff0947ec-2019-04-18T12:23:12Z-00000-000,should teachers get tenure,68.880417
...,...,...,...,...
240,50,ffdf2e2e-2019-04-18T11:43:09Z-00005-000,should everyone get a universal basic income,67.569879
241,50,b7051d6f-2019-04-18T11:25:14Z-00001-000,should everyone get a universal basic income,66.704010
242,50,4d103793-2019-04-18T11:35:54Z-00007-000,should everyone get a universal basic income,67.924480
243,50,4d103774-2019-04-18T13:49:55Z-00005-000,should everyone get a universal basic income,66.832887


# Per Query Analysis

Run an experiment with the `perquery = True` flag to see individual query performance

In [77]:
result = pt.Experiment(
    [
        bm25 % 1000 >> ff_score >> ff_int,
        bm25 % 5 >> rm3 >> bm25 % 1000 >> pt.rewrite.reset() >> ff_score >> ff_int,
    ],
    testset.get_topics('text'),
    testset.get_qrels(),
    eval_metrics=[RR @ 10], # Use this for BEIR datasets
    # eval_metrics=[RR (rel = 2) @ 10], # Use this for TREC-DL datasets
    names=["TCT-ColBERT", "RM3+TCT-ColBERT"],
    perquery=True
)
result

Unnamed: 0,name,qid,measure,value
49,RM3+TCT-ColBERT,1,RR@10,1.0
50,RM3+TCT-ColBERT,10,RR@10,0.5
51,RM3+TCT-ColBERT,11,RR@10,1.0
52,RM3+TCT-ColBERT,12,RR@10,0.2
53,RM3+TCT-ColBERT,13,RR@10,1.0
...,...,...,...,...
48,TCT-ColBERT,50,RR@10,0.5
5,TCT-ColBERT,6,RR@10,1.0
6,TCT-ColBERT,7,RR@10,0.5
7,TCT-ColBERT,8,RR@10,0.0


Group by the qid and calculate the Delta RR score, by which we sort to get the queries we are interested in

In [78]:
import pandas as pd

df = result
df['value'] = pd.to_numeric(result['value'])
df.groupby("qid")[['value']].apply(lambda g: g - g.iloc[0])

Unnamed: 0_level_0,Unnamed: 1_level_0,value
qid,Unnamed: 1_level_1,Unnamed: 2_level_1
1,49,0.0
1,0,0.0
10,50,0.0
10,9,0.0
11,51,0.0
...,...,...
7,6,0.0
8,96,0.0
8,7,0.0
9,97,0.0


Select the query with the largest delta RR (lowest value in cell above)

In [79]:
REPRESENTATIVE_QUERY_ID = '5'
all_topics = testset.get_topics('text')
QUERY_SUBSET = all_topics[all_topics['qid'] == REPRESENTATIVE_QUERY_ID]
QUERY_SUBSET

Unnamed: 0,qid,query
4,5,should social security be privatized


Get the QRels from which we need to filter out the relevant document for our query

In [80]:
qrels = testset.get_qrels()
qrels[qrels['qid'] == REPRESENTATIVE_QUERY_ID]

Unnamed: 0,qid,docno,label,iteration
179,5,fb9d1caa-2019-04-18T19:57:07Z-00003-000,0,0
180,5,f8a2755c-2019-04-18T14:34:51Z-00003-000,0,0
181,5,f898f8b6-2019-04-18T11:07:53Z-00001-000,0,0
182,5,ed146d2b-2019-04-18T18:30:30Z-00003-000,2,0
183,5,e28c98a3-2019-04-18T13:23:25Z-00005-000,2,0
184,5,cf4c9cbf-2019-04-17T11:47:24Z-00074-000,1,0
185,5,a7deb84a-2019-04-18T18:13:23Z-00000-000,0,0
186,5,adb11e8-2019-04-18T13:30:49Z-00001-000,0,0
187,5,cf4c9cbf-2019-04-17T11:47:24Z-00034-000,1,0
188,5,ccb2cd3b-2019-04-19T12:44:47Z-00002-000,0,0


Verify Delta RR corresponds

In [84]:
pt.Experiment(
    [
        bm25 % 1000 >> ff_score >> ff_int,
        bm25 % 5 >> rm3 >> bm25 % 1000 >> pt.rewrite.reset() >> ff_score >> ff_int,
    ],
    QUERY_SUBSET,
    testset.get_qrels(),
    eval_metrics=[RR @ 10], # Use this for BEIR datasets
    # eval_metrics=[RR (rel = 2) @ 10], # Use this for TREC-DL datasets
    names=["TCT-ColBERT", "RM3+TCT-ColBERT"],
    perquery=True
)

Unnamed: 0,name,qid,measure,value
1,RM3+TCT-ColBERT,5,RR@10,1.0
0,TCT-ColBERT,5,RR@10,0.5


See ranking for RM3+TCT-ColBERT

In [85]:
df = (bm25 % 5 >> rm3 >> bm25 % 1000 >> pt.rewrite.reset() >> ff_score >> ff_int)(QUERY_SUBSET)
df[df['qid'] == REPRESENTATIVE_QUERY_ID]

Unnamed: 0,qid,docno,query,score
0,5,cf4c9cbf-2019-04-17T11:47:24Z-00055-000,should social security be privatized,67.410446
1,5,2d6f4e75-2019-04-15T20:22:43Z-00009-000,should social security be privatized,67.025839
2,5,2d6f4e75-2019-04-15T20:22:43Z-00007-000,should social security be privatized,67.664482
3,5,dac7811d-2019-04-18T20:00:32Z-00001-000,should social security be privatized,66.345922
4,5,cf4c9cbf-2019-04-17T11:47:24Z-00062-000,should social security be privatized,67.406605
...,...,...,...,...
995,5,f12999ea-2019-04-18T15:44:02Z-00005-000,should social security be privatized,63.437451
996,5,1e31a0c1-2019-04-18T20:00:01Z-00003-000,should social security be privatized,64.900629
997,5,319205df-2019-04-15T20:23:04Z-00007-000,should social security be privatized,63.765195
998,5,7d47f8ba-2019-04-18T19:36:54Z-00004-000,should social security be privatized,64.065127


See ranking for TCT-ColBERT

In [83]:
df2 = (bm25 % 1000 >> ff_score >> ff_int)(QUERY_SUBSET)
df2[df2['qid'] == REPRESENTATIVE_QUERY_ID]

Unnamed: 0,qid,docno,query,score
0,5,2d6f4e75-2019-04-15T20:22:43Z-00009-000,should social security be privatized,66.802201
1,5,cf4c9cbf-2019-04-17T11:47:24Z-00055-000,should social security be privatized,67.119543
2,5,2d6f4e75-2019-04-15T20:22:43Z-00007-000,should social security be privatized,67.543852
3,5,cf4c9cbf-2019-04-17T11:47:24Z-00062-000,should social security be privatized,67.329656
4,5,2d6f4e75-2019-04-15T20:22:43Z-00015-000,should social security be privatized,66.796272
...,...,...,...,...
995,5,dd3769ca-2019-04-18T19:44:34Z-00002-000,should social security be privatized,63.296032
996,5,d8d74905-2019-04-18T15:19:51Z-00002-000,should social security be privatized,63.953152
997,5,ab490cb9-2019-04-18T19:45:33Z-00003-000,should social security be privatized,64.769290
998,5,ae43584a-2019-04-18T12:49:12Z-00000-000,should social security be privatized,63.262071


Inspect how the query is expanded:

In [87]:
df3 = (bm25 % 5 >> rm3 >> bm25 % 100)(testset.get_topics('text'))
df3[df3['qid'] == REPRESENTATIVE_QUERY_ID]

Unnamed: 0,qid,docid,docno,rank,score,query_0,query
43000,5,370977,cf4c9cbf-2019-04-17T11:47:24Z-00055-000,0,32.324364,should social security be privatized,applypipeline:off social^0.280862361 invest^0.016216118 privat^0.260482877 elderli^0.031810354 benefit^0.026657782 system^0.015973436 secur^0.279393673 poor^0.045462392 retir^0.020409806 incom^0.022731196
43001,5,334864,2d6f4e75-2019-04-15T20:22:43Z-00009-000,1,31.137817,should social security be privatized,applypipeline:off social^0.280862361 invest^0.016216118 privat^0.260482877 elderli^0.031810354 benefit^0.026657782 system^0.015973436 secur^0.279393673 poor^0.045462392 retir^0.020409806 incom^0.022731196
43002,5,334863,2d6f4e75-2019-04-15T20:22:43Z-00007-000,2,28.741153,should social security be privatized,applypipeline:off social^0.280862361 invest^0.016216118 privat^0.260482877 elderli^0.031810354 benefit^0.026657782 system^0.015973436 secur^0.279393673 poor^0.045462392 retir^0.020409806 incom^0.022731196
43003,5,281446,dac7811d-2019-04-18T20:00:32Z-00001-000,3,28.247629,should social security be privatized,applypipeline:off social^0.280862361 invest^0.016216118 privat^0.260482877 elderli^0.031810354 benefit^0.026657782 system^0.015973436 secur^0.279393673 poor^0.045462392 retir^0.020409806 incom^0.022731196
43004,5,370971,cf4c9cbf-2019-04-17T11:47:24Z-00062-000,4,27.468185,should social security be privatized,applypipeline:off social^0.280862361 invest^0.016216118 privat^0.260482877 elderli^0.031810354 benefit^0.026657782 system^0.015973436 secur^0.279393673 poor^0.045462392 retir^0.020409806 incom^0.022731196
...,...,...,...,...,...,...,...
43095,5,307560,8c527629-2019-04-18T19:33:00Z-00003-000,95,20.258738,should social security be privatized,applypipeline:off social^0.280862361 invest^0.016216118 privat^0.260482877 elderli^0.031810354 benefit^0.026657782 system^0.015973436 secur^0.279393673 poor^0.045462392 retir^0.020409806 incom^0.022731196
43096,5,187761,988cf27-2019-04-18T12:54:20Z-00004-000,96,20.243477,should social security be privatized,applypipeline:off social^0.280862361 invest^0.016216118 privat^0.260482877 elderli^0.031810354 benefit^0.026657782 system^0.015973436 secur^0.279393673 poor^0.045462392 retir^0.020409806 incom^0.022731196
43097,5,20314,969c1d86-2019-04-18T18:25:10Z-00002-000,97,20.154339,should social security be privatized,applypipeline:off social^0.280862361 invest^0.016216118 privat^0.260482877 elderli^0.031810354 benefit^0.026657782 system^0.015973436 secur^0.279393673 poor^0.045462392 retir^0.020409806 incom^0.022731196
43098,5,242953,effbd50f-2019-04-18T17:55:11Z-00005-000,98,20.137370,should social security be privatized,applypipeline:off social^0.280862361 invest^0.016216118 privat^0.260482877 elderli^0.031810354 benefit^0.026657782 system^0.015973436 secur^0.279393673 poor^0.045462392 retir^0.020409806 incom^0.022731196


See ranking for BM25

In [88]:
df4 = (bm25 % 100)(testset.get_topics('text'))
df4[df4['qid'] == REPRESENTATIVE_QUERY_ID]

Unnamed: 0,qid,docid,docno,rank,score,query
4000,5,334864,2d6f4e75-2019-04-15T20:22:43Z-00009-000,0,26.665043,should social security be privatized
4001,5,370977,cf4c9cbf-2019-04-17T11:47:24Z-00055-000,1,26.506306,should social security be privatized
4002,5,334863,2d6f4e75-2019-04-15T20:22:43Z-00007-000,2,26.328549,should social security be privatized
4003,5,370971,cf4c9cbf-2019-04-17T11:47:24Z-00062-000,3,25.929203,should social security be privatized
4004,5,334867,2d6f4e75-2019-04-15T20:22:43Z-00015-000,4,25.403738,should social security be privatized
...,...,...,...,...,...,...
4095,5,266819,c61cb529-2019-04-18T19:57:16Z-00002-000,95,19.168881,should social security be privatized
4096,5,309353,38b0fdf5-2019-04-18T12:55:15Z-00004-000,96,19.157993,should social security be privatized
4097,5,233198,8270f66a-2019-04-18T11:46:47Z-00005-000,97,19.118600,should social security be privatized
4098,5,305442,4809e6cc-2019-04-18T13:30:35Z-00003-000,98,19.084106,should social security be privatized


Find the document(s) we are interested in

In [86]:
for q in ir_ds.docs_iter():
    if q.doc_id not in ['2d6f4e75-2019-04-15T20:22:43Z-00007-000']:
        continue
    print(q.text)
    print("Document ID:" + q.doc_id)
    print("---------------")

Privatizing social security would enable investment of savings. Commentator Alex Schibuola argues that: "If Social Security were privatized, people would deposit their income with a bank. People actually save resources that businesses can invest. We, as true savers, get more resources in the future."[1]  As a result private accounts would also increase investments, jobs and wages. Michael Tanner of the think tank the Cato Institute argues: "Social Security drains capital from the poorest areas of the country, leaving less money available for new investment and job creation. Privatization would increase national savings and provide a new pool of capital for investment that would be particularly beneficial to the poor."[2] Currently Social Security represents a net loss for taxpayers and beneficiaries. Social Security, although key to the restructuring the of USA’s social contract following the great depression, represents a bad deal for the post-war American economy. Moreover, this deal