In [1]:
!pip install faiss-cpu fast-forward-indexes==0.2.0



In [12]:
import prebuilt_index_info
from urllib.request import urlretrieve
import tarfile 
import faiss
import os
from pathlib import Path
from fast_forward import OnDiskIndex
import shutil

DIR_NAME = 'indexes'
REMOVE_AFTER_DOWNLOAD = True
INTERESTING_DATASETS = ['beir-v1.0.0-trec-covid.contriever','beir-v1.0.0-arguana.contriever', 'beir-v1.0.0-cqadupstack-programmers.contriever-msmarco', 'beir-v1.0.0-cqadupstack-android.contriever','beir-v1.0.0-nfcorpus.contriever-msmarco']

for dataset_name in prebuilt_index_info.FAISS_INDEX_INFO_BEIR.keys():
    # placeholder
    if dataset_name not in INTERESTING_DATASETS:
        continue
        
    dataset_url = prebuilt_index_info.FAISS_INDEX_INFO_BEIR[dataset_name]['urls'][0]
    faiss_idx_filename = prebuilt_index_info.FAISS_INDEX_INFO_BEIR[dataset_name]['filename']
    faiss_dir_name = faiss_idx_filename[:-7]
    faiss_dir_full_path = os.path.join(DIR_NAME, faiss_dir_name)
    
    if not os.path.exists(Path(DIR_NAME, faiss_dir_name + ".h5")) and not os.path.exists(faiss_dir_full_path):
        print(f'Downloading archive: {faiss_idx_filename}')
        # Download archive
        urlretrieve(dataset_url, faiss_idx_filename)
        # Un-archive the .tar.gz
        archive_faiss = tarfile.open(faiss_idx_filename)  
        archive_faiss.extractall(DIR_NAME)
        archive_faiss.close()
        # Delete archive
        os.remove(faiss_idx_filename)
    
        print("Indexing...")    
        index = faiss.read_index(os.path.join(DIR_NAME, faiss_dir_name, "index"))
        with open(os.path.join(DIR_NAME, faiss_dir_name, "docid")) as fp:
            docids = list(fp.read().splitlines())
        
        vectors = index.reconstruct_n(0, len(docids))
        OnDiskIndex(Path(DIR_NAME, faiss_dir_name + ".h5"), 768, max_id_length = 60).add(vectors, doc_ids=docids)
    
        print(f'Finished indexing {dataset_name}.')
        
    if os.path.exists(faiss_dir_full_path) and REMOVE_AFTER_DOWNLOAD:
        shutil.rmtree(os.path.join(DIR_NAME, faiss_dir_name))

In [None]:
pip install python-terrier==0.10.0 fast-forward-indexes==0.2.0

In [3]:
import pyterrier as pt

if not pt.started():
    pt.init(
        tqdm="notebook",
        boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"]
    )

PyTerrier 0.10.0 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [4]:
DATASET_NAME = 'irds:beir/msmarco'
TESTSET_NAME = 'irds:msmarco-passage/trec-dl-hard'
BM25_INDEX_PATH = 'indices/msmarco'
INDEX_PATH = 'indexes/ff_msmarco-v1-passage.tct_colbert.h5'
FIELDS = ['text','title']

In [5]:
dataset = pt.get_dataset(DATASET_NAME)

In [6]:
from pathlib import Path

idx_path = Path(BM25_INDEX_PATH).absolute()

index_ref = pt.index.IterDictIndexer(
    str(idx_path),
    blocks=True,
    meta={'docno': 60},
    # stopwords=None,
    # stemmer=None,
).index(dataset.get_corpus_iter(), fields=["text"])

beir/msmarco documents:   0%|          | 0/8841823 [00:00<?, ?it/s]

ValueError: Index already exists at /Users/tomighita/Scoala/Facultate/University-Courses/RP/indices/msmarco/data.properties

In [7]:
from pyterrier.measures import RR, nDCG, MAP

index = pt.IndexFactory.of(str(idx_path))

bm25 = pt.BatchRetrieve(index, wmodel="BM25")
rm3 = pt.rewrite.RM3(index)
testset = pt.get_dataset(TESTSET_NAME)
pt.Experiment(
    [bm25],
    testset.get_topics('text'),
    testset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
)

Unnamed: 0,name,RR@10,nDCG@10,AP@100
0,BR(BM25),0.519579,0.274333,0.164399


In [8]:
from fast_forward.encoder import TCTColBERTQueryEncoder, TCTColBERTDocumentEncoder
import torch

q_encoder = TCTColBERTQueryEncoder("castorini/tct_colbert-msmarco")
d_encoder = TCTColBERTDocumentEncoder(
    "castorini/tct_colbert-msmarco",
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)
q_encoder(["Test query 1", "Test query 2"])

array([[-0.0380525 ,  0.01848466,  0.05137944, ..., -0.04796502,
         0.00918062, -0.03880693],
       [-0.06809073,  0.02582865,  0.09803923, ..., -0.09031374,
         0.00014139, -0.06282968]], dtype=float32)

In [9]:
from fast_forward import OnDiskIndex, Mode

ff_index = OnDiskIndex.load(
    Path(INDEX_PATH), query_encoder=q_encoder, mode=Mode.MAXP
)
# ff_index = ff_index.to_memory()


  0%|          | 0/8841823 [00:00<?, ?it/s][A
  1%|          | 72027/8841823 [00:00<00:13, 633902.04it/s][A
  2%|▏         | 198863/8841823 [00:00<00:11, 730233.56it/s][A
  4%|▍         | 351779/8841823 [00:00<00:08, 1026211.90it/s][A
  5%|▌         | 460627/8841823 [00:00<00:09, 850351.59it/s] [A
  6%|▋         | 561880/8841823 [00:00<00:11, 717953.66it/s][A
  8%|▊         | 699051/8841823 [00:00<00:09, 872462.35it/s][A
  9%|▉         | 814865/8841823 [00:01<00:10, 734660.43it/s][A
 11%|█         | 982380/8841823 [00:01<00:08, 944561.55it/s][A
 13%|█▎        | 1118494/8841823 [00:01<00:07, 1045263.80it/s][A
 14%|█▍        | 1236612/8841823 [00:01<00:09, 813766.35it/s] [A
 16%|█▌        | 1373752/8841823 [00:01<00:07, 934475.44it/s][A
 17%|█▋        | 1483508/8841823 [00:01<00:08, 886229.47it/s][A
 18%|█▊        | 1583480/8841823 [00:02<00:12, 602163.27it/s][A
 20%|██        | 1775626/8841823 [00:02<00:08, 841413.68it/s][A
 22%|██▏       | 1957952/8841823 [00:02<00:06, 

In [10]:
from fast_forward.util.pyterrier import FFScore
from fast_forward.util.pyterrier import FFInterpolate

ff_score = FFScore(ff_index)
candidates = (bm25 % 5)(testset.get_topics('text')) # Get the candidates
re_ranked = ff_score(candidates)
ff_int = FFInterpolate(alpha=0.1)
ff_int(re_ranked)

Unnamed: 0,qid,docno,query,score
0,1108939,4069373,what slows down the flow of blood,65.616525
1,1108939,4744533,what slows down the flow of blood,65.808511
2,1108939,7454708,what slows down the flow of blood,65.389526
3,1108939,7724054,what slows down the flow of blood,63.709970
4,1108939,841975,what slows down the flow of blood,65.060473
...,...,...,...,...
245,88495,6579243,causes of stroke,64.547018
246,88495,7088443,causes of stroke,63.961109
247,88495,7112879,causes of stroke,64.208614
248,88495,841692,causes of stroke,65.046491


In [19]:
import re

def _remove_pollution(q) -> str:
    q_old = q["query"].replace('applypipeline:off', '')
    return q["query_1"] + " " + re.sub(r'\^(\d)+\.(\d)+', '', q_old)

def _return_to_init_query(q) -> str:
    return q["query_2"]

pipeline = bm25 % 5 >> rm3 >> pt.apply.query(_remove_pollution) >> bm25 % 1000

pipeline(testset.get_topics('text'))

Unnamed: 0,qid,docid,docno,rank,score,query_1,query_0,query
0,1049519,6921021,6921021,0,57.538152,who said no one can make you feel inferior,applypipeline:off insecur^0.018691590 person^0.019976636 feel^0.257943928 ly^0.018691590 complex^0.040654209 inferior^0.243691593 can^0.150000006 who^0.179906547 accomplish^0.018691590 peopl^0.027453272 sai^0.024299067,who said no one can make you feel inferior insecur person feel ly complex inferior can who accomplish peopl sai
1,1049519,8279758,8279758,1,43.846931,who said no one can make you feel inferior,applypipeline:off insecur^0.018691590 person^0.019976636 feel^0.257943928 ly^0.018691590 complex^0.040654209 inferior^0.243691593 can^0.150000006 who^0.179906547 accomplish^0.018691590 peopl^0.027453272 sai^0.024299067,who said no one can make you feel inferior insecur person feel ly complex inferior can who accomplish peopl sai
2,1049519,6921018,6921018,2,43.068182,who said no one can make you feel inferior,applypipeline:off insecur^0.018691590 person^0.019976636 feel^0.257943928 ly^0.018691590 complex^0.040654209 inferior^0.243691593 can^0.150000006 who^0.179906547 accomplish^0.018691590 peopl^0.027453272 sai^0.024299067,who said no one can make you feel inferior insecur person feel ly complex inferior can who accomplish peopl sai
3,1049519,1316185,1316185,3,40.086729,who said no one can make you feel inferior,applypipeline:off insecur^0.018691590 person^0.019976636 feel^0.257943928 ly^0.018691590 complex^0.040654209 inferior^0.243691593 can^0.150000006 who^0.179906547 accomplish^0.018691590 peopl^0.027453272 sai^0.024299067,who said no one can make you feel inferior insecur person feel ly complex inferior can who accomplish peopl sai
4,1049519,5355117,5355117,4,37.014415,who said no one can make you feel inferior,applypipeline:off insecur^0.018691590 person^0.019976636 feel^0.257943928 ly^0.018691590 complex^0.040654209 inferior^0.243691593 can^0.150000006 who^0.179906547 accomplish^0.018691590 peopl^0.027453272 sai^0.024299067,who said no one can make you feel inferior insecur person feel ly complex inferior can who accomplish peopl sai
...,...,...,...,...,...,...,...,...
49995,966413,7506828,7506828,995,21.800356,where are the benefits of cinnamon as a supplement,applypipeline:off lower^0.018758623 help^0.018758623 blood^0.026206898 benefit^0.265379339 sugar^0.026206898 studi^0.018758623 health^0.056000002 cinnamon^0.326896548 supplement^0.228137955 honei^0.014896553,where are the benefits of cinnamon as a supplement lower help blood benefit sugar studi health cinnamon supplement honei
49996,966413,2089398,2089398,996,21.799519,where are the benefits of cinnamon as a supplement,applypipeline:off lower^0.018758623 help^0.018758623 blood^0.026206898 benefit^0.265379339 sugar^0.026206898 studi^0.018758623 health^0.056000002 cinnamon^0.326896548 supplement^0.228137955 honei^0.014896553,where are the benefits of cinnamon as a supplement lower help blood benefit sugar studi health cinnamon supplement honei
49997,966413,1481171,1481171,997,21.798988,where are the benefits of cinnamon as a supplement,applypipeline:off lower^0.018758623 help^0.018758623 blood^0.026206898 benefit^0.265379339 sugar^0.026206898 studi^0.018758623 health^0.056000002 cinnamon^0.326896548 supplement^0.228137955 honei^0.014896553,where are the benefits of cinnamon as a supplement lower help blood benefit sugar studi health cinnamon supplement honei
49998,966413,8054508,8054508,998,21.792947,where are the benefits of cinnamon as a supplement,applypipeline:off lower^0.018758623 help^0.018758623 blood^0.026206898 benefit^0.265379339 sugar^0.026206898 studi^0.018758623 health^0.056000002 cinnamon^0.326896548 supplement^0.228137955 honei^0.014896553,where are the benefits of cinnamon as a supplement lower help blood benefit sugar studi health cinnamon supplement honei


In [20]:
exp_results_dir = Path("exp_results")
exp_results_dir.mkdir(exist_ok=True)

result = pt.Experiment(
    [
        bm25,
        bm25 % 5 >> rm3 >> bm25,
        bm25 % 1 >> rm3 >> bm25,
        bm25 % 1000 >> ff_score >> ff_int,
        pipeline >> ff_score >> ff_int,
        pipeline >> pt.apply.query(_return_to_init_query) >> ff_score >> ff_int
    ],
    testset.get_topics('text'),
    testset.get_qrels(),
    eval_metrics=[RR(rel=2) @ 10, nDCG @ 10, MAP @ 100],
    names=[
        "BM25",
        "RM3",
        "RM3 % 1",
        "BM25 >> FF",
        "BM25 >> RM3 >> FF",
        "BM25 >> RM3 >> FF (og q)"
    ]
)
result

Unnamed: 0,name,RR(rel=2)@10,nDCG@10,AP@100
0,BM25,0.415056,0.274333,0.164399
1,RM3,0.389222,0.27087,0.178831
2,RM3 % 1,0.414468,0.292751,0.184966
3,BM25 >> FF,0.551,0.39035,0.238197
4,BM25 >> RM3 >> FF,0.395278,0.285421,0.180305
5,BM25 >> RM3 >> FF (og q),0.431389,0.327697,0.204134


In [11]:
import csv   

name_to_csv = {
    "BM25": "results/BM25.csv",
    "RM3": "results/RM3.csv",
    "RM3 % 1": "results/RM3_1.csv",
    "BM25 >> FF": "results/BM25_FF.csv",
    "BM25 >> RM3 >> FF": "results/BM25_RM3_FF.csv",
}

for index, row in result.iterrows():
    with open(name_to_csv[row['name']], 'a') as f:
        writer = csv.writer(f)
        changed_row = [TESTSET_NAME, row.iloc[1], row.iloc[2], row.iloc[3]]
        writer.writerow(changed_row)