In [1]:
pip install -q python-terrier

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch

torch.cuda.is_available()

True

In [4]:
import pyterrier as pt
from sentence_transformers import SentenceTransformer
import pandas as pd
import os
from pathlib import Path
import numpy as np
from pyterrier.measures import RR, nDCG, MAP
from fast_forward.index import OnDiskIndex, Mode
from fast_forward.util import Indexer
from fast_forward.util.pyterrier import FFScore, FFInterpolate
from fast_forward.encoder.base import Encoder
from abc import ABC, abstractmethod

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
# Initialize PyTerrier
if not pt.started():
    pt.init()

BASE_DIR= Path.cwd()

# Load TREC-COVID dataset
dataset = pt.get_dataset("irds:cord19/trec-covid")

# Indexing
index_loc = "./index_path"
if not os.path.exists(os.path.join(index_loc, "data.properties")):
    indexer = pt.IterDictIndexer(index_loc)
    indexref = indexer.index(dataset.get_corpus_iter(), fields=("title", "abstract"), meta=["docno", "title"])
else:
    indexref = pt.IndexRef.of(os.path.join(index_loc, "data.properties"))

count = sum(1 for _ in dataset.get_corpus_iter())
print("Number of documents:", count)

# Baseline retrieval using BM25
bm25 = pt.BatchRetrieve(indexref, wmodel="BM25", metadata=["docno", "title"])

  if not pt.started():
cord19/trec-covid documents: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 192509/192509 [00:01<00:00, 182486.54it/s]

Number of documents: 192509



  bm25 = pt.BatchRetrieve(indexref, wmodel="BM25", metadata=["docno", "title"])


In [7]:
# Encoder class definition
class MiniLMEncoder(Encoder):
    def __init__(self, model_name="sentence-transformers/msmarco-MiniLM-L6-cos-v5"):
        self.model = SentenceTransformer(model_name)

    def _encode(self, texts: "Sequence[str]") -> "np.ndarray":
        """Encodes texts into embeddings using MiniLM."""
        return np.array(self.model.encode(texts, convert_to_numpy=True))

In [8]:
# Helper function
def docs_iter():
    for d in dataset.get_corpus_iter():
        yield {
            "doc_id": d["docno"],  # Map docno → doc_id for Fast-Forward
            "text": f"{d['title']} {d.get('abstract', '')}".strip()
        }

# FF Score
ff_index_path = BASE_DIR / "ffindex_cord19_minilm.h5"
if ff_index_path.exists(): # Use existing index
    ff_index = OnDiskIndex.load(
        ff_index_path,
        query_encoder=MiniLMEncoder(),
        mode=Mode.MAXP, # Super important that you use Mode.MAXP. Apparently this tells FFIndex to use all the text.
    )
else: # Create new one if it isn't already present
    ff_index = OnDiskIndex(
        ff_index_path,
        query_encoder=MiniLMEncoder(),
        mode=Mode.MAXP,
    )
    
    print("Indexing documents with MiniLM...")
    Indexer(ff_index, MiniLMEncoder(), batch_size=8).from_dicts(docs_iter())

print("Number of documents in index:", len(ff_index))
ff_index = ff_index.to_memory()
 
#%% Defining FF index reranker using MiniLM
ff_score = FFScore(ff_index)

Indexing documents with MiniLM...


0it [00:00, ?it/s]
8it [00:00, 21.32it/s]ments:   0%|                                                                                                                                                    | 0/192509 [00:00<?, ?it/s]
32it [00:00, 75.98it/s]ents:   0%|                                                                                                                                          | 8/192509 [00:00<2:29:26, 21.47it/s]
56it [00:00, 114.83it/s]nts:   0%|                                                                                                                                           | 32/192509 [00:00<42:02, 76.32it/s]
80it [00:00, 147.56it/s]nts:   0%|                                                                                                                                          | 56/192509 [00:00<27:51, 115.13it/s]
104it [00:00, 169.12it/s]ts:   0%|                                                                                                           

In [28]:
# Hyperparameter Tuning
grid_search = pt.GridSearch(
    bm25 % 100 >> ff_score >> ff_int,
    {ff_int: {"alpha": np.arange(0,1,0.1)}}, 
    dataset.get_topics('title'),
    dataset.get_qrels(),
    "map",
    verbose=True,
)

GridScan: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:16<00:00,  1.60s/it]

Best map is 0.077905
Best setting is ['<fast_forward.util.pyterrier.FFInterpolate object at 0x7f093834c580> alpha=0.1']





In [30]:
# Initialize alpha as best value
ff_int = FFInterpolate(alpha=0.064010) #title:0.077905, description: 0.085003, narrative: 0.064010

# Apply MiniLM as a re-ranker
pipeline = bm25 % 100 >> ff_score >> ff_int
topic = 'narrative'
print(f"\nRunning experiment with alpha={ff_int.alpha}... for topic: {topic}")

# Run experiment comparing BM25 and BM25 + MiniLM
experiment = pt.Experiment(
    [bm25 % 100, pipeline % 100],
    dataset.get_topics(topic),
    dataset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
    names=["BM25", "BM25 + MiniLM"]
)

# Print results
print(experiment)


Running experiment with alpha=0.06401... for topic: narrative
            name     RR@10   nDCG@10    AP@100
0           BM25  0.724167  0.518681  0.060577
1  BM25 + MiniLM  0.784833  0.571531  0.064482
