In [1]:
import torch
from transformers import BertTokenizer, BertModel
from ReRank import ReRank
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import pyterrier as pt

# Ask the TIRA helper to make sure PyTerrier is loaded before any `pt.*` call below.
ensure_pyterrier_is_loaded()
# Create a REST client to the TIRA platform for retrieving the pre-indexed data.
tira = Client()

In [None]:
# Corpus: the union of the IR Anthology and the ACL Anthology, looked up
# through PyTerrier's ir_datasets ("irds:") integration.
DATASET_ID = 'irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training'
pt_dataset = pt.get_dataset(DATASET_ID)

# Pre-built PyTerrier index for that corpus, obtained via the TIRA client.
INDEX_APPROACH = 'ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)'
index = tira.pt.index(INDEX_APPROACH, pt_dataset)

In [None]:
# Retriever over the pre-built index, configured with the BM25 weighting model.
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

In [None]:
# Fetch the 'text' variant of the topics and retrieve results for them with BM25.
topics = pt_dataset.get_topics('text')
run = bm25(topics)

print('Done. Here are the first 10 entries of the run')
# Last expression of the cell, so the notebook renders it as a table.
run.head(10)

In [None]:
# Evaluate the BM25 run against the dataset's relevance judgments.
qrels_df = pt_dataset.get_qrels()
# Stored as `eval_metrics` rather than `eval`: rebinding `eval` would shadow
# Python's built-in eval() for the rest of the kernel session.
eval_metrics = pt.Evaluate(run, qrels_df, metrics=["map", "ndcg", "ndcg_cut.10", "recip_rank", "recall_100"])
print("Evaluation Metrics:")
print(eval_metrics)

In [None]:
# Filter the run down to judged results only.
# Relevance judgments are made per (query, document) pair, so the filter must
# match on both columns: matching on docno alone would keep a document in the
# ranking of a query for which it was never judged, merely because some other
# query has a judgment for it.
qrels_df = pt_dataset.get_qrels()
# Kept for any later cell that still refers to this name.
judged_docnos = qrels_df['docno'].unique()
judged_pairs = qrels_df.set_index(['qid', 'docno']).index
is_judged = run.set_index(['qid', 'docno']).index.isin(judged_pairs)
filtered_run = run[is_judged]

# Evaluate the judged-only run.
# Stored as `eval_metrics` rather than `eval` so the built-in eval() is not shadowed.
eval_metrics = pt.Evaluate(filtered_run, qrels_df, metrics=["map", "ndcg", "ndcg_cut.10", "recip_rank", "recall_100"])
print("Evaluation Metrics:")
print(eval_metrics)

In [None]:
# Hand the run to TIRA's persist_and_normalize_run helper, with '../runs' as the default output location.
# NOTE(review): `run` is the plain BM25 result from above, yet it is stored under
# the system name 'bm25-rerank' — confirm that this is the run meant to be persisted.
persist_and_normalize_run(run, system_name='bm25-rerank', default_output='../runs')

In [2]:
# Define a function to encode queries and documents using BERT
def encode_text(texts, tokenizer, model):
    """Embed a batch of texts by mean-pooling the encoder's last hidden states.

    Padding positions are excluded from the average, so the embedding of a
    text does not depend on how long the other texts in the same batch are.

    Args:
        texts: List of strings to embed.
        tokenizer: Callable invoked as
            ``tokenizer(texts, padding=True, truncation=True, return_tensors="pt")``.
            Its result is unpacked as keyword arguments into ``model`` and is
            expected to offer ``.get("attention_mask")``.
        model: Callable whose result exposes ``last_hidden_state``, indexed as
            (batch, sequence, hidden).

    Returns:
        A tensor with one row per input text: the average of
        ``last_hidden_state`` over the sequence positions whose attention
        mask is non-zero. The tensor is detached from autograd.
    """
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    hidden_states = outputs.last_hidden_state

    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask available: fall back to a plain average over all positions.
        return hidden_states.mean(dim=1)

    # Zero out padding positions, then divide by the number of real tokens.
    weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
    token_counts = weights.sum(dim=1).clamp(min=1)  # guard against an all-padding row
    return (hidden_states * weights).sum(dim=1) / token_counts

# Load the pretrained tokenizer and the matching BERT encoder
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Toy inputs: one query and three candidate documents
query = "example query text"
documents = ["document text 1", "document text 2", "document text 3"]

# Embed the query (as a batch of one) and all candidate documents
query_embedding = encode_text([query], tokenizer, model)
document_embeddings = encode_text(documents, tokenizer, model)

# Score every document by its cosine similarity to the query embedding
cosine_similarity = torch.nn.functional.cosine_similarity
scores = cosine_similarity(query_embedding, document_embeddings)

# Order the documents from the highest to the lowest score
ranked_indices = torch.argsort(scores, descending=True)
ranked_documents = [documents[position] for position in ranked_indices.tolist()]

# Print the reranked documents, numbering ranks from 1
for rank, doc in enumerate(ranked_documents, 1):
    print(f"Rank {rank}: {doc}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



Rank 1: document text 1
Rank 2: document text 3
Rank 3: document text 2
