In [None]:
import torch
import random
import numpy as np
import os

# Single seed shared by every random number generator configured in this cell.
SEED = 42

# seed for Python's random library
random.seed(SEED)

# setting seed for NumPy (legacy global random state)
np.random.seed(SEED)

# setting seed for PyTorch: the default generator first, then all CUDA devices
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  

# Make PyTorch operations deterministic: request deterministic cuDNN kernels
# and disable the cuDNN benchmark autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Export PYTHONHASHSEED so hash randomization is pinned.
# NOTE(review): CPython reads this variable at interpreter start-up, so assigning
# it here presumably only affects child processes, not the running kernel.
# Confirm, and set it before launching the kernel if string-hash order matters.
os.environ['PYTHONHASHSEED'] = str(SEED)

In [None]:
# installing dependencies & import libraries
%pip install python-terrier sentence-transformers tqdm pandas

import pyterrier as pt
import pandas as pd
import re
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Point PyTerrier at a local JDK.
# NOTE: this path is machine-specific; adjust it to your own JDK install.
os.environ["JAVA_HOME"] = "C:/Program Files/Java/jdk-21"
# Prepend the JDK's bin directory using the platform's PATH separator
# (";" on Windows, ":" on POSIX). A hard-coded ":" fuses the JDK entry with the
# first existing PATH entry on Windows, so neither of them resolves.
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin" + os.pathsep + os.environ["PATH"]

# Start the Terrier JVM only once, so re-running this cell is harmless.
if not pt.started():
    pt.init()

In [None]:
#  parsing TREC format files

def parse_trec_corpus(filepath):
    """Parse a TREC-style corpus file into a list of document records.

    The file is read in full as UTF-8. Every ``<DOC>...</DOC>`` section yields
    one record; its ``<DOCNO>`` value is required, while a missing ``<BODY>``
    results in an empty text.

    Args:
        filepath: Path of the corpus file.

    Returns:
        A list of ``{"docno": str, "text": str}`` dicts in file order, with
        surrounding whitespace stripped from both values.

    Raises:
        AttributeError: If a ``<DOC>`` section has no ``<DOCNO>...</DOCNO>``
            on a single line.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        raw = handle.read()

    records = []
    for doc_match in re.finditer(r"<DOC>(.*?)</DOC>", raw, re.S):
        section = doc_match.group(1)
        # DOCNO is matched without re.S, so it must not span several lines.
        docno = re.search(r"<DOCNO>(.*?)</DOCNO>", section).group(1).strip()
        body_hit = re.search(r"<BODY>(.*?)</BODY>", section, re.S)
        text = "" if body_hit is None else body_hit.group(1).strip()
        records.append({"docno": docno, "text": text})
    return records

def parse_trec_queries(filepath):
    """Parse a TREC-style topics file into a list of query records.

    Every ``<top>...</top>`` section must contain ``<num>...</num>`` (on one
    line) and ``<title>...</title>`` (may span lines).

    Args:
        filepath: Path of the topics file, read as UTF-8.

    Returns:
        A list of ``{"qid": str, "query": str}`` dicts in file order, with
        surrounding whitespace stripped from both values.

    Raises:
        AttributeError: If a topic lacks either of the two tags.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        raw = handle.read()

    def _field(pattern, section, flags=0):
        # First capture group of the pattern, trimmed; fails if there is no match.
        return re.search(pattern, section, flags).group(1).strip()

    return [
        {
            "qid": _field(r"<num>(.*?)</num>", topic),
            "query": _field(r"<title>(.*?)</title>", topic, re.S),
        }
        for topic in re.findall(r"<top>(.*?)</top>", raw, re.S)
    ]

def parse_qrels(filepath):
    """Read whitespace-separated relevance judgments from a qrels file.

    Only lines with exactly four fields (qid, an ignored second field, docno,
    label) are kept; every other line is skipped.

    Args:
        filepath: Path of the qrels file, read as UTF-8.

    Returns:
        A list of ``{"qid": str, "docno": str, "label": int}`` dicts in file order.

    Raises:
        ValueError: If the fourth field of a four-field line is not an integer.
    """
    judgments = []
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.split()
            if len(fields) != 4:
                continue
            qid, _unused, docno, label = fields
            judgments.append({"qid": qid, "docno": docno, "label": int(label)})
    return judgments

In [None]:
# Load files
# All paths are relative to the notebook's working directory.
# corpus: list of {"docno", "text"} dicts; queries: lists of {"qid", "query"} dicts;
# qrels: list of {"qid", "docno", "label"} dicts (see the parser functions).
corpus = parse_trec_corpus("../data/Baseline_Corpus.trec")
train_queries = parse_trec_queries("../data/Train_query.trec")
test_queries = parse_trec_queries("../data/Test_query.trec")
qrels = parse_qrels("../data/QRels_Train.txt")

# Quick sanity check on how much was parsed from each file.
print(f"Corpus size: {len(corpus)}")
print(f"Train queries: {len(train_queries)}, Test queries: {len(test_queries)}")
print(f"Qrels entries: {len(qrels)}")

In [None]:
# Build BM25 Index
index_path = "./index"

# Resolve to an absolute path and make sure the directory exists.
index_path = os.path.abspath(index_path)
os.makedirs(index_path, exist_ok=True)

# overwrite=True rebuilds the index on every run; without it, re-running this
# cell (or Restart & Run All) fails once an index already exists in index_path.
indexer = pt.IterDictIndexer(index_path, overwrite=True)
# NOTE(review): the indexer's default metadata length for "docno" may truncate
# long identifiers. Confirm the corpus docnos fit, or pass an explicit meta size.
indexref = indexer.index(({"docno": d["docno"], "text": d["text"]} for d in corpus))

# BM25 retriever returning the top 100 candidates per query for the reranker.
index = pt.IndexFactory.of(indexref)
bm25 = pt.BatchRetrieve(index, wmodel="BM25", num_results=100, controls={"parse.controls": "false"})

In [None]:
# BERT Reranker
# Sentence-embedding model used by rerank_with_bert; created once here so the
# function does not build it on every call.
model = SentenceTransformer("all-mpnet-base-v2")

def rerank_with_bert(query, docs, top_k=10):
    """Rerank candidate documents by cosine similarity to the query embedding.

    Args:
        query: Query text to embed with the module-level ``model``.
        docs: List of dicts with at least ``"docno"`` and ``"text"`` keys.
        top_k: Maximum number of documents to return.

    Returns:
        A list of ``{"docno", "score", "rank"}`` dicts, best first, where
        ``score`` is the cosine similarity as a Python float and ``rank``
        starts at 1. Returns an empty list when ``docs`` is empty.
    """
    # Nothing to rank: return before touching the encoder, which is not meant
    # to be fed an empty batch.
    if not docs:
        return []

    query_emb = model.encode(query, convert_to_tensor=True)
    doc_embs = model.encode([doc["text"] for doc in docs], convert_to_tensor=True)
    # cos_sim returns a matrix with one row for the single query; take that row.
    scores = util.cos_sim(query_emb, doc_embs)[0].cpu().numpy()
    ranked = sorted(zip(docs, scores), key=lambda pair: pair[1], reverse=True)
    return [
        {"docno": doc["docno"], "score": float(score), "rank": rank}
        for rank, (doc, score) in enumerate(ranked[:top_k], 1)
    ]

In [None]:
# retrieve & rerank
# One record per (query, reranked document) is appended to this list by the loop below in this cell.
bm25_reranked_results = []
# docno -> document text, so BM25 hits can be paired with their text
doc_lookup = {d["docno"]: d["text"] for d in corpus}  # for BERT reranker

def clean_query(q):
    """Replace every character that is neither a word character nor whitespace with a space."""
    non_word = re.compile(r"[^\w\s]")
    return non_word.sub(" ", q)

for q in tqdm(train_queries, desc="Processing queries"):
    qid = q["qid"]
    raw_query = q["query"]
    # Punctuation is replaced by spaces before the query is sent to BM25.
    # NOTE(review): the same cleaned text is also what the reranker embeds;
    # confirm that is intended rather than embedding raw_query.
    query = clean_query(raw_query)

    # Stage 1: BM25 candidates as a list of row dicts.
    bm25_res = bm25.search(query).to_dict('records')

    # Attach document text; hits whose docno is unknown to doc_lookup are dropped.
    docs_with_text = [
        {"docno": res["docno"], "text": doc_lookup[res["docno"]]}
        for res in bm25_res
        if res["docno"] in doc_lookup
    ]

    # Queries without any usable candidate contribute no rows.
    if not docs_with_text:
        continue

    # Stage 2: embedding-based reranking, keeping the 10 best documents.
    reranked = rerank_with_bert(query, docs_with_text, top_k=10)

    for rank, r in enumerate(reranked, start=1):
        bm25_reranked_results.append({
            "qid": qid,
            "docno": r["docno"],
            "rank": rank,
            "score": r["score"]
        })

# Explicit columns keep the frame well-formed even when no query produced
# results; otherwise later lookups such as results_df["qid"] raise KeyError.
results_df = pd.DataFrame(bm25_reranked_results, columns=["qid", "docno", "rank", "score"])
print(f"Results DataFrame size: {results_df.shape}")
print(results_df.head())

In [None]:
# Relevance judgments as a DataFrame; columns come from the dict keys produced
# by parse_qrels (qid, docno, label).
qrels_df = pd.DataFrame(qrels)

# Preview the first rows.
print(qrels_df.head())

In [None]:
# Metric Calculations (MAP, nDCG@10, P@10, MRR)

def dcg(scores):
    """Discounted cumulative gain of relevance values listed in rank order.

    Uses linear gain: the value at 0-based position ``idx`` contributes
    ``rel / log2(idx + 2)``. Returns 0 for an empty sequence.
    """
    return sum(rel / np.log2(idx + 2) for idx, rel in enumerate(scores))

def ndcg(ranked_list, ground_truth, k=10):
    """Normalized DCG at cutoff ``k``.

    Args:
        ranked_list: Document ids, best first.
        ground_truth: Mapping of document id to graded relevance label;
            documents absent from the mapping count as 0.
        k: Rank cutoff.

    Returns:
        DCG of the top ``k`` ranked documents divided by the DCG of the ideal
        ordering of the judged labels. Returns 0.0 when the ideal DCG is not
        positive (no judgments, or no judged document with a positive label),
        instead of dividing by zero and producing NaN.
    """
    rels = [ground_truth.get(doc, 0) for doc in ranked_list[:k]]
    ideal_rels = sorted(ground_truth.values(), reverse=True)[:k]
    ideal_dcg = dcg(ideal_rels)
    # Covers the empty case as well as all-zero labels, where a 0/0 division
    # would yield NaN and poison the mean over queries.
    if not ideal_dcg > 0:
        return 0.0
    return dcg(rels) / ideal_dcg

def precision_at_k(ranked_list, ground_truth, k):
    """Fraction of the top ``k`` positions holding a relevant document.

    A document is relevant when its label in ``ground_truth`` is greater than
    0. The denominator is always ``k``, even if fewer than ``k`` documents
    were ranked.
    """
    relevant_in_top_k = sum(
        1 for doc in ranked_list[:k] if ground_truth.get(doc, 0) > 0
    )
    return relevant_in_top_k / k

def average_precision(ranked_list, ground_truth):
    """Average precision of one ranked list.

    Args:
        ranked_list: Document ids, best first.
        ground_truth: Mapping of document id to relevance label; a label
            greater than 0 means relevant.

    Returns:
        The sum of precision values at each rank holding a relevant document,
        divided by the total number of relevant documents in ``ground_truth``.
        Normalizing by that total (rather than by the number of relevant
        documents that happened to be retrieved) keeps a truncated ranking
        from being scored as if it had found everything. Returns 0.0 when
        there are no relevant documents.
    """
    total_relevant = sum(1 for label in ground_truth.values() if label > 0)
    if total_relevant == 0:
        return 0.0

    hits = 0
    sum_precisions = 0.0
    for position, doc in enumerate(ranked_list, start=1):
        if ground_truth.get(doc, 0) > 0:
            hits += 1
            sum_precisions += hits / position
    return sum_precisions / total_relevant

def reciprocal_rank(ranked_list, ground_truth):
    """Reciprocal of the 1-based rank of the first relevant document.

    A document is relevant when its label in ``ground_truth`` is greater than
    0. Returns 0.0 when no ranked document is relevant.
    """
    first_hit = next(
        (
            position
            for position, doc in enumerate(ranked_list, start=1)
            if ground_truth.get(doc, 0) > 0
        ),
        None,
    )
    return 0.0 if first_hit is None else 1.0 / first_hit

# Per-query metric values, keyed by metric name.
metrics = {"map": [], "ndcg": [], "P@5": [], "P@10": [], "recip_rank": []}

for qid in results_df["qid"].unique():
    # Documents retrieved for this query, highest score first.
    query_results = results_df[results_df["qid"] == qid]
    ranked_docs = query_results.sort_values(by="score", ascending=False)["docno"].tolist()
    # docno -> relevance label for this query.
    query_qrels = qrels_df[qrels_df["qid"] == qid]
    gt = query_qrels.set_index("docno")["label"].to_dict()

    per_query = {
        "map": average_precision(ranked_docs, gt),
        "ndcg": ndcg(ranked_docs, gt, k=10),
        "P@10": precision_at_k(ranked_docs, gt, k=10),
        "recip_rank": reciprocal_rank(ranked_docs, gt),
        "P@5": precision_at_k(ranked_docs, gt, k=5),
    }
    for metric_name, value in per_query.items():
        metrics[metric_name].append(value)


# Report the mean of each metric over the evaluated queries.
for label, metric_name in (
    ("MAP:", "map"),
    ("nDCG@10:", "ndcg"),
    ("P@10:", "P@10"),
    ("P@5:", "P@5"),
    ("MRR:", "recip_rank"),
):
    print(label, np.mean(metrics[metric_name]))

In [None]:
# saving final results for submission (qid, docno, rank, score)

# Sort results by qid, then by score descending
results_df = results_df.sort_values(
    by=["qid", "score"], 
    ascending=[True, False]
)

# Fix the column order of the submission file.
final_df = results_df[["qid", "docno", "rank", "score"]]

# Save as CSV under ../output (relative to the working directory).
# The folder is created first because to_csv does not create missing directories.
output_path = "../output/bm25_bert_submission.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
final_df.to_csv(output_path, index=False)

print(f"Submission file saved at: {output_path}")
