# BERT-based DENSE Retreival System

In [1]:
#install libaries
!pip3 install transformers faiss-cpu datasets
!pip3 install rank_bm25
!pip3 install tira ir-datasets python-terrier nltk
!pip3 install torch tensorflow

[0m

In [2]:
#import libaries

from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
import faiss
from rank_bm25 import BM25Okapi
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import torch
import pyterrier as pt
import json
import gzip
import pandas as pd
import numpy as np
from sklearn.metrics import average_precision_score



In [3]:
#load pre-trained BERT model and tokenizer
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [4]:
#Create a REST client to the TIRA platform for retrieving the pre-indexed data.
ensure_pyterrier_is_loaded()
tira = Client()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [5]:
# Load documents from JSONL.GZ
def load_jsonl_gz(file_path):
    documents = []
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line.strip())
            documents.append(data['text'])  # Adjust the key if necessary
    return documents

In [6]:
# Load queries from JSONL
def load_jsonl(file_path):
    queries = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line.strip())
            queries.append(data['query'])  # Adjust the key if necessary
    return queries

In [7]:
# Load the documents and queries
documents_file_path = '/workspaces/ir-lab-sose-2024-ir-sose-24-8/.tira/extracted_datasets/ir-lab-sose-2024/ir-acl-anthology-20240504-training/input-data/documents.jsonl.gz'
queries_file_path = '/workspaces/ir-lab-sose-2024-ir-sose-24-8/.tira/extracted_datasets/ir-lab-sose-2024/ir-acl-anthology-20240504-training/input-data/queries.jsonl'

documents = load_jsonl_gz(documents_file_path)
documents = documents[:300]
test_queries = load_jsonl(queries_file_path)

print(f"Loaded {len(documents)} documents")
print(f"Loaded {len(test_queries)} queries")

Loaded 300 documents
Loaded 68 queries


In [8]:
# Define the retrieval function
# encode the documents
def encode_documents(documents):
    inputs = tokenizer(documents, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)
    return embeddings

document_embeddings = encode_documents(documents)

In [9]:
#implement faiss index
index = faiss.IndexFlatL2(document_embeddings.shape[1])
index.add(document_embeddings.numpy())

In [10]:
def retrieve(query, top_k=500):
    query_embedding = encode_documents([query]).numpy()
    _, faiss_indices = index.search(query_embedding, top_k)
    bm25_scores = bm25.get_scores(query.split())
    bm25_indices = bm25_scores.argsort()[-top_k:][::-1]
    combined_indices = list(set(faiss_indices[0]).union(set(bm25_indices)))
    combined_scores = [(i, bm25_scores[i] + faiss_indices[0].tolist().index(i) if i in faiss_indices[0] else 0) for i in combined_indices]
    combined_scores.sort(key=lambda x: x[1], reverse=True)
    return [documents[i] for i, _ in combined_scores[:top_k]]

In [11]:
# Implement BM25
tokenized_corpus = [doc.split(" ") for doc in documents]
bm25 = BM25Okapi(tokenized_corpus)

In [12]:
# Define the evaluation function
def evaluate_retrieval(retrieval_func, test_queries, top_k=500):
    precisions = []
    avg_precisions = []
    reciprocal_ranks = []
    
    for query in test_queries:
        retrieved_docs = retrieval_func(query, top_k)
        relevance = [1 if any(word in doc.lower() for word in query.lower().split()) else 0 for doc in retrieved_docs]
        precision = sum(relevance) / len(relevance)
        avg_precision = average_precision_score([1] * sum(relevance) + [0] * (len(relevance) - sum(relevance)), relevance)
        first_relevant = next((i for i, rel in enumerate(relevance) if rel == 1), None)
        reciprocal_rank = 1 / (first_relevant + 1) if first_relevant is not None else 0
        precisions.append(precision)
        avg_precisions.append(avg_precision)
        reciprocal_ranks.append(reciprocal_rank)
    
    return {
        'MAP': np.mean(avg_precisions),
        'P_300': np.mean(precisions),
        'recip_rank': np.mean(reciprocal_ranks)
    }

In [13]:
# Evaluate BERT + BM25 retrieval
combined_scores = evaluate_retrieval(retrieve, test_queries)



In [14]:
# Evaluate BM25 only
def bm25_retrieve(query, top_k=500):
    bm25_scores = bm25.get_scores(query.split())
    top_indices = bm25_scores.argsort()[-top_k:][::-1]
    return [documents[i] for i in top_indices]

bm25_scores = evaluate_retrieval(bm25_retrieve, test_queries)
# Create a DataFrame for displaying the results
df = pd.DataFrame([
    {
        'name': 'BERT+BM25', 
        'map': combined_scores['MAP'], 
        'recip_rank': combined_scores['recip_rank'],
        'P_300': combined_scores['P_300']
    },
    {
        'name': 'BM25', 
        'map': bm25_scores['MAP'], 
        'recip_rank': bm25_scores['recip_rank'],
        'P_300': bm25_scores['P_300']
    }
])
# Add placeholder columns for statistical tests
df['map +'] = None
df['map -'] = None
df['map p-value'] = None


# Display the results
print(df.to_string(index=False))


     name      map  recip_rank    P_300 map + map - map p-value
BERT+BM25 0.488612    0.677323 0.504446  None  None        None
     BM25 0.661232    0.956618 0.503922  None  None        None




In [15]:
# # Full Code 

# # Load documents from JSONL.GZ
# def load_jsonl_gz(file_path):
#     documents = []
#     with gzip.open(file_path, 'rt', encoding='utf-8') as f:
#         for line in f:
#             data = json.loads(line.strip())
#             documents.append(data['text'])  # Adjust the key if necessary
#     return documents

# # Load queries from JSONL
# def load_jsonl(file_path):
#     queries = []
#     with open(file_path, 'r', encoding='utf-8') as f:
#         for line in f:
#             data = json.loads(line.strip())
#             queries.append(data['query'])  # Adjust the key if necessary
#     return queries

# # Replace with the actual paths to your files
# documents_file_path = '/workspaces/ir-lab-sose-2024-ir-sose-24-8/.tira/extracted_datasets/ir-lab-sose-2024/ir-acl-anthology-20240504-training/input-data/documents.jsonl.gz'
# queries_file_path = '/workspaces/ir-lab-sose-2024-ir-sose-24-8/.tira/extracted_datasets/ir-lab-sose-2024/ir-acl-anthology-20240504-training/input-data/queries.jsonl'

# documents = load_jsonl_gz(documents_file_path)
# documents = documents[:300]
# test_queries = load_jsonl(queries_file_path)

# print(f"Loaded {len(documents)} documents")
# print(f"Loaded {len(test_queries)} queries")

# # Define the retrieval function
# def encode_documents(documents):
#     inputs = tokenizer(documents, padding=True, truncation=True, return_tensors="pt")
#     with torch.no_grad():
#         embeddings = model(**inputs).last_hidden_state.mean(dim=1)
#     return embeddings

# document_embeddings = encode_documents(documents)

# #implement faiss index
# index = faiss.IndexFlatL2(document_embeddings.shape[1])
# index.add(document_embeddings.numpy())

# def retrieve(query, top_k=500):
#     query_embedding = encode_documents([query]).numpy()
#     _, faiss_indices = index.search(query_embedding, top_k)
#     bm25_scores = bm25.get_scores(query.split())
#     bm25_indices = bm25_scores.argsort()[-top_k:][::-1]
#     combined_indices = list(set(faiss_indices[0]).union(set(bm25_indices)))
#     combined_scores = [(i, bm25_scores[i] + faiss_indices[0].tolist().index(i) if i in faiss_indices[0] else 0) for i in combined_indices]
#     combined_scores.sort(key=lambda x: x[1], reverse=True)
#     return [documents[i] for i, _ in combined_scores[:top_k]]

# # Implement BM25
# tokenized_corpus = [doc.split(" ") for doc in documents]
# bm25 = BM25Okapi(tokenized_corpus)

# # Define the evaluation function
# def evaluate_retrieval(retrieval_func, test_queries, top_k=500):
#     precisions = []
#     avg_precisions = []
#     reciprocal_ranks = []
    
#     for query in test_queries:
#         retrieved_docs = retrieval_func(query, top_k)
#         relevance = [1 if any(word in doc.lower() for word in query.lower().split()) else 0 for doc in retrieved_docs]
#         precision = sum(relevance) / len(relevance)
#         avg_precision = average_precision_score([1] * sum(relevance) + [0] * (len(relevance) - sum(relevance)), relevance)
#         first_relevant = next((i for i, rel in enumerate(relevance) if rel == 1), None)
#         reciprocal_rank = 1 / (first_relevant + 1) if first_relevant is not None else 0
#         precisions.append(precision)
#         avg_precisions.append(avg_precision)
#         reciprocal_ranks.append(reciprocal_rank)
    
#     return {
#         'MAP': np.mean(avg_precisions),
#         'P_300': np.mean(precisions),
#         'recip_rank': np.mean(reciprocal_ranks)
#     }

# # Evaluate BERT + BM25 retrieval
# combined_scores = evaluate_retrieval(retrieve, test_queries)

# # Evaluate BM25 only
# def bm25_retrieve(query, top_k=500):
#     bm25_scores = bm25.get_scores(query.split())
#     top_indices = bm25_scores.argsort()[-top_k:][::-1]
#     return [documents[i] for i in top_indices]

# bm25_scores = evaluate_retrieval(bm25_retrieve, test_queries)

# # Create DataFrame
# df = pd.DataFrame([
#     {
#         'name': 'BERT+BM25', 
#         'map': combined_scores['MAP'], 
#         'recip_rank': combined_scores['recip_rank'],
#         'P_300': combined_scores['P_300']
#     },
#     {
#         'name': 'BM25', 
#         'map': bm25_scores['MAP'], 
#         'recip_rank': bm25_scores['recip_rank'],
#         'P_300': bm25_scores['P_300']
#     }
# ])

# # Add placeholder columns for statistical tests
# df['map +'] = None
# df['map -'] = None
# df['map p-value'] = None

# # Display the DataFrame
# print(df.to_string(index=False))