# Reranking and Filtering Retrieved Documents

Install all the necessary libraries

In [None]:
!pip install transformers sentence-transformers




Import the necessary libraries, then define and encode the documents

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the bi-encoder model; it is used below to embed both the documents
# and the query.
bi_encoder = SentenceTransformer('all-MiniLM-L6-v2')

# Sample documents forming the corpus to search. Later cells refer to
# entries of this list by their position (index).
documents = [
    "Pinecone is a vector database.",
    "Vectors represent data in numerical form.",
    "Embedding models convert text to vectors.",
    "Pinecone helps in managing large-scale vector databases."
]

# Embed the corpus with the bi-encoder
def encode_documents(documents):
    """Return the bi-encoder embeddings of ``documents`` as a tensor.

    Args:
        documents: The texts to embed.

    Returns:
        Whatever ``bi_encoder.encode`` produces when asked for a tensor
        (``convert_to_tensor=True``).
    """
    embeddings = bi_encoder.encode(documents, convert_to_tensor=True)
    return embeddings

document_embeddings = encode_documents(documents)




Define a query, encode it, and retrieve top documents based on similarity



In [None]:
# Define the query
query = "How does Pinecone work?"

# Encode the query with the same bi-encoder that embedded the documents
query_embedding = bi_encoder.encode([query], convert_to_tensor=True)


def _as_numpy(embeddings):
    """Return ``embeddings`` as a host-side NumPy array.

    scikit-learn operates on NumPy arrays. A tensor that lives on a GPU cannot
    be converted implicitly, so tensors are detached and moved to the CPU first;
    anything else is passed through ``np.asarray``.
    """
    if hasattr(embeddings, "detach"):
        return embeddings.detach().cpu().numpy()
    return np.asarray(embeddings)


# Compute cosine similarity between query and document embeddings.
# Result has one row (the single query) and one column per document.
similarities = cosine_similarity(
    _as_numpy(query_embedding),
    _as_numpy(document_embeddings),
)

# Flatten to a 1-D score vector and sort document indices by descending score
similarity_scores = similarities.flatten()
sorted_indices = np.argsort(similarity_scores)[::-1]
top_k = 3  # Number of top documents to retrieve

# Initial retrieval
print("Initial Query Results:")
for idx in sorted_indices[:top_k]:
    print(f"Document: {documents[idx]}, Score: {similarity_scores[idx]:.4f}")


Initial Query Results:
Document: Pinecone is a vector database., Score: 0.6133
Document: Pinecone helps in managing large-scale vector databases., Score: 0.5196
Document: Vectors represent data in numerical form., Score: 0.0549


Reranking with Cross-Encoders

In [None]:
from sentence_transformers import CrossEncoder

# Load a cross-encoder model (fine-tuned for reranking); it is used below to
# score (query, document) pairs.
# NOTE: the recorded output of this cell shows the model files being downloaded.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Rerank top-k documents
def rerank_documents(query, documents, cross_encoder):
    """Score each candidate document against the query with a cross-encoder.

    Args:
        query: The query text.
        documents: Candidate document texts (e.g. the top-k documents from the
            first-stage retrieval).
        cross_encoder: Object exposing ``predict(pairs)``, called with a list
            of ``(query, document)`` tuples.

    Returns:
        The value returned by ``cross_encoder.predict``, one score per entry
        of ``documents`` in the same order, or an empty NumPy array when there
        are no documents to score.
    """
    query_pairs = [(query, doc) for doc in documents]
    if not query_pairs:
        # Nothing to score: skip the model call, which may not accept an
        # empty batch.
        return np.array([], dtype=np.float32)
    return cross_encoder.predict(query_pairs)

# Score only the candidates returned by the first-stage retrieval
rerank_scores = rerank_documents(
    query,
    [documents[doc_idx] for doc_idx in sorted_indices[:top_k]],
    cross_encoder,
)

# Candidate positions ordered from highest to lowest cross-encoder score
reranked_indices = np.argsort(rerank_scores)[::-1]

# Show the reranked candidates; each position is mapped back to its entry in
# `documents` through sorted_indices
print("Reranked Query Results:")
for idx in reranked_indices:
    print(
        f"Document: {documents[sorted_indices[idx]]}, "
        f"Score: {rerank_scores[idx]:.4f}"
    )


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Reranked Query Results:
Document: Pinecone helps in managing large-scale vector databases., Score: 6.9126
Document: Pinecone is a vector database., Score: 3.8535
Document: Vectors represent data in numerical form., Score: -11.3829




Print and compare the initial retrieval results with the reranked results

In [None]:
# Recap: first-stage (bi-encoder) ranking, followed by the cross-encoder ranking
print("Initial Query Results:")
for idx in sorted_indices[:top_k]:
    print(
        f"Document: {documents[idx]}, "
        f"Score: {similarity_scores[idx]:.4f}"
    )

print("\nReranked Query Results:")
for idx in reranked_indices:
    print(
        f"Document: {documents[sorted_indices[idx]]}, "
        f"Score: {rerank_scores[idx]:.4f}"
    )


Initial Query Results:
Document: Pinecone is a vector database., Score: 0.6133
Document: Pinecone helps in managing large-scale vector databases., Score: 0.5196
Document: Vectors represent data in numerical form., Score: 0.0549

Reranked Query Results:
Document: Pinecone helps in managing large-scale vector databases., Score: 6.9126
Document: Pinecone is a vector database., Score: 3.8535
Document: Vectors represent data in numerical form., Score: -11.3829
