## Advanced RAG - Retrieval Strategies
### Dense, Sparse, Hybrid, and Reranking

**Objectives:**
- Understand different retrieval modes
- Implement hybrid search with filters
- Apply reranking for better results
- Build reusable retrieval functions

In [5]:
import os
from dotenv import load_dotenv
load_dotenv()

from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_qdrant import QdrantVectorStore, RetrievalMode, FastEmbedSparse

# re-ranking for better results
from langchain_community.cross_encoders import HuggingFaceCrossEncoder

# metadata filtering
from qdrant_client.models import Filter, FieldCondition, MatchValue

# schema for metadata extracted by the LLM
from scripts.schema import ChunkMetadata

In [3]:
# Configuration
# Name of the Qdrant collection to query; it is opened with
# `from_existing_collection` below, so it must already exist.
COLLECTION_NAME = "financial_documents"
# Gemini model identifiers: one for dense embeddings, one for the chat LLM.
EMBEDDING_MODEL = "models/gemini-embedding-001"
LLM_MODEL = "gemini-2.5-flash"

# Cross-encoder model name passed to HuggingFaceCrossEncoder for reranking.
RERANKER_MODEL = "BAAI/bge-reranker-base"

In [6]:
# Initialize LLM (used further down for structured metadata-filter extraction)
llm = ChatGoogleGenerativeAI(model=LLM_MODEL)

# Gemini dense embeddings
embeddings = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL)

# Sparse embeddings (BM25 model served through FastEmbed)
sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")

# Connect to the existing collection in hybrid mode, so both the dense and the
# sparse embedding objects are supplied.
# The cluster URL can be overridden through the QDRANT_URL environment
# variable (loaded from .env like QDRANT_API_KEY); when it is unset, the
# original cluster endpoint is used, so existing setups keep working.
vector_store = QdrantVectorStore.from_existing_collection(
    embedding=embeddings,
    sparse_embedding=sparse_embeddings,
    collection_name=COLLECTION_NAME,
    url=os.getenv(
        "QDRANT_URL",
        "https://1fe44dd3-0e21-40c8-a091-818dea1ecbb7.us-east4-0.gcp.cloud.qdrant.io:6333",
    ),
    api_key=os.getenv("QDRANT_API_KEY"),
    retrieval_mode=RetrievalMode.HYBRID,
)

In [7]:
# Sanity check: list the collections visible through the store's underlying client.
vector_store.client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='financial_documents'), CollectionDescription(name='financial_docs')])

### Filter Extraction with LLM

In [8]:
def extract_filters(user_query: str) -> dict:
    """
    Extract metadata filters from a natural-language query using the LLM.

    The query is embedded in a few-shot prompt and sent to the LLM with
    structured output bound to ``ChunkMetadata``.

    Args:
        user_query: Raw user question, e.g. "What is Google's annual revenue in 2023"

    Returns:
        Dict containing only the fields the model filled in (``None`` fields
        are dropped), e.g.
        {"company_name": "google", "doc_type": "10-k", "fiscal_year": "2023"}.
        An empty dict is returned when the model produces no structured output.
    """

    prompt = f"""
            Extract metadata filters from the query. Return None for fields not mentioned.

                <USER QUERY STARTS>
                {user_query}
                </USER QUERY ENDS>

                #### EXAMPLES
                COMPANY MAPPINGS:
                - Amazon/AMZN -> amazon
                - Google/Alphabet/GOOGL/GOOG -> google
                - Apple/AAPL -> apple
                - Microsoft/MSFT -> microsoft
                - Tesla/TSLA -> tesla
                - Nvidia/NVDA -> nvidia
                - Meta/Facebook/FB -> meta

                DOC TYPE:
                - Annual report -> 10-k
                - Quarterly report -> 10-q
                - Current report -> 8-k

                EXAMPLES:
                "Amazon Q3 2024 revenue" -> {{"company_name": "amazon", "doc_type": "10-q", "fiscal_year": 2024, "fiscal_quarter": "q3"}}
                "Apple 2023 annual report" -> {{"company_name": "apple", "doc_type": "10-k", "fiscal_year": 2023}}
                "Tesla profitability" -> {{"company_name": "tesla"}}

                Extract metadata based on the user query only:
            """

    structured_llm = llm.with_structured_output(ChunkMetadata)

    metadata = structured_llm.invoke(prompt)

    # The structured-output call can yield None (e.g. the model answered
    # without emitting the schema); treat that as "no filters" rather than
    # failing with AttributeError on .model_dump().
    if metadata is None:
        return {}

    # Keep only the fields the model actually populated.
    return metadata.model_dump(exclude_none=True)
    

In [14]:
# Demo: a query that names a company, a year and an annual figure.
extract_filters("What is Google's annual revenue in 2023")

{'company_name': 'google', 'doc_type': '10-k', 'fiscal_year': '2023'}

In [15]:
# Demo: a query that also names a quarter; the result is kept in `filters`
# for the filter-condition preview further down.
query = "what is amazon's revenue in 2023 in q1?"
filters = extract_filters(query)

In [16]:
filters

{'company_name': 'amazon',
 'doc_type': '10-q',
 'fiscal_year': '2023',
 'fiscal_quarter': 'q1'}

### Retrieval Function

In [17]:
# metadata filtering
# NOTE(review): these names are already imported in the first cell; the import
# is repeated here, presumably so the cell can run on its own.
from qdrant_client.models import Filter, FieldCondition, MatchValue


# Preview: one exact-match condition per extracted filter, with each key
# prefixed by "metadata." to address the document payload.
[FieldCondition(key=f"metadata.{key}", match=MatchValue(value=value))  for key, value in filters.items()]

[FieldCondition(key='metadata.company_name', match=MatchValue(value='amazon'), range=None, geo_bounding_box=None, geo_radius=None, geo_polygon=None, values_count=None, is_empty=None, is_null=None),
 FieldCondition(key='metadata.doc_type', match=MatchValue(value='10-q'), range=None, geo_bounding_box=None, geo_radius=None, geo_polygon=None, values_count=None, is_empty=None, is_null=None),
 FieldCondition(key='metadata.fiscal_year', match=MatchValue(value='2023'), range=None, geo_bounding_box=None, geo_radius=None, geo_polygon=None, values_count=None, is_empty=None, is_null=None),
 FieldCondition(key='metadata.fiscal_quarter', match=MatchValue(value='q1'), range=None, geo_bounding_box=None, geo_radius=None, geo_polygon=None, values_count=None, is_empty=None, is_null=None)]

In [18]:
def hybrid_search(query: str, k: int = 5, filters: "dict | None" = None):
    """
    Perform hybrid search (dense + sparse vectors) with metadata filtering.

    Args:
        query: Search query
        k: Number of results
        filters: Optional metadata filters like
            {"company_name": "amazon", "fiscal_year": "2024"}. Values are
            matched exactly against the stored payload, so they must use the
            same type as the indexed metadata. When None (the default), the
            filters are extracted from the query with extract_filters().
            Pass an empty dict to search without any filter.

    Returns:
        List of Document objects
    """

    # Only fall back to LLM extraction when the caller gave no filters at all;
    # an explicit {} means "no filtering" and skips the LLM call.
    if filters is None:
        filters = extract_filters(query)

    qdrant_filter = None

    if filters:
        # Every filter must hold, each as an exact match on a "metadata.<key>" field.
        conditions = [
            FieldCondition(key=f"metadata.{key}", match=MatchValue(value=value))
            for key, value in filters.items()
        ]
        qdrant_filter = Filter(must=conditions)

    return vector_store.similarity_search(query=query, k=k, filter=qdrant_filter)


In [21]:
# Demo: a query that mentions only a company, requesting 10 results.
query = "what is the amazon's revenue"
results = hybrid_search(query, k=10)

In [22]:
results

[Document(metadata={'company_name': 'amazon', 'doc_type': '10-k', 'fiscal_quarter': None, 'fiscal_year': '2023', 'content_type': 'text', 'file_hash': '05f2d434b6eee52a5bbb4155a78068b2eda1eeda86b7af55335beb0634ac0398', 'source_file': 'amazon 10-k 2023.md', 'page': 69, '_id': '58ce2259-4919-4c2d-ab63-e9b2200edc06', '_collection_name': 'financial_documents'}, page_content="\n\nNet sales by groups of similar products and services, which also have similar economic characteristics, is as follows (in millions):\n\n|                                 | Year Ended December 31,   | Year Ended December 31,   | Year Ended December 31,   |\n|---------------------------------|---------------------------|---------------------------|---------------------------|\n|                                 | 2021                      | 2022                      | 2023                      |\n| Net Sales:                      |                           |                           |                           |\n| O

In [29]:
# Demo: a query that names company, year and quarter; the 10 results are the
# candidates handed to the reranker in a later cell.
query = "what is amazon's cashflow in 2024 in q1?"

results = hybrid_search(query, k=10)

In [30]:
results

[Document(metadata={'company_name': 'amazon', 'doc_type': '10-q', 'fiscal_quarter': 'q1', 'fiscal_year': '2024', 'content_type': 'text', 'file_hash': 'fc8ce40bd987a0e44099914109a3878d72d392e6ac0b4d879e3d5f692499bd01', 'source_file': 'amazon 10-q q1 2024.md', 'page': 28, '_id': '94bbe3aa-45aa-4f34-83a2-4c39fa7bcc85', '_collection_name': 'financial_documents'}, page_content="\n\ncosts, our level of productivity and accuracy, changes in volume, size, and weight of units received and fulfilled, the extent to which third-party sellers utilize Fulfillment by Amazon services, timing of fulfillment network and physical store expansion, the extent we utilize fulfillment services provided by third parties, mix of products and services sold, and our ability to affect customer service contacts per unit by implementing improvements in our operations and enhancements to our customer self-service features. Additionally, sales by our sellers have higher payment processing and related transaction costs

In [31]:
# re-ranking for better results
from langchain_community.cross_encoders import HuggingFaceCrossEncoder


import functools


@functools.lru_cache(maxsize=2)
def _load_reranker(model_name: str):
    """Build the CPU cross-encoder for *model_name* once and reuse it across calls."""
    return HuggingFaceCrossEncoder(model_name=model_name, model_kwargs={'device': 'cpu'})


def rerank_results(query: str, documents: list, top_k: int = 5):
    """
    Rerank documents using cross-encoder.

    Args:
        query: Search query
        documents: List of Document objects
        top_k: Number of top results to return

    Returns:
        List of Document objects, best match first, at most top_k long.
        An empty list when there are no documents or top_k is not positive.
    """

    # Nothing to score: return early without loading the model.
    if not documents or top_k <= 0:
        return []

    # Cached, so the model weights are loaded once rather than on every call.
    reranker = _load_reranker(RERANKER_MODEL)

    query_doc_pairs = [(query, doc.page_content) for doc in documents]

    scores = reranker.score(query_doc_pairs)

    # Sort on the score only; Document objects themselves are never compared.
    ranked = sorted(zip(scores, documents), key=lambda pair: pair[0], reverse=True)

    return [doc for _, doc in ranked[:top_k]]


# Rerank the hybrid-search candidates with the default top_k.
# NOTE(review): the progress bars below suggest the first run downloads the model files.
response = rerank_results(query, results)


config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [32]:
response

[Document(metadata={'company_name': 'amazon', 'doc_type': '10-q', 'fiscal_quarter': 'q1', 'fiscal_year': '2024', 'content_type': 'text', 'file_hash': 'fc8ce40bd987a0e44099914109a3878d72d392e6ac0b4d879e3d5f692499bd01', 'source_file': 'amazon 10-q q1 2024.md', 'page': 26, '_id': '7dbbe242-100d-4089-9c69-b62e4c69e7ae', '_collection_name': 'financial_documents'}, page_content='\n\nwere driven largely by our continued focus on price, selection, and convenience for our customers, including from our fast shipping offers. Changes in foreign exchange rates reduced International net sales by $248 million for Q1 2024.\n\nAWS sales increased 17% in Q1 2024 compared to the comparable prior year period. The sales growth primarily reflects increased customer usage, partially offset by pricing changes primarily driven by long-term customer contracts.\n\n## Operating Income (Loss)\n\nOperating income (loss) by segment is as follows (in millions):\n\n|                         | Three Months Ended March 