## Hybrid Retriever - Combining Dense and Sparse Retrievers

In [13]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain.schema import Document
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
# Toy corpus: five one-sentence documents shared by the retrievers below.
docs = [
    Document(page_content=text)
    for text in (
        "Pipecode is a vector database for semantic search.",
        "The Eiffel Tower is located in Paris.",
        "LangChain can be used to developed agentic AI application.",
        "LangChain has many types of retrievers.",
        "LangChain helps build LLM applications.",
    )
]

# Dense retriever: embed the corpus with the all-MiniLM-L6-v2 model, index the
# embeddings in FAISS, and expose the vector store through the retriever API.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
dense_vectorstore = FAISS.from_documents(docs, embedding_model)
dense_retriever = dense_vectorstore.as_retriever()

In [14]:
# Keep a handle on the index object held by the FAISS vector store so the
# stored vectors can be inspected directly.
faiss_index = dense_vectorstore.index

In [15]:
import numpy as np

num_docs = len(docs)

# Read each stored vector back out of the index by integer position and
# collect the rows into a single array (presumably one row per entry in
# `docs` -- this relies on the index holding exactly those documents).
vectors = np.asarray(list(map(faiss_index.reconstruct, range(num_docs))))
print(vectors.shape)
print(vectors)


(5, 384)
[[-0.01785619 -0.02074999  0.04232785 ... -0.0190144   0.05980122
   0.02640989]
 [ 0.02465541 -0.00338172 -0.0284394  ...  0.09900173  0.03497263
  -0.0354636 ]
 [ 0.06605352  0.03884847  0.01661565 ...  0.03093835  0.07990999
   0.05157552]
 [-0.06784829 -0.01717511 -0.03610207 ...  0.0658505   0.06786532
  -0.02381885]
 [-0.06612371  0.00435344  0.01951677 ...  0.01170399  0.09943537
   0.06213933]]


In [33]:
# Sparse retriever: BM25 over the same corpus, limited to the top 3 hits.
sparse_retriever = BM25Retriever.from_documents(docs, k=3)

# Hybrid retriever: ensemble of the dense and sparse retrievers, with the
# dense results weighted 0.7 and the sparse results weighted 0.3.
hybrid_retriever = EnsembleRetriever(
    weights=[0.7, 0.3],
    retrievers=[dense_retriever, sparse_retriever],
)

In [34]:
# Display the ensemble's repr to check its member retrievers and weights.
hybrid_retriever

EnsembleRetriever(retrievers=[VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7f5ae3dc3110>, search_kwargs={}), BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x7f5ae3dc3610>, k=3)], weights=[0.7, 0.3])

In [35]:
# Run a sample question through the hybrid retriever.
query = "How can I build an application using LLms"
results = hybrid_retriever.invoke(query)

# Print every retrieved document, numbered from 1 in the order returned.
for position, doc in enumerate(results, start=1):
    print(f"\n Document {position}:\n {doc.page_content}")


 Document 1:
 LangChain helps build LLM applications.

 Document 2:
 LangChain can be used to developed agentic AI application.

 Document 3:
 LangChain has many types of retrievers.

 Document 4:
 Pipecode is a vector database for semantic search.


## RAG Pipeline with Hybrid Retriever

In [36]:
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain

In [43]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the process
# environment.
load_dotenv()

# Fail fast with a clear message when the key is missing. The previous
# `os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")` raised an
# opaque "TypeError: str expected, not NoneType" in that case, and was a
# no-op otherwise because os.getenv already reads from os.environ.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )

In [44]:
# Prompt with two placeholders: {context} and {input}.
prompt = PromptTemplate.from_template(
    "\nAnswer the question based on the context below.\n"
    "\n"
    "Context:\n"
    "{context}\n"
    "\n"
    "Question: {input}\n"
)

# Chat model: OpenAI gpt-3.5-turbo with temperature 0.2.
llm = init_chat_model("gpt-3.5-turbo", model_provider="openai", temperature=0.2)
llm

ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x7f5ae041d7f0>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x7f5ae041e270>, root_client=<openai.OpenAI object at 0x7f5ae07a74d0>, root_async_client=<openai.AsyncOpenAI object at 0x7f5ae041dfd0>, temperature=0.2, model_kwargs={}, openai_api_key=SecretStr('**********'))

In [45]:
# Stuff-documents chain: combines the documents it is given with the prompt
# and passes the result to the LLM.
document_chain = create_stuff_documents_chain(llm, prompt)

# Full RAG chain: fetch documents with the hybrid retriever, then hand them
# to the stuff-documents chain to produce the answer.
rag_chain = create_retrieval_chain(hybrid_retriever, document_chain)
rag_chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | EnsembleRetriever(retrievers=[VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7f5ae3dc3110>, search_kwargs={}), BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x7f5ae3dc3610>, k=3)], weights=[0.7, 0.3]), kwargs={}, config={'run_name': 'retrieve_documents'}, config_factories=[])
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
            | PromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, template='\nAnswer the question based on the context below.\n\nContext:\n{context}\n\nQuestion: {input}\n')
            | ChatOpenAI(client=<openai.

In [49]:
# Send a question through the full RAG chain; the question is supplied under
# the "input" key.
query = {"input": "How can I build an app using LLMs?"}
response = rag_chain.invoke(query)

# The response is read under two keys: "answer" for the generated text and
# "context" for the documents that were retrieved.
print("Answer:\n", response["answer"])
print("\n Sourse Documents:")
for position, doc in enumerate(response["context"], start=1):
    print(f"\n Doc {position}: {doc.page_content}")

Answer:
 You can build an app using LLMs by utilizing LangChain, which helps in developing LLM applications. LangChain can be used to create agentic AI applications and has various types of retrievers that can be utilized in building your app. Additionally, you can also consider using Pipecode, a vector database for semantic search, to enhance the functionality of your app.

 Sourse Documents:

 Doc 1: LangChain helps build LLM applications.

 Doc 2: LangChain can be used to developed agentic AI application.

 Doc 3: LangChain has many types of retrievers.

 Doc 4: Pipecode is a vector database for semantic search.


In [62]:
# Pair each retrieved document's text with its position in the context list.
# Iterating over the list itself (rather than a fixed range(4)) avoids an
# IndexError when fewer than four documents come back and does not drop
# any when more are returned.
[[doc.page_content, i] for i, doc in enumerate(response["context"])]

[['LangChain helps build LLM applications.', 0],
 ['LangChain can be used to developed agentic AI application.', 1],
 ['LangChain has many types of retrievers.', 2],
 ['Pipecode is a vector database for semantic search.', 3]]