Retrieval-Augmented Generation (RAG)
Steps:
1. Load the document.
2. Convert it into a vector store.
3. Use an LLM to chat with the vector store.


In [1]:
import os 
import google.generativeai as genai 
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.schema.output_parser import StrOutputParser
from langchain.prompts import ChatPromptTemplate
# Load environment variables (including GOOGLE_API_KEY) from the .env file.
load_dotenv('/media/varun/Data/data science/main_files/.env')

# Fail early with a clear message when the key is missing: assigning None to
# os.environ would otherwise raise an opaque "str expected" TypeError.
google_api_key = os.getenv('GOOGLE_API_KEY')
if not google_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; add it to the .env file or the environment."
    )

# set the environment variables
os.environ['GOOGLE_API_KEY'] = google_api_key
genai.configure(api_key=google_api_key)

# load the model
model = genai.GenerativeModel('gemini-pro')
llm = ChatGoogleGenerativeAI(model="gemini-pro")
# call the model
# result = llm.invoke("Write a ballad about LangChain")
# print(result.content)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader
from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough

#function to read the pdf document
def pdf_read(pdf_filename):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_filename: The single PDF to read; passed straight to ``PdfReader``.

    Returns:
        One string holding the extracted text of all pages, with no separator
        inserted between pages.
    """
    pdf_reader = PdfReader(pdf_filename)
    # ``or ""`` is a defensive guard in case extraction yields None for a page
    # (e.g. a page with no text layer); ``join`` avoids rebuilding the string
    # on every iteration the way ``+=`` in a loop does.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

# Extract the raw text of the source PDF (path relative to the working directory).
text = pdf_read("2303.11366v4.pdf")


# chunking the text documents to smaller chunks
def get_chunks(text, chunk_size=1000, chunk_overlap=200):
    """Split ``text`` into overlapping chunks.

    Args:
        text: The full document text to split.
        chunk_size: Value forwarded to the splitter's ``chunk_size`` argument.
            Defaults to 1000, the value previously hard-coded here.
        chunk_overlap: Value forwarded to the splitter's ``chunk_overlap``
            argument. Defaults to 200, the value previously hard-coded here.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_text`` on ``text``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)


chunks = get_chunks(text)

# embedding the chunks
embeddings = SpacyEmbeddings(model_name="en_core_web_sm")


# create a vector store
def vector_store(chunks, index_path="faiss_db"):
    """Build a FAISS store from ``chunks``, save it locally and return it.

    The module-level ``embeddings`` object is used to embed the chunks.

    Args:
        chunks: Text chunks to index.
        index_path: Location passed to ``save_local``. Defaults to
            ``"faiss_db"``, the path previously hard-coded here.

    Returns:
        The FAISS vector store built from ``chunks``.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    # Fail early with a clear message rather than letting index construction
    # run on an empty input (e.g. when the PDF yielded no extractable text).
    if not chunks:
        raise ValueError(
            "No text chunks to index; the source document produced no text."
        )
    # Named ``store`` so the local does not shadow this function's own name.
    store = FAISS.from_texts(chunks, embedding=embeddings)
    store.save_local(index_path)
    return store


vector = vector_store(chunks)
retriever = vector.as_retriever()
# docs =retriever.get_relevant_documents('what is reinforcement learning')

# create RAG chain
# The prompt text is assembled line by line so that the blank lines and the
# trailing spaces of the template are explicit.
template = (
    "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\n"
    "\n"
    "Question: {question} \n"
    "\n"
    "Context: {context} "
)
prompt = PromptTemplate.from_template(template)


def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line."""
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)


# The retriever output is run through format_docs to fill {context}, while the
# chain input is passed through as {question}; the result is piped through the
# prompt, the LLM and the string output parser in that order.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
rag_chain.invoke("what is reinforcement learning")

'I do not have the answer to your question from the provided context.'

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader
from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough

In [10]:
# Hybrid search:
# combine two retrievers: a semantic (vector-similarity) retriever, as used above,
# and a keyword-based (BM25) retriever.
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.schema import Document

from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS
# Small toy corpus used to compare the keyword and the vector retriever.
doc_list = [
    "I like apples",
    "I like oranges",
    "Apples and oranges are fruits",
    "I like computers by Apple",
    "I love fruit juice",
]

# initialize the bm25 retriever and faiss retriever
bm25_retriever = BM25Retriever.from_texts(doc_list)
bm25_retriever.k = 2

# embedding the chunks
embeddings = SpacyEmbeddings(model_name="en_core_web_sm")

faiss_vectorstore = FAISS.from_texts(doc_list, embeddings)
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": 2})
# Keep the vector-only result so it can be inspected and compared with the
# ensemble result; previously this value was computed and thrown away.
faiss_docs = faiss_retriever.get_relevant_documents("A green fruit")

# initialize the ensemble retriever (both retrievers weighted equally)
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
    weights=[0.5, 0.5],
)
# Each query gets its own name; previously the first result was overwritten by
# the second before it could be used. ``docs`` still holds the result of the
# last query, as before.
green_fruit_docs = ensemble_retriever.get_relevant_documents("A green fruit")
docs = ensemble_retriever.get_relevant_documents("Apple Phones")


[Document(page_content='I love fruit juice'),
 Document(page_content='I like computers by Apple')]