In [1]:
from dotenv import load_dotenv, find_dotenv

# Locate a .env file and load it before any client is constructed.
# The result is bound to `_`, so the cell itself displays nothing.
dotenv_file = find_dotenv()
_ = load_dotenv(dotenv_file)

In [4]:
### Indexing 
import bs4
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import Chroma

# --- Load: read the source PDF into LangChain documents ---
doc_path = "dev-data/Be_Good.pdf"
loader = PyPDFLoader(doc_path)
doc = loader.load()

# --- Split: cut the documents into small overlapping chunks ---
splitter_settings = {"chunk_size": 300, "chunk_overlap": 50}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)
splits = text_splitter.split_documents(doc)

# --- Index: embed every chunk and store the vectors in Chroma ---
# NOTE(review): re-running this cell in the same kernel may add the chunks to
# the store a second time -- confirm against the Chroma version in use.
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
chromadb = Chroma.from_documents(documents=splits, embedding=embeddings)
retriever = chromadb.as_retriever()

### Prompt

In [8]:
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI

# Multi-query: ask the LLM to rephrase the user question from several
# perspectives so that retrieval is less sensitive to the exact wording.
template = """You are an AI language model assistant. Your task is to generate five different
versions of te given user question to retrieve relevant documents from a vector database.
By generating multiple perpectives on the user question, your goal is to help the user overcome
some of the limitations of the distance-based similarity search. Provide these alternative questions
separated by newlines. Original question: {question}
"""

prompt_perspectives = ChatPromptTemplate.from_template(template)


def _split_queries(text: str) -> list[str]:
    """Turn the raw LLM answer into a clean list of queries.

    Args:
        text: The model output, expected to hold one question per line.

    Returns:
        The non-empty lines of ``text`` with surrounding whitespace removed.
        Blank lines (e.g. from a trailing newline or from paragraphs separated
        by empty lines) are dropped so they are never sent to the retriever
        as empty queries.
    """
    return [line.strip() for line in text.splitlines() if line.strip()]


# prompt -> chat model -> plain string -> list of alternative questions
generate_queries_chain = (
    prompt_perspectives
    | ChatGoogleGenerativeAI(model="models/gemini-2.5-flash-preview-05-20")
    | StrOutputParser()
    | _split_queries
)

In [9]:
# Try the query generator on a sample question; the cell displays the
# list of rewritten questions it returns.
sample_question = "Who is the writter of the book"
generate_queries_chain.invoke(sample_question)

['Who authored the book?',
 "What is the name of the book's author?",
 'By whom was this publication penned?',
 'Can you identify the individual who wrote the book?',
 'Who is credited with writing the book?']

In [11]:
from langchain.load import dumps, loads

def get_unique_union(documents: list[list]):
    """Flatten per-query retrieval results into one de-duplicated list.

    Args:
        documents: One list of retrieved documents per generated query.

    Returns:
        Every distinct document exactly once, in first-seen order (query
        order, then rank within each query). Two documents count as the
        same when their ``dumps`` serializations are equal.
    """
    # Documents are compared through their serialized string form.
    serialized = (dumps(doc) for sublist in documents for doc in sublist)
    # dict.fromkeys removes duplicates while keeping insertion order; a set
    # would reorder the documents differently on every kernel start because
    # string hashing is randomized per process.
    unique_docs = dict.fromkeys(serialized)
    return [loads(doc) for doc in unique_docs]

question = "Who is the writer of the book?"

# Multi-query retrieval: rewrite the question, run the retriever once per
# rewrite (retriever.map()), then merge the hits into one unique list.
retrieval_chain = (
    generate_queries_chain
    | retriever.map()
    | get_unique_union
)
retrieval_input = {"question": question}
docs = retrieval_chain.invoke(retrieval_input)
len(docs)

  return [loads(doc) for doc in unique_docs]


20

In [12]:
from operator import itemgetter
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.runnables import RunnablePassthrough

# RAG: answer the question using the documents found by multi-query retrieval.
template = """Answer the followng question based on this context

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

llm = ChatGoogleGenerativeAI(model="models/gemini-2.5-flash-preview-05-20")


def _format_docs(docs: list) -> str:
    """Join the text of the retrieved documents into one context string.

    Args:
        docs: Retrieved documents; each is expected to expose ``page_content``.

    Returns:
        The documents' text separated by blank lines. Without this step the
        prompt would receive the Python repr of a list of Document objects
        (metadata included) instead of readable context.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# The dict fans the input out: "context" runs retrieval and formats the hits,
# "question" passes the original question through to the prompt unchanged.
final_rag_chain = (
    {"context": retrieval_chain | _format_docs, "question": itemgetter("question")}
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question": question})

'The writer of the essay "Be Good" is Paul Graham.'