RAG using LangChain

![RAG Pipeline](download.png)

In [26]:
# packages to install

%pip install langchain langchain-core langchain-community langchainhub \
            langchain-google-genai chromadb tiktoken python-dotenv bs4 --quiet




I0000 00:00:1755535635.748401 3900356 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers


[0mNote: you may need to restart the kernel to use updated packages.


In [27]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Verify Gemini API key is set
google_api_key = os.getenv("GEMINI_API_KEY")
if not google_api_key:
    raise ValueError("Please set GEMINI_API_KEY environment variable in .env file")

# Set the API key for Google's services
os.environ["GOOGLE_API_KEY"] = google_api_key


In [28]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Gemini integrations
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

#### INDEXING ####

# Load Documents
from bs4 import BeautifulSoup
import requests

# Custom loader for better content extraction
def load_webpage():
    url = "https://lilianweng.github.io/posts/2023-06-23-agent/"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the main content
    content = []
    
    # Get title
    if soup.find('h1'):
        content.append(soup.find('h1').get_text())
    
    # Get main article content
    article = soup.find('article') or soup.find('div', class_='post-content')
    if article:
        # Get all text content
        for elem in article.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'li']):
            content.append(elem.get_text())
    
    return [Document(page_content='\n\n'.join(content))]

# Add custom preprocessing to clean the text
def clean_text(text):
    import re
    # Remove multiple newlines
    text = re.sub(r'\n\s*\n', '\n\n', text)
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters
    text = re.sub(r'[^\w\s\-.,?!()\[\]{}"\']+', ' ', text)
    return text.strip()
# Load and clean document
docs = load_webpage()
docs[0].page_content = clean_text(docs[0].page_content)

# Print first document to verify content
print("Document content preview:")
print(docs[0].page_content[:500])
print("\nTotal documents loaded:", len(docs))

# Split with larger chunk size
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=400)
splits = text_splitter.split_documents(docs)
print("\nTotal splits:", len(splits))

# Embed with Gemini
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=GoogleGenerativeAIEmbeddings(model="models/embedding-001"),
)

# Configure retriever
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 6}  # Number of documents to return
)

#### RETRIEVAL and GENERATION ####

# Prompt (still use the same hub prompt)
prompt = hub.pull("rlm/rag-prompt")

# LLM → Gemini
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0)

# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Print a few example chunks to verify content
print("\nExample chunks:")
for i, split in enumerate(splits[:2]):
    print(f"\nChunk {i+1}:")
    print(split.page_content[:200])

# Question about Task Decomposition
query = "Based on the article about LLM Powered Autonomous Agents, explain how agents handle task decomposition, planning, and breaking down complex tasks into subtasks. Include specific examples if mentioned in the text."

# Get the response
print("\nQuerying about Task Decomposition:")
response = rag_chain.invoke(query)

print("\nResponse:")
print(response)


NameError: name 'Document' is not defined