In [53]:
# Install dependencies
%pip install langchain langchain-google-genai chromadb python-dotenv unstructured --quiet



I0000 00:00:1755626431.452666 4712835 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers


Note: you may need to restart the kernel to use updated packages.


In [54]:
# Setup environment and load API key

import os
from dotenv import load_dotenv

# Read the .env file so its entries become environment variables
load_dotenv()

# The key is kept under GEMINI_API_KEY in .env; fail early if it is missing,
# otherwise re-export it as GOOGLE_API_KEY for Google Generative AI.
api_key = os.environ.get("GEMINI_API_KEY")
if api_key:
    os.environ["GOOGLE_API_KEY"] = api_key
else:
    raise RuntimeError("Set GEMINI_API_KEY in your .env file")


In [55]:
# Imports

from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate


In [56]:
DATA_PATH = "alice_in_wonderland.md"  # Markdown source file to index
PERSIST_DIR = "chroma_db"  # Single, consistent directory name for the vector DB
COLLECTION = "docs"

# Chunking parameters. With these values each chunk shares half of its length
# with its neighbour (overlap = 50% of chunk size); lower CHUNK_OVERLAP if
# that much redundancy is not wanted.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 500

# Load the markdown file. `show_progress` is a DirectoryLoader option and has
# no meaning for a single-file UnstructuredMarkdownLoader, so it is not passed.
loader = UnstructuredMarkdownLoader(DATA_PATH)
docs = loader.load()
print(f"Loaded {len(docs)} documents")

Loaded 1 documents


In [57]:
# Break the loaded documents into overlapping chunks using the configured
# CHUNK_SIZE / CHUNK_OVERLAP
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = splitter.split_documents(docs)
print(f"Split into {len(chunks)} chunks")

Split into 286 chunks


In [58]:
# Create embeddings & persist vector DB

# Gemini embeddings
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Chroma.from_documents appends to whatever is already stored under
# PERSIST_DIR/COLLECTION, so re-running this cell would insert every chunk a
# second time and similarity search would return duplicates. Drop any stale
# copy of the collection first so the cell is idempotent.
Chroma(
    collection_name=COLLECTION,
    embedding_function=embeddings,
    persist_directory=PERSIST_DIR,
).delete_collection()

# Create vector DB (persistence is automatic in Chroma >= 0.4.x)
vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=PERSIST_DIR,
    collection_name=COLLECTION
)
print("Database created and saved to disk")


# Smoke-test retrieval. The hits go into `results` rather than `docs` so the
# loaded source documents are not overwritten: the split cell reads `docs`
# and must still see the original file if it is re-run after this cell.
query = "What happens at the tea party?"
results = vectordb.similarity_search(query, k=3)
for i, hit in enumerate(results):
    print(f"\n--- Result {i+1} ---")
    print("Text:", hit.page_content[:500], "...")
    print("Source:", hit.metadata.get("source", "unknown"))



Database created and saved to disk

--- Result 1 ---
Text: The Project Gutenberg eBook of Alice's Adventures in Wonderland ...
Source: alice_in_wonderland.md

--- Result 2 ---
Text: CHAPTER IV. The Rabbit Sends in a Little Bill ...
Source: alice_in_wonderland.md

--- Result 3 ---
Text: garden, among the bright flower-beds and the cool fountains. ...
Source: alice_in_wonderland.md
