In [54]:
# Install packages
%pip install --upgrade google-genai sentence-transformers faiss-cpu langchain-text-splitters python-dotenv --quiet


Note: you may need to restart the kernel to use updated packages.


In [55]:
# Imports

from google import genai
from google.genai import types

import os,math, uuid
from dataclasses import dataclass
from typing import List, Dict, Tuple

import numpy as np
import faiss

from sentence_transformers import SentenceTransformer # for local embedding
from langchain_text_splitters import RecursiveCharacterTextSplitter #simple chunker 

from dotenv import load_dotenv
# Load environment variables via python-dotenv before reading the API key.
load_dotenv()

# Stop immediately when the key is missing or empty.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if GEMINI_API_KEY is None or GEMINI_API_KEY == "":
    raise RuntimeError("GEMINI_API_KEY is not set")

# Gemini client shared by the rest of the notebook.
client = genai.Client(api_key=GEMINI_API_KEY)

# Model identifiers
GEMINI_GEN_MODEL = "gemini-2.5-flash"  # generation model: fast, good for RAG
LOCAL_EMBEDDING_MODEL = "all-MiniLM-L6-v2"  # local embedding model: small, fast, good for RAG






In [56]:
# Example documents on which we'll perform retrieval.
# Each doc has: id, title, text, and optional metadata.
# Tiny in-memory corpus: three short English documents about cakes and desserts.
DOCUMENTS = [
    dict(
        id="doc1",
        title="Baking Basics",
        text="Cakes are baked at moderate temperatures. Common ingredients are flour, sugar, eggs, and butter. Icing is added after the cake cools.",
        meta=dict(source="kitchen-notes", lang="en"),
    ),
    dict(
        id="doc2",
        title="Healthy Desserts",
        text="For a lighter dessert, substitute part of the sugar with fruit purees. Consider whole-grain flour. Yogurt frostings can reduce fat.",
        meta=dict(source="health-blog", lang="en"),
    ),
    dict(
        id="doc3",
        title="Birthday Traditions",
        text="Many cultures celebrate birthdays with a sweet cake, candles, and a wish. Popular flavors include chocolate and vanilla.",
        meta=dict(source="culture-wiki", lang="en"),
    ),
]

In [57]:
# Chunking 
# Goal: split long docs into chunks so retrieval can target the right part.
# We’ll use a character-based splitter with a small overlap.

def chunk_documents(docs: List[Dict], chunk_size: int, chunk_overlap: int) -> List[Dict]:
    """Split each document's text into character-based chunks.

    Args:
        docs: Documents as dicts. "text" is required, and "id" is read for
            every chunk produced; "title" and "meta" are optional.
        chunk_size: Forwarded to RecursiveCharacterTextSplitter as ``chunk_size``.
        chunk_overlap: Forwarded to RecursiveCharacterTextSplitter as
            ``chunk_overlap``.

    Returns:
        A flat list of chunk dicts, in document order, each with the keys
        ``chunk_id`` ("<doc id>::chunk<i>"), ``doc_id``, ``text``, ``meta``
        and ``title``. ``meta`` is a shallow copy of the document's metadata
        ({} when the document has none), and ``title`` defaults to "".

    Raises:
        KeyError: If a document has no "text" key, or no "id" key while its
            text produced at least one chunk.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " ", ""],
    )
    chunks = []
    for doc in docs:
        parts = splitter.split_text(doc["text"])
        for i, part in enumerate(parts):
            chunks.append({
                "chunk_id": f"{doc['id']}::chunk{i}",
                "doc_id": doc["id"],
                "text": part,
                # Copy per chunk: sharing one dict would let an edit to one
                # chunk's metadata leak into its siblings and the source doc.
                "meta": dict(doc.get("meta") or {}),
                "title": doc.get("title", ""),
            })

    return chunks
# Chunk the example documents and report how many chunks were produced.
CHUNKS = chunk_documents(
    DOCUMENTS,
    chunk_size=100,
    chunk_overlap=40,
)
print(len(CHUNKS))
# print(CHUNKS[0],end="\n\n")







6


In [None]:
# Embeddings and storing them in FAISS (vector store)