In [1]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-1.3.5-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.38.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [2]:
import os
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
import pandas as pd

In [3]:
from google.colab import files
uploaded = files.upload()

chunks_df = pd.read_csv("/content/meditation_chunks.csv")
chunks_df.head()

Saving meditation_chunks.csv to meditation_chunks.csv


Unnamed: 0,chunk_id,source_file,chunk_index,text,word_count
0,cureus_cleaned.txt_chunk_0,cureus_cleaned.txt,0,This article discusses the power of meditation...,150
1,cureus_cleaned.txt_chunk_1,cureus_cleaned.txt,1,yield more beneficial clinical outcomes. Every...,150
2,cureus_cleaned.txt_chunk_2,cureus_cleaned.txt,2,that constant reinforcement of happy thoughts ...,150
3,cureus_cleaned.txt_chunk_3,cureus_cleaned.txt,3,energy and possibility is there remains a myst...,150
4,cureus_cleaned.txt_chunk_4,cureus_cleaned.txt,4,benefits at the genetic or immunological level...,150


In [4]:
# location of ChromaDB
INDEX_DIR = "../data/index/meditations"
os.makedirs(INDEX_DIR, exist_ok=True)

# Create a persistent Chroma client
client = chromadb.PersistentClient(path=INDEX_DIR)

# Create (or get) a collection
collection = client.get_or_create_collection(name="meditations")

# Load embedding model
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
BATCH_SIZE = 64

ids = chunks_df["chunk_id"].tolist()
documents = chunks_df["text"].tolist()
metadatas = chunks_df[["source_file", "chunk_index", "word_count"]].to_dict(orient="records")

for start in range(0, len(documents), BATCH_SIZE):
    end = start + BATCH_SIZE
    batch_docs = documents[start:end]
    batch_ids = ids[start:end]
    batch_metadatas = metadatas[start:end]

    # Embeddings for this batch
    batch_embeddings = embedder.encode(batch_docs).tolist()

    collection.add(
        ids=batch_ids,
        documents=batch_docs,
        metadatas=batch_metadatas,
        embeddings=batch_embeddings
    )

    print(f"Added chunks {start}–{end}")


Added chunks 0–64
Added chunks 64–128
Added chunks 128–192


In [6]:
# Retrieve top-k chunks
def retrieve_context(query: str, collection, embedder, k: int = 4):
    query_emb = embedder.encode([query]).tolist()
    results = collection.query(
        query_embeddings=query_emb,
        n_results=k
    )
    docs = results["documents"][0]
    metadatas = results["metadatas"][0]
    return docs, metadatas


# Prompt builder tuned for big chat-style models
def build_prompt(query: str, retrieved_docs: list[str]) -> str:
    context = "\n\n---\n\n".join(retrieved_docs)
    prompt = f"""You are a calm, secular meditation teacher.
You write detailed, compassionate guided meditations grounded in the reference texts.

Reference texts:
{context}

User request:
{query}

Write a clear, step-by-step guided meditation in the second person ("you"),
about 400–600 words, with:
1) Settling the body
2) Anchoring attention (e.g., breath or body)
3) Working with thoughts/emotions
4) A gentle closing.

Do not mention the reference texts or describe your process.
Speak as if you are guiding the listener right now.
"""
    return prompt.strip()


# Main RAG generation function
def generate_with_rag(
    query: str,
    collection,
    embedder,
    llm,
    k: int = 4,
    max_new_tokens: int = 600
):
    docs, metas = retrieve_context(query, collection, embedder, k=k)
    prompt = build_prompt(query, docs)

    output = llm(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )[0]["generated_text"]

    return output, docs, metas


Model 3: Flan-T5-XL

In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# MODEL 3: Flan-T5-Large as a RAG generator
flan_large_name = "google/flan-t5-large"

flan_large_tokenizer = AutoTokenizer.from_pretrained(flan_large_name)
flan_large_model = AutoModelForSeq2SeqLM.from_pretrained(
    flan_large_name,
    device_map="auto"   # use GPU if available
)

llm_flan_large = pipeline(
    "text2text-generation",
    model=flan_large_model,
    tokenizer=flan_large_tokenizer
)


# Flan-specific prompt builder (shorter, to stay under 512 tokens)
def build_prompt_flan_large(query: str, retrieved_docs: list[str]) -> str:
    context = "\n\n---\n\n".join(retrieved_docs)
    prompt = f"""
Using the reference texts below, write a calm, secular guided meditation
that responds to the user's request.

Reference texts:
{context}

User request:
{query}

Write the meditation in gentle, reassuring second-person language ("you"),
aiming for about 10–14 sentences. Do not mention the reference texts or
describe your process; just guide the listener directly.
"""
    return prompt.strip()


def generate_with_rag_flan_large(
    query: str,
    collection,
    embedder,
    llm,
    k: int = 2,          # fewer chunks to keep input < 512 tokens
    max_length: int = 256
):
    # 1. Retrieve top-k chunks
    docs, metas = retrieve_context(query, collection, embedder, k=k)

    # 2. Build a compact prompt for Flan-T5-Large
    prompt = build_prompt_flan_large(query, docs)

    # 3. Generate
    output = llm(
        prompt,
        max_length=max_length,
        num_beams=4,
        early_stopping=True
    )[0]["generated_text"]

    return output, docs, metas


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


The night before an exam, take a deep breath in and out slowly and steadily. Breathe in through your nose and out through your mouth. Breathe out through your nose and out through your mouth. Breathe out through your nose and out through your mouth. Breathe out through your nose and out through your mouth. Breathe out through your nose and out through your mouth. Breathe out through your nose and out through your mouth. Breathe out through your nose and out through your mouth. Breathe out through your nose and out through your mouth. Breathe out through your nose and out through your mouth. Breathe out through your nose and out through your mouth. Breathe out through your nose and out through your mouth. Breathe out through your nose and out through your mouth. Breathe out through your nose and out through your mouth. Breathe out through your nose and out through your mouth. Breathe out through your nose and out through your mouth. Breathe out through your nose and out through your mou

In [10]:
# Example call with Model 3 (Flan-T5-Large)
query = "Create a grounding meditation for the night before an exam that gently acknowledges the specific worries that tend to arise (fear of forgetting information, worry about performing poorly, racing thoughts about the future) and helps me settle my mind and body so I can rest."
response_flan_large, docs_flan_large, metas_flan_large = generate_with_rag_flan_large(
    query,
    collection,
    embedder,
    llm_flan_large,
    k=2,
    max_length=256
)

print(response_flan_large)

Token indices sequence length is longer than the specified maximum sequence length for this model (543 > 512). Running this sequence through the model will result in indexing errors
Both `max_new_tokens` (=256) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


You are a student about to take an exam. You are worried about what you will forget. You are worried about your performance. You are worried about the future. You are worried about forgetting information. You are worried about forgetting information. You are worried about performing poorly. You are worried about the future. You are worried about forgetting information.
