# Theravada Scripture Search POC

In this notebook, we'll build a proof-of-concept search over the translated Theravada texts from https://www.dhammatalks.org/suttas/.

## Setup

In [1]:

# Run once.

%load_ext autoreload
%autoreload 2

import os

# Move execution back one directory.
os.chdir('..')

In [55]:
from IPython.display import HTML

# Inject CSS that caps `.output_scroll` areas at 400px and lets them scroll
# vertically; the HTML object is the cell's last expression, so it is rendered.
HTML(
    data="""
<style>
.output_scroll {
    max-height: 400px;
    overflow-y: auto;
}
</style>
"""
)

In [52]:
import pandas as pd
from dotenv import load_dotenv

# Load variables from a .env file into the environment (python-dotenv); the
# commented-out model setup at the end of the notebook reads HF_TOKEN via os.getenv.
load_dotenv()


True

## Load data

In [40]:
# Read the sutta table from CSV, using the first CSV column as the index,
# then report its dimensions and preview the first five rows.
dhamma_talks_suttas = pd.read_csv("data/dhamma_talks_suttas.csv", index_col=0)
print(dhamma_talks_suttas.shape)
dhamma_talks_suttas.head(5)

(240, 8)


Unnamed: 0,collection,title,url_source,religion,subgroup,source,translation_source,text
0,AN,A Single Thing,https://www.dhammatalks.org/suttas/AN/AN1_21.html,Buddhism,Theravada,Dhamma Talks,Thanissaro Bhikkhu,"21. âI donât envision a single thing that,..."
1,AN,A Pool of Water,https://www.dhammatalks.org/suttas/AN/AN1_45.html,Buddhism,Theravada,Dhamma Talks,Thanissaro Bhikkhu,45. âSuppose there were a pool of waterâsu...
2,AN,Soft,https://www.dhammatalks.org/suttas/AN/AN1_48.html,Buddhism,Theravada,Dhamma Talks,Thanissaro Bhikkhu,"âJust as, of all trees, the balsam is foremo..."
3,AN,Quick to Reverse Itself,https://www.dhammatalks.org/suttas/AN/AN1_49.html,Buddhism,Theravada,Dhamma Talks,Thanissaro Bhikkhu,âI donât envision a single thing that is a...
4,AN,Luminous,https://www.dhammatalks.org/suttas/AN/AN1_50.html,Buddhism,Theravada,Dhamma Talks,Thanissaro Bhikkhu,"âLuminous, monks, is the mind.1 And it is de..."


In [63]:

def decode_text(text):
    """Repair UTF-8 text that was mistakenly decoded as Latin-1 (mojibake).

    The repair re-encodes the string as Latin-1 to recover the original bytes
    and decodes those bytes as UTF-8.

    Args:
        text: Value to repair. Non-string values (e.g. NaN, None) are returned
            unchanged.

    Returns:
        The repaired string, or the input unchanged when it cannot be
        round-tripped (it is not mojibake, or it is already correct text).
    """
    if not isinstance(text, str):
        return text
    try:
        return text.encode("latin1").decode("utf-8")
    except UnicodeError:
        # UnicodeError covers both failure modes:
        # - UnicodeEncodeError: the text holds characters outside Latin-1
        #   (for example an already-correct curly quote), so it is not mojibake;
        # - UnicodeDecodeError: the Latin-1 bytes are not valid UTF-8.
        # In either case the input is the best available answer.
        return text

def preprocess(df: pd.DataFrame, decode_columns=("text", "title")) -> pd.DataFrame:
    """Drop unusable rows and repair mis-encoded text columns.

    Rows with a missing ``text`` value are removed, then rows whose ``text``
    duplicates an earlier row. The frame's shape is printed before and after
    each filtering step. Finally ``decode_text`` is applied to every column in
    ``decode_columns`` that exists in the frame.

    Args:
        df: Input frame; must contain a ``text`` column.
        decode_columns: Names of string columns to pass through
            ``decode_text``. Columns not present in ``df`` are skipped.

    Returns:
        A new DataFrame; the input frame is not modified.

    Raises:
        KeyError: If ``df`` has no ``text`` column.
    """
    print(df.shape)
    df = df.dropna(subset=["text"])
    print(df.shape)
    df = df.drop_duplicates(subset=["text"])
    print(df.shape)

    repaired = {
        column: df[column].apply(decode_text)
        for column in decode_columns
        if column in df.columns
    }
    # assign() builds a new frame instead of writing into the filtered one,
    # which avoids pandas' SettingWithCopyWarning.
    return df.assign(**repaired)


# Rebind the name to the cleaned frame, then display its final shape.
dhamma_talks_suttas = preprocess(df=dhamma_talks_suttas)
dhamma_talks_suttas.shape


(235, 8)
(235, 8)
(235, 8)


(235, 8)

## Create Vector Store

In [None]:

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

def create_vector_store(df: pd.DataFrame) -> FAISS:
    """Create a FAISS vector store from the DataFrame.

    Each row becomes one indexed entry: the ``text`` column is embedded and the
    descriptive columns are stored as that entry's metadata.

    Args:
        df: Frame with a ``text`` column plus the columns ``title``,
            ``collection``, ``url_source``, ``religion``, ``subgroup``,
            ``source`` and ``translation_source``.

    Returns:
        A FAISS vector store embedded with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.

    Raises:
        KeyError: If ``df`` lacks ``text`` or any of the metadata columns.
    """
    metadata_columns = [
        "title",
        "collection",
        "url_source",
        "religion",
        "subgroup",
        "source",
        "translation_source",
    ]

    # Pull the columns out first so a missing column fails before the
    # embedding model is initialized.
    texts = df["text"].tolist()
    metadatas = df[metadata_columns].to_dict(orient="records")

    # Initialize the embedding model
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # from_documents() reads .page_content / .metadata attributes from Document
    # objects, so plain dicts fail there; from_texts() accepts raw strings with
    # a parallel list of metadata dicts.
    return FAISS.from_texts(texts, embeddings, metadatas=metadatas)


# Build the index over the preprocessed suttas.
vector_store = create_vector_store(df=dhamma_talks_suttas)

## Create basic search

In [64]:
from smolagents.tools import Tool


class SuttaSearchTool(Tool):
    """Tool that returns the suttas most similar to a free-text query.

    The tool metadata (``name``, ``description``, ``inputs``, ``output_type``)
    is declared as class attributes and the work is done in ``forward``, which
    is the layout smolagents' ``Tool`` base class expects. ``run`` is kept as an
    alias for callers that used the earlier method name.
    """

    name = "search_suttas"
    description = "Search through Buddhist suttas to find relevant passages"
    inputs = {
        "query": {
            "type": "string",
            "description": "The search query to find relevant suttas",
        }
    }
    output_type = "string"

    def __init__(self, vector_store, k: int = 3):
        """Store the vector store to search.

        Args:
            vector_store: Object exposing ``similarity_search(query, k=...)``
                that returns documents with ``metadata`` and ``page_content``.
            k: Number of passages to return per query.
        """
        super().__init__()
        self.vector_store = vector_store
        self.k = k

    def forward(self, query: str) -> str:
        """Return the top matching passages formatted as plain text.

        Args:
            query: The search query.

        Returns:
            One ``Title: ...\\nText: ...\\n`` entry per match, joined by
            newlines; an empty string when nothing matches.
        """
        docs = self.vector_store.similarity_search(query, k=self.k)
        return "\n".join(
            f"Title: {doc.metadata['title']}\nText: {doc.page_content}\n"
            for doc in docs
        )

    def run(self, query: str) -> str:
        """Backward-compatible alias for ``forward``."""
        return self.forward(query)


9158


In [67]:
# NOTE(review): `scripture_search_tool` is not defined in any cell visible in
# this notebook — presumably left over from an earlier kernel session. Confirm
# where it comes from; as written this cell fails on Restart & Run All.
# Only the first 2000 characters of the result are printed.
print(scripture_search_tool.forward("I struggle with motivation to meditate. What can I do?")[:2000])


 The following scriptures may be helpful:


===== Scripture: The Simile of the Cloth | source: https://www.dhammatalks.org/suttas/MN/MN7.html =====
wouldn’t cleanse
a dark deed.
What can the Sundarikā do?
What the Payāga? What the Bāhuka?
A person of animosity,
one who’s done wrong,
cannot be cleansed there
of evil deeds.
But for one who is pure,
it’s always the Phaggu festival;
for one who is pure,
always the uposatha.
For one who is pure, clean in his deeds,
his practices       always
reach consummation.
Bathe right here, brahman.
Create safety for yourself
with regard to all beings.
If you
don’t tell a lie,
don’t harm living beings,

===== Scripture: To Gaá¹aka MoggallÄna | source: https://www.dhammatalks.org/suttas/MN/MN107.html =====
“What can I do about that, Master Gotama? I’m the one who shows the way.”
“In the same way, brahman—when unbinding is there, and the path leading to unbinding is there, and I am there as the guide—when my disciples are thus exhorted & instructed by

These aren't super useful in their current form. Let's use agents to try to improve the output.

In [None]:
from pathlib import Path
from typing import List

from langchain_community.vectorstores import FAISS
from smolagents import Agent
from smolagents.memory import ConversationMemory

def create_agent(vector_store: FAISS) -> Agent:
    """Create an agent with RAG capabilities.

    The agent is given a single ``SuttaSearchTool`` backed by ``vector_store``
    and a system prompt instructing it to search the suttas and cite sources.

    Args:
        vector_store: FAISS store passed to ``SuttaSearchTool``.

    Returns:
        The configured agent.
    """
    # Create the search tool
    search_tool = SuttaSearchTool(vector_store)

    # The prompt must refer to the tool by its registered name
    # ("search_suttas"), otherwise the model is told to call a tool that
    # does not exist.
    # NOTE(review): confirm that the installed smolagents version exports
    # `Agent` and `ConversationMemory`, and whether a `model` must be supplied;
    # the commented-out cell at the end of the notebook uses CodeAgent with an
    # explicit model instead.
    agent = Agent(
        tools=[search_tool],
        memory=ConversationMemory(),
        system_prompt="""You are a helpful assistant specializing in Buddhist texts and teachings.
        When a user asks a question, use the search_suttas tool to find relevant passages from the suttas.
        Then, provide a thoughtful response based on the found passages.
        Always cite the sources you use in your response.""",
        # model=model,
    )

    return agent

agent = create_agent(vector_store)

# Example usage: interactive question loop. Type 'quit' (or interrupt the
# kernel / close the input prompt) to stop.
while True:
    try:
        query = input("\nEnter your question (or 'quit' to exit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # No input is available (non-interactive run) or the user interrupted
        # the prompt: leave the loop instead of surfacing a traceback.
        break

    if query.lower() == "quit":
        break
    if not query:
        # Nothing to ask; prompt again rather than sending an empty query.
        continue

    response = agent.run(query)
    print("\nResponse:", response)

# # # Initialize the agent
# model = HfApiModel(
#     # model_id='Qwen/Qwen2.5-Coder-32B-Instruct', # it is possible that this model may be overloaded
#     model_id="https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud",
#     token=os.getenv("HF_TOKEN"),
# )
# agent = CodeAgent(tools=[scripture_search_tool], model=model)

# # # Example usage
# response = agent.run(
#     "I struggle with motivation to meditate. What can I do?"
# )

9406


['Start with short sessions and gradually increase the duration.', 'Find a comfortable and quiet place to meditate.', 'Set small, achievable goals for your practice.', 'Be consistent and make meditation a regular habit.', 'Join a meditation group or find a meditation buddy.', 'Reflect on the benefits of meditation.', 'Be patient and kind with yourself.']
