<a href="https://colab.research.google.com/github/swapnildahare/RAG_Sanskrit_Swapnil_Dahare/blob/main/Sanksrit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install sentence-transformers faiss-cpu transformers python-docx indic-transliteration pdfminer.six




In [3]:
import os
import re
import json
import numpy as np
import faiss
from docx import Document
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from indic_transliteration.sanscript import transliterate, DEVANAGARI, IAST


In [4]:
def read_docx(path):
    doc = Document(path)
    full_text = []
    for para in doc.paragraphs:
        full_text.append(para.text)
    return "\n".join(full_text)

In [5]:
doc_path = "/content/Rag-docs (1).docx"
raw_text = read_docx(doc_path)

print("DOCUMENT LOADED")
print(raw_text[:500])

DOCUMENT LOADED
‡§Æ‡•Ç‡§∞‡•ç‡§ñ‡§≠‡•É‡§§‡•ç‡§Ø‡§∏‡•ç‡§Ø

"‡§Ö‡§∞‡•á ‡§∂‡§Ç‡§ñ‡§®‡§æ‡§¶, ‡§ó‡§ö‡•ç‡§õ‡§æ‡§™‡§£‡§Æ‡•ç, ‡§∂‡§∞‡•ç‡§ï‡§∞‡§æ‡§Æ‡•ç ‡§Ü‡§®‡§Ø ‡•§" ‡§á‡§§‡§ø ‡§∏‡•ç‡§µ‡§≠‡•É‡§§‡•ç‡§Ø‡§Æ‡•ç ‡§∂‡§Ç‡§ñ‡§®‡§æ‡§¶‡§Æ‡•ç ‡§ó‡•ã‡§µ‡§∞‡•ç‡§ß‡§®‡§¶‡§æ‡§∏‡§É ‡§Ü‡§¶‡§ø‡§∂‡§§‡§ø ‡•§ ‡§§‡§§‡§É ‡§∂‡§Ç‡§ñ‡§®‡§æ‡§¶‡§É ‡§Ü‡§™‡§£‡§Æ‡•ç ‡§ó‡§ö‡•ç‡§õ‡§§‡§ø, ‡§∂‡§∞‡•ç‡§ï‡§∞‡§æ‡§Æ‡•ç ‡§ú‡•Ä‡§∞‡•ç‡§£‡•á ‡§µ‡§∏‡•ç‡§§‡•ç‡§∞‡•á ‡§®‡•ç‡§Ø‡§∏‡•ç‡§Ø‡§§‡§ø ‡§ö ‡•§ ‡§§‡§∏‡•ç‡§Æ‡§æ‡§§‡•ç ‡§ú‡•Ä‡§∞‡•ç‡§£‡§µ‡§∏‡•ç‡§§‡•ç‡§∞‡§æ‡§§‡•ç ‡§Æ‡§æ‡§∞‡•ç‡§ó‡•á ‡§è‡§µ ‡§∏‡§∞‡•ç‡§µ‡§æ‡§™‡§ø ‡§∂‡§∞‡•ç‡§ï‡§∞‡§æ ‡§∏‡•ç‡§§‡•ç‡§∞‡§µ‡§§‡§ø ‡•§ ‡§§‡§§‡§É ‡§ó‡•ã‡§µ‡§∞‡•ç‡§ß‡§®‡§¶‡§æ‡§∏‡§É ‡§ï‡•ã‡§™‡•á‡§® ‡§∂‡§Ç‡§ñ‡§®‡§æ‡§¶‡§Æ‡•ç ‡§µ‡§¶‡§§‡§ø, "‡§Ö‡§∞‡•á ‡§Æ‡•Ç‡§¢, ‡§ï‡•Å‡§§‡•ç‡§∞‡§æ‡§∏‡•ç‡§§‡§ø ‡§∂‡§∞‡•ç‡§ï‡§∞‡§æ ? ‡§∂‡§∞‡•ç‡§ï‡§∞‡§æ‡§¶‡§ø‡§ï‡§Æ‡•ç ‡§è‡§µ‡§Æ‡•ç ‡§ú‡•Ä‡§∞‡•ç‡§£‡•á‡§® ‡§µ‡§∏‡•ç‡§§‡•ç‡§∞‡•á‡§£ ‡§® ‡§è‡§µ‡§æ‡§®‡§Ø‡§®‡•ç‡§§‡§ø ‡§ï‡§¶‡§æ‡§™‡§ø ‡•§ ‡§á‡§§‡§É‡§™‡§∞‡§Æ‡•ç ‡§ï‡§ø‡§Æ‡§™‡§ø ‡§µ‡§∏‡•ç‡§§‡•Å‡§ú‡§æ‡§§‡§Æ‡•ç ‡§¶‡

In [6]:
def normalize_text(text):
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

norm_text = normalize_text(raw_text)

In [7]:
CHUNK_SIZE = 800
STRIDE = 400

def chunk_text(text, doc_id="sanskrit_doc"):
    chunks = []
    start = 0
    end = len(text)
    idx = 0

In [8]:
def chunk_text(text, doc_id="sanskrit_doc"):
    chunks = []
    start = 0
    end = len(text)
    idx = 0
    while start < end:
        chunk = text[start: start + CHUNK_SIZE]

        if any('\u0900' <= c <= '\u097F' for c in chunk):
            deva = chunk
            try:
                translit = transliterate(chunk, DEVANAGARI, IAST)
            except:
                translit = chunk
        else:
            translit = chunk
            try:
                deva = transliterate(chunk, IAST, DEVANAGARI)
            except:
                deva = chunk

        chunks.append({
            "chunk_id": f"{doc_id}_{idx}",
            "text_deva": deva,
            "text_translit": translit
        })

        idx += 1
        start += STRIDE

    return chunks

In [9]:
chunks = chunk_text(norm_text)
print("Total chunks:", len(chunks))

Total chunks: 23


In [10]:
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

corpus_embeddings = embedding_model.encode(
    [c["text_deva"] for c in chunks],
    convert_to_numpy=True,
    show_progress_bar=True
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
faiss.normalize_L2(corpus_embeddings)
dim = corpus_embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(corpus_embeddings)

print("FAISS index built")

FAISS index built


In [12]:
def is_devanagari(text):
    return any("\u0900" <= ch <= "\u097F" for ch in text)

In [13]:
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
generator = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")


model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [14]:
def retrieve(query, top_k=5):
    if not is_devanagari(query):
        query = transliterate(query, IAST, DEVANAGARI)

    q_emb = embedding_model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(q_emb)

    D, I = index.search(q_emb, top_k)

    results = []
    for score, idx in zip(D[0], I[0]):
        results.append({
            "score": float(score),
            "chunk": chunks[idx]["text_deva"],
            "id": chunks[idx]["chunk_id"]
        })

    return results

In [15]:
def build_prompt(query, retrieved):
    context = ""
    for r in retrieved:
        context += f"[{r['id']}]: {r['chunk']}\n\n"

    prompt = (
        "Answer the question in Sanskrit using ONLY the context.\n"
        "If not found, say '‡§® ‡§µ‡§ø‡§¶‡•ç‡§Ø‡§§‡•á'.\n\n"
        f"Context:\n{context}\n"
        f"Question:\n{query}\n\nAnswer:"
    )
    return prompt

In [16]:
def generate_answer(query):
    retrieved = retrieve(query, top_k=5)
    prompt = build_prompt(query, retrieved)

    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = generator.generate(inputs["input_ids"], max_length=200)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return answer, retrieved

In [17]:
##example for test
query = "‡§ï‡§É ‡§Æ‡•Ç‡§∞‡•ç‡§ñ‡§≠‡•É‡§§‡•ç‡§Ø‡§É ?"
answer, context = generate_answer(query)

print("üîπ Answer:\n", answer)
print("\nüîπ Retrieved chunks:")
for c in context:
    print(c["id"], "‚Üí", c["score"])
    print(c["chunk"][:150], "...\n")

Token indices sequence length is longer than the specified maximum sequence length for this model (1223 > 512). Running this sequence through the model will result in indexing errors


üîπ Answer:
  

üîπ Retrieved chunks:
sanskrit_doc_0 ‚Üí 0.49122685194015503
‡§Æ‡•Ç‡§∞‡•ç‡§ñ‡§≠‡•É‡§§‡•ç‡§Ø‡§∏‡•ç‡§Ø "‡§Ö‡§∞‡•á ‡§∂‡§Ç‡§ñ‡§®‡§æ‡§¶, ‡§ó‡§ö‡•ç‡§õ‡§æ‡§™‡§£‡§Æ‡•ç, ‡§∂‡§∞‡•ç‡§ï‡§∞‡§æ‡§Æ‡•ç ‡§Ü‡§®‡§Ø ‡•§" ‡§á‡§§‡§ø ‡§∏‡•ç‡§µ‡§≠‡•É‡§§‡•ç‡§Ø‡§Æ‡•ç ‡§∂‡§Ç‡§ñ‡§®‡§æ‡§¶‡§Æ‡•ç ‡§ó‡•ã‡§µ‡§∞‡•ç‡§ß‡§®‡§¶‡§æ‡§∏‡§É ‡§Ü‡§¶‡§ø‡§∂‡§§‡§ø ‡•§ ‡§§‡§§‡§É ‡§∂‡§Ç‡§ñ‡§®‡§æ‡§¶‡§É ‡§Ü‡§™‡§£‡§Æ‡•ç ‡§ó‡§ö‡•ç‡§õ‡§§‡§ø, ‡§∂‡§∞‡•ç‡§ï‡§∞‡§æ‡§Æ‡•ç ‡§ú‡•Ä‡§∞‡•ç‡§£‡•á ‡§µ‡§∏‡•ç‡§§‡•ç‡§∞‡•á ‡§® ...

sanskrit_doc_10 ‚Üí 0.4661385715007782
‡§Ø‡§æ‡§É ‡§ö‡§æ‡§∞‡•ç‡§§‡•Å‡§Ø‡§Æ‡•ç ‡§Ü‡§∏‡•Ä‡§§‡•ç ‡§ö‡§ø‡§§‡•ç‡§∞‡§™‡•Å‡§∞‡§Æ‡•ç ‡§®‡§æ‡§Æ ‡§ï‡§ø‡§Æ‡§™‡§ø ‡§®‡§ó‡§∞‡§Ç ‡§∂‡•ç‡§∞‡•Ä‡§™‡§∞‡•ç‡§µ‡§§‡§∏‡•ç‡§Ø ‡§∏‡§Æ‡•Ä‡§™‡•á ‡•§ "‡§™‡§∞‡•ç‡§µ‡§§‡§∏‡•ç‡§Ø ‡§∂‡§ø‡§ñ‡§∞‡§™‡•ç‡§∞‡§¶‡•á‡§∂‡•á ‡§ò‡§£‡•ç‡§ü‡§æ‡§ï‡§∞‡•ç‡§£‡§É ‡§®‡§æ‡§Æ ‡§∞‡§æ‡§ï‡•ç‡§∑‡§∏‡§É ‡§™‡•ç‡§∞‡§§‡§ø‡§µ‡§∏‡§§‡•Ä" ‡§§‡§ø ‡§ú‡§®‡§™‡•ç‡§∞‡§µ‡§æ‡§¶‡§É ‡§Ö‡§µ‡§∞‡•ç‡§§‡§§‡•ç ‡•§ ‡§Ö‡§•‡•à‡§ï ...

sanskrit_doc_11 ‚Üí 0.4353455603122711
‡§£‡•ç‡§ü‡§æ‡§£‡§æ‡§¶‡§Ç 

In [21]:
query ="‡§ò‡•ã‡§∑‡§ø‡§§‡§Ç ‡§ï‡§¶‡§æ‡§ö‡§ø‡§§‡•ç ‡§≠‡•ã‡§ú‡§∞‡§æ‡§ú‡•ç‡§û‡§æ"
answer, context = generate_answer(query)

print("üîπ Answer:\n", answer)
print("\nüîπ Retrieved chunks:")
for c in context:
    print(c["id"], "‚Üí", c["score"])
    print(c["chunk"][:150], "...\n")

üîπ Answer:
                                                                                                    

üîπ Retrieved chunks:
sanskrit_doc_13 ‚Üí 0.8038051128387451
‡§µ‡§∏‡•ç‡§Ø ‡§™‡•ç‡§∞‡§æ‡§∞‡•ç‡§•‡§®‡§æ‡§Æ‡•ç ‡§ï‡§∞‡•ã‡§§‡§ø ‡•§ "‡§¶‡•á‡§µ, ‡§ï‡•É‡§™‡§Ø‡§æ ‡§Æ‡§π‡•ç‡§Ø‡§Ç ‡§Ü‡§∞‡•ã‡§ó‡•ç‡§Ø‡§Æ‡•ç ‡§¶‡§¶‡§æ‡§§‡•Å, ‡§ß‡§®‡§Æ‡•ç ‡§¶‡§¶‡§æ‡§§‡•Å" ‡§á‡§§‡§ø ‡•§ ‡§∏‡§É ‡§ï‡§ø‡§Ç‡§ö‡§ø‡§§‡•ç ‡§Ö‡§™‡§ø ‡§™‡•ç‡§∞‡§Ø‡§§‡•ç‡§®‡§Æ‡•ç ‡§® ‡§ï‡§∞‡•ã‡§§‡§ø, ‡§ï‡§æ‡§∞‡•ç‡§Ø‡§Æ‡•ç ‡§® ‡§ï‡§∞‡•ã‡§§‡§ø ‡•§ ‡§¶‡•á‡§µ‡§É ‡§∏‡§æ‡§π‡§æ‡§Ø‡•ç‡§Ø‡§Æ‡•ç ‡§ï‡§∞‡§ø‡§∑‡•ç‡§Ø ...

sanskrit_doc_12 ‚Üí 0.7983924150466919
‡§≠‡•É‡§§‡§Æ‡•ç ‡§Ö‡§§‡§ø‡§∑‡•ç‡§†‡§§‡•ç ‡•§ '‡§µ‡§æ‡§®‡§∞‡§æ‡§É ‡§è‡§µ ‡§ò‡§£‡•ç‡§ü‡§æ‡§Ç ‡§µ‡§æ‡§¶‡§Ø‡§®‡•ç‡§§‡§ø' ‡§á‡§§‡§ø ‡§∏‡§æ ‡§Ö‡§™‡§∂‡•ç‡§Ø‡§§‡•ç ‡•§ ‡§Ö‡§®‡•ç‡§Ø‡•á‡§¶‡•ç‡§Ø‡•Å‡§É ‡§∏‡§æ ‡§µ‡§æ‡§®‡§∞‡•á‡§≠‡•ç‡§Ø‡§É ‡§Æ‡§ß‡•Å‡§∞‡§æ‡§£‡§ø ‡§´‡§≤‡§æ‡§£‡§ø ‡§Ö‡§Ø‡§ö‡•ç‡§õ‡§§‡•ç ‡•§ ‡§Ø‡§æ‡§µ‡§§‡•ç ‡§§‡•á ‡§´‡§≤‡§≠‡§ï‡•ç‡§∑‡§£‡§Æ‡§ó‡•ç‡§®‡§æ‡§É ‡§∏‡§Ç‡§ú‡§æ‡§§‡§æ‡§É, ‡§§‡§æ‡§µ‡§¶