In [1]:
import torch
import numpy as np
from datasets import load_from_disk
from transformers import AutoModel, AutoConfig
import sentencepiece as spm
from sklearn.metrics.pairwise import cosine_similarity


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the Sanskrit question/answer dataset from a local directory.
# `load_from_disk` is already imported in the notebook's import cell, so it is
# not imported a second time here.
DATA_PATH = "bhagavad_gita_qa_sanskrit_partial"   # a FOLDER, not a single file

dataset = load_from_disk(DATA_PATH)

# Fail fast if the columns read by the retrieval cells are absent, instead of
# surfacing a less obvious error later in the notebook.
REQUIRED_COLUMNS = ("question_sa", "answer_sa")
missing_columns = [name for name in REQUIRED_COLUMNS if name not in dataset.column_names]
if missing_columns:
    raise KeyError(f"Dataset at {DATA_PATH!r} is missing columns: {missing_columns}")

print("Dataset loaded:", dataset)
print("Number of rows:", len(dataset))
# Indexing row 0 of an empty dataset raises, so only show a sample when one exists.
if len(dataset) > 0:
    print("Sample row:\n", dataset[0])


Dataset loaded: Dataset({
    features: ['chapter_no', 'verse_no', 'question', 'answer', 'question_sa', 'answer_sa'],
    num_rows: 842
})
Number of rows: 842
Sample row:
 {'chapter_no': 1, 'verse_no': 1, 'question': 'Why does Dhritarashtra ask Sanjaya to describe the battlefield?', 'answer': 'Dhritarashtra is blind, both physically and symbolically ‚Äî representing ignorance. He wants Sanjaya to narrate what is happening because he is anxious about the outcome of the war and whether his sons (Kauravas) will win.', 'question_sa': '‡§ß‡•É‡§§‡§∞‡§æ‡§∑‡•ç‡§ü‡•ç‡§∞‡§É ‡§∏‡§û‡•ç‡§ú‡§Ø‡§Ç ‡§Ø‡•Å‡§¶‡•ç‡§ß‡§ï‡•ç‡§∑‡•á‡§§‡•ç‡§∞‡§∏‡•ç‡§Ø ‡§µ‡§∞‡•ç‡§£‡§®‡§Ç ‡§ï‡§ø‡§Æ‡§∞‡•ç‡§•‡§Ç ‡§™‡•ç‡§∞‡§æ‡§∞‡•ç‡§•‡§Ø‡§§‡§ø?', 'answer_sa': '‡§ß‡•É‡§§‡§∞‡§æ‡§∑‡•ç‡§ü‡•ç‡§∞‡§É ‡§Ö‡§®‡•ç‡§ß‡§É ‡§Ö‡§∏‡•ç‡§§‡§ø, ‡§∂‡§æ‡§∞‡•Ä‡§∞‡§ø‡§ï‡§∞‡•Ç‡§™‡•á‡§£ ‡§™‡•ç‡§∞‡§§‡•Ä‡§ï‡§æ‡§§‡•ç‡§Æ‡§ï‡§∞‡•Ç‡§™‡•á‡§£ ‡§ö-‡§Ö‡§ú‡•ç‡§û‡§æ‡§®‡§∏‡•ç‡§Ø ‡§™‡•ç‡§∞‡§§‡§ø‡§®‡§ø‡§ß‡§ø‡§§‡•ç‡§µ‡§Ç ‡§ï‡§∞‡•ã‡§§‡§ø‡•§ ‡§∏‡§É ‡§á‡§ö‡•ç‡§õ‡§§‡§ø ‡§Ø‡§

In [4]:
# Create the SentencePiece tokenizer and read its model file, which is
# resolved relative to the current working directory.
sp = spm.SentencePieceProcessor()
sp.Load("sp_unigram_64k.model")

print("Tokenizer loaded.")


Tokenizer loaded.


In [5]:
MODEL_PATH = "bert-sanskrit-light"

print("Loading SanskritBERT model...")

# Use the GPU when CUDA is available, otherwise stay on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

config = AutoConfig.from_pretrained(MODEL_PATH)

# Load the weights, move them to the chosen device and switch to inference
# mode in one chain; `.to()` and `.eval()` both return the module itself.
model = AutoModel.from_pretrained(MODEL_PATH).to(DEVICE).eval()

print("Model loaded on", DEVICE)


Loading SanskritBERT model...


Some weights of BertModel were not initialized from the model checkpoint at bert-sanskrit-light and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded on cpu


In [6]:
def encode(text):
    """Embed one piece of text as a single fixed-size vector.

    The text is converted to SentencePiece ids with the notebook-level ``sp``
    tokenizer, passed through the notebook-level ``model`` on ``DEVICE``, and
    the final hidden states are averaged over the token axis (mean pooling).

    Args:
        text: The string to embed.

    Returns:
        A 1-D NumPy array: the mean of ``last_hidden_state`` over all tokens.

    Raises:
        ValueError: If tokenization produces no ids (for example an empty
            string), since there would be nothing to average.
    """
    ids = sp.encode_as_ids(text)
    if not ids:
        raise ValueError("encode() received text that produced no tokens")

    # Keep the sequence within the model's position-embedding table when the
    # config exposes its size; longer inputs cannot be indexed by the model.
    max_positions = getattr(model.config, "max_position_embeddings", None)
    if max_positions is not None and len(ids) > max_positions:
        ids = ids[:max_positions]

    # Build the batch of one directly on the target device. The mask is all
    # ones because a single unpadded sequence has no positions to hide.
    input_ids = torch.tensor([ids], dtype=torch.long, device=DEVICE)
    attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

    # Use mean pooling over token embeddings
    token_embeddings = outputs.last_hidden_state  # (1, seq_len, hidden)
    sentence_embedding = token_embeddings.mean(dim=1)  # (1, hidden)

    return sentence_embedding.cpu().numpy()[0]


In [7]:
print("\nEncoding all Sanskrit questions...")

# Columns used by retrieval: questions are embedded, answers are only displayed.
question_texts = dataset["question_sa"]
answer_texts   = dataset["answer_sa"]

num_questions = len(question_texts)
encoded_rows = []

for position, sanskrit_question in enumerate(question_texts):
    encoded_rows.append(encode(sanskrit_question))

    # Report progress on every 50th question (including the very first one).
    if position % 50 == 0:
        print(f"Encoded {position}/{num_questions} questions...")

# Stack the per-question vectors into one 2-D array, one row per question.
question_embeddings = np.array(encoded_rows)

print("\nAll question embeddings ready.")
print("Embedding shape:", question_embeddings.shape)



Encoding all Sanskrit questions...
Encoded 0/842 questions...
Encoded 50/842 questions...
Encoded 100/842 questions...
Encoded 150/842 questions...
Encoded 200/842 questions...
Encoded 250/842 questions...
Encoded 300/842 questions...
Encoded 350/842 questions...
Encoded 400/842 questions...
Encoded 450/842 questions...
Encoded 500/842 questions...
Encoded 550/842 questions...
Encoded 600/842 questions...
Encoded 650/842 questions...
Encoded 700/842 questions...
Encoded 750/842 questions...
Encoded 800/842 questions...

All question embeddings ready.
Embedding shape: (842, 256)


In [8]:
def query_model(query, top_k=5, max_answer_chars=300):
    """Print the stored questions most similar to ``query`` with their answers.

    The query is embedded with ``encode`` and compared by cosine similarity
    against every row of the notebook-level ``question_embeddings``. The best
    matches are printed in descending order of similarity.

    Args:
        query: Text to search for.
        top_k: Maximum number of matches to print. Must not be negative.
        max_answer_chars: Answers longer than this many characters are cut
            and marked with a trailing ``...``; shorter answers are printed
            in full without the marker.

    Returns:
        None. All results are written to standard output.

    Raises:
        ValueError: If ``top_k`` is negative (a negative slice bound would
            silently drop results from the end instead of limiting them).
    """
    if top_k < 0:
        raise ValueError(f"top_k must not be negative, got {top_k!r}")

    print("\n==============================")
    print("QUERY:", query)
    print("==============================")

    query_emb = encode(query)

    # Cosine similarity with all stored questions
    sims = cosine_similarity([query_emb], question_embeddings)[0]

    # Top K matches, highest similarity first
    top_indices = np.argsort(sims)[::-1][:top_k]

    print("\nüîù Top Matches:\n")

    for rank, idx in enumerate(top_indices, 1):
        # Convert the NumPy integer to a plain int before indexing the text
        # containers, so lookup does not depend on NumPy-scalar support.
        row = int(idx)
        answer = answer_texts[row]

        print(f"--- Rank {rank} | Similarity: {sims[row]:.4f} ---")
        print("Matched Question:", question_texts[row])
        if len(answer) > max_answer_chars:
            # Only show the ellipsis when text was actually cut off.
            print("Answer:", answer[:max_answer_chars], "...")
        else:
            print("Answer:", answer)
        print()


In [9]:
# Retrieve the five stored questions closest to this Sanskrit query.
# NOTE(review): the literal looks like Devanagari text shown through the wrong
# character encoding - confirm the notebook file's encoding.
query = "‡§ß‡•É‡§§‡§∞‡§æ‡§∑‡•ç‡§ü‡•ç‡§∞‡§É ‡§∏‡§Ç‡§ú‡§Ø‡§Æ‡•ç ‡§Ø‡•Å‡§¶‡•ç‡§ß‡§≠‡•Ç‡§Æ‡•á‡§É ‡§µ‡§∞‡•ç‡§£‡§®‡§Ç ‡§ï‡§ø‡§Æ‡§∞‡•ç‡§•‡§Ç ‡§™‡•É‡§ö‡•ç‡§õ‡§§‡§ø?"

query_model(query=query, top_k=5)



QUERY: ‡§ß‡•É‡§§‡§∞‡§æ‡§∑‡•ç‡§ü‡•ç‡§∞‡§É ‡§∏‡§Ç‡§ú‡§Ø‡§Æ‡•ç ‡§Ø‡•Å‡§¶‡•ç‡§ß‡§≠‡•Ç‡§Æ‡•á‡§É ‡§µ‡§∞‡•ç‡§£‡§®‡§Ç ‡§ï‡§ø‡§Æ‡§∞‡•ç‡§•‡§Ç ‡§™‡•É‡§ö‡•ç‡§õ‡§§‡§ø?

üîù Top Matches:

--- Rank 1 | Similarity: 0.8822 ---
Matched Question: ‡§ß‡•É‡§§‡§∞‡§æ‡§∑‡•ç‡§ü‡•ç‡§∞‡§É ‡§∏‡§û‡•ç‡§ú‡§Ø‡§Ç ‡§Ø‡•Å‡§¶‡•ç‡§ß‡§ï‡•ç‡§∑‡•á‡§§‡•ç‡§∞‡§∏‡•ç‡§Ø ‡§µ‡§∞‡•ç‡§£‡§®‡§Ç ‡§ï‡§ø‡§Æ‡§∞‡•ç‡§•‡§Ç ‡§™‡•ç‡§∞‡§æ‡§∞‡•ç‡§•‡§Ø‡§§‡§ø?
Answer: ‡§ß‡•É‡§§‡§∞‡§æ‡§∑‡•ç‡§ü‡•ç‡§∞‡§É ‡§Ö‡§®‡•ç‡§ß‡§É ‡§Ö‡§∏‡•ç‡§§‡§ø, ‡§∂‡§æ‡§∞‡•Ä‡§∞‡§ø‡§ï‡§∞‡•Ç‡§™‡•á‡§£ ‡§™‡•ç‡§∞‡§§‡•Ä‡§ï‡§æ‡§§‡•ç‡§Æ‡§ï‡§∞‡•Ç‡§™‡•á‡§£ ‡§ö-‡§Ö‡§ú‡•ç‡§û‡§æ‡§®‡§∏‡•ç‡§Ø ‡§™‡•ç‡§∞‡§§‡§ø‡§®‡§ø‡§ß‡§ø‡§§‡•ç‡§µ‡§Ç ‡§ï‡§∞‡•ã‡§§‡§ø‡•§ ‡§∏‡§É ‡§á‡§ö‡•ç‡§õ‡§§‡§ø ‡§Ø‡§§‡•ç ‡§∏‡§û‡•ç‡§ú‡§Ø‡§É ‡§ï‡§ø‡§Ç ‡§ò‡§ü‡§Ø‡§§‡§ø ‡§á‡§§‡§ø ‡§µ‡§∞‡•ç‡§£‡§Ø‡§§‡•Å, ‡§Ø‡§§‡§É ‡§∏‡§É ‡§Ø‡•Å‡§¶‡•ç‡§ß‡§∏‡•ç‡§Ø ‡§™‡§∞‡§ø‡§£‡§æ‡§Æ‡§∏‡•ç‡§Ø ‡§µ‡§ø‡§∑‡§Ø‡•á ‡§ö‡§ø‡§®‡•ç‡§§‡§ø‡§§‡§É ‡§Ö‡§∏‡•ç‡§§‡§ø, ‡§§‡§∏‡•ç‡§Ø ‡§™‡•Å‡§§‡•ç‡§∞‡§æ‡§É (‡§ï‡•å‡§∞‡§µ‡§æ‡§É) ‡§µ‡§ø‡§ú‡§Ø‡§®‡•ç‡§§‡§ø ‡§µ‡§æ ‡§á‡§§‡§ø ‡§ö‡•§

In [10]:
# Retrieve the three stored questions closest to this Sanskrit query.
# NOTE(review): the literal looks like Devanagari text shown through the wrong
# character encoding - confirm the notebook file's encoding.
query = "‡§ï‡§ø‡§Æ‡§∞‡•ç‡§•‡§Ç ‡§∏‡§Ç‡§ú‡§Ø‡§Æ‡•ç ‡§ß‡•É‡§§‡§∞‡§æ‡§∑‡•ç‡§ü‡•ç‡§∞‡§É ‡§Ø‡•Å‡§¶‡•ç‡§ß‡§≠‡•Ç‡§Æ‡•á‡§É ‡§µ‡§∞‡•ç‡§£‡§®‡§Ç ‡§™‡•É‡§ö‡•ç‡§õ‡§§‡§ø?"
query_model(query=query, top_k=3)


QUERY: ‡§ï‡§ø‡§Æ‡§∞‡•ç‡§•‡§Ç ‡§∏‡§Ç‡§ú‡§Ø‡§Æ‡•ç ‡§ß‡•É‡§§‡§∞‡§æ‡§∑‡•ç‡§ü‡•ç‡§∞‡§É ‡§Ø‡•Å‡§¶‡•ç‡§ß‡§≠‡•Ç‡§Æ‡•á‡§É ‡§µ‡§∞‡•ç‡§£‡§®‡§Ç ‡§™‡•É‡§ö‡•ç‡§õ‡§§‡§ø?

üîù Top Matches:

--- Rank 1 | Similarity: 0.8453 ---
Matched Question: ‡§ß‡•É‡§§‡§∞‡§æ‡§∑‡•ç‡§ü‡•ç‡§∞‡§É ‡§∏‡§û‡•ç‡§ú‡§Ø‡§Ç ‡§Ø‡•Å‡§¶‡•ç‡§ß‡§ï‡•ç‡§∑‡•á‡§§‡•ç‡§∞‡§∏‡•ç‡§Ø ‡§µ‡§∞‡•ç‡§£‡§®‡§Ç ‡§ï‡§ø‡§Æ‡§∞‡•ç‡§•‡§Ç ‡§™‡•ç‡§∞‡§æ‡§∞‡•ç‡§•‡§Ø‡§§‡§ø?
Answer: ‡§ß‡•É‡§§‡§∞‡§æ‡§∑‡•ç‡§ü‡•ç‡§∞‡§É ‡§Ö‡§®‡•ç‡§ß‡§É ‡§Ö‡§∏‡•ç‡§§‡§ø, ‡§∂‡§æ‡§∞‡•Ä‡§∞‡§ø‡§ï‡§∞‡•Ç‡§™‡•á‡§£ ‡§™‡•ç‡§∞‡§§‡•Ä‡§ï‡§æ‡§§‡•ç‡§Æ‡§ï‡§∞‡•Ç‡§™‡•á‡§£ ‡§ö-‡§Ö‡§ú‡•ç‡§û‡§æ‡§®‡§∏‡•ç‡§Ø ‡§™‡•ç‡§∞‡§§‡§ø‡§®‡§ø‡§ß‡§ø‡§§‡•ç‡§µ‡§Ç ‡§ï‡§∞‡•ã‡§§‡§ø‡•§ ‡§∏‡§É ‡§á‡§ö‡•ç‡§õ‡§§‡§ø ‡§Ø‡§§‡•ç ‡§∏‡§û‡•ç‡§ú‡§Ø‡§É ‡§ï‡§ø‡§Ç ‡§ò‡§ü‡§Ø‡§§‡§ø ‡§á‡§§‡§ø ‡§µ‡§∞‡•ç‡§£‡§Ø‡§§‡•Å, ‡§Ø‡§§‡§É ‡§∏‡§É ‡§Ø‡•Å‡§¶‡•ç‡§ß‡§∏‡•ç‡§Ø ‡§™‡§∞‡§ø‡§£‡§æ‡§Æ‡§∏‡•ç‡§Ø ‡§µ‡§ø‡§∑‡§Ø‡•á ‡§ö‡§ø‡§®‡•ç‡§§‡§ø‡§§‡§É ‡§Ö‡§∏‡•ç‡§§‡§ø, ‡§§‡§∏‡•ç‡§Ø ‡§™‡•Å‡§§‡•ç‡§∞‡§æ‡§É (‡§ï‡•å‡§∞‡§µ‡§æ‡§É) ‡§µ‡§ø‡§ú‡§Ø‡§®‡•ç‡§§‡§ø ‡§µ‡§æ ‡§á‡§§‡§ø ‡§ö‡•§

In [11]:
# Retrieve the five stored questions closest to this short Sanskrit query.
# NOTE(review): the literal looks like Devanagari text shown through the wrong
# character encoding - confirm the notebook file's encoding.
query = "‡§ï‡§∞‡•ç‡§Æ‡§Ø‡•ã‡§ó‡§∏‡•ç‡§Ø ‡§Æ‡§π‡§§‡•ç‡§§‡•ç‡§µ‡§Ç ‡§ï‡§ø‡§Æ‡•ç?"
query_model(query=query, top_k=5)



QUERY: ‡§ï‡§∞‡•ç‡§Æ‡§Ø‡•ã‡§ó‡§∏‡•ç‡§Ø ‡§Æ‡§π‡§§‡•ç‡§§‡•ç‡§µ‡§Ç ‡§ï‡§ø‡§Æ‡•ç?

üîù Top Matches:

--- Rank 1 | Similarity: 0.8701 ---
Matched Question: ‡§™‡§¶‡•ç‡§Ø‡•á ‡§â‡§≤‡•ç‡§≤‡§ø‡§ñ‡§ø‡§§‡§∏‡•ç‡§Ø ‡§∞‡§•‡§∏‡•ç‡§Ø ‡§Æ‡§π‡§§‡•ç‡§§‡•ç‡§µ‡§Ç ‡§ï‡§ø‡§Æ‡•ç?
Answer: ‡§Ö‡§Ø‡§Ç ‡§∞‡§•‡§É ‡§µ‡•à‡§≠‡§µ‡§Ø‡•Å‡§§‡§É, ‡§∂‡•ç‡§µ‡•á‡§§-‡§Ö‡§∂‡•ç‡§µ‡•à‡§É ‡§Ü‡§π‡•É‡§§‡§É ‡§ö ‡§á‡§§‡§ø ‡§µ‡§∞‡•ç‡§£‡§ø‡§§‡§É ‡§Ö‡§∏‡•ç‡§§‡§ø, ‡§Ø‡§É ‡§™‡§µ‡§ø‡§§‡•ç‡§∞‡§§‡§æ‡§Ø‡§æ‡§É, ‡§¶‡•à‡§µ‡§ø‡§ï-‡§Ö‡§®‡•Å‡§ó‡•ç‡§∞‡§π‡§∏‡•ç‡§Ø ‡§ö ‡§™‡•ç‡§∞‡§§‡•Ä‡§ï‡§É ‡§Ö‡§∏‡•ç‡§§‡§ø‡•§ ‡§è‡§§‡§§‡•ç ‡§∂‡•ç‡§∞‡•Ä‡§ï‡•É‡§∑‡•ç‡§£‡§∏‡•ç‡§Ø ‡§Ö‡§∞‡•ç‡§ú‡•Å‡§®‡§∏‡•ç‡§Ø ‡§ö ‡§≠‡§µ‡•ç‡§Ø‡§§‡§æ‡§Ç, ‡§¶‡•à‡§µ‡§ø‡§ï-‡§∏‡§Æ‡§∞‡•ç‡§•‡§®‡§Ç ‡§ö ‡§™‡•ç‡§∞‡§ï‡§æ‡§∂‡§Ø‡§§‡§ø‡•§ ...

--- Rank 2 | Similarity: 0.8322 ---
Matched Question: ‡§Ö‡§∏‡•ç‡§Æ‡§ø‡§®‡•ç ‡§∂‡•ç‡§≤‡•ã‡§ï‡§∏‡•ç‡§Ø ‡§µ‡•É‡§∑‡•ç‡§ü‡•ç‡§Ø‡§æ ‡§Ø‡§ú‡•ç‡§û‡§∏‡•ç‡§Ø (‡§Ø‡§ú‡•ç‡§û‡§∏‡•ç‡§Ø) ‡§∏‡§Ç‡§Ø‡•ã‡§ó‡§∏‡•ç‡§Ø ‡§Æ‡§π‡§§‡•ç‡§§‡•ç‡§µ‡§Ç ‡§ï‡§ø‡§Æ‡•ç?
Answer: ‡§Ø‡§ú‡•ç‡§û‡§Ç ‡§µ‡•É‡§