In [24]:
import pytesseract
from pdf2image import convert_from_path
import google.generativeai as genai
from pathlib import Path
import os
import time

# Input scan, output Markdown, Tesseract language packs, and Gemini model id.
PDF_PATH = "Geeta-demo-1-10.pdf"
OUTPUT_FILE = "geeta_cleaned_rotated_keys.md"
LANG = "guj+san+eng"
GEMINI_MODEL = "models/gemini-2.5-flash"


def load_api_keys(env_var="GEMINI_API_KEYS", environ=None):
    """Return the API keys stored comma-separated in an environment variable.

    Args:
        env_var: Name of the variable to read.
        environ: Mapping to read from; defaults to ``os.environ``.

    Returns:
        List of non-empty keys, whitespace-trimmed, in the order given.
        Empty when the variable is unset or blank.
    """
    source = os.environ if environ is None else environ
    return [key.strip() for key in source.get(env_var, "").split(",") if key.strip()]


# SECURITY: API keys must never be committed in a notebook. Supply them through
# the environment instead, e.g. GEMINI_API_KEYS="key1,key2,key3" (tried in that
# order), or a single key in GOOGLE_API_KEY. Any key that was previously
# hardcoded here should be treated as leaked and revoked.
API_KEYS = load_api_keys() or load_api_keys("GOOGLE_API_KEY")

# Convert the PDF at PDF_PATH to images at 300 DPI. The loop further down treats
# each element as one page: it reports len(images) as the page count and runs
# OCR on every element in turn.
images = convert_from_path(PDF_PATH, dpi=300)

def call_gemini(prompt, api_keys=None, retry_delay=1):
    """Send ``prompt`` to Gemini, trying each configured API key in order.

    Args:
        prompt: Text passed to ``generate_content``.
        api_keys: Keys to try, in order. Defaults to the module-level
            ``API_KEYS``.
        retry_delay: Seconds to wait before moving on to the next key.

    Returns:
        The stripped text of the first successful response.

    Raises:
        RuntimeError: If no keys are configured, or if every key failed. In
            the second case the last underlying error is chained as
            ``__cause__`` so the root cause is not lost.
    """
    keys = list(API_KEYS if api_keys is None else api_keys)
    if not keys:
        raise RuntimeError("No API keys configured; nothing to call Gemini with.")

    last_error = None
    for position, key in enumerate(keys, start=1):
        try:
            # configure() sets the key for the whole genai module, so it is
            # re-applied on every attempt.
            genai.configure(api_key=key)
            model = genai.GenerativeModel(GEMINI_MODEL)
            response = model.generate_content(prompt)
            return response.text.strip()
        except Exception as e:
            last_error = e
            # Report the key's position rather than its first characters: the
            # leading characters are identical across keys, so they neither
            # identify the failing key nor belong in notebook output.
            print(f"key {position}/{len(keys)} → {str(e)[:80]}")
            if position < len(keys):
                # Pause only when there is another key left to try.
                time.sleep(retry_delay)
    raise RuntimeError("All API keys failed. Check quota or validity.") from last_error

# Upper bound, in characters, on the OCR text sent to Gemini for one page.
# NOTE(review): OCR text beyond this length is dropped before cleanup — confirm
# that losing the tail of long pages is intended.
MAX_OCR_CHARS = 1200

cleaned_pages = []
finished = False

try:
    for page_no, img in enumerate(images, start=1):
        print(f"Page {page_no}/{len(images)}: OCR + Gemini")
        raw_text = pytesseract.image_to_string(img, lang=LANG, config="--psm 6")[:MAX_OCR_CHARS]

        prompt = f"""
You are a multilingual OCR fixer. Below is noisy text from Gujarati/Sanskrit scan.
Clean it, fix the structure, and return Markdown-formatted clean content.

OCR TEXT:
\"\"\"
{raw_text}
\"\"\"
    """

        cleaned = call_gemini(prompt)
        # Each page is stored under its own "## Page N" heading.
        cleaned_pages.append(f"\n\n## Page {page_no}\n{cleaned}")
    finished = True
finally:
    # Write whatever has been cleaned so far, so a failure on a later page does
    # not throw away the pages that already succeeded. When nothing succeeded
    # and the run failed, an existing output file is left untouched.
    if finished or cleaned_pages:
        Path(OUTPUT_FILE).write_text("\n".join(cleaned_pages), encoding="utf-8")

print(f"Output saved to {OUTPUT_FILE}")


📄 Page 1/10: OCR + Gemini




[⚠️ Key Failed] AIzaSy... → 400 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flas
📄 Page 2/10: OCR + Gemini
📄 Page 3/10: OCR + Gemini
📄 Page 4/10: OCR + Gemini
📄 Page 5/10: OCR + Gemini
📄 Page 6/10: OCR + Gemini
📄 Page 7/10: OCR + Gemini
📄 Page 8/10: OCR + Gemini
📄 Page 9/10: OCR + Gemini
📄 Page 10/10: OCR + Gemini
[✅ DONE] Output saved to geeta_cleaned_rotated_keys.md


In [26]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import re
from pathlib import Path
import json

# Sentence-embedding model used to vectorize chunks.
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# Relative to the working directory, like the other paths in this notebook,
# instead of an absolute path that only exists on one hosted environment.
FILE_PATH = "geeta_cleaned_rotated_keys.md"
# Chunk window and overlap, both counted in whitespace-separated words.
CHUNK_SIZE = 300
CHUNK_OVERLAP = 50
OUTPUT_FAISS_INDEX = "geeta_index.faiss"
OUTPUT_METADATA = "geeta_chunks_metadata.json"

# The chunking loop advances by CHUNK_SIZE - CHUNK_OVERLAP words per step, so
# that difference must be positive; fail here with a clear message otherwise.
if not 0 <= CHUNK_OVERLAP < CHUNK_SIZE:
    raise ValueError(
        f"CHUNK_OVERLAP ({CHUNK_OVERLAP}) must be >= 0 and smaller than CHUNK_SIZE ({CHUNK_SIZE})"
    )

# Read the cleaned Markdown and cut it into sections at every "Page N" heading.
raw_text = Path(FILE_PATH).read_text(encoding="utf-8")
pages = re.split(r"\n#+ Page \d+", raw_text)

# Slide a CHUNK_SIZE-word window over each section, advancing by
# CHUNK_SIZE - CHUNK_OVERLAP words so consecutive windows share CHUNK_OVERLAP
# words. Windows of 30 words or fewer are discarded.
stride = CHUNK_SIZE - CHUNK_OVERLAP
chunks = []

for page in pages:
    words = page.split()
    windows = (words[start:start + CHUNK_SIZE] for start in range(0, len(words), stride))
    chunks.extend(" ".join(window) for window in windows if len(window) > 30)

print(f"Total chunks created: {len(chunks)}")

# Stop before loading the embedding model when there is nothing to index;
# otherwise the failure only shows up later as an opaque shape error.
if not chunks:
    raise ValueError("No chunks to embed; check that the cleaned Markdown file has content.")

print(f"Loading embedding model: {MODEL_NAME}")
model = SentenceTransformer(MODEL_NAME)

embeddings = model.encode(chunks, show_progress_bar=True, convert_to_numpy=True)
# FAISS expects a C-contiguous float32 matrix; make that explicit rather than
# relying on the encoder's default output dtype.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

# Flat L2 index: row i of the index corresponds to chunks[i].
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

faiss.write_index(index, OUTPUT_FAISS_INDEX)
print(f"Saved FAISS index to: {OUTPUT_FAISS_INDEX}")


# Save the chunk texts alongside the index. Each "id" is the chunk's position
# in `chunks`, which is also the order the embeddings were added to the index.
metadata = [{"id": chunk_id, "text": text} for chunk_id, text in enumerate(chunks)]
Path(OUTPUT_METADATA).write_text(
    json.dumps(metadata, indent=2, ensure_ascii=False),
    # ensure_ascii=False keeps non-ASCII script as raw characters, so the file
    # must be written as UTF-8 explicitly instead of the platform default.
    encoding="utf-8",
)
print(f"Saved chunk metadata to: {OUTPUT_METADATA}")


[INFO] Total chunks created: 10
[INFO] Loading embedding model: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[✅] Saved FAISS index to: geeta_index.faiss
[✅] Saved chunk metadata to: geeta_chunks_metadata.json


In [29]:
import faiss
import numpy as np
import json
import google.generativeai as genai
from sentence_transformers import SentenceTransformer


import os

EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
INDEX_PATH = "geeta_index.faiss"
METADATA_PATH = "geeta_chunks_metadata.json"
GEMINI_MODEL = "models/gemini-2.5-flash"

# SECURITY: the API key is read from the environment instead of being committed
# in the notebook. A key that was previously hardcoded here should be treated
# as leaked and revoked.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "").strip()
if not GOOGLE_API_KEY:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running this cell.")

genai.configure(api_key=GOOGLE_API_KEY)
llm = genai.GenerativeModel(GEMINI_MODEL)
embedder = SentenceTransformer(EMBEDDING_MODEL)

print("Loading FAISS index and chunk metadata")
index = faiss.read_index(INDEX_PATH)
# Context manager so the metadata file handle is closed once it has been read.
with open(METADATA_PATH, "r", encoding="utf-8") as metadata_file:
    chunks = json.load(metadata_file)

def search_chunks(query, k=5):
    """Return the texts of the chunks nearest to ``query`` in the FAISS index.

    Args:
        query: Question text to embed and search with.
        k: Maximum number of chunks to return.

    Returns:
        List of chunk texts in the order FAISS returns them. It has fewer than
        ``k`` entries when the index holds fewer vectors, and is empty when
        the index is empty or ``k`` is not positive.
    """
    # Never ask for more neighbours than the index contains.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    query_vec = np.asarray(embedder.encode([query]), dtype=np.float32)
    _, neighbor_ids = index.search(query_vec, k)
    # FAISS marks missing neighbours with -1; indexing chunks[-1] would silently
    # return the last chunk as if it were a real hit, so drop those ids.
    return [chunks[i]["text"] for i in neighbor_ids[0] if i >= 0]


def answer_with_rag(query, top_k=5):
    """Answer ``query`` with the LLM, using retrieved chunks as its context.

    Args:
        query: The user's question.
        top_k: Number of chunks requested from ``search_chunks``.

    Returns:
        A ``(answer, chunks)`` tuple: the stripped response text and the list
        of chunk texts that were placed in the prompt.
    """
    retrieved = search_chunks(query, k=top_k)
    # Chunks are separated by a "---" line inside the prompt's context section.
    context = "\n---\n".join(retrieved)

    prompt = f"""
You are a knowledgeable assistant trained on Indian religious texts.
Use only the context below to answer the question truthfully and concisely.

### Context:
{context}

### Question:
{query}

### Answer (Gujarati or Sanskrit allowed if meaningful):
"""

    reply = llm.generate_content(prompt)
    answer_text = reply.text.strip()
    return answer_text, retrieved

# Interactive question loop: type "exit" to leave.
while True:
    try:
        # Strip so that "exit " or a stray newline is not sent on as a question.
        user_query = input("Enter your question: ").strip()
    except (EOFError, KeyboardInterrupt):
        # No more input, or the user interrupted the prompt: end the session
        # without a traceback.
        break

    if user_query.lower() == "exit":
        break
    if not user_query:
        # Nothing was typed; ask again instead of sending an empty question.
        continue

    # context_chunks is not printed; it stays available for inspecting which
    # chunks were retrieved for the most recent question.
    answer, context_chunks = answer_with_rag(user_query)

    print("Answer:")
    print(answer)


Loading FAISS index and chunk metadata
Enter your question: what dhritrashtra said
Answer:
ધૃતરાષ્ટ્રે સંજયને કહ્યું: “ધર્મક્ષેત્રરૂપ કુરુક્ષેત્રમાં યુદ્ધ માટે એકઠા થયેલા મારા પુત્રો તથા પાંડવોએ શું કર્યું?”

અથવા સંસ્કૃતમાં:
धर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः ।
मामकाः पाण्डवाश्चैव किमकुर्वत संजय ॥१॥
Enter your question: exit
