In [None]:
import os, re, sys
from typing import List, Dict
from pathlib import Path
from chromadb import PersistentClient
import ollama
import chromadb
from chromadb.config import Settings
from pypdf import PdfReader
from pdf2image import convert_from_path
import pytesseract

In [3]:

MODEL_LLM = "deepseek-r1:8b" 
MODEL_EMBED = "nomic-embed-text"  
PDF_PATH = "C:\\Users\\tihom\\Desktop\\RAG_Project\\math_book.pdf"  
PERSIST_DIR = "C:\\Users\\tihom\\Desktop\\RAG_Project\\chroma_store"  
TOP_K = 5
CHUNK_SIZE = 900
CHUNK_OVERL = 150
POPPLER_PATH = "C:\\Program Files\\poppler-25.07.0\\Library\\bin"  
TESSERACT_CMD = "C:\\Program Files\\Tesseract-OCR\\tesseract.exe" 

In [5]:
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

pages = convert_from_path(
    r"C:\Users\tihom\Desktop\RAG_Project\math_book.pdf",
    poppler_path=r"C:\Program Files\poppler-25.07.0\Library\bin",
)
text = ""
for page in pages:
    text += pytesseract.image_to_string(page, lang="bul") + "\n"

print(text[:500])

В. Видев, К. Кръстев, М. Иванова
ВИСША МАТЕМАТИКА

първа част

г-а

2

ТРАКИИСКИ УНИВЕРСИТЕТ-СТАРА ЗАГОРА
ФАКУЛТЕТ ? ТЕХНИКИ И ТЕХНОЛОГИИ”

В. Видев, Кр. Кръстев, М. Иванова

ВИСША МАТЕМАТИКА

първа част

са

ЯМБОЛ-2013

Апотация. Настоящият учебник, съдържащ цикъл от лекции от лекции и
упражнения е предназначен за студентите от Факултет “Техники и технологии” на
Тракийски университет, град Стара Загора.  Тематичните единици в учебика
представят основни сведения от линейната алгебра и аналитична


In [8]:
with open("math_book_ocr.txt", "w", encoding="utf-8") as f:
    f.write(text)

In [4]:
try:
    import pytesseract
    from pdf2image import convert_from_path
except Exception:
    pytesseract = None
    convert_from_path = None

if pytesseract is not None and TESSERACT_CMD:
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD


def extract_text_pypdf(path: str) -> str:
    reader = PdfReader(path)
    text = "\\n".join([p.extract_text() or "" for p in reader.pages])

    return re.sub(r"\\s+\\n", "\\n", text).strip()


def ocr_text_from_pdf(path: str) -> str:
    if pytesseract is None or convert_from_path is None:
        raise RuntimeError("OCR libs not available.")
    images = convert_from_path(path, poppler_path=POPPLER_PATH)
    out_text = []
    for img in images:
        out_text.append(pytesseract.image_to_string(img, lang="eng"))
    text = "\\n".join(out_text)
    return re.sub(r"\\s+\\n", "\\n", text).strip()


def get_pdf_text(path: str) -> str:

    txt = extract_text_pypdf(path)
    if txt and txt.strip():
        return txt

    print("No text layer found. Falling back to OCR...")
    return ocr_text_from_pdf(path)


def chunk_text(text: str, size=900, overlap=150) -> List[str]:
    chunks, i = [], 0
    while i < len(text):
        chunk = text[i : i + size]
        if chunk.strip():
            chunks.append(chunk)
        i += size - overlap
    return chunks


def embed_batch(texts: List[str], model: str) -> List[List[float]]:
    embs = []
    for t in texts:
        r = ollama.embeddings(model=model, prompt=t)
        embs.append(r["embedding"])
    return embs


def get_collection(persist_dir: str, name: str):
    client = PersistentClient(path=persist_dir)
    return client.get_or_create_collection(name)


def build_index(
    pdf_path: str,
    persist_dir: str,
    model_embed: str,
    name: str = "pdf_rag",
    chunk_size: int = 900,
    chunk_overlap: int = 150,
):
    text = get_pdf_text(pdf_path)
    chunks = chunk_text(text, size=chunk_size, overlap=chunk_overlap)

    client = PersistentClient(path=persist_dir)
    try:
        client.delete_collection(name)
    except Exception:
        pass

    coll = client.get_or_create_collection(name)
    ids = [f"chunk_{i}" for i in range(len(chunks))]
    embs = embed_batch(chunks, model_embed)
    coll.add(
        documents=chunks,
        ids=ids,
        embeddings=embs,
        metadatas=[
            {"source": os.path.basename(pdf_path), "i": i} for i in range(len(chunks))
        ],
    )
    return coll


def retrieve(coll, query: str, model_embed: str, k: int = 5) -> str:
    q_emb = embed_batch([query], model_embed)[0]
    res = coll.query(query_embeddings=[q_emb], n_results=k)
    docs = res.get("documents", [[]])[0]
    metas = res.get("metadatas", [[]])[0]
    ctx_lines = []
    for d, m in zip(docs, metas):
        ctx_lines.append(f"[{m.get('source','?')}#chunk{m.get('i','?')}]\\n{d}")
    return "\\n---\\n".join(ctx_lines)




In [None]:
SYSTEM_PROMPT = (
    "You are a helpful assistant. Answer ONLY from the provided context. "
    "If the answer is not present, say what is missing. "
    "Be concise and include short step-by-step reasoning when relevant. "
    "Cite chunks like [file.pdf#chunkN]."
)


def ask_llm(
    model_llm: str, context: str, question: str, temperature: float = 0.2
) -> str:
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": f"Context:\\n{context}\\n\\nQuestion: {question}\\n\\nAnswer with citations.",
        },
    ]
    resp = ollama.chat(
        model=model_llm, messages=messages, options={"temperature": temperature}
    )
    return resp["message"]["content"]

In [5]:
from pathlib import Path

Path(PERSIST_DIR).mkdir(parents=True, exist_ok=True)
collection = build_index(
    PDF_PATH,
    PERSIST_DIR,
    MODEL_EMBED,
    name="pdf_rag",
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERL,
)
print("Index built and stored at:", PERSIST_DIR)

Index built and stored at: C:\Users\tihom\Desktop\RAG_Project\chroma_store


In [6]:
# === Ask questions grounded to your PDF ===
questions = [
    "Give me a short summary of the key ideas.",
    "List any formulas mentioned and provide a tiny example.",
]
coll = get_collection(PERSIST_DIR, "pdf_rag")
for q in questions:
    ctx = retrieve(coll, q, MODEL_EMBED, k=TOP_K)
    ans = ask_llm(MODEL_LLM, ctx, q)
    print(f"Q: {q}\n---\n{ans}\n{'='*60}\n")

Q: Give me a short summary of the key ideas.
---
I need to analyze the context provided to give a short summary of key ideas. The context is a PDF file called math_book.pdf with several chunks. However, the actual content of the PDF chunks isn't visible in the provided context - it's just shown as [math_book.pdf#chunk0] through many chunks with empty content.

Let me think about what I can do here:

1. The user has provided a context that shows a PDF file with many chunks, but the actual content of these chunks isn't visible in the text. It's just shown as [math_book.pdf#chunk0] followed by many empty lines.

2. Since I don't have the actual content of the math book, I cannot provide a summary of key ideas from it.

3. The instruction says to answer ONLY from the provided context. Since the context doesn't contain the actual content of the math book (just references to chunks), I don't have the information needed to create a summary.

4. I should be clear that the context provided does

In [10]:
your_question = "Explain section 2 in simple terms."
coll = get_collection(PERSIST_DIR, "pdf_rag")
ctx = retrieve(coll, your_question, MODEL_EMBED, k=TOP_K)
print(ask_llm(MODEL_LLM, ctx, your_question))


Hmm, the user wants me to explain section 2 in simple terms from a math book context, and they want citations from the provided context. But looking at the context they shared, it's just [math_book.pdf#chunk0] with a lot of empty space - it seems like there's no actual content in that chunk.

I need to be careful here. The user specified "Answer ONLY from the provided context" and if the answer isn't present, I should say what's missing. The context they provided is just a reference to chunk 0 of math_book.pdf with no actual text content - it's all empty lines and spaces. There's nothing in that chunk to reference.

The user is probably expecting me to explain some math section, but since the context is empty, I can't do that. I should be clear that section 2 isn't present in the provided context. I'll make sure to be concise as requested and cite what's missing.

I recall that in the instructions, I need to say what's missing if the answer isn't present. So I'll state that the context