In [3]:
import base64
import io
import hashlib
from typing import List

from PIL import Image
import pytesseract

from unstructured.partition.pdf import partition_pdf
from unstructured.documents.elements import Table, Image as UImage

from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceEmbeddings


In [36]:
OCR_CACHE = {}
def ocr_images(image_base64_list):
    descriptions = []

    for i, img_b64 in enumerate(image_base64_list, start=1):

        if not img_b64:        # ✅ SAFETY CHECK
            continue

        key = hashlib.md5(img_b64.encode()).hexdigest()

        if key in OCR_CACHE:
            descriptions.append(OCR_CACHE[key])
            continue

        try:
            img_bytes = base64.b64decode(img_b64)
            img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
            text = pytesseract.image_to_string(img).strip()

            if text:
                desc = f"Figure {i} text: {text[:600]}"
            else:
                desc = f"Figure {i} is a diagram or illustration."

        except Exception:
            desc = f"Figure {i} could not be processed."

        OCR_CACHE[key] = desc
        descriptions.append(desc)

    return descriptions


In [37]:
def extract_modalities(chunk):
    text = chunk.text or ""
    tables = []
    images = []

    if hasattr(chunk.metadata, "orig_elements"):
        for el in chunk.metadata.orig_elements:
            if isinstance(el, Table):
                tables.append(
                    getattr(el.metadata, "text_as_html", el.text)
                )

            elif isinstance(el, UImage):
                img_b64 = getattr(el.metadata, "image_base64", None)
                if img_b64:              # ✅ THIS LINE MATTERS
                    images.append(img_b64)

    return text, tables, images


In [39]:
llm = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0
)

def summarize_chunk(text, tables, image_desc):
    prompt = f"""
You are creating a SEARCH-OPTIMIZED summary.

TEXT:
{text}

TABLES:
{tables}

IMAGE DESCRIPTIONS:
{image_desc}

TASK:
- Extract key facts, numbers, definitions
- Include alternative keywords
- Mention questions this chunk can answer
- Max 300 words
"""

    return llm.invoke(prompt).content


In [40]:
def ingest_pdf_to_chroma(
    pdf_path: str,
    persist_dir: str = "chroma_db"
):
    print("📄 Parsing PDF...")
    raw_chunks = partition_pdf(
        filename=pdf_path,
        infer_table_structure=True,
        extract_images_in_pdf=True,
        chunking_strategy="by_title",
        max_characters=1500,
    )

    documents = []

    for i, chunk in enumerate(raw_chunks, start=1):
        print(f"🧠 Processing chunk {i}/{len(raw_chunks)}")

        text, tables, images = extract_modalities(chunk)
        image_desc = ocr_images(images)

        if tables or image_desc:
            summary = summarize_chunk(text, tables, image_desc)
        else:
            summary = text

        documents.append(
            Document(
                page_content=summary,
                metadata={
                    "chunk_id": i,
                    "raw_text": text,
                    "source": pdf_path
                }
            )
        )

    print("🧬 Creating embeddings + storing in Chroma...")
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    db = Chroma.from_documents(
      documents=documents,
      embedding=embeddings,
      persist_directory=persist_dir
  )
    print("✅ Ingestion complete")
    return db



In [41]:
def search(db, query, k=3):
    retriever = db.as_retriever(search_kwargs={"k": k})
    return retriever.invoke(query)


In [42]:
def answer_question(chunks, query):
    docs = db.similarity_search(query, k=5)

    context = ""
    for d in docs:
        context += d.metadata.get("raw_text", "") + "\n"

    prompt = f"""
Answer the question using ONLY the context below.
If the answer is not present, say you don't know.

IMPORTANT:
- Explicitly extract and state ALL numbers, dimensions, counts, and hyperparameters.
- If a number appears in text or tables, it MUST be written verbatim.


QUESTION:
{query}

CONTEXT:
{context}

ANSWER:
"""

    return llm.invoke(prompt).content


In [11]:
from google.colab import files
files.upload()

Saving 1706.03762v7.pdf to 1706.03762v7.pdf


{'1706.03762v7.pdf': b'%PDF-1.5\n%\x8f\n137 0 obj\n<< /Filter /FlateDecode /Length 4011 >>\nstream\nx\xda\xb5ZK\x93\xe4\xb6\x91\xbe\xebW\xf4\x91\x15\xd1\xc5%^|hc\x0f\x92C+{#4\xe1\xb0zO\xe3>\xb0\xab\xd0U\xf4\xb0\xc8\x16\x1fj\x8d~\xbd3\x91\t\x12`qf\xbd\xbb\xf6\x89 \x90\xc4#\x9f_&\x98=\\\x1e\xb2\x87\x1f\xbf\xc9\xf8\xf9\xfd\xd37\xff\xf6\x9fe\xf1 DZ\x19#\x1f\x9e^\x1fDV>\x14"K3U<<\x9d\x1f>&\xe2p\x14Y\x96%\x7f\xea\xa6\xe1 \xca\xa4?\xcf\xa7\xa9\xe9\xbb\xc3\xf3\xd3\x7f\xc1\xc7\xfa\xa1J\xab\\\xe6\xee\xdb4\x930o\xf6 \xdc4yiRe\xf4\xc3\xd3\r\xa6\xf9\x8b=\xcd\xc3`\xbb\xe9p\x94\xb9N:;\x0fu\xeb\xdb\xd3\xfbAdI?|\x1a\x1f\xb1\xabJ\xda\xbe\xbb\xd0\xe0x\xed\x87\xe98\xd9\xe1F\xef7{\xeb\x87\xcf\xd4\xfe\x88{\x80\xc5hI%s8\x85\x8e\x97\x15\x8ah\xd6\x9d)\x05d"\x8f\xc9\x9ei\xc2\xba;S\xe3r0I=Y~\x1b\xe2\xbd\xc7\xcbjY\xa4\x85\x94\xf1|\xc5vU\xaddZdjw\xd5/\xf1\xc2\xcf!\x8a\x90\xa9\x85N\xb564A\xd3\xc1W:O\xde\xeaajNs[\x0f\x07U%\x8f\xd4y\xad\x0f\xa2J~=\x08\x93X\xeay\xb1\x96?\xf8k\x96\xc9\xe1\xd6"\x1fu\x91\xd8q\xaa_\xdaf\x

In [52]:
# Run once
db = ingest_pdf_to_chroma("/content/1706.03762v7.pdf")

📄 Parsing PDF...
🧠 Processing chunk 1/36
🧠 Processing chunk 2/36
🧠 Processing chunk 3/36
🧠 Processing chunk 4/36
🧠 Processing chunk 5/36
🧠 Processing chunk 6/36
🧠 Processing chunk 7/36
🧠 Processing chunk 8/36
🧠 Processing chunk 9/36
🧠 Processing chunk 10/36
🧠 Processing chunk 11/36
🧠 Processing chunk 12/36
🧠 Processing chunk 13/36
🧠 Processing chunk 14/36
🧠 Processing chunk 15/36
🧠 Processing chunk 16/36
🧠 Processing chunk 17/36
🧠 Processing chunk 18/36
🧠 Processing chunk 19/36
🧠 Processing chunk 20/36
🧠 Processing chunk 21/36
🧠 Processing chunk 22/36
🧠 Processing chunk 23/36
🧠 Processing chunk 24/36
🧠 Processing chunk 25/36
🧠 Processing chunk 26/36
🧠 Processing chunk 27/36
🧠 Processing chunk 28/36
🧠 Processing chunk 29/36
🧠 Processing chunk 30/36
🧠 Processing chunk 31/36
🧠 Processing chunk 32/36
🧠 Processing chunk 33/36
🧠 Processing chunk 34/36
🧠 Processing chunk 35/36
🧠 Processing chunk 36/36
🧬 Creating embeddings + storing in Chroma...
✅ Ingestion complete


In [35]:
# Run many times
query = "How many attention heads does the Transformer use?"

docs = db.similarity_search(query, k=3)
for i, d in enumerate(docs):
    print(f"\n--- DOC {i+1} ---\n")
    print(d.page_content[:800])



--- DOC 1 ---

3.2.3 Applications of Attention in our Model

The Transformer uses multi-head attention in three different ways:

• In "encoder-decoder attention" layers, the queries come from the previous decoder layer, and the memory keys and values come from the output of the encoder. This allows every position in the decoder to attend over all positions in the input sequence. This mimics the typical encoder-decoder attention mechanisms in sequence-to-sequence models such as [38, 2, 9].

• The encoder contains self-attention layers. In a self-attention layer all of the keys, values and queries come from the same place, in this case, the output of the previous layer in the encoder. Each position in the encoder can attend to all positions in the previous layer of the encoder.

• Similarly, self-attention

--- DOC 2 ---

3.2.3 Applications of Attention in our Model

The Transformer uses multi-head attention in three different ways:

• In "encoder-decoder attention" layers, the queries 

In [54]:
# Run many times
query = "What trade-offs are discussed regarding attention head count?"
chunks = search(db, query)
print(answer_question(chunks, query))

The trade-offs discussed regarding attention head count are:

- The model employs h = 8 parallel attention layers, or heads.
- With a single attention head, averaging inhibits the model's ability to jointly attend to information from different representation subspaces at different positions.
- Multi-head attention allows the model to jointly attend to information from different representation subspaces at different positions.
- The total computational cost is similar to that of single-head attention with full dimensionality due to the reduced dimension of each head.

Extracted numbers and dimensions:

- h = 8 (number of parallel attention layers or heads)
- dk = dv = dmodel/h = 64 (dimension of each head)
- dmodel (dimension of keys, values, and queries, but not explicitly stated)
- 4 (number, but its context is unclear)
