In [None]:
!pip install --upgrade pip
!pip uninstall torch torchvision torchaudio -y
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
!pip install transformers docling_core pillow faiss-gpu-cu12 sentence-transformers accelerate

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never store the access token in the notebook itself: read it from the
# HF_TOKEN environment variable, or ask for it interactively (input hidden).
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

In [None]:
import torch

from transformers import AutoProcessor, AutoModelForVision2Seq
from docling_core.types.doc.document import DocTagsDocument
from docling_core.types.doc import DoclingDocument
from transformers.image_utils import load_image
from pathlib import Path

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID = "ibm-granite/granite-docling-258M"

# The processor handles chat templating, input preparation and decoding.
processor = AutoProcessor.from_pretrained(MODEL_ID)

vlm = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    # bfloat16 on GPU, full precision on CPU.
    torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
    # Public keyword for choosing the attention backend; the underscore-prefixed
    # spelling is a private attribute and should not be relied on.
    attn_implementation="sdpa",
).to(DEVICE)
vlm.eval()  # the model is only used for generation in this notebook

print("Granite‑Docling loaded on", DEVICE)

In [None]:
def convert_to_markdown(image_path, prompt_text=None, max_new_tokens=8192):
    """Convert a single image to Markdown using the loaded Granite-Docling model.

    The image is paired with a text instruction in a one-turn chat prompt and
    passed to ``vlm.generate``. Only the newly generated tokens are decoded
    (special tokens kept), parsed as DocTags into a ``DoclingDocument`` and
    exported as Markdown.

    Relies on the module-level ``processor``, ``vlm`` and ``DEVICE``.

    Args:
        image_path: Image location, in any form ``load_image`` accepts.
        prompt_text: Instruction sent alongside the image. When ``None`` a
            default "extract everything and convert to markdown" prompt is used.
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 8192.

    Returns:
        The Markdown text returned by ``DoclingDocument.export_to_markdown()``.
    """
    if prompt_text is None:
        prompt_text = (
            "Extract all readable text, numeric, and labeled content from this image, "
            "including headlines, paragraphs, numbers, labels, and convert to markdown."
        )

    image = load_image(image_path)
    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

    inputs = processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        output_ids = vlm.generate(**inputs, max_new_tokens=max_new_tokens)

    # generate() returns prompt + continuation; drop the prompt tokens.
    offset = inputs.input_ids.shape[1]
    generated = output_ids[:, offset:]
    # Special tokens are kept because the DocTags markup is parsed below.
    doctags = processor.batch_decode(generated, skip_special_tokens=False)[0].lstrip()
    doc_obj = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
    doc = DoclingDocument.load_from_doctags(doc_obj, document_name="Document")

    return doc.export_to_markdown()

img_dir = Path("/workspace/images")
out_dir = Path("/workspace/corpus")
# parents=True so a missing parent directory does not abort the run.
out_dir.mkdir(parents=True, exist_ok=True)

# Select images by suffix, case-insensitively, so .jpeg and .PNG files are
# included and look-alike suffixes are not. Sorted for a reproducible order.
IMAGE_SUFFIXES = {".png", ".jpg", ".jpeg"}
image_files = sorted(
    candidate
    for candidate in img_dir.glob("*")
    if candidate.is_file() and candidate.suffix.lower() in IMAGE_SUFFIXES
)

for image_file in image_files:
    print(f"Converting {image_file.name}...")
    md_text = convert_to_markdown(str(image_file))
    # One Markdown file per image, named after the image's stem.
    md_path = out_dir / f"{image_file.stem}.md"
    md_path.write_text(md_text, encoding="utf-8")

    print(f"Preview of {md_path.name}:\n{md_text[:500]}\n{'-'*50}")

In [None]:
import numpy as np, faiss

from sentence_transformers import SentenceTransformer
from typing import Iterator, Dict, Sequence
from pathlib import Path

def read_text(path: Path) -> str:
    """Return the contents of ``path`` decoded as UTF-8, dropping undecodable bytes."""
    # The corpus files are written as UTF-8, so decode them the same way
    # instead of relying on the platform's locale encoding.
    return path.read_text(encoding="utf-8", errors="ignore")

def iter_chunks(text: str, size=1000, overlap=200):
    """Yield overlapping character windows over ``text``.

    Consecutive windows start ``size - overlap`` characters apart (at least 1).
    Iteration stops as soon as a window reaches the end of ``text``, so no
    trailing chunk is emitted that is fully contained in the previous one.

    Args:
        text: String to split. An empty string yields nothing.
        size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks.

    Yields:
        Substrings of ``text`` of at most ``size`` characters.

    Raises:
        ValueError: If ``size`` is not positive.
    """
    if size <= 0:
        raise ValueError("size must be a positive integer")
    step = max(1, size - overlap)
    total = len(text)
    for start in range(0, total, step):
        yield text[start:start + size]
        # This window already covers the tail; further windows would only
        # repeat text that has been emitted.
        if start + size >= total:
            break

# One record per chunk: the Markdown file it came from, the chunk text, and
# the chunk's position within that file.
records = [
    {"source": str(md_file), "chunk": piece, "chunk_index": position}
    for md_file in out_dir.glob("*.md")
    for position, piece in enumerate(iter_chunks(read_text(md_file)))
]

In [None]:
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
texts = [rec["chunk"] for rec in records]
if not texts:
    # Fail with a clear message instead of an IndexError on emb.shape[1] below.
    raise ValueError("No chunks to index: `records` is empty.")
emb = embedder.encode(texts, convert_to_numpy=True, show_progress_bar=True)

# L2-normalise each row so the inner product used by the index equals cosine
# similarity; the epsilon guards against division by zero.
emb = emb / (np.linalg.norm(emb, axis=1, keepdims=True)+1e-12)
# FAISS expects C-contiguous float32 input.
emb = np.ascontiguousarray(emb, dtype=np.float32)
index = faiss.IndexFlatIP(emb.shape[1])
index.add(emb)

print("Index ready:", index.ntotal, "chunks")

In [None]:
def retrieve(query, k=3):
    """Return the best-matching chunks for ``query``.

    The query is embedded with the module-level ``embedder``, L2-normalised and
    searched against the module-level inner-product ``index``.

    Args:
        query: Query text.
        k: Maximum number of hits to return.

    Returns:
        A list of ``(record_index, score)`` tuples in the order returned by the
        index. It holds fewer than ``k`` entries when the index has fewer than
        ``k`` vectors, and is empty when ``k <= 0`` or the index is empty.
    """
    if k <= 0 or index.ntotal == 0:
        return []
    qv = embedder.encode([query], convert_to_numpy=True)
    qv = qv / (np.linalg.norm(qv, axis=1, keepdims=True)+1e-12)
    D, I = index.search(np.ascontiguousarray(qv, dtype=np.float32), k)
    # FAISS fills missing results with label -1; drop them so callers never
    # index `records[-1]` by accident.
    return [(int(i), float(s)) for i, s in zip(I[0], D[0]) if i >= 0]

# Placeholder query: replace with the question to ask of the corpus.
query = "Type your query here"
hits = retrieve(query, k=3)

# Show each hit's score, source file and the first 200 characters of its chunk.
for rec_id, similarity in hits:
    hit = records[rec_id]
    print(round(similarity, 3), hit["source"])
    print(hit["chunk"][:200], "\n")

In [None]:
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM

gen_model_id = "google/gemma-2-2b-it"
tok = AutoTokenizer.from_pretrained(gen_model_id)

# Prefer bfloat16 on GPUs that support it, matching the dtype used for the
# Granite-Docling model; fall back to float16 on other GPUs and float32 on CPU.
# NOTE(review): float16 is reported to be numerically less stable than
# bfloat16 for Gemma 2 - confirm against the model card.
if torch.cuda.is_available():
    gen_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
else:
    gen_dtype = torch.float32

gen = AutoModelForCausalLM.from_pretrained(
    gen_model_id,
    torch_dtype=gen_dtype,
    device_map="auto"
)
# Trailing semicolon keeps the full model repr out of the cell output.
gen.eval();

In [None]:
def build_prompt(question, context_chunks):
    """Assemble the plain-text prompt for the generator.

    Args:
        question: The user's question, interpolated after ``Question:``.
        context_chunks: Iterable of context strings; they are joined with a
            blank line between them under ``Context:``.

    Returns:
        A single string: instruction, context, question, and a trailing
        ``Answer:`` cue, with a blank line between sections.
    """
    joined_context = "\n\n".join(context_chunks)
    sections = (
        "Use the context to answer the question.",
        f"Context:\n{joined_context}",
        f"Question: {question}\nAnswer:",
    )
    return "\n\n".join(sections)

# Pull the text of every retrieved chunk, in the order the hits were returned,
# and render it together with the query into the generator prompt.
top_k_chunks = [records[rec_id]["chunk"] for rec_id, _score in hits]
prompt = build_prompt(question=query, context_chunks=top_k_chunks)

In [None]:
# gemma-2-2b-it is instruction-tuned, so wrap the prompt in the tokenizer's
# chat template (user turn + generation prompt) instead of sending raw text.
chat = [{"role": "user", "content": prompt}]
inputs = tok.apply_chat_template(
    chat,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(gen.device)

with torch.no_grad():
    # Sampling is enabled, so the answer varies between runs.
    out = gen.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )

# generate() returns prompt + continuation; keep only the new tokens.
generated_tokens = out[0, inputs["input_ids"].shape[1]:]
answer = tok.decode(generated_tokens, skip_special_tokens=True).strip()

print("Answer:", answer)