In [1]:
import fitz  # PyMuPDF for PDF parsing
import io
from PIL import Image
import os
from dotenv import load_dotenv
from pymongo import MongoClient

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import MongoDBAtlasVectorSearch
from langchain.embeddings import OpenAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI

load_dotenv(override=True)

openai_api_key = os.environ.get("OPENAI_API_KEY")
google_api_key = os.environ.get("GOOGLE_API_KEY")

In [9]:
def read_pdf_with_images(pdf_path, output_folder="pdf_images"):
    os.makedirs(output_folder, exist_ok=True)

    doc = fitz.open(pdf_path)
    text_content = []

    for page_num, page in enumerate(doc):
        # Extract text
        text_content.append(page.get_text("text"))

        # Extract images
        for img_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]

            # Save image to disk
            img_pil = Image.open(io.BytesIO(image_bytes)).convert("RGB")
            img_filename = os.path.join(output_folder, f"page_{page_num}_img_{img_index}.png")
            img_pil.save(img_filename)

    return "\n".join(text_content), output_folder

In [10]:
import os
import base64
from PIL import Image
import io
from langchain.chat_models import init_chat_model

# Initialize the LLM
llm = init_chat_model("google_genai:gemini-2.5-flash")

def caption_images(image_folder="extracted_images"):
    img_data = []

    for img_file in sorted(os.listdir(image_folder)):
        if img_file.lower().endswith((".png", ".jpg", ".jpeg")):
            img_path = os.path.join(image_folder, img_file)

            # Open image and convert to bytes
            with open(img_path, "rb") as f:
                img_bytes = f.read()

            # Encode image as base64
            img_b64 = base64.b64encode(img_bytes).decode("utf-8")

            # Create message for LLM
            message = {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": (
                            "You are an assistant tasked with summarizing tables, images and text "
                            "for retrieval. These summaries will be embedded and used to retrieve "
                            "the raw text or table elements. Give a concise summary optimized for retrieval."
                        ),
                    },
                    {
                        "type": "file",
                        "source_type": "base64",
                        "data": img_b64,
                        "mime_type": "image/jpeg",  # or "image/png"
                    },
                ],
            }

            # Invoke the model
            response = llm.invoke([message])

            img_data.append({
                "response": response.text(),
                "name": img_file
            })

    return img_data


In [11]:
def prepare_documents(text, image_captions):
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    text_docs = splitter.create_documents([text])
    img_docs = splitter.create_documents(image_captions)
    return text_docs + img_docs


In [12]:
def store_in_vectorstore(docs):
    client = MongoClient(os.getenv("MONGODB_URI"))
    collection = client["RAG-evaluation"]["RAG-multimodel"]

    embedding = OpenAIEmbeddings(model="text-embedding-3-small")

    vectorstore = MongoDBAtlasVectorSearch.from_documents(
        documents=docs,
        embedding=embedding,
        collection=collection,
        index_name="default"
    )
    return vectorstore

In [13]:
pdf_path = "/Users/vaibhavisavani/Desktop/Gen-AI/Advanced-RAG/data/attention_is_all_you_need.pdf"  
text, image_list = read_pdf_with_images(pdf_path)

print(f"Extracted {len(text)} characters of text and {len(image_list)} images.")


Extracted 39512 characters of text and 10 images.


In [14]:
image_list

'pdf_images'

In [15]:
# Caption images
from langchain.schema import HumanMessage

image_captions = caption_images(image_list)
print("Generated Image Captions:", image_captions)  # preview


Generated Image Captions: [{'response': 'Diagram of the Transformer model architecture, depicting an encoder-decoder structure. The encoder processes inputs using input embedding, positional encoding, multi-head attention, feed forward networks, and Add & Norm layers. The decoder processes shifted outputs using output embedding, positional encoding, masked multi-head attention, cross-attention (Multi-Head Attention), feed forward networks, and Add & Norm layers, culminating in output probabilities via Linear and Softmax layers.', 'name': 'page_2_img_0.png'}, {'response': 'A computational diagram depicting the Scaled Dot-Product Attention mechanism. It shows inputs Q, K, and V, followed by matrix multiplication of Q and K, scaling, optional masking, softmax activation, and a final matrix multiplication with V.', 'name': 'page_3_img_0.png'}, {'response': 'This image illustrates the Multi-Head Attention mechanism, a core component of Transformer models. It shows inputs V (Values), K (Keys

In [22]:
captions = [image_caption['response'] for image_caption in image_captions]
captions

['Diagram of the Transformer model architecture, depicting an encoder-decoder structure. The encoder processes inputs using input embedding, positional encoding, multi-head attention, feed forward networks, and Add & Norm layers. The decoder processes shifted outputs using output embedding, positional encoding, masked multi-head attention, cross-attention (Multi-Head Attention), feed forward networks, and Add & Norm layers, culminating in output probabilities via Linear and Softmax layers.',
 'A computational diagram depicting the Scaled Dot-Product Attention mechanism. It shows inputs Q, K, and V, followed by matrix multiplication of Q and K, scaling, optional masking, softmax activation, and a final matrix multiplication with V.',
 'This image illustrates the Multi-Head Attention mechanism, a core component of Transformer models. It shows inputs V (Values), K (Keys), and Q (Queries) each undergoing linear transformations. These transformed inputs then feed into multiple "Scaled Dot-P

In [23]:
# Prepare docs
docs = prepare_documents(text, captions)


In [24]:
# Store in MongoDB
vectorstore = store_in_vectorstore(docs)
print("Multimodal vector store created successfully.")


  embedding = OpenAIEmbeddings(model="text-embedding-3-small")


Multimodal vector store created successfully.


In [25]:
# Create retriever
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})


In [27]:
query = "How does attention work"
results = retriever.get_relevant_documents(query)

for r in results:
    print("Retrieved:", r.page_content)



Retrieved: Attention(Q, K, V ) = softmax(QKT
√dk
)V
(1)
The two most commonly used attention functions are additive attention [2], and dot-product (multi-
plicative) attention. Dot-product attention is identical to our algorithm, except for the scaling factor
of
1
√dk . Additive attention computes the compatibility function using a feed-forward network with
a single hidden layer. While the two are similar in theoretical complexity, dot-product attention is
Retrieved: reduced to a constant number of operations, albeit at the cost of reduced effective resolution due
to averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as
described in section 3.2.
Self-attention, sometimes called intra-attention is an attention mechanism relating different positions
of a single sequence in order to compute a representation of the sequence. Self-attention has been
Retrieved: Scaled Dot-Product Attention
Multi-Head Attention
Figure 2: (left) Scaled Dot-Product Attenti