### Multimodal RAG (PDF With Images)

![image.png](attachment:image.png)

In [2]:
!pip install PyMuPDF
!pip install langchain_community
!pip install transformers
!pip install scikit-learn
!pip install faiss-cpu




Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m61.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [3]:
#!pip uninstall -y google-generativeai google-ai-generativelanguage langchain-google-genai
!pip install google-generativeai langchain-google-genai



In [4]:
#!pip install PyMuPDF
#!pip install langchain_community


#!pip uninstall -y google-generativeai google-ai-generativelanguage langchain-google-genai
#!pip install google-generativeai langchain-google-genai




import fitz  # PyMuPDF
from langchain_core.documents import Document
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
import os
import base64
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

In [5]:
###Clip Model

from dotenv import load_dotenv
load_dotenv()

from google.colab import userdata

## set up the environment
os.environ["GOOGLE_API_KEY"]=userdata.get("GOOGLE_API_KEY")

### initialize the Clip Model for unified embeddings
clip_model=CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor=CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model.eval()


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [6]:
### Embedding functions
def embed_image(image_data):
    """Embed image using CLIP"""
    if isinstance(image_data, str):  # If path
        image = Image.open(image_data).convert("RGB")
    else:  # If PIL Image
        image = image_data

    inputs=clip_processor(images=image,return_tensors="pt")
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
        # Normalize embeddings to unit vector
        features = features / features.norm(dim=-1, keepdim=True)
        return features.squeeze().numpy()

def embed_text(text):
    """Embed text using CLIP."""
    inputs = clip_processor(
        text=text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=77  # CLIP's max token length
    )
    with torch.no_grad():
        features = clip_model.get_text_features(**inputs)
        # Normalize embeddings
        features = features / features.norm(dim=-1, keepdim=True)
        return features.squeeze().numpy()

In [18]:
## Process PDF
pdf_path="/content/Rag_pdf.pdf"
doc=fitz.open(pdf_path)
# Storage for all documents and embeddings
all_docs = []
all_embeddings = []
image_data_store = {}  # Store actual image data for LLM

# Text splitter
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)




In [8]:
doc

Document('/content/Uday_Jinna_Portfolio.pdf')

In [19]:
for i,page in enumerate(doc):
    ## process text
    text=page.get_text()
    if text.strip():
        ##create temporary document for splitting
        temp_doc = Document(page_content=text, metadata={"page": i, "type": "text"})
        text_chunks = splitter.split_documents([temp_doc])

        #Embed each chunk using CLIP
        for chunk in text_chunks:
            embedding = embed_text(chunk.page_content)
            all_embeddings.append(embedding)
            all_docs.append(chunk)



    ## process images
    ##Three Important Actions:

    ##Convert PDF image to PIL format
    ##Store as base64 for gemini (which needs base64 images)
    ##Create CLIP embedding for retrieval

    for img_index, img in enumerate(page.get_images(full=True)):
        try:
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]

            # Convert to PIL Image
            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

            # Create unique identifier
            image_id = f"page_{i}_img_{img_index}"

            # Store image as base64 for later use with GPT-4V
            buffered = io.BytesIO()
            pil_image.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()
            image_data_store[image_id] = img_base64

            # Embed image using CLIP
            embedding = embed_image(pil_image)
            all_embeddings.append(embedding)

            # Create document for image
            image_doc = Document(
                page_content=f"[Image: {image_id}]",
                metadata={"page": i, "type": "image", "image_id": image_id}
            )
            all_docs.append(image_doc)

        except Exception as e:
            print(f"Error processing image {img_index} on page {i}: {e}")
            continue

doc.close()


In [20]:
all_docs

[Document(metadata={'page': 0, 'type': 'text'}, page_content='Retinexformer: One-stage Retinex-based\nTransformer for Low-light Image Enhancement\nYuanhao Cai 1, Hao Bian 1, Jing Lin 1,\nHaoqian Wang 1,* , Radu Timofte 2, Yulun Zhang 3,∗\n1 Tsinghua University, 2 University of W¨urzburg, 3 ETH Z¨urich\nAbstract\nWhen enhancing low-light images, many deep learning\nalgorithms are based on the Retinex theory. However, the\nRetinex model does not consider the corruptions hidden in\nthe dark or introduced by the light-up process. Besides,'),
 Document(metadata={'page': 0, 'type': 'text'}, page_content='the dark or introduced by the light-up process. Besides,\nthese methods usually require a tedious multi-stage training\npipeline and rely on convolutional neural networks, show-\ning limitations in capturing long-range dependencies. In\nthis paper, we formulate a simple yet principled One-stage\nRetinex-based Framework (ORF). ORF first estimates the\nillumination information to light up the 

In [22]:
# Create unified FAISS vector store with CLIP embeddings
embeddings_array = np.array(all_embeddings)
embeddings_array

array([[ 0.00620995,  0.03214407, -0.03015697, ..., -0.11086666,
         0.0019284 , -0.00166416],
       [ 0.02514893, -0.0049546 , -0.0127898 , ..., -0.02670207,
         0.01263563,  0.00444957],
       [ 0.00423687, -0.00498299,  0.0095452 , ..., -0.00395005,
         0.01384856, -0.02543307],
       ...,
       [ 0.02513283,  0.00913819, -0.00731262, ..., -0.04079138,
         0.02180101,  0.01311531],
       [ 0.04029182,  0.02386706, -0.04083086, ..., -0.04506869,
         0.01502955,  0.02917191],
       [ 0.03149974,  0.02145283, -0.00527158, ..., -0.00457855,
        -0.0105234 ,  0.01146132]], dtype=float32)

In [23]:
(all_docs,embeddings_array)

([Document(metadata={'page': 0, 'type': 'text'}, page_content='Retinexformer: One-stage Retinex-based\nTransformer for Low-light Image Enhancement\nYuanhao Cai 1, Hao Bian 1, Jing Lin 1,\nHaoqian Wang 1,* , Radu Timofte 2, Yulun Zhang 3,∗\n1 Tsinghua University, 2 University of W¨urzburg, 3 ETH Z¨urich\nAbstract\nWhen enhancing low-light images, many deep learning\nalgorithms are based on the Retinex theory. However, the\nRetinex model does not consider the corruptions hidden in\nthe dark or introduced by the light-up process. Besides,'),
  Document(metadata={'page': 0, 'type': 'text'}, page_content='the dark or introduced by the light-up process. Besides,\nthese methods usually require a tedious multi-stage training\npipeline and rely on convolutional neural networks, show-\ning limitations in capturing long-range dependencies. In\nthis paper, we formulate a simple yet principled One-stage\nRetinex-based Framework (ORF). ORF first estimates the\nillumination information to light up th

In [24]:


# Create custom FAISS index since we have precomputed embeddings
vector_store = FAISS.from_embeddings(
    text_embeddings=[(doc.page_content, emb) for doc, emb in zip(all_docs, embeddings_array)],
    embedding=None,  # We're using precomputed embeddings
    metadatas=[doc.metadata for doc in all_docs]
)
vector_store



<langchain_community.vectorstores.faiss.FAISS at 0x7d6a355c4200>

In [25]:
from google.colab import userdata
from langchain_google_genai import ChatGoogleGenerativeAI


llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash-latest",
    google_api_key=userdata.get('GOOGLE_API_KEY'),
    temperature=0
)
llm

ChatGoogleGenerativeAI(model='models/gemini-1.5-flash-latest', google_api_key=SecretStr('**********'), temperature=0.0, client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x7d6a56f8dd00>, default_metadata=())

In [26]:
def retrieve_multimodal(query, k=5):
    """Unified retrieval using CLIP embeddings for both text and images."""
    # Embed query using CLIP
    query_embedding = embed_text(query)

    # Search in unified vector store
    results = vector_store.similarity_search_by_vector(
        embedding=query_embedding,
        k=k
    )

    return results

In [27]:
def create_multimodal_message(query, retrieved_docs):
    """Create a message with both text and images """
    content = []

    # Add the query
    content.append({
        "type": "text",
        "text": f"Question: {query}\n\nContext:\n"
    })

    # Separate text and image documents
    text_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "text"]
    image_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "image"]

    # Add text context
    if text_docs:
        text_context = "\n\n".join([
            f"[Page {doc.metadata['page']}]: {doc.page_content}"
            for doc in text_docs
        ])
        content.append({
            "type": "text",
            "text": f"Text excerpts:\n{text_context}\n"
        })

    # Add images
    for doc in image_docs:
        image_id = doc.metadata.get("image_id")
        if image_id and image_id in image_data_store:
            content.append({
                "type": "text",
                "text": f"\n[Image from page {doc.metadata['page']}]:\n"
            })
            content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{image_data_store[image_id]}"
                }
            })

    # Add instruction
    content.append({
        "type": "text",
        "text": "\n\nPlease answer the question based on the provided text and images."
    })

    return HumanMessage(content=content)

In [28]:
def multimodal_pdf_rag_pipeline(query):
    """Main pipeline for multimodal RAG."""
    # Retrieve relevant documents
    context_docs = retrieve_multimodal(query, k=5)

    # Create multimodal message
    message = create_multimodal_message(query, context_docs)

    # Get response from gemini
    response = llm.invoke([message])

    # Print retrieved context info
    print(f"\nRetrieved {len(context_docs)} documents:")
    for doc in context_docs:
        doc_type = doc.metadata.get("type", "unknown")
        page = doc.metadata.get("page", "?")
        if doc_type == "text":
            preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
            print(f"  - Text from page {page}: {preview}")
        else:
            print(f"  - Image from page {page}")
    print("\n")

    return response.content

In [33]:
if __name__ == "__main__":
    # Example queries
    queries = [
        "What is the abstract summary",
        "what does figure 1 represent ",
        "which model is having third highest psnr in SID dataset"
    ]

    for query in queries:
        print(f"\nQuery: {query}")
        print("-" * 50)
        answer = multimodal_pdf_rag_pipeline(query)
        print(f"Answer: {answer}")
        print("=" * 70)


Query: What is the abstract summary
--------------------------------------------------

Retrieved 5 documents:
  - Text from page 0: to-end. The training process is tedious and time-consuming.
This ICCV paper is the Open Access versi...
  - Text from page 0: The
second category is inspired by the Retinex theory. These
methods [54, 65, 66] usually suffer fro...
  - Text from page 0: the dark or introduced by the light-up process. Besides,
these methods usually require a tedious mul...
  - Text from page 0: visibility and low contrast of low-light images and restore
the corruptions (e.g., noise, artifact, ...
  - Text from page 0: decomposed into two components, i.e., reflectance and illu-
mination. Different from plain methods, ...


Answer: Existing low-light image enhancement methods suffer from drawbacks such as tedious multi-stage training pipelines (especially those based on Retinex theory and using multiple CNNs trained independently then fine-tuned end-to-end) and limitations in