Importing necessary packages

In [97]:
import fitz    #pymupdf
from langchain_core.documents import Document
from transformers import CLIPProcessor,CLIPModel
from PIL import Image
import torch
import numpy as np
from langchain.chat_models import init_chat_model
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import HumanMessage
from langchain_google_genai import ChatGoogleGenerativeAI
from sklearn.metrics.pairwise import cosine_similarity
import os
import base64
import io 
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS 

In [98]:
### clip model 
import os
from dotenv import load_dotenv
load_dotenv()


os.environ["GOOGLE_API_KEY"]=os.getenv("GOOGLE_API_KEY")
os.environ["OPENAI_API_KEY"]=os.getenv("OPENAI_API_KEY")

### initializing clip model 
clip_model=CLIPModel.from_pretrained("openai/clip-vit-base-patch32", force_download=True)
clip_processor=CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32", use_fast=True, force_download=True)
clip_model.eval()

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [99]:
### embedding images using clip model 
def embed_image(image_data):
    if isinstance(image_data,str):
        image=Image.open(image_data).convert("RGB")
    else:
        image=image_data

    inputs=clip_processor(images=image,return_tensors="pt")
    with torch.no_grad():
        features=clip_model.get_image_features(**inputs)
        features=features/features.norm(dim=-1,keepdim=True)
        return features.squeeze().numpy()
    
### embedding text using clip 
def embed_text(text):
    inputs=clip_processor(text=text,
                          return_tensors="pt",
                          padding=True,
                          truncation=True,
                          max_length=77
                          )
    with torch.no_grad():
        features=clip_model.get_text_features(**inputs)
        features=features/features.norm(dim=-1,keepdim=True)
        return features.squeeze().numpy()
    

In [100]:
##Process pdf 
pdf_path= "../data/multimodal_sample.pdf"
doc=fitz.open(pdf_path)     
all_docs = []
all_embeddings = []
image_data_store = {} 

##defining the text splitter 
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)



In [101]:
doc

Document('../data/multimodal_sample.pdf')

In [102]:
## documents to chunks 
for i, page in enumerate(doc):
    image_list = page.get_images(full=True)
    print(f"Page {i}: Found {len(image_list)} image references.")
    text=page.get_text()
    if text.strip():
        temp_doc = Document(page_content=text, metadata={"page": i, "type": "text"})
        text_chunks = splitter.split_documents([temp_doc])

        ##converting chunks to embeddings by calling created embed_text function 
        for chunk in text_chunks:
            embedding = embed_text(chunk.page_content)
            all_embeddings.append(embedding)
            all_docs.append(chunk)

Page 0: Found 1 image references.
Page 1: Found 2 image references.
Page 2: Found 1 image references.
Page 3: Found 2 image references.
Page 4: Found 2 image references.
Page 5: Found 1 image references.
Page 6: Found 2 image references.
Page 7: Found 4 image references.
Page 8: Found 5 image references.
Page 9: Found 0 image references.
Page 10: Found 0 image references.


In [103]:
all_embeddings

[array([ 2.52216328e-02,  1.03679048e-02, -2.68075243e-03, -2.62052007e-02,
         2.37149838e-02,  7.42171099e-03,  4.60936092e-02, -7.85023123e-02,
         2.09175777e-02,  5.05556203e-02, -3.16157639e-02,  1.54130058e-02,
        -1.63174216e-02, -2.17019878e-02, -2.36021052e-03, -6.85219746e-03,
         3.43004018e-02, -2.81241369e-02, -2.12842450e-02, -1.43527592e-04,
         5.44454844e-04, -1.07778413e-02,  2.58963294e-02, -4.78921179e-03,
         2.32356857e-03,  1.51563864e-02, -3.05677373e-02, -5.75663522e-04,
        -1.89267145e-03, -3.74815892e-03,  2.03786120e-02, -3.51217203e-02,
        -6.00754935e-03,  1.23955002e-02,  6.30424637e-03, -2.70599108e-02,
         1.89982466e-02,  2.13922421e-03,  1.58739705e-02, -2.69108657e-02,
         1.23935016e-02, -6.29428262e-03, -1.70381777e-02,  7.11371982e-03,
        -1.12148961e-02,  2.18187384e-02, -2.33690208e-03, -3.40158455e-02,
         3.60049494e-02, -2.01846492e-02,  2.84245587e-03, -3.73885743e-02,
         1.0

In [104]:
all_docs

[Document(metadata={'page': 8, 'type': 'text'}, page_content='Create RAG for the Given Matrices\nMake WFG for the given SI-RAG\nMake AM and RM for the above given MI-RAGs'),
 Document(metadata={'page': 9, 'type': 'text'}, page_content='Detection-Algorithm Usage:\nWhen should the deadlock detection be done? Frequently, or infrequently?\nThe answer may depend on how frequently deadlocks are expected to occur, as well as the possible \nconsequences of not catching them immediately. ( If deadlocks are not removed immediately when\nthey occur, then more and more processes can "back up" behind the deadlock, making the eventual \ntask of unblocking the system more difficult and possibly damaging to more processes. )'),
 Document(metadata={'page': 9, 'type': 'text'}, page_content='task of unblocking the system more difficult and possibly damaging to more processes. )\nThere are two obvious approaches, each with trade-offs:\n1) Do deadlock detection after every resource allocation which cannot 

In [105]:
 ## process images
for img_index, img in enumerate(page.get_images(full=True)):
        try:
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            
            # Convert to PIL Image
            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
            
            # Create unique identifier
            image_id = f"page_{i}_img_{img_index}"
            
            # Store image as base64 for later use with GPT-4V
            buffered = io.BytesIO()
            pil_image.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()
            image_data_store[image_id] = img_base64

            #converting to image embeddings using embed_image function     
            embedding = embed_image(pil_image)
            all_embeddings.append(embedding)

            # Create document for image
            image_doc = Document(
                page_content=f"[Image: {image_id}]",
                metadata={"page": i, "type": "image", "image_id": image_id}
            )
            all_docs.append(image_doc)
        
        except Exception as e:
            print(f"Error processing image {img_index} on page {i}: {e}")
            continue

doc.close()

In [106]:
embeddings_array = np.array(all_embeddings)
embeddings_array

array([[ 0.02522163,  0.0103679 , -0.00268075, ...,  0.03045783,
        -0.00013458, -0.04196021],
       [ 0.01998502, -0.01012876, -0.00161937, ..., -0.02849894,
        -0.01767038,  0.04012385],
       [ 0.02626241, -0.0236796 , -0.02904098, ..., -0.00869949,
        -0.01544242,  0.01609791],
       ...,
       [ 0.00310793, -0.01822211,  0.00074167, ..., -0.03114074,
        -0.03159618,  0.01491076],
       [-0.01860046, -0.02103047, -0.0171123 , ..., -0.03561989,
        -0.01952022,  0.01106495],
       [-0.0153493 , -0.02938162,  0.01828224, ..., -0.00734573,
        -0.00489034,  0.01772705]], shape=(13, 512), dtype=float32)

In [107]:
# Create custom FAISS index since we have precomputed embeddings
vector_store = FAISS.from_embeddings(
    text_embeddings=[(doc.page_content, emb) for doc, emb in zip(all_docs, embeddings_array)],
    embedding=None,  # We're using precomputed embeddings
    metadatas=[doc.metadata for doc in all_docs]
)
vector_store

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


<langchain_community.vectorstores.faiss.FAISS at 0x24dc996b560>

In [108]:

llm = ChatGoogleGenerativeAI(model="models/gemini-2.5-flash")
llm

ChatGoogleGenerativeAI(model='models/gemini-2.5-flash', google_api_key=SecretStr('**********'), client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x0000024DC96B3230>, default_metadata=(), model_kwargs={})

In [109]:
def retrieve_multimodal(query, k=5):   ## passing the query and getting top 5 
    """Unified retrieval using CLIP embeddings for both text and images."""
    # Embed query using CLIP
    query_embedding = embed_text(query)  ##embedded the query using same function
    
    # Search in unified vector store
    results = vector_store.similarity_search_by_vector(    ##hitting query to DB 
        embedding=query_embedding, 
        k=k
    )
    
    return results

In [110]:
def create_multimodal_message(query, retrieved_docs):
    
    from langchain_core.messages import HumanMessage
    
    # 1. Define the System-Level Rules (Best to define as a separate variable)
    # NOTE: You would pass this 'system_instruction' separately to the LLM wrapper, 
    # but we'll include it explicitly here for completeness if the wrapper supports it.
    SYSTEM_INSTRUCTION = (
        "You are an expert, fact-checking document analyst. Your task is to synthesize "
        "information ONLY from the provided [Text Excerpts] and [Image Content]. "
        "Adhere to the following rules: "
        "1. Ground every statement in the provided context. Do not use outside knowledge. "
        "2. If information is unavailable in the context, state clearly, 'Information not found in the provided document.' "
        "3. For all facts derived from a document, cite the original source page number."
    )

    content = []
    
    # --- 1. Initial instruction and query (SETTING THE STAGE) ---
    # We remove the old simple instruction and add the structure markers.
    content.append({
        "type": "text",
        "text": f"[User Query]\n{query}\n\n[Text Excerpts]\n"
    })
    
    # Separate documents
    text_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "text"]
    image_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "image"]
    
    # 2. Add text context (Unchanged, just presenting the data)
    if text_docs:
        text_context = "\n\n".join([
            f"[Page {doc.metadata['page']}]: {doc.page_content}"
            for doc in text_docs
        ])
        content.append({
            "type": "text",
            "text": f"{text_context}\n\n[Image Content]\n"
        })
    
    # 3. Add images (Unchanged, presenting the image data)
    for doc in image_docs:
        image_id = doc.metadata.get("image_id")
        if image_id and image_id in image_data_store:
            # Add a text marker before the image
            content.append({
                "type": "text",
                "text": f"\n[Image from page {doc.metadata['page']}] (Visible Below):\n"
            })
            
            # Add the image part 
            content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{image_data_store[image_id]}",
                    "detail": "auto" 
                }
            })
    
    # --- 4. Add final instruction (THE CRITICAL COMMAND) ---
    # We replace the simple instruction with a strong, binding command.
    content.append({
        "type": "text",
        "text": "\n\n--- INSTRUCTIONS ---\n"
                "Fully answer the [User Query] using ONLY the provided context and images. "
    })
    
    return HumanMessage(content=content)

In [111]:
def multimodal_pdf_rag_pipeline(query):
    """Main pipeline for multimodal RAG."""
    # Retrieve relevant documents
    context_docs = retrieve_multimodal(query, k=5)
    
    # query + context =prompt 
    message = create_multimodal_message(query, context_docs)
    
    # Get response from GPT-4V
    response = llm.invoke([message])
    # Print retrieved context info
    print(f"\nRetrieved {len(context_docs)} documents:")
    for doc in context_docs:
        doc_type = doc.metadata.get("type", "unknown")
        page = doc.metadata.get("page", "?")
        if doc_type == "text":
            preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
            print(f"  - Text from page {page}: {preview}")
        else:
            print(f"  - Image from page {page}")
    print("\n")
    
    return response.content

In [113]:
if __name__ == "__main__":
    # Example queries
    queries = [
        
        "what is bankers algo",
        
    ]
    
    for query in queries:
        print(f"\nQuery: {query}")
        print("-" * 50)
        answer = multimodal_pdf_rag_pipeline(query)
        print(f"Answer: {answer}")
        print("=" * 70)


Query: what is bankers algo
--------------------------------------------------

Retrieved 5 documents:
  - Text from page 8: Create RAG for the Given Matrices
Make WFG for the given SI-RAG
Make AM and RM for the above given M...
  - Text from page 9: 3) How many more resources does the process need to complete.
4) How many processes will need to be ...
  - Text from page 10: c) Starvation - How do you guarantee that a process won't starve because its resources 
            ...
  - Text from page 9: Detection-Algorithm Usage:
When should the deadlock detection be done? Frequently, or infrequently?
...
  - Text from page 9: 3) Preempt resources.
2) Process Termination
Two basic approaches, both of which recover resources a...


Answer: I am sorry, but the provided text excerpts do not contain information about the Banker's Algorithm.
