In [2]:
import fitz  # PyMuPDF\n",
from langchain_core.documents import Document
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
import os
import base64
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
###Clip Model
import os
from dotenv import load_dotenv
load_dotenv()
## set up the environment
os.environ["OPENAI_API_KEY"]=os.getenv("OPENAI_API_KEY")
### initialize the Clip Model for unified embeddings\n",
clip_model=CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor=CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model.eval()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install 

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [4]:
### Embedding functions
def embed_image(image_data):
        """Embed image using CLIP"""
        if isinstance(image_data, str):  # If path\n",
            image = Image.open(image_data).convert("RGB")
        else:  # If PIL Image\n",
            image = image_data
        inputs=clip_processor(images=image,return_tensors="pt")
        with torch.no_grad():
            features = clip_model.get_image_features(**inputs)
            # Normalize embeddings to unit vector\n",
            features = features / features.norm(dim=-1, keepdim=True)
            return features.squeeze().numpy()
def embed_text(text):
        """Embed text using CLIP."""
        inputs = clip_processor(
            text=text, 
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=77  # CLIP's max token length\n",
        )
        with torch.no_grad():
            features = clip_model.get_text_features(**inputs)
            # Normalize embeddings\n",
            features = features / features.norm(dim=-1, keepdim=True)
            return features.squeeze().numpy()

In [13]:
## Process PDF\n",
pdf_path="multimodal_sample.pdf"
doc=fitz.open(pdf_path)
# Storage for all documents and embeddings
all_docs = []
all_embeddings = []
image_data_store = {}  # Store actual image data for LLM\n",
# Text splitter
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

In [14]:
for i,page in enumerate(doc):
     ## process text\n",
        text=page.get_text()
        if text.strip():
     ##create temporary document for splitting
            temp_doc = Document(page_content=text, metadata={"page": i, "type": "text"})
            text_chunks = splitter.split_documents([temp_doc])
      #Embed each chunk using CLIP\n",
        for chunk in text_chunks:
            embedding = embed_text(chunk.page_content)
            all_embeddings.append(embedding)
            all_docs.append(chunk)

In [15]:
## process images
     ##Three Important Actions:
##Convert PDF image to PIL format
##Store as base64 for GPT-4V (which needs base64 images)
##Create CLIP embedding for retrieval
for img_index, img in enumerate(page.get_images(full=True)):
    try:
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                # Convert to PIL Image\n",
                pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
                # Create unique identifier\n",
                image_id = f"page_{i}_img_{img_index}"
                # Store image as base64 for later use with GPT-4V\n",
                buffered = io.BytesIO()
                pil_image.save(buffered, format="PNG")
                img_base64 = base64.b64encode(buffered.getvalue()).decode()
                image_data_store[image_id] = img_base64
                # Embed image using CLIP\n",
                embedding = embed_image(pil_image)
                all_embeddings.append(embedding)
                # Create document for image\n",
                image_doc = Document(
                    page_content=f"[Image: {image_id}]",
                    metadata={"page": i, "type": "image", "image_id": image_id}
                )
                all_docs.append(image_doc)
    except Exception as e:
                print(f"Error processing image {img_index} on page {i}: {e}")
                continue
    doc.close()

In [16]:
# Create unified FAISS vector store with CLIP embeddings
embeddings_array = np.array(all_embeddings)
embeddings_array

array([[-0.00267244,  0.01282999, -0.0518314 , ..., -0.00385086,
         0.02977718, -0.00010685],
       [ 0.01732337, -0.01327688, -0.02427033, ...,  0.08994051,
        -0.00272155,  0.03253039]], shape=(2, 512), dtype=float32)

In [18]:
# Create custom FAISS index since we have precomputed embeddings\n",
vector_store = FAISS.from_embeddings(
        text_embeddings=[(doc.page_content, emb) for doc, emb in zip(all_docs, embeddings_array)],
        embedding=None,  # We're using precomputed embeddings\n",
        metadatas=[doc.metadata for doc in all_docs]
)
vector_store

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


<langchain_community.vectorstores.faiss.FAISS at 0x16eb49d7470>

In [19]:
llm = init_chat_model("openai:gpt-4.1")
llm

ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x0000016EDA3E0BC0>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x0000016EDA37D340>, root_client=<openai.OpenAI object at 0x0000016EB49B0410>, root_async_client=<openai.AsyncOpenAI object at 0x0000016EB5BF92B0>, model_name='gpt-4.1', model_kwargs={}, openai_api_key=SecretStr('**********'))

In [24]:
def retrieve_multimodal(query, k=5):
    """Unified retrieval using CLIP embeddings for both text and images."""
    # Embed query using CLIP
    query_embedding = embed_text(query)
    # Search in unified vector store\n",
    results = vector_store.similarity_search_by_vector(
           embedding=query_embedding,
           k=k
      )
    return results

In [31]:
def create_multimodal_message(query, retrieved_docs):
       """Create a message with both text and images for GPT-4V"""
       content = []
       # Add the query
       content.append({
            "type": "text",
            "text": f"Question: {query} Context:",
        })
       # Separate text and image documents\n",
       text_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "text"]
       image_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "image"]
        # Add text context\n",
       if text_docs:
           text_context = "\n\n".join([
                f"[Page {doc.metadata['page']}]: {doc.page_content}"
                for doc in text_docs
            ])
           content.append({
                "type": "text",
                "text": f"Text excerpts:{text_context}"
            })
        # Add images\n",
       for doc in image_docs:
            image_id = doc.metadata.get("image_id")
            if image_id and image_id in image_data_store:
                content.append({
                    "type": "text",
                    "text": f"[Image from page {doc.metadata['page']}]:"
                })
                content.append({
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{image_data_store[image_id]}"
                    }
                })
        
        # Add instruction\n",
       content.append({
            "type": "text",
            "text": "Please answer the question based on the provided text and images"
        })
       return HumanMessage(content=content)

In [34]:
def multimodal_pdf_rag_pipeline(query):
        """Main pipeline for multimodal RAG."""
        # Retrieve relevant documents
        context_docs = retrieve_multimodal(query, k=5)
        # Create multimodal message\n",
        message = create_multimodal_message(query, context_docs)
        # Get response from GPT-4V\n",
        response = llm.invoke([message])
        # Print retrieved context info\n",
        print(f"Retrieved {len(context_docs)} documents:")
        for doc in context_docs:
            doc_type = doc.metadata.get("type", "unknown")
            page = doc.metadata.get("page", "?")
            if doc_type == "text":
                preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
                print(f"  - Text from page {page}: {preview}")
            else:
                print(f"  - Image from page {page}")
        print("\n")
        return response.content

In [36]:
if __name__ == "__main__":
    # Example queries\n",
    queries = [
        "What does the chart on page 1 show about revenue trends?"
        "Summarize the main findings from the document"
        "What visual elements are present in the document?"
    ]
    for query in queries:
            print(f"Query: {query}")
            print("-" * 50)
            answer = multimodal_pdf_rag_pipeline(query)
            print(f"Answer: {answer}")
            print("=" * 70)

Query: What does the chart on page 1 show about revenue trends?Summarize the main findings from the documentWhat visual elements are present in the document?
--------------------------------------------------
Retrieved 2 documents:
  - Text from page 0: Annual Revenue Overview
This document summarizes the revenue trends across Q1, Q2, and Q3. As illust...
  - Image from page 0


Answer: **Chart Analysis (Page 1):**  
The chart on page 1 shows a clear upward trend in revenue over three periods (Q1, Q2, and Q3). Each bar is higher than the previous one, with Q3 having the highest revenue.

**Main Findings from the Document:**
- **Q1:** Revenue increased moderately, attributed to the introduction of new product lines.
- **Q2:** Revenue further increased, outperforming Q1 due to effective marketing campaigns.
- **Q3:** Revenue grew exponentially, driven by global expansion efforts.
- Overall, the main finding is that revenue increased each quarter, with the most substantial growth recorded