In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
PERSIST_DIR = "/content/drive/MyDrive/RAG"

In [3]:
!nvcc --version
!echo $CUDA_HOME

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0



# Installing what we need for qwen2.5-vl

In [1]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

Looking in indexes: https://download.pytorch.org/whl/cu124


In [3]:

!pip install "transformers==4.43.3" "tokenizers<0.20" "accelerate>=0.21.0,<1.0" "safetensors>=0.4.2"

!pip install qwen-vl-utils[decord]==0.0.8
!pip install bitsandbytes flash-attn



In [4]:
import torch
import sys

def check_environment():
    print(f"Python version: {sys.version}")
    print(f"PyTorch version: {torch.__version__}")

    # Check CUDA availability
    cuda_available = torch.cuda.is_available()
    print(f"CUDA available: {cuda_available}")

    if cuda_available:
        print(f"CUDA version: {torch.version.cuda}")
        print(f"Current CUDA device: {torch.cuda.current_device()}")
        print(f"Device name: {torch.cuda.get_device_name(0)}")
        print(f"Device count: {torch.cuda.device_count()}")

        # Alternative check for flash attention
        try:
            import flash_attn
            print(f"flash_attn package is installed: version {flash_attn.__version__}")
        except ImportError:
            print("flash_attn package is not installed")

    # Memory info if CUDA is available
    if cuda_available:
        print("\nGPU Memory Information:")
        print(f"Total memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
        print(f"Allocated memory: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
        print(f"Cached memory: {torch.cuda.memory_reserved(0) / 1e9:.2f} GB")

if __name__ == "__main__":
    check_environment()

Python version: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
PyTorch version: 2.8.0+cu126
CUDA available: True
CUDA version: 12.6
Current CUDA device: 0
Device name: NVIDIA L4
Device count: 1
flash_attn package is installed: version 2.8.3

GPU Memory Information:
Total memory: 23.80 GB
Allocated memory: 0.00 GB
Cached memory: 0.00 GB


In [7]:
import os
import base64
from typing import List, Union, Dict
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
import os

class QwenVLProcessor:
    def __init__(
        self,
        model_name: str = "Qwen/Qwen2.5-VL-7B-Instruct",
        device: str = "cuda",
        min_pixels: int = 128*16*16,
        max_pixels: int = 1024*16*16,
        cache_dir: str = None  # Add cache_dir parameter
    ):
        """
        Initialize the QwenVL processor with custom configuration.

        Args:
            model_name: Name or path of the model to load
            device: Device to run the model on ('cuda' or 'cpu')
            use_flash_attention: Whether to use flash attention
            min_pixels: Minimum number of pixels for image processing
            max_pixels: Maximum number of pixels for image processing
        """
        # Configure CUDA memory allocation
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

        # Clear CUDA cache
        if device == "cuda":
            torch.cuda.empty_cache()

        # Load model and assign to self
        self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map=device,
            attn_implementation="flash_attention_2",
            use_cache=True,
            cache_dir=cache_dir,
        )

         # Load processor and assign to self
        self.processor = AutoProcessor.from_pretrained(
            model_name,
            min_pixels=min_pixels,
            max_pixels=max_pixels,
            use_fast=True
        )

        self.device = device

    def _encode_image(self, image_path: str) -> str:
        """
        Encode a local image file to base64.

        Args:
            image_path: Path to the local image file

        Returns:
            Base64 encoded string of the image
        """
        with open(image_path, "rb") as image_file:
            encoded_string = base64.b64encode(image_file.read()).decode('utf-8')
        return f"data:image/jpeg;base64,{encoded_string}"

    def prepare_messages(
        self,
        image_paths: Union[str, List[str]],
        prompt: str
    ) -> List[Dict]:
        """
        Prepare messages for the model using local image paths.

        Args:
            image_paths: Single path or list of paths to local images
            prompt: Text prompt to process with the images

        Returns:
            List of formatted messages for the model
        """
        if isinstance(image_paths, str):
            image_paths = [image_paths]

        messages = []
        for path in image_paths:
            encoded_image = self._encode_image(path)
            messages.append({
                "role": "user",
                "content": [
                    {"type": "image", "image": encoded_image},
                    {"type": "text", "text": prompt}
                ]
            })
        return messages

    def process_images(
        self,
        image_paths: Union[str, List[str]],
        prompt: str,
        max_new_tokens: int = 2000,
        temperature: float = 0.01,
        top_p: float = 0.9 # creates a smaller pool of probably avaliable words
    ) -> List[str]:
        """
        Process local images with the given prompt.

        Args:
            image_paths: Single path or list of paths to local images
            prompt: Text prompt to process with the images
            max_new_tokens: Maximum number of tokens to generate
            temperature: Sampling temperature
            top_p: Top-p sampling parameter

        Returns:
            List of generated responses for each image
        """
        messages = self.prepare_messages(image_paths, prompt)

        with torch.inference_mode(): # check pytorch autograd mechanics page
            text = self.processor.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )

            image_inputs, video_inputs = process_vision_info(messages)
            inputs = self.processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt"
            )

            # put the inputs on device
            inputs = inputs.to(self.device)


            generated_ids = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=temperature,
                top_p=top_p,
                pad_token_id=self.processor.tokenizer.pad_token_id,
                eos_token_id=self.processor.tokenizer.eos_token_id
            )

            generated_ids_trimmed = [
                out_ids[len(in_ids):]
                for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]

            output_text = self.processor.batch_decode(
                generated_ids_trimmed,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False
            )

        return output_text

if __name__ == "__main__":

    processor = QwenVLProcessor()



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/216 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [None]:
# huggingface-cli login --token <token> -> Terminal

# Model card on HF
https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct



In [20]:
import os
print(os.listdir())

['.config', 'drive', 'sample_data']


In [24]:
import os
import time
from google.colab import drive

# Step 1: Mount Google Drive
drive.mount('/content/drive')

# Step 2: Set your image folder path in Drive
# Example: if your image is in "MyDrive/images/image.jpeg"
folder_path = "/content/drive/MyDrive/RAG"
image_path = os.path.join(folder_path, "image.jpeg")

# Step 3: Check that the file exists
if not os.path.exists(image_path):
    raise FileNotFoundError(f"File not found: {image_path}")

# Step 4: Process the image
start_time = time.time()

result = processor.process_images(
    image_path,
    prompt="""You are an expert OCR model who can read and interpret hard images in details
    and in great precision. Given these images extract every detail of it in an organized format."""
)

print(f"Single image result: {result[0]}")

end_time = time.time() - start_time
print(f"time is : {end_time:.2f} seconds")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Single image result: The image contains a handwritten quote on lined notebook paper. The text reads:

"‘Don’t ever let someone tell you, you can’t do something. Not even me. You got a dream you got to protect it. People can’t do something themselves, they want to tell you, you can’t do it. You want something, go get it, period.
All right?’
- From Pursuit of Happiness"

The quote is attributed to the movie "Pursuit of Happiness." The handwriting appears neat and legible, with the lines of the notebook paper providing a structured background for the text.
time is : 13.49 seconds


# Lets see some English images examples
## Visualizations interpretations

In [26]:
image_path = os.path.join(folder_path, "chart_eng.png")
# Step 4: Process the image
start_time = time.time()
result = processor.process_images(
    image_path,
    prompt="""You are an expert OCR model who can read and interpret hard images in details
    and in great precision. Given these images extract every detail of it in an organized format."""
)
print(f"Single image result: {result[0]}")
end_time = time.time() - start_time
print(f"time is : {end_time:.2f} seconds")

Single image result: The image is a bar chart titled "Fintech Market Growth Projections." The chart illustrates the projected market valuation of the fintech industry based on Compound Annual Growth Rate (CAGR) from 2021 to 2029.

### Key Details:
- **Title**: Fintech Market Growth Projections
- **Y-Axis**: Market Valuation Calculated Based on CAGR
- **X-Axis**: Year (from 2021 to 2029)
- **Data Points**:
  - **2021**: Approximately $100B
  - **2022**: Approximately $120B
  - **2023**: Approximately $140B
  - **2024**: Approximately $180B
  - **2025**: Approximately $260B
  - **2026**: Approximately $320B
  - **2027**: Approximately $400B
  - **2028**: Approximately $500B
  - **2029**: Approximately $620B

### Observations:
- The market valuation shows a consistent upward trend over the years.
- There is a significant increase in market valuation each year, indicating steady growth.
- By 2029, the projected market valuation reaches approximately $620B, which is more than five times the

#Prompting can help .. the more specific you are the more accurate results you get

In [27]:
image_path = os.path.join(folder_path, "pif.png")
# Step 4: Process the image
start_time = time.time()
result = processor.process_images(
    image_path,
    prompt="""You are an expert OCR model who can read and interpret hard images in details
    and in great precision. Given these images extract every detail of it in an organized format."""
)
print(f"Single image result: {result[0]}")

end_time = time.time() - start_time
print(f"time is : {end_time:.2f} seconds")

Single image result: Certainly! Below is the extracted information from the provided image, organized into sections:

---

### **PIF Vision Realization Program**

#### **Strategic Review | PIF Vision Realization Program**

**The Public Investment Fund's commitment:**
- In Saudi Vision 2030, PIF is working on driving sustainable and transformative economic change through its investments in the Saudi economy and building its international asset portfolio.
- The PIF Vision emphasizes creating a diversified and sustainable future, positioning the Fund as a pivotal player in Saudi Arabia's broader economic transition.

---

### **EXPECTED IMPACT BY 2025**

- **Cumulative Non-PIF Contribution:**
  - SAR 1.2 TN (cumulative)
  
- **Job Creation:**
  - 1.8 MN (direct, indirect, and induced jobs)

- **Contribution to Local Content:**
  - 60% (including PIF and its portfolio companies)

- **Cumulative Non-governmental Investments:**
  - SAR 1.2 TN (includes domestic and foreign direct investment)

In [28]:
result = processor.process_images(
    image_path,
    prompt="""You are an expert OCR model who can read and interpret hard images in details
    and in great precision. just extract the text / numbers you see ."""
)
print(f"Single image result: {result[0]}")

Single image result: Here is the extracted text and numbers from the image:

**PIF VISION REALIZATION PROGRAM**

- **EXPECTED IMPACT BY 2025**
  - Cumulative Non oil GDP Contribution: SAR 1.2 TN
  - Job Creation: 1.8 MN (Direct, Indirect and Induced jobs)
  - Contribution to Local Content: 60%
  - Cumulative Non-governmental Investments: SAR 1.2 TN (Includes domestic and foreign direct investment)

- **13 STRATEGIC SECTORS OF FOCUS**
  - Aerospace and Defense
  - Real Estate
  - Metals and Mining
  - Consumer Goods and Retail
  - Automotive
  - Entertainment, Leisure and Sports
  - Utilities and Renewables
  - Transport and Logistics
  - Financial Services
  - Building Constructions, Materials and Services
  - Health Care
  - Food and Agriculture
  - Telecom, Media and Technology

- **2025 TARGETS**
  - Assets Under Management (AUM): SAR 4 TN (Baseline: SAR 1.9 TN in 2020)
  - Share of PIF Assets in New Sectors: 21% (Baseline: 10% in 2020)
  - New Local Investments: SAR 150 BN (Annual 

In [29]:
result = processor.process_images(
    image_path,
    prompt="""Extract the page numbers."""
)
print(f"Single image result: {result[0]}")

Single image result: The page numbers in the image are 26 and 27.


# German images

In [31]:
from time import time

start_time = time()

image_path = os.path.join(folder_path, "handschrift002.jpg")
result = processor.process_images(
    image_path,
    prompt="""You are an expert OCR model who can read and interpret hard images in details
    and in great precision. Given these images extract every detail of it in an organized format,
    include any numbers you see .. page numbers also"""
)

end_time = time()
execution_time = end_time - start_time

print(f"Single image result: {result[0]}")
print(f"Execution time: {execution_time:.2f} seconds")

Single image result: The image contains the following text:

```
Dies ist meine "normalere"
Handschrift per Hand. Ab und
zu wechse ich auch zwischen
a und ä. a.
```

There are no numbers or page numbers visible in the image.
Execution time: 5.81 seconds


#Section 2


*   Can we embedd an image directly?
*   Can the model see


### 1. Vison_RAG

### Let's try the pdf and convert it to images.

#Now we will use VisRag-Ret to retrieve relevant information from the images

https://huggingface.co/openbmb/VisRAG-Ret

First prepare the pdf and convert to images

In [None]:
!pip install fitz pymupdf pdf2image
!apt-get install -y poppler-utils


In [37]:
import os
from pdf2image import convert_from_path

def convert_pdf_to_jpg(pdf_path: str, output_folder: str, dpi: int = 300) -> list:
    """
    Convert PDF pages to JPG images using pdf2image.

    Parameters:
    -----------
    pdf_path : str
        Path to the input PDF file
    output_folder : str
        Path to the folder where JPG images will be saved
    dpi : int, optional
        DPI for rendering (higher means better quality but larger files)

    Returns:
    --------
    list
        List of paths to the generated JPG files
    """

    # Validate input PDF file
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    try:
        # Convert PDF to list of images
        images = convert_from_path(pdf_path, dpi=dpi)
        output_files = []

        # Save each image
        for i, image in enumerate(images):
            output_path = os.path.join(output_folder, f"page_{i+1}.jpg")
            image.save(output_path, "JPEG")
            output_files.append(output_path)
            print(f"Converted page {i+1} to {output_path}")

        return output_files

    except Exception as e:
        raise Exception(f"Error converting PDF: {str(e)}")

# Example usage
if __name__ == "__main__":
    try:
        # Convert a sample PDF
        folder_path = "/content/drive/MyDrive/RAG"
        pdf_file = os.path.join(folder_path, "moodys-rating-report.pdf")
        output_dir = "/content/drive/MyDrive/RAG/moodys-rating-report"

        # Convert PDF to images (higher DPI for better quality)
        image_files = convert_pdf_to_jpg(pdf_file, output_dir, dpi=400)

        print(f"\nSuccessfully converted {len(image_files)} pages")
        print("Output files:", image_files)

    except Exception as e:
        print(f"Error: {str(e)}")

Converted page 1 to /content/drive/MyDrive/RAG/moodys-rating-report/page_1.jpg
Converted page 2 to /content/drive/MyDrive/RAG/moodys-rating-report/page_2.jpg
Converted page 3 to /content/drive/MyDrive/RAG/moodys-rating-report/page_3.jpg
Converted page 4 to /content/drive/MyDrive/RAG/moodys-rating-report/page_4.jpg
Converted page 5 to /content/drive/MyDrive/RAG/moodys-rating-report/page_5.jpg
Converted page 6 to /content/drive/MyDrive/RAG/moodys-rating-report/page_6.jpg
Converted page 7 to /content/drive/MyDrive/RAG/moodys-rating-report/page_7.jpg
Converted page 8 to /content/drive/MyDrive/RAG/moodys-rating-report/page_8.jpg
Converted page 9 to /content/drive/MyDrive/RAG/moodys-rating-report/page_9.jpg
Converted page 10 to /content/drive/MyDrive/RAG/moodys-rating-report/page_10.jpg
Converted page 11 to /content/drive/MyDrive/RAG/moodys-rating-report/page_11.jpg

Successfully converted 11 pages
Output files: ['/content/drive/MyDrive/RAG/moodys-rating-report/page_1.jpg', '/content/drive/M

In [None]:
!pip install SentencePiece timm

In [None]:
# Clean out possibly incompatible versions
!pip uninstall -y transformers tokenizers huggingface-hub

# Install versions that keep is_torch_fx_available available
!pip install "transformers==4.43.3" "tokenizers<0.20" "accelerate>=0.21.0,<1.0" "safetensors>=0.4.2" timm torchvision

# (Optional) sanity check
import transformers, sys
print("Transformers:", transformers.__version__)
print("Python:", sys.version)

In [8]:
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F
from PIL import Image
import os
from tqdm import tqdm
import numpy as np
import json
import pickle
import time
class ImageRetriever:
    def __init__(self):
        """Initialize basic attributes without loading the model."""
        self.images = []
        self.image_paths = []
        self.embeddings = None
        self.model = None
        self.tokenizer = None

    def _init_model(self, model_name="openbmb/VisRAG-Ret", use_cuda=True):
        """Initialize the model only when needed."""
        if self.model is None:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
            device = 'cuda' if use_cuda and torch.cuda.is_available() else 'cpu'
            self.model = AutoModel.from_pretrained(
                model_name,
                torch_dtype=torch.bfloat16 if device == 'cuda' else torch.float32,
                trust_remote_code=True
            ).to(device)
            self.model.eval()

    def weighted_mean_pooling(self, hidden, attention_mask):
        """Apply weighted mean pooling to the hidden states."""
        attention_mask_ = attention_mask * attention_mask.cumsum(dim=1)
        s = torch.sum(hidden * attention_mask_.unsqueeze(-1).float(), dim=1)
        d = attention_mask_.sum(dim=1, keepdim=True).float()
        return s / d

    @torch.no_grad()
    def encode(self, text_or_image_list):
        """Encode text queries or images into embeddings."""
        self._init_model()  # Initialize model only if needed

        if isinstance(text_or_image_list[0], str):
            inputs = {
                "text": text_or_image_list,
                'image': [None] * len(text_or_image_list),
                'tokenizer': self.tokenizer
            }
        else:
            inputs = {
                "text": [''] * len(text_or_image_list),
                'image': text_or_image_list,
                'tokenizer': self.tokenizer
            }

        outputs = self.model(**inputs)
        attention_mask = outputs.attention_mask
        hidden = outputs.last_hidden_state

        reps = self.weighted_mean_pooling(hidden, attention_mask)
        embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
        return embeddings
    def load_images(self, image_dir, save_dir=None):
        """Load images and embeddings, computing only if necessary."""
        print(f"\nAttempting to load images from directory: {image_dir}")
        print(f"Embeddings directory: {save_dir}")

        if not save_dir:
            print("No save_dir provided, will compute embeddings without saving")
            should_compute = True
        else:
            # Check for existing embeddings
            embeddings_path = os.path.join(save_dir, 'embeddings.pkl')
            paths_file = os.path.join(save_dir, 'image_paths.json')

            print(f"Checking for existing embeddings at: {embeddings_path}")
            print(f"Checking for paths file at: {paths_file}")

            if os.path.exists(embeddings_path) and os.path.exists(paths_file):
                try:
                    # Load embeddings and paths
                    print("Found existing embedding files, attempting to load...")
                    with open(embeddings_path, 'rb') as f:
                        self.embeddings = pickle.load(f)
                    with open(paths_file, 'r') as f:
                        self.image_paths = json.load(f)['image_paths']

                    # Verify image paths still exist
                    missing_images = [p for p in self.image_paths if not os.path.exists(p)]
                    if missing_images:
                        print(f"Found {len(missing_images)} missing images, will recompute")
                        should_compute = True
                    else:
                        # Load images
                        print("Loading images from saved paths...")
                        self.images = []
                        for path in self.image_paths:
                            image = Image.open(path).convert('RGB')
                            self.images.append(image)

                        print(f"Successfully loaded {len(self.images)} images and their embeddings")
                        return

                except Exception as e:
                    print(f"Error loading saved embeddings: {e}")
                    print("Will recompute embeddings")
                    should_compute = True
            else:
                print("No existing embedding files found")
                should_compute = True

        # If we get here, we need to compute embeddings
        print("\nComputing new embeddings...")
        supported_formats = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff'}
        self.images = []
        self.image_paths = []

        # Load images
        for filename in os.listdir(image_dir):
            if os.path.splitext(filename)[1].lower() in supported_formats:
                image_path = os.path.join(image_dir, filename)
                try:
                    image = Image.open(image_path).convert('RGB')
                    self.images.append(image)
                    self.image_paths.append(image_path)
                except Exception as e:
                    print(f"Error loading {filename}: {str(e)}")

        if not self.images:
            raise ValueError(f"No valid images found in {image_dir}")

        # Compute embeddings
        print(f"Computing embeddings for {len(self.images)} images...")
        self.embeddings = self.encode(self.images)

        # Save if requested
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
            with open(os.path.join(save_dir, 'embeddings.pkl'), 'wb') as f:
                pickle.dump(self.embeddings, f)
            with open(os.path.join(save_dir, 'image_paths.json'), 'w') as f:
                json.dump({'image_paths': self.image_paths}, f)
            print(f"Saved new embeddings to {save_dir}")

    def query(self, question, k=3):
        """Query the images with a question and return top-k most relevant images."""
        if self.embeddings is None:
            raise ValueError("No images loaded. Please load images first using load_images()")

        # Prepare and encode query
        query = ["Represent this query for retrieving relevant documents: " + question]
        query_embedding = self.encode(query)

        # Get top-k results
        scores = (query_embedding @ self.embeddings.T)[0]
        top_k_indices = np.argsort(scores)[-k:][::-1]

        return [
            {
                'image_path': self.image_paths[idx],
                'score': float(scores[idx]),
                'image': self.images[idx]
            }
            for idx in top_k_indices
        ]

def main():
    # Initialize retriever
    start_time = time.time()
    retriever = ImageRetriever()

    # Define directories
    image_dir = "moodys-rating-report"  # Replace with your image directory
    embeddings_dir = "embeddings"  # Directory to save/load embeddings

    # Load images and compute/load embeddings
    retriever.load_images(image_dir, save_dir=embeddings_dir)

    # Example queries
    questions = [
        "How is PIF’s cash flow expected to evolve?",
    ]

    # Process each query
    for question in questions:
        print(f"\nQuery: {question}")
        results = retriever.query(question, k=10)

        # Print results
        for i, result in enumerate(results, 1):
            print(f"\nResult {i}:")
            print(f"Image: {os.path.basename(result['image_path'])}")
            print(f"Score: {result['score']:.4f}")

    total_execution_time = time.time() - start_time
    print(" ")
    print(f"time: {total_execution_time:.4f} second")
if __name__ == "__main__":
    main()


Attempting to load images from directory: moodys-rating-report
Embeddings directory: embeddings
Checking for existing embeddings at: embeddings/embeddings.pkl
Checking for paths file at: embeddings/image_paths.json
Found existing embedding files, attempting to load...
Loading images from saved paths...
Successfully loaded 11 images and their embeddings

Query: How is PIF’s cash flow expected to evolve?


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


Result 1:
Image: page_4.jpg
Score: 0.3351

Result 2:
Image: page_3.jpg
Score: 0.3205

Result 3:
Image: page_6.jpg
Score: 0.2969

Result 4:
Image: page_5.jpg
Score: 0.2847

Result 5:
Image: page_2.jpg
Score: 0.2562

Result 6:
Image: page_1.jpg
Score: 0.2426

Result 7:
Image: page_8.jpg
Score: 0.2176

Result 8:
Image: page_9.jpg
Score: 0.1931

Result 9:
Image: page_7.jpg
Score: 0.1666

Result 10:
Image: page_10.jpg
Score: 0.1388
 
time: 15.2973 second


Result 1:
Image: page_4.jpg
Score: 0.3351

Result 2:
Image: page_3.jpg
Score: 0.3205

### Match chatgpt 5 https://chatgpt.com/share/68fe0094-1b74-8010-bf0f-0d27c3ce9ec4

# Now let's try colqwen2.5 Ret
https://github.com/illuin-tech/colpali?tab=readme-ov-file

In [None]:
!pip install colpali-engine

In [None]:
!pip install git+https://github.com/illuin-tech/colpali

# VisRag-Ret with german will it perform well?

In [15]:
import os
import concurrent.futures
from pdf2image import convert_from_path
from tqdm import tqdm

def convert_pdf_to_jpg(pdf_path: str, output_folder: str, dpi: int = 400,
                       threads: int = 4, batch_size: int = 10) -> list:
    """
    Convert PDF pages to JPG images using pdf2image with parallel processing.

    Parameters:
    -----------
    pdf_path : str
        Path to the input PDF file
    output_folder : str
        Path to the folder where JPG images will be saved
    dpi : int, optional
        DPI for rendering (higher means better quality but larger files)
    threads : int, optional
        Number of worker threads to use for parallel processing
    batch_size : int, optional
        Number of pages to process in each batch

    Returns:
    --------
    list
        List of paths to the generated JPG files
    """

    # Validate input PDF file
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    try:
        # Get total number of pages first
        info = convert_from_path(pdf_path, dpi=dpi, first_page=1, last_page=1)
        total_pages = convert_from_path(pdf_path, dpi=72, first_page=1, last_page=None, thread_count=1)
        num_pages = len(total_pages)
        print(f"PDF has {num_pages} pages. Starting conversion...")

        output_files = []

        # Define a function to convert a batch of pages
        def convert_batch(batch):
            start_page, end_page = batch
            batch_images = convert_from_path(
                pdf_path,
                dpi=dpi,
                first_page=start_page,
                last_page=end_page,
                thread_count=1  # Use 1 thread per worker as we're already parallelizing
            )

            batch_output_files = []
            for i, image in enumerate(batch_images):
                page_num = start_page + i
                output_path = os.path.join(output_folder, f"page_{page_num}.jpg")
                image.save(output_path, "JPEG")
                batch_output_files.append(output_path)

            return batch_output_files

        # Create batches
        batches = []
        for i in range(1, num_pages + 1, batch_size):
            batches.append((i, min(i + batch_size - 1, num_pages)))

        # Process batches in parallel
        with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
            # Submit all batches to the executor
            future_to_batch = {executor.submit(convert_batch, batch): batch for batch in batches}

            # Process results as they complete with progress bar
            with tqdm(total=len(batches), desc="Converting PDF pages") as pbar:
                for future in concurrent.futures.as_completed(future_to_batch):
                    batch_files = future.result()
                    output_files.extend(batch_files)
                    pbar.update(1)

        # Sort output files by page number
        output_files.sort(key=lambda x: int(os.path.basename(x).split('_')[1].split('.')[0]))

        return output_files

    except Exception as e:
        raise Exception(f"Error converting PDF: {str(e)}")

# Example usage
if __name__ == "__main__":
    try:
        # Convert a sample PDF
        folder_path = "/content/drive/MyDrive/RAG"
        pdf_file = os.path.join(folder_path, "Business_Model_Canvas_de.pdf")
        output_dir = "/content/drive/MyDrive/RAG/Business_Model_Canvas_de"

        # Convert PDF to images with parallel processing
        image_files = convert_pdf_to_jpg(
            pdf_file,
            output_dir,
            dpi=400,
            threads=os.cpu_count(),  # Use all available CPU cores
            batch_size=5            # Process 5 pages at a time
        )

        print(f"\nSuccessfully converted {len(image_files)} pages")

    except Exception as e:
        print(f"Error: {str(e)}")

PDF has 7 pages. Starting conversion...


Converting PDF pages: 100%|██████████| 2/2 [00:04<00:00,  2.22s/it]


Successfully converted 7 pages





In [22]:
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F
from PIL import Image
import os
from tqdm import tqdm
import numpy as np
import json
import pickle
import time
class ImageRetriever:
    def __init__(self):
        """Initialize basic attributes without loading the model."""
        self.images = []
        self.image_paths = []
        self.embeddings = None
        self.model = None
        self.tokenizer = None

    def _init_model(self, model_name="openbmb/VisRAG-Ret", use_cuda=True):
        """Initialize the model only when needed."""
        if self.model is None:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
            device = 'cuda' if use_cuda and torch.cuda.is_available() else 'cpu'
            self.model = AutoModel.from_pretrained(
                model_name,
                torch_dtype=torch.bfloat16 if device == 'cuda' else torch.float32,
                trust_remote_code=True
            ).to(device)
            self.model.eval()

    def weighted_mean_pooling(self, hidden, attention_mask):
        """Apply weighted mean pooling to the hidden states."""
        attention_mask_ = attention_mask * attention_mask.cumsum(dim=1)
        s = torch.sum(hidden * attention_mask_.unsqueeze(-1).float(), dim=1)
        d = attention_mask_.sum(dim=1, keepdim=True).float()
        return s / d

    @torch.no_grad()
    def encode(self, text_or_image_list):
        """Encode text queries or images into embeddings."""
        self._init_model()  # Initialize model only if needed

        if isinstance(text_or_image_list[0], str):
            inputs = {
                "text": text_or_image_list,
                'image': [None] * len(text_or_image_list),
                'tokenizer': self.tokenizer
            }
        else:
            inputs = {
                "text": [''] * len(text_or_image_list),
                'image': text_or_image_list,
                'tokenizer': self.tokenizer
            }

        outputs = self.model(**inputs)
        attention_mask = outputs.attention_mask
        hidden = outputs.last_hidden_state

        reps = self.weighted_mean_pooling(hidden, attention_mask)
        embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
        return embeddings
    def load_images(self, image_dir, save_dir=None):
        """Load images and embeddings, computing only if necessary."""
        print(f"\nAttempting to load images from directory: {image_dir}")
        print(f"Embeddings directory: {save_dir}")

        if not save_dir:
            print("No save_dir provided, will compute embeddings without saving")
            should_compute = True
        else:
            # Check for existing embeddings
            embeddings_path = os.path.join(save_dir, 'embeddings.pkl')
            paths_file = os.path.join(save_dir, 'image_paths.json')

            print(f"Checking for existing embeddings at: {embeddings_path}")
            print(f"Checking for paths file at: {paths_file}")

            if os.path.exists(embeddings_path) and os.path.exists(paths_file):
                try:
                    # Load embeddings and paths
                    print("Found existing embedding files, attempting to load...")
                    with open(embeddings_path, 'rb') as f:
                        self.embeddings = pickle.load(f)
                    with open(paths_file, 'r') as f:
                        self.image_paths = json.load(f)['image_paths']

                    # Verify image paths still exist
                    missing_images = [p for p in self.image_paths if not os.path.exists(p)]
                    if missing_images:
                        print(f"Found {len(missing_images)} missing images, will recompute")
                        should_compute = True
                    else:
                        # Load images
                        print("Loading images from saved paths...")
                        self.images = []
                        for path in self.image_paths:
                            image = Image.open(path).convert('RGB')
                            self.images.append(image)

                        print(f"Successfully loaded {len(self.images)} images and their embeddings")
                        return

                except Exception as e:
                    print(f"Error loading saved embeddings: {e}")
                    print("Will recompute embeddings")
                    should_compute = True
            else:
                print("No existing embedding files found")
                should_compute = True

        # If we get here, we need to compute embeddings
        print("\nComputing new embeddings...")
        supported_formats = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff'}
        self.images = []
        self.image_paths = []

        # Load images
        for filename in os.listdir(image_dir):
            if os.path.splitext(filename)[1].lower() in supported_formats:
                image_path = os.path.join(image_dir, filename)
                try:
                    image = Image.open(image_path).convert('RGB')
                    self.images.append(image)
                    self.image_paths.append(image_path)
                except Exception as e:
                    print(f"Error loading {filename}: {str(e)}")

        if not self.images:
            raise ValueError(f"No valid images found in {image_dir}")

        # Compute embeddings
        print(f"Computing embeddings for {len(self.images)} images...")
        self.embeddings = self.encode(self.images)

        # Save if requested
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
            with open(os.path.join(save_dir, 'embeddings.pkl'), 'wb') as f:
                pickle.dump(self.embeddings, f)
            with open(os.path.join(save_dir, 'image_paths.json'), 'w') as f:
                json.dump({'image_paths': self.image_paths}, f)
            print(f"Saved new embeddings to {save_dir}")

    def query(self, question, k=3):
        """Query the images with a question and return top-k most relevant images."""
        if self.embeddings is None:
            raise ValueError("No images loaded. Please load images first using load_images()")

        # Prepare and encode query
        query = ["Represent this query for retrieving relevant documents: " + question]
        query_embedding = self.encode(query)

        # Get top-k results
        scores = (query_embedding @ self.embeddings.T)[0]
        top_k_indices = np.argsort(scores)[-k:][::-1]

        return [
            {
                'image_path': self.image_paths[idx],
                'score': float(scores[idx]),
                'image': self.images[idx]
            }
            for idx in top_k_indices
        ]

def main():
    # Initialize retriever
    start_time = time.time()
    retriever = ImageRetriever()

    # Define directories
    folder_path = "/content/drive/MyDrive/RAG"
    image_dir = os.path.join(folder_path, "Business_Model_Canvas_de")
    embeddings_dir =os.path.join(folder_path, "embeddings") # Directory to save/load embeddings


    # Load images and compute/load embeddings
    retriever.load_images(image_dir, save_dir=embeddings_dir)

    # Example queries
    questions = [
    "Wann lohnt sich die Nutzung des Business Model Canvas besonders?",
    "In welchen Situationen ist das Business Model Canvas besonders nützlich?"
    ]

    # Process each query
    for question in questions:
        print(f"\nQuery: {question}")
        results = retriever.query(question, k=10)

        # Print results
        for i, result in enumerate(results, 1):
            print(f"\nResult {i}:")
            print(f"Image: {os.path.basename(result['image_path'])}")
            print(f"Score: {result['score']:.4f}")

    total_execution_time = time.time() - start_time
    print(" ")
    print(f"time: {total_execution_time:.4f} second")
if __name__ == "__main__":
    main()


Attempting to load images from directory: /content/drive/MyDrive/RAG/Business_Model_Canvas_de
Embeddings directory: /content/drive/MyDrive/RAG/embeddings
Checking for existing embeddings at: /content/drive/MyDrive/RAG/embeddings/embeddings.pkl
Checking for paths file at: /content/drive/MyDrive/RAG/embeddings/image_paths.json
Found existing embedding files, attempting to load...
Loading images from saved paths...
Successfully loaded 7 images and their embeddings

Query: Wann lohnt sich die Nutzung des Business Model Canvas besonders?


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


Result 1:
Image: page_3.jpg
Score: 0.4921

Result 2:
Image: page_2.jpg
Score: 0.4605

Result 3:
Image: page_5.jpg
Score: 0.3145

Result 4:
Image: page_6.jpg
Score: 0.3014

Result 5:
Image: page_4.jpg
Score: 0.2593

Result 6:
Image: page_1.jpg
Score: 0.2444

Result 7:
Image: page_7.jpg
Score: 0.2296

Query: Welche neun Hauptbereiche gehören zum Business Model Canvas?

Result 1:
Image: page_3.jpg
Score: 0.4164

Result 2:
Image: page_2.jpg
Score: 0.4155

Result 3:
Image: page_6.jpg
Score: 0.3497

Result 4:
Image: page_5.jpg
Score: 0.3457

Result 5:
Image: page_4.jpg
Score: 0.3011

Result 6:
Image: page_7.jpg
Score: 0.2473

Result 7:
Image: page_1.jpg
Score: 0.2279
 
time: 14.7377 second


# Now let's try colqwen 0.1

In [None]:
!pip install transformers==4.49.0 peft==0.14.0  #-> restart kernel
# !pip install transformers==4.34.0
#pip uninstall transformers -y

In [None]:
#!pip uninstall -y flash-attn
!pip install --no-cache-dir --force-reinstall "transformers>=4.46.0" "colpali-engine>=0.3.0" accelerate safetensors

In [13]:
import os
os.environ["FLASH_ATTENTION_FORCE_UNAVAILABLE"] = "1"

In [1]:
import transformers, colpali_engine, torch, sys
print("Python:", sys.version)
print("torch:", torch.__version__)
print("transformers:", transformers.__version__, "|", transformers.__file__)
print("colpali_engine:", getattr(colpali_engine, "__version__", "unknown"), "|", colpali_engine.__file__)

# Sanity: does transformers expose qwen2_5_omni?
try:
    from transformers.models import qwen2_5_omni
    print("qwen2_5_omni module is present ✅")
except Exception as e:
    print("qwen2_5_omni missing ❌ ->", repr(e))

Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
torch: 2.7.1+cu126
transformers: 4.53.3 | /usr/local/lib/python3.12/dist-packages/transformers/__init__.py
colpali_engine: unknown | /usr/local/lib/python3.12/dist-packages/colpali_engine/__init__.py
qwen2_5_omni module is present ✅


In [3]:
##%%
from transformers.utils.import_utils import is_flash_attn_2_available
from colpali_engine.models import ColQwen2, ColQwen2Processor
import torch
import os
from PIL import Image
import time


def process_image_directory(image_dir, queries, base_model="vidore/colqwen2-base", adapter_model="vidore/colqwen2-v0.1"):
    """
    Process all images in a directory with the ColQwen2 model.

    Args:
        image_dir (str): Path to directory containing images
        queries (list): List of text queries to score against the images
        base_model (str): HuggingFace model identifier for the base model
        adapter_model (str): HuggingFace model identifier for the adapter

    Returns:
        dict: Results with scores for each query-image pair
    """
    # Load base model and adapter
    print(f"Loading base model from {base_model}...")
    model = ColQwen2.from_pretrained(
        base_model,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None,
    ).eval()

    # Load adapter
    print(f"Loading adapter from {adapter_model}...")
    try:
        model.load_adapter(adapter_model)
        print("Adapter loaded successfully")
    except Exception as e:
        print(f"Warning: Could not load adapter: {e}")
        print("Continuing with base model only...")

    processor = ColQwen2Processor.from_pretrained(base_model)
    print("Model and processor loaded successfully")

    # Collect all valid images from directory
    images = []
    image_paths = []
    valid_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.webp']

    print(f"Loading images from {image_dir}...")
    for filename in os.listdir(image_dir):
        file_path = os.path.join(image_dir, filename)
        file_ext = os.path.splitext(filename)[1].lower()

        if os.path.isfile(file_path) and file_ext in valid_extensions:
            try:
                img = Image.open(file_path).convert('RGB')
                images.append(img)
                image_paths.append(filename)
                print(f"Loaded image: {filename}")
            except Exception as e:
                print(f"Error loading {filename}: {e}")

    if not images:
        print("No valid images found in directory")
        return None

    print(f"Processing {len(images)} images with {len(queries)} queries...")

    print("Processing queries...")
    query_embeddings = []

    for i, query in enumerate(queries):
        try:
            batch_queries = processor.process_queries([query])
            batch_queries = {k: v.to(model.device) for k, v in batch_queries.items()}

            with torch.no_grad():
                embedding = model(**batch_queries)
                query_embeddings.append(embedding)

            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        except Exception as e:
            print(f"  Error processing query: {e}")
            return None

    # Process images one at a time
    print("Processing images...")
    image_embeddings = []
    successful_images = []

    for i, (img, img_path) in enumerate(zip(images, image_paths)):
        # print(f"  Processing image {i+1}/{len(images)}: {img_path}")
        try:
            # Process a single image
            batch_images = processor.process_images([img])
            batch_images = {k: v.to(model.device) for k, v in batch_images.items()}

            with torch.no_grad():
                embedding = model(**batch_images)
                image_embeddings.append(embedding)
                successful_images.append(img_path)

            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        except Exception as e:
            print(f"  Error processing image {img_path}: {e}")

    if not image_embeddings:
        print("No image embeddings were processed successfully.")
        return None

    # Calculate similarity scores
    print("Calculating similarity scores...")
    results = {}

    for i, query in enumerate(queries):
        query_scores = {}
        query_emb = query_embeddings[i]

        for j, img_path in enumerate(successful_images):
            img_emb = image_embeddings[j]

            # Calculate cosine similarity (using score_multi_vector from processor)
            scores = processor.score_multi_vector(query_emb, img_emb)

            # Get the score (should be a single value)
            score = scores[0, 0].item()
            query_scores[img_path] = score

        results[query] = query_scores

    return results

if __name__ == "__main__":
    # Set your image directory and queries
    folder_path = "/content/drive/MyDrive/RAG"
    image_directory = os.path.join(folder_path, "Business_Model_Canvas_de")

    queries = [
    "Wann lohnt sich die Nutzung des Business Model Canvas besonders?",
    "In welchen Situationen ist das Business Model Canvas besonders nützlich?"
    ]

    start_time = time.time()
    # Process the directory
    results = process_image_directory(image_directory, queries)

    # Display results
    if results:
        for query, scores in results.items():
            print(f"\nQuery: {query}")
            # Sort images by score (highest first)
            sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            for img_name, score in sorted_scores:
                print(f"  {img_name}: {score:.4f}")
    else:
        print("No results were obtained.")

    total_execution_time = time.time() - start_time
    print(f"\nScript completed in {total_execution_time:.2f} seconds")

Loading base model from vidore/colqwen2-base...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading adapter from vidore/colqwen2-v0.1...


adapter_config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/74.0M [00:00<?, ?B/s]

Loading adapter weights from vidore/colqwen2-v0.1 led to unexpected keys not found in the model: model.layers.0.mlp.down_proj.lora_A.default.weight, model.layers.0.mlp.down_proj.lora_B.default.weight, model.layers.0.mlp.gate_proj.lora_A.default.weight, model.layers.0.mlp.gate_proj.lora_B.default.weight, model.layers.0.mlp.up_proj.lora_A.default.weight, model.layers.0.mlp.up_proj.lora_B.default.weight, model.layers.0.self_attn.k_proj.lora_A.default.weight, model.layers.0.self_attn.k_proj.lora_B.default.weight, model.layers.0.self_attn.o_proj.lora_A.default.weight, model.layers.0.self_attn.o_proj.lora_B.default.weight, model.layers.0.self_attn.q_proj.lora_A.default.weight, model.layers.0.self_attn.q_proj.lora_B.default.weight, model.layers.0.self_attn.v_proj.lora_A.default.weight, model.layers.0.self_attn.v_proj.lora_B.default.weight, model.layers.1.mlp.down_proj.lora_A.default.weight, model.layers.1.mlp.down_proj.lora_B.default.weight, model.layers.1.mlp.gate_proj.lora_A.default.weight,

Adapter loaded successfully


preprocessor_config.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/392 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

video_preprocessor_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

Model and processor loaded successfully
Loading images from /content/drive/MyDrive/RAG/Business_Model_Canvas_de...
Loaded image: page_6.jpg
Loaded image: page_7.jpg
Loaded image: page_1.jpg
Loaded image: page_2.jpg
Loaded image: page_3.jpg
Loaded image: page_4.jpg
Loaded image: page_5.jpg
Processing 7 images with 2 queries...
Processing queries...
Processing images...
Calculating similarity scores...

Query: Wann lohnt sich die Nutzung des Business Model Canvas besonders?
  page_2.jpg: 18.6250
  page_3.jpg: 18.6250
  page_5.jpg: 18.3750
  page_6.jpg: 18.1250
  page_1.jpg: 17.3750
  page_7.jpg: 17.2500
  page_4.jpg: 15.5625

Query: In welchen Situationen ist das Business Model Canvas besonders nützlich?
  page_2.jpg: 19.3750
  page_5.jpg: 19.2500
  page_3.jpg: 18.6250
  page_6.jpg: 18.2500
  page_7.jpg: 17.6250
  page_1.jpg: 17.2500
  page_4.jpg: 16.0000

Script completed in 92.90 seconds


# Let's  combine qwenvl (as LLM) with RAG_Images on pdf with 80 pages

In [14]:
import os
import concurrent.futures
from pdf2image import convert_from_path
from tqdm import tqdm

def convert_pdf_to_jpg(pdf_path: str, output_folder: str, dpi: int = 400,
                       threads: int = 4, batch_size: int = 10) -> list:
    """
    Convert PDF pages to JPG images using pdf2image with parallel processing.

    Parameters:
    -----------
    pdf_path : str
        Path to the input PDF file
    output_folder : str
        Path to the folder where JPG images will be saved
    dpi : int, optional
        DPI for rendering (higher means better quality but larger files)
    threads : int, optional
        Number of worker threads to use for parallel processing
    batch_size : int, optional
        Number of pages to process in each batch

    Returns:
    --------
    list
        List of paths to the generated JPG files
    """

    # Validate input PDF file
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    try:
        # Get total number of pages first
        info = convert_from_path(pdf_path, dpi=dpi, first_page=1, last_page=1)
        total_pages = convert_from_path(pdf_path, dpi=72, first_page=1, last_page=None, thread_count=1)
        num_pages = len(total_pages)
        print(f"PDF has {num_pages} pages. Starting conversion...")

        output_files = []

        # Define a function to convert a batch of pages
        def convert_batch(batch):
            start_page, end_page = batch
            batch_images = convert_from_path(
                pdf_path,
                dpi=dpi,
                first_page=start_page,
                last_page=end_page,
                thread_count=1  # Use 1 thread per worker as we're already parallelizing
            )

            batch_output_files = []
            for i, image in enumerate(batch_images):
                page_num = start_page + i
                output_path = os.path.join(output_folder, f"page_{page_num}.jpg")
                image.save(output_path, "JPEG")
                batch_output_files.append(output_path)

            return batch_output_files

        # Create batches
        batches = []
        for i in range(1, num_pages + 1, batch_size):
            batches.append((i, min(i + batch_size - 1, num_pages)))

        # Process batches in parallel
        with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
            # Submit all batches to the executor
            future_to_batch = {executor.submit(convert_batch, batch): batch for batch in batches}

            # Process results as they complete with progress bar
            with tqdm(total=len(batches), desc="Converting PDF pages") as pbar:
                for future in concurrent.futures.as_completed(future_to_batch):
                    batch_files = future.result()
                    output_files.extend(batch_files)
                    pbar.update(1)

        # Sort output files by page number
        output_files.sort(key=lambda x: int(os.path.basename(x).split('_')[1].split('.')[0]))

        return output_files

    except Exception as e:
        raise Exception(f"Error converting PDF: {str(e)}")

# Example usage
if __name__ == "__main__":
    try:
        # Convert a sample PDF
        folder_path = "/content/drive/MyDrive/RAG"
        pdf_file = os.path.join(folder_path, "PIF-2023-Annual-Report-EN.pdf")
        output_dir = "/content/drive/MyDrive/RAG/PIF-2023-Annual-Report-EN"
        # Convert PDF to images with parallel processing
        image_files = convert_pdf_to_jpg(
            pdf_file,
            output_dir,
            dpi=400,
            threads=os.cpu_count(),  # Use all available CPU cores
            batch_size=5            # Process 5 pages at a time
        )

        print(f"\nSuccessfully converted {len(image_files)} pages")

    except Exception as e:
        print(f"Error: {str(e)}")

PDF has 80 pages. Starting conversion...


Converting PDF pages: 100%|██████████| 16/16 [00:50<00:00,  3.17s/it]



Successfully converted 80 pages


In [1]:
##%%
import os
import torch
from PIL import Image
import time
from typing import List
import base64
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# This function is no longer needed as we're directly specifying image paths

class QwenVLProcessor:
    def __init__(
        self,
        model_name: str = "Qwen/Qwen2.5-VL-7B-Instruct",
        device: str = "cuda",
        min_pixels: int = 128*16*16,
        max_pixels: int = 1024*16*16,
        cache_dir: str = None
    ):
        """
        Initialize the QwenVL processor with custom configuration.
        """
        # Configure CUDA memory allocation
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

        # Clear CUDA cache
        if device == "cuda":
            torch.cuda.empty_cache()

        print(f"Loading QwenVL model from {model_name}...")
        # Load model and assign to self
        self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map=device,
            attn_implementation="sdpa",
            use_cache=True,
            cache_dir=cache_dir,
        )

        # Load processor and assign to self
        self.processor = AutoProcessor.from_pretrained(
            model_name,
            min_pixels=min_pixels,
            max_pixels=max_pixels,
            use_fast=True
        )

        self.device = device
        print("QwenVL model and processor loaded successfully")

    def _encode_image(self, image_path: str) -> str:
        """
        Encode a local image file to base64.
        """
        with open(image_path, "rb") as image_file:
            encoded_string = base64.b64encode(image_file.read()).decode('utf-8')
        return f"data:image/jpeg;base64,{encoded_string}"

    def prepare_messages(
        self,
        image_paths: List[str],
        prompt: str
    ) -> List[dict]:
        """
        Prepare messages for the model using local image paths.
        """
        if isinstance(image_paths, str):
            image_paths = [image_paths]

        messages = []
        for path in image_paths:
            encoded_image = self._encode_image(path)
            messages.append({
                "role": "user",
                "content": [
                    {"type": "image", "image": encoded_image},
                    {"type": "text", "text": prompt}
                ]
            })
        return messages

    def process_images(
        self,
        image_paths: List[str],
        prompt: str,
        max_new_tokens: int = 2000,
        temperature: float = 0.1,
        top_p: float = 0.9
    ) -> List[str]:
        """
        Process local images with the given prompt.
        """
        if isinstance(image_paths, str):
            image_paths = [image_paths]

        results = []

        # Process one image at a time to avoid memory issues
        for image_path in image_paths:
            print(f"Processing image: {os.path.basename(image_path)}")
            messages = self.prepare_messages(image_path, prompt)

            with torch.inference_mode():
                text = self.processor.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=True
                )

                image_inputs, video_inputs = process_vision_info(messages)
                inputs = self.processor(
                    text=[text],
                    images=image_inputs,
                    videos=video_inputs,
                    padding=True,
                    return_tensors="pt"
                )

                inputs = inputs.to(self.device)

                generated_ids = self.model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=True,
                    temperature=temperature,
                    top_p=top_p,
                    pad_token_id=self.processor.tokenizer.pad_token_id,
                    eos_token_id=self.processor.tokenizer.eos_token_id
                )

                generated_ids_trimmed = [
                    out_ids[len(in_ids):]
                    for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
                ]

                output_text = self.processor.batch_decode(
                    generated_ids_trimmed,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=False
                )

                results.append(output_text[0])

                # Clear cache after each image
                if self.device == "cuda":
                    torch.cuda.empty_cache()

        return results

def main():
    start_time = time.time()

    # Set your image directory

    folder_path = "/content/drive/MyDrive/RAG"
    image_directory = os.path.join(folder_path, "PIF-2023-Annual-Report-EN")



    image_filenames = [
    "page_23.jpg",
    "page_22.jpg",
    ]

    # Generate full paths
    image_paths = [os.path.join(image_directory, filename) for filename in image_filenames]

    # Verify images exist
    valid_image_paths = []
    for path in image_paths:
        if os.path.isfile(path):
            valid_image_paths.append(path)
        else:
            print(f"Warning: Image not found: {path}")

    if not valid_image_paths:
        print("No valid images found.")
        return

    print(f"Found {len(valid_image_paths)} images to process with QwenVL")

    # Initialize QwenVL processor
    processor = QwenVLProcessor()

    # OCR prompt
    ocr_prompt = """You are an expert OCR model who can read and interpret hard images in details
                   and in great precision. Given these images extract every detail of text in an organized format.
                   Include all text visible in the image, preserving the structure where possible."""

    # Process images with QwenVL for OCR
    results = processor.process_images(valid_image_paths, prompt=ocr_prompt)

    # Print results
    print("\n===== OCR RESULTS =====")
    for i, (image_path, ocr_text) in enumerate(zip(valid_image_paths, results)):
        print(f"\nImage {i+1}: {os.path.basename(image_path)}")
        print("-" * 40)
        print(ocr_text)
        print("-" * 40)

    # Save results to file
    with open("ocr_results.txt", "w", encoding="utf-8") as f:
        for i, (image_path, ocr_text) in enumerate(zip(valid_image_paths, results)):
            f.write(f"\nImage {i+1}: {os.path.basename(image_path)}\n")
            f.write("-" * 40 + "\n")
            f.write(ocr_text + "\n")
            f.write("-" * 40 + "\n")

    print(f"\nResults saved to ocr_results.txt")

    total_execution_time = time.time() - start_time
    print(f"\nScript completed in {total_execution_time:.2f} seconds")

if __name__ == "__main__":
    main()

Found 2 images to process with QwenVL
Loading QwenVL model from Qwen/Qwen2.5-VL-7B-Instruct...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


QwenVL model and processor loaded successfully
Processing image: page_23.jpg
Processing image: page_22.jpg

===== OCR RESULTS =====

Image 1: page_23.jpg
----------------------------------------
Sure, here is the extracted text from the image:

---

**Strategic Review | Performance Review**

**THE PUBLIC INVESTMENT FUND'S PERFORMANCE IN 2023**

- **Total Shareholder Return Since VRP Inception (Per Annum):**
  - **8.7%**

The investments have been spread over a wide range of industries, and according to the Global Industry Classification Standard (GICS), they are represented as follows:

- **Energy:** 23.1%
- **Real Estate:** 17.0%
- **Information Technology:** 9.4%
- **Financials:** 7.3%
- **Communication Services:** 6.9%
- **Utilities:** 5.5%
- **Healthcare:** 4.6%
- **Materials:** 3.1%
- **Consumer Staples:** 1.2%
- **Healthcare:** 0.4%
- **Consumer Discretionary:** 2.5%
- **Industrials:** 18.9%

---

This text provides a detailed breakdown of the Public Investment Fund's performance