# Qwen2.5-VL-7B-Instruct for Structured Information Extraction

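This notebook demonstrates how to use the Qwen2.5-VL-7B-Instruct model to extract structured information from images, with a particular focus on invoice and document processing. Note: the extractor class defined below loads the `Qwen/Qwen2-VL-7B-Instruct` checkpoint by default.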

## 1. Setup and Dependencies

In [2]:
# Install required dependencies
!uv pip install transformers torch torchvision pillow accelerate

Using Python 3.11.13 environment at: /Users/huetuanthi/dev/dokeai/vlm-ocr/.venv
Audited 5 packages in 22ms


In [3]:
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from PIL import Image
import json
from typing import Dict, Any, List, Optional
import os
from pathlib import Path

## 2. Model Initialization

In [4]:
class QwenVLMExtractor:
    """Extract structured (JSON) information from images with a Qwen2-VL chat model.

    The model and its processor are loaded once in ``__init__``; each call to
    ``extract_information`` then sends one image plus instructions to the
    model and tries to parse the reply as JSON.
    """

    # Instructions sent as the system turn of every request.
    SYSTEM_PROMPT = """You are an advanced Vision Language Model specialized in extracting structured information from images.
Your capabilities include:
- Accurate text recognition (OCR)
- Understanding document layouts and structures
- Extracting key-value pairs, tables, and hierarchical information
- Identifying and categorizing different types of information

Guidelines:
1. Extract ALL visible text and information from the image
2. Preserve the original structure and relationships between elements
3. Output valid JSON format
4. Include confidence indicators where appropriate
5. Handle multiple languages if present"""

    def __init__(self, model_name: str = "Qwen/Qwen2-VL-7B-Instruct"):
        """Load the model and processor identified by ``model_name``.

        Args:
            model_name: Checkpoint name or path passed to ``from_pretrained``.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        # Half precision is only requested when CUDA is available.
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto"
        )
        self.processor = AutoProcessor.from_pretrained(model_name)

    @staticmethod
    def _build_user_text(task_description: str,
                         output_format: Optional[Dict] = None) -> str:
        """Return the user instruction, with the expected schema appended if given."""
        if not output_format:
            return task_description
        schema_text = json.dumps(output_format, indent=2)
        return f"{task_description}\n\nExpected output format:\n{schema_text}"

    @staticmethod
    def _parse_json_output(output_text: str) -> Dict[str, Any]:
        """Parse the model reply as JSON, tolerating fences and surrounding prose.

        Tries, in order: the reply as-is, the body of a leading Markdown code
        fence, and the span from the first ``{`` to the last ``}``. If none of
        them parses, the raw text is returned together with a ``parse_error``
        marker instead of raising.
        """
        stripped = output_text.strip()
        candidates = [stripped]

        if stripped.startswith("```"):
            # Drop the opening fence line (e.g. "```json") and the closing fence.
            _, _, fenced_body = stripped.partition("\n")
            candidates.append(fenced_body.rsplit("```", 1)[0].strip())

        first_brace = stripped.find("{")
        last_brace = stripped.rfind("}")
        if 0 <= first_brace < last_brace:
            candidates.append(stripped[first_brace:last_brace + 1])

        for candidate in candidates:
            try:
                return json.loads(candidate)
            except json.JSONDecodeError:
                continue

        return {"raw_output": output_text, "parse_error": "Failed to parse JSON"}

    def create_prompt(self, task_description: str = "Extract all relevant information",
                     output_format: Optional[Dict] = None) -> str:
        """Render the full chat prompt as a single tagged string.

        Args:
            task_description: Instruction placed in the user turn.
            output_format: Optional schema; when truthy it is appended to the
                user turn as indented JSON.

        Returns:
            A string containing the system turn, the user turn (image
            placeholder followed by the instruction) and an open assistant
            turn.
        """
        user_prompt = self._build_user_text(task_description, output_format)

        return (
            "<|im_start|>system\n"
            f"{self.SYSTEM_PROMPT}<|im_end|>\n"
            "<|im_start|>user\n"
            f"<|vision_start|><|image_pad|><|vision_end|>{user_prompt}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )

    def extract_information(self,
                          image_path: str,
                          task_description: str = "Extract all relevant information from this image",
                          output_format: Optional[Dict] = None,
                          max_new_tokens: int = 1024) -> Dict[str, Any]:
        """Extract structured information from an image.

        Args:
            image_path: Path of the image file to read.
            task_description: Instruction sent to the model.
            output_format: Optional schema describing the expected JSON; it
                is appended to the instruction so the model actually sees it.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            The parsed JSON reply, or a dict with ``raw_output`` and
            ``parse_error`` keys when the reply cannot be parsed. Errors from
            opening the image or running the model are not caught here.
        """
        # Convert inside the context manager so the file handle is released
        # as soon as the pixel data has been copied.
        with Image.open(image_path) as source_image:
            image = source_image.convert('RGB')

        # The system prompt and schema go through the chat template, so the
        # processor can expand the image placeholder itself.
        messages = [
            {
                "role": "system",
                "content": [{"type": "text", "text": self.SYSTEM_PROMPT}],
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": image,
                    },
                    {
                        "type": "text",
                        "text": self._build_user_text(task_description, output_format),
                    },
                ],
            },
        ]

        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            # An absent modality is passed as None rather than an empty list.
            images=image_inputs or None,
            videos=video_inputs or None,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.device)

        with torch.no_grad():
            generated_ids = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=0.1,  # Low temperature for more deterministic outputs
                do_sample=True,
            )

        # generate() returns prompt + completion; keep only the completion.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = self.processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]

        return self._parse_json_output(output_text)

## 3. Helper Functions

In [5]:
def process_vision_info(messages):
    """Collect the image and video payloads referenced by user messages.

    Only messages whose ``role`` is ``"user"`` are inspected. A message whose
    ``content`` is a plain string carries no media and is skipped.

    Args:
        messages: Chat messages, each a dict with ``role`` and ``content``.
            ``content`` is either a string or a list of parts, where image
            parts look like ``{"type": "image", "image": ...}`` and video
            parts like ``{"type": "video", "video": ...}``.

    Returns:
        A ``(image_inputs, video_inputs)`` tuple. Each element is a list of
        payloads in message order, or ``None`` when no payload of that kind
        was found, so the result can be handed to a processor that treats
        ``None`` as "modality absent".
    """
    image_inputs = []
    video_inputs = []

    for message in messages:
        if message["role"] != "user":
            continue

        parts = message["content"]
        if isinstance(parts, str):
            # Text-only shorthand: nothing to collect.
            continue

        for part in parts:
            part_type = part.get("type")
            if part_type == "image":
                image_inputs.append(part["image"])
            elif part_type == "video":
                video_inputs.append(part["video"])

    return image_inputs or None, video_inputs or None

def create_invoice_schema():
    """Return a fresh example schema describing the fields of an invoice.

    Leaf values are the type names ``"string"`` or ``"number"``; ``line_items``
    is a one-element list holding the shape of a single line item.
    """
    text, numeric = "string", "number"

    def party_fields():
        # Vendor and customer share a layout but must be separate dicts.
        return dict.fromkeys(("name", "address", "tax_id", "contact"), text)

    invoice_details = dict.fromkeys(
        ("invoice_number", "invoice_date", "due_date", "currency"), text
    )
    invoice_details["total_amount"] = numeric

    line_item = {"description": text}
    line_item.update(dict.fromkeys(("quantity", "unit_price", "total"), numeric))

    schema = {}
    schema["invoice_details"] = invoice_details
    schema["vendor_details"] = party_fields()
    schema["customer_details"] = party_fields()
    schema["line_items"] = [line_item]
    schema["tax_details"] = dict.fromkeys(("tax_rate", "tax_amount"), numeric)
    schema["payment_details"] = dict.fromkeys(
        ("method", "account_number", "reference"), text
    )
    return schema

## 4. Example Usage

In [None]:
# Build the extractor once; constructing it loads the model and processor.
extractor = QwenVLMExtractor()

# Placeholder location of the invoice image -- point this at a real file.
image_path = "path/to/your/invoice.jpg"

# Instructions handed to the model for this extraction run.
task_description = (
    "Extract all information from this invoice image. \n"
    "Focus on:\n"
    "1. Invoice metadata (number, dates, amounts)\n"
    "2. Vendor and customer information\n"
    "3. Line items with descriptions and amounts\n"
    "4. Tax and payment details\n"
    "\n"
    "Return the information as structured JSON."
)

# Schema describing the JSON structure we expect back.
invoice_schema = create_invoice_schema()

# Only run the extraction when the image file is actually present.
if not os.path.exists(image_path):
    print(f"Please provide a valid image path. Current path '{image_path}' does not exist.")
else:
    result = extractor.extract_information(
        image_path=image_path,
        task_description=task_description,
        output_format=invoice_schema,
    )

    print("Extracted Information:")
    print(json.dumps(result, indent=2))

Using device: cpu


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

## 5. Batch Processing

In [None]:
def batch_extract(extractor: "QwenVLMExtractor",
                 image_folder: str,
                 task_description: str,
                 output_format: Optional[Dict] = None) -> List[Dict]:
    """Run the extractor over every image file directly inside a folder.

    Files are selected by suffix (case-insensitive) and processed in
    filename order so repeated runs yield results in the same order.
    Sub-directories are not descended into and are never treated as images.

    Args:
        extractor: Object providing ``extract_information(image_path=...,
            task_description=..., output_format=...)``.
        image_folder: Directory to scan.
        task_description: Instruction forwarded to every extraction call.
        output_format: Optional schema forwarded to every extraction call.

    Returns:
        One dict per image. Successful entries carry ``filename``,
        ``extracted_data`` and ``status == "success"``; entries whose
        extraction raised carry ``filename``, ``error`` (the exception text)
        and ``status == "failed"``.
    """
    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff'}

    folder_path = Path(image_folder)
    # iterdir() yields entries in arbitrary order; sort for reproducibility.
    image_files = sorted(
        (
            entry for entry in folder_path.iterdir()
            if entry.is_file() and entry.suffix.lower() in image_extensions
        ),
        key=lambda entry: entry.name,
    )

    total = len(image_files)
    print(f"Found {total} images to process")

    results = []
    for position, image_file in enumerate(image_files, start=1):
        print(f"\nProcessing {position}/{total}: {image_file.name}")

        try:
            extracted = extractor.extract_information(
                image_path=str(image_file),
                task_description=task_description,
                output_format=output_format
            )
        except Exception as exc:
            # Deliberate best-effort: one bad image must not abort the batch,
            # so the failure is recorded and processing continues.
            results.append({
                "filename": image_file.name,
                "error": str(exc),
                "status": "failed"
            })
        else:
            results.append({
                "filename": image_file.name,
                "extracted_data": extracted,
                "status": "success"
            })

    return results

# Example batch processing
# image_folder = "path/to/invoice/folder"
# batch_results = batch_extract(extractor, image_folder, task_description, invoice_schema)

## 6. Custom Extraction Templates

In [None]:
# Expected JSON layout for a retail receipt; leaf values name the field type.
receipt_template = dict(
    store_info=dict(name="string", address="string", phone="string"),
    transaction_info=dict(
        date="string",
        time="string",
        receipt_number="string",
        cashier="string",
    ),
    # A one-element list holding the shape of a single purchased item.
    items=[dict(name="string", quantity="number", price="number")],
    totals=dict(subtotal="number", tax="number", total="number"),
    payment=dict(method="string", amount_paid="number", change="number"),
)

# Expected JSON layout for an identity document; every leaf is a string field.
id_card_template = dict(
    personal_info=dict(
        full_name="string",
        date_of_birth="string",
        gender="string",
        nationality="string",
    ),
    document_info=dict(
        document_type="string",
        document_number="string",
        issue_date="string",
        expiry_date="string",
        issuing_authority="string",
    ),
    address=dict(
        street="string",
        city="string",
        state="string",
        postal_code="string",
        country="string",
    ),
)

# Expected JSON layout for a business card; list-valued leaves mark fields
# that may hold several entries.
business_card_template = dict(
    person=dict(name="string", title="string", department="string"),
    company=dict(name="string", tagline="string", industry="string"),
    contact=dict(
        phone=["string"],
        email=["string"],
        website="string",
        address="string",
    ),
    social_media=dict(linkedin="string", twitter="string", other=["string"]),
)

## 7. Save Results

In [None]:
def save_extraction_results(results: List[Dict], output_file: str):
    """Write extraction results to ``output_file`` as indented UTF-8 JSON.

    Missing parent directories are created. The results are serialized
    before the file is opened, so a value that cannot be encoded as JSON
    raises without truncating or partially writing the target file.

    Args:
        results: JSON-serializable list of result dicts.
        output_file: Destination path of the JSON file.

    Raises:
        TypeError: If ``results`` contains a value ``json`` cannot serialize.
    """
    # Serialize first: a failure here must not touch the file on disk.
    payload = json.dumps(results, indent=2, ensure_ascii=False)

    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(payload)

    print(f"Results saved to: {output_path}")

# Example: Save results
# save_extraction_results(batch_results, "extraction_results.json")