# Simple Qwen2.5-VL-7B-Instruct Inference for Document Extraction

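This notebook demonstrates a simplified approach to using Qwen2.5-VL-7B-Instruct for extracting structured information from documents. Note that the loading cell below currently points at the `Qwen/Qwen2-VL-7B-Instruct` checkpoint and the `Qwen2VLForConditionalGeneration` class; switch both to their Qwen2.5-VL counterparts if you specifically need Qwen2.5-VL.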

## 1. Install Dependencies

In [None]:
!pip install qwen-vl-utils transformers torch torchvision accelerate

## 2. Import Libraries and Load Model

In [None]:
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
from PIL import Image
import json

# Hugging Face Hub checkpoint shared by every loading call in this notebook.
# NOTE(review): the notebook title refers to Qwen2.5-VL, but this ID and the
# model class below are Qwen2-VL -- confirm which one is intended.
model_name = "Qwen/Qwen2-VL-7B-Instruct"

# "auto" leaves both the weight dtype and the device placement to the library.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name, torch_dtype="auto", device_map="auto"
)

# Processor matching the checkpoint; used for chat templating, input
# preparation and decoding in extract_from_image.
processor = AutoProcessor.from_pretrained(model_name)

print("Model loaded successfully!")

## 3. Simple Inference Function

In [None]:
def extract_from_image(image_path, prompt, max_new_tokens=512):
    """
    Extract information from an image using the loaded vision-language model.

    Relies on the module-level `model` and `processor` created in the
    loading cell.

    Args:
        image_path: Path to the image file
        prompt: The extraction prompt
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 512; raise it if long JSON outputs get cut off.

    Returns:
        The decoded model response as a string, without the prompt tokens.
    """
    # Single-turn chat: one user message carrying the image and the prompt.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    # Render the chat template to text, then let the processor build tensors.
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own device instead of guessing "cuda"/"cpu": with
    # device_map="auto" the model is not guaranteed to sit on the default
    # CUDA device just because CUDA is available.
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # The generated sequences start with the prompt tokens; slice them off so
    # only the newly generated answer is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    decoded = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    # Only one conversation was submitted, so the batch has a single entry.
    return decoded[0]

## 4. Document Extraction Examples

In [None]:
# Example 1: Invoice Extraction
# Prompt asking the model to return a JSON object that follows a fixed invoice
# schema. The skeleton inside the prompt lists the expected keys; the empty
# strings and zeros are placeholders for the model to fill in.
# NOTE: a later cell reassigns invoice_prompt with a more detailed schema.
invoice_prompt = """Extract all information from this invoice and return it as a JSON object with the following structure:
{
    "invoice_number": "",
    "date": "",
    "vendor": {
        "name": "",
        "address": ""
    },
    "customer": {
        "name": "",
        "address": ""
    },
    "items": [
        {
            "description": "",
            "quantity": 0,
            "unit_price": 0,
            "total": 0
        }
    ],
    "subtotal": 0,
    "tax": 0,
    "total": 0
}"""

# To run this example, uncomment the two lines below and
# replace the path with your own invoice image.
# result = extract_from_image("path/to/invoice.jpg", invoice_prompt)
# print(result)

## Expected Output for the Invoice

Based on the invoice image, the model should extract:
- Invoice no: 97159829
- Date: 09/18/2015
- Seller: Bradley-Andrade, address: 9879 Elizabeth Common, Lake Jonathan, RI 12335
- Client: Castro PLC, Unit 9678 Box 9664, DPO AP 69387
- Item: 12" Marble Lapis Inlay Chess Table Top With 2" Pieces & 15" Wooden Stand W537
- Total: $978.12

In [None]:
# Test with the actual invoice image
image_path = "./images/invoice.png"

# Extraction prompt: the JSON skeleton lists the fields the model should fill
# in (seller/client details, line items, and the totals summary).
invoice_prompt = """Extract all information from this invoice and return it as a JSON object with the following structure:
{
    "invoice_number": "",
    "date": "",
    "seller": {
        "name": "",
        "address": "",
        "tax_id": "",
        "iban": ""
    },
    "client": {
        "name": "",
        "address": "",
        "tax_id": ""
    },
    "items": [
        {
            "description": "",
            "quantity": 0,
            "unit": "",
            "net_price": 0,
            "net_worth": 0,
            "vat_percent": 0,
            "gross_worth": 0
        }
    ],
    "summary": {
        "net_worth": 0,
        "vat": 0,
        "gross_worth": 0
    }
}"""

# Extract information
try:
    result = extract_from_image(image_path, invoice_prompt)
    print("Extraction successful!")
    print("\nExtracted data:")
    print(result)

    # Models frequently wrap JSON answers in a Markdown code fence
    # (```json ... ```). Drop the opening fence line and the closing fence so
    # json.loads only sees the payload.
    payload = result.strip()
    if payload.startswith("```"):
        payload = payload.split("\n", 1)[-1].rsplit("```", 1)[0]

    # Try to parse as JSON; only a JSON syntax problem is treated as
    # "not valid JSON" -- anything else falls through to the outer handler.
    try:
        json_result = json.loads(payload)
        print("\nParsed JSON:")
        print(json.dumps(json_result, indent=2))
    except json.JSONDecodeError:
        print("\nNote: Output is not valid JSON, showing raw text")

except Exception as e:
    # Top-level notebook boundary: report the failure instead of a traceback
    # and list the usual workarounds for out-of-memory situations.
    print(f"Error during extraction: {e}")
    print("\nNote: The model requires significant memory (20+ GB). Consider using:")
    print("- A cloud GPU service (Colab, Kaggle, etc.)")
    print("- Quantization (4-bit or 8-bit)")
    print("- A smaller model variant")
    print("- CPU offloading (will be very slow)")

## 4.1 Additional Prompt Examples: Receipts and ID Cards

In [None]:
# Example 2: Receipt Extraction
# Prompt that lists the receipt fields to extract as bullet points and asks
# for JSON output without prescribing an exact schema.
receipt_prompt = """You are a receipt parser. Extract the following information from the receipt image:
- Store name
- Store address
- Date and time
- List of items with prices
- Subtotal
- Tax amount
- Total amount
- Payment method

Format the output as a clean JSON object."""

# To run this example, uncomment the two lines below and
# replace the path with your own receipt image.
# result = extract_from_image("path/to/receipt.jpg", receipt_prompt)
# print(result)

In [None]:
# Example 3: ID Card Extraction
# Prompt that names the common ID fields and also asks for any other visible
# fields, returned as structured JSON.
id_card_prompt = """Extract all visible information from this ID card or driver's license.
Include:
- Full name
- Date of birth
- ID number
- Address
- Issue date
- Expiry date
- Any other visible fields

Return as structured JSON."""

# To run this example, uncomment the two lines below and
# replace the path with your own ID card image.
# result = extract_from_image("path/to/id_card.jpg", id_card_prompt)
# print(result)

## 5. Batch Processing

In [None]:
def batch_extract(image_paths, prompt):
    """
    Run one extraction prompt over several images, one image at a time.

    A failure on one image is recorded in its entry and does not stop the
    remaining images from being processed.

    Args:
        image_paths: List of image file paths
        prompt: The extraction prompt applied to every image

    Returns:
        List of dicts in input order. Each has "file" and "status"
        ("success" or "failed"), plus "result" on success or "error"
        (the exception message) on failure.
    """
    outcomes = []

    for position, path in enumerate(image_paths, start=1):
        print(f"Processing image {position}/{len(image_paths)}: {path}")
        try:
            extracted = extract_from_image(path, prompt)
        except Exception as exc:
            # Keep going: store the error text alongside the file name.
            record = {"file": path, "error": str(exc), "status": "failed"}
        else:
            record = {"file": path, "result": extracted, "status": "success"}
        outcomes.append(record)

    return outcomes

# Example usage
# images = ["invoice1.jpg", "invoice2.jpg", "invoice3.jpg"]
# results = batch_extract(images, invoice_prompt)
# 
# # Save results
# with open("extraction_results.json", "w") as f:
#     json.dump(results, f, indent=2)

## 6. Advanced Prompting Techniques

In [None]:
# Few-shot example prompt
# Shows the model one filled-in example of the desired JSON shape and asks it
# to produce the same format for the provided image.
few_shot_prompt = """You are an expert document parser. Here's an example of the expected output format:

Example for an invoice:
{
  "document_type": "invoice",
  "extracted_data": {
    "invoice_number": "INV-2024-001",
    "date": "2024-01-15",
    "vendor": "ABC Company Ltd.",
    "total": 1250.00
  }
}

Now extract information from the provided image in the same format."""

# Chain-of-thought prompt
# Walks the model through numbered steps (identify the document type, locate
# fields, extract text, format as JSON) before it answers.
cot_prompt = """Analyze this document step by step:
1. First, identify the type of document (invoice, receipt, form, etc.)
2. Locate the key fields and their values
3. Extract all text maintaining the structure
4. Format the extracted information as JSON

Let's start:"""

# Specific field extraction
# Restricts the output to three named fields and spells out the exact return
# shape, including hint words for locating the total.
field_prompt = """Extract only the following fields from this document:
- Total amount (look for words like 'Total', 'Amount Due', 'Grand Total')
- Document date
- Document number/ID

Return as: {"total": <amount>, "date": "<date>", "document_id": "<id>"}"""

## 7. Memory-Efficient Loading Options

In [None]:
# For systems with limited GPU memory
def load_model_efficient(use_4bit=False):
    """
    Load the model with a memory-saving configuration.

    Uses the module-level `model_name` checkpoint.

    Args:
        use_4bit: When True, load the weights 4-bit quantized with float16
            compute (requires the `bitsandbytes` package to be installed).
            When False (the default, and the previous behavior), load in
            float16 with `device_map="auto"` and an `offload` folder
            available for weights that are offloaded to disk.

    Returns:
        The loaded Qwen2VLForConditionalGeneration model.
    """
    if use_4bit:
        # Imported lazily so the default path has no quantization dependency.
        from transformers import BitsAndBytesConfig

        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
        )
        return Qwen2VLForConditionalGeneration.from_pretrained(
            model_name,
            quantization_config=quantization_config,
            device_map="auto",
        )

    # Default: half precision with automatic placement and disk offload.
    return Qwen2VLForConditionalGeneration.from_pretrained(
        model_name,
        device_map="auto",
        offload_folder="offload",
        torch_dtype=torch.float16,
    )

## 8. Utility Functions

In [None]:
def parse_json_output(text):
    """
    Try to extract JSON from model output.

    The whole text is parsed first. If that fails, the text is scanned for
    the first position where a complete JSON object (including nested
    objects and arrays) can be decoded, so answers wrapped in prose or a
    Markdown code fence are still recovered in full.

    Args:
        text: Raw model output.

    Returns:
        The parsed JSON value, or {"raw_text": text} when no JSON object
        could be decoded (including when text is not a string).
    """
    try:
        # Direct parse
        return json.loads(text)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-string input.
        pass

    if not isinstance(text, str):
        return {"raw_text": text}

    # raw_decode parses one JSON value starting at a given index and ignores
    # trailing text, so nested objects come back whole rather than as their
    # innermost fragment.
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            parsed, _end = decoder.raw_decode(text, start)
        except ValueError:
            # Not a valid object at this brace; try the next one.
            start = text.find("{", start + 1)
        else:
            return parsed

    # Return original text if no JSON found
    return {"raw_text": text}

def save_results(results, filename="results.json"):
    """
    Write extraction results to a JSON file and report where they went.

    Args:
        results: JSON-serializable data to store.
        filename: Destination path; an existing file is overwritten.
    """
    # UTF-8 output with ensure_ascii=False keeps non-ASCII text readable
    # in the file instead of \u escapes.
    dump_options = {"indent": 2, "ensure_ascii": False}
    with open(filename, "w", encoding="utf-8") as handle:
        json.dump(results, handle, **dump_options)

    print(f"Results saved to {filename}")