# Minimal Vision Model Test

This notebook loads and tests a vision model directly, without using the unified_vision_processor package.

All configuration is embedded in the notebook for easy modification.

In [None]:
# Configuration - Modify as needed
# Every later cell reads its settings from this single dict.
CONFIG = {
    # Model selection: "llama" or "internvl" (must be a key of "model_paths")
    "model_type": "llama",

    # Local model directories, keyed by model type
    "model_paths": {
        "llama": "/home/jovyan/nfs_share/models/Llama-3.2-11B-Vision",
        "internvl": "/home/jovyan/nfs_share/models/InternVL3-8B",
    },

    # Test image path
    "test_image": "datasets/image14.png",

    # Test prompt
    "prompt": "<|image|>What text is visible in this receipt?",

    # Generation parameters
    "max_new_tokens": 512,
    # Requests 8-bit loading; the loading cell only applies it when CUDA is available
    "enable_quantization": True,
}

# Fail fast so a typo in "model_type" is reported here rather than in a later cell.
if CONFIG["model_type"] not in CONFIG["model_paths"]:
    raise ValueError(
        f"Unsupported model_type {CONFIG['model_type']!r}; "
        f"expected one of {sorted(CONFIG['model_paths'])}"
    )

print("Configuration loaded:")
print(f"Model: {CONFIG['model_type']}")
print(f"Image: {CONFIG['test_image']}")
# Only the first 50 characters of the prompt are echoed to keep the output short.
print(f"Prompt: {CONFIG['prompt'][:50]}...")

In [None]:
# Imports - Direct model loading
import time
from pathlib import Path

import torch
from PIL import Image

# Model-specific imports based on selection. Only the selected backend's
# names are imported; torchvision is needed for the InternVL path only.
if CONFIG["model_type"] == "llama":
    from transformers import AutoProcessor, MllamaForConditionalGeneration
elif CONFIG["model_type"] == "internvl":
    from transformers import AutoModel, AutoTokenizer
    import torchvision.transforms as T
    from torchvision.transforms.functional import InterpolationMode
else:
    # Without this branch an unknown model type would import nothing and
    # still report success below.
    raise ValueError(f"Unsupported model_type: {CONFIG['model_type']!r}")

print(f"Imports successful for {CONFIG['model_type']} ✓")

In [None]:
# Load model directly
# Produces `model`, plus `processor` (Llama) or `tokenizer` (InternVL), for the
# later cells. Any failure is reported and re-raised so the notebook stops here.
model_path = CONFIG["model_paths"][CONFIG["model_type"]]
print(f"Loading {CONFIG['model_type']} model from {model_path}...")
start_time = time.time()

try:
    if CONFIG["model_type"] == "llama":
        # Load Llama-3.2-Vision
        processor = AutoProcessor.from_pretrained(
            model_path,
            trust_remote_code=True,
            local_files_only=True
        )

        model_kwargs = {
            "low_cpu_mem_usage": True,
            "torch_dtype": torch.float16,
            "device_map": "cuda:0" if torch.cuda.is_available() else "cpu",
            "local_files_only": True
        }

        # Quantization is only attempted on GPU.
        if CONFIG["enable_quantization"] and torch.cuda.is_available():
            try:
                from transformers import BitsAndBytesConfig
                model_kwargs["quantization_config"] = BitsAndBytesConfig(
                    load_in_8bit=True,
                    llm_int8_enable_fp32_cpu_offload=True
                )
                print("8-bit quantization enabled")
            except ImportError:
                print("Quantization not available, using FP16")

        model = MllamaForConditionalGeneration.from_pretrained(
            model_path,
            **model_kwargs
        ).eval()

    elif CONFIG["model_type"] == "internvl":
        # Load InternVL3
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            local_files_only=True
        )

        model_kwargs = {
            "low_cpu_mem_usage": True,
            "trust_remote_code": True,
            "torch_dtype": torch.bfloat16,
            "local_files_only": True
        }

        if CONFIG["enable_quantization"] and torch.cuda.is_available():
            try:
                # Same mechanism as the Llama branch: pass a BitsAndBytesConfig
                # instead of the bare `load_in_8bit` keyword.
                from transformers import BitsAndBytesConfig
                model_kwargs["quantization_config"] = BitsAndBytesConfig(
                    load_in_8bit=True
                )
                print("8-bit quantization enabled")
            except ImportError:
                print("Quantization not available, using bfloat16")

        model = AutoModel.from_pretrained(
            model_path,
            **model_kwargs
        ).eval()

        # Only move unquantized models by hand: Transformers places 8-bit
        # bitsandbytes models on the GPU while loading and rejects a later
        # .cuda() call on them.
        if torch.cuda.is_available() and "quantization_config" not in model_kwargs:
            model = model.cuda()

    else:
        # Guard against reporting success without having loaded anything.
        raise ValueError(f"Unsupported model_type: {CONFIG['model_type']!r}")

    load_time = time.time() - start_time
    print(f"✓ Model loaded successfully in {load_time:.2f}s")

except Exception as e:
    print(f"✗ Model loading failed: {e}")
    raise

In [None]:
# Load and preprocess image
# Produces `image`, an RGB PIL image used by the inference cells.
test_image_path = Path(CONFIG["test_image"])

if not test_image_path.exists():
    print(f"✗ Test image not found: {test_image_path}")
    # Suggest candidates from the directory the configured path points into,
    # so the hint stays relevant when "test_image" lives outside "datasets".
    available = sorted(test_image_path.parent.glob("*.png"))[:5]
    print(f"Available images: {[img.name for img in available]}")
    raise FileNotFoundError(f"Test image not found: {test_image_path}")

# Load image. Image.open is lazy and keeps the file open; convert() reads the
# pixel data and returns a separate image, so the file can be closed right away.
with Image.open(test_image_path) as source_image:
    image = source_image.convert("RGB")

print(f"✓ Image loaded: {image.size}")
print(f"  File size: {test_image_path.stat().st_size / 1024:.1f} KB")

In [None]:
# Run inference
# Produces `response` and `inference_time` for the results cell and, on the
# InternVL path, `pixel_values`, which the additional-prompts cell reuses.
prompt = CONFIG["prompt"]
print(f"Running inference with {CONFIG['model_type']}...")
print(f"Prompt: {prompt}")
print("-" * 50)

start_time = time.time()

try:
    if CONFIG["model_type"] == "llama":
        # Llama inference: the prompt must start with the image placeholder.
        if not prompt.startswith("<|image|>"):
            prompt = f"<|image|>{prompt}"

        # Send the inputs to wherever the model was actually placed instead
        # of assuming a device.
        inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=CONFIG["max_new_tokens"],
                do_sample=False,
                pad_token_id=processor.tokenizer.eos_token_id,
                eos_token_id=processor.tokenizer.eos_token_id
            )

        # Slice off the prompt tokens so only newly generated text is decoded.
        response = processor.decode(
            outputs[0][inputs["input_ids"].shape[-1]:],
            skip_special_tokens=True
        )

    elif CONFIG["model_type"] == "internvl":
        # InternVL inference
        # Simple single image processing: one 448x448 tile
        image_size = 448
        transform = T.Compose([
            T.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
        ])

        # Match the model's device and the bfloat16 dtype it was loaded with,
        # on CPU as well as on GPU, to avoid a dtype mismatch.
        pixel_values = transform(image).unsqueeze(0).to(
            device=model.device, dtype=torch.bfloat16
        )

        generation_config = {
            "max_new_tokens": CONFIG["max_new_tokens"],
            "do_sample": False,
            "pad_token_id": tokenizer.eos_token_id
        }

        # "<|image|>" is the Llama placeholder and would reach InternVL as
        # literal text. NOTE(review): chat() is remote model code not visible
        # here; it presumably inserts its own image placeholder - confirm.
        question = prompt.replace("<|image|>", "").strip()

        response = model.chat(
            tokenizer=tokenizer,
            pixel_values=pixel_values,
            question=question,
            generation_config=generation_config
        )

        if isinstance(response, tuple):
            response = response[0]

    else:
        # Guard against reporting success without producing a response.
        raise ValueError(f"Unsupported model_type: {CONFIG['model_type']!r}")

    inference_time = time.time() - start_time
    print(f"✓ Inference completed in {inference_time:.2f}s")

except Exception as e:
    print(f"✗ Inference failed: {e}")
    raise

In [None]:
# Display results
# Prints the model response between separator lines, followed by a short summary.
separator = "=" * 60
print(separator)
print("EXTRACTED TEXT:")
print(separator)
print(response)
print(separator)

# Summary
device_label = "CUDA" if torch.cuda.is_available() else "CPU"
response_length = len(response)

print("\nSUMMARY:")
print(f"Model: {CONFIG['model_type']}")
print(f"Response length: {response_length} characters")
print(f"Processing time: {inference_time:.2f}s")
print(f"Quantization enabled: {CONFIG['enable_quantization']}")
print(f"Device: {device_label}")

In [None]:
# Optional: Test different prompts
# Runs a few extra prompts against the already loaded model and image. Errors
# are printed per prompt so one failure does not stop the remaining tests.
test_prompts = [
    "<|image|>Extract the store name and total amount.",
    "<|image|>What type of document is this?",
    "<|image|>List all visible numbers."
]

print("Testing additional prompts...\n")

for i, test_prompt in enumerate(test_prompts, 1):
    print(f"Test {i}: {test_prompt}")
    try:
        start = time.time()

        if CONFIG["model_type"] == "llama":
            # Send the inputs to wherever the model was actually placed.
            inputs = processor(text=test_prompt, images=image, return_tensors="pt").to(model.device)

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    do_sample=False,
                    pad_token_id=processor.tokenizer.eos_token_id
                )

            # Decode only the tokens generated after the prompt.
            result = processor.decode(
                outputs[0][inputs["input_ids"].shape[-1]:],
                skip_special_tokens=True
            )

        elif CONFIG["model_type"] == "internvl":
            # "<|image|>" is the Llama placeholder and would reach InternVL as
            # literal text. NOTE(review): chat() is remote model code not
            # visible here; it presumably adds its own placeholder - confirm.
            question = test_prompt.replace("<|image|>", "").strip()

            # Reuses `pixel_values` prepared by the inference cell.
            result = model.chat(
                tokenizer=tokenizer,
                pixel_values=pixel_values,
                question=question,
                generation_config={
                    "max_new_tokens": 256,
                    "do_sample": False,
                    # Same padding setting as the main inference cell.
                    "pad_token_id": tokenizer.eos_token_id
                }
            )
            if isinstance(result, tuple):
                result = result[0]

        else:
            raise ValueError(f"Unsupported model_type: {CONFIG['model_type']!r}")

        elapsed = time.time() - start
        print(f"Result ({elapsed:.1f}s): {result[:100]}...")

    except Exception as e:
        print(f"Error: {e}")
    print("-" * 40)

In [None]:
# Memory cleanup
# Releases the model and the tensors created by the earlier cells.
import gc

print("Cleaning up memory...")

# Drop every notebook-level name that can keep model weights or GPU tensors
# alive. pop() is used instead of `del` so the cell can be re-run, or run
# after an earlier cell failed, without raising NameError.
for _name in ("model", "processor", "tokenizer", "inputs", "outputs", "pixel_values"):
    globals().pop(_name, None)
del _name

# Collect reference cycles first so their tensors are freed before the CUDA
# cache is emptied.
gc.collect()

if torch.cuda.is_available():
    torch.cuda.synchronize()
    torch.cuda.empty_cache()

print("✓ Memory cleaned")
print("\n🎉 Test completed successfully!")