# Minimal Vision Model Test

Direct model loading and testing without using the unified_vision_processor package.

All configuration is embedded in the notebook for easy modification.

In [None]:
# Notebook-wide settings: edit the values in this cell to change model, image or prompt.
CONFIG = {
    # Backend to exercise: "llama" or "internvl"
    "model_type": "llama",

    # Local checkpoint directory for each supported backend
    "model_paths": {
        "llama": "/home/jovyan/nfs_share/models/Llama-3.2-11B-Vision",
        "internvl": "/home/jovyan/nfs_share/models/InternVL3-8B",
    },

    # Image handed to the model
    "test_image": "datasets/image14.png",

    # JSON-template prompt; per the notebook author this pattern keeps Llama from refusing
    "prompt": '<|image|>Extract receipt details in JSON: {"DATE": "", "STORE": "", "TOTAL": ""}',

    # Generation length (kept short to limit repetition) and 8-bit loading switch
    "max_new_tokens": 128,
    "enable_quantization": True,

    # Informational flags: only `bypass_safety` is echoed below; neither flag is
    # read by the generation code in the cells shown in this notebook.
    "bypass_safety": True,
    "deterministic_generation": True,
}

# Echo the active settings so the cell output documents the run.
print(
    "Configuration loaded:",
    f"Model: {CONFIG['model_type']}",
    f"Image: {CONFIG['test_image']}",
    f"Prompt: {CONFIG['prompt']}",
    f"Safety bypass: {CONFIG['bypass_safety']}",
    "\n✓ Using PROVEN JSON prompt pattern that bypasses Llama safety mode",
    sep="\n",
)

In [2]:
# Imports - Direct model loading
# Shared dependencies used by every cell regardless of the selected backend.
import time
import torch
from pathlib import Path
from PIL import Image

# Model-specific imports based on selection
# Only the classes needed by CONFIG["model_type"] are imported, so changing the
# model type requires re-running this cell before the loading cell.
# NOTE(review): an unrecognised model_type imports nothing here and still prints
# the success message below.
if CONFIG["model_type"] == "llama":
    from transformers import AutoProcessor, MllamaForConditionalGeneration
elif CONFIG["model_type"] == "internvl":
    from transformers import AutoModel, AutoTokenizer
    import torchvision.transforms as T
    from torchvision.transforms.functional import InterpolationMode

print(f"Imports successful for {CONFIG['model_type']} ✓")

Imports successful for llama ✓


In [3]:
# Load model directly
def _load_model(model_type, model_path, quantize, max_new_tokens):
    """Load one checkpoint together with the text front-end that belongs to it.

    Both the first attempt and the no-quantization retry go through this single
    function so the two paths cannot drift apart.

    Args:
        model_type: "llama" or "internvl".
        model_path: Local checkpoint directory (loaded with local_files_only=True).
        quantize: Request 8-bit weights. Ignored when CUDA is unavailable.
        max_new_tokens: Default generation length stored on the Llama
            generation config.

    Returns:
        (model, frontend, quantized): the model in eval mode, its AutoProcessor
        ("llama") or AutoTokenizer ("internvl"), and whether 8-bit loading was
        actually requested from transformers.

    Raises:
        ValueError: If model_type is not a supported name. Errors raised by the
            transformers loaders propagate unchanged.
    """
    on_gpu = torch.cuda.is_available()
    quantize = bool(quantize) and on_gpu  # the 8-bit path is only attempted on CUDA

    if model_type == "llama":
        frontend = AutoProcessor.from_pretrained(
            model_path,
            trust_remote_code=True,
            local_files_only=True,
        )

        model_kwargs = {
            "low_cpu_mem_usage": True,
            "torch_dtype": torch.float16,
            "device_map": "cuda:0" if on_gpu else "cpu",
            "local_files_only": True,
        }

        if quantize:
            try:
                from transformers import BitsAndBytesConfig
                model_kwargs["quantization_config"] = BitsAndBytesConfig(
                    load_in_8bit=True,
                    llm_int8_enable_fp32_cpu_offload=True,
                    # NOTE(review): these must match submodule names of the loaded
                    # model. "vision_tower" looks like LLaVA-style naming - confirm
                    # against MllamaForConditionalGeneration, otherwise the vision
                    # encoder is quantized despite the message below.
                    llm_int8_skip_modules=["vision_tower", "multi_modal_projector"],
                    llm_int8_threshold=6.0,
                )
                print("8-bit quantization enabled (skipping vision modules)")
            except ImportError:
                print("Quantization not available, using FP16")
                quantize = False

        loaded = MllamaForConditionalGeneration.from_pretrained(
            model_path,
            **model_kwargs
        ).eval()

        # Greedy decoding defaults; the length follows CONFIG instead of a
        # second hard-coded value.
        loaded.generation_config.max_new_tokens = max_new_tokens
        loaded.generation_config.do_sample = False
        loaded.generation_config.use_cache = True
        return loaded, frontend, quantize

    if model_type == "internvl":
        frontend = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            local_files_only=True,
        )

        model_kwargs = {
            "low_cpu_mem_usage": True,
            "trust_remote_code": True,
            "torch_dtype": torch.bfloat16,
            "local_files_only": True,
        }

        if quantize:
            # Any failure of the 8-bit path surfaces in from_pretrained below and
            # is handled by the caller's retry; setting a dict key cannot fail.
            model_kwargs["load_in_8bit"] = True
            print("8-bit quantization enabled")

        loaded = AutoModel.from_pretrained(
            model_path,
            **model_kwargs
        ).eval()

        # Quantized weights are left where the loader put them; otherwise move to GPU.
        if on_gpu and not quantize:
            loaded = loaded.cuda()
        return loaded, frontend, quantize

    raise ValueError(
        f"Unsupported model_type: {model_type!r} (expected 'llama' or 'internvl')"
    )


model_type = CONFIG["model_type"]
model_path = CONFIG["model_paths"][model_type]
print(f"Loading {model_type} model from {model_path}...")
start_time = time.time()

# A retry without quantization only makes sense if quantization is really attempted.
quantize_requested = bool(CONFIG["enable_quantization"]) and torch.cuda.is_available()
retry_unquantized = False

try:
    model, text_frontend, quantized = _load_model(
        model_type, model_path, quantize_requested, CONFIG["max_new_tokens"]
    )
except Exception as e:
    print(f"✗ Model loading failed: {e}")
    import traceback
    traceback.print_exc()

    if not quantize_requested:
        print("Cannot proceed without model - please check configuration")
        raise  # bare raise keeps the original traceback
    retry_unquantized = True

# The retry runs outside the except block on purpose: while the handler is
# active, the exception's traceback keeps the half-loaded model alive.
if retry_unquantized:
    print("\nRetrying without quantization...")
    import gc
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    model, text_frontend, quantized = _load_model(
        model_type, model_path, False, CONFIG["max_new_tokens"]
    )
    load_message = "✓ Model loaded without quantization"
else:
    load_message = "✓ Model loaded successfully"

# Publish the front-end under the name the later cells expect.
if model_type == "llama":
    processor = text_frontend
else:
    tokenizer = text_frontend

# Record what was actually loaded so the summary cell reports the real state.
CONFIG["enable_quantization"] = quantized

load_time = time.time() - start_time
print(f"{load_message} in {load_time:.2f}s")
print(f"Model device: {next(model.parameters()).device}")
print(f"Quantization active: {CONFIG['enable_quantization']}")

Loading llama model from /home/jovyan/nfs_share/models/Llama-3.2-11B-Vision...
8-bit quantization enabled (skipping vision modules)


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

✓ Model loaded successfully in 5.61s
Model device: cuda:0
Quantization active: True


In [4]:
# Load and preprocess image
test_image_path = Path(CONFIG["test_image"])

# is_file() (not exists()) so a directory path is reported as a missing image
# instead of failing later inside PIL.
if not test_image_path.is_file():
    print(f"✗ Test image not found: {test_image_path}")
    # Suggest candidates from the folder the configured path points into,
    # rather than a hard-coded directory name.
    available = sorted(test_image_path.parent.glob("*.png"))[:5]
    print(f"Available images: {[img.name for img in available]}")
    raise FileNotFoundError(f"Test image not found: {test_image_path}")

# Image.open is lazy and keeps the file handle open. convert() reads the pixel
# data and returns an independent image, so the handle can be closed right away.
with Image.open(test_image_path) as source_image:
    image = source_image.convert("RGB")

print(f"✓ Image loaded: {image.size}")
print(f"  File size: {test_image_path.stat().st_size / 1024:.1f} KB")

✓ Image loaded: (2048, 2048)
  File size: 211.1 KB


In [None]:
# Run inference: announce the run and start the wall-clock timer
prompt = CONFIG["prompt"]
print(
    f"Running inference with {CONFIG['model_type']}...",
    f"Prompt: {prompt}",
    "-" * 50,
    sep="\n",
)

# Timer shared by the primary attempt and any fallback further down this cell.
start_time = time.time()

def clean_response(response: str) -> str:
    """Reduce a raw model answer to its useful payload.

    If the answer contains a flat JSON object (no nested braces), that object is
    returned verbatim so it can be parsed with json.loads. The object is
    located BEFORE any other clean-up, because the clean-up steps strip braces
    and backslashes and would otherwise destroy the JSON they are meant to find.

    Answers without a JSON object are cleaned as free text: runs of a repeated
    word or short phrase are collapsed, common refusal sentences are removed,
    stray braces and backslashes are dropped and whitespace is normalised.

    Note: refusal sentences are removed from the returned text, so code that
    needs to detect a refusal reliably should inspect the raw answer.

    Args:
        response: Decoded model output. Empty or None yields "".

    Returns:
        The extracted JSON object, or the cleaned single-line text.
    """
    import re

    if not response:
        return ""

    # Extract JSON first, from the untouched text.
    json_match = re.search(r'\{[^{}]*\}', response)
    if json_match:
        return json_match.group(0).strip()

    # Collapse a word repeated 3+ times in a row. The trailing \b stops a word
    # from matching the prefix of a longer one ("a a apple" must stay intact).
    response = re.sub(r'\b(\w+)(?:\s+\1\b){2,}', r'\1', response, flags=re.IGNORECASE)

    # Collapse a 1-3 word phrase repeated 3+ times in a row.
    response = re.sub(r'\b((?:\w+\s+){1,3})(?:\1){2,}', r'\1', response, flags=re.IGNORECASE)

    # Remove refusal sentences.
    safety_patterns = [
        r"I'm not able to provide.*?information\.",
        r"I cannot provide.*?information\.",
        r"I'm unable to.*?\.",
        r"I can't.*?\.",
        r"Sorry, I cannot.*?\."
    ]

    for pattern in safety_patterns:
        response = re.sub(pattern, "", response, flags=re.IGNORECASE)

    # Drop stray braces/backslashes first, then normalise whitespace so the
    # removals do not leave double spaces behind.
    response = re.sub(r'[{}]+', '', response)
    response = re.sub(r'\\+', '', response)
    response = re.sub(r'\s+', ' ', response)

    return response.strip()

def _llama_generate(llama_model, llama_processor, pil_image, prompt_text,
                    max_new_tokens, repetition_penalty=1.1):
    """Greedy-decode one Llama-3.2-Vision answer and return only the new text.

    Args:
        llama_model: Loaded MllamaForConditionalGeneration instance.
        llama_processor: Matching processor (tokenizer + image preprocessing).
        pil_image: RGB PIL image.
        prompt_text: Prompt, expected to start with the "<|image|>" tag.
        max_new_tokens: Upper bound on generated tokens.
        repetition_penalty: Passed through to generate().

    Returns:
        The decoded continuation with the prompt tokens sliced off.
    """
    batch = llama_processor(text=prompt_text, images=pil_image, return_tensors="pt")

    # Move to the device and force contiguous tensors (needed for quantized models).
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    batch = {
        key: value.to(target_device).contiguous() if hasattr(value, "to") else value
        for key, value in batch.items()
    }
    print(f"Input tensor shapes: {[(k, v.shape) for k, v in batch.items() if hasattr(v, 'shape')]}")

    eos_id = llama_processor.tokenizer.eos_token_id
    with torch.no_grad():
        generated = llama_model.generate(
            **batch,
            max_new_tokens=max_new_tokens,
            do_sample=False,          # deterministic, greedy decoding
            pad_token_id=eos_id,
            eos_token_id=eos_id,
            use_cache=True,
            repetition_penalty=repetition_penalty,
            temperature=None,         # sampling knobs are unset on purpose
            top_p=None,
            top_k=None,
        )

    prompt_length = batch["input_ids"].shape[-1]
    return llama_processor.decode(generated[0][prompt_length:], skip_special_tokens=True)


# Message of the primary failure, or None when the primary attempt succeeded.
primary_error = None

try:
    if CONFIG["model_type"] == "llama":
        if not prompt.startswith("<|image|>"):
            prompt = f"<|image|>{prompt}"

        raw_response = _llama_generate(model, processor, image, prompt, CONFIG["max_new_tokens"])
        print(f"Raw response (first 200 chars): {raw_response[:200]}...")

    elif CONFIG["model_type"] == "internvl":
        # Single 448x448 tile with ImageNet normalisation.
        image_size = 448
        transform = T.Compose([
            T.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
        ])

        # The loading cell requests torch.bfloat16 weights on every device, so
        # the pixels are cast on CPU as well, not only on CUDA.
        pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16)
        if torch.cuda.is_available():
            pixel_values = pixel_values.cuda()
        pixel_values = pixel_values.contiguous()

        generation_config = {
            "max_new_tokens": CONFIG["max_new_tokens"],
            "do_sample": False,
            "pad_token_id": tokenizer.eos_token_id,
            "repetition_penalty": 1.1
        }

        raw_response = model.chat(
            tokenizer=tokenizer,
            pixel_values=pixel_values,
            question=prompt,
            generation_config=generation_config
        )

        if isinstance(raw_response, tuple):
            raw_response = raw_response[0]

    else:
        raise ValueError(f"Unsupported model_type: {CONFIG['model_type']!r}")

    # Clean the response
    response = clean_response(raw_response)

    inference_time = time.time() - start_time
    print(f"✓ Inference completed in {inference_time:.2f}s")
    print(f"Cleaned response: {response}")

except Exception as e:
    print(f"✗ Inference failed: {e}")
    import traceback
    traceback.print_exc()
    # Keep only the message. The exception's traceback references the frames of
    # the failed generate() call and therefore the model itself.
    primary_error = str(e)

# The fallback runs after the except block has ended, so the failed call's
# traceback is gone and the old model can actually be freed before a second
# copy is loaded.
if primary_error is not None:
    if CONFIG["model_type"] == "llama" and CONFIG["enable_quantization"]:
        print("\nTrying fallback without quantization...")
        try:
            print("Reloading model without quantization...")
            import gc
            model = None  # drop the quantized model; never raises, unlike `del`
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            model = MllamaForConditionalGeneration.from_pretrained(
                CONFIG["model_paths"]["llama"],
                low_cpu_mem_usage=True,
                torch_dtype=torch.float16,
                device_map="cuda:0" if torch.cuda.is_available() else "cpu",
                local_files_only=True
            ).eval()
            # Keep the summary cell truthful about what is loaded now.
            CONFIG["enable_quantization"] = False

            raw_response = _llama_generate(model, processor, image, prompt, CONFIG["max_new_tokens"])
            response = clean_response(raw_response)
            inference_time = time.time() - start_time
            print(f"✓ Fallback inference completed in {inference_time:.2f}s")

        except Exception as e2:
            print(f"✗ Fallback also failed: {e2}")
            response = f"Error: Both primary and fallback inference failed. Primary: {primary_error}, Fallback: {str(e2)}"
            inference_time = time.time() - start_time
    else:
        # Nothing different to retry with: the model is already unquantized,
        # or the backend has no fallback path.
        response = f"Error: Inference failed - {primary_error}"
        inference_time = time.time() - start_time

print(f"Final response ready for display (length: {len(response)} characters)")

In [None]:
# Display results
import json
import re

print("=" * 60)
print("EXTRACTED TEXT:")
print("=" * 60)
print(response)
print("=" * 60)

# Summary
print("\nSUMMARY:")
print(f"Model: {CONFIG['model_type']}")
print(f"Response length: {len(response)} characters")
print(f"Processing time: {inference_time:.2f}s")
print(f"Quantization enabled: {CONFIG['enable_quantization']}")
print(f"Device: {'CUDA' if torch.cuda.is_available() else 'CPU'}")

# Classify the response: JSON object, KEY: value text, refusal, or unstructured.
print("\nRESPONSE ANALYSIS:")
stripped_response = response.strip()

if stripped_response.startswith('{') and stripped_response.endswith('}'):
    try:
        parsed = json.loads(stripped_response)
        print("✅ VALID JSON EXTRACTED:")
        for key, value in parsed.items():
            print(f"  {key}: {value}")

        # A field counts as missing when it is absent or empty.
        expected_fields = ["DATE", "STORE", "TOTAL"]
        missing = [field for field in expected_fields if field not in parsed or not parsed[field]]
        if missing:
            print(f"⚠️ Missing fields: {missing}")
        else:
            print("✅ All expected fields present")

    except json.JSONDecodeError as e:
        print(f"❌ Invalid JSON: {e}")
        print(f"Raw response: {response}")

elif any(keyword in response for keyword in ["DATE:", "STORE:", "TOTAL:"]):
    print("✅ KEY-VALUE format detected")
    # Each value ends where the next "KEY:" starts (or at the end of the text).
    # The response is normally a single line, so splitting on newlines would
    # attach every later field to the first key.
    matches = re.findall(
        r'\b([A-Z]+):\s*(.*?)(?=\s+[A-Z]+:|$)',
        response,
        flags=re.DOTALL,
    )
    if matches:
        print("Extracted fields:")
        for key, value in matches:
            print(f"  {key}: {value.strip()}")

elif any(phrase in response.lower() for phrase in ["not able", "cannot provide", "sorry"]):
    print("❌ SAFETY MODE TRIGGERED")
    print("This indicates the prompt triggered Llama's safety restrictions")
    print("Solution: Use simpler JSON format prompts")

else:
    print("⚠️ UNSTRUCTURED RESPONSE")
    print("Response doesn't match expected patterns")
    print("Consider using different prompt format")

# Performance assessment (thresholds in seconds)
if inference_time < 30:
    print(f"\n⚡ GOOD performance: {inference_time:.1f}s")
elif inference_time < 60:
    print(f"\n⚠️ ACCEPTABLE performance: {inference_time:.1f}s")
else:
    print(f"\n❌ SLOW performance: {inference_time:.1f}s")

print("\n🎯 For production use:")
print("- Llama-3.2-Vision: Use simple JSON prompts only")
print("- InternVL3: More flexible, handles complex prompts better")
print("- Both models: Shorter max_new_tokens prevents issues")

In [None]:
# Optional: probe a few more prompts, all following the SAFE JSON-template pattern
safe_test_prompts = [
    '<|image|>Extract store and total in JSON: {"STORE": "", "TOTAL": ""}',
    '<|image|>Document type in JSON: {"TYPE": ""}',
    '<|image|>Extract numbers in JSON: {"NUMBERS": ""}',
]

print("Testing additional prompts with SAFE JSON patterns only...\n")

for test_number, test_prompt in enumerate(safe_test_prompts, start=1):
    print(f"Test {test_number}: {test_prompt[:60]}...")
    try:
        start = time.time()

        if CONFIG["model_type"] == "llama":
            inputs = processor(text=test_prompt, images=image, return_tensors="pt")
            if torch.cuda.is_available():
                inputs = {
                    name: (tensor.to("cuda").contiguous() if hasattr(tensor, "to") else tensor)
                    for name, tensor in inputs.items()
                }

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=64,  # deliberately short answers
                    do_sample=False,
                    pad_token_id=processor.tokenizer.eos_token_id,
                    repetition_penalty=1.2,
                )

            # Decode only the tokens generated after the prompt, then tidy them up.
            prompt_length = inputs["input_ids"].shape[-1]
            result = processor.decode(outputs[0][prompt_length:], skip_special_tokens=True)
            result = clean_response(result)

        elif CONFIG["model_type"] == "internvl":
            # Reuses the pixel tensor prepared by the inference cell.
            chat_settings = {
                "max_new_tokens": 64,
                "do_sample": False,
                "repetition_penalty": 1.2,
            }
            result = model.chat(
                tokenizer=tokenizer,
                pixel_values=pixel_values,
                question=test_prompt,
                generation_config=chat_settings,
            )
            if isinstance(result, tuple):
                result = result[0]
            result = clean_response(result)

        elapsed = time.time() - start

        # A refusal phrase in the answer means the model declined the request.
        lowered = result.lower()
        if "not able" in lowered or "cannot provide" in lowered or "sorry" in lowered:
            print(f"❌ Safety mode triggered ({elapsed:.1f}s): {result[:80]}...")
        else:
            print(f"✅ Success ({elapsed:.1f}s): {result}")

    except Exception as e:
        print(f"❌ Error: {e}")
    print("-" * 40)

print(
    "\n💡 TIP: For Llama-3.2-Vision, use ONLY simple JSON format prompts to avoid safety mode.",
    "✅ Pattern: '<|image|>Extract [field] in JSON: {\"FIELD\": \"\"}'",
    "❌ Avoid: Complex instructions, examples, or 'read all text' requests",
    sep="\n",
)

In [8]:
# Memory cleanup
import gc

print("Cleaning up memory...")

# Drop every notebook-level name that can pin model weights or GPU tensors.
# pop() with a default (rather than `del`) keeps the cell re-runnable: a name
# that is already gone, or was never created for this backend, is skipped.
for stale_name in ("model", "processor", "tokenizer", "inputs", "outputs", "pixel_values"):
    globals().pop(stale_name, None)

# Collect unreachable objects first so their CUDA blocks are returned to the
# caching allocator, then hand the cached blocks back to the driver.
gc.collect()

if torch.cuda.is_available():
    torch.cuda.synchronize()
    torch.cuda.empty_cache()

print("✓ Memory cleaned")
print("\n🎉 Test completed successfully!")

Cleaning up memory...
✓ Memory cleaned

🎉 Test completed successfully!
