In [1]:
import traceback
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch import nn
import pandas as pd
from collections import OrderedDict

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from huggingface_hub import hf_hub_download
import json

def load_groma_model(model_id):
    """Load Groma model with proper configuration"""
    print(f"Loading model: {model_id}")
    
    # First load the config to check model type
    try:
        config = AutoConfig.from_pretrained(model_id)
        print(f"Model type: {config.model_type}")
    except Exception as e:
        print(f"Config loading warning (non-fatal): {str(e)}")
    
    # Load tokenizer with fallbacks
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            trust_remote_code=True  # Important for custom architectures
        )
    except Exception as e:
        print(f"Fast tokenizer failed, trying slow: {str(e)}")
        tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            use_fast=False,
            trust_remote_code=True
        )

    # Load model with proper settings
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,  # Important for custom architectures
            revision="main"  # Use main branch
        )
    except Exception as e:
        print(f"BFloat16 loading failed, trying FP32: {str(e)}")
        try:
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                device_map="auto",
                trust_remote_code=True,
                revision="main"
            )
        except Exception as e:
            print(f"Auto device mapping failed, trying basic load: {str(e)}")
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                trust_remote_code=True,
                revision="main"
            )
    
    return model, tokenizer

def analyze_model_architecture(model, depth=0, path=""):
    """Recursively analyze model architecture with detailed parameters"""
    results = []
    
    for name, module in model.named_children():
        current_path = f"{path}/{name}" if path else name
        
        # Get parameter count
        num_params = sum(p.numel() for p in module.parameters())
        trainable_params = sum(p.numel() for p in module.parameters() if p.requires_grad)
        
        # Get layer info
        layer_type = module.__class__.__name__
        layer_info = {
            'Path': current_path,
            'Type': layer_type,
            'Total Parameters': f"{num_params:,}",
            'Trainable Parameters': f"{trainable_params:,}",
            'Depth': depth,
            'Shape': "N/A"
        }
        
        # Try to get output shape for common layer types
        if isinstance(module, (nn.Linear, nn.Embedding, nn.LayerNorm)):
            if hasattr(module, 'weight'):
                layer_info['Shape'] = f"{tuple(module.weight.shape)}"
        
        results.append(layer_info)
        
        # Recursively analyze children
        if len(list(module.children())) > 0:
            results.extend(analyze_model_architecture(module, depth + 1, current_path))
    
    return results

def visualize_model_architecture(model_id):
    """Load and visualize model architecture with detailed analysis"""
    print(f"Loading model: {model_id}")
    
    # # Try loading tokenizer with fallback
    # try:
    #     tokenizer = AutoTokenizer.from_pretrained(model_id)
    # except Exception as e:
    #     print("AutoTokenizer fast load failed, falling back to use_fast=False")
    #     print(f"Error: {repr(e)}")
    #     tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
    
    # # Load model with bfloat16 precision if available
    # device = "cuda" if torch.cuda.is_available() else "cpu"
    # print(f"Using device: {device}")
    
    # try:
    #     model = AutoModelForCausalLM.from_pretrained(
    #         model_id,
    #         torch_dtype=torch.bfloat16,
    #         device_map="auto"
    #     )
    # except Exception as e:
    #     print(f"Failed to load with bfloat16, falling back to float32: {repr(e)}")
    #     model = AutoModelForCausalLM.from_pretrained(model_id)
    
    # Load model using the new loader
    model, tokenizer = load_groma_model(model_id)
    device = next(model.parameters()).device
    
    print("\n=== Model Overview ===")
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"Model size on disk: ~{total_params * 2 / (1024**3):.2f} GB")
    
    # Analyze architecture
    print("\n=== Detailed Architecture Analysis ===")
    architecture = analyze_model_architecture(model)
    
    # Convert to DataFrame for better visualization
    df = pd.DataFrame(architecture)
    
    # Add indentation to path based on depth
    df['Display Path'] = df.apply(lambda x: "  " * x['Depth'] + x['Path'].split('/')[-1], axis=1)
    
    # Display detailed architecture
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_colwidth', None)
    print(df[['Display Path', 'Type', 'Shape', 'Total Parameters', 'Trainable Parameters']]
          .to_string(index=False))
    
    # Test model with a sample input
    print("\n=== Sample Input Processing ===")
    sample_text = "This is a test input for the Groma model."
    inputs = tokenizer(sample_text, return_tensors="pt").to(device)
    
    print(f"Input text: {sample_text}")
    print(f"Tokenized shape: {inputs.input_ids.shape}")
    print(f"Vocabulary size: {model.config.vocab_size:,}")
    
    # Memory usage
    if device == "cuda":
        print("\n=== GPU Memory Usage ===")
        print(f"Allocated: {torch.cuda.memory_allocated()/1024**2:.2f} MB")
        print(f"Cached: {torch.cuda.memory_reserved()/1024**2:.2f} MB")
    
    return model, tokenizer

# Run the analysis
model_id = "FoundationVision/groma-7b-finetune"
model, tokenizer = visualize_model_architecture(model_id)

  from .autonotebook import tqdm as notebook_tqdm


Loading model: FoundationVision/groma-7b-finetune
Loading model: FoundationVision/groma-7b-finetune

You can update Transformers with the command `pip install --upgrade transformers`. If this does not work, and the checkpoint is very new, then there may not be a release version that supports this model yet. In this case, you can get the most up-to-date code by installing Transformers from source with the command `pip install git+https://github.com/huggingface/transformers.git`
BFloat16 loading failed, trying FP32: The checkpoint you are trying to load has model type `groma` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.

You can update Transformers with the command `pip install --upgrade transformers`. If this does not work, and the checkpoint is very new, then there may not be a release version that supports this model yet. In this case, you can get the most up-to-dat

ValueError: The checkpoint you are trying to load has model type `groma` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.

You can update Transformers with the command `pip install --upgrade transformers`. If this does not work, and the checkpoint is very new, then there may not be a release version that supports this model yet. In this case, you can get the most up-to-date code by installing Transformers from source with the command `pip install git+https://github.com/huggingface/transformers.git`

In [1]:
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from PIL import Image
import requests

# Define the model ID
model_id = "microsoft/kosmos-2-patch14-224"

# Check for GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# --- Load the Processor ---
# The processor handles both text tokenization and image preprocessing
print(f"Loading processor: {model_id}...")
processor = AutoProcessor.from_pretrained(model_id)

# --- METHOD A: Load Model Directly to GPU (Recommended) ---
# This requires the 'accelerate' library
# It automatically distributes the model across available GPUs.
print("Loading model to GPU using device_map='auto'...")
try:
    model = AutoModelForVision2Seq.from_pretrained(
        model_id,
        torch_dtype=torch.float16,  # Use float16 for memory efficiency
        device_map="auto"
    )
    print("\nModel loading complete.")
    print(f"Model is on device(s): {model.device_map}")

except ImportError:
    print("\n'accelerate' library not found. Falling back to Method B.")
    print("Please install it with 'pip install accelerate' for better loading.")
    
    # --- METHOD B: Load on CPU then move to GPU (Fallback) ---
    print("Loading model to CPU first...")
    model = AutoModelForVision2Seq.from_pretrained(
        model_id,
        torch_dtype=torch.float16  # Use float16
    )
    
    print(f"Moving model to {device}...")
    model.to(device)
    print("\nModel loading complete.")
    print(f"Model is on device: {model.device}")


# --- Example: How to use the model (inputs must also be on GPU) ---
# 1. Load a demo image
url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.png"
image = Image.open(requests.get(url, stream=True).raw)

# 2. Prepare inputs
prompt = "<grounding>An image of"
# The processor creates 'input_ids', 'attention_mask', and 'pixel_values'
inputs = processor(text=prompt, images=image, return_tensors="pt")

# 3. Move inputs to the same device as the model
# This step is crucial!
inputs = {k: v.to(device) for k, v in inputs.items()}

print("\nRunning model inference on GPU...")
# 4. Generate output
with torch.no_grad():
    generated_ids = model.generate(
        pixel_values=inputs["pixel_values"],
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        image_embeds=None,
        image_embeds_position_mask=inputs["image_embeds_position_mask"],
        use_cache=True,
        max_new_tokens=128,
    )

print("Inference complete.")

# 5. Decode the output
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# 6. Post-process to get text and bounding boxes
processed_text, entities = processor.post_process_generation(
    generated_text, 
    cleanup_and_extract=False
)

print(f"\nGenerated Text (Raw): {generated_text}")
print(f"\nProcessed Text: {processed_text}")

  from .autonotebook import tqdm as notebook_tqdm


ðŸš¨ `image_seq_length` is part of ImagesKwargs, but not documented. Make sure to add it to the docstring of the function in c:\Users\vishn\Desktop\RecodAI\.venv\Lib\site-packages\transformers\image_processing_utils_fast.py.
Using device: cuda
Loading processor: microsoft/kosmos-2-patch14-224...


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loading model to GPU using device_map='auto'...


`torch_dtype` is deprecated! Use `dtype` instead!
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`



Model loading complete.


AttributeError: 'Kosmos2ForConditionalGeneration' object has no attribute 'device_map'