In [4]:
import torch

# Environment report: installed PyTorch version and whether the fused
# scaled-dot-product-attention function exists in torch.nn.functional.
sdpa_present = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
print(f"PyTorch: {torch.__version__}")
print(f"SDPA available: {sdpa_present}")

# Load InternVL3-2B with the eager attention implementation requested
# explicitly, then report what the loaded model's config records.
from transformers import AutoModel

internvl_checkpoint = '/home/jovyan/nfs_share/models/InternVL3-2B'
model = AutoModel.from_pretrained(
    internvl_checkpoint,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,  # the model class comes from the checkpoint's own code
    # use_flash_attn=False  # <- InternVL3's own parameter
    attn_implementation="eager",
)

# Only the first parameter's device is inspected here.
first_weight = next(model.parameters())
billions_of_params = sum(weight.numel() for weight in model.parameters()) / 1e9

print(f"Model type: {type(model)}")
print(f"Device: {first_weight.device}")
print(f"Parameters: {billions_of_params:.1f}B")

# NOTE(review): this prints the value stored on the top-level config. With a
# remote-code model the sub-models may pick their attention path separately
# (see the commented-out use_flash_attn flag above) -- confirm before relying on it.
print(f"Attention implementation: {model.config._attn_implementation}")

PyTorch: 2.5.1
SDPA available: True
Model type: <class 'transformers_modules.InternVL3-2B.modeling_internvl_chat.InternVLChatModel'>
Device: cuda:0
Parameters: 2.1B
Attention implementation: eager


In [5]:
import gc

import torch
from transformers import MllamaForConditionalGeneration

# Release whatever an earlier cell bound to `model` BEFORE loading the next
# checkpoint. Without this, the previous model stays alive until `model` is
# rebound after from_pretrained() returns, so both sets of weights occupy
# accelerator memory at the same time during the load.
# Assigning None (rather than `del model`) keeps the cell runnable on a fresh
# kernel where `model` was never defined.
model = None
gc.collect()
if torch.cuda.is_available():
    # Hand the now-unused cached blocks back to the driver.
    torch.cuda.empty_cache()

# Load Llama-3.2-11B-Vision-Instruct in bfloat16 with the SDPA attention path.
model = MllamaForConditionalGeneration.from_pretrained(
    '/home/jovyan/nfs_share/models/Llama-3.2-11B-Vision-Instruct',  # adjust path
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="sdpa",
)

print(f"Model type: {type(model)}")
# Only the first parameter's device is reported here.
print(f"Device: {next(model.parameters()).device}")
print(f"Parameters: {sum(p.numel() for p in model.parameters()) / 1e9:.1f}B")

# Value recorded on the loaded model's config.
print(f"Attention implementation: {model.config._attn_implementation}")

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Model type: <class 'transformers.models.mllama.modeling_mllama.MllamaForConditionalGeneration'>
Device: cuda:0
Parameters: 10.7B
Attention implementation: sdpa
