KeyError: 'intern_vit_6b' #35279
Labels
bug
Multimodal
WIP
System Info
Ubuntu 24.04.1
transformers 4.47.0
Who can help?
I want to use the latest OpenGVLab/InternViT-300M-448px-V2_5 as the vision encoder for LLaVA, but an error occurs when I run the following code. I believe the cause is that Transformers does not support this vision encoder. I tried to modify the code myself, but it didn't work.
Reproduction
import torch
from PIL import Image
from transformers import AutoModel, CLIPImageProcessor, AutoTokenizer, AutoProcessor, AutoModelForCausalLM
from transformers import LlavaForConditionalGeneration,LlavaConfig
clip_model_name_or_path = "/home/wangyu/model/models--OpenGVLab--InternViT-300M-448px-V2_5/snapshots/8f86a5e87697180b439811ca69fabbfccd38d996"
qwen_model_name_or_path = "/home/wangyu/model/Qwen2.5-0.5B-Instruct"
clip_model = AutoModel.from_pretrained(clip_model_name_or_path, device_map="cuda:0", trust_remote_code=True)
llm_model = AutoModelForCausalLM.from_pretrained(qwen_model_name_or_path, device_map="cuda:0")
llm_tokenizer = AutoTokenizer.from_pretrained(qwen_model_name_or_path)
vision_config = clip_model.config
text_config = llm_model.config
configuration = LlavaConfig(vision_config, text_config)
model = LlavaForConditionalGeneration(configuration)
model.save_pretrained("slvm/model001")
from transformers import LlavaProcessor, LlavaForConditionalGeneration
import torch
import os
from typing import Union
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
model_name_or_path = "slvm/model001"  # make sure this path is correct
llava_processor = LlavaProcessor.from_pretrained(model_name_or_path)
model = LlavaForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    torch_dtype=torch.bfloat16,
)
from PIL import Image
prompt_text = "<image>\nWhat are these?"
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt_text},
]
prompt = llava_processor.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_path = "000000039769.jpg"
image = Image.open(image_path)
inputs = llava_processor(text=prompt, images=image, return_tensors="pt")
for tk in inputs.keys():
    if inputs[tk].dtype == torch.float32:
        inputs[tk] = inputs[tk].to(dtype=torch.bfloat16)
    inputs[tk] = inputs[tk].to(model.device)
generate_ids = model.generate(**inputs, max_new_tokens=20)
gen_text = llava_processor.batch_decode(
    generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
)[0]
print(gen_text)
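For context, the vision checkpoint advertises a remote-code model type, which is what ends up in slvm/model001/config.json. A quick check (only a sketch, reusing the snapshot path from above):

from transformers import AutoConfig

# The remote InternViT config reports model_type "intern_vit_6b"; that string is
# saved into the LLaVA config and must be resolvable when the model is reloaded.
clip_model_name_or_path = "/home/wangyu/model/models--OpenGVLab--InternViT-300M-448px-V2_5/snapshots/8f86a5e87697180b439811ca69fabbfccd38d996"
vision_cfg = AutoConfig.from_pretrained(clip_model_name_or_path, trust_remote_code=True)
print(vision_cfg.model_type)  # expected: "intern_vit_6b"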
error:
{
"name": "KeyError",
"message": "'intern_vit_6b'",
"stack": "---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Cell In[5], line 9
6 from transformers.utils import logging
7 model_name_or_path = "slvm/model001"  # make sure this path is correct
----> 9 llava_processor = LlavaProcessor.from_pretrained(model_name_or_path)
10 model = LlavaForConditionalGeneration.from_pretrained(
11 model_name_or_path,
12 device_map="cuda:0",
13 torch_dtype=torch.bfloat16,
14 )
16 from PIL import Image
File ~/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/processing_utils.py:974, in ProcessorMixin.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, **kwargs)
971 if token is not None:
972 kwargs["token"] = token
--> 974 args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
975 processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
977 return cls.from_args_and_dict(args, processor_dict, **kwargs)
File ~/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/processing_utils.py:1020, in ProcessorMixin._get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
1017 else:
1018 attribute_class = getattr(transformers_module, class_name)
-> 1020 args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
1021 return args
File ~/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py:878, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
876 config = AutoConfig.for_model(**config_dict)
877 else:
--> 878 config = AutoConfig.from_pretrained(
879 pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
880 )
881 config_tokenizer_class = config.tokenizer_class
882 if hasattr(config, "auto_map") and "AutoTokenizer" in config.auto_map:
File ~/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/models/auto/configuration_auto.py:1045, in AutoConfig.from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
1039 except KeyError:
1040 raise ValueError(
1041 f"The checkpoint you are trying to load has model type `{config_dict['model_type']}` "
1042 "but Transformers does not recognize this architecture. This could be because of an "
1043 "issue with the checkpoint, or because your version of Transformers is out of date."
1044 )
-> 1045 return config_class.from_dict(config_dict, **unused_kwargs)
1046 else:
1047 # Fallback: use pattern matching on the string.
1048 # We go from longer names to shorter names to catch roberta before bert (for instance)
1049 for pattern in sorted(CONFIG_MAPPING.keys(), key=len, reverse=True):
File ~/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/configuration_utils.py:734, in PretrainedConfig.from_dict(cls, config_dict, **kwargs)
731 # We remove it from kwargs so that it does not appear in `return_unused_kwargs`.
732 config_dict["attn_implementation"] = kwargs.pop("attn_implementation", None)
--> 734 config = cls(**config_dict)
736 if hasattr(config, "pruned_heads"):
737 config.pruned_heads = {int(key): value for key, value in config.pruned_heads.items()}
File ~/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/models/llava/configuration_llava.py:108, in LlavaConfig.__init__(self, vision_config, text_config, ignore_index, image_token_index, projector_hidden_act, vision_feature_select_strategy, vision_feature_layer, image_seq_length, **kwargs)
104 if isinstance(vision_config, dict):
105 vision_config["model_type"] = (
106 vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model"
107 )
--> 108 vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
109 elif vision_config is None:
110 vision_config = CONFIG_MAPPING["clip_vision_model"](
111 intermediate_size=4096,
112 hidden_size=1024,
(...)
118 projection_dim=768,
119 )
File ~/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/models/auto/configuration_auto.py:740, in _LazyConfigMapping.__getitem__(self, key)
738 return self._extra_content[key]
739 if key not in self._mapping:
--> 740 raise KeyError(key)
741 value = self._mapping[key]
742 module_name = model_type_to_module_name(key)
KeyError: 'intern_vit_6b'"
}
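The lookup that raises can be reproduced in isolation: LlavaConfig.__init__ resolves the vision backbone through CONFIG_MAPPING, and remote-code model types are not in that static mapping. A minimal sketch against transformers 4.47.0:

from transformers.models.auto.configuration_auto import CONFIG_MAPPING

# CLIP's vision config is registered, but the remote-code InternViT type is not,
# so CONFIG_MAPPING["intern_vit_6b"] raises the KeyError shown above.
print("clip_vision_model" in CONFIG_MAPPING)  # True
print("intern_vit_6b" in CONFIG_MAPPING)      # False on transformers 4.47.0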
Expected behavior
I think I need to modify the Transformers source code to add InternViT support, just as CLIP is supported. I hope the maintainers can tell me where to make the change.
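One possible workaround, offered only as a sketch and not an official fix: since InternViT ships its classes via trust_remote_code, they can be registered with the auto classes before the saved checkpoint is reloaded, so that "intern_vit_6b" becomes resolvable in the current process.

import torch
from transformers import AutoConfig, AutoModel, LlavaForConditionalGeneration

# Workaround sketch (assumes the remote InternViT code is trusted): load the
# remote classes once, then register them under their model_type so the auto
# machinery can resolve "intern_vit_6b". Registration is per-process, so this
# must run before every from_pretrained call on the saved LLaVA checkpoint.
clip_model_name_or_path = "/home/wangyu/model/models--OpenGVLab--InternViT-300M-448px-V2_5/snapshots/8f86a5e87697180b439811ca69fabbfccd38d996"
intern_vit = AutoModel.from_pretrained(clip_model_name_or_path, trust_remote_code=True)
AutoConfig.register("intern_vit_6b", type(intern_vit.config))
AutoModel.register(type(intern_vit.config), type(intern_vit))

model = LlavaForConditionalGeneration.from_pretrained(
    "slvm/model001",
    device_map="cuda:0",
    torch_dtype=torch.bfloat16,
)
# Note: this only addresses the config lookup; the LLaVA modeling code may
# still need changes to consume InternViT features correctly.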
I love transformers!