KeyError: 'intern_vit_6b' #35279
Labels
bug
Multimodal
WIP
System Info
Ubuntu 24.04.1
transformers 4.47.0
Who can help?
I want to use the latest OpenGVLab/InternViT-300M-448px-V2_5 as the vision encoder for LLaVA, but an error occurs when I run the following code. I believe the cause is that Transformers does not support this vision encoder. I tried to modify the code myself, but it didn't work.
Reproduction
import torch
from PIL import Image
from transformers import AutoModel, CLIPImageProcessor, AutoTokenizer, AutoProcessor, AutoModelForCausalLM
from transformers import LlavaForConditionalGeneration,LlavaConfig
clip_model_name_or_path = "/home/wangyu/model/models--OpenGVLab--InternViT-300M-448px-V2_5/snapshots/8f86a5e87697180b439811ca69fabbfccd38d996"
qwen_model_name_or_path = "/home/wangyu/model/Qwen2.5-0.5B-Instruct"
clip_model = AutoModel.from_pretrained(clip_model_name_or_path, device_map="cuda:0", trust_remote_code=True)
llm_model = AutoModelForCausalLM.from_pretrained(qwen_model_name_or_path, device_map="cuda:0")
llm_tokenizer = AutoTokenizer.from_pretrained(qwen_model_name_or_path)
vision_config = clip_model.config
text_config = llm_model.config
configuration = LlavaConfig(vision_config, text_config)
model = LlavaForConditionalGeneration(configuration)
model.save_pretrained("slvm/model001")
from transformers import LlavaProcessor, LlavaForConditionalGeneration
import torch
import os
from typing import Union
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
model_name_or_path = "slvm/model001"  # make sure this path is correct
llava_processor = LlavaProcessor.from_pretrained(model_name_or_path)
model = LlavaForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    torch_dtype=torch.bfloat16,
)
from PIL import Image
prompt_text = "<image>\nWhat are these?"
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt_text},
]
prompt = llava_processor.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_path = "000000039769.jpg"
image = Image.open(image_path)
inputs = llava_processor(text=prompt, images=image, return_tensors="pt")
for tk in inputs.keys():
    if inputs[tk].dtype == torch.float32:
        inputs[tk] = inputs[tk].to(dtype=torch.bfloat16)
    inputs[tk] = inputs[tk].to(model.device)
generate_ids = model.generate(**inputs, max_new_tokens=20)
gen_text = llava_processor.batch_decode(
    generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
)[0]
print(gen_text)
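For context, the vision checkpoint advertises a remote-code model type, which is what ends up in slvm/model001/config.json. A quick check (only a sketch, reusing the snapshot path from above):

from transformers import AutoConfig

# The remote InternViT config reports model_type "intern_vit_6b"; that string is
# saved into the LLaVA config and must be resolvable when the model is reloaded.
clip_model_name_or_path = "/home/wangyu/model/models--OpenGVLab--InternViT-300M-448px-V2_5/snapshots/8f86a5e87697180b439811ca69fabbfccd38d996"
vision_cfg = AutoConfig.from_pretrained(clip_model_name_or_path, trust_remote_code=True)
print(vision_cfg.model_type)  # expected: "intern_vit_6b"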
error:
{
"name": "KeyError",
"message": "'intern_vit_6b'",
"stack": "---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Cell In[5], line 9
6 from transformers.utils import logging
7 model_name_or_path = "slvm/model001"  # make sure this path is correct
----> 9 llava_processor = LlavaProcessor.from_pretrained(model_name_or_path)
10 model = LlavaForConditionalGeneration.from_pretrained(
11 model_name_or_path,
12 device_map="cuda:0",
13 torch_dtype=torch.bfloat16,
14 )
16 from PIL import Image
File ~/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/processing_utils.py:974, in ProcessorMixin.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, **kwargs)
971 if token is not None:
972 kwargs["token"] = token
--> 974 args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
975 processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
977 return cls.from_args_and_dict(args, processor_dict, **kwargs)
File ~/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/processing_utils.py:1020, in ProcessorMixin._get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
1017 else:
1018 attribute_class = getattr(transformers_module, class_name)
-> 1020 args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
1021 return args
File ~/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py:878, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
876 config = AutoConfig.for_model(**config_dict)
877 else:
--> 878 config = AutoConfig.from_pretrained(
879 pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
880 )
881 config_tokenizer_class = config.tokenizer_class
882 if hasattr(config, "auto_map") and "AutoTokenizer" in config.auto_map:
File ~/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/models/auto/configuration_auto.py:1045, in AutoConfig.from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
1039 except KeyError:
1040 raise ValueError(
1041 f"The checkpoint you are trying to load has model type `{config_dict['model_type']}` "
1042 "but Transformers does not recognize this architecture. This could be because of an "
1043 "issue with the checkpoint, or because your version of Transformers is out of date."
1044 )
-> 1045 return config_class.from_dict(config_dict, **unused_kwargs)
1046 else:
1047 # Fallback: use pattern matching on the string.
1048 # We go from longer names to shorter names to catch roberta before bert (for instance)
1049 for pattern in sorted(CONFIG_MAPPING.keys(), key=len, reverse=True):
File ~/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/configuration_utils.py:734, in PretrainedConfig.from_dict(cls, config_dict, **kwargs)
731 # We remove it from kwargs so that it does not appear in `return_unused_kwargs`.
732 config_dict["attn_implementation"] = kwargs.pop("attn_implementation", None)
--> 734 config = cls(**config_dict)
736 if hasattr(config, "pruned_heads"):
737 config.pruned_heads = {int(key): value for key, value in config.pruned_heads.items()}
File ~/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/models/llava/configuration_llava.py:108, in LlavaConfig.__init__(self, vision_config, text_config, ignore_index, image_token_index, projector_hidden_act, vision_feature_select_strategy, vision_feature_layer, image_seq_length, **kwargs)
104 if isinstance(vision_config, dict):
105 vision_config["model_type"] = (
106 vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model"
107 )
--> 108 vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
109 elif vision_config is None:
110 vision_config = CONFIG_MAPPING["clip_vision_model"](
111 intermediate_size=4096,
112 hidden_size=1024,
(...)
118 projection_dim=768,
119 )
File ~/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/models/auto/configuration_auto.py:740, in _LazyConfigMapping.__getitem__(self, key)
738 return self._extra_content[key]
739 if key not in self._mapping:
--> 740 raise KeyError(key)
741 value = self._mapping[key]
742 module_name = model_type_to_module_name(key)
KeyError: 'intern_vit_6b'"
}
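The lookup that raises can be reproduced in isolation: LlavaConfig.__init__ resolves the vision backbone through CONFIG_MAPPING, and remote-code model types are not in that static mapping. A minimal sketch against transformers 4.47.0:

from transformers.models.auto.configuration_auto import CONFIG_MAPPING

# CLIP's vision config is registered, but the remote-code InternViT type is not,
# so CONFIG_MAPPING["intern_vit_6b"] raises the KeyError shown above.
print("clip_vision_model" in CONFIG_MAPPING)  # True
print("intern_vit_6b" in CONFIG_MAPPING)      # False on transformers 4.47.0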
Expected behavior
I think I need to modify the Transformers source code to add InternViT support, just as CLIP is supported. I hope the maintainers can tell me where to make the change.
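One possible workaround, offered only as a sketch and not an official fix: since InternViT ships its classes via trust_remote_code, they can be registered with the auto classes before the saved checkpoint is reloaded, so that "intern_vit_6b" becomes resolvable in the current process.

import torch
from transformers import AutoConfig, AutoModel, LlavaForConditionalGeneration

# Workaround sketch (assumes the remote InternViT code is trusted): load the
# remote classes once, then register them under their model_type so the auto
# machinery can resolve "intern_vit_6b". Registration is per-process, so this
# must run before every from_pretrained call on the saved LLaVA checkpoint.
clip_model_name_or_path = "/home/wangyu/model/models--OpenGVLab--InternViT-300M-448px-V2_5/snapshots/8f86a5e87697180b439811ca69fabbfccd38d996"
intern_vit = AutoModel.from_pretrained(clip_model_name_or_path, trust_remote_code=True)
AutoConfig.register("intern_vit_6b", type(intern_vit.config))
AutoModel.register(type(intern_vit.config), type(intern_vit))

model = LlavaForConditionalGeneration.from_pretrained(
    "slvm/model001",
    device_map="cuda:0",
    torch_dtype=torch.bfloat16,
)
# Note: this only addresses the config lookup; the LLaVA modeling code may
# still need changes to consume InternViT features correctly.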
I love transformers!