In [None]:

import torch
import requests
from PIL import Image
from transformers import AutoModelForCausalLM, LlamaTokenizer
from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch

# Multi-GPU CogVLM chat demo: plan a device map with accelerate, dispatch the
# checkpoint across the devices, then run one greedy image-description query.

print('tokenizer')
# Local copy of the tokenizer published on the Hub as 'lmsys/vicuna-7b-v1.5'.
tokenizer = LlamaTokenizer.from_pretrained('/home/ubuntu/models/vicuna-7b-v1.5')

print('AutoModelForCausalLM')
# Instantiate the model under accelerate's empty-weights context; the result
# is only used to plan the device map before the checkpoint is dispatched.
with init_empty_weights():
    model = AutoModelForCausalLM.from_pretrained(
        '/home/ubuntu/models/cogvlm-chat-hf',
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )

# Budget: up to 20GiB on each of GPU 0 and GPU 1, plus 16GiB of CPU memory.
# Modules of the listed classes are never split across devices.
# NOTE(review): 'TransformerLayer' is taken from the upstream CogVLM multi-GPU
# example as the vision tower's layer class -- confirm against the
# checkpoint's remote code. Listing a class that does not exist is harmless.
device_map = infer_auto_device_map(
    model,
    max_memory={0: '20GiB', 1: '20GiB', 'cpu': '16GiB'},
    no_split_module_classes=['CogVLMDecoderLayer', 'TransformerLayer'],
)
model = load_checkpoint_and_dispatch(
    model,
    '/home/ubuntu/models/cogvlm-chat-hf',   # typical, '~/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/balabala'
    device_map=device_map,
)
model = model.eval()

# Optional sanity check: print the device each parameter ended up on.
for name, param in model.named_parameters():
    print(f"{name}: {param.device}")


print('chat example')
# chat example
query = 'Describe this image'

# Download the sample image. A timeout keeps the cell from hanging forever,
# raise_for_status() surfaces HTTP errors instead of an obscure PIL failure,
# and the `with` block releases the connection. convert('RGB') forces PIL to
# read the whole stream while the response is still open.
with requests.get(
    'https://github.com/THUDM/CogVLM/blob/main/examples/1.png?raw=true',
    stream=True,
    timeout=30,
) as response:
    response.raise_for_status()
    image = Image.open(response.raw).convert('RGB')

# Device the input tensors are moved to (same 'cuda' target as before).
input_device = 'cuda'

conversation = model.build_conversation_input_ids(tokenizer, query=query, history=[], images=[image])  # chat mode
# Add a leading batch dimension to each tensor and move it to the GPU; the
# image tensor is additionally cast to bfloat16 to match the model dtype.
inputs = {
    'input_ids': conversation['input_ids'].unsqueeze(0).to(input_device),
    'token_type_ids': conversation['token_type_ids'].unsqueeze(0).to(input_device),
    'attention_mask': conversation['attention_mask'].unsqueeze(0).to(input_device),
    'images': [[conversation['images'][0].to(input_device).to(torch.bfloat16)]],
}
gen_kwargs = {"max_length": 2048, "do_sample": False}

with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    # Drop the first `prompt_length` positions so only the tokens that follow
    # the prompt are decoded.
    prompt_length = inputs['input_ids'].shape[1]
    outputs = outputs[:, prompt_length:]
    print(tokenizer.decode(outputs[0]))



tokenizer
AutoModelForCausalLM


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/75 [00:00<?, ?w/s]

  0%|          | 0/79 [00:00<?, ?w/s]

  0%|          | 0/82 [00:00<?, ?w/s]

  0%|          | 0/80 [00:00<?, ?w/s]

  0%|          | 0/80 [00:00<?, ?w/s]

  0%|          | 0/348 [00:00<?, ?w/s]

  0%|          | 0/439 [00:00<?, ?w/s]

  0%|          | 0/4 [00:00<?, ?w/s]

model.embed_tokens.weight: cuda:0
model.layers.0.self_attn.vision_expert_query_key_value.weight: cuda:0
model.layers.0.self_attn.vision_expert_dense.weight: cuda:0
model.layers.0.self_attn.language_expert_query_key_value.weight: cuda:0
model.layers.0.self_attn.language_expert_dense.weight: cuda:0
model.layers.0.mlp.language_mlp.gate_proj.weight: cuda:0
model.layers.0.mlp.language_mlp.up_proj.weight: cuda:0
model.layers.0.mlp.language_mlp.down_proj.weight: cuda:0
model.layers.0.mlp.vision_mlp.gate_proj.weight: cuda:0
model.layers.0.mlp.vision_mlp.up_proj.weight: cuda:0
model.layers.0.mlp.vision_mlp.down_proj.weight: cuda:0
model.layers.0.input_layernorm.weight: cuda:0
model.layers.0.post_attention_layernorm.weight: cuda:0
model.layers.1.self_attn.vision_expert_query_key_value.weight: cuda:0
model.layers.1.self_attn.vision_expert_dense.weight: cuda:0
model.layers.1.self_attn.language_expert_query_key_value.weight: cuda:0
model.layers.1.self_attn.language_expert_dense.weight: cuda:0
model