CogVLM
====

**CogVLM: Visual Expert for Pretrained Language Models**

 * Paper: https://arxiv.org/pdf/2311.03079

![CogVLM Arch](../assets/cogvlm_arch.jpg)

## Installation

```bash
conda create -n cogvlm python=3.10 -y
conda activate cogvlm

python -m pip install --upgrade pip

# nvidia-smi => cuda 12.2 (12.1 works)
pip install torch==2.1.2+cu121 torchvision==0.16.2+cu121 \
  --extra-index-url https://download.pytorch.org/whl/cu121

pip install xformers==0.0.22 -f https://download.pytorch.org/whl/cu121/torch_stable.html

pip install -r requirements.txt

# download spacy data
python -m spacy download en_core_web_sm

python -c "import torch;print(torch.cuda.is_available(), torch.cuda.get_device_name(0))"


# for quantization:
pip install bitsandbytes
```

**Note:** this notebook requires `transformers==4.36.2`; newer versions won't work.
```bash
pip install --force-reinstall transformers==4.36.2
```

In [None]:
import os
import argparse
from typing import Optional

import torch
from PIL import Image
from transformers import AutoModelForCausalLM, LlamaTokenizer

# Notebook configuration: environment flag, precision switches, target device
# and the checkpoint names used by the cells below.

# Shell equivalent: `export XFORMERS_FORCE_DISABLE=1`.
# NOTE(review): presumably this keeps xformers kernels from being used by the
# remotely loaded model code -- confirm against the xformers documentation.
os.environ["XFORMERS_FORCE_DISABLE"] = "1"

use_bf16 = False  # request bfloat16 (only honoured when use_fp16 is False)
use_fp16 = False  # request float16 explicitly; takes precedence over use_bf16
quant = False     # read by the model-loading cell to select the 4-bit path
device = "cuda" if torch.cuda.is_available() else "cpu"

# Choose the floating point type.
#   * use_bf16 (without use_fp16)  -> bfloat16
#   * use_fp16, or running on CUDA -> float16 (the previous default)
#   * otherwise (CPU, no flag set) -> float32, because half-precision kernels
#     are not reliably available on CPU for the pinned torch version
if use_bf16 and not use_fp16:
    torch_dtype = torch.bfloat16
elif use_fp16 or device == "cuda":
    torch_dtype = torch.float16
else:
    torch_dtype = torch.float32

tokenizer_name = "lmsys/vicuna-7b-v1.5"
model_name = "THUDM/cogvlm-chat-hf"

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


In [2]:
# Load the tokenizer from the checkpoint named by ``tokenizer_name``.
tokenizer = LlamaTokenizer.from_pretrained(tokenizer_name)

# Keyword arguments shared by the quantized and the regular loading path.
load_kwargs = dict(
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
if quant:
    # The quantized path additionally requests 4-bit loading.
    load_kwargs["load_in_4bit"] = True

model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

# Only the non-quantized model is moved to ``device`` explicitly.
if not quant:
    model.to(device)

# Switch to evaluation mode; the trailing ``;`` suppresses the cell output.
model.eval();


  _torch_pytree._register_pytree_node(
    PyTorch 2.0.1+cu118 with CUDA 1108 (you have 2.8.0+cu128)
    Python  3.10.13 (you have 3.10.18)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details
  def forward(cls, ctx, x, w1, b1, w2, b2, w3, b3):
  def backward(cls, ctx, dx5):
Loading checkpoint shards: 100%|██████████| 8/8 [00:16<00:00,  2.07s/it]


In [3]:
# Sample image and the question asked about it.
image_path = "../samples/plants.jpg"
query = "What are the objects in the image?"

# Open the file, force three-channel RGB and scale to 224x224 in one chain.
image = Image.open(image_path).convert("RGB").resize((224, 224))

# Let the model's own helper assemble the conversation inputs for a single
# image with no prior chat history.
input_by_model = model.build_conversation_input_ids(
    tokenizer, query=query, history=[], images=[image]
)

# Show which entries the helper produced.
input_by_model.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'images'])

In [4]:
# Assemble the keyword arguments passed to ``model.generate``.
# The three text-side tensors get a leading batch dimension and are moved to
# ``device``.
inputs = {
    key: input_by_model[key].unsqueeze(0).to(device)
    for key in ("input_ids", "token_type_ids", "attention_mask")
}

# The first image tensor is moved to ``device``, cast to ``torch_dtype`` and
# wrapped in a nested list (presumably one inner list per batch item).
inputs["images"] = [[input_by_model["images"][0].to(device).to(torch_dtype)]]

# ``cross_images`` is optional: forward it only when present and non-empty,
# using the same device / dtype / nesting treatment as ``images``.
cross_images = input_by_model.get("cross_images")
if cross_images:
    inputs["cross_images"] = [[cross_images[0].to(device).to(torch_dtype)]]

# Generation settings: sampling is switched off and the total sequence length
# is capped; adjust ``max_length`` as needed.
gen_kwargs = dict(max_length=2048, do_sample=False)


In [None]:
# Number of prompt tokens; everything after this index is newly generated.
prompt_len = inputs["input_ids"].shape[1]

# Run generation without gradient tracking.
with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)

# Drop the prompt tokens and decode the first (only) sequence in the batch.
new_tokens = outputs[:, prompt_len:]
response = tokenizer.decode(new_tokens[0], skip_special_tokens=True)

print(response)