In [None]:
!pip install transformers accelerate bitsandbytes torch


In [2]:
import torch
import time
import re
from transformers import AutoTokenizer, AutoModelForCausalLM

# ========================
# Load Qwen Model
# ========================
model_name = "Qwen/Qwen2.5-7B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    load_in_8bit=True,
    torch_dtype=torch.float16
)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Qwen loaded successfully in 8-bit mode")

# ========================
# Meta-caption function
# ========================
def get_meta_caption(dense_captions, meta_model, meta_tokenizer, device="cuda"):
    pattern = r"<caption>(.*?)</caption>"

    prompt = """I am providing you with captions for sub-regions of an image.
You need to stitch all the captions into one unified caption for the entire image.
Do not add new information. Keep it short but detailed.
Ignore mentions of backgrounds. Do not hallucinate details.
Return the final caption wrapped inside <caption></caption> tags.
"""

    for i, cap in enumerate(dense_captions):
        prompt += f"{i+1}. {cap}\n"

    messages = [
        {"role": "system", "content": "You are a meta image captioning model. You combine multiple sub-captions into one coherent grounded caption."},
        {"role": "user", "content": prompt}
    ]

    text = meta_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = meta_tokenizer([text], return_tensors="pt").to(device)

    generated_ids = meta_model.generate(
        model_inputs.input_ids,
        max_new_tokens=400,
        do_sample=False
    )

    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = meta_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    try:
        response = re.findall(pattern, response, re.DOTALL)[0]
        return response.strip()
    except:
        return response.strip()

# ========================
# Hardcoded dense captions (5 sets)
# ========================
caption_sets = [
    ["A man holding an umbrella", "The umbrella is red", "It is raining heavily"],
    ["A cat sleeping on a sofa", "The sofa is blue", "There is a blanket nearby"],
    ["A plate of sushi", "Chopsticks on the side", "Wasabi and soy sauce visible"],
    ["A mountain covered in snow", "Trees without leaves", "A ski lift in the background"],
    ["A city skyline at night", "Buildings lit up", "Reflections on the water"]
]

# ========================
# Timing and Throughput
# ========================
start_time = time.time()

for i, dense_caps in enumerate(caption_sets):
    print(f"\nGenerating meta-caption {i+1}/5")
    meta_caption = get_meta_caption(dense_caps, model, tokenizer, device)
    print("→", meta_caption)

end_time = time.time()

total_time = end_time - start_time
throughput = len(caption_sets) / total_time

print(f"\nTotal time for {len(caption_sets)} captions: {total_time:.2f} seconds")
print(f"Throughput: {throughput:.2f} images per second")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Qwen loaded successfully in 8-bit mode

Generating meta-caption 1/5
→ A man holding a red umbrella as it rains heavily.

Generating meta-caption 2/5
→ A cat sleeping on a blue sofa with a nearby blanket.

Generating meta-caption 3/5
→ A plate of sushi with chopsticks on the side, wasabi, and soy sauce readily available.

Generating meta-caption 4/5
→ A mountain covered in snow with trees without leaves, and a ski lift in the background.

Generating meta-caption 5/5
→ A city skyline at night with buildings lit up and reflections on the water.

Total time for 5 captions: 18.59 seconds
Throughput: 0.27 images per second
