# **1. Setup & Library Imports**

In [1]:
import os
import pandas as pd
import torch
from transformers import Qwen2_5_VLForConditionalGeneration
from transformers import AutoProcessor
from PIL import Image
from qwen_vl_utils import process_vision_info
from tqdm import tqdm

# **2. Model**

In [2]:
# Hugging Face Hub checkpoint shared by the model weights and the processor.
model_id = "Qwen/Qwen2.5-VL-3B-Instruct"

# Weight-loading options: bfloat16 weights, device placement left to
# `device_map="auto"`.
load_options = dict(torch_dtype=torch.bfloat16, device_map="auto")
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_id, **load_options)

# Processor for the same checkpoint; the inference loop uses it for chat
# templating, building the input tensors, and decoding generated ids.
processor = AutoProcessor.from_pretrained(model_id)

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:  71%|#######   | 2.81G/3.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:  68%|######8   | 2.40G/3.53G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/216 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/5.70k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

# **3. Create prompt function**

In [3]:
def create_prompt(emoji_title):
    """Build the text prompt asking the model to describe an emoji image.

    The prompt has three newline-separated parts: the instruction (with the
    target sentence template), one worked example, and the input title.

    Args:
        emoji_title: Title of the emoji; interpolated into the final
            ``Input: '...'`` line.

    Returns:
        str: The complete prompt as plain text.
    """
    # Plain strings, not dicts: interpolating a dict into an f-string embeds
    # its Python repr (braces, quoted keys, literal "\n", escaped quotes),
    # which is noise for the model.
    # The {...} placeholders below are intentionally literal (this is not an
    # f-string); they show the model the slots it has to fill.
    instruction = (
        "Given the emoji title and the corresponding image, generate a natural language description "
        "of the emoji's key features. The description should include: 1) the head shape, 2) eye characteristics, "
        "3) mouth characteristics, 4) facial expression, 5) skin color, 6) any action (if present), and 7) background color.\n"
        "The description should be concise and structured like this:\n"
        "\"{emoji_title} emoji with a {head shape}, {eye description} eyes, {mouth description} mouth, "
        "{expression description} expression, {skin color} skin color, {action description}, "
        "with a {background color} background.\""
    )
    example = (
        "For an emoji titled 'Pepe the Frog', the description should be like:\n"
        "'Pepe the frog emoji with a round head, big eyes, smiling mouth, happy expression, "
        "green skin color, no action, with a green background.'"
    )
    return f"Instruction: {instruction}\nExample: {example}\nInput: '{emoji_title}'"

# **4. Setup input**

In [7]:
# Where the crawled images live, and the label shown in the progress-bar
# description of the inference loop.
images_folder = "crawled_data/images"
subfolder = "emoji_dataset"

# Metadata table; the inference loop reads its `file_name` and `image_title` columns.
df = pd.read_csv("crawled_data/metadata.csv")

In [8]:
# Generate one description per metadata row and store it in df["prompt"].
# Results are written row by row so that an interrupted run keeps whatever
# was already generated.
for row_idx, row in tqdm(df.iterrows(), total=len(df), desc=f"Processing {subfolder}"):
    image_path = os.path.join(images_folder, row["file_name"])
    prompt = create_prompt(row["image_title"])

    # Open inside a context manager so the file handle is released every
    # iteration; convert() returns an independent in-memory copy.
    with Image.open(image_path) as image_file:
        image = image_file.convert("RGBA")

    # Single-turn chat message: the image followed by the text prompt.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)

    # Send the inputs to the device the model was actually placed on
    # (device_map="auto") instead of assuming "cuda".
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt"
    ).to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # Drop the prompt tokens so that only the newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    result_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    df.loc[row_idx, "prompt"] = result_text

  attn_output = F.scaled_dot_product_attention(
Processing emoji_dataset: 100%|██████████| 3866/3866 [3:18:00<00:00,  3.07s/it]  


In [9]:
# Write the metadata, including the generated "prompt" column, to CSV
# without the row index.
df.to_csv(path_or_buf="emoji_descriptions.csv", index=False)