In [1]:
pip install transformers qwen_vl_utils accelerate optimum auto-gptq

Looking in indexes: https://mirrors.aliyun.com/pypi/simple
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the model and the processor once at module level so that every call to
# process_tsv reuses them instead of reloading the checkpoint.
# Both objects must come from the same checkpoint, so the path is defined once.
MODEL_PATH = "./Qwen2-VL-7B-Instruct-GPTQ-Int8"

model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_PATH, torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained(MODEL_PATH)


def process_tsv(input_file, output_file, img_base_path):
    """
    Replace the third column of a TSV file with Qwen2-VL image descriptions.

    Every value in the third column is treated as an image file name relative
    to ``img_base_path``. The image is passed to the module-level ``model`` /
    ``processor`` pair, and the generated description replaces the file name.
    All other columns are written back unchanged.

    Args:
        input_file (str): Path of the TSV file to read.
        output_file (str): Path of the TSV file to write.
        img_base_path (str): Directory that the image names in the third
            column are joined onto.

    Notes:
        Processing is best-effort: a row whose image name is missing, or whose
        image cannot be processed, gets the text "Error generating description"
        instead of aborting the whole run.
        Rows that reference the same image share a single generated
        description, so the model runs once per distinct image.
    """
    error_placeholder = "Error generating description"

    def generate_image_description(image_path):
        """
        Generate a description of one image with the Qwen2-VL model.

        Args:
            image_path (str): Path of the image to describe.

        Returns:
            str | None: The generated description, or None when any step
            fails (the error is printed).
        """
        try:
            # Build the single-turn chat message: one image plus the instruction.
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image_path},
                        {"type": "text", "text": "Identify the key elements in the image and describe each element's specific emotions, actions, or expressions in a single paragraph of no more than 50 words."},
                    ],
                }
            ]
            # Prepare the text prompt and the vision inputs.
            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            image_inputs, video_inputs = process_vision_info(messages)

            with torch.no_grad():  # no gradients are needed for inference
                inputs = processor(
                    text=[text],
                    images=image_inputs,
                    videos=video_inputs,
                    padding=True,
                    return_tensors="pt",
                )
                # Follow the model's own device instead of hard-coding "cuda";
                # the model is loaded with device_map="auto".
                inputs = inputs.to(model.device)

                # Run inference, then strip the prompt tokens from each output
                # so that only the newly generated tokens are decoded.
                generated_ids = model.generate(**inputs, max_new_tokens=192)
                generated_ids_trimmed = [
                    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
                ]
                output_text = processor.batch_decode(
                    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
                )
            return output_text[0]
        except Exception as e:
            # Deliberate best-effort: report the failure and let the caller
            # substitute the placeholder so one bad image cannot stop the run.
            print(f"Error processing {image_path}: {e}")
            return None

    # Read the TSV file.
    data = pd.read_csv(input_file, sep="\t")

    # Explicit positional access to the third column (label-independent).
    image_names = data.iloc[:, 2]
    total_rows = len(data)

    # Successful descriptions keyed by image path. Failures are not cached, so
    # a later row that references the same image gets another attempt.
    description_cache = {}
    new_descriptions = []
    for position, image_name in enumerate(image_names, start=1):
        if pd.isna(image_name):
            # An empty cell cannot be joined into a path; treat it like any
            # other per-row failure.
            print(f"Error processing row {position}: missing image path")
            description = error_placeholder
        else:
            img_path = os.path.join(img_base_path, str(image_name))
            description = description_cache.get(img_path)
            if description is None:
                generated = generate_image_description(img_path)
                # Release cached GPU memory after each inference attempt.
                torch.cuda.empty_cache()
                if generated is None:
                    description = error_placeholder
                else:
                    description = generated
                    description_cache[img_path] = generated
        new_descriptions.append(description)
        print(f"Processed row {position}/{total_rows}: {description}")

    # Replace the third column as a whole so the assignment does not depend on
    # the dtype pandas inferred for the original column.
    data[data.columns[2]] = new_descriptions

    # Save the new TSV file.
    data.to_csv(output_file, sep="\t", index=False)
    print(f"Updated TSV file saved to {output_file}")


CUDA extension not installed.
CUDA extension not installed.
`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.46it/s]


In [4]:
# Generate descriptions for the images listed in test.tsv and save the result
# as new_test.tsv.
process_tsv(
    input_file="test.tsv",
    output_file="new_test.tsv",
    img_base_path="./twitterdataset/img_data/twitter2015_images/",
)

Processed row 1/1037: The image shows a man in a white hat and red shirt standing among a group of people. He appears to be speaking or giving instructions, with his hand raised and a focused expression. The people around him are engaged, some looking up at him, suggesting a moment of instruction or motivation. The setting seems to be outdoors, possibly during a sports event or training session.
Processed row 2/1037: The image shows a person with vibrant, colorful hair styled in tall, playful pigtails. They are wearing a polka-dotted scarf and a green outfit. The background includes a cozy living room setting with a dog and a potted plant. The overall mood is cheerful and lighthearted.
Processed row 3/1037: The image depicts a stone sculpture of two figures embracing. The figures are abstract and simplified, with smooth, rounded forms that convey a sense of intimacy and connection. The stone's texture adds a raw, natural feel to the piece, evoking emotions of warmth, protection, and af

In [5]:
# Generate descriptions for the images listed in dev.tsv and save the result
# as new_dev.tsv.
process_tsv(
    input_file="dev.tsv",
    output_file="new_dev.tsv",
    img_base_path="./twitterdataset/img_data/twitter2015_images/",
)

Processed row 1/1122: The image shows a person with shoulder-length black hair, wearing a black sleeveless top. The background features a colorful, abstract design with the word "Kylie" and some other text. The overall mood is casual and relaxed.
Processed row 2/1122: The image shows a bronze plaque on a metal table, featuring a portrait and text honoring Pedro Jaime Martinez. The plaque is placed in a workshop or industrial setting, with various tools and materials visible in the background. The plaque commemorates Martinez's contributions to baseball, particularly his time with the Boston Red Sox and Philadelphia Phillies. The setting suggests a focus on craftsmanship and dedication to the craft.
Processed row 3/1122: The image shows a bronze plaque on a metal table, featuring a portrait and text honoring Pedro Jaime Martinez. The plaque is placed in a workshop or industrial setting, with various tools and materials visible in the background. The plaque commemorates Martinez's contri