In [None]:
from PIL import Image
import requests
import torch
from torchvision import io
from typing import Dict
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
# Release unused cached GPU memory left over from earlier runs in this kernel.
torch.cuda.empty_cache()

# Load the model on the available device(s). torch_dtype="auto" lets the
# checkpoint decide the precision; device_map="auto" decides the placement.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

# Image: download the demo picture, failing fast on an HTTP error or a stalled
# connection instead of handing a broken stream to PIL.
url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# A single user turn: one image slot followed by a text instruction.
conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]


# Preprocess the inputs
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
# Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n'

inputs = processor(
    text=[text_prompt], images=[image], padding=True, return_tensors="pt"
)
# Follow the model's actual placement (chosen by device_map="auto") instead of
# assuming that a CUDA device exists.
inputs = inputs.to(model.device)

# Inference: Generation of the output
output_ids = model.generate(**inputs, max_new_tokens=128)
# generate() returns prompt + continuation; drop the prompt tokens of each row
# so that only the newly generated tokens are decoded.
generated_ids = [
    full_ids[len(prompt_ids) :]
    for prompt_ids, full_ids in zip(inputs.input_ids, output_ids)
]
output_text = processor.batch_decode(
    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
print(output_text)


In [2]:
import json
import os
import numpy as np
import time


In [4]:
!pwd

/home/zkl/others/master/pycodes/project/competitions/tianchi/mllm/src


In [12]:

def split_dataset(data_path, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1,saving=True,seed=123):
    """Randomly split ``<data_path>/data.json`` into train/val/test subsets.

    Args:
        data_path: Directory that contains ``data.json`` (a JSON array of records).
        train_ratio: Fraction of the records assigned to the training subset.
        val_ratio: Fraction of the records assigned to the validation subset.
        test_ratio: Fraction of the records assigned to the test subset. When the
            three ratios sum to 1 (or more) the test subset receives every
            remaining record; when they sum to less than 1 the surplus records
            are left out of all three subsets.
        saving: If true, write ``train.json``, ``val.json`` and ``test.json``
            next to ``data.json``.
        seed: Seed of the private random generator that shuffles the records.

    Returns:
        A ``(train_json, val_json, test_json)`` tuple of lists of records.

    Raises:
        ValueError: If any ratio is negative.
    """
    ratios = (train_ratio, val_ratio, test_ratio)
    if min(ratios) < 0:
        raise ValueError(f'split ratios must be non-negative, got {ratios}')
    ratio_sum = train_ratio + val_ratio + test_ratio

    # The records may contain non-ASCII text, so never rely on the locale's
    # default encoding for reading or writing.
    data_json_path = os.path.join(data_path, 'data.json')
    with open(data_json_path, 'r', encoding='utf-8') as f:
        data_json = json.load(f)

    n_records = len(data_json)
    # A private RandomState yields the same permutation as seeding the global
    # generator would, without clobbering the caller's global RNG state.
    rng = np.random.RandomState(seed)
    shuffled_index = rng.permutation(np.arange(n_records))

    train_end = int(n_records * train_ratio)
    val_end = int(n_records * (train_ratio + val_ratio))
    if ratio_sum >= 1.0 or np.isclose(ratio_sum, 1.0):
        # Ratios cover the whole dataset: the test subset takes the remainder, so
        # float round-off (0.8 + 0.1 + 0.1 < 1.0) can never drop a record.
        test_end = n_records
    else:
        # The tiny offset absorbs float round-off before truncating to an int.
        test_end = max(val_end, min(n_records, int(n_records * ratio_sum + 1e-9)))

    train_json = [data_json[i] for i in shuffled_index[:train_end]]
    val_json = [data_json[i] for i in shuffled_index[train_end:val_end]]
    test_json = [data_json[i] for i in shuffled_index[val_end:test_end]]

    if saving:
        for json_name, json_data in zip(['train', 'val', 'test'], [train_json, val_json, test_json]):
            with open(os.path.join(data_path, f'{json_name}.json'), 'w', encoding='utf-8') as f:
                json.dump(obj=json_data, fp=f, ensure_ascii=False)
    return train_json, val_json, test_json
    
    

# Split the raw training data 80/10/10, persist the three subsets and report
# how long the whole operation took.
split_options = dict(train_ratio=0.8, val_ratio=0.1, test_ratio=0.1, saving=True, seed=123)

start_time = time.time()
train_json, val_json, test_json = split_dataset(data_path='../datas/train', **split_options)
elapsed = time.time() - start_time
print(f'Time cost: {elapsed:.2f}s')

# Timings observed for the two ways of materialising the subsets:
#   map + lambda       : 0.01s-0.02s
#   list comprehension : 0.01s-0.02s
    
    
    

Time cost: 0.02s


加载训练集或测试集数据的 Dataset


In [1]:
import sys
from pathlib import Path
from rich import print

# Show the module search path before it is extended.
print(sys.path)

# Make the project root importable so that `src.<module>` imports resolve.
# The notebook runs from `<project root>/src` (its data paths are written as
# `../datas/...`), so the root is the parent of the working directory. Deriving
# it avoids a machine-specific absolute path.
PROJECT_ROOT = str(Path.cwd().resolve().parent)
# Re-running the cell must not append the same entry again.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [3]:
from src.data_utils import CustomDataset,Conversions
import matplotlib.pyplot as plt

# Load one training sample and pull out its images.
data_set=CustomDataset(data_path='../datas/train',data_type='train')
data_dict=data_set[12]
images=data_dict['image']

# Grid layout: at most 4 columns and always at least one row and one column,
# so an empty image list yields a blank figure instead of a division by zero.
num_images = len(images)
cols = max(1, min(num_images, 4))
rows = max(1, (num_images + cols - 1) // cols)

# squeeze=False keeps `axes` a 2-D array for every grid shape. With the default,
# a 1x1 grid collapses to a bare Axes object, which has no .flatten().
fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows), squeeze=False)
axes = axes.flatten()  # 1-D view of the grid for simple iteration

# Draw every image in its own cell, without axis ticks.
for ax, img in zip(axes, images):
    ax.imshow(img)
    ax.axis('off')

# Hide the cells that are left over when the grid is not completely filled.
for idx in range(num_images, len(axes)):
    axes[idx].axis('off')

plt.tight_layout()
plt.show()

# Parse the sample's instruction; as the last expression of the cell its return
# value is shown as the cell output.
conversations=Conversions()
conversations.parse_conversation(data_dict['instruction'])


AttributeError: 'Image' object has no attribute 'shape'

In [None]:
# Qwen/Qwen2-VL-2B-Instruct
from transformers import Qwen2VLModel, Qwen2VLTokenizer,Qwen2VLPreTrainedModel

In [5]:
from jinja2 import Template
# Minimal Jinja2 example: define the template text; `{{ name }}` is the only
# placeholder that gets substituted at render time.
template_str = """
<!DOCTYPE html>
<html>
<head>
    <title>Welcome Page</title>
</head>
<body>
    <h1>Hello, {{ name }}!</h1>
</body>
</html>
"""

# Compile the text into a Template object.
template = Template(template_str)

# Render the template with the context data (name -> 'Alice').
rendered_html = template.render(name='Alice')

print(rendered_html)

In [7]:

# Jinja2 chat template stored under the "chat_template" key. It reads the
# variables `messages`, `add_vision_id` and `add_generation_prompt`, and emits
# the same <|im_start|> / <|vision_start|> markers as the expected prompt shown
# in the first cell's comment.
# NOTE(review): every line ends with a backslash continuation inside a regular
# (non-raw) string, so the leading indentation of each following line becomes
# part of the template text and would show up in rendered output. Confirm this
# is acceptable before using it for real prompts.
a={
    "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}\
        {% for message in messages %}\
            {% if loop.first and message['role'] != 'system' %}\
                <|im_start|>system\nYou are a helpful assistant.<|im_end|>\n\
            {% endif %}\
            <|im_start|>{{ message['role'] }}\n\
            {% if message['content'] is string %}\
                {{ message['content'] }}<|im_end|>\n\
            {% else %}\
                {% for content in message['content'] %}\
                    {% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}\
                        {% set image_count.value = image_count.value + 1 %}\
                        {% if add_vision_id %}\
                            Picture {{ image_count.value }}: \
                        {% endif %}\
                        <|vision_start|><|image_pad|><|vision_end|>\
                        {% elif content['type'] == 'video' or 'video' in content %}\
                            {% set video_count.value = video_count.value + 1 %}\
                            {% if add_vision_id %}Video {{ video_count.value }}: \
                        {% endif %}\
                        <|vision_start|><|video_pad|><|vision_end|>\
                        {% elif 'text' in content %}\
                            {{ content['text'] }}\
                        {% endif %}\
                    {% endfor %}<|im_end|>\n\
            {% endif %}\
        {% endfor %}\
        {% if add_generation_prompt %}\
            <|im_start|>assistant\n\
        {% endif %}"
}
# Compile the template text. Nothing is rendered here: print() shows the
# Template object itself, not a prompt (that would need template.render(...)).
template = Template(a['chat_template']) 
print(template)







In [8]:
# Sample instruction prompt: it asks for the user's intent label given a
# multi-turn customer-service dialogue, contains one `<image>` placeholder turn,
# and ends with the list of candidate labels. Printing it shows the text with
# the "\n" escapes expanded into real line breaks.
x="你是一个电商客服专家，请根据用户与客服的多轮对话判断用户的意图分类标签。\n<用户与客服的对话 START>\n用户: 于\n客服: 十里春风不及您，欢迎来到我们的店铺😊。我是您的小助手【客服精灵】，有什么事情是【客服精灵】能帮到您的吗？\n用户: <image>\n客服: 亲爱的，我在这里哦，只是一张图片还不能解答您的疑问呢，麻烦您用文字具体说明一下，比如“发货时间”、“店铺促销”等。\n用户: 颜色变淡了吗？\n客服: 这款皮带扣采用实心锌合金并经过真空电镀处理，颜色持久不褪，质量上乘。\n<用户与客服的对话 END>\n请直接只输出分类标签结果，不需要其他多余的话。以下是可以参考的分类标签为：[\"反馈密封性不好\",\"是否好用\",\"是否会生锈\",\"排水方式\",\"包装区别\",\"发货数量\",\"反馈用后症状\",\"商品材质\",\"功效功能\",\"是否易褪色\",\"适用季节\",\"能否调光\",\"版本款型区别\",\"单品推荐\",\"用法用量\",\"控制方式\",\"上市时间\",\"商品规格\",\"信号情况\",\"养护方法\",\"套装推荐\",\"何时上货\",\"气泡\"]\n"



print(x)

In [None]:
from PIL import Image
import requests
import torch
from torchvision import io
from typing import Dict
import os
os.environ['HF_HOME']='huggingface'
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from data_utils import CustomDataset,Conversions
from torch.utils.data import DataLoader

def main(num_candidates=101):
    """Run Qwen2-VL on the single-image samples among the first training records.

    The first ``num_candidates`` records of the training set are scanned, and the
    ones that carry exactly one image are kept. Each image is shrunk to half
    size, all prompts are generated in one batch, and the decoded model output
    is printed.

    Args:
        num_candidates: How many leading dataset records to inspect. The default
            of 101 matches the original ``i <= 100`` bound.
    """
    # Load the model on the available device(s); torch_dtype="auto" lets the
    # checkpoint decide the precision.
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto"
    )
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

    data_set=CustomDataset(data_path='datas/train',data_type='train')

    # Visit every candidate index exactly once. The previous `while` loop only
    # advanced on single-image samples, so the first multi-image sample made it
    # spin forever.
    # NOTE(review): assumes CustomDataset implements __len__ -- confirm.
    datas={'instruction':[],'image':[]}
    for i in range(min(num_candidates, len(data_set))):
        sample = data_set[i]
        if len(sample['image']) == 1:
            datas['instruction'].append(sample['instruction'])
            datas['image'].append(sample['image'][0])
    if not datas['image']:
        print('No single-image samples found; nothing to generate.')
        return

    def load_image(source):
        """Return ``source`` as a PIL image, opening it when it is a path or file."""
        # NOTE(review): it is not visible here whether the dataset yields file
        # paths or already-opened images, so both are accepted.
        return source if isinstance(source, Image.Image) else Image.open(source)

    def resize_image(image):
        """Return a copy of ``image`` scaled to half its width and half its height."""
        origin_w, origin_h = image.size  # PIL reports (width, height)
        # resize() also takes (width, height); keep every side at least 1 px.
        return image.resize((max(1, origin_w // 2), max(1, origin_h // 2)))

    datas['image'] = [resize_image(load_image(image)) for image in datas['image']]

    # NOTE(review): assumes parse_conversation returns one chat-format message
    # list per instruction, as apply_chat_template expects -- confirm.
    conversation_helper=Conversions()
    conversations = [conversation_helper.parse_conversation(instruction) for instruction in datas['instruction']]

    # Preprocess the inputs: one prompt string per sample, with the images
    # passed as a flat list in the same order (one image per prompt).
    text_prompts = [
        processor.apply_chat_template(conversation, add_generation_prompt=True)
        for conversation in conversations
    ]
    inputs = processor(
        text=text_prompts, images=datas['image'], padding=True, return_tensors="pt"
    )
    # Move the whole batch at once. Iterating over `inputs` yields its keys, not
    # its tensors, so a per-item `.to()` loop cannot work.
    inputs = inputs.to(model.device)

    # Inference: Generation of the output
    output_ids = model.generate(**inputs, max_new_tokens=128)
    # generate() returns prompt + continuation; keep only the continuation.
    generated_ids = [
        full_ids[len(prompt_ids) :]
        for prompt_ids, full_ids in zip(inputs.input_ids, output_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    print(output_text)


# Run the pipeline only when this code is executed directly (as a script or as
# a notebook cell), not when it is imported as a module.
if __name__=='__main__':
    main()