In [1]:
import os
import sys
try:
    get_ipython
    current_dir = os.getcwd()
except NameError:
    current_dir = os.path.dirname(os.path.abspath(__file__))

# Set path，temporary path expansion
project_dir = os.path.abspath(os.path.join(current_dir, '..'))
if project_dir not in sys.path:
    sys.path.append(project_dir)
    
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor, AutoModel
from PIL import Image
import requests

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:

LLM_NAME ="Qwen/Qwen3-4B-Instruct-2507"
VISION_NAME = "google/siglip-base-patch16-256"

In [4]:


class LLM_SigLIP_Multimodal(nn.Module):
    def __init__(self, llm_model_name=LLM_NAME, 
                 siglip_model_name=VISION_NAME):
        super().__init__()
        local_path = os.path.join("/root/autodl-tmp","model/")
        print(local_path)
        
        # 加载LLM模型和分词器
        self.tokenizer = AutoTokenizer.from_pretrained(local_path+llm_model_name)
        self.llm = AutoModelForCausalLM.from_pretrained(
            local_path+llm_model_name,
            device_map=device
        )
        self.device = next(self.llm.parameters()).device
        print(f"使用设备: {self.device}")
        
        # 加载SigLIP图像编码器
        self.image_processor = AutoProcessor.from_pretrained(local_path+siglip_model_name)
        self.image_encoder = AutoModel.from_pretrained(local_path+siglip_model_name)
        self.image_encoder = self.image_encoder.to(self.device)
        
        # 确保分词器有pad_token
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        
        # 投影层：将SigLIP的图像特征映射到LLM的嵌入空间
        self.projection = nn.Sequential(
            nn.Linear(self.image_encoder.config.vision_config.hidden_size, 1024),
            nn.GELU(),
            nn.Linear(1024, self.llm.config.hidden_size)
        ).to(self.device)
        
        # 添加图像标记
        self.image_token = "<image>"
        self.tokenizer.add_tokens([self.image_token])
        self.llm.resize_token_embeddings(len(self.tokenizer))
        self.llm = self.llm.to(self.device) 
    
        
    def encode_image(self, image):
        """使用SigLIP编码图像并投影到LLM空间"""
        # print(image)
        # 处理图像  提取图像特征 显式指定 text=None
        inputs = self.image_processor(text=None,images=image, return_tensors="pt").to(self.device)
        # print(inputs)
        # 获取图像特征
        with torch.no_grad():
            vision_outputs = self.image_encoder.vision_model(**inputs)
            # shape: [batch_size, num_patches + 1, vision_embed_dim]（包含CLS标记）
            print(vision_outputs.last_hidden_state.shape)
            image_features = vision_outputs.last_hidden_state[:, 0, :]  # [batch_size, vision_embed_dim]
        
        # 投影到LLM的嵌入空间
        projected_features = self.projection(image_features)
        return projected_features
    
    def generate(self, image, prompt, max_length=200, temperature=0.7):
        """根据图像和提示生成响应"""
        self.eval()
        with torch.no_grad():
            # 编码图像
            image_embeds = self.encode_image(image)
            
            # 准备输入文本，包含图像标记
            input_text = f"{self.image_token}\n{prompt}"
            
            # 编码文本
            inputs = self.tokenizer(
                input_text,
                return_tensors="pt",
                padding=True,
                truncation=True
            ).to(self.device)
            print(inputs.input_ids)
            # 构建输入嵌入：将图像标记替换为实际图像特征
            input_embeds = self.llm.get_input_embeddings()(inputs.input_ids)
            image_token_id = self.tokenizer.convert_tokens_to_ids(self.image_token)
            
            # 找到图像标记的位置并替换
            for batch_idx in range(input_embeds.shape[0]):
                image_positions = (inputs.input_ids[batch_idx] == image_token_id).nonzero()
                if len(image_positions) > 0:
                    pos = image_positions[0, -1]  # 获取第一个图像标记的位置
                    input_embeds[batch_idx, pos] = image_embeds[batch_idx]
            
            # 生成响应
            outputs = self.llm.generate(
                input_ids=inputs.input_ids, 
                inputs_embeds=input_embeds,
                attention_mask=inputs.attention_mask,
                max_length=max_length,
                temperature=temperature,
                do_sample=True,
                pad_token_id=self.tokenizer.pad_token_id,
                eos_token_id=self.tokenizer.eos_token_id
            )
            
            # 解码并返回结果
            return self.tokenizer.decode(outputs[0], skip_special_tokens=True)


In [5]:
print("初始化多模态模型...")
model = LLM_SigLIP_Multimodal(
    llm_model_name=LLM_NAME,
    siglip_model_name=VISION_NAME
)



初始化多模态模型...
/root/autodl-tmp/model/


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


使用设备: cuda:0


In [6]:
try:
    # 加载示例图像
    image_path = "example.jpg"  # 替换为你的图像路径
    # image = Image.open(image_path).convert("RGB")
    # print(f"成功加载图像: {image_path}")
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
  
    # 示例提示
    prompt = "详细描述这张图片的内容，包括场景、物体和可能的氛围。"
    
    # 生成响应
    print("生成响应中...")
    response = model.generate(image, prompt, max_length=300)
    
    print("\n===== 模型响应 =====")
    print(response)
    
except FileNotFoundError:
    print(f"错误: 未找到图像文件 {image_path}")
    print("请确保图像文件存在或修改图像路径")
except Exception as e:
    print(f"发生错误: {str(e)}")

生成响应中...
torch.Size([1, 256, 768])
tensor([[151669,    198, 100700,  53481, 108893,  45930, 104597,   3837, 100630,
         102122,   5373, 109840,  33108,  87267,   9370, 104556,   1773]],
       device='cuda:0')

===== 模型响应 =====
<image>
详细描述这张图片的内容，包括场景、物体和可能的氛围。 由于我目前无法直接查看或分析图片，因此无法提供对具体图片内容的详细描述。如果你能提供图片的详细信息，例如场景描述、物体特征、颜色搭配、构图方式或氛围感受等，我可以帮助你进行详细的分析和描述。

例如，你可以告诉我：
- 场景是室内还是室外？是城市、乡村、森林、沙漠还是其他？
- 图片中有哪些主要物体？比如人物、建筑、植物、车辆、动物等？
- 颜色和光线如何？是明亮温暖的阳光，还是阴暗冷色调的黄昏？
- 氛围是宁静、紧张、欢乐、悲伤、神秘还是其他情绪？
- 是否有特定的风格或艺术表现手法？比如写实、抽象、超现实、摄影、绘画等？

提供这些信息后，我将为你详细描述图片的内容、场景、物体和可能的氛围。期待你的补充！ 📸✨

如果你有图片链接或可以文字描述的内容，也欢迎继续发送！
