In [4]:
from PIL import Image
import requests
from transformers import AutoProcessor, AutoModel,AutoImageProcessor
import torch

model_path ="/root/autodl-tmp/model/google/siglip-base-patch16-256"

In [None]:
model = AutoModel.from_pretrained(model_path)
processor = AutoProcessor.from_pretrained(model_path)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

texts = ["a photo of 2 cats", "a photo of 2 dogs"]
inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

logits_per_image = outputs.logits_per_image
probs = torch.sigmoid(logits_per_image) # these are the probabilities
print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


35.6% that image 0 is 'a photo of 2 cats'


In [None]:



processor = AutoImageProcessor.from_pretrained(model_path)  
model = AutoModel.from_pretrained(model_path)


url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")  

# 预处理图像
inputs = processor(
    images=image,  # 可以是单张图像或图像列表（批次处理）
    return_tensors="pt",  
    padding=True,  # 批次处理时自动填充
    truncation=False  # 图像通常不截断，而是缩放
)

# 查看预处理结果：包含像素值张量（pixel_values）等
print("预处理输出键值:", inputs.keys())  # 通常包含 'pixel_values'
print("图像张量形状:", inputs["pixel_values"].shape)  # 例如：[1, 3, 256, 256]（批次大小1，3通道，256x256）

# 4. 输入模型获取特征
with torch.no_grad():
    outputs = model.vision_model(**inputs)
image_features = outputs.last_hidden_state[:, 0, :]  # 提取[CLS]标记的特征

预处理输出键值: KeysView({'pixel_values': tensor([[[[ 0.1059,  0.1608,  0.1922,  ..., -0.1765, -0.1922, -0.1922],
          [ 0.1294,  0.1686,  0.1686,  ..., -0.2000, -0.1922, -0.2078],
          [ 0.1137,  0.1608,  0.1686,  ..., -0.2471, -0.2157, -0.2235],
          ...,
          [ 0.8431,  0.8118,  0.7412,  ...,  0.6627,  0.6078,  0.6157],
          [ 0.8196,  0.7961,  0.7804,  ...,  0.5059,  0.4118,  0.3255],
          [ 0.8510,  0.7490,  0.7647,  ..., -0.0902, -0.1922, -0.3176]],

         [[-0.8118, -0.8196, -0.8118,  ..., -0.8902, -0.9059, -0.8980],
          [-0.7804, -0.7882, -0.8039,  ..., -0.8824, -0.8745, -0.8902],
          [-0.8039, -0.7961, -0.7882,  ..., -0.8902, -0.8824, -0.8902],
          ...,
          [-0.2471, -0.2863, -0.3412,  ..., -0.4353, -0.4745, -0.4667],
          [-0.2784, -0.2863, -0.3333,  ..., -0.5294, -0.5765, -0.6235],
          [-0.2627, -0.3490, -0.3412,  ..., -0.8039, -0.8353, -0.8824]],

         [[-0.5529, -0.4588, -0.4510,  ..., -0.7176, -0.6941, -0.70

  return self.preprocess(images, **kwargs)


In [None]:
from transformers import SiglipConfig

config = SiglipConfig.from_pretrained(model_path)
print(config)  # 打印所有配置参数