In [None]:
from PIL import Image

from transformers import CLIPProcessor, CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
url = "data/quic360/downtest/images/2094501355_045ede6d89_k.jpg"
image = Image.open(url)
inputs = processor(text=["a photo of a panorama", 
                         "There are tall round cocktail tables in the center in a line, there is bright red carpeting, and there are brown benches against the back wall.",
                         "The all furniture are made by the wood and the metal stand , there have three tables by the wooden material"], 
                   images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)
logits_per_image = outputs.logits_per_image # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print("Logits per image:", logits_per_image)
print("Probabilities:", probs)


Logits per image: tensor([[25.8388, 32.4433, 24.4310]], grad_fn=<TBackward0>)
Probabilities: tensor([[1.3520e-03, 9.9832e-01, 3.3079e-04]], grad_fn=<SoftmaxBackward0>)


In [None]:
from transformers import Qwen3ForCausalLM, AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
model = Qwen3ForCausalLM.from_pretrained("Qwen/Qwen3-0.6B")
import torch
import torch.nn as nn
import torch.nn.functional as F
text = "Hello, how are you?"
input_ids = tok(text, return_tensors="pt").input_ids
out = model(input_ids, return_dict=True)


Output logits: tensor([[[ 5.9520,  4.9115,  3.7664,  ...,  1.4857,  1.4857,  1.4857],
         [10.4748,  0.7778,  0.2089,  ..., -1.7404, -1.7404, -1.7404],
         [ 7.9126,  5.2561,  3.0047,  ..., -1.7214, -1.7214, -1.7214],
         [ 7.5768,  6.2890,  2.9667,  ..., -2.4134, -2.4134, -2.4134],
         [11.8694,  7.0810,  4.7935,  ..., -1.7592, -1.7592, -1.7592],
         [ 7.1119,  5.2475,  2.5917,  ..., -0.0563, -0.0563, -0.0563]]],
       grad_fn=<UnsafeViewBackward0>)


In [17]:
embedded = model.model.embed_tokens(input_ids)
print(embedded.shape)

torch.Size([1, 6, 1024])


In [12]:
print("Output logits:", out.logits.shape)

Output logits: torch.Size([1, 6, 151936])


In [10]:
print(model.get_input_embeddings())

Embedding(151936, 1024)


In [5]:

print("Output logits:", out)



Output logits: CausalLMOutputWithPast(loss=None, logits=tensor([[[ 4.4758,  4.5917,  4.9683,  ...,  2.1538,  2.1538,  2.1538],
         [ 7.5466,  5.1561,  6.5977,  ..., -0.2438, -0.2438, -0.2438],
         [ 9.3812,  9.6463, 11.9673,  ...,  0.3227,  0.3227,  0.3227],
         [10.3389,  7.1600, 14.7043,  ..., -0.4302, -0.4303, -0.4303]]],
       grad_fn=<UnsafeViewBackward0>), past_key_values=<transformers.cache_utils.DynamicCache object at 0x313367110>, hidden_states=None, attentions=None)
