In [3]:
# Requires transformers>=4.51.0

import torch
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel


def last_token_pool(
    last_hidden_states: Tensor,
    attention_mask: Tensor,
) -> Tensor:
    """Use the embedding of each sequence's last token as the sentence embedding.

    Args:
        last_hidden_states: Per-token hidden states; indexed here as
            ``[batch, position, ...]`` (presumably (batch, seq_len, hidden)
            -- confirm against the caller).
        attention_mask: 2-D mask indexed as ``[batch, position]``; assumed to
            hold 1 for real tokens and 0 for padding.

    Returns:
        One hidden-state vector per batch row.
    """
    row_count = attention_mask.shape[0]

    # If the final column of the mask sums to the number of rows, the batch
    # is treated as left-padded and the final position is taken for every row.
    if attention_mask[:, -1].sum() == row_count:
        return last_hidden_states[:, -1]

    # Otherwise pick, per row, the position given by (mask sum - 1).
    final_positions = attention_mask.sum(dim=1) - 1
    row_indices = torch.arange(
        last_hidden_states.shape[0],
        device=last_hidden_states.device,
    )
    return last_hidden_states[row_indices, final_positions]


def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build the instruction-prefixed prompt for a query.

    The result is ``Instruct: <task_description>`` on the first line and
    ``Query:<query>`` (no space after the colon) on the second.
    """
    instruct_line = f'Instruct: {task_description}'
    query_line = f'Query:{query}'
    return '\n'.join((instruct_line, query_line))


def tokenize(tokenizer, input_texts, eod_id, max_length):
    """Tokenize texts, append ``eod_id`` to each sequence, then pad the batch.

    Args:
        tokenizer: Callable tokenizer that also provides a ``pad`` method
            (presumably a Hugging Face tokenizer -- confirm against caller).
        input_texts: Texts to encode.
        eod_id: Token id appended to the end of every encoded sequence.
        max_length: Length budget; the tokenizer is asked to truncate to
            ``max_length - 2`` before the extra token is appended.

    Returns:
        Whatever ``tokenizer.pad(..., padding=True, return_tensors="pt")``
        returns for the extended batch.
    """
    # Encode without padding so the extra token lands directly after the
    # last real token of every sequence.
    encoded = tokenizer(
        input_texts,
        padding=False,
        truncation=True,
        max_length=max_length - 2,
    )

    id_rows = encoded["input_ids"]
    mask_rows = encoded["attention_mask"]
    for token_ids, mask in zip(id_rows, mask_rows):
        token_ids.append(eod_id)
        mask.append(1)  # the appended token is attended to

    return tokenizer.pad(encoded, padding=True, return_tensors="pt")


# Each query must come with a one-sentence instruction that describes the task
task = 'Given a web search query, retrieve relevant passages that answer the query'

# Wrap every raw question with the task instruction.
queries = [
    get_detailed_instruct(task, question)
    for question in (
        'What is the capital of China?',
        'Explain gravity',
    )
]
# No need to add instruction for retrieval documents
documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
]

# Queries first, then documents, in a single list.
input_texts = [*queries, *documents]

# Load the tokenizer (configured for left padding) and the model from a
# local checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained('/home/xwj/Model/qwen3-embedding-0.6b', padding_side='left')
model = AutoModel.from_pretrained('/home/xwj/Model/qwen3-embedding-0.6b')

# We recommend enabling flash_attention_2 for better acceleration and memory saving.
# model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B', attn_implementation="flash_attention_2", torch_dtype=torch.float16).cuda()

# Id of the "<|endoftext|>" token, appended to every sequence by tokenize().
eod_id = tokenizer.convert_tokens_to_ids("<|endoftext|>")
max_length = 8192

# Tokenize the input texts
batch_dict = tokenize(tokenizer, input_texts, eod_id, max_length)
batch_dict.to(model.device)

# Inference only: run the forward pass without autograd so no computation
# graph is recorded and the resulting tensors carry no grad_fn.
with torch.no_grad():
    outputs = model(**batch_dict)
embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
print(embeddings.shape)

# normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)
# The first two rows are the queries and the rest are the documents; with
# unit-length rows the matrix product gives cosine similarities.
scores = (embeddings[:2] @ embeddings[2:].T)
print(scores)
print(scores.tolist())


You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


torch.Size([4, 1024])
tensor([[0.7646, 0.1414],
        [0.1355, 0.6000]], grad_fn=<MmBackward0>)
[[0.7645566463470459, 0.14142508804798126], [0.13549773395061493, 0.5999549627304077]]
