In [1]:
from typing import List, Optional

import torch
from torch import Tensor

import numpy as np
from transformers import AutoTokenizer, AutoModel

In [2]:
class Qwen3TextEmbedding:
    """A class for generating text embeddings using the Qwen3 model."""
    def __init__(
            self,
            model_name: str = 'Qwen/Qwen3-Embedding-0.6B',
            max_length: int = 8192,
        ):
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            padding_side='left'
        )
        self.model = AutoModel.from_pretrained(
            model_name,
            attn_implementation="flash_attention_2",
            torch_dtype=torch.float16,
        ).cuda()
        self.max_length = max_length

    @staticmethod
    def _last_token_pool(
        last_hidden_states: Tensor,
        attention_mask: Tensor,
    ) -> Tensor:
        """Pools the last token representation, handling both left and right padding."""
        left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
        if left_padding:
            return last_hidden_states[:, -1]
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[
            torch.arange(batch_size, device=last_hidden_states.device),
            sequence_lengths
        ]

    @staticmethod
    def _get_detailed_instruct(task_description: str, query: str) -> str:
        """Formats a query with a given task description."""
        return f'Instruct: {task_description}\nQuery: {query}'

    def embed(
            self,
            texts: List[str],
            instruction: Optional[str] = None,
            return_numpy: bool = True
        ):
        """
        Generates normalized embeddings for input text(s).

        Args:
            texts: A list of strings to embed.
            instruction: Optional task instruction for formatting queries.
            return_numpy: Whether to return numpy array (True) or Python lists (False).

        Returns:
            Embeddings as np.ndarray or list of lists.
        """
        if instruction:
            texts = [self._get_detailed_instruct(instruction, text) for text in texts]

        batch_dict = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        ).to(self.model.device)

        outputs = self.model(**batch_dict)
        embeddings = self._last_token_pool(
            outputs.last_hidden_state, batch_dict['attention_mask']
        ).detach().cpu().numpy()

        if return_numpy:
            return embeddings
        return embeddings.tolist()

In [3]:
# ---- Test run ----
english_text = {"texts": ["Hello world!"], "instruction": "Test"}

In [4]:
# Instantiate the embedding model
embedder = Qwen3TextEmbedding()

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


In [5]:
print("Model is on device:", embedder.model.device)

# Run inference
embeddings = embedder.embed(**english_text, return_numpy=True)

print("Embedding shape:", embeddings.shape)
print("Embedding (first 5 values):", embeddings[0][:5])

Model is on device: cuda:0
Embedding shape: (1, 1024)
Embedding (first 5 values): [-0.9863  0.1508 -0.88   -7.734   0.0977]


In [6]:
type(embeddings)

numpy.ndarray

In [7]:
type(embeddings[0][0])

numpy.float16