# Qwen

## Transformers Usage

In [1]:
# Requires transformers>=4.51.0
import torch
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM

In [2]:
def format_instruction(instruction, query, doc):
    """Render one (instruction, query, document) triple as the reranker's user prompt.

    Args:
        instruction: Task description. ``None`` selects the default
            web-search retrieval instruction.
        query: The search query text.
        doc: The candidate document text.

    Returns:
        A single string with ``<Instruct>``, ``<Query>`` and ``<Document>``
        sections separated by newlines.
    """
    task_text = (
        'Given a web search query, retrieve relevant passages that answer the query'
        if instruction is None
        else instruction
    )
    return f"<Instruct>: {task_text}\n<Query>: {query}\n<Document>: {doc}"

def process_inputs(pairs):
    """Tokenize formatted prompts and wrap each one in the chat prefix/suffix.

    Reads the notebook-level ``tokenizer``, ``model``, ``max_length``,
    ``prefix_tokens`` and ``suffix_tokens``.

    Args:
        pairs: Prompt strings to encode.

    Returns:
        The padded encoding produced by ``tokenizer.pad`` (PyTorch tensors),
        with every entry moved to ``model.device``.
    """
    # Reserve room for the fixed prefix and suffix so that the assembled
    # sequence (prefix + body + suffix) does not exceed max_length.
    body_budget = max_length - len(prefix_tokens) - len(suffix_tokens)
    encoded = tokenizer(
        pairs,
        padding=False,
        truncation='longest_first',
        return_attention_mask=False,
        max_length=body_budget,
    )
    for idx, body_ids in enumerate(encoded['input_ids']):
        encoded['input_ids'][idx] = prefix_tokens + body_ids + suffix_tokens
    batch = tokenizer.pad(encoded, padding=True, return_tensors="pt", max_length=max_length)
    for name in list(batch.keys()):
        batch[name] = batch[name].to(model.device)
    return batch

@torch.no_grad()
def compute_logits(inputs, **kwargs):
    """Score each prompt from the model's logits at the last sequence position.

    Reads the notebook-level ``model``, ``token_true_id`` and
    ``token_false_id``. Extra keyword arguments are accepted but ignored.

    Args:
        inputs: Keyword arguments forwarded to ``model(...)``.

    Returns:
        A list of floats, one per batch row: the probability of
        ``token_true_id`` after normalising over only the
        (``token_false_id``, ``token_true_id``) pair.
    """
    last_step = model(**inputs).logits[:, -1, :]
    negative = last_step[:, token_false_id]
    positive = last_step[:, token_true_id]
    # Column 0 holds the "false" logit and column 1 the "true" logit; the
    # log-softmax runs over just these two columns.
    pair_log_probs = torch.nn.functional.log_softmax(
        torch.stack([negative, positive], dim=1), dim=1
    )
    return pair_log_probs[:, 1].exp().tolist()

In [3]:
# Load the tokenizer and model. Left padding puts each prompt's final token in
# the last sequence position, which is where compute_logits reads the logits.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Reranker-0.6B", padding_side='left')
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-Reranker-0.6B").eval()
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-Reranker-0.6B", torch_dtype=torch.float16, attn_implementation="flash_attention_2").cuda().eval()
# Vocabulary ids of the two answer tokens that compute_logits compares.
token_false_id = tokenizer.convert_tokens_to_ids("no")
token_true_id = tokenizer.convert_tokens_to_ids("yes")
# Upper bound on the full sequence length (prefix + prompt + suffix) used by process_inputs.
max_length = 8192

# Fixed chat scaffolding: `prefix` is the system turn plus the opening of the
# user turn; `suffix` closes the user turn and opens the assistant turn with an
# empty <think> block. Both are tokenized once and spliced around every prompt.
prefix = "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
prefix_tokens = tokenizer.encode(prefix, add_special_tokens=False)
suffix_tokens = tokenizer.encode(suffix, add_special_tokens=False)

# Instruction shared by every query/document pair in this example.
task = 'Given a web search query, retrieve relevant passages that answer the query'

queries = ["What is the capital of China?",
    "Explain gravity",
]

documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
]

# queries[i] is paired with documents[i] (zip), giving one prompt per pair.
pairs = [format_instruction(task, query, doc) for query, doc in zip(queries, documents)]

# Tokenize the input texts
inputs = process_inputs(pairs)
# One score per pair, in the same order as `pairs`.
scores = compute_logits(inputs)

print("scores: ", scores)

You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


scores:  [0.9994982481002808, 0.9993619322776794]


## vLLM Usage

In [1]:
# Requires vllm>=0.8.5
import os

import torch

from transformers import AutoTokenizer, is_torch_npu_available
from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import destroy_model_parallel
import gc
import math
from vllm.inputs.data import TokensPrompt

INFO 06-26 17:02:42 [importing.py:17] Triton not installed or not compatible; certain GPU-related functions will not be available.
INFO 06-26 17:02:43 [__init__.py:244] Automatically detected platform cpu.


In [2]:
# NOTE(review): per the vLLM environment-variable docs this keeps model
# downloads on the Hugging Face Hub instead of ModelScope — confirm for the
# installed vLLM version.
os.environ['VLLM_USE_MODELSCOPE'] = 'False'
# NOTE(review): asks vLLM to start its worker processes with the 'spawn'
# multiprocessing method — confirm against the vLLM docs.
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'

In [3]:
def format_instruction(instruction, query, doc):
    """Build the two-message chat (system + user) for one query/document pair.

    Args:
        instruction: Task description placed in the ``<Instruct>`` section.
        query: Text placed in the ``<Query>`` section.
        doc: Text placed in the ``<Document>`` section.

    Returns:
        A list of two ``{"role", "content"}`` dicts: the fixed system message
        followed by the user message, whose three sections are separated by
        blank lines.
    """
    system_message = {
        "role": "system",
        "content": "Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".",
    }
    user_message = {
        "role": "user",
        "content": f"<Instruct>: {instruction}\n\n<Query>: {query}\n\n<Document>: {doc}",
    }
    return [system_message, user_message]

def process_inputs(pairs, instruction, max_length, suffix_tokens):
    """Turn (query, document) pairs into token-id prompts for vLLM.

    Reads the notebook-level ``tokenizer``.

    Args:
        pairs: Iterable of ``(query, doc)`` tuples.
        instruction: Task instruction passed to ``format_instruction`` for
            every pair.
        max_length: Number of chat-template token ids kept per prompt before
            the suffix is appended.
        suffix_tokens: Token ids appended to every truncated prompt.

    Returns:
        A list of ``TokensPrompt`` objects, one per pair, in input order.
    """
    chats = [format_instruction(instruction, query, doc) for query, doc in pairs]
    token_id_lists = tokenizer.apply_chat_template(
        chats, tokenize=True, add_generation_prompt=False, enable_thinking=False
    )
    # Truncate first and append the suffix afterwards, so the suffix itself is
    # never cut off.
    return [
        TokensPrompt(prompt_token_ids=ids[:max_length] + suffix_tokens)
        for ids in token_id_lists
    ]

def compute_logits(model, messages, sampling_params, true_token, false_token):
    """Score every prompt with a single batched ``model.generate`` call.

    For each returned request, the log-probabilities of the last generated
    step are read for ``true_token`` and ``false_token`` and turned into
    ``p(true) / (p(true) + p(false))``.

    Args:
        model: Object exposing ``generate(messages, sampling_params, use_tqdm=...)``.
        messages: Prompts passed to ``model.generate`` unchanged.
        sampling_params: Sampling configuration passed to ``model.generate`` unchanged.
        true_token: Token id counted as the positive answer.
        false_token: Token id counted as the negative answer.

    Returns:
        A list of floats in [0, 1], one per output of ``model.generate``,
        in the order the outputs were returned.
    """
    # Log-probability assumed for an answer token that is absent from the
    # returned log-probabilities.
    missing_logprob = -10

    scores = []
    for request_output in model.generate(messages, sampling_params, use_tqdm=False):
        # Mapping of token id -> entry with a `.logprob` attribute for the
        # final generated step. These are log-probabilities, not raw logits.
        final_logprobs = request_output.outputs[0].logprobs[-1]
        if true_token in final_logprobs:
            true_logprob = final_logprobs[true_token].logprob
        else:
            true_logprob = missing_logprob
        if false_token in final_logprobs:
            false_logprob = final_logprobs[false_token].logprob
        else:
            false_logprob = missing_logprob
        true_score = math.exp(true_logprob)
        false_score = math.exp(false_logprob)
        scores.append(true_score / (true_score + false_score))
    return scores

def compute_logits_safe(model, messages, sampling_params, true_token, false_token):
    """Score prompts one at a time, substituting a neutral score on failure.

    Each prompt is sent to ``model.generate`` in its own single-element batch.
    If anything raises while generating or scoring a prompt, the error is
    printed and ``0.5`` is recorded for that prompt, so the returned list
    always has one entry per input.

    Args:
        model: Object exposing ``generate(messages, sampling_params, use_tqdm=...)``.
        messages: Iterable of prompts; each is wrapped in a one-item list.
        sampling_params: Sampling configuration passed to ``model.generate`` unchanged.
        true_token: Token id counted as the positive answer.
        false_token: Token id counted as the negative answer.

    Returns:
        A list of floats in [0, 1], one per prompt, in input order.
    """
    # Log-probability assumed for an answer token that is absent from the
    # returned log-probabilities.
    missing_logprob = -10

    def logprob_or_default(step_logprobs, token_id):
        # Look the token up without building a throwaway stand-in object.
        entry = step_logprobs.get(token_id)
        return missing_logprob if entry is None else entry.logprob

    scores = []
    for message in messages:  # Process one at a time
        try:
            outputs = model.generate([message], sampling_params, use_tqdm=False)
            # Log-probabilities (token id -> entry) of the last generated step.
            final_logprobs = outputs[0].outputs[0].logprobs[-1]

            true_score = math.exp(logprob_or_default(final_logprobs, true_token))
            false_score = math.exp(logprob_or_default(final_logprobs, false_token))
            scores.append(true_score / (true_score + false_score))
        except Exception as e:
            # Deliberate best-effort: report the failure and keep going with a
            # neutral score instead of aborting the whole run.
            print(f"Error processing single input: {e}")
            scores.append(0.5)  # Default neutral score
    return scores


In [4]:
# Count the CUDA devices PyTorch reports; `number_of_gpu` is what the
# engine-construction cell branches on (0 -> CPU path, otherwise GPU path).
number_of_gpu = torch.cuda.device_count()
print(f"Number of GPUs detected: {number_of_gpu}")

Number of GPUs detected: 0


In [5]:
# Build the vLLM engine. With GPUs present the model is sharded across all of
# them; otherwise the engine is created without any GPU-specific arguments.
if number_of_gpu != 0:
    print(f"Using {number_of_gpu} GPU(s) for tensor parallelism.")
    model = LLM(
        model='Qwen/Qwen3-Reranker-0.6B',
        tensor_parallel_size=number_of_gpu,
        max_model_len=10000,
        enable_prefix_caching=True,
        gpu_memory_utilization=0.8,
    )
else:
    print("No GPUs detected. Running on CPU mode.")
    # tensor_parallel_size is deliberately not passed on the CPU path.
    # Optional knobs intentionally left at their defaults here:
    #   enable_prefix_caching  (author's note: must stay False or unset on CPU)
    #   enforce_eager, max_num_seqs, gpu_memory_utilization
    model = LLM(
        model='Qwen/Qwen3-Reranker-0.6B',
        max_model_len=10000,
    )

No GPUs detected. Running on CPU mode.
INFO 06-26 17:02:53 [config.py:823] This model supports multiple tasks: {'classify', 'embed', 'generate', 'reward', 'score'}. Defaulting to 'generate'.
INFO 06-26 17:02:53 [arg_utils.py:1653] cpu is experimental on VLLM_USE_V1=1. Falling back to V0 Engine.
INFO 06-26 17:02:53 [config.py:1980] Disabled the custom all-reduce kernel because it is not supported on current platform.
INFO 06-26 17:02:53 [llm_engine.py:230] Initializing a V0 LLM engine (v0.9.1) with config: model='Qwen/Qwen3-Reranker-0.6B', speculative_config=None, tokenizer='Qwen/Qwen3-Reranker-0.6B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=10000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cpu, de



INFO 06-26 17:03:50 [parallel_state.py:1065] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
INFO 06-26 17:03:51 [weight_utils.py:292] Using model weights format ['*.safetensors']
INFO 06-26 17:03:51 [weight_utils.py:345] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 06-26 17:03:53 [default_loader.py:272] Loading weights took 2.12 seconds
INFO 06-26 17:03:53 [executor_base.py:113] # cpu blocks: 2340, # CPU blocks: 0
INFO 06-26 17:03:53 [executor_base.py:118] Maximum concurrency for 10000 tokens per request: 3.74x
INFO 06-26 17:03:54 [llm_engine.py:428] init engine (profile, create kv cache, warmup model) took 0.70 seconds


In [6]:
# Initialize tokenizer (left padding, with the EOS token reused as the pad token)
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-Reranker-0.6B')
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# `suffix` closes the user turn and opens the assistant turn with an empty
# <think> block; process_inputs appends its token ids to every prompt.
suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
max_length=8192
suffix_tokens = tokenizer.encode(suffix, add_special_tokens=False)
# First token id of "yes" / "no": the two answers compute_logits compares.
true_token = tokenizer("yes", add_special_tokens=False).input_ids[0]
false_token = tokenizer("no", add_special_tokens=False).input_ids[0]
# Generate exactly one token and request log-probabilities for it.
# NOTE(review): per the vLLM SamplingParams docs, allowed_token_ids limits
# generation to the listed ids — confirm for the installed version.
sampling_params = SamplingParams(temperature=0,
    max_tokens=1,
    logprobs=20,
    allowed_token_ids=[true_token, false_token],
)


# Instruction shared by every query/document pair in this example.
task = 'Given a web search query, retrieve relevant passages that answer the query'
queries = ["What is the capital of China?",
    "Explain gravity",
]
documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
]

# queries[i] is paired with documents[i].
pairs = list(zip(queries, documents))
# Subtract the suffix length from the truncation budget so that the prompt plus
# the appended suffix stays within max_length tokens.
inputs = process_inputs(pairs, task, max_length-len(suffix_tokens), suffix_tokens)
scores = compute_logits(model, inputs, sampling_params, true_token, false_token)
print('scores', scores)

# Tear down vLLM's model-parallel state once scoring is finished.
destroy_model_parallel()

scores [0.997949256383843, 0.9992145261195076]


# Jina

## Transformers Usage

In [3]:
import torch
from transformers import AutoModelForSequenceClassification

In [4]:
# NOTE(review): per the PyTorch docs, is_built() reports whether this PyTorch
# build was compiled with MPS support; torch.backends.mps.is_available() is the
# check for a usable MPS device on this machine — confirm which one is wanted.
torch.backends.mps.is_built()

True

In [5]:
# Load the Jina multilingual reranker.
# NOTE(review): trust_remote_code=True lets Transformers run model code shipped
# with the checkpoint — only keep it for repositories you trust.
model = AutoModelForSequenceClassification.from_pretrained(
    'jinaai/jina-reranker-v2-base-multilingual',
    torch_dtype="auto",
    trust_remote_code=True,
)

# Pick the device with is_available() rather than is_built(): is_built() only
# says PyTorch was compiled with MPS support, so moving the model to "mps" can
# still fail on a machine without a usable MPS device.
model.to("mps" if torch.backends.mps.is_available() else "cpu") # 'cuda', 'mps', or 'cpu' if no GPU is available
model.eval()

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(1026, 768)
      (token_type_embeddings): Embedding(1, 768)
    )
    (emb_drop): Dropout(p=0.1, inplace=False)
    (emb_ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (encoder): XLMRobertaEncoder(
      (layers): ModuleList(
        (0-11): 12 x Block(
          (mixer): MHA(
            (Wqkv): LinearResidual(in_features=768, out_features=2304, bias=True)
            (inner_attn): SelfAttention(
              (drop): Dropout(p=0.1, inplace=False)
            )
            (inner_cross_attn): CrossAttention(
              (drop): Dropout(p=0.1, inplace=False)
            )
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (dropout1): Dropout(p=0.1, inplace=False)
          (drop_path1): StochasticDepth(p=0.0, mode=r

In [6]:
# Example query and documents
# The documents alternate between a skincare text and a makeup-trends text,
# given in English, German, Spanish, Chinese and Japanese.
query = "Organic skincare products for sensitive skin"
documents = [
    "Organic skincare for sensitive skin with aloe vera and chamomile.",
    "New makeup trends focus on bold colors and innovative techniques",
    "Bio-Hautpflege für empfindliche Haut mit Aloe Vera und Kamille",
    "Neue Make-up-Trends setzen auf kräftige Farben und innovative Techniken",
    "Cuidado de la piel orgánico para piel sensible con aloe vera y manzanilla",
    "Las nuevas tendencias de maquillaje se centran en colores vivos y técnicas innovadoras",
    "针对敏感肌专门设计的天然有机护肤产品",
    "新的化妆趋势注重鲜艳的颜色和创新的技巧",
    "敏感肌のために特別に設計された天然有機スキンケア製品",
    "新しいメイクのトレンドは鮮やかな色と革新的な技術に焦点を当てています",
]

# construct sentence pairs: the same query paired with each document, in order
sentence_pairs = [[query, doc] for doc in documents]

# Score all pairs in one call; the result is printed in the next cell.
scores = model.compute_score(sentence_pairs, max_length=1024)

In [7]:
# Show the scores returned by model.compute_score for `sentence_pairs`.
print(scores)

[0.8300437331199646, 0.09534945338964462, 0.6306849718093872, 0.08269733935594559, 0.7620701193809509, 0.09947021305561066, 0.9263036847114563, 0.05921025201678276, 0.842863142490387, 0.1127953976392746]


In [10]:
# Rank `documents` against `query` with the model's rerank helper, asking for
# the top_n=3 entries; the result is printed in the next cell.
result = model.rerank(
    query,
    documents,
    max_query_length=512,
    max_length=1024,
    top_n=3
)

In [11]:
# Show the entries returned by model.rerank.
print(result)

[{'document': '针对敏感肌专门设计的天然有机护肤产品', 'relevance_score': 0.9263036847114563, 'index': 6}, {'document': '敏感肌のために特別に設計された天然有機スキンケア製品', 'relevance_score': 0.842863142490387, 'index': 8}, {'document': 'Organic skincare for sensitive skin with aloe vera and chamomile.', 'relevance_score': 0.8300437331199646, 'index': 0}]
