From a3a54cdce01cf0ab6ea4cc9fab4173d8e017b181 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 17:51:05 -0700 Subject: [PATCH] [V0 Deprecation] Remove misc V0 tests Signed-off-by: Woosuk Kwon --- tests/model_executor/test_logits_processor.py | 98 ------------------- tests/test_cache_block_hashing.py | 92 ----------------- 2 files changed, 190 deletions(-) delete mode 100644 tests/model_executor/test_logits_processor.py delete mode 100644 tests/test_cache_block_hashing.py diff --git a/tests/model_executor/test_logits_processor.py b/tests/model_executor/test_logits_processor.py deleted file mode 100644 index 532ebba038d3..000000000000 --- a/tests/model_executor/test_logits_processor.py +++ /dev/null @@ -1,98 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import random -from unittest.mock import patch - -import pytest -import torch - -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.utils import set_random_seed -from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import is_pin_memory_available - - -class MockLogitsProcessor(LogitsProcessor): - - def __init__(self, vocab_size: int, scale: float, - fake_logits: torch.Tensor): - super().__init__(vocab_size=vocab_size, scale=scale) - self.fake_logits = fake_logits.clone() - - def forward(self, *args, **kwargs): - with patch( - "vllm.model_executor.layers.logits_processor._prune_hidden_states", - lambda x, y: x - ), patch( - "vllm.model_executor.layers.logits_processor.LogitsProcessor._get_logits", - lambda *args, **kwargs: self.fake_logits): - return super().forward(*args, **kwargs) - - -def _prepare_test( - batch_size: int -) -> tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]: - vocab_size = 32000 - input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16) - fake_logits = torch.full((batch_size, vocab_size), - 1e-2, - dtype=input_tensor.dtype) - logits_processor = MockLogitsProcessor(32000, 0.5, fake_logits) - return input_tensor, fake_logits, logits_processor - - -RANDOM_SEEDS = list(range(128)) -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_logits_processors(seed: int, device: str): - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - input_tensor, fake_logits, logits_processor = _prepare_test(batch_size) - - # This sample logits processor gives infinite score to the i-th token, - # where i is the length of the input sequence. - # We therefore expect the output token sequence to be [0, 1, 2, ...] - def pick_ith(token_ids, logits): - logits[len(token_ids)] = float("inf") - return logits - - seq_group_metadata_list = [] - seq_lens = [] - for i in range(batch_size): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=SamplingParams(temperature=0, - logits_processors=[pick_ith]), - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=device, - pin_memory=is_pin_memory_available()) - logits_processor_output = logits_processor( - lm_head=None, - hidden_states=input_tensor, - sampling_metadata=sampling_metadata) - - assert torch.isinf(logits_processor_output[:, 0]).all() - - fake_logits *= logits_processor.scale - torch.testing.assert_close(logits_processor_output[:, 1], - fake_logits[:, 1], - rtol=1e-4, - atol=0.0) diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py deleted file mode 100644 index 1dba0fd0fb3d..000000000000 --- a/tests/test_cache_block_hashing.py +++ /dev/null @@ -1,92 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Test hashing of cache blocks. - -Run `pytest tests/test_cache_block_hashing.py`. -""" -from typing import Optional - -import pytest - -from vllm.inputs import token_inputs -from vllm.lora.request import LoRARequest -from vllm.sequence import Sequence -from vllm.transformers_utils.tokenizer import get_tokenizer - -# Make two prefixes with different first blocks. -prefix_start = [("You are an expert"), ("You are a")] -prefix_common = ( - " school principal, skilled in effectively managing " - "faculty and staff. Draft 10-15 questions for a potential first grade " - "Head Teacher for my K-12, all-girls', independent school that emphasizes " - "community, joyful discovery, and life-long learning. The candidate is " - "coming in for a first-round panel interview for a 8th grade Math " - "teaching role. They have 5 years of previous teaching experience " - "as an assistant teacher at a co-ed, public school with experience " - "in middle school math teaching. Based on this, fulfill " - "the following: ") -prefixes = [start + prefix_common for start in prefix_start] - -# Sample prompts. -sample_prompts = [ - "Hello, my name is", "The president of the United States is", - "The capital of France is", "The future of AI is" -] - - -# Helper function. -def flatten_2d(li): - return [lss for ls in li for lss in ls] - - -@pytest.mark.parametrize("model", ["facebook/opt-125m"]) -@pytest.mark.parametrize("block_size", [16]) -@pytest.mark.parametrize("max_num_seqs", [256]) -@pytest.mark.parametrize("concurrent_lora_int_ids", - [[None], [1], [None, 1], [None, 1, 2], [1, 2]]) -def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, - concurrent_lora_int_ids: list[Optional[int]]): - - tokenizer = get_tokenizer("facebook/opt-125m") - - hashes: list[list[list[int]]] = [] - - for prefix in prefixes: - for lora_int_id in concurrent_lora_int_ids: - lora_request = None - - if lora_int_id is not None: - lora_request = LoRARequest( - f"example_lora_{lora_int_id}", - lora_int_id, - f"example/path/to/lora_{lora_int_id}", - ) - - hashes.append([]) - prompts = [prefix + prompt for prompt in sample_prompts] - for seq_id, prompt in enumerate(prompts): - hashes[-1].append([]) - prompt_token_ids = tokenizer.encode(prompt) - seq = Sequence(seq_id, - inputs=token_inputs(prompt_token_ids, - prompt=prompt), - block_size=block_size, - eos_token_id=tokenizer.eos_token_id, - lora_request=lora_request) - - num_blocks = len(prompt_token_ids) // block_size - for idx in range(num_blocks): - hashes[-1][-1].append(seq.hash_of_block(idx)) - - # Check that hashes made with two prefixes with different first blocks are - # different everywhere. - for hash0, hash1 in zip(flatten_2d(hashes[0]), flatten_2d(hashes[1])): - assert (hash0 != hash1) - - # Check that hashes of different prompts made with the same prefix are the - # same until the hashes that contain the prompt. - for hash_pref in hashes: - same_hashes = [tuple(h[:-1]) for h in hash_pref] - different_hashes = [h[-1] for h in hash_pref] - assert (len(set(same_hashes)) == 1) - assert (len(set(different_hashes)) == len(different_hashes))