Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Core][Bugfix]: fix prefix caching for blockv2 #4764

Merged
merged 2 commits into from
May 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 117 additions & 0 deletions tests/core/block/test_prefix_caching_block.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,123 @@ def test_get_common_computed_block_ids(num_blocks: int, block_size: int,

assert (len(res) == zero_point_blocks)

# Test case that assume those prompted block after first immutable would
# be freed into hashless allocator, while first immutable block get ref
# increased.
@staticmethod
@pytest.mark.parametrize("num_blocks", [3])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(10)))
def test_alloc_promotion(num_blocks: int, block_size: int, seed: int):
random.seed(seed)

allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
block_size=block_size)
token_ids = list(range(block_size))

block = allocator.allocate_immutable(prev_block=None,
token_ids=token_ids)

assert allocator._refcounter.get(block.block_id) == 1
m = allocator.allocate_mutable(prev_block=None)

block_id = m.block_id
for i in range(block_size):
m.append_token_ids([i])
# After block get promoted to immutable from mutable, if there is
# already same content hash block, then it shall be released into
# hashless_allocator
# And first immutable block's ref get increased by 1
assert m.block_id == block.block_id
assert block_id in allocator._hashless_allocator._free_block_indices
assert allocator._refcounter.get(block.block_id) == 2

# Test case when eviction and allocation are mixed,
# make sure they work as expected
@staticmethod
@pytest.mark.parametrize("num_blocks", [3])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(10)))
def test_eviction_alloc_mixed(num_blocks: int, block_size: int, seed: int):
random.seed(seed)

all_blocks_list = [i for i in range(num_blocks)]
zero_ref = {i: 0 for i in range(num_blocks)}
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
block_size=block_size)
token_ids = list(range(num_blocks * block_size))

# now we have num_blocks free blocks in hashless allocator
# with internal tracking list _blocks _cached_blocks and evictor
# empty and block's ref shall be 0
assert list(allocator._hashless_allocator._free_block_indices
) == all_blocks_list
assert len(allocator._blocks.keys()) == 0
assert len(allocator._cached_blocks.values()) == 0
assert len(allocator.evictor.free_table.keys()) == 0
assert allocator._refcounter._refcounts == zero_ref

# Allocate immutable chains with only one block residuled in
new_block = []
for i in range(num_blocks):
block = allocator.allocate_immutable(
prev_block=None,
token_ids=token_ids[block_size * i:block_size * (i + 1)])
new_block.append(block)

# Free all blocks, and now all blocks shall be in the evictor
# there shall be no tracking data left in _blocks
# all blocks shall be tracked in _cached_blocks
# all blocks' ref shall be zero
for block in new_block:
allocator.free(block)

assert len(allocator._blocks.keys()) == 0
assert len(allocator._hashless_allocator._free_block_indices) == 0
assert list(allocator._cached_blocks.values()) == all_blocks_list
assert list(allocator.evictor.free_table.keys()) == all_blocks_list
assert allocator._refcounter._refcounts == zero_ref

# Allocate a mutable block, and the first block shall be evicted
# and set its content hash into None, ref to 1
mutable = allocator.allocate_mutable(prev_block=None)

assert mutable.block_id == 0
assert mutable.content_hash is None
assert 0 in allocator._blocks
assert allocator._refcounter.get(0) == 1
assert 0 not in allocator._cached_blocks
assert 0 not in allocator.evictor

# Since this mutable block has no hash yet, it shall be released into
# hashless allocator
allocator.free(mutable)

assert len(allocator._blocks.keys()) == 0
assert allocator._refcounter._refcounts == zero_ref
assert 0 not in allocator._cached_blocks
assert 0 not in allocator.evictor
assert 0 in allocator._hashless_allocator._free_block_indices

# when allocate immutable with first block_size tokens, we
# shall get free block from hashless allocator, thus no block left
# in hashless
block = allocator.allocate_immutable(prev_block=None,
token_ids=token_ids[:block_size])

assert block.block_id == 0
assert len(allocator._hashless_allocator._free_block_indices) == 0
assert 0 in allocator._blocks
assert 0 in allocator._cached_blocks.values()
assert allocator._refcounter.get(0) == 1
assert 0 not in allocator.evictor

# allocate mutable block again, it shall be popped from evictor
mutable = allocator.allocate_mutable(prev_block=None)
assert len(allocator._hashless_allocator._free_block_indices) == 0
assert mutable.block_id not in allocator.evictor.free_table
assert allocator._refcounter.get(mutable.block_id) == 1

# Test case where two last accessed times are equal
@staticmethod
@pytest.mark.parametrize("num_blocks", [1024])
Expand Down
41 changes: 24 additions & 17 deletions vllm/core/block/prefix_caching_block.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,21 +160,17 @@ def allocate_mutable(self,
# If the evictor has blocks available for eviction, evict a block
# and return it.
if self.evictor.num_blocks > 0:
# here we get an evicted block, which is only added
# into evictor if its ref counter is 0
# and since its content would be changed, we need
# to remove it from _cached_blocks's tracking list
block_id, content_hash_to_evict = self.evictor.evict()

# Here we may have scenario that several blocks have
# the same content hash, but due to the latter coming block
# is coming from mutable to immutable path, their physical
# block is added into evictor.
# However in this case, we shall not pop the _cached_blocks,
# as the same content is still used by others, which means
# we need to check ref before decide to pop the list.

_block_id = self._cached_blocks[content_hash_to_evict]
refcount = self._refcounter.get(_block_id)
if refcount == 1:
self._cached_blocks.pop(content_hash_to_evict)
assert _block_id == block_id
assert self._refcounter.get(_block_id) == 0
assert _block_id == block_id

self._cached_blocks.pop(content_hash_to_evict)

self._refcounter.incr(block_id)

Expand All @@ -199,7 +195,11 @@ def allocate_mutable(self,

def _incr_refcount_cached_block(self, block: Block,
block_id: BlockId) -> None:
# since block is already computed, mark it
# now _incr_refcount_cached_block comes from two place
# allocate_immutable/promote_to_immutable_block where hit
# _cached_blocks hash key.
# In both cases, it means that already exists a already
# computed block which shared with block now
block.computed = True

refcount = self._refcounter.incr(block_id)
Expand Down Expand Up @@ -228,13 +228,19 @@ def _free_block_id_for_block(self, block_id: BlockId,
block: Block) -> None:
assert isinstance(block, PrefixCachingBlock)

if block.content_hash is None:
# if we comes from promote_to_immutable_block, it means that
# block.content_hash is never None.
# However we need to release the same content block, so that
# physical block could get reused.
if block.block_id != block_id or block.content_hash is None:
leiwen83 marked this conversation as resolved.
Show resolved Hide resolved
refcount = self._refcounter.get(block_id)
# We have fork case where block would get more than one ref,
# so we cannot free it from tracking if ref cnt large than 1
if refcount <= 1:
assert block.block_id is not None
assert block.block_id is not None
refcount = self._refcounter.get(block.block_id)
if refcount == 1:
del self._blocks[block.block_id]

return self._hashless_allocator.free(block)

refcount = self._refcounter.decr(block_id)
Expand Down Expand Up @@ -317,7 +323,8 @@ def promote_to_immutable_block(self, block: Block) -> BlockId:
if block.content_hash not in self._cached_blocks:
self._cached_blocks[block.content_hash] = block.block_id
else:
self._free_block_id_for_block(block.block_id, block)
self._free_block_id_for_block(
self._cached_blocks[block.content_hash], block)
self._incr_refcount_cached_block(
block, self._cached_blocks[block.content_hash])

Expand Down
Loading