|
32 | 32 | from server.engine.config import Config
|
33 | 33 | from server.utils import get_logger
|
34 | 34 | from task_queue_manager import TaskQueueManager
|
35 |
| -from paddlenlp.trl import llm_utils |
36 | 35 |
|
37 | 36 | from paddlenlp.experimental.transformers import (
|
38 | 37 | EagleProposer,
|
39 | 38 | InferenceWithReferenceProposer,
|
40 | 39 | )
|
| 40 | +from paddlenlp.trl import llm_utils |
41 | 41 | from paddlenlp.trl.llm_utils import get_rotary_position_embedding
|
42 | 42 |
|
43 | 43 | File_Path = os.path.realpath(sys.argv[0])
|
@@ -338,8 +338,12 @@ def init_inputs(self):
|
338 | 338 | self.share_inputs["need_block_len"] = paddle.full(shape=[1], fill_value=0, dtype="int32")
|
339 | 339 | self.share_inputs["used_list_len"] = paddle.full(shape=[self.args.max_batch_size], fill_value=0, dtype="int32")
|
340 | 340 | self.share_inputs["infer_seed"] = paddle.full(shape=[self.args.max_batch_size, 1], fill_value=0, dtype="int64")
|
341 |
| - free_list = list(range(int(self.args.max_block_num * self.args.block_ratio))) |
| 341 | + |
| 342 | + free_list = list( |
| 343 | + range(self.args.max_block_num - 1, int(self.args.max_block_num * self.args.block_ratio) - 1, -1) |
| 344 | + ) |
342 | 345 | self.free_list_len = len(free_list)
|
| 346 | + |
343 | 347 | self.share_inputs["free_list"] = paddle.to_tensor(free_list, dtype="int32")
|
344 | 348 | self.share_inputs["free_list_len"] = paddle.full(shape=[1], fill_value=self.free_list_len, dtype="int32")
|
345 | 349 |
|
|
0 commit comments