Add new CLI options #68

Merged · 9 commits · Mar 19, 2025
17 changes: 0 additions & 17 deletions tests/vec_inf/cli/test_utils.py
@@ -8,7 +8,6 @@

from vec_inf.cli._utils import (
MODEL_READY_SIGNATURE,
convert_boolean_value,
create_table,
get_base_url,
is_server_running,
@@ -223,19 +222,3 @@ def test_load_config_invalid_user_model(tmp_path):
assert "validation error" in str(excinfo.value).lower()
assert "model_type" in str(excinfo.value)
assert "num_gpus" in str(excinfo.value)


def test_convert_boolean_value_with_string():
"""Testing string inputs."""
assert convert_boolean_value("true") is True
assert convert_boolean_value("TRUE") is True
assert convert_boolean_value("false") is False
assert convert_boolean_value("random_string") is False


def test_convert_boolean_value_with_numeric_and_boolean():
"""Testing integer and boolean inputs."""
assert convert_boolean_value(1) is True
assert convert_boolean_value(0) is False
assert convert_boolean_value(True) is True
assert convert_boolean_value(False) is False
28 changes: 24 additions & 4 deletions vec_inf/cli/_cli.py
@@ -39,6 +39,21 @@ def cli() -> None:
type=float,
help="GPU memory utilization, default to 0.9",
)
@click.option(
"--enable-prefix-caching",
is_flag=True,
help="Enables automatic prefix caching",
)
@click.option(
"--enable-chunked-prefill",
is_flag=True,
help="Enable chunked prefill, enabled by default if max number of sequences > 32k",
)
@click.option(
"--max-num-batched-tokens",
type=int,
help="Maximum number of batched tokens per iteration, defaults to 2048 if --enable-chunked-prefill is set, else None",
)
@click.option(
"--partition",
type=str,
@@ -87,13 +102,18 @@ def cli() -> None:
)
@click.option(
"--pipeline-parallelism",
type=str,
help="Enable pipeline parallelism, accepts 'True' or 'False', default to 'True' for supported models",
is_flag=True,
help="Enable pipeline parallelism, enabled by default for supported models",
)
@click.option(
"--compilation-config",
type=click.Choice(["0", "1", "2", "3"]),
@fcogidi (Contributor Author) commented on Mar 19, 2025:

1 and 2 are meant for internal use for vLLM developers. Quoting the option's description:

    --compilation-config, -O
    torch.compile configuration for the model. When it is a number (0, 1, 2, 3), it will be interpreted as the optimization level. NOTE: level 0 is the default level without any optimization. level 1 and 2 are for internal testing only. level 3 is the recommended level for production. To specify the full compilation config, use a JSON string. Following the convention of traditional compilers, using -O without space is also supported. -O3 is equivalent to -O 3.

Contributor replied:

Good catch, missed that bit in the description, will get rid of them.

help="torch.compile optimization level, accepts '0', '1', '2', or '3', default to '0', which means no optimization is applied",
)
@click.option(
"--enforce-eager",
type=str,
help="Always use eager-mode PyTorch, accepts 'True' or 'False', default to 'False' for custom models if not set",
is_flag=True,
help="Always use eager-mode PyTorch",
)
@click.option(
"--json-mode",
6 changes: 6 additions & 0 deletions vec_inf/cli/_config.py
@@ -47,6 +47,12 @@ class ModelConfig(BaseModel):
max_num_seqs: int = Field(
default=256, gt=0, le=1024, description="Maximum concurrent request sequences"
)
compilation_config: int = Field(
default=0,
gt=-1,
le=4,
description="torch.compile optimization level",
)
gpu_memory_utilization: float = Field(
default=0.9, gt=0.0, le=1.0, description="GPU memory utilization"
)
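
For context on the new config field, here is a minimal sketch (not the project's ModelConfig, which has many other fields) showing how pydantic v2 enforces the gt/le bounds at construction time; pydantic v2 is assumed since the helper already uses model_dump().

# Illustrative only: a minimal model mirroring just the new field.
from pydantic import BaseModel, Field, ValidationError


class CompilationDemo(BaseModel):
    compilation_config: int = Field(
        default=0, gt=-1, le=4, description="torch.compile optimization level"
    )


print(CompilationDemo().compilation_config)                      # 0 (default)
print(CompilationDemo(compilation_config=3).compilation_config)  # 3

try:
    # Rejected: 5 exceeds le=4 (note the bound admits 4, while the CLI
    # choice list only exposes "0"-"3").
    CompilationDemo(compilation_config=5)
except ValidationError as exc:
    print(exc.errors()[0]["type"])  # 'less_than_equal' in pydantic v2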
41 changes: 34 additions & 7 deletions vec_inf/cli/_helper.py
@@ -34,6 +34,13 @@
"max_model_len",
}

BOOLEAN_FIELDS = {
"pipeline_parallelism",
"enforce_eager",
"enable_prefix_caching",
"enable_chunked_prefill",
}

LD_LIBRARY_PATH = "/scratch/ssd001/pkgs/cudnn-11.7-v8.5.0.96/lib/:/scratch/ssd001/pkgs/cuda-11.7/targets/x86_64-linux/lib/"
SRC_DIR = str(Path(__file__).parent.parent)

@@ -90,16 +97,15 @@ def _get_launch_params(self) -> dict[str, Any]:
params = self.model_config.model_dump()

# Process boolean fields
for bool_field in ["pipeline_parallelism", "enforce_eager"]:
if (value := self.cli_kwargs.get(bool_field)) is not None:
params[bool_field] = utils.convert_boolean_value(value)
for bool_field in BOOLEAN_FIELDS:
if self.cli_kwargs[bool_field]:
params[bool_field] = True

# Merge other overrides
for key, value in self.cli_kwargs.items():
if value is not None and key not in [
"json_mode",
"pipeline_parallelism",
"enforce_eager",
*BOOLEAN_FIELDS,
]:
params[key] = value

@@ -129,7 +135,7 @@ def set_env_vars(self) -> None:
os.environ["GPU_MEMORY_UTILIZATION"] = self.params["gpu_memory_utilization"]
os.environ["TASK"] = VLLM_TASK_MAP[self.params["model_type"]]
os.environ["PIPELINE_PARALLELISM"] = self.params["pipeline_parallelism"]
os.environ["ENFORCE_EAGER"] = self.params["enforce_eager"]
os.environ["COMPILATION_CONFIG"] = self.params["compilation_config"]
os.environ["SRC_DIR"] = SRC_DIR
os.environ["MODEL_WEIGHTS"] = str(
Path(self.params["model_weights_parent_dir"], self.model_name)
@@ -138,6 +144,15 @@ def set_env_vars(self) -> None:
os.environ["VENV_BASE"] = self.params["venv"]
os.environ["LOG_DIR"] = self.params["log_dir"]

if self.params.get("enable_prefix_caching"):
os.environ["ENABLE_PREFIX_CACHING"] = self.params["enable_prefix_caching"]
if self.params.get("enable_chunked_prefill"):
os.environ["ENABLE_CHUNKED_PREFILL"] = self.params["enable_chunked_prefill"]
if self.params.get("max_num_batched_tokens"):
os.environ["MAX_NUM_BATCHED_TOKENS"] = self.params["max_num_batched_tokens"]
if self.params.get("enforce_eager"):
os.environ["ENFORCE_EAGER"] = self.params["enforce_eager"]

def build_launch_command(self) -> str:
"""Construct the full launch command with parameters."""
# Base command
@@ -185,8 +200,20 @@ def format_table_output(self, job_id: str) -> Table:
table.add_row("Max Model Length", self.params["max_model_len"])
table.add_row("Max Num Seqs", self.params["max_num_seqs"])
table.add_row("GPU Memory Utilization", self.params["gpu_memory_utilization"])
table.add_row("Compilation Config", self.params["compilation_config"])
table.add_row("Pipeline Parallelism", self.params["pipeline_parallelism"])
table.add_row("Enforce Eager", self.params["enforce_eager"])
if self.params.get("enable_prefix_caching"):
table.add_row("Enable Prefix Caching", self.params["enable_prefix_caching"])
if self.params.get("enable_chunked_prefill"):
table.add_row(
"Enable Chunked Prefill", self.params["enable_chunked_prefill"]
)
if self.params.get("max_num_batched_tokens"):
table.add_row(
"Max Num Batched Tokens", self.params["max_num_batched_tokens"]
)
if self.params.get("enforce_eager"):
table.add_row("Enforce Eager", self.params["enforce_eager"])
table.add_row("Model Weights Directory", os.environ.get("MODEL_WEIGHTS"))
table.add_row("Log Directory", self.params["log_dir"])

7 changes: 0 additions & 7 deletions vec_inf/cli/_utils.py
@@ -158,10 +158,3 @@ def load_config() -> list[ModelConfig]:
ModelConfig(model_name=name, **model_data)
for name, model_data in config.get("models", {}).items()
]


def convert_boolean_value(value: Union[str, int, bool]) -> bool:
"""Convert various input types to boolean strings."""
if isinstance(value, str):
return value.lower() == "true"
return bool(value)
26 changes: 26 additions & 0 deletions vec_inf/multinode_vllm.slurm
@@ -90,6 +90,24 @@ else
export ENFORCE_EAGER=""
fi

if [ "$ENABLE_PREFIX_CACHING" = "True" ]; then
export ENABLE_PREFIX_CACHING="--enable-prefix-caching"
else
export ENABLE_PREFIX_CACHING=""
fi

if [ "$ENABLE_CHUNKED_PREFILL" = "True" ]; then
export ENABLE_CHUNKED_PREFILL="--enable-chunked-prefill"
else
export ENABLE_CHUNKED_PREFILL=""
fi

if [ -z "$MAX_NUM_BATCHED_TOKENS" ]; then
export MAX_NUM_BATCHED_TOKENS=""
else
export MAX_NUM_BATCHED_TOKENS="--max-num-batched-tokens=$MAX_NUM_BATCHED_TOKENS"
fi

# Activate vllm venv
if [ "$VENV_BASE" = "singularity" ]; then
singularity exec --nv --bind ${MODEL_WEIGHTS}:${MODEL_WEIGHTS} $SINGULARITY_IMAGE \
@@ -106,7 +124,11 @@ if [ "$VENV_BASE" = "singularity" ]; then
--max-model-len ${MAX_MODEL_LEN} \
--max-num-seqs ${MAX_NUM_SEQS} \
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
--compilation-config ${COMPILATION_CONFIG} \
--task ${TASK} \
${MAX_NUM_BATCHED_TOKENS} \
${ENABLE_PREFIX_CACHING} \
${ENABLE_CHUNKED_PREFILL} \
${ENFORCE_EAGER}
else
source ${VENV_BASE}/bin/activate
@@ -123,6 +145,10 @@ else
--max-model-len ${MAX_MODEL_LEN} \
--max-num-seqs ${MAX_NUM_SEQS} \
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
--compilation-config ${COMPILATION_CONFIG} \
--task ${TASK} \
${MAX_NUM_BATCHED_TOKENS} \
${ENABLE_PREFIX_CACHING} \
${ENABLE_CHUNKED_PREFILL} \
${ENFORCE_EAGER}
fi
27 changes: 27 additions & 0 deletions vec_inf/vllm.slurm
@@ -23,6 +23,24 @@ else
export ENFORCE_EAGER=""
fi

if [ "$ENABLE_PREFIX_CACHING" = "True" ]; then
export ENABLE_PREFIX_CACHING="--enable-prefix-caching"
else
export ENABLE_PREFIX_CACHING=""
fi

if [ "$ENABLE_CHUNKED_PREFILL" = "True" ]; then
export ENABLE_CHUNKED_PREFILL="--enable-chunked-prefill"
else
export ENABLE_CHUNKED_PREFILL=""
fi

if [ -z "$MAX_NUM_BATCHED_TOKENS" ]; then
export MAX_NUM_BATCHED_TOKENS=""
else
export MAX_NUM_BATCHED_TOKENS="--max-num-batched-tokens=$MAX_NUM_BATCHED_TOKENS"
fi

# Activate vllm venv
if [ "$VENV_BASE" = "singularity" ]; then
export SINGULARITY_IMAGE=/model-weights/vec-inf-shared/vector-inference_latest.sif
@@ -42,8 +60,13 @@ if [ "$VENV_BASE" = "singularity" ]; then
--max-model-len ${MAX_MODEL_LEN} \
--max-num-seqs ${MAX_NUM_SEQS} \
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
--compilation-config ${COMPILATION_CONFIG} \
--task ${TASK} \
${MAX_NUM_BATCHED_TOKENS} \
${ENABLE_PREFIX_CACHING} \
${ENABLE_CHUNKED_PREFILL} \
${ENFORCE_EAGER}

else
source ${VENV_BASE}/bin/activate
python3 -m vllm.entrypoints.openai.api_server \
@@ -58,6 +81,10 @@ else
--max-model-len ${MAX_MODEL_LEN} \
--max-num-seqs ${MAX_NUM_SEQS} \
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
--compilation-config ${COMPILATION_CONFIG} \
--task ${TASK} \
${MAX_NUM_BATCHED_TOKENS} \
${ENABLE_PREFIX_CACHING} \
${ENABLE_CHUNKED_PREFILL} \
${ENFORCE_EAGER}
fi
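
Both SLURM scripts use the same pattern: each optional setting is exported either as its vLLM flag or as an empty string, so unset options simply drop out of the server command line. A rough Python equivalent of that expansion (a sketch under the assumption that the environment variables are set the way the helper above sets them; not a replacement for the scripts):

# Illustrative only: build the optional part of the vllm server command
# from the environment, mirroring the shell expansion in the slurm scripts.
import os
import shlex


def optional_vllm_args() -> list[str]:
    args: list[str] = []
    if os.environ.get("MAX_NUM_BATCHED_TOKENS"):
        args.append(f"--max-num-batched-tokens={os.environ['MAX_NUM_BATCHED_TOKENS']}")
    if os.environ.get("ENABLE_PREFIX_CACHING") == "True":
        args.append("--enable-prefix-caching")
    if os.environ.get("ENABLE_CHUNKED_PREFILL") == "True":
        args.append("--enable-chunked-prefill")
    if os.environ.get("ENFORCE_EAGER") == "True":
        args.append("--enforce-eager")
    return args


print(shlex.join(
    ["python3", "-m", "vllm.entrypoints.openai.api_server", *optional_vllm_args()]
))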