Add new CLI options #68

Merged · 9 commits · Mar 19, 2025
17 changes: 0 additions & 17 deletions tests/vec_inf/cli/test_utils.py
@@ -8,7 +8,6 @@

from vec_inf.cli._utils import (
MODEL_READY_SIGNATURE,
convert_boolean_value,
create_table,
get_base_url,
is_server_running,
@@ -223,19 +222,3 @@ def test_load_config_invalid_user_model(tmp_path):
assert "validation error" in str(excinfo.value).lower()
assert "model_type" in str(excinfo.value)
assert "num_gpus" in str(excinfo.value)


def test_convert_boolean_value_with_string():
"""Testing string inputs."""
assert convert_boolean_value("true") is True
assert convert_boolean_value("TRUE") is True
assert convert_boolean_value("false") is False
assert convert_boolean_value("random_string") is False


def test_convert_boolean_value_with_numeric_and_boolean():
"""Testing integer and boolean inputs."""
assert convert_boolean_value(1) is True
assert convert_boolean_value(0) is False
assert convert_boolean_value(True) is True
assert convert_boolean_value(False) is False
28 changes: 24 additions & 4 deletions vec_inf/cli/_cli.py
@@ -39,6 +39,21 @@ def cli() -> None:
type=float,
help="GPU memory utilization, default to 0.9",
)
@click.option(
"--enable-prefix-caching",
is_flag=True,
help="Enables automatic prefix caching",
)
@click.option(
"--enable-chunked-prefill",
is_flag=True,
help="Enable chunked prefill, enabled by default if max number of sequences > 32k",
)
@click.option(
"--max-num-batched-tokens",
type=int,
help="Maximum number of batched tokens per iteration, defaults to 2048 if --enable-chunked-prefill is set, else None",
)
@click.option(
"--partition",
type=str,
@@ -87,13 +102,18 @@ def cli() -> None:
)
@click.option(
"--pipeline-parallelism",
type=str,
help="Enable pipeline parallelism, accepts 'True' or 'False', default to 'True' for supported models",
is_flag=True,
help="Enable pipeline parallelism, enabled by default for supported models",
)
@click.option(
"--compilation-config",
type=click.Choice(["0", "1", "2", "3"]),
@fcogidi (Contributor Author) commented on Mar 19, 2025:

1 and 2 are meant for internal use for vLLM developers. Quoting the option's description:

    --compilation-config, -O
    torch.compile configuration for the model. When it is a number (0, 1, 2, 3), it will be interpreted as the optimization level. NOTE: level 0 is the default level without any optimization. level 1 and 2 are for internal testing only. level 3 is the recommended level for production. To specify the full compilation config, use a JSON string. Following the convention of traditional compilers, using -O without space is also supported. -O3 is equivalent to -O 3.

Contributor replied:

Good catch, missed that bit in the description, will get rid of them.

help="torch.compile optimization level, accepts '0', '1', '2', or '3', default to '0', which means no optimization is applied",
)
@click.option(
"--enforce-eager",
type=str,
help="Always use eager-mode PyTorch, accepts 'True' or 'False', default to 'False' for custom models if not set",
is_flag=True,
help="Always use eager-mode PyTorch",
)
@click.option(
"--json-mode",
6 changes: 6 additions & 0 deletions vec_inf/cli/_config.py
@@ -47,6 +47,12 @@ class ModelConfig(BaseModel):
max_num_seqs: int = Field(
default=256, gt=0, le=1024, description="Maximum concurrent request sequences"
)
compilation_config: int = Field(
default=0,
gt=-1,
le=4,
description="torch.compile optimization level",
)
gpu_memory_utilization: float = Field(
default=0.9, gt=0.0, le=1.0, description="GPU memory utilization"
)
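
For context on the new config field, here is a minimal sketch (not the project's ModelConfig, which has many other fields) showing how pydantic v2 enforces the gt/le bounds at construction time; pydantic v2 is assumed since the helper already uses model_dump().

# Illustrative only: a minimal model mirroring just the new field.
from pydantic import BaseModel, Field, ValidationError


class CompilationDemo(BaseModel):
    compilation_config: int = Field(
        default=0, gt=-1, le=4, description="torch.compile optimization level"
    )


print(CompilationDemo().compilation_config)                      # 0 (default)
print(CompilationDemo(compilation_config=3).compilation_config)  # 3

try:
    # Rejected: 5 exceeds le=4 (note the bound admits 4, while the CLI
    # choice list only exposes "0"-"3").
    CompilationDemo(compilation_config=5)
except ValidationError as exc:
    print(exc.errors()[0]["type"])  # 'less_than_equal' in pydantic v2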
41 changes: 34 additions & 7 deletions vec_inf/cli/_helper.py
@@ -34,6 +34,13 @@
"max_model_len",
}

BOOLEAN_FIELDS = {
"pipeline_parallelism",
"enforce_eager",
"enable_prefix_caching",
"enable_chunked_prefill",
}

LD_LIBRARY_PATH = "/scratch/ssd001/pkgs/cudnn-11.7-v8.5.0.96/lib/:/scratch/ssd001/pkgs/cuda-11.7/targets/x86_64-linux/lib/"
SRC_DIR = str(Path(__file__).parent.parent)

@@ -90,16 +97,15 @@ def _get_launch_params(self) -> dict[str, Any]:
params = self.model_config.model_dump()

# Process boolean fields
for bool_field in ["pipeline_parallelism", "enforce_eager"]:
if (value := self.cli_kwargs.get(bool_field)) is not None:
params[bool_field] = utils.convert_boolean_value(value)
for bool_field in BOOLEAN_FIELDS:
if self.cli_kwargs[bool_field]:
params[bool_field] = True

# Merge other overrides
for key, value in self.cli_kwargs.items():
if value is not None and key not in [
"json_mode",
"pipeline_parallelism",
"enforce_eager",
*BOOLEAN_FIELDS,
]:
params[key] = value

@@ -129,7 +135,7 @@ def set_env_vars(self) -> None:
os.environ["GPU_MEMORY_UTILIZATION"] = self.params["gpu_memory_utilization"]
os.environ["TASK"] = VLLM_TASK_MAP[self.params["model_type"]]
os.environ["PIPELINE_PARALLELISM"] = self.params["pipeline_parallelism"]
os.environ["ENFORCE_EAGER"] = self.params["enforce_eager"]
os.environ["COMPILATION_CONFIG"] = self.params["compilation_config"]
os.environ["SRC_DIR"] = SRC_DIR
os.environ["MODEL_WEIGHTS"] = str(
Path(self.params["model_weights_parent_dir"], self.model_name)
@@ -138,6 +144,15 @@ def set_env_vars(self) -> None:
os.environ["VENV_BASE"] = self.params["venv"]
os.environ["LOG_DIR"] = self.params["log_dir"]

if self.params.get("enable_prefix_caching"):
os.environ["ENABLE_PREFIX_CACHING"] = self.params["enable_prefix_caching"]
if self.params.get("enable_chunked_prefill"):
os.environ["ENABLE_CHUNKED_PREFILL"] = self.params["enable_chunked_prefill"]
if self.params.get("max_num_batched_tokens"):
os.environ["MAX_NUM_BATCHED_TOKENS"] = self.params["max_num_batched_tokens"]
if self.params.get("enforce_eager"):
os.environ["ENFORCE_EAGER"] = self.params["enforce_eager"]

def build_launch_command(self) -> str:
"""Construct the full launch command with parameters."""
# Base command
@@ -185,8 +200,20 @@ def format_table_output(self, job_id: str) -> Table:
table.add_row("Max Model Length", self.params["max_model_len"])
table.add_row("Max Num Seqs", self.params["max_num_seqs"])
table.add_row("GPU Memory Utilization", self.params["gpu_memory_utilization"])
table.add_row("Compilation Config", self.params["compilation_config"])
table.add_row("Pipeline Parallelism", self.params["pipeline_parallelism"])
table.add_row("Enforce Eager", self.params["enforce_eager"])
if self.params.get("enable_prefix_caching"):
table.add_row("Enable Prefix Caching", self.params["enable_prefix_caching"])
if self.params.get("enable_chunked_prefill"):
table.add_row(
"Enable Chunked Prefill", self.params["enable_chunked_prefill"]
)
if self.params.get("max_num_batched_tokens"):
table.add_row(
"Max Num Batched Tokens", self.params["max_num_batched_tokens"]
)
if self.params.get("enforce_eager"):
table.add_row("Enforce Eager", self.params["enforce_eager"])
table.add_row("Model Weights Directory", os.environ.get("MODEL_WEIGHTS"))
table.add_row("Log Directory", self.params["log_dir"])

7 changes: 0 additions & 7 deletions vec_inf/cli/_utils.py
@@ -158,10 +158,3 @@ def load_config() -> list[ModelConfig]:
ModelConfig(model_name=name, **model_data)
for name, model_data in config.get("models", {}).items()
]


def convert_boolean_value(value: Union[str, int, bool]) -> bool:
"""Convert various input types to boolean strings."""
if isinstance(value, str):
return value.lower() == "true"
return bool(value)
26 changes: 26 additions & 0 deletions vec_inf/multinode_vllm.slurm
@@ -90,6 +90,24 @@ else
export ENFORCE_EAGER=""
fi

if [ "$ENABLE_PREFIX_CACHING" = "True" ]; then
export ENABLE_PREFIX_CACHING="--enable-prefix-caching"
else
export ENABLE_PREFIX_CACHING=""
fi

if [ "$ENABLE_CHUNKED_PREFILL" = "True" ]; then
export ENABLE_CHUNKED_PREFILL="--enable-chunked-prefill"
else
export ENABLE_CHUNKED_PREFILL=""
fi

if [ -z "$MAX_NUM_BATCHED_TOKENS" ]; then
export MAX_NUM_BATCHED_TOKENS=""
else
export MAX_NUM_BATCHED_TOKENS="--max-num-batched-tokens=$MAX_NUM_BATCHED_TOKENS"
fi

# Activate vllm venv
if [ "$VENV_BASE" = "singularity" ]; then
singularity exec --nv --bind ${MODEL_WEIGHTS}:${MODEL_WEIGHTS} $SINGULARITY_IMAGE \
@@ -106,7 +124,11 @@ if [ "$VENV_BASE" = "singularity" ]; then
--max-model-len ${MAX_MODEL_LEN} \
--max-num-seqs ${MAX_NUM_SEQS} \
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
--compilation-config ${COMPILATION_CONFIG} \
--task ${TASK} \
${MAX_NUM_BATCHED_TOKENS} \
${ENABLE_PREFIX_CACHING} \
${ENABLE_CHUNKED_PREFILL} \
${ENFORCE_EAGER}
else
source ${VENV_BASE}/bin/activate
@@ -123,6 +145,10 @@ else
--max-model-len ${MAX_MODEL_LEN} \
--max-num-seqs ${MAX_NUM_SEQS} \
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
--compilation-config ${COMPILATION_CONFIG} \
--task ${TASK} \
${MAX_NUM_BATCHED_TOKENS} \
${ENABLE_PREFIX_CACHING} \
${ENABLE_CHUNKED_PREFILL} \
${ENFORCE_EAGER}
fi
27 changes: 27 additions & 0 deletions vec_inf/vllm.slurm
@@ -23,6 +23,24 @@ else
export ENFORCE_EAGER=""
fi

if [ "$ENABLE_PREFIX_CACHING" = "True" ]; then
export ENABLE_PREFIX_CACHING="--enable-prefix-caching"
else
export ENABLE_PREFIX_CACHING=""
fi

if [ "$ENABLE_CHUNKED_PREFILL" = "True" ]; then
export ENABLE_CHUNKED_PREFILL="--enable-chunked-prefill"
else
export ENABLE_CHUNKED_PREFILL=""
fi

if [ -z "$MAX_NUM_BATCHED_TOKENS" ]; then
export MAX_NUM_BATCHED_TOKENS=""
else
export MAX_NUM_BATCHED_TOKENS="--max-num-batched-tokens=$MAX_NUM_BATCHED_TOKENS"
fi

# Activate vllm venv
if [ "$VENV_BASE" = "singularity" ]; then
export SINGULARITY_IMAGE=/model-weights/vec-inf-shared/vector-inference_latest.sif
@@ -42,8 +60,13 @@ if [ "$VENV_BASE" = "singularity" ]; then
--max-model-len ${MAX_MODEL_LEN} \
--max-num-seqs ${MAX_NUM_SEQS} \
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
--compilation-config ${COMPILATION_CONFIG} \
--task ${TASK} \
${MAX_NUM_BATCHED_TOKENS} \
${ENABLE_PREFIX_CACHING} \
${ENABLE_CHUNKED_PREFILL} \
${ENFORCE_EAGER}

else
source ${VENV_BASE}/bin/activate
python3 -m vllm.entrypoints.openai.api_server \
@@ -58,6 +81,10 @@ else
--max-model-len ${MAX_MODEL_LEN} \
--max-num-seqs ${MAX_NUM_SEQS} \
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
--compilation-config ${COMPILATION_CONFIG} \
--task ${TASK} \
${MAX_NUM_BATCHED_TOKENS} \
${ENABLE_PREFIX_CACHING} \
${ENABLE_CHUNKED_PREFILL} \
${ENFORCE_EAGER}
fi
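
Both SLURM scripts use the same pattern: each optional setting is exported either as its vLLM flag or as an empty string, so unset options simply drop out of the server command line. A rough Python equivalent of that expansion (a sketch under the assumption that the environment variables are set the way the helper above sets them; not a replacement for the scripts):

# Illustrative only: build the optional part of the vllm server command
# from the environment, mirroring the shell expansion in the slurm scripts.
import os
import shlex


def optional_vllm_args() -> list[str]:
    args: list[str] = []
    if os.environ.get("MAX_NUM_BATCHED_TOKENS"):
        args.append(f"--max-num-batched-tokens={os.environ['MAX_NUM_BATCHED_TOKENS']}")
    if os.environ.get("ENABLE_PREFIX_CACHING") == "True":
        args.append("--enable-prefix-caching")
    if os.environ.get("ENABLE_CHUNKED_PREFILL") == "True":
        args.append("--enable-chunked-prefill")
    if os.environ.get("ENFORCE_EAGER") == "True":
        args.append("--enforce-eager")
    return args


print(shlex.join(
    ["python3", "-m", "vllm.entrypoints.openai.api_server", *optional_vllm_args()]
))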