Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .buildkite/scripts/hardware_ci/run-xpu-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ docker run \
echo $ZE_AFFINITY_MASK
pip install tblib==3.1.0
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
Expand Down
22 changes: 11 additions & 11 deletions docs/design/debug_vllm_compile.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ TL;DR:
| Online Flag | Offline Flag | Result |
|----------|----------|-------------|
| --enforce-eager | enforce_eager=True | Turn off torch.compile and CUDAGraphs |
| -O.mode=0 | mode=CompilationMode.NONE | Turn off torch.compile only |
| -O.cudagraph_mode=NONE | compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE) | Turn off CUDAGraphs only |
| -O.backend=eager | compilation_config=CompilationConfig(backend='eager') | Turn off TorchInductor |
| -cc.mode=0 | mode=CompilationMode.NONE | Turn off torch.compile only |
| -cc.cudagraph_mode=NONE | compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE) | Turn off CUDAGraphs only |
| -cc.backend=eager | compilation_config=CompilationConfig(backend='eager') | Turn off TorchInductor |

## vLLM-torch.compile overview

Expand Down Expand Up @@ -86,11 +86,11 @@ LLM(model, enforce_eager=True)
```

To turn off just torch.compile, pass `mode = NONE` to the compilation config.
(`-O` is short for `--compilation_config`):
(`-cc` is short for `--compilation-config`; the `-O.*` dotted syntax is deprecated):

```sh
# Online
vllm serve -O.mode=0
vllm serve -cc.mode=0
```

```py
Expand All @@ -103,7 +103,7 @@ To turn off just CUDAGraphs, pass `cudagraph_mode = NONE`:

```sh
# Online
vllm serve -O.cudagraph_mode=NONE
vllm serve -cc.cudagraph_mode=NONE
```

```py
Expand Down Expand Up @@ -183,10 +183,10 @@ help debug the issue:

```sh
# Online - using unbacked mode
vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=unbacked
vllm serve meta-llama/Llama-3.2-1B -cc.dynamic_shapes_config.type=unbacked

# Online - using backed_size_oblivious mode
vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=backed_size_oblivious
vllm serve meta-llama/Llama-3.2-1B -cc.dynamic_shapes_config.type=backed_size_oblivious
```

```py
Expand Down Expand Up @@ -233,7 +233,7 @@ to the compilation config:

```sh
# online
vllm serve -O.backend=eager
vllm serve -cc.backend=eager
```

```py
Expand All @@ -252,7 +252,7 @@ You can also use `TORCH_LOGS=output_code <command>` to print the Inductor output
### Editable TorchInductor code

You can edit the TorchInductor code that gets run by setting `VLLM_COMPILE_CACHE_SAVE_FORMAT=unpacked`
or passing `-O.compile_cache_save_format=unpacked`. The default is `binary`, which means it is not editable.
or passing `-cc.compile_cache_save_format=unpacked`. The default is `binary`, which means it is not editable.

This is a useful technique: you can put breakpoints (e.g. `torch.distributed.breakpoint()`)
and print statements in the output code.
Expand Down Expand Up @@ -299,7 +299,7 @@ To turn off just CUDAGraphs, pass `cudagraph_mode = NONE`:

```sh
# Online
vllm serve -O.cudagraph_mode=NONE
vllm serve -cc.cudagraph_mode=NONE
```

```py
Expand Down
2 changes: 1 addition & 1 deletion docs/design/torch_compile.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ vllm serve meta-llama/Llama-3.2-1B \


# Alternative: Using dot notation (simpler for single values)
vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=unbacked
vllm serve meta-llama/Llama-3.2-1B -cc.dynamic_shapes_config.type=unbacked
```

#### Choosing the Right Mode
Expand Down
6 changes: 3 additions & 3 deletions tests/compile/fullgraph/test_basic_correctness.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def test_compile_correctness(
str(pp_size),
"-tp",
str(tp_size),
"-O.cudagraph_mode=none",
"-cc.cudagraph_mode=none",
]

all_args: list[list[str]] = []
Expand All @@ -128,7 +128,7 @@ def test_compile_correctness(
]:
for mode in [CompilationMode.NONE, comp_mode]:
all_args.append(
final_args + [f"-O.mode={mode.name}", "-O.backend=inductor"]
final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"]
)

# inductor will change the output, so we only compare if the output
Expand All @@ -148,7 +148,7 @@ def test_compile_correctness(
CompilationMode.DYNAMO_TRACE_ONCE,
CompilationMode.VLLM_COMPILE,
]:
all_args.append(final_args + [f"-O.mode={mode.name}", "-O.backend=eager"])
all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"])
all_envs.append({})
all_envs.append({})

Expand Down
12 changes: 6 additions & 6 deletions tests/engine/test_arg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,15 +248,15 @@ def test_optimization_level(args, expected):
@pytest.mark.parametrize(
("args", "expected"),
[
(["-O.mode=0"], 0),
(["-O.mode=1"], 1),
(["-O.mode=2"], 2),
(["-O.mode=3"], 3),
(["-cc.mode=0"], 0),
(["-cc.mode=1"], 1),
(["-cc.mode=2"], 2),
(["-cc.mode=3"], 3),
],
)
def test_mode_parser(args, expected):
"""
Test compilation config modes (-O.mode=int) map to compilation_config.
Test compilation config modes (-cc.mode=int) map to compilation_config.
"""
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
parsed_args = parser.parse_args(args)
Expand All @@ -273,7 +273,7 @@ def test_compilation_config():
# set to string form of a dict
args = parser.parse_args(
[
"-O",
"-cc",
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], "backend": "eager"}',
]
)
Expand Down
50 changes: 36 additions & 14 deletions tests/utils_/test_argparse_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def parser():
parser.add_argument("--batch-size", type=int)
parser.add_argument("--enable-feature", action="store_true")
parser.add_argument("--hf-overrides", type=json.loads)
parser.add_argument("-O", "--compilation-config", type=json.loads)
parser.add_argument("-cc", "--compilation-config", type=json.loads)
parser.add_argument("--optimization-level", type=int)
return parser

Expand Down Expand Up @@ -167,8 +167,8 @@ def test_dict_args(parser):
"--hf-overrides.key2.key4",
"val3",
# Test compile config and compilation mode
"-O.use_inductor_graph_partition=true",
"-O.backend",
"-cc.use_inductor_graph_partition=true",
"-cc.backend",
"custom",
"-O1",
# Test = sign
Expand All @@ -191,9 +191,9 @@ def test_dict_args(parser):
"--hf_overrides.key14.key15",
"-minus.and.dot",
# Test array values
"-O.custom_ops+",
"-cc.custom_ops+",
"-quant_fp8",
"-O.custom_ops+=+silu_mul,-rms_norm",
"-cc.custom_ops+=+silu_mul,-rms_norm",
]
parsed_args = parser.parse_args(args)
assert parsed_args.model_name == "something.something"
Expand Down Expand Up @@ -234,7 +234,7 @@ def test_duplicate_dict_args(caplog_vllm, parser):
"--hf-overrides.key1",
"val2",
"-O1",
"-O.mode",
"-cc.mode",
"2",
"-O3",
]
Expand Down Expand Up @@ -380,29 +380,29 @@ def test_load_config_file(tmp_path):


def test_compilation_mode_string_values(parser):
"""Test that -O.mode accepts both integer and string mode values."""
args = parser.parse_args(["-O.mode", "0"])
"""Test that -cc.mode accepts both integer and string mode values."""
args = parser.parse_args(["-cc.mode", "0"])
assert args.compilation_config == {"mode": 0}

args = parser.parse_args(["-O3"])
assert args.optimization_level == 3

args = parser.parse_args(["-O.mode=NONE"])
args = parser.parse_args(["-cc.mode=NONE"])
assert args.compilation_config == {"mode": "NONE"}

args = parser.parse_args(["-O.mode", "STOCK_TORCH_COMPILE"])
args = parser.parse_args(["-cc.mode", "STOCK_TORCH_COMPILE"])
assert args.compilation_config == {"mode": "STOCK_TORCH_COMPILE"}

args = parser.parse_args(["-O.mode=DYNAMO_TRACE_ONCE"])
args = parser.parse_args(["-cc.mode=DYNAMO_TRACE_ONCE"])
assert args.compilation_config == {"mode": "DYNAMO_TRACE_ONCE"}

args = parser.parse_args(["-O.mode", "VLLM_COMPILE"])
args = parser.parse_args(["-cc.mode", "VLLM_COMPILE"])
assert args.compilation_config == {"mode": "VLLM_COMPILE"}

args = parser.parse_args(["-O.mode=none"])
args = parser.parse_args(["-cc.mode=none"])
assert args.compilation_config == {"mode": "none"}

args = parser.parse_args(["-O.mode=vllm_compile"])
args = parser.parse_args(["-cc.mode=vllm_compile"])
assert args.compilation_config == {"mode": "vllm_compile"}


Expand Down Expand Up @@ -458,3 +458,25 @@ def test_flat_product():
(3, 4, "a", 5, 6),
(3, 4, "b", 5, 6),
]


def test_o_legacy_syntax_deprecation(caplog_vllm):
    """Verify the deprecated ``-O.*`` dotted syntax still works via ``-cc``.

    The argument preprocessor should rewrite ``-O.<key>=<value>`` into the
    equivalent ``-cc.<key>=<value>`` form and log a deprecation warning the
    first time the legacy spelling is seen.
    """
    arg_parser = FlexibleArgumentParser()
    arg_parser.add_argument("-cc", "--compilation-config", type=json.loads)

    # Legacy -O.backend must be rewritten to the -cc form AND emit the warning.
    parsed = arg_parser.parse_args(["-O.backend=eager"])
    assert parsed.compilation_config == {"backend": "eager"}

    # The deprecation notice must have been logged at least once.
    assert len(caplog_vllm.records) >= 1
    assert (
        "The -O.* dotted syntax for --compilation-config is deprecated"
        in caplog_vllm.text
    )

    # -O.mode is rewritten as well.
    # Note: warning_once won't emit again in same session.
    parsed = arg_parser.parse_args(["-O.mode=2"])
    assert parsed.compilation_config == {"mode": 2}
4 changes: 2 additions & 2 deletions vllm/config/vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,8 +193,8 @@ class VllmConfig:
compilation_config: CompilationConfig = Field(default_factory=CompilationConfig)
"""`torch.compile` and cudagraph capture configuration for the model.
As a shorthand, one can append compilation arguments via
-0.parameter=argument such as `-O.mode=3` (same as `-O='{"mode":3}'`).
As a shorthand, one can append compilation arguments via
-cc.parameter=argument such as `-cc.mode=3` (same as `-cc='{"mode":3}'`).
You can specify the full compilation config like so:
`{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
Expand Down
2 changes: 1 addition & 1 deletion vllm/engine/arg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1107,7 +1107,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"--ec-transfer-config", **vllm_kwargs["ec_transfer_config"]
)
vllm_group.add_argument(
"--compilation-config", "-O", **vllm_kwargs["compilation_config"]
"--compilation-config", "-cc", **vllm_kwargs["compilation_config"]
)
vllm_group.add_argument(
"--additional-config", **vllm_kwargs["additional_config"]
Expand Down
11 changes: 11 additions & 0 deletions vllm/utils/argparse_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,17 @@ def repl(match: re.Match) -> str:
):
# Convert -O <n> to --optimization-level <n>
processed_args.append("--optimization-level")
elif arg.startswith("-O."):
# Handle -O.* dotted syntax - ALL dotted syntax is deprecated
logger.warning_once(
"The -O.* dotted syntax for --compilation-config is "
"deprecated and will be removed in v0.13.0 or v1.0.0"
", whichever is earlier. Please use -cc.* instead. "
"Example: -cc.backend=eager instead of "
"-O.backend=eager."
)
converted_arg = arg.replace("-O", "-cc", 1)
processed_args.append(converted_arg)
else:
processed_args.append(arg)

Expand Down