diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index d49f3e2f47cf..4d163399cfc6 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -35,7 +35,7 @@ docker run \ echo $ZE_AFFINITY_MASK pip install tblib==3.1.0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager diff --git a/docs/design/debug_vllm_compile.md b/docs/design/debug_vllm_compile.md index 408d2878309d..e565f17da62a 100644 --- a/docs/design/debug_vllm_compile.md +++ b/docs/design/debug_vllm_compile.md @@ -8,9 +8,9 @@ TL;DR: | Online Flag | Offline Flag | Result | |----------|----------|-------------| | --enforce-eager | enforce_eager=True | Turn off torch.compile and CUDAGraphs | -| -O.mode=0 | mode=CompilationMode.NONE | Turn off torch.compile only | -| -O.cudagraph_mode=NONE | compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE) | Turn off CUDAGraphs only | -| -O.backend=eager | compilation_config=CompilationConfig(backend='eager') | Turn off TorchInductor | +| -cc.mode=0 | mode=CompilationMode.NONE | Turn off torch.compile only | +| -cc.cudagraph_mode=NONE | compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE) | Turn off CUDAGraphs only | +| -cc.backend=eager | compilation_config=CompilationConfig(backend='eager') | Turn off TorchInductor | ## vLLM-torch.compile overview @@ -86,11 +86,11 @@ LLM(model, enforce_eager=True) ``` To turn off just torch.compile, pass `mode = NONE` to the compilation config. -(`-O` is short for `--compilation_config`): +(`-cc` is short for `--compilation_config`; `-O.*` dotted syntax is deprecated): ```sh # Online -vllm serve -O.mode=0 +vllm serve -cc.mode=0 ``` ```py @@ -103,7 +103,7 @@ To turn off just CUDAGraphs, pass `cudagraph_mode = NONE`: ```sh # Online -vllm serve -O.cudagraph_mode=NONE +vllm serve -cc.cudagraph_mode=NONE ``` ```py @@ -183,10 +183,10 @@ help debug the issue: ```sh # Online - using unbacked mode -vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=unbacked +vllm serve meta-llama/Llama-3.2-1B -cc.dynamic_shapes_config.type=unbacked # Online - using backed_size_oblivious mode -vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=backed_size_oblivious +vllm serve meta-llama/Llama-3.2-1B -cc.dynamic_shapes_config.type=backed_size_oblivious ``` ```py @@ -233,7 +233,7 @@ to the compilation config: ```sh # online -vllm serve -O.backend=eager +vllm serve -cc.backend=eager ``` ```py @@ -252,7 +252,7 @@ You can also use `TORCH_LOGS=output_code ` to print the Inductor output ### Editable TorchInductor code You can edit the TorchInductor code that gets run by setting `VLLM_COMPILE_CACHE_SAVE_FORMAT=unpacked` -or passing `-O.compile_cache_save_format=unpacked`. The default is `binary`, which means it is not editable. +or passing `-cc.compile_cache_save_format=unpacked`. The default is `binary`, which means it is not editable. This is a useful technique: you can put breakpoints (e.g. `torch.distributed.breakpoint()`) and print statements in the output code. @@ -299,7 +299,7 @@ To turn off just CUDAGraphs, pass `cudagraph_mode = NONE`: ```sh # Online -vllm serve -O.cudagraph_mode=NONE +vllm serve -cc.cudagraph_mode=NONE ``` ```py diff --git a/docs/design/torch_compile.md b/docs/design/torch_compile.md index 7b0b2c1e9697..4dc0da0c7d65 100644 --- a/docs/design/torch_compile.md +++ b/docs/design/torch_compile.md @@ -117,7 +117,7 @@ vllm serve meta-llama/Llama-3.2-1B \ # Alternative: Using dot notation (simpler for single values) -vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=unbacked +vllm serve meta-llama/Llama-3.2-1B -cc.dynamic_shapes_config.type=unbacked ``` #### Choosing the Right Mode diff --git a/tests/compile/fullgraph/test_basic_correctness.py b/tests/compile/fullgraph/test_basic_correctness.py index 965938c4433d..f2e58b5cc423 100644 --- a/tests/compile/fullgraph/test_basic_correctness.py +++ b/tests/compile/fullgraph/test_basic_correctness.py @@ -115,7 +115,7 @@ def test_compile_correctness( str(pp_size), "-tp", str(tp_size), - "-O.cudagraph_mode=none", + "-cc.cudagraph_mode=none", ] all_args: list[list[str]] = [] @@ -128,7 +128,7 @@ def test_compile_correctness( ]: for mode in [CompilationMode.NONE, comp_mode]: all_args.append( - final_args + [f"-O.mode={mode.name}", "-O.backend=inductor"] + final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"] ) # inductor will change the output, so we only compare if the output @@ -148,7 +148,7 @@ def test_compile_correctness( CompilationMode.DYNAMO_TRACE_ONCE, CompilationMode.VLLM_COMPILE, ]: - all_args.append(final_args + [f"-O.mode={mode.name}", "-O.backend=eager"]) + all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"]) all_envs.append({}) all_envs.append({}) diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 0077609b2f36..e46f118f8e84 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -248,15 +248,15 @@ def test_optimization_level(args, expected): @pytest.mark.parametrize( ("args", "expected"), [ - (["-O.mode=0"], 0), - (["-O.mode=1"], 1), - (["-O.mode=2"], 2), - (["-O.mode=3"], 3), + (["-cc.mode=0"], 0), + (["-cc.mode=1"], 1), + (["-cc.mode=2"], 2), + (["-cc.mode=3"], 3), ], ) def test_mode_parser(args, expected): """ - Test compilation config modes (-O.mode=int) map to compilation_config. + Test compilation config modes (-cc.mode=int) map to compilation_config. """ parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) parsed_args = parser.parse_args(args) @@ -273,7 +273,7 @@ def test_compilation_config(): # set to string form of a dict args = parser.parse_args( [ - "-O", + "-cc", '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], "backend": "eager"}', ] ) diff --git a/tests/utils_/test_argparse_utils.py b/tests/utils_/test_argparse_utils.py index c0519155c4ba..0ea4a43d2602 100644 --- a/tests/utils_/test_argparse_utils.py +++ b/tests/utils_/test_argparse_utils.py @@ -27,7 +27,7 @@ def parser(): parser.add_argument("--batch-size", type=int) parser.add_argument("--enable-feature", action="store_true") parser.add_argument("--hf-overrides", type=json.loads) - parser.add_argument("-O", "--compilation-config", type=json.loads) + parser.add_argument("-cc", "--compilation-config", type=json.loads) parser.add_argument("--optimization-level", type=int) return parser @@ -167,8 +167,8 @@ def test_dict_args(parser): "--hf-overrides.key2.key4", "val3", # Test compile config and compilation mode - "-O.use_inductor_graph_partition=true", - "-O.backend", + "-cc.use_inductor_graph_partition=true", + "-cc.backend", "custom", "-O1", # Test = sign @@ -191,9 +191,9 @@ def test_dict_args(parser): "--hf_overrides.key14.key15", "-minus.and.dot", # Test array values - "-O.custom_ops+", + "-cc.custom_ops+", "-quant_fp8", - "-O.custom_ops+=+silu_mul,-rms_norm", + "-cc.custom_ops+=+silu_mul,-rms_norm", ] parsed_args = parser.parse_args(args) assert parsed_args.model_name == "something.something" @@ -234,7 +234,7 @@ def test_duplicate_dict_args(caplog_vllm, parser): "--hf-overrides.key1", "val2", "-O1", - "-O.mode", + "-cc.mode", "2", "-O3", ] @@ -380,29 +380,29 @@ def test_load_config_file(tmp_path): def test_compilation_mode_string_values(parser): - """Test that -O.mode accepts both integer and string mode values.""" - args = parser.parse_args(["-O.mode", "0"]) + """Test that -cc.mode accepts both integer and string mode values.""" + args = parser.parse_args(["-cc.mode", "0"]) assert args.compilation_config == {"mode": 0} args = parser.parse_args(["-O3"]) assert args.optimization_level == 3 - args = parser.parse_args(["-O.mode=NONE"]) + args = parser.parse_args(["-cc.mode=NONE"]) assert args.compilation_config == {"mode": "NONE"} - args = parser.parse_args(["-O.mode", "STOCK_TORCH_COMPILE"]) + args = parser.parse_args(["-cc.mode", "STOCK_TORCH_COMPILE"]) assert args.compilation_config == {"mode": "STOCK_TORCH_COMPILE"} - args = parser.parse_args(["-O.mode=DYNAMO_TRACE_ONCE"]) + args = parser.parse_args(["-cc.mode=DYNAMO_TRACE_ONCE"]) assert args.compilation_config == {"mode": "DYNAMO_TRACE_ONCE"} - args = parser.parse_args(["-O.mode", "VLLM_COMPILE"]) + args = parser.parse_args(["-cc.mode", "VLLM_COMPILE"]) assert args.compilation_config == {"mode": "VLLM_COMPILE"} - args = parser.parse_args(["-O.mode=none"]) + args = parser.parse_args(["-cc.mode=none"]) assert args.compilation_config == {"mode": "none"} - args = parser.parse_args(["-O.mode=vllm_compile"]) + args = parser.parse_args(["-cc.mode=vllm_compile"]) assert args.compilation_config == {"mode": "vllm_compile"} @@ -458,3 +458,25 @@ def test_flat_product(): (3, 4, "a", 5, 6), (3, 4, "b", 5, 6), ] + + +def test_o_legacy_syntax_deprecation(caplog_vllm): + """Test that -O.* dotted syntax emits warnings and converts correctly to -cc syntax.""" + parser = FlexibleArgumentParser() + parser.add_argument("-cc", "--compilation-config", type=json.loads) + + # Test that -O.backend gets converted correctly AND emits warning + args = parser.parse_args(["-O.backend=eager"]) + assert args.compilation_config == {"backend": "eager"} + + # Check that deprecation warning was logged + assert len(caplog_vllm.records) >= 1 + assert ( + "The -O.* dotted syntax for --compilation-config is deprecated" + in caplog_vllm.text + ) + + # Test that -O.mode gets converted correctly + # Note: warning_once won't emit again in same session + args = parser.parse_args(["-O.mode=2"]) + assert args.compilation_config == {"mode": 2} diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 7ac8cc764322..34e70e3e134b 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -193,8 +193,8 @@ class VllmConfig: compilation_config: CompilationConfig = Field(default_factory=CompilationConfig) """`torch.compile` and cudagraph capture configuration for the model. - As a shorthand, one can append compilation arguments via - -0.parameter=argument such as `-O.mode=3` (same as `-O='{"mode":3}'`). + As a shorthand, one can append compilation arguments via + -cc.parameter=argument such as `-cc.mode=3` (same as `-cc='{"mode":3}'`). You can specify the full compilation config like so: `{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}` diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ad5a34c56161..702a15f0b349 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1107,7 +1107,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "--ec-transfer-config", **vllm_kwargs["ec_transfer_config"] ) vllm_group.add_argument( - "--compilation-config", "-O", **vllm_kwargs["compilation_config"] + "--compilation-config", "-cc", **vllm_kwargs["compilation_config"] ) vllm_group.add_argument( "--additional-config", **vllm_kwargs["additional_config"] diff --git a/vllm/utils/argparse_utils.py b/vllm/utils/argparse_utils.py index b68157f02f6c..555fcfea491e 100644 --- a/vllm/utils/argparse_utils.py +++ b/vllm/utils/argparse_utils.py @@ -257,6 +257,17 @@ def repl(match: re.Match) -> str: ): # Convert -O to --optimization-level processed_args.append("--optimization-level") + elif arg.startswith("-O."): + # Handle -O.* dotted syntax - ALL dotted syntax is deprecated + logger.warning_once( + "The -O.* dotted syntax for --compilation-config is " + "deprecated and will be removed in v0.13.0 or v1.0.0" + ", whichever is earlier. Please use -cc.* instead. " + "Example: -cc.backend=eager instead of " + "-O.backend=eager." + ) + converted_arg = arg.replace("-O", "-cc", 1) + processed_args.append(converted_arg) else: processed_args.append(arg)