Refined the Flux demo, fixed a device-mismatch bug, and prototyped CUDA Graphs and weight streaming support

cehongwang · cehongwang · commit 48a7c942a0ab · 2025-03-18T04:28:10.000Z
diff --git a/examples/apps/flux-demo.py b/examples/apps/flux-demo.py
@@ -43,14 +43,20 @@
     "debug": False,
     "use_python_runtime": True,
     "immutable_weights": False,
+    # "cache_built_engines": True,
+    # "reuse_cached_engines": True,
+    # "timing_cache_path": "/home/engine_cache/flux.bin",
+    # "engine_cache_size": 40 * 1 << 30,
+    # "enable_weight_streaming": False,
+    # "enable_cuda_graph": True,
 }
 
 trt_gm = torch_tensorrt.MutableTorchTensorRTModule(backbone, **settings)
 trt_gm.set_expected_dynamic_shape_range((), dynamic_shapes)
 pipe.transformer = trt_gm
 
 
-def generate_image(prompt, inference_step, batch_size=1):
+def generate_image(prompt, inference_step, batch_size=2):
     image = pipe(
         prompt,
         output_type="pil",
@@ -60,7 +66,8 @@ def generate_image(prompt, inference_step, batch_size=1):
     return image
 
 
-generate_image(["A golden retriever holding a sign to code"], 2)
+generate_image(["Test"], 2)
+torch.cuda.empty_cache()
 
 
 def model_change(model):
@@ -76,14 +83,20 @@ def model_change(model):
 def load_lora(path):
 
     pipe.load_lora_weights(
-        path,
+        path or "/home/TensorRT/examples/apps/NGRVNG.safetensors",  # honor the caller's path; demo checkpoint is only the empty-path fallback
         adapter_name="lora1",
     )
     pipe.set_adapters(["lora1"], adapter_weights=[1])
     pipe.fuse_lora()
     pipe.unload_lora_weights()
-    print("LoRA loaded!")
+    print("LoRA loaded! Begin refitting")
+    generate_image(["Test"], 2)
+    print("Refitting Finished!")
+
 
+generate_image(["Test"], 2)
+load_lora("")
+generate_image(["A golden retriever holding a sign to code"], 2)
 
 # Create Gradio interface
 with gr.Blocks(title="Flux Demo with Torch-TensorRT") as demo:
@@ -103,7 +116,8 @@ def load_lora(path):
 
             lora_upload_path = gr.Textbox(
                 label="LoRA Path",
-                placeholder="/home/TensorRT/examples/apps/NGRVNG.safetensors",
+                placeholder="Enter the LoRA checkpoint path here",
+                value="/home/TensorRT/examples/apps/NGRVNG.safetensors",
                 lines=2,
             )
             num_steps = gr.Slider(
diff --git a/py/torch_tensorrt/dynamo/_refit.py b/py/torch_tensorrt/dynamo/_refit.py
@@ -317,37 +317,62 @@ def refit_module_weights(
 
     new_gm = post_lowering(new_gm, settings)
 
-    logger.info("Compilation Settings: %s\n", settings)
+    logger.debug("Lowered Input graph: " + str(new_gm.graph))
 
     # Set torch-executed ops
-    CONVERTERS.set_disallowed_targets(settings.torch_executed_ops)
+    CONVERTERS.set_compilation_settings(settings)
+
+    # Check the number of supported operations in the graph
+    num_supported_ops, total_ops = partitioning.get_graph_converter_support(
+        new_gm, settings.debug, settings.torch_executed_ops
+    )
+
+    if num_supported_ops == 0 or (
+        num_supported_ops < settings.min_block_size and not settings.dryrun
+    ):
+        logger.warning(
+            f"{num_supported_ops} supported operations detected in subgraph containing {total_ops} computational nodes. "
+            f"Skipping this subgraph, since min_block_size was detected to be {settings.min_block_size}"
+        )
+        return new_gm
+    else:
+        logger.debug(
+            f"Detected support for {num_supported_ops} operators out of {total_ops} in subgraph."
+        )
 
     # If specified, try using the fast partitioner and fall back to the global one on failure
     if settings.use_fast_partitioner:
         try:
+            logger.info("Partitioning the graph via the fast partitioner")
             new_partitioned_module, supported_ops = partitioning.fast_partition(
                 new_gm,
                 verbose=settings.debug,
                 min_block_size=settings.min_block_size,
                 torch_executed_ops=settings.torch_executed_ops,
+                require_full_compilation=settings.require_full_compilation,
+                skip_fusion=(num_supported_ops == total_ops),
             )
+
         except torch.fx.passes.splitter_base.FxNetSplitterInternalError:
             logger.error(
                 "Partitioning failed on the subgraph with fast partition. See trace above. "
-                + "Retrying with global partition.",
+                "Retrying with global partition.",
                 exc_info=True,
             )
 
             settings.use_fast_partitioner = False
 
     if not settings.use_fast_partitioner:
+        logger.info("Partitioning the graph via the global partitioner")
         new_partitioned_module, supported_ops = partitioning.global_partition(
             new_gm,
             verbose=settings.debug,
             min_block_size=settings.min_block_size,
             torch_executed_ops=settings.torch_executed_ops,
+            require_full_compilation=settings.require_full_compilation,
         )
 
+    # Done Partition
     if inline_module:
         # Preprocess the partitioned module to be in the same format as the inline module
         inline_torch_modules(new_partitioned_module)
@@ -495,6 +520,12 @@ def refit_module_weights(
             refitted_engine = torch.classes.tensorrt.Engine(tuple(new_engine_info))
             setattr(compiled_module, f"{name}_engine", refitted_engine)
 
+    # TODO: Memory control prototyping. Under discussion
+    if settings.offload_module_to_cpu:
+        del new_partitioned_module
+        gc.collect()
+        torch.cuda.empty_cache()
+
     if verify_output and arg_inputs is not None:
         if check_module_output(
             new_module=new_gm,
diff --git a/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py
@@ -6,6 +6,7 @@
 
 import numpy as np
 import torch
+import torch_tensorrt
 from torch_tensorrt._Device import Device
 from torch_tensorrt.dynamo import _defaults
 from torch_tensorrt.dynamo._compiler import compile as dynamo_compile
@@ -61,6 +62,7 @@ def __init__(
         *,
         device: Optional[Union[Device, torch.device, str]] = _defaults.DEVICE,
         use_python_runtime: bool = _defaults.USE_PYTHON_RUNTIME,
+        enable_cuda_graph: bool = True,
         immutable_weights: bool = False,
         strict: bool = True,
         allow_complex_guards_as_runtime_asserts: bool = False,
@@ -127,6 +129,7 @@ def __init__(
         self.arg_inputs: tuple[Any, ...] = tuple()
         self.kwarg_inputs: dict[str, Any] = {}
         self.additional_settings = kwargs
+        self.enable_cuda_graph = enable_cuda_graph
         self.strict = strict
         self.allow_complex_guards_as_runtime_asserts = (
             allow_complex_guards_as_runtime_asserts
@@ -142,7 +145,11 @@ def __init__(
         self.run_info: Optional[tuple[Any, ...]] = None
         self.state_dict_metadata: dict[str, torch.Size] = {}
         self._store_state_dict_metadata()
-
+        self.enable_weight_streaming = (
+            kwargs["enable_weight_streaming"]
+            if "enable_weight_streaming" in kwargs
+            else False
+        )
         cls = self.__class__
         self.__class__ = type(
             self.original_model.__class__.__name__,
@@ -193,7 +200,7 @@ def forward(a, b, c=0, d=0):
 
         self.refit_state.set_state(RefitFlag.NEEDS_RECOMPILE)
 
-    def _get_total_dynamic_shapes(self) -> Union[dict[str, Any], None]:
+    def _get_total_dynamic_shapes(self) -> dict[str, Any] | None:
         if not self.arg_dynamic_shapes and not self.kwarg_dynamic_shapes:
             return None
         total_dynamic_shape = {}
@@ -266,15 +273,17 @@ def refit_gm(self) -> None:
         MutableTorchTensorRTModule automatically catches weight value updates and call this function to refit the module.
         If it fails to catch the changes, please call this function manually to update the TRT graph module.
         """
-        self.original_model.to(to_torch_device(self.trt_device))
+
         if self.exp_program is None:
+            self.original_model.to(to_torch_device(self.trt_device))
             self.exp_program = self.get_exported_program()
         else:
             self.exp_program._state_dict = (
                 MutableTorchTensorRTModule._transform_state_dict(
                     self.original_model.state_dict()
                 )
             )
+            self.exp_program.module().to(to_torch_device(self.trt_device))
         self.gm = refit_module_weights(
             self.gm,
             self.exp_program,
@@ -284,7 +293,7 @@ def refit_gm(self) -> None:
             in_place=True,
         )
 
-        self.original_model.cpu()
+        self.original_model.to("cpu")
         torch.cuda.empty_cache()
 
     def get_exported_program(self) -> torch.export.ExportedProgram:
@@ -324,8 +333,15 @@ def compile(self) -> None:
             use_python_runtime=self.use_python_runtime,
             **self.additional_settings,
         )
-        self.original_model.cpu()
+        self.original_model.to("cpu")
         torch.cuda.empty_cache()
+        # torch_tensorrt.runtime.set_cudagraphs_mode(self.enable_cuda_graph)
+        # if self.enable_cuda_graph:
+        #     self.gm = torch_tensorrt.runtime.enable_cudagraphs(self.gm)
+        if self.enable_weight_streaming:
+            self.weight_streaming_ctx = torch_tensorrt.runtime.weight_streaming(self.gm)
+            requested_budget = int(16 * 2 << 20)
+            self.weight_streaming_ctx.device_budget = requested_budget
 
     def _validate_inputs(self, *args: Any, **kwargs: Any) -> None:
 
@@ -446,14 +462,21 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
                 self._store_state_dict_metadata()
             self.refit_state.set_state(RefitFlag.LIVE)
 
+        # weight_streaming_ctx = self.weight_streaming_ctx if self.enable_weight_streaming else None
         result = self.gm(*args, **kwargs)
         # Storing inputs and outputs for verification when the state is unknown
         self.run_info = (args, kwargs, result)
         return result
 
-    def to(self, device: str) -> None:
-        logger.warning("Original PyTorch model is moved. CPU offload may failed.")
-        self.original_model.to(device)
+    def to(self, *args: Any, **kwargs: Any) -> None:  # intentionally a no-op: only warns, never moves the model
+        logger.warning(
+            "Trying to move the original PyTorch model. This will cause CPU offloading to fail and increase GPU memory usage. "
+            "If this is absolutely necessary, please call module.pytorch_model.to(...)"
+        )
+
+    @property
+    def device(self) -> torch.device:
+        return to_torch_device(self.trt_device)
 
     def __deepcopy__(self, memo: Any) -> Any:
         cls = self.__class__