
Commit 771e9d2

Committed Mar 21, 2025
Option 1 further modification - recommended set of options in tests/examples, including the warning in the backend and destroying processes while exiting
1 parent f43d17e commit 771e9d2
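The "recommended set of options" mentioned in the commit message amounts to a single torch.compile call with use_distributed_mode_trace enabled. A minimal sketch distilled from the diffs below (tp_model stands for the already-sharded tensor-parallel module built earlier in the example):

import torch

compiled_tp_model = torch.compile(
    tp_model,  # assumed: an already-sharded tensor-parallel nn.Module
    backend="torch_tensorrt",
    options={
        "truncate_long_and_double": True,
        "enabled_precisions": {torch.float32, torch.float16},
        "use_python_runtime": True,
        "min_block_size": 1,
        # Recommended whenever the inputs are distributed (DTensor) tensors.
        "use_distributed_mode_trace": True,
    },
    dynamic=None,
)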

File tree

3 files changed: +73 -91 lines changed


‎examples/distributed_inference/tensor_parallel_simple_example.py

+34 -45
@@ -2,6 +2,7 @@
 
 import tensorrt as trt
 import torch
+import torch.distributed as dist
 import torch.nn as nn
 import torch_tensorrt
 from tensor_parallel_initialize_dist import initialize_distributed_env
@@ -21,35 +22,6 @@
 """
 
 
-def compile_tp_model(tp_model, backend):
-    compile_options = {
-        "truncate_long_and_double": True,
-        "enabled_precisions": {torch.float32, torch.float16},
-        "use_python_runtime": True,
-        "min_block_size": 1,
-    }
-
-    try:
-        return torch.compile(
-            tp_model, backend=backend, options=compile_options, dynamic=None
-        )
-    except RuntimeError as e:
-        if (
-            "aot_export is not currently supported with traceable tensor subclass"
-            in str(e)
-        ):
-            logger.warning(
-                "It is recommended to run the model with use_distributed_mode_trace=True. Running with that option"
-            )
-            compile_options["use_distributed_mode_trace"] = True
-            return torch.compile(
-                tp_model, backend=backend, options=compile_options, dynamic=None
-            )
-        else:
-            logger.debug("The distributed model fails with the following error")
-            raise
-
-
 class ToyModel(nn.Module):
     """MLP based model"""
 
@@ -93,20 +65,37 @@ def forward(self, x):
 inp = torch.rand(20, 10, device="cuda")
 python_result = tp_model(inp)
 
-compile_tp_model(tp_model, backend="torch_tensorrt")
+backend = "torch_tensorrt"
+tp_model = torch.compile(
+    tp_model,
+    backend=backend,
+    options={
+        "truncate_long_and_double": True,
+        "enabled_precisions": {torch.float32, torch.float16},
+        "use_python_runtime": True,
+        "min_block_size": 1,
+        "use_distributed_mode_trace": True,
+    },
+    dynamic=None,
+)
 
-for i in range(10):
-    # For TP, input needs to be same across all TP ranks.
-    # Setting the random seed is to mimic the behavior of dataloader.
-    torch.manual_seed(i)
-    inp = torch.rand(20, 10, device="cuda")
-    start = time.time()
-    output = tp_model(inp)
-    end = time.time()
-    if i == 0:
-        logger.info(f"Compilation time is {end-start}")
-        assert (
-            python_result - output
-        ).std() < 0.01, "Compilation result is not correct."
-    elif _rank == 0:
-        logger.info(f"Inference time is {end-start}")
+try:
+    for i in range(10):
+        # For TP, input needs to be same across all TP ranks.
+        # Setting the random seed is to mimic the behavior of dataloader.
+        torch.manual_seed(i)
+        inp = torch.rand(20, 10, device="cuda")
+        start = time.time()
+        output = tp_model(inp)
+        end = time.time()
+        if i == 0:
+            logger.info(f"Compilation time is {end-start}")
+            assert (
+                python_result - output
+            ).std() < 0.01, "Compilation result is not correct."
+        elif _rank == 0:
+            logger.info(f"Inference time is {end-start}")
+finally:
+    # This cleans up the distributed process group
+    if dist.is_initialized():
+        dist.destroy_process_group()
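The "destroying processes while exiting" part of the commit is the try/finally wrapper above. As a standalone teardown idiom (assuming the process group was initialized earlier, e.g. by initialize_distributed_env; run_inference is a placeholder for the compile-and-infer loop):

import torch.distributed as dist

try:
    run_inference()  # placeholder for the compilation/inference loop shown above
finally:
    # Destroy the process group even if compilation or inference raised,
    # so worker processes can exit cleanly instead of hanging at shutdown.
    if dist.is_initialized():
        dist.destroy_process_group()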

‎py/torch_tensorrt/dynamo/backend/backends.py

+5
@@ -10,6 +10,7 @@
 from torch._dynamo.backends.common import aot_autograd
 from torch._dynamo.utils import detect_fake_mode
 from torch._functorch.aot_autograd import aot_export_joint_simple
+from torch.distributed.tensor import DTensor
 from torch_tensorrt.dynamo import CompilationSettings
 from torch_tensorrt.dynamo._compiler import compile_module
 from torch_tensorrt.dynamo.lowering import (
@@ -79,6 +80,10 @@ def aot_torch_tensorrt_aten_backend(
             fw_compiler=_pretraced_backend_autograd,
             decompositions=settings_aot_autograd["decompositions"],
         )(gm, sample_inputs)
+    if any(isinstance(tensor, DTensor) for tensor in sample_inputs):
+        logger.warning(
+            "It is recommended to run the model with use_distributed_mode_trace = True since there are distributed tensors in the input which is not supported aot_export_joint_simple"
+        )
     return _pretraced_backend(gm, sample_inputs, settings, engine_cache)
 
 
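For context, the new guard in the backend only inspects the sample inputs for DTensors before falling through to the aot_export_joint_simple path, and emits a warning if any are found. A self-contained sketch of that check (the helper name and logger setup here are illustrative, not part of the actual module):

import logging
from typing import Any, Sequence

from torch.distributed.tensor import DTensor

logger = logging.getLogger(__name__)


def has_dtensor_inputs(sample_inputs: Sequence[Any]) -> bool:
    # True if any input is a DTensor, i.e. a sharded/distributed tensor.
    return any(isinstance(tensor, DTensor) for tensor in sample_inputs)


# Usage inside a backend entry point:
# if has_dtensor_inputs(sample_inputs):
#     logger.warning(
#         "It is recommended to run the model with use_distributed_mode_trace=True "
#         "since the inputs contain distributed tensors."
#     )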

‎tests/py/dynamo/distributed/test_distributed_simple_example.py

+34 -46
@@ -2,6 +2,7 @@
 
 import tensorrt as trt
 import torch
+import torch.distributed as dist
 import torch.nn as nn
 import torch_tensorrt
 from distributed_utils import initialize_distributed_env
@@ -16,36 +17,6 @@
     "./tensor_parallel_simple_example"
 )
 
-
-def compile_tp_model(tp_model, backend):
-    compile_options = {
-        "truncate_long_and_double": True,
-        "enabled_precisions": {torch.float32, torch.float16},
-        "use_python_runtime": True,
-        "min_block_size": 1,
-    }
-
-    try:
-        return torch.compile(
-            tp_model, backend=backend, options=compile_options, dynamic=None
-        )
-    except RuntimeError as e:
-        if (
-            "aot_export is not currently supported with traceable tensor subclass"
-            in str(e)
-        ):
-            logger.warning(
-                "It is recommended to run the model with use_distributed_mode_trace=True. Running with that option"
-            )
-            compile_options["use_distributed_mode_trace"] = True
-            return torch.compile(
-                tp_model, backend=backend, options=compile_options, dynamic=None
-            )
-        else:
-            logger.debug("The distributed model fails with the following error")
-            raise
-
-
 """
 This example copies some code from https://github.com/pytorch/examples/blob/main/distributed/tensor_parallelism/tensor_parallel_example.py
 """
@@ -90,20 +61,37 @@ def forward(self, x):
 inp = torch.rand(20, 10, device="cuda")
 python_result = tp_model(inp)
 
-compile_tp_model(tp_model, backend="torch_tensorrt")
+backend = "torch_tensorrt"
+tp_model = torch.compile(
+    tp_model,
+    backend=backend,
+    options={
+        "truncate_long_and_double": True,
+        "enabled_precisions": {torch.float32, torch.float16},
+        "use_python_runtime": True,
+        "min_block_size": 1,
+        "use_distributed_mode_trace": True,
+    },
+    dynamic=None,
+)
 
-for i in range(10):
-    # For TP, input needs to be same across all TP ranks.
-    # Setting the random seed is to mimic the behavior of dataloader.
-    torch.manual_seed(i)
-    inp = torch.rand(20, 10, device="cuda")
-    start = time.time()
-    output = tp_model(inp)
-    end = time.time()
-    if i == 0:
-        logger.info(f"Compilation time is {end-start}")
-        assert (
-            python_result - output
-        ).std() < 0.01, "Compilation result is not correct."
-    elif _rank == 0:
-        logger.info(f"Inference time is {end-start}")
+try:
+    for i in range(10):
+        # For TP, input needs to be same across all TP ranks.
+        # Setting the random seed is to mimic the behavior of dataloader.
+        torch.manual_seed(i)
+        inp = torch.rand(20, 10, device="cuda")
+        start = time.time()
+        output = tp_model(inp)
+        end = time.time()
+        if i == 0:
+            logger.info(f"Compilation time is {end-start}")
+            assert (
+                python_result - output
+            ).std() < 0.01, "Compilation result is not correct."
+        elif _rank == 0:
+            logger.info(f"Inference time is {end-start}")
+finally:
+    # This cleans up the distributed process group
+    if dist.is_initialized():
+        dist.destroy_process_group()

0 commit comments
