adding the test script and correction to the backend

apbose · apbose · commit 091c83f00a07 · 2025-02-28T11:22:09.000-08:00
diff --git a/py/torch_tensorrt/dynamo/backend/backends.py b/py/torch_tensorrt/dynamo/backend/backends.py
@@ -69,13 +69,12 @@ def aot_torch_tensorrt_aten_backend(
     to_delete = {
         key
         for key in settings_aot_autograd["decompositions"]
-        if "transpose" in key._name
+        if "transpose" in key._name or "detach" in key._name
     }
 
     for key in to_delete:
         del settings_aot_autograd["decompositions"][key]
 
-    remove_detach(gm, settings)
     return aot_autograd(
         fw_compiler=_pretraced_backend_autograd,
         decompositions=settings_aot_autograd["decompositions"],
diff --git a/tests/py/dynamo/distributed/distributed_utils.py b/tests/py/dynamo/distributed/distributed_utils.py
@@ -0,0 +1,74 @@
+import logging
+import os
+
+import numpy as np
+import tensorrt as trt
+import torch
+import torch.distributed as dist
+from torch.distributed._tensor.device_mesh import init_device_mesh
+
+
+def set_environment_variables_pytest():
+    """Export the environment for a single-process (world size 1) pytest run.
+
+    Writes ``WORLD_SIZE``, ``RANK``, ``MASTER_ADDR``, ``MASTER_PORT`` and
+    ``USE_TRTLLM_PLUGINS`` into ``os.environ``, replacing any existing values.
+    """
+    single_process_env = {
+        "WORLD_SIZE": "1",
+        "RANK": "0",
+        "MASTER_ADDR": "127.0.0.1",
+        "MASTER_PORT": "29500",
+        "USE_TRTLLM_PLUGINS": "1",
+    }
+    os.environ.update(single_process_env)
+
+
+def find_repo_root(max_depth=10):
+    """Return the nearest ancestor directory of this file holding ``MODULE.bazel``.
+
+    The search starts in the directory that contains this file (symlinks
+    resolved) and moves up one directory at a time.
+
+    Args:
+        max_depth: Maximum number of directories to inspect, counting this
+            file's own directory as the first.
+
+    Returns:
+        Path of the first inspected directory that contains ``MODULE.bazel``.
+
+    Raises:
+        RuntimeError: If none of the inspected directories contains
+            ``MODULE.bazel``.
+    """
+    dir_path = os.path.dirname(os.path.realpath(__file__))
+    for _ in range(max_depth):
+        # Probe for the single marker file instead of listing the whole directory.
+        if os.path.lexists(os.path.join(dir_path, "MODULE.bazel")):
+            return dir_path
+        parent = os.path.dirname(dir_path)
+        if parent == dir_path:
+            # Filesystem root reached: climbing further would re-check the same path.
+            break
+        dir_path = parent
+
+    raise RuntimeError("Could not find repo root")
+
+
+def initialize_logger(rank, logger_file_name):
+    """Attach a per-rank file handler to the root logger and return that logger.
+
+    Args:
+        rank: Value appended to the log file name.
+        logger_file_name: Path prefix; output goes to
+            ``<logger_file_name>_<rank>.log``, truncated on open (mode ``"w"``).
+
+    Returns:
+        The root logger, with its level set to ``logging.INFO``.
+    """
+    root_logger = logging.getLogger()
+    root_logger.setLevel(logging.INFO)
+
+    log_path = logger_file_name + f"_{rank}.log"
+    file_handler = logging.FileHandler(log_path, mode="w")
+    file_handler.setLevel(logging.INFO)
+
+    root_logger.addHandler(file_handler)
+    return root_logger
+
+
+# This is required for env initialization since we use mpirun
+def initialize_distributed_env(logger_file_name, rank=0, world_size=1, port=29500):
+    """Initialise the NCCL process group, device mesh and per-rank logger.
+
+    Values exported by Open MPI (``OMPI_COMM_WORLD_LOCAL_RANK`` and
+    ``OMPI_COMM_WORLD_SIZE``) take precedence over the ``rank`` and
+    ``world_size`` arguments.
+
+    Args:
+        logger_file_name: Path prefix forwarded to ``initialize_logger``.
+        rank: Fallback rank used when ``OMPI_COMM_WORLD_LOCAL_RANK`` is unset;
+            it is reduced modulo the number of visible CUDA devices.
+        world_size: Fallback world size used when ``OMPI_COMM_WORLD_SIZE`` is
+            unset.
+        port: Value written to ``MASTER_PORT``.
+
+    Returns:
+        Tuple ``(device_mesh, world_size, rank, logger)``.
+
+    Raises:
+        RuntimeError: From ``find_repo_root`` when ``TRTLLM_PLUGINS_PATH`` is
+            not already set and no repository root can be located.
+    """
+    env_local_rank = os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK")
+    if env_local_rank is not None:
+        local_rank = int(env_local_rank)
+    else:
+        # Only query CUDA when the fallback is needed, so a launcher-provided
+        # rank never trips over a modulo by zero devices.
+        local_rank = int(rank % torch.cuda.device_count())
+    world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", world_size))
+
+    # Set up environment variable to run with mpirun
+    # NOTE(review): RANK is taken from the *local* rank, which presumably
+    # limits this helper to single-node launches - confirm before multi-node use.
+    os.environ["RANK"] = str(local_rank)
+    os.environ["WORLD_SIZE"] = str(world_size)
+    os.environ["MASTER_ADDR"] = "127.0.0.1"
+    os.environ["MASTER_PORT"] = str(port)
+    # Respect a plugin path exported by the launcher; fall back to the in-repo
+    # library location only when nothing was provided.
+    if "TRTLLM_PLUGINS_PATH" not in os.environ:
+        os.environ["TRTLLM_PLUGINS_PATH"] = os.path.join(
+            find_repo_root(), "lib", "libnvinfer_plugin_tensorrt_llm.so"
+        )
+
+    # Necessary to assign a device to each rank.
+    torch.cuda.set_device(local_rank)
+
+    # We use nccl backend
+    dist.init_process_group("nccl")
+
+    # set a manual seed for reproducibility
+    torch.manual_seed(1111)
+
+    device_mesh = init_device_mesh(device_type="cuda", mesh_shape=(world_size,))
+    rank = device_mesh.get_rank()
+    assert rank == local_rank, f"mesh rank {rank} != local rank {local_rank}"
+    logger = initialize_logger(rank, logger_file_name)
+    device_id = (
+        rank % torch.cuda.device_count()
+    )  # Ensure each rank gets a unique device
+    torch.cuda.set_device(device_id)
+
+    return device_mesh, world_size, rank, logger
diff --git a/tests/py/dynamo/distributed/test_distributed_simple_example.py b/tests/py/dynamo/distributed/test_distributed_simple_example.py
@@ -0,0 +1,92 @@
+import time
+
+import tensorrt as trt
+import torch
+import torch.nn as nn
+import torch_tensorrt
+from distributed_utils import initialize_distributed_env
+from torch.distributed._tensor import Shard
+from torch.distributed.tensor.parallel import (
+    ColwiseParallel,
+    RowwiseParallel,
+    parallelize_module,
+)
+
+# Runs at import time: initialises the process group and device mesh, and
+# opens the per-rank log file whose name starts with the prefix given here.
+device_mesh, _world_size, _rank, logger = initialize_distributed_env(
+    "./tensor_parallel_simple_example"
+)
+
+"""
+This example copies some code from https://github.com/pytorch/examples/blob/main/distributed/tensor_parallelism/tensor_parallel_example.py
+"""
+
+
+class ToyModel(nn.Module):
+    """Four-layer MLP (10 -> 3200 -> 1600 -> 500 -> 100) with ReLU activations."""
+
+    def __init__(self):
+        super().__init__()
+        self.in_proj = nn.Linear(10, 3200)
+        self.relu = nn.ReLU()
+        self.out_proj = nn.Linear(3200, 1600)
+        self.in_proj2 = nn.Linear(1600, 500)
+        self.out_proj2 = nn.Linear(500, 100)
+
+    def forward(self, x):
+        """Run both projection pairs, applying ReLU after every layer but the last."""
+        hidden = self.relu(self.in_proj(x))
+        hidden = self.relu(self.out_proj(hidden))
+        hidden = self.relu(self.in_proj2(hidden))
+        return self.out_proj2(hidden)
+
+
+logger.info(f"Starting PyTorch TP example on rank {_rank}.")
+
+# Create model and move it to GPU - init_device_mesh has already mapped GPU ids.
+tp_model = ToyModel().to("cuda")
+
+
+# Custom parallelization plan for the model
+tp_model = parallelize_module(
+    module=tp_model,
+    device_mesh=device_mesh,
+    parallelize_plan={
+        "in_proj": ColwiseParallel(input_layouts=Shard(0)),
+        "out_proj": RowwiseParallel(output_layouts=Shard(0)),
+        "in_proj2": ColwiseParallel(input_layouts=Shard(0)),
+        "out_proj2": RowwiseParallel(output_layouts=Shard(0)),
+    },
+)
+# Reference output from the uncompiled model. Seed 0 is reused for the first
+# loop iteration below, so both runs see the same input.
+torch.manual_seed(0)
+inp = torch.rand(20, 10, device="cuda")
+python_result = tp_model(inp)
+
+
+# Compile the parallelized model with the Torch-TensorRT backend.
+backend = "torch_tensorrt"
+tp_model = torch.compile(
+    tp_model,
+    backend=backend,
+    options={
+        "truncate_long_and_double": True,
+        "enabled_precisions": {torch.float32, torch.float16},
+        "use_python_runtime": True,
+        "min_block_size": 1,
+        "use_aot_joint_export": False,
+    },
+    dynamic=False,
+)
+
+for i in range(10):
+    # For TP, input needs to be same across all TP ranks.
+    # Setting the random seed is to mimic the behavior of dataloader.
+    torch.manual_seed(i)
+    inp = torch.rand(20, 10, device="cuda")
+    start = time.time()
+    output = tp_model(inp)
+    end = time.time()
+    if i == 0:
+        # The first call is the one logged as compilation time, and the only
+        # one whose output is compared with the uncompiled reference result.
+        logger.info(f"Compilation time is {end-start}")
+        assert (
+            python_result - output
+        ).std() < 0.01, "Compilation result is not correct."
+    elif _rank == 0:
+        # Later iterations are logged as inference time, on rank 0 only.
+        logger.info(f"Inference time is {end-start}")
diff --git a/tests/py/dynamo/distributed/test_nccl_ops.py b/tests/py/dynamo/distributed/test_nccl_ops.py
@@ -3,19 +3,11 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
+from distributed_utils import set_environment_variables_pytest
 from parameterized import parameterized
 from torch.testing._internal.common_utils import run_tests
 
-
-def set_environment_variables():
-    os.environ["WORLD_SIZE"] = str(1)
-    os.environ["RANK"] = str(0)
-    os.environ["MASTER_ADDR"] = "127.0.0.1"
-    os.environ["MASTER_PORT"] = str(29500)
-    os.environ["USE_TRTLLM_PLUGINS"] = "1"
-
-
-set_environment_variables()
+set_environment_variables_pytest()
 dist.init_process_group(backend="nccl", init_method="env://")
 group = dist.new_group(ranks=[0])
 group_name = group.group_name
diff --git a/tests/py/dynamo/distributed/test_nccl_ops.sh b/tests/py/dynamo/distributed/test_nccl_ops.sh
@@ -0,0 +1,137 @@
+#!/bin/bash
+
+# Succeed (exit status 0) when the given command name resolves in the current
+# shell; all output of the lookup is discarded.
+check_command() {
+    local cmd_name="$1"
+    command -v "$cmd_name" &>/dev/null
+}
+
+# Install a command-line tool with apt-get when it is not already on PATH.
+# Arguments:
+#   $1 - name used both as the command to look for and as the apt package name.
+# Exits the script with status 1 when the tool is missing and cannot be
+# installed (non-Linux OS, no apt-get, or a failed apt-get run).
+ensure_installed() {
+    local pkg="$1"
+    if check_command "$pkg"; then
+        echo "$pkg is already installed."
+        return 0
+    fi
+
+    echo "$pkg is not installed. Installing $pkg..."
+
+    # Detect OS and install accordingly
+    local os_name
+    os_name="$(uname -s)"
+    if [[ "$os_name" != "Linux" ]]; then
+        echo "Unsupported OS: $os_name. Please install $pkg manually."
+        exit 1
+    fi
+    # Without this check a missing apt-get made the function a silent no-op.
+    if ! check_command apt-get; then
+        echo "apt-get not found. Please install $pkg manually."
+        exit 1
+    fi
+
+    # Determine if sudo is needed
+    local sudo_cmd=""
+    if check_command sudo; then
+        sudo_cmd="sudo"
+    fi
+
+    if ! { $sudo_cmd apt-get update && $sudo_cmd apt-get install -y "$pkg"; }; then
+        echo "Failed to install $pkg."
+        exit 1
+    fi
+}
+
+# Install a Debian package with apt-get unless dpkg reports it as installed.
+# Arguments:
+#   $1 - exact Debian package name (e.g. libopenmpi-dev).
+# Exits the script with status 1 when the package is missing and cannot be
+# installed (non-Linux OS, no apt-get, or a failed apt-get run).
+ensure_mpi_installed() {
+    local pkg="$1"
+    # Ask dpkg about this exact package and require the fully-installed state;
+    # grepping `dpkg -l` also matched substrings and removed packages.
+    if dpkg-query -W -f='${Status}' "$pkg" 2>/dev/null | grep -q "install ok installed"; then
+        echo "$pkg is already installed."
+        return 0
+    fi
+
+    echo "$pkg is not installed. Installing $pkg..."
+
+    # Detect OS and install accordingly
+    local os_name
+    os_name="$(uname -s)"
+    if [[ "$os_name" != "Linux" ]]; then
+        echo "Unsupported OS: $os_name. Please install $pkg manually."
+        exit 1
+    fi
+    if ! command -v apt-get >/dev/null 2>&1; then
+        echo "apt-get not found. Please install $pkg manually."
+        exit 1
+    fi
+
+    # Determine if sudo is needed
+    local sudo_cmd=""
+    if command -v sudo >/dev/null 2>&1; then
+        sudo_cmd="sudo"
+    fi
+
+    if ! { $sudo_cmd apt-get update && $sudo_cmd apt-get install -y "$pkg"; }; then
+        echo "Failed to install $pkg."
+        exit 1
+    fi
+}
+
+# Install pytest with pip; exits the script with status 1 when pip is not on PATH.
+ensure_pytest_installed(){
+    if ! check_command pip; then
+        echo "pip is not installed. Please install pip first."
+        exit 1
+    fi
+
+    echo "pip is installed, installing pytest..."
+    pip install pytest
+}
+
+echo "Setting up the environment"
+
+OS="$(uname -s)"
+ARCH="$(uname -m)"
+PYTHON_VERSION="$(python3 -c 'import sys; print(f"cp{sys.version_info.major}{sys.version_info.minor}")')"
+
+
+# Pick the TensorRT-LLM wheel for this platform. Only Linux on x86_64/aarch64
+# with CPython 3.10 or 3.12 is handled; anything else aborts the script.
+if [[ "$OS" == "Linux" && "$ARCH" == "x86_64" && "$PYTHON_VERSION" == "cp312" ]]; then
+    FILE="tensorrt_llm-0.17.0.post1-cp312-cp312-linux_x86_64.whl"
+elif [[ "$OS" == "Linux" && "$ARCH" == "aarch64" && "$PYTHON_VERSION" == "cp312" ]]; then
+    FILE="tensorrt_llm-0.17.0.post1-cp312-cp312-linux_aarch64.whl"
+elif [[ "$OS" == "Linux" && "$ARCH" == "x86_64" && "$PYTHON_VERSION" == "cp310" ]]; then
+    FILE="tensorrt_llm-0.17.0.post1-cp310-cp310-linux_x86_64.whl"
+elif [[ "$OS" == "Linux" && "$ARCH" == "aarch64" && "$PYTHON_VERSION" == "cp310" ]]; then
+    FILE="tensorrt_llm-0.17.0.post1-cp310-cp310-linux_aarch64.whl"
+else
+    echo "Unsupported platform: OS=$OS ARCH=$ARCH PYTHON=$PYTHON_VERSION"
+    exit 1
+fi
+
+# Download the selected file (skipped when a previous run already fetched it)
+URL="https://pypi.nvidia.com/tensorrt-llm/$FILE"
+if [[ -f "$FILE" ]]; then
+    echo "$FILE already present, skipping download"
+else
+    echo "Downloading $FILE from $URL..."
+    #Installing wget
+    ensure_installed wget
+    #Downloading the package
+    if ! wget "$URL"; then
+        echo "Failed to download $URL"
+        # Drop any partial file so the next run does not mistake it for a full wheel.
+        rm -f "$FILE"
+        exit 1
+    fi
+    echo "Download complete: $FILE"
+fi
+
+UNZIP_DIR="tensorrt_llm_unzip"
+if [[ ! -d "$UNZIP_DIR" ]]; then
+    echo "Creating directory: $UNZIP_DIR"
+    mkdir -p "$UNZIP_DIR"
+    echo "extracting $FILE to $UNZIP_DIR ..."
+    #Installing unzip
+    ensure_installed unzip
+    #unzip the TensorRT-LLM package
+    if ! unzip -q "$FILE" -d "$UNZIP_DIR"; then
+        echo "Failed to extract $FILE"
+        # Remove the directory created above so a rerun retries the extraction.
+        rm -rf "$UNZIP_DIR"
+        exit 1
+    fi
+    echo "Unzip complete"
+fi
+
+
+export TRTLLM_PLUGINS_PATH="$(pwd)/${UNZIP_DIR}/tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so"
+echo "${TRTLLM_PLUGINS_PATH}"
+
+ensure_mpi_installed libmpich-dev
+ensure_mpi_installed libopenmpi-dev
+
+# Run the NCCL op tests with pytest from the parent of the current directory.
+# Changes the shell's working directory to that parent and leaves it there.
+# Returns pytest's exit status, or 1 if the directory change fails.
+run_tests() {
+    cd .. || return 1
+    export PYTHONPATH="$(pwd)"  # Set PYTHONPATH to the current directory
+    echo "Running pytest on distributed/test_nccl_ops.py..."
+    pytest distributed/test_nccl_ops.py
+}
+
+# Run the tensor-parallel example under mpirun with a single process.
+# Expects a `distributed` directory below the current working directory.
+# Returns mpirun's exit status, or 1 if the directory change fails.
+run_mpi_tests(){
+    cd distributed || return 1
+    echo "Running test_distributed_simple_example with mpirun..."
+    mpirun -n 1 --allow-run-as-root python test_distributed_simple_example.py
+}
+
+ensure_pytest_installed
+
+# Run both suites even if the first fails, but report failure of either one
+# through the script's exit status instead of only the last command's.
+status=0
+run_tests || status=1
+run_mpi_tests || status=1
+exit "$status"