See the full AOTInductor tutorial example at https://github.com/pytorch/pytorch/blob/main/docs/source/torch.compiler_aot_inductor.rst

In [None]:
"""
Install the latest PyTorch Build.
ETA: 1 minute
"""
# resolve dependency conflict on colab and may not be necessary on local environement

!pip uninstall torch -y

!pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121

Found existing installation: torch 2.4.0.dev20240426+cu121
Uninstalling torch-2.4.0.dev20240426+cu121:
  Successfully uninstalled torch-2.4.0.dev20240426+cu121
Looking in indexes: https://download.pytorch.org/whl/nightly/cu121
Collecting torch
  Using cached https://download.pytorch.org/whl/nightly/cu121/torch-2.4.0.dev20240426%2Bcu121-cp310-cp310-linux_x86_64.whl (795.6 MB)
Installing collected packages: torch
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastai 2.7.14 requires torch<2.3,>=1.10, but you have torch 2.4.0.dev20240426+cu121 which is incompatible.
torchaudio 2.2.1+cu121 requires torch==2.2.1, but you have torch 2.4.0.dev20240426+cu121 which is incompatible.
torchtext 0.17.1 requires torch==2.2.1, but you have torch 2.4.0.dev20240426+cu121 which is incompatible.
torchvision 0.17.1+cu121 requires torch==2.2.1, but you have torch 2.4.0.dev2024

In [None]:
"""
More details on the torch._export.aot_compile API at https://github.com/pytorch/pytorch/blob/cd06c73cbd398811efc4afe85ee29dee64ebfd45/torch/_export/__init__.py#L320
"""

import os
import torch

class Model(torch.nn.Module):
    """Tiny two-layer perceptron used as the AOTInductor compilation example.

    Pipeline: Linear(10 -> 16) -> ReLU -> Linear(16 -> 1) -> Sigmoid.
    """

    def __init__(self):
        """Create the two linear layers and their activation modules."""
        super().__init__()
        self.fc1 = torch.nn.Linear(in_features=10, out_features=16)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(in_features=16, out_features=1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        """Run the network on ``x`` and return the sigmoid-activated output.

        The parameter must stay named ``x``: the ``dynamic_shapes`` spec passed
        to the compiler elsewhere in this notebook refers to it by that name.
        """
        hidden = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(hidden))

# Prefer the GPU when one is available; otherwise compile for and run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Gradients are not needed: the model is only compiled and used for inference.
with torch.no_grad():
    model = Model().to(device=device)
    example_inputs = (torch.randn(8, 10, device=device),)

    # Mark dimension 0 of the input `x` as dynamic, ranging from 1 to 1024.
    batch_dim = torch.export.Dim("batch", min=1, max=1024)
    dynamic_shapes = {"x": {0: batch_dim}}

    # Write the generated shared library to model.so in the current working directory.
    compile_options = {
        "aot_inductor.output_path": os.path.join(os.getcwd(), "model.so"),
    }

    so_path = torch._export.aot_compile(
        model,
        example_inputs,
        dynamic_shapes=dynamic_shapes,
        options=compile_options,
    )
    print(so_path)

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


/content/model.so


The AOTInductor tutorial at https://github.com/pytorch/pytorch/blob/main/docs/source/torch.compiler_aot_inductor.rst shows how to build the following C++ inference example with CMake and run it. In this demo we will not compile or run the C++ code; instead, we will use a pybind-ed runner to load the generated model.so back into Python.

```
#include <iostream>
#include <vector>

#include <torch/torch.h>
#include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h>

int main() {
    // Keep inference mode enabled for the lifetime of main().
    c10::InferenceMode inference_guard;

    // Load the AOTInductor-generated shared library.
    torch::inductor::AOTIModelContainerRunnerCuda runner("model.so");

    // First inference: a batch of 8 samples with 10 features each, on CUDA.
    std::vector<torch::Tensor> first_inputs{torch::randn({8, 10}, at::kCUDA)};
    std::vector<torch::Tensor> first_outputs = runner.run(first_inputs);
    std::cout << "Result from the first inference:" << std::endl;
    std::cout << first_outputs[0] << std::endl;

    // Second inference: a different batch size (2). This works because the
    // batch dimension was declared dynamic when model.so was compiled.
    std::cout << "Result from the second inference:" << std::endl;
    std::vector<torch::Tensor> second_inputs{torch::randn({2, 10}, at::kCUDA)};
    std::cout << runner.run(second_inputs)[0] << std::endl;

    return 0;
}
```



In [None]:
# Load the generated model.so back into Python with the runner utility.
aot_compiled = torch._export.aot_load(
    os.path.join(os.getcwd(), "model.so"),
    device=device,
)

# Run one inference with the same batch size (8) as the example input used at compile time.
input1 = torch.randn(8, 10, device=device)
output1 = aot_compiled(input1)
print(output1)

tensor([[0.6255],
        [0.5078],
        [0.5407],
        [0.5807],
        [0.5801],
        [0.6062],
        [0.5439],
        [0.6316]])


In [None]:
# Because the model was compiled with a dynamic batch size, we can run prediction
# with a different batch size (20 here, versus 8 in the example input).
input2 = torch.randn(20, 10, device=device)
output2 = aot_compiled(input2)
print(output2)

tensor([[0.5668],
        [0.6103],
        [0.5271],
        [0.5384],
        [0.5847],
        [0.5622],
        [0.5398],
        [0.5522],
        [0.4717],
        [0.5272],
        [0.5258],
        [0.5603],
        [0.5213],
        [0.3987],
        [0.4904],
        [0.6111],
        [0.5210],
        [0.4858],
        [0.5139],
        [0.5684]])


In [None]:
# Turn on the `output_code` log artifact so the code generated during
# compilation is printed to the log.
torch._logging.set_logs(output_code=True)

# Re-run the same compilation as before; this time the generated code is logged.
with torch.no_grad():
    model = Model().to(device=device)
    example_inputs = (torch.randn(8, 10, device=device),)

    # Mark dimension 0 of the input `x` as dynamic, ranging from 1 to 1024.
    batch_dim = torch.export.Dim("batch", min=1, max=1024)
    dynamic_shapes = {"x": {0: batch_dim}}

    # Write the generated shared library to model.so in the current working directory.
    compile_options = {
        "aot_inductor.output_path": os.path.join(os.getcwd(), "model.so"),
    }

    so_path = torch._export.aot_compile(
        model,
        example_inputs,
        dynamic_shapes=dynamic_shapes,
        options=compile_options,
    )

V0426 21:16:11.469000 139836882890752 torch/_inductor/graph.py:1601] [__output_code] Output code: 
V0426 21:16:11.469000 139836882890752 torch/_inductor/graph.py:1601] [__output_code] #include <torch/csrc/inductor/aoti_runtime/arrayref_tensor.h>
V0426 21:16:11.469000 139836882890752 torch/_inductor/graph.py:1601] [__output_code] #include <torch/csrc/inductor/aoti_runtime/interface.h>
V0426 21:16:11.469000 139836882890752 torch/_inductor/graph.py:1601] [__output_code] #include <torch/csrc/inductor/aoti_runtime/model_container.h>
V0426 21:16:11.469000 139836882890752 torch/_inductor/graph.py:1601] [__output_code] #include <torch/csrc/inductor/aoti_runtime/scalar_to_tensor.h>
V0426 21:16:11.469000 139836882890752 torch/_inductor/graph.py:1601] [__output_code] #include <torch/csrc/inductor/aoti_runtime/thread_local.h>
V0426 21:16:11.469000 139836882890752 torch/_inductor/graph.py:1601] [__output_code] 
V0426 21:16:11.469000 139836882890752 torch/_inductor/graph.py:1601] [__output_code] #in