In [1]:
import torch, time, gc

# Timing utilities
start_time = None

def start_timer():
    global start_time
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_max_memory_allocated()
    torch.cuda.synchronize()
    start_time = time.time()

def end_timer_and_print(local_msg):
    torch.cuda.synchronize()
    end_time = time.time()
    print("\n" + local_msg)
    print("Total execution time = {:.3f} sec".format(end_time - start_time))
    print("Max memory used by tensors = {} bytes".format(torch.cuda.max_memory_allocated()))

In [2]:
def make_model(in_size, out_size, num_layers):
    layers = []
    for _ in range(num_layers - 1):
        layers.append(torch.nn.Linear(in_size, in_size))
        layers.append(torch.nn.ReLU())
    layers.append(torch.nn.Linear(in_size, out_size))
    return torch.nn.Sequential(*tuple(layers)).cuda()

In [3]:
batch_size = 512 # Try, for example, 128, 256, 513.
in_size = 4096
out_size = 4096
num_layers = 3
num_batches = 50
epochs = 3

device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.set_default_device(device)

In [4]:
# Creates data in default precision.
# The same data is used for both default and mixed precision trials below.
# You don't need to manually change inputs' ``dtype`` when enabling mixed precision.
data = [torch.randn(batch_size, in_size) for _ in range(num_batches)]
targets = [torch.randn(batch_size, out_size) for _ in range(num_batches)]

loss_fn = torch.nn.MSELoss().cuda()

    Found GPU0 NVIDIA T4G which is of cuda capability 7.5.
    Minimum and Maximum cuda capability supported by this version of PyTorch is
    (8.0) - (12.0)
    
    Please install PyTorch with a following CUDA
    configurations:  12.6 12.8 12.9 following instructions at
    https://pytorch.org/get-started/locally/
    
NVIDIA T4G with CUDA capability sm_75 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_80 sm_90 sm_100 sm_120.
If you want to use the NVIDIA T4G GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



AcceleratorError: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [6]:
net = make_model(in_size, out_size, num_layers)
opt = torch.optim.SGD(net.parameters(), lr=0.001)

start_timer()
for epoch in range(epochs):
    for input, target in zip(data, targets):
        output = net(input)
        loss = loss_fn(output, target)
        loss.backward()
        opt.step()
        opt.zero_grad() # set_to_none=True here can modestly improve performance
end_timer_and_print("Default precision:")




Default precision:
Total execution time = 5.868 sec
Max memory used by tensors = 1283817984 bytes


In [7]:
device

'cuda'

In [9]:
net = make_model(in_size, out_size, num_layers)
opt = torch.optim.SGD(net.parameters(), lr=0.001)

scaler = torch.amp.GradScaler("cuda")

start_timer()
with torch.autocast(device_type=device, dtype=torch.float16):
    for epoch in range(epochs):
        for input, target in zip(data, targets):
            output = net(input)
            
            loss = loss_fn(output, target)
            
            scaler.scale(loss).backward()
            
            scaler.step(opt)
            scaler.update()
            opt.zero_grad() # set_to_none=True here can modestly improve performance
end_timer_and_print("Default precision:")


Default precision:
Total execution time = 2.782 sec
Max memory used by tensors = 1409672192 bytes


In [1]:
import torch

In [2]:
torch.torch.cuda.get_arch_list()

['sm_80', 'sm_90', 'sm_100', 'sm_120']

In [3]:
import torch
print(torch.version.cuda)

12.9


In [4]:
import tensorflow as tf
print(tf.sysconfig.get_build_info()['cuda_version'])

ModuleNotFoundError: No module named 'tensorflow'

In [7]:
torch.__version__


'2.8.0+cu129'

In [6]:
torch.cuda.get_device_properties(0)

_CudaDeviceProperties(name='NVIDIA T4G', major=7, minor=5, total_memory=14912MB, multi_processor_count=40, uuid=b9de142b-7912-5e4d-e780-40641135317c, pci_bus_id=0, pci_device_id=31, pci_domain_id=0, L2_cache_size=4MB)

In [8]:
torch.cuda.get_arch_list()

['sm_80', 'sm_90', 'sm_100', 'sm_120']