In [1]:
import torch, time, gc

# Timing utilities
start_time = None

def start_timer():
    global start_time
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_max_memory_allocated()
    torch.cuda.synchronize()
    start_time = time.time()

def end_timer_and_print(local_msg):
    torch.cuda.synchronize()
    end_time = time.time()
    print("\n" + local_msg)
    print("Total execution time = {:.3f} sec".format(end_time - start_time))
    print("Max memory used by tensors = {} bytes".format(torch.cuda.max_memory_allocated()))

In [2]:
def make_model(in_size, out_size, num_layers):
    layers = []
    for _ in range(num_layers - 1):
        layers.append(torch.nn.Linear(in_size, in_size))
        layers.append(torch.nn.ReLU())
    layers.append(torch.nn.Linear(in_size, out_size))
    return torch.nn.Sequential(*tuple(layers)).cuda()

In [3]:
batch_size = 512 # Try, for example, 128, 256, 513.
in_size = 4096
out_size = 4096
num_layers = 3
num_batches = 50
epochs = 3

device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.set_default_device(device)

In [4]:
# Creates data in default precision.
# The same data is used for both default and mixed precision trials below.
# You don't need to manually change inputs' ``dtype`` when enabling mixed precision.
data = [torch.randn(batch_size, in_size) for _ in range(num_batches)]
targets = [torch.randn(batch_size, out_size) for _ in range(num_batches)]

loss_fn = torch.nn.MSELoss().cuda()

In [6]:
net = make_model(in_size, out_size, num_layers)
opt = torch.optim.SGD(net.parameters(), lr=0.001)

start_timer()
for epoch in range(epochs):
    for input, target in zip(data, targets):
        output = net(input)
        loss = loss_fn(output, target)
        loss.backward()
        opt.step()
        opt.zero_grad() # set_to_none=True here can modestly improve performance
end_timer_and_print("Default precision:")




Default precision:
Total execution time = 5.868 sec
Max memory used by tensors = 1283817984 bytes


In [7]:
device

'cuda'

In [9]:
net = make_model(in_size, out_size, num_layers)
opt = torch.optim.SGD(net.parameters(), lr=0.001)

scaler = torch.amp.GradScaler("cuda")

start_timer()
with torch.autocast(device_type=device, dtype=torch.float16):
    for epoch in range(epochs):
        for input, target in zip(data, targets):
            output = net(input)
            
            loss = loss_fn(output, target)
            
            scaler.scale(loss).backward()
            
            scaler.step(opt)
            scaler.update()
            opt.zero_grad() # set_to_none=True here can modestly improve performance
end_timer_and_print("Default precision:")


Default precision:
Total execution time = 2.782 sec
Max memory used by tensors = 1409672192 bytes


In [1]:
import torch

In [2]:
torch.torch.cuda.get_arch_list()

['sm_80', 'sm_90', 'sm_100', 'sm_120']

In [3]:
import torch
print(torch.version.cuda)

12.9


In [4]:
import tensorflow as tf
print(tf.sysconfig.get_build_info()['cuda_version'])

ModuleNotFoundError: No module named 'tensorflow'

In [7]:
torch.__version__


'2.8.0+cu129'

In [6]:
torch.cuda.get_device_properties(0)

_CudaDeviceProperties(name='NVIDIA T4G', major=7, minor=5, total_memory=14912MB, multi_processor_count=40, uuid=b9de142b-7912-5e4d-e780-40641135317c, pci_bus_id=0, pci_device_id=31, pci_domain_id=0, L2_cache_size=4MB)

In [8]:
torch.cuda.get_arch_list()

['sm_80', 'sm_90', 'sm_100', 'sm_120']

In [4]:
import torch
a=torch.zeros((1,2,3,3))
a.expand(3, -1, -1, -1).shape

torch.Size([3, 2, 3, 3])

In [5]:
L, S = 10, 5

In [6]:
torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)

tensor([[ True, False, False, False, False],
        [ True,  True, False, False, False],
        [ True,  True,  True, False, False],
        [ True,  True,  True,  True, False],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True]])

In [8]:
import torch.nn.functional as F

In [17]:
512/32

16.0

In [18]:
q = torch.zeros((1, 16, 512, 48))
k = torch.zeros((1, 16, 32, 48))
v = torch.zeros((1, 16, 32, 32))
softmax_scale = 1.0

x = F.scaled_dot_product_attention(q, 
                k, v, attn_mask=None, is_causal=False, scale=softmax_scale)

In [14]:
from torch.nn.attention import sdpa_kernel, SDPBackend

In [15]:
# Sample for GQA for llama3
query = torch.rand(32, 32, 128, 64, dtype=torch.float16, device="cuda")
key = torch.rand(32, 8, 128, 64, dtype=torch.float16, device="cuda")
value = torch.rand(32, 8, 128, 64, dtype=torch.float16, device="cuda")
with sdpa_kernel(backends=[SDPBackend.MATH]):
    F.scaled_dot_product_attention(query,key,value,enable_gqa=True)

In [19]:
import flash_attn_interface

ModuleNotFoundError: No module named 'flash_attn_interface'