In [1]:
import time
import torch

MSIZE = 2000

torch.manual_seed(1234)
TENSOR_A_CPU = torch.rand(MSIZE, MSIZE)
TENSOR_B_CPU = torch.rand(MSIZE, MSIZE)

torch.manual_seed(1234)
TENSOR_A_MPS = torch.rand(MSIZE, MSIZE).to('mps')
TENSOR_B_MPS = torch.rand(MSIZE, MSIZE).to('mps')

In [2]:
def run(f, times=100):
    start_time = time.time()
    for _ in range(times):
        f()
    
    print(f"Ran {times} times, {time.time() - start_time} seconds")

In [3]:
run(lambda: TENSOR_A_CPU @ TENSOR_B_CPU)

Ran 100 times, 0.8707849979400635 seconds


In [4]:
run(lambda: TENSOR_A_MPS @ TENSOR_B_MPS)

Ran 100 times, 0.053321123123168945 seconds


In [55]:
import torch.nn as nn

BATCH_SIZE = 32  # B
BLOCK_SIZE = 32   # T
D_MODEL = 128     # C
VOCAB_SIZE = 64  # V


class MyModule(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(VOCAB_SIZE, D_MODEL)
        self.layer1 = nn.Linear(D_MODEL, VOCAB_SIZE)
    
    def forward(self, x, targets):
        B, T = x.shape
        x = self.embedding(x)  # B, T, D_MODEL
        x = self.layer1(x)  # B, T, V
        x = x.view(B * T, VOCAB_SIZE)
        loss = nn.functional.cross_entropy(x, targets)
        return x, loss

In [56]:
def train_model(module, x, y, times=1000):
    optimizer = torch.optim.SGD(module.parameters(), lr=1e-3)
    
    start_time = time.time()
    for _ in range(times):
        logits, loss = module(x, y)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
    
    print(f"Ran {times} times, {time.time() - start_time} seconds")

In [57]:
X = torch.randint(VOCAB_SIZE, (BATCH_SIZE, BLOCK_SIZE))

Y = torch.randint(VOCAB_SIZE, (BATCH_SIZE * BLOCK_SIZE, ))

In [58]:
train_model(MyModule(), X, Y)

Ran 1000 times, 0.6712536811828613 seconds


In [59]:
train_model(MyModule().to('mps'), X.to('mps'), Y.to('mps'))

Ran 1000 times, 1.0049159526824951 seconds
