In [1]:
import torch
import ttnn

# Seed torch's global RNG so the random tensors created later are reproducible.
torch.manual_seed(0)

# Open device 0; the returned handle is what later cells pass to
# ttnn.to_device and ttnn.close.
device_id = 0
device = ttnn.open(device_id)

[38;2;000;128;000m                  Metal[0m | [1m[38;2;100;149;237mINFO    [0m | Initializing device 0
[38;2;000;128;000m                 Device[0m | [1m[38;2;100;149;237mINFO    [0m | Opening device driver
[32m2023-10-30 22:20:42.034[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Detected 4 PCI devices
[32m2023-10-30 22:20:42.094[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Using 1 Hugepages/NumHostMemChannels for TTDevice (pci_interface_id: 3 device_id: 0xfaca revision: 0)
[32m2023-10-30 22:20:42.100[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Using 1 Hugepages/NumHostMemChannels for TTDevice (pci_interface_id: 2 device_id: 0xfaca revision: 0)
[32m2023-10-30 22:20:42.104[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Using 1 Hugepages/NumHostMemChannels for TTDevice (pci_interface_id: 1 device_id: 0xfaca revision: 0)
[32m2023-10-30 22:20:42.112[0m | [1m[38;2;100;149

In [2]:
# Matmul dimensions: torch_a is created as (m, k) and torch_b as (k, n).
# All three are the same size, so bind them in one chained assignment.
m = k = n = 1024

# Initialize tensors a and b with random values using torch

In [3]:
# Random bfloat16 operands on the host: torch_a is (m, k), torch_b is (k, n).
# torch_a is drawn first so the seeded RNG yields the same values as before.
torch_a = torch.randn(m, k, dtype=torch.bfloat16)
torch_b = torch.randn(k, n, dtype=torch.bfloat16)

In [4]:
# Wrap each torch tensor as a ttnn tensor and move it onto the opened device.
a = ttnn.to_device(ttnn.from_torch(torch_a), device)
b = ttnn.to_device(ttnn.from_torch(torch_b), device)

# Matrix multiply tensors a and b
The first run takes longer because the kernels need to be compiled.

In [None]:
%%timeit -n 1 -r 1
# Time one loop (-n 1) with one repeat (-r 1) of the matrix multiply.
# NOTE: names assigned inside a %%timeit cell are not kept in the notebook
# namespace, so `output` here is only used for the timing.
output = a @ b

In [None]:
%%timeit -n 2 -r 5
# Time the same matrix multiply again: 2 loops per repeat (-n 2), 5 repeats (-r 5).
output = a @ b

# Inspect the result of the matrix multiply

In [None]:
# Run the matmul outside of %%timeit so `output` is available for inspection.
output = a @ b

# Print the banner, the result's shape, and the first 32 columns of row 0,
# each on its own line.
print("Printing ttnn tensor", output.shape, output[:1, :32], sep="\n")

# Tilize tensors before running matrix multiply

In [None]:
# Convert both operands to tile layout ahead of the matmul.
a, b = ttnn.to_layout(a, ttnn.TILE_LAYOUT), ttnn.to_layout(b, ttnn.TILE_LAYOUT)

The operation will execute much faster this time because it reuses the already compiled kernels.

In [None]:
%%timeit -n 2 -r 5
# Time the matrix multiply on the operands that were converted to tile layout:
# 2 loops per repeat (-n 2), 5 repeats (-r 5).
output = a @ b

# Matrix multiply tensors a and b using a more performant config

In [None]:
# Rebuild both operands from the original torch tensors. For each one:
# wrap it as a ttnn tensor, move it to the device with the L1 memory config,
# then convert it to tile layout.
a = ttnn.to_layout(
    ttnn.to_device(
        ttnn.from_torch(torch_a),
        device,
        memory_config=ttnn.L1_MEMORY_CONFIG,
    ),
    ttnn.TILE_LAYOUT,
)
b = ttnn.to_layout(
    ttnn.to_device(
        ttnn.from_torch(torch_b),
        device,
        memory_config=ttnn.L1_MEMORY_CONFIG,
    ),
    ttnn.TILE_LAYOUT,
)

In [None]:
%%timeit -n 1 -r 1
output = ttnn.matmul(a, b, memory_config=ttnn.L1_MEMORY_CONFIG, core_grid=(8, 8))

In [None]:
%%timeit -n 2 -r 5
output = ttnn.matmul(a, b, memory_config=ttnn.L1_MEMORY_CONFIG, core_grid=(8, 8))

# Close the device

In [None]:
# Close the device handle that was opened with ttnn.open in the first cell.
ttnn.close(device)