In [1]:
import torch
import ttnn

# Fix torch's RNG seed so the random input tensors created later are reproducible.
torch.manual_seed(0)

# Open device 0. The returned handle is what the later ttnn.to_device(...) calls
# receive, and it is released by ttnn.close(device) in the final cell.
device_id = 0
device = ttnn.open(device_id)

[38;2;000;128;000m                  Metal[0m | [1m[38;2;100;149;237mINFO    [0m | Initializing device 0
[38;2;000;128;000m                 Device[0m | [1m[38;2;100;149;237mINFO    [0m | Opening device driver
[32m2023-10-31 20:13:05.409[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Detected 4 PCI devices
[32m2023-10-31 20:13:05.431[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Using 1 Hugepages/NumHostMemChannels for TTDevice (pci_interface_id: 3 device_id: 0xfaca revision: 0)
[32m2023-10-31 20:13:05.437[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Using 1 Hugepages/NumHostMemChannels for TTDevice (pci_interface_id: 2 device_id: 0xfaca revision: 0)
[32m2023-10-31 20:13:05.441[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Using 1 Hugepages/NumHostMemChannels for TTDevice (pci_interface_id: 1 device_id: 0xfaca revision: 0)
[32m2023-10-31 20:13:05.450[0m | [1m[38;2;100;149

# Enable program cache

In [2]:
# Turn on ttnn's program cache before any op runs (the log confirms
# "Program Cache: enabled."). The notebook's later prose attributes the faster
# repeat runs to re-using compiled kernels.
ttnn.enable_program_cache()

[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | Program Cache: enabled.


# Configuration

In [3]:
# Problem size: a is (m, k) and b is (k, n), so a @ b is (m, n).
# All three dimensions are 1024 for this demo.
m = k = n = 1024

# Initialize tensors a and b with random values using torch

In [4]:
# Host-side bfloat16 operands drawn from torch's RNG:
# torch_a has shape (m, k) and torch_b has shape (k, n).
torch_a = torch.randn(m, k, dtype=torch.bfloat16)
torch_b = torch.randn(k, n, dtype=torch.bfloat16)

In [5]:
# Wrap the torch tensors as ttnn tensors (logged as "from_torch ... external").
a = ttnn.from_torch(torch_a)
b = ttnn.from_torch(torch_b)

# Move both operands onto the opened device (logged as "to_device ... external").
# The log of the first matmul below shows a Tilize op per input, so the tensors
# are evidently not yet in tile layout at this point.
a = ttnn.to_device(a, device)
b = ttnn.to_device(b, device)

[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | from_torch                                         external                                          
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | from_torch                                         external                                          
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | to_device                                          external                                          
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | to_device                                          external                                          


# Matrix multiply tensor a and b
The operation will run longer the first time because the kernels need to be compiled.

In [6]:
%%timeit -n 1 -r 1
# One loop, one run: this times the very first `a @ b` call.
# The log shows two Tilize ops and one Matmul being dispatched for it.
output = a @ b

[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::tt_metal::Tilize                               device                                            
773 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::tt_metal::Tilize                               device                                            
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::tt_metal::Matmul                               device                                            


In [7]:
%%timeit -n 2 -r 5
# Same expression timed again (5 runs of 2 loops) now that it has already run once.
# The inputs are still untiled, so each call still logs Tilize ops before the Matmul.
output = a @ b

[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::tt_metal::Tilize                               device                                            
299 µs ± 115 µs per loop (mean ± std. dev. of 5 runs, 2 loops each)
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::tt_metal::Tilize                               device                                            
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::tt_metal::Matmul                               device                                            
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::tt_metal::Tilize                               device                                            
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::tt_metal::Tilize                               device                                

# Inspect the result of the matrix multiply

In [8]:
# Recompute the product here: the earlier calls ran under %%timeit, whose
# assignments are not expected to persist in the notebook namespace
# (NOTE(review): confirm against the IPython timeit documentation).
output = a @ b
# Convert the result to row-major layout before slicing and printing
# (logged as an Untilize op).
output = ttnn.to_layout(output, ttnn.ROW_MAJOR_LAYOUT)

print("Printing ttnn tensor")
print(output.shape)
# Show only the first 32 values of the first row, not the full 1024 x 1024 result.
print(output[:1, :32])

[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::tt_metal::Tilize                               device                                            
Printing ttnn tensor
[1024, 1024]
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::tt_metal::Tilize                               device                                            
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::tt_metal::Matmul                               device                                            
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::tt_metal::Untilize                             device                                            
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | from_device                                        external                                          
[38;2;000;128;000m  

# Tilize tensors before running matrix multiply

In [9]:
# Convert both inputs to tile layout once, up front (one Tilize op each in the log),
# so the following matmul calls no longer dispatch a Tilize per input.
a = ttnn.to_layout(a, ttnn.TILE_LAYOUT)
b = ttnn.to_layout(b, ttnn.TILE_LAYOUT)

[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::tt_metal::Tilize                               device                                            
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::tt_metal::Tilize                               device                                            


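The operation will execute much faster because the inputs are already in tile layout (no Tilize ops are dispatched, as the log below shows) and because it will re-use the compiled kernels.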

In [10]:
%%timeit -n 2 -r 5
# The inputs were tiled in the previous cell, so the log for this cell
# shows only Matmul ops (no Tilize).
output = a @ b

106 µs ± 24.3 µs per loop (mean ± std. dev. of 5 runs, 2 loops each)
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::tt_metal::Matmul                               device                                            
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::tt_metal::Matmul                               device                                            
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::tt_metal::Matmul                               device                                            
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::tt_metal::Matmul                               device                                            
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::tt_metal::Matmul                               device                               

# Matrix multiply tensors a and b using a more performant config

In [11]:
# Start again from the host tensors so the operands can be placed with a
# different memory configuration.
a = ttnn.from_torch(torch_a)
b = ttnn.from_torch(torch_b)

# Same transfer as before, but with an explicit memory_config.
# NOTE(review): ttnn.L1_MEMORY_CONFIG presumably places the tensors in the
# device's L1 memory - confirm against the ttnn documentation.
a = ttnn.to_device(a, device, memory_config=ttnn.L1_MEMORY_CONFIG)
b = ttnn.to_device(b, device, memory_config=ttnn.L1_MEMORY_CONFIG)

# Tilize up front so the timed cells below do not include layout conversion.
a = ttnn.to_layout(a, ttnn.TILE_LAYOUT)
b = ttnn.to_layout(b, ttnn.TILE_LAYOUT)

[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | from_torch                                         external                                          
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | from_torch                                         external                                          
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | to_device                                          external                                          
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | to_device                                          external                                          
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::tt_metal::Tilize                               device                                            
[38;2;000;128;000m                     Op[0m | [1m[

In [12]:
%%timeit -n 1 -r 1
# One loop, one run: first call of ttnn.matmul with an explicit memory_config and
# core_grid=(8, 8). The log shows it dispatching tt::operations::primary::Matmul
# rather than the tt::tt_metal::Matmul seen for `a @ b`.
output = ttnn.matmul(a, b, memory_config=ttnn.L1_MEMORY_CONFIG, core_grid=(8, 8))

[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::operations::primary::Matmul                    device                                            
387 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [13]:
%%timeit -n 2 -r 5
# Same configured matmul timed again (5 runs of 2 loops) after the first call above.
output = ttnn.matmul(a, b, memory_config=ttnn.L1_MEMORY_CONFIG, core_grid=(8, 8))

221 µs ± 57 µs per loop (mean ± std. dev. of 5 runs, 2 loops each)
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::operations::primary::Matmul                    device                                            
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::operations::primary::Matmul                    device                                            
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::operations::primary::Matmul                    device                                            
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::operations::primary::Matmul                    device                                            
[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | tt::operations::primary::Matmul                    device                                 

# Close the device

In [14]:
# Release the device handle opened in the first cell (logged as "Closing device 0").
ttnn.close(device)

[38;2;000;128;000m                  Metal[0m | [1m[38;2;100;149;237mINFO    [0m | Closing device 0
