# Explore the matrix multiplications
    Showcase a matmul operation
    Expand the use of matmul with scalars

In [1]:
import torch
import ttnn

# Fix torch's RNG seed so the random input tensors created later are reproducible.
torch.manual_seed(0)

# Open device 0 and keep the handle: later cells pass `device` to ttnn.to_device,
# and the last cell releases it with ttnn.close(device).
device_id = 0
device = ttnn.open(device_id)

[38;2;000;128;000m                  Metal[0m | [1m[38;2;100;149;237mINFO    [0m | Initializing device 0
[38;2;000;128;000m                 Device[0m | [1m[38;2;100;149;237mINFO    [0m | Opening device driver
[32m2023-10-27 22:03:19.768[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Detected 4 PCI devices
[32m2023-10-27 22:03:19.806[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Using 1 Hugepages/NumHostMemChannels for TTDevice (pci_interface_id: 3 device_id: 0xfaca revision: 0)
[32m2023-10-27 22:03:19.812[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Using 1 Hugepages/NumHostMemChannels for TTDevice (pci_interface_id: 2 device_id: 0xfaca revision: 0)
[32m2023-10-27 22:03:19.817[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Using 1 Hugepages/NumHostMemChannels for TTDevice (pci_interface_id: 1 device_id: 0xfaca revision: 0)
[32m2023-10-27 22:03:19.824[0m | [1m[38;2;100;149

In [2]:
# Matrix dimensions shared by both operands: a is h x w, b is w x h.
h = w = 1024

# Initialize tensors a and b with random values using torch

In [3]:
# Random bfloat16 operands with two leading dims of size 1.
# a is (1, 1, h, w) and b is (1, 1, w, h), so the inner dimensions of a @ b match.
torch_a = torch.randn(1, 1, h, w, dtype=torch.bfloat16)
torch_b = torch.randn(1, 1, w, h, dtype=torch.bfloat16)

In [4]:
# Convert each torch tensor to a ttnn tensor and move it onto the opened device.
a = ttnn.to_device(ttnn.from_torch(torch_a), device)
b = ttnn.to_device(ttnn.from_torch(torch_b), device)

# Matrix multiply tensor a and b
The operation takes longer the first time it runs because the kernels need to be compiled.

In [5]:
%%timeit -n 1 -r 1
# Single cold run (1 loop, 1 repeat): the first a @ b also pays for kernel compilation.
output = a @ b

836 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [6]:
%%timeit -n 2 -r 5
# Warm measurement: 5 repeats of 2 loops each, taken after the cold run above,
# so the one-time kernel compilation should no longer be part of the timing.
output = a @ b

3.03 ms ± 465 µs per loop (mean ± std. dev. of 5 runs, 2 loops each)


# Inspect the result of the matrix multiply

In [7]:
# NOTE(review): this inspection cell is disabled -- every line is commented out,
# so running it displays nothing.
# `output` is recomputed on the first line because assignments made inside the
# preceding %%timeit cells are not bound in the notebook namespace.
# NOTE(review): the operands are 4-D, (1, 1, h, w) and (1, 1, w, h), so `[:1, :32]`
# would index the two leading size-1 dims; presumably the last two dims were meant
# (e.g. `[:1, :1, :1, :32]`) -- verify before re-enabling.
# TODO: confirm printing/slicing a ttnn tensor is supported here, then uncomment.
# output = a @ b
# print("Printing ttnn tensor")
# print(output.shape)
# print(output[:1, :32])

# Tilize tensors before running Matrix Multiply

In [8]:
# Convert both device tensors to TILE_LAYOUT once, ahead of the matmul.
# This rebinds a and b, so the pre-conversion tensors are no longer reachable
# under these names; re-run the from_torch/to_device cell to get them back.
a = ttnn.to_layout(a, ttnn.TILE_LAYOUT)
b = ttnn.to_layout(b, ttnn.TILE_LAYOUT)

The operation will execute much faster because it re-uses the already compiled kernels.

In [9]:
%%timeit -n 2 -r 5
# Same warm measurement as before (5 repeats of 2 loops each), now with a and b
# already converted to TILE_LAYOUT by the previous cell.
output = a @ b

1.53 ms ± 138 µs per loop (mean ± std. dev. of 5 runs, 2 loops each)


# Matrix multiply tensor a and b and use more performant config

In [10]:
# Rebuild both operands from the original torch tensors. This time each one is
# placed on the device with ttnn.L1_MEMORY_CONFIG and then converted to TILE_LAYOUT.
a = ttnn.from_torch(torch_a)
a = ttnn.to_device(a, device, memory_config=ttnn.L1_MEMORY_CONFIG)
a = ttnn.to_layout(a, ttnn.TILE_LAYOUT)

b = ttnn.from_torch(torch_b)
b = ttnn.to_device(b, device, memory_config=ttnn.L1_MEMORY_CONFIG)
b = ttnn.to_layout(b, ttnn.TILE_LAYOUT)

In [11]:
%%timeit -n 1 -r 1
# Single cold run (1 loop, 1 repeat) of the explicit ttnn.matmul call, asking for
# the output in ttnn.L1_MEMORY_CONFIG and passing core_grid=(1, 2).
# NOTE(review): presumably this first call compiles kernels again for the new
# configuration, which would explain the much larger cold timing -- confirm.
output = ttnn.matmul(a, b, memory_config=ttnn.L1_MEMORY_CONFIG, core_grid=(1, 2))

425 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [12]:
%%timeit -n 2 -r 5
# Warm measurement (5 repeats of 2 loops each) of the same ttnn.matmul call,
# taken after the single cold run in the previous cell.
output = ttnn.matmul(a, b, memory_config=ttnn.L1_MEMORY_CONFIG, core_grid=(1, 2))

1.53 ms ± 186 µs per loop (mean ± std. dev. of 5 runs, 2 loops each)


# Close the device

In [13]:
# Release the device handle obtained from ttnn.open in the first cell.
ttnn.close(device)

[38;2;000;128;000m                  Metal[0m | [1m[38;2;100;149;237mINFO    [0m | Closing device 0
