# Explore the matrix multiplications
    Showcase a matmul operation
    Expand the use of matmul with scalars

In [1]:
import torch
import ttnn

# Seed torch's RNG so the random tensors created below are the same on every run.
torch.manual_seed(0)

# Open device 0 through ttnn. The returned handle is what later cells pass to
# ttnn.to_device and, at the end of the notebook, to ttnn.close.
device_id = 0
device = ttnn.open(device_id)

[38;2;000;128;000m                  Metal[0m | [1m[38;2;100;149;237mINFO    [0m | Initializing device 0
[38;2;000;128;000m                 Device[0m | [1m[38;2;100;149;237mINFO    [0m | Opening device driver
[32m2023-10-27 19:07:44.421[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Detected 4 PCI devices
[32m2023-10-27 19:07:44.451[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Using 1 Hugepages/NumHostMemChannels for TTDevice (pci_interface_id: 3 device_id: 0xfaca revision: 0)
[32m2023-10-27 19:07:44.456[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Using 1 Hugepages/NumHostMemChannels for TTDevice (pci_interface_id: 2 device_id: 0xfaca revision: 0)
[32m2023-10-27 19:07:44.461[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Using 1 Hugepages/NumHostMemChannels for TTDevice (pci_interface_id: 1 device_id: 0xfaca revision: 0)
[32m2023-10-27 19:07:44.469[0m | [1m[38;2;100;149

In [2]:
# Matrix dimensions shared by both operands: a is (h, w) and b is (w, h).
h, w = 1024, 1024

# Initialize tensors a and b with random values using torch

In [3]:
# Host-side operands in bfloat16: (h, w) @ (w, h) gives an (h, h) product.
# torch_a is drawn before torch_b so the seeded RNG sequence stays the same.
torch_a = torch.randn((h, w), dtype=torch.bfloat16)
torch_b = torch.randn((w, h), dtype=torch.bfloat16)

# Wrap each torch tensor as a ttnn tensor and move it onto the opened device
# in a single expression per operand.
a = ttnn.to_device(ttnn.from_torch(torch_a), device)
b = ttnn.to_device(ttnn.from_torch(torch_b), device)

# Matrix multiply tensors a and b
The first run of the operation takes longer because the kernels need to be compiled

In [4]:
%%timeit -n 1 -r 1
# -n 1 -r 1 times exactly one execution, so this first call includes the
# one-off kernel compilation.
output = a @ b



959 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


This time the operation executes much faster because it re-uses the kernels compiled during the first run

In [5]:
%%timeit -n 1 -r 1
# Same statement as the previous timed cell, measured once more; the kernels
# compiled by the first call are re-used here.
output = a @ b



128 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# Inspect the result of the matrix multiply

In [6]:
# Compute the product in a regular (untimed) cell so `output` can be inspected.
# NOTE(review): names assigned inside a %%timeit cell are presumably not kept
# in the notebook namespace, hence the recomputation — confirm.
output = a @ b
print("Printing ttnn tensor")
print(output.shape)
# Show only the first 32 values of the first row rather than the full result.
print(output[:1, :32])



Printing ttnn tensor
[1024, 1024]
Tensor([ [34.25, 9.625, 11.3125, 0.964844, 1.45312, -26.875, 23.125, -1.39062, -20.375, 33, 5.8125, 10.6875, -18.625, 14.5, -42.75, -18.375, 27.75, 44.25, -27.25, -20.5, 43.5, -5.75, -46.75, -45.75, 43.75, 33, -16.125, 39.25, 11.6875, 9.4375, -39.75, -6.5625]], dtype=bfloat16 )



# Matrix multiply tensors a and b using a more performant config

In [7]:
# Start again from the host torch tensors; rebinding `a` and `b` replaces the
# device tensors created earlier in the notebook.
a = ttnn.from_torch(torch_a)
b = ttnn.from_torch(torch_b)

# Same transfer as before, but with an explicit L1 memory config.
# NOTE(review): presumably this places the operands in on-core L1 memory
# instead of the default location — confirm against the ttnn docs.
a = ttnn.to_device(a, device, memory_config=ttnn.L1_MEMORY_CONFIG)
b = ttnn.to_device(b, device, memory_config=ttnn.L1_MEMORY_CONFIG)

In [8]:
%%timeit -n 1 -r 1
# First timed call of the explicit ttnn.matmul form, passing the L1 memory
# config and a core grid to the op.
# NOTE(review): core_grid=(1, 2) presumably selects the grid of cores the op
# runs on — confirm against the ttnn.matmul documentation.
output = ttnn.matmul(a, b, memory_config=ttnn.L1_MEMORY_CONFIG, core_grid=(1, 2))



922 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [9]:
%%timeit -n 1 -r 1
# Same ttnn.matmul call as the previous timed cell, measured a second time so
# the two timings can be compared.
output = ttnn.matmul(a, b, memory_config=ttnn.L1_MEMORY_CONFIG, core_grid=(1, 2))



129 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# Close the device

In [10]:
# Release the device handle opened at the top of the notebook; run this last.
ttnn.close(device)

[38;2;000;128;000m                  Metal[0m | [1m[38;2;100;149;237mINFO    [0m | Closing device 0
