In [1]:
import tvm
from tvm import te

# Problem size: C is (M, N), computed from A of shape (M, K) and B of shape (K, N).
M = K = N = 1024

# Symbolic inputs of the computation.
A = te.placeholder((M, K), name='A')
B = te.placeholder((K, N), name='B')

# Reduction axis running over the dimension shared by A and B.
k = te.reduce_axis((0, K), name='k')

# C[x, y] = sum over k of A[x, k] * B[k, y]
C = te.compute(
    (M, N),
    lambda x, y: te.sum(A[x, k] * B[k, y], axis=k),
    name='C',
)

# Schedule for C with no transformations applied to it.
s = te.create_schedule(C.op)

# Lower the schedule to an IR module, then build that module for the 'c' target.
ir_m = tvm.lower(s, args=[A, B, C], name='mmult', simple_mode=True)
rt_m = tvm.build(ir_m, args=[A, B, C], target='c', name='mmult')

# Show the lowered IR text first, then the source held by the built module.
print("tir:\n", ir_m.astext(show_meta_data=False))

print("source code:\n", rt_m.get_source())

tir:
 #[version = "0.0.5"]
@mmult = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "mmult", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], []),
             B: Buffer(B_2: Pointer(float32), float32, [1024, 1024], []),
             C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], [])}
  buffer_map = {A_1: A, B_1: B, C_1: C} {
  for (x: int32, 0, 1024) {
    for (y: int32, 0, 1024) {
      C_3: Buffer(C_2, float32, [1048576], [])[((x*1024) + y)] = 0f32
      for (k: int32, 0, 1024) {
        let cse_var_2: int32 = (x*1024)
        let cse_var_1: int32 = (cse_var_2 + y)
        C_3[cse_var_1] = (C_3[cse_var_1] + (A_3: Buffer(A_2, float32, [1048576], [])[(cse_var_2 + k)]*B_3: Buffer(B_2, float32, [1048576], [])[((k*1024) + y)]))
      }
    }
  }
}

/* For debugging purposes the metadata section has been omitted.
 * If you would like to see the full metadata section you can se

In [4]:
import torch

def print_torch_version():
    """Print a short report about the installed PyTorch and its CUDA support.

    Five lines are written to stdout, in this order:

    1. ``torch.__version__``
    2. ``torch.version.cuda``
    3. ``torch.cuda.is_available()``
    4. ``torch.cuda.get_device_name()`` when CUDA is available, otherwise ``None``
    5. ``torch.cuda.device_count()``

    Returns:
        None.
    """
    cuda_ok = torch.cuda.is_available()

    print(torch.__version__)
    print(torch.version.cuda)
    print(cuda_ok)
    # get_device_name() raises when no CUDA device can be used, which would
    # abort the report before the device count is printed; only query it
    # when CUDA is actually available.
    print(torch.cuda.get_device_name() if cuda_ok else None)
    print(torch.cuda.device_count())

print_torch_version()

2.4.0
11.8
True
NVIDIA GeForce RTX 2060 with Max-Q Design
1
