In [1]:
import cutlass
import cutlass.cute as cute
import torch
from cutlass.torch import dtype as torch_dtype

In [2]:
@cute.jit
def test(ptr: cute.Pointer, shape, stride):
    """Print the tensor described by ``shape:stride`` over ``ptr`` in both print modes.

    A layout is built from ``shape`` and ``stride``, ``ptr`` is wrapped in a
    tensor that uses that layout, and the tensor is printed twice: first with
    ``verbose=False`` and then with ``verbose=True``.

    Args:
        ptr: Pointer to the first element of the backing storage.
        shape: Layout shape; may be nested, e.g. ``((2, 2), 2)``.
        stride: Layout stride with the same nesting as ``shape``.
    """
    view = cute.make_tensor(ptr, cute.make_layout(shape=shape, stride=stride))
    # Same tensor, two renderings: non-verbose first, then verbose.
    cute.print_tensor(view, verbose=False)
    cute.print_tensor(view, verbose=True)

# Back the tensor with the int32 values 0..7, so every printed element is
# also its own offset (in elements) from the start of the buffer.
dtype = cutlass.Int32
storage = torch.arange(8, dtype=torch_dtype(dtype))
storage_ptr = cute.runtime.make_ptr(dtype, storage.data_ptr())

# First mode is nested: (2,2) with strides (4,1); second mode has size 2, stride 2.
shape_4x2 = ((2, 2), 2)
stride_4x2 = ((4, 1), 2)
test(storage_ptr, shape=shape_4x2, stride=stride_4x2)

tensor(raw_ptr(0x000056534e8ce240: i32, generic, align<4>) o ((2,2),2):((4,1),2), data=
       [[ 0,  2, ],
        [ 4,  6, ],
        [ 1,  3, ],
        [ 5,  7, ]])
tensor(raw_ptr(0x000056534e8ce240: i32, generic, align<4>) o ((2,2),2):((4,1),2), data= (
	((0,0),0)= 0
	((0,1),0)= 1
	((0,0),1)= 2
	((0,1),1)= 3
	((1,0),0)= 4
	((1,1),0)= 5
	((1,0),1)= 6
	((1,1),1)= 7
)


The above is a nested-mode matrix -- its layout is logically 4x2. So in non-verbose mode, it is printed using 2-D coordinates.
However, in the 1st coordinate of `((dim0, dim1), dim2)`, by convention, the left sub-coordinate `dim0` is actually in the inner loop, and `dim1` is in the outer loop, so the four rows above correspond to `(dim0, dim1)` = `(0,0)`, `(1,0)`, `(0,1)`, `(1,1)` in that order.

Let us take a look at another example:

In [19]:
# Sixteen int32 values 0..15; `dtype` is the element type chosen in the earlier cell.
storage = torch.arange(16, dtype=torch_dtype(dtype))
storage_ptr = cute.runtime.make_ptr(dtype, storage.data_ptr())

# Both modes are nested this time: (2,2) with strides (1,4) and (2,2) with strides (2,8).
shape_4x4 = ((2, 2), (2, 2))
stride_4x4 = ((1, 4), (2, 8))
test(storage_ptr, shape=shape_4x4, stride=stride_4x4)

tensor(raw_ptr(0x0000556bcecf6900: i32, generic, align<4>) o ((2,2),(2,2)):((1,4),(2,8)), data=
       [[ 0,  2,  8,  10, ],
        [ 1,  3,  9,  11, ],
        [ 4,  6,  12,  14, ],
        [ 5,  7,  13,  15, ]])
tensor(raw_ptr(0x0000556bcecf6900: i32, generic, align<4>) o ((2,2),(2,2)):((1,4),(2,8)), data= (
	((0,0),(0,0))= 0
	((1,0),(0,0))= 1
	((0,0),(1,0))= 2
	((1,0),(1,0))= 3
	((0,1),(0,0))= 4
	((1,1),(0,0))= 5
	((0,1),(1,0))= 6
	((1,1),(1,0))= 7
	((0,0),(0,1))= 8
	((1,0),(0,1))= 9
	((0,0),(1,1))= 10
	((1,0),(1,1))= 11
	((0,1),(0,1))= 12
	((1,1),(0,1))= 13
	((0,1),(1,1))= 14
	((1,1),(1,1))= 15
)


Along each logical axis, the sub-coordinates are likewise ordered with the inner one on the left and the outer one on the right:

![](nested-layout.png)

## Reference
1. https://docs.nvidia.com/cutlass/media/docs/cpp/cute/01_layout.html
2. https://docs.modular.com/mojo/manual/layout/layouts