In [25]:
import os
import subprocess

def source_and_load(script_path):
    """
    Sources a shell script and loads the resulting environment variables
    into the current Python process. This version is robust against scripts
    that print status messages.
    """
    # Command to source the script and then run 'env' to dump all variables.
    command = f'bash -c "source {script_path} &> /dev/null && env"'
    
    try:
        result = subprocess.run(command, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        
        # Parse the KEY=VALUE output from 'env'
        for line in result.stdout.strip().split('\n'):
            if '=' in line:
                key, value = line.split('=', 1)
                os.environ[key] = value
                
        print(f"Successfully sourced and loaded variables from {script_path}")

    except subprocess.CalledProcessError as e:
        # If the script fails, print its error messages for debugging
        print(f"--- ERROR sourcing script: {script_path} ---")
        print(e.stderr)
        print("-------------------------------------------------")

# --- USAGE ---
# This should now work correctly even if the scripts are verbose
source_and_load("/work/shared/common/allo/setup-llvm19.sh")
source_and_load("/work/shared/common/allo/vitis_2023.2_u280.sh")

# You can now verify that a variable is set
print(f"LLVM_BUILD_DIR is set to: {os.environ.get('LLVM_BUILD_DIR', 'Not Set')}")

Successfully sourced and loaded variables from /work/shared/common/allo/setup-llvm19.sh
Successfully sourced and loaded variables from /work/shared/common/allo/vitis_2023.2_u280.sh
LLVM_BUILD_DIR is set to: /work/shared/common/llvm-project-19.x/build


In [None]:
# Copyright Allo authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

import allo
from allo.ir.types import float32
import allo.dataflow as df
import allo.backend.hls as hls
import numpy as np

M, N, K = 2, 2, 2
P0, P1 = M + 2, N + 2


        tM = M / 2
        tA1: float32[tM, K] = 0.0
        tA2: float32[tM, K] = 0.0
        for i in range(M):
            for k in range(K):
                if i < tM:
                    tA1[i, k] = A[i, k]
                else:
                    tA2[i - tM, k] = A[i, k]

@df.region()
def top():
    fifo_A = df.array(df.pipe(dtype=float32, shape=(), depth=4), shape=(P0, P1))
    fifo_B = df.array(df.pipe(dtype=float32, shape=(), depth=4), shape=(P0, P1))

    @df.kernel(mapping=[P0, P1]) # systolic array of P0 x P1 PEs
    def gemm(gA: float32[gM, K], B: float32[K, N], gC: float32[gM, N]):
        
        i, j = df.get_pid()
        # periperals kernels
        with allo.meta_if(i in {0, M + 1} and j in {0, N + 1}):
            pass
        with allo.meta_elif(j == 0):
            # i > 0
            for k in range(K):
                fifo_A[i, j + 1].put(A[i - 1, k])
        with allo.meta_elif(i == 0):
            # j > 0
            for k in range(K):
                fifo_B[i + 1, j].put(B[k, j - 1])
        # drain
        with allo.meta_elif(i == M + 1 and j > 0):
            for k in range(K):
                b: float32 = fifo_B[i, j].get()
        with allo.meta_elif(j == N + 1 and i > 0):
            for k in range(K):
                a: float32 = fifo_A[i, j].get()
        # main body
        with allo.meta_else():
            c: float32 = 0
            for k in range(K):
                a: float32 = fifo_A[i, j].get()
                b: float32 = fifo_B[i, j].get()
                c += a * b
                fifo_A[i, j + 1].put(a)
                fifo_B[i + 1, j].put(b)
            C[i - 1, j - 1] = c


def test_systolic(sim_only=True):
    A = np.random.rand(M, K).astype(np.float32)
    B = np.random.rand(K, N).astype(np.float32)
    C = np.zeros((M, N), dtype=np.float32)

    sim_mod = df.build(top, target="simulator")
    sim_mod(A, B, C)
    np.testing.assert_allclose(C, np.dot(A, B), atol=1e-5)
    print("Dataflow Simulator Passed!")

    if not sim_only and hls.is_available("vitis_hls"):
        s = df.customize(top)
        s.partition("top:A", dim=1, factor=2)
        s.partition("top:B", dim=2, factor=2)
        s.partition("top:C", dim=0, factor=2)
        mod = s.build(target="vitis_hls", mode="hw_emu", project="systolic.prj")
        C = np.zeros((M, N), dtype=np.float32)
        mod(A, B, C)
        np.testing.assert_allclose(C, np.dot(A, B), atol=1e-5)
        print("Passed!")

In [34]:
test_systolic()

Dataflow Simulator Passed!


In [None]:
import allo
from allo.ir.types import float32, int32

M,N,K = 128,128,128
T = 32

def gemm(A: float32[T,T], B: float32[T,T]) -> float32[T,T]:
    C: float32[T,T] = 0.0
    for i,j in allo.grid(T,T):
        for k in allo.reduction(T):
            C[i,j] += A[i,k] * B[k,j]
    return C


def gevm(A: float32[T], B: float32[T,T]) -> float32[T]:
    C: float32[T] = 0.0
    for i in allo.grid(T):
        for k in allo.reduction(T):
            C[i] += A[k] * B[k,i]
    return C


def gevmm(A: float32[T,T], B: float32[T,T]) -> float32[T,T]:
    C: float32[T,T] = 0.0
    for i in allo.grid(T):
        for j in allo.grid(T):
            C[i,j] = gevm(A[i*T], B)[j] # FIXME data type issue
    return C

'''
def tiled_gemm(A: float32[M,K], B: float32[K,N]) -> float32[M,N]:
    C: float32[M,N] = 0.0
    for i,j in allo.grid(M//T,N//T):
        tC: float32[T,T] = 0.0
        for k in allo.reduction(K//T):
            tC += gemm(A[i*T:(i+1)*T, k*T:(k+1)*T], B[k*T:(k+1)*T, j*T:(j+1)*T])
        C[i*T:(i+1)*T, j*T:(j+1)*T] += tC
    return C
    '''

'\ndef tiled_gemm(A: float32[M,K], B: float32[K,N]) -> float32[M,N]:\n    C: float32[M,N] = 0.0\n    for i,j in allo.grid(M//T,N//T):\n        tC: float32[T,T] = 0.0\n        for k in allo.reduction(K//T):\n            tC += gemm(A[i*T:(i+1)*T, k*T:(k+1)*T], B[k*T:(k+1)*T, j*T:(j+1)*T])\n        C[i*T:(i+1)*T, j*T:(j+1)*T] += tC\n    return C\n    '

In [31]:
s = allo.customize(gevmm)
#print(s.module)

Traceback (most recent call last):
  File "/home/sk3463/allo/allo/ir/infer.py", line 1330, in visit_stmts
    results.append(visit_stmt(ctx, stmt))
                   ^^^^^^^^^^^^^^^^^^^^^
  File "/home/sk3463/allo/allo/ir/visitor.py", line 227, in __call__
    res = method(ctx, node)
          ^^^^^^^^^^^^^^^^^
  File "/home/sk3463/allo/allo/ir/infer.py", line 318, in visit_Assign
    rhs = visit_stmt(ctx, node.value)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/sk3463/allo/allo/ir/visitor.py", line 227, in __call__
    res = method(ctx, node)
          ^^^^^^^^^^^^^^^^^
  File "/home/sk3463/allo/allo/ir/infer.py", line 491, in visit_Subscript
    node.dtype = ctx.get_symbol(node.value.id).dtype
                                ^^^^^^^^^^^^^
AttributeError: 'Call' object has no attribute 'id'



SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [32]:
# Functional Testing; LLVM Backend
executable = s.build()

# testing
import numpy as np
np_A = np.random.rand(T,T).astype(np.float32)
np_B = np.random.rand(T,T).astype(np.float32)
np_C = executable(np_A, np_B)
golden_C = np.matmul(np_A, np_B)
np.testing.assert_allclose(np_C, golden_C, rtol=1e-3, atol=1e-3)
print("testing completed, np_C == golden_C")

AssertionError: 
Not equal to tolerance rtol=0.001, atol=0.001

Mismatched elements: 1024 / 1024 (100%)
Max absolute difference among violations: 11.981458
Max relative difference among violations: 1.
 ACTUAL: array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],...
 DESIRED: array([[6.596055, 7.304658, 8.017097, ..., 7.35523 , 8.358668, 8.076088],
       [6.653503, 6.359422, 8.199472, ..., 7.539833, 7.393631, 7.338074],
       [7.283918, 7.280246, 8.71664 , ..., 6.367553, 8.534851, 7.289828],...

So far, we have functionally tested the feature of calling the kernel with an extra argument (the loop trip count).

Now, let's generalize this feature to generate arbitrary-sized square matrix tiledGEMM kernel.

Next, we'll achieve:
- Functional TiledGemm
- Do HLS workflow with TiledGEMM
    - Functional check -> HLS codegen -> Inspect HLS -> Iterate
- Programmable Feather
    - different tiling, layout, and routing patterns in FEATHER

In [None]:
# Copyright Allo authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# %%
# We first reorder the inner reduction loop with the middle loop.
# This is used to change the computation order of matrix multiplication.

s.reorder("k", "j")
print(s.module)

# %%
# .. note::
#
#    This reordering seems to be easy, but it is impossible in the old Allo,
#    since the previous Allo directly generate reduction variables which make
#    the ``j`` loop becomes imperfect, while MLIR only supports reordering perfect
#    loops.

# %%
# Next, we create a new buffer for the output tensor ``C``.
# We provide a ``.buffer_at()`` primitive for users to quickly create a new buffer
# along a specific axis. Since Allo has attached all the tensors to the function,
# we can directly use ``<schedule>.<tensor>`` to access a specific tensor in the schedule.

s.buffer_at(s.C, axis="i")
print(s.module)

# %%
# From the above generated code, we can see that Allo automatically
# creates an intermediate buffer ``%1`` for ``C`` and attach it inside the ``i`` loop.
# Also two additional loop nested named ``j_init`` and ``j_back`` are created to
# initialize and write the intermediate buffer back to output tensor.

# %%
# Lastly, we pipeline the ``j`` loop in order to achieve the best performance.

s.pipeline("j")
print(s.module)

In [None]:

##############################################################################
# Codegen for Vivado/Vitis HLS
# ----------------------------
# Similar to the CPU execution, we only need to change the target of the ``.build()`` function
# in order to target different backends. Here, we use ``vhls`` as the target to generate
# Vivado/Vitis HLS code, which will directly returns the generated code as a string.

code = s.build(target="vhls")
print(code)

In [None]:

# %%
# We can see that the generated code preserves the same structure as the IR, and inserts
# necessary headers and pragmas for Vivado/Vitis HLS. The generated code can be directly passed
# to Vivado/Vitis HLS to generate RTL designs.
#
# .. note::
#
#    Vivado HLS was the previous name of Vitis HLS (before 2020.1). The previous HLS code
#    can still run on the latest Vitis HLS, but the performance of the generated RTL design
#    and the estimated reports may be different, as the newer version of Vitis HLS provides better
#    automatic optimizations.

# %%
# We also provide an easy way to invoke Vitis HLS from Allo. Users can simply provide
# the synthesis mode that are supported by Vitis HLS (e.g., ``sw_emu``, ``hw_emu``, and ``hw``),
# and the target project folder name. Allo will automatically generate
# the HLS project and invoke the compiler to generate the RTL design.
#
# .. note::
#
#    - ``sw_emu``: Software emulation mode, which is similar to C simulation that compiles the program using C compiler and runs it on the CPU. Depending on the size of your input data, this mode may take within one minute.
#    - ``hw_emu``: Hardware emulation mode, which is similar to co-simulation that compiles the program into RTL design using HLS compiler and runs the RTL with the test bench on the FPGA emulator. Since it needs to go through the HLS synthesis flow, it may take several minutes to finish.
#    - ``hw``: Hardware mode, which compiles the program into RTL design using HLS, goes through placement and routing, generates the bitstream, and finally executes on FPGA. This mode may take several hours to finish.

mod = s.build(target="vitis_hls", mode="hw_emu", project="gemm.prj")


In [None]:
# %%
# After running the above instruction, we can see a ``gemm.prj`` folder is generated in the current directory:
#
# - ``host.cpp``: The host (CPU) OpenCL code that invokes the generated accelerator.
# - ``kernel.cpp``: The generated accelerator code.
# - ``Makefile``: Defined some shorthands for compiling the project.
#
# To generate the hardware design and see the performance estimation, we need to first
# prepare the input data. Allo supports NumPy inputs even for hardware programs,
# so we can just create two NumPy arrays ``np_A`` and ``np_B`` for inputs.
# Since the C++ design cannot support returning a new array, we also need to
# explicitly create an output array ``allo_C`` and pass it to the function.
#
# .. note::
#
#    You need to configure the `Vitis HLS <https://www.amd.com/en/products/software/adaptive-socs-and-fpgas/vitis/vitis-hls.html>`_ and `XRT <https://github.com/Xilinx/XRT>`_ environment before proceeding to the next step.
#    For Zhang group students, we have the Vitis environment configured on the server, so you can directly
#    ``source /work/shared/common/allo/vitis_2023.2_u280.sh`` to set up the environment, which
#    targets the AMD U280 FPGA board.
#
# .. code-block:: python
#
np_A = np.random.random((M, K)).astype(np.float32)
np_B = np.random.random((K, N)).astype(np.float32)
allo_C = np.zeros((M, N), dtype=np.float32)
mod(np_A, np_B, allo_C)
np.testing.assert_allclose(allo_C, np.matmul(np_A, np_B), rtol=1e-5, atol=1e-5)

# %%
# After executing the above command, you can check the following report under ``gemm.prj/_x.hw_emu.xilinx_u250_gen3x16_xdma_4_1_202210_1/gemm/gemm/gemm/solution/syn/report/csynth.rpt``.
#
# .. code-block:: python
#
#    +--------------------------------------------------+---------+-----------+----------+---------+------+----------+---------+---------+-------------+------------+-----+
#    |                      Modules                     | Latency |  Latency  | Iteration|         | Trip |          |         |         |             |            |     |
#    |                      & Loops                     | (cycles)|    (ns)   |  Latency | Interval| Count| Pipelined|  BRAM   |   DSP   |      FF     |     LUT    | URAM|
#    +--------------------------------------------------+---------+-----------+----------+---------+------+----------+---------+---------+-------------+------------+-----+
#    |+ gemm                                            |    39934|  1.331e+05|         -|    39935|     -|        no|  6 (~0%)|  5 (~0%)|  19074 (~0%)|  29069 (2%)|    -|
#    | + gemm_Pipeline_VITIS_LOOP_44_1_VITIS_LOOP_45_2  |     1026|  3.420e+03|         -|     1026|     -|        no|        -|        -|     36 (~0%)|   169 (~0%)|    -|
#    |  o VITIS_LOOP_44_1_VITIS_LOOP_45_2               |     1024|  3.413e+03|         2|        1|  1024|       yes|        -|        -|            -|           -|    -|
#    | o l_S_buf0_buf0_l_0_l_buf0_l_1                   |     1025|  3.416e+03|         3|        1|  1024|       yes|        -|        -|            -|           -|    -|
#    | o l_S_buf1_buf1_l_0_l_buf1_l_1                   |     1025|  3.416e+03|         3|        1|  1024|       yes|        -|        -|            -|           -|    -|
#    | o l_S_i_j_0_i                                    |    35616|  1.187e+05|      1113|        -|    32|        no|        -|        -|            -|           -|    -|
#    |  + gemm_Pipeline_l_j_init                        |       34|    113.322|         -|       34|     -|        no|        -|        -|      8 (~0%)|    50 (~0%)|    -|
#    |   o l_j_init                                     |       32|    106.656|         1|        1|    32|       yes|        -|        -|            -|           -|    -|
#    |  + gemm_Pipeline_l_S_k_0_k_l_j                   |     1039|  3.463e+03|         -|     1039|     -|        no|        -|  5 (~0%)|    759 (~0%)|   494 (~0%)|    -|
#    |   o l_S_k_0_k_l_j                                |     1037|  3.456e+03|        15|        1|  1024|       yes|        -|        -|            -|           -|    -|
#    |  + gemm_Pipeline_l_j_back                        |       34|    113.322|         -|       34|     -|        no|        -|        -|     15 (~0%)|    78 (~0%)|    -|
#    |   o l_j_back                                     |       32|    106.656|         2|        1|    32|       yes|        -|        -|            -|           -|    -|
#    | o l_S_result2_result2_l_0_l_result2_l_1          |     1026|  3.420e+03|         4|        1|  1024|       yes|        -|        -|            -|           -|    -|
#    +--------------------------------------------------+---------+-----------+----------+---------+------+----------+---------+---------+-------------+------------+-----+
#
# From the above output, we can clearly see that all the loops inside the GEMM kernel (marked as ``o``) are pipelined
# with Initiation Interval (II) equal to 1. You can also find more detailed information under the ``report`` folder.

In [None]:

##############################################################################
# On-board Execution
# ------------------
# After optimizing the design and make sure everything works correctly,
# we can push the generated RTL design to the backend synthesis flow to generate
# the bitstream for FPGA. In Allo, we can directly change the target to ``hw``
# to launch the backend synthesis job. It may take several hours to generate the final
# bitstream, so it would be better to run it using `tmux <https://github.com/tmux/tmux/wiki>`_.
#
# .. code-block:: python
#
mod = s.build(target="vitis_hls", mode="hw", project="gemm.prj")
mod(np_A, np_B, allo_C)
np.testing.assert_allclose(allo_C, np.matmul(np_A, np_B), rtol=1e-5, atol=1e-5)
#
# Finally, you should be able to see the generated bitstream ``.xclbin`` under the ``gemm.prj/build_dir.hw.xilinx_u280_gen3x16_xdma_1_202211_1`` folder
# (actual board name may be different), and the above test should pass.

# %%
# To get more detailed information on the resource usage and performance of the generated design,
# you can check the following files:
#
# - ``gemm.prj/build_dir.hw.xilinx_u280_gen3x16_xdma_1_202211_1/gemm.xclbin``: The generated bitstream.
# - ``gemm.prj/build_dir.hw.xilinx_u280_gen3x16_xdma_1_202211_1/gemm.link.xclbin.info``: Frequency of the actual design, which can be found in ``DATA_CLK``. By default, it is 300MHz.
# - ``gemm.prj/_x.hw.xilinx_u280_gen3x16_xdma_1_202211_1/reports/gemm/hls_reports/gemm_csynth.rpt``: The HLS synthesis report.
# - ``gemm.prj/_x.hw.xilinx_u280_gen3x16_xdma_1_202211_1/reports/link/imp/impl_1_full_util_routed.rpt``: The full utilization report after placement and routing. You can find the following resource usage:
#
#   - LUT: ``1. CLB Logic -- CLB LUTs``
#   - FF: ``1. CLB Logic -- CLB Registers -- Register as Flip Flop``
#   - BRAM: ``3. BLOCKRAM -- Block RAM Tile``
#   - DSP: ``4. ARITHMETIC -- DSPs``
#
# - ``gemm.prj/_x.hw.xilinx_u280_gen3x16_xdma_1_202211_1/reports/link/imp/impl_1_slr_util_routed.rpt``: The per SLR utilization report after placement and routing.
# - ``gemm.prj/_x.hw.xilinx_u280_gen3x16_xdma_1_202211_1/logs/gemm/gemm_vitis_hls.log``: The log file of the Vitis HLS.
# - ``gemm.prj/_x.hw.xilinx_u280_gen3x16_xdma_1_202211_1/logs/link/v++.log``: The log file of the Vivado backend synthesis.
