## Simple Allo -> DSLX flow (stateless dataflow)

In [2]:
%load_ext autoreload
%autoreload 2
import allo
from allo.ir.types import int32, uint32

In [3]:
# Kernel: adds two unsigned 32-bit inputs and returns the uint32 sum.
def add(a: uint32, b: uint32) -> uint32:
  return a + b

# Wrap the kernel in an Allo schedule, then build it with the XLS target.
s = allo.customize(add)
code = s.build(target='xls')
# print(code)

In [4]:
# Validate add proc
code.test([(0, 0, 0), (1, 2, 3), (123, 456, 579), (2**16, 2**16, 2**17)])


[ RUN UNITTEST  ] add_test
[            OK ]





In [5]:
code.interpret()







In [6]:
code.to_ir()

package add

file_number 0 "abax/add.x"

chan add__in0(bits[32], id=0, kind=streaming, ops=receive_only, flow_control=ready_valid, strictness=proven_mutually_exclusive)
chan add__in1(bits[32], id=1, kind=streaming, ops=receive_only, flow_control=ready_valid, strictness=proven_mutually_exclusive)
chan add__out0(bits[32], id=2, kind=streaming, ops=send_only, flow_control=ready_valid, strictness=proven_mutually_exclusive)

top proc __add__add_0_next(__state: (), init={()}) {
  after_all.4: token = after_all(id=4)
  literal.3: bits[1] = literal(value=1, id=3)
  after_all.9: token = after_all(id=9)
  receive.5: (token, bits[32]) = receive(after_all.4, predicate=literal.3, channel=add__in0, id=5)
  receive.10: (token, bits[32]) = receive(after_all.9, predicate=literal.3, channel=add__in1, id=10)
  tmp0: bits[32] = tuple_index(receive.5, index=1, id=8, pos=[(0,10,15)])
  tmp1: bits[32] = tuple_index(receive.10, index=1, id=13, pos=[(0,11,15)])
  tmp2: bits[33] = zero_ext(tmp0, new_bit_count=3

In [7]:
code.opt()

package add

file_number 0 "abax/add.x"

chan add__in0(bits[32], id=0, kind=streaming, ops=receive_only, flow_control=ready_valid, strictness=proven_mutually_exclusive)
chan add__in1(bits[32], id=1, kind=streaming, ops=receive_only, flow_control=ready_valid, strictness=proven_mutually_exclusive)
chan add__out0(bits[32], id=2, kind=streaming, ops=send_only, flow_control=ready_valid, strictness=proven_mutually_exclusive)

top proc __add__add_0_next() {
  after_all.4: token = after_all(id=4)
  receive.37: (token, bits[32]) = receive(after_all.4, channel=add__in0, id=37)
  receive.38: (token, bits[32]) = receive(after_all.4, channel=add__in1, id=38)
  tok0: token = tuple_index(receive.37, index=0, id=7, pos=[(0,10,9)])
  tok1: token = tuple_index(receive.38, index=0, id=12, pos=[(0,11,9)])
  tmp0: bits[32] = tuple_index(receive.37, index=1, id=8, pos=[(0,10,15)])
  tmp1: bits[32] = tuple_index(receive.38, index=1, id=13, pos=[(0,11,15)])
  tok: token = after_all(tok0, tok1, id=18)
  tmp4__

In [8]:
code.to_vlog()

codegen_main --delay_model=sky130 --reset=rst --pipeline_stages=1 abax/add.opt.ir

module __add__add_0_next(
  input wire clk,
  input wire rst,
  input wire [31:0] add__in0,
  input wire add__in0_vld,
  input wire [31:0] add__in1,
  input wire add__in1_vld,
  input wire add__out0_rdy,
  output wire add__in0_rdy,
  output wire add__in1_rdy,
  output wire [31:0] add__out0,
  output wire add__out0_vld
);
  reg [31:0] __add__in0_reg;
  reg __add__in0_valid_reg;
  reg [31:0] __add__in1_reg;
  reg __add__in1_valid_reg;
  reg [31:0] __add__out0_reg;
  reg __add__out0_valid_reg;
  wire add__out0_valid_inv;
  wire p0_all_active_inputs_valid;
  wire add__out0_valid_load_en;
  wire add__out0_load_en;
  wire p0_stage_done;
  wire add__in0_valid_inv;
  wire add__in1_valid_inv;
  wire add__in0_valid_load_en;
  wire add__in1_valid_load_en;
  wire add__in0_load_en;
  wire add__in1_load_en;
  wire [31:0] tmp4__1;
  assign add__out0_valid_inv = ~__add__out0_valid_reg;
  assign p0_all_active_inputs_vali

In [9]:
code.flow()

## Some other examples

In [10]:
# supports both unsigned and signed integers
# Kernel: signed multiply-accumulate, (a * b) + c, on int32 operands.
def mac(a: int32, b: int32, c: int32) -> int32:
  return (a * b) + c

s = allo.customize(mac)
# Show the MLIR module for the kernel; the printed module widens the
# operands (i64 multiply, i65 add) and truncates the result back to i32.
print(s.module)
code = s.build(target='xls')
# Show the code produced by the XLS build.
print(code)
# print(code)
# code.flow()

module {
  func.func @mac(%arg0: i32, %arg1: i32, %arg2: i32) -> i32 attributes {itypes = "sss", otypes = "s"} {
    %0 = arith.extsi %arg0 : i32 to i64
    %1 = arith.extsi %arg1 : i32 to i64
    %2 = arith.muli %0, %1 : i64
    %3 = arith.extsi %2 : i64 to i65
    %4 = arith.extsi %arg2 : i32 to i65
    %5 = arith.addi %3, %4 : i65
    %6 = arith.trunci %5 : i65 to i32
    return %6 : i32
  }
}

pub proc mac {
  in0: chan<s32> in;
  in1: chan<s32> in;
  in2: chan<s32> in;
  out0: chan<s32> out;

  config(in0: chan<s32> in, in1: chan<s32> in, in2: chan<s32> in, out0: chan<s32> out) { (in0, in1, in2, out0) }

  init { () }

  next(state: ()) {
    let (tok0, tmp0) = recv(join(), in0);
    let (tok1, tmp1) = recv(join(), in1);
    let (tok2, tmp2) = recv(join(), in2);
    let tmp3 = (tmp0 as s64);
    let tmp4 = (tmp1 as s64);
    let tmp5 = (tmp3 * tmp4);
    let tmp6 = (tmp5 as sN[65]);
    let tmp7 = (tmp2 as sN[65]);
    let tmp8 = (tmp6 + tmp7);
    let tmp9 = (tmp8 as s32);
    le

In [11]:
# Validate mac proc
code.test([(2, 3, 4, 10), (5, -1, 7, 2), (-4, 5, 6, -14)])


[ RUN UNITTEST  ] mac_test
[            OK ]





In [12]:
# supports multiple outputs
# Kernel: returns the bitwise OR, AND and XOR of the two int32 inputs,
# in that order, as three separate int32 results.
def wsa(a: int32, b: int32) -> (int32, int32, int32):
  return a | b, a & b, a ^ b

# Customize and build for the XLS target.
s = allo.customize(wsa)
code = s.build(target='xls')
# print(code)
code.flow()

In [13]:
# Validate wsa proc
code.test([(a, b, a | b, a & b, a ^ b) for a, b in [(0, 0), (5, 3), (-1, 7)]])


[ RUN UNITTEST  ] wsa_test
[            OK ]





In [14]:
# supports (basic) conditional statements
# NOTE: defined at notebook scope, this name shadows Python's built-in
# max() for every cell executed afterwards.
def max(a: int32, b: int32) -> int32:
  # Conditional expression: yields a when a > b, otherwise b.
  return a if (a > b) else b

# Customize and build for the XLS target.
s = allo.customize(max)
code = s.build(target='xls')
# print(code)
# code.flow()

In [15]:
# Validate max proc
code.test([(-3, -7, -3), (10, 4, 10), (5, 9, 9)])


[ RUN UNITTEST  ] max_test
[            OK ]





In [16]:
# supports arithmetic with a constant operand
# Kernel: returns the int32 input incremented by 1.
def incr(a: int32) -> int32:
  return a + 1

# Customize and build for the XLS target.
s = allo.customize(incr)
code = s.build(target='xls')
# print(code)
# code.flow()

In [17]:
# Validate incr proc
code.test([(-1, 0), (0, 1), (41, 42), (2**10, 2**10 + 1)])


[ RUN UNITTEST  ] incr_test
[            OK ]





In [18]:
# Generate XLS/DSLX code for fact function
# Kernel: iterative factorial with a single accumulator. The loop trip
# count is the runtime input a, so for a == 0 the body never runs and the
# result is the initial value 1.
def fact(a: int32) -> int32:
  acc: int32 = 1
  for i in range(a):
    # i counts from 0, so multiply by i + 1 to cover the factors 1..a.
    acc *= (i + 1)
  return acc

s = allo.customize(fact)
# print(s.module)
code = s.build(target='xls')
# print(code)
code.interpret()







In [19]:
# Validate fact proc
code.test([(0, 1), (1, 1), (5, 120), (7, 5040), (10, 3628800)])


[ RUN UNITTEST  ] fact_test
[            OK ]





In [20]:
# Generate XLS/DSLX code for fibonacci function
# This tests multiple accumulators (prev, curr)
# The result is curr after n updates starting from (prev, curr) = (0, 1),
# so n = 0 and n = 1 both return 1.
def fib(n: int32) -> int32:
  prev: int32 = 0
  curr: int32 = 1
  for i in range(n):
    # Compute the next term first, then shift the (prev, curr) window.
    next_val: int32 = prev + curr
    prev = curr
    curr = next_val
  return curr

s = allo.customize(fib)
code = s.build(target='xls')
code.interpret()
# validate fib proc
# Each tuple is (n, expected result).
code.test([(0, 1), (1, 1), (5, 8), (10, 89)])




[ RUN UNITTEST  ] fib_test
[            OK ]





In [21]:
# WHILE loop example: Count steps until n becomes 1 (Collatz-like)
# Simplified: divide by 2 if even, subtract 1 if odd, count steps
# For n <= 1 the loop never executes and the result is 0.
def count_steps(n: int32) -> int32:
  steps: int32 = 0
  # Working copy of the input that the loop reduces towards 1.
  val: int32 = n
  while val > 1:
    if val % 2 == 0:
      # Even: halve the value.
      val = val // 2
    else:
      # Odd: make it even by subtracting one.
      val = val - 1
    # One step is counted per loop iteration, whichever branch ran.
    steps = steps + 1
  return steps

s = allo.customize(count_steps)
# print("=== MLIR for count_steps (WHILE loop) ===")
# print(s.module)
# print("\n=== Building DSLX ===")
code = s.build(target='xls')
# Show the code produced by the XLS build.
print(code)
code.interpret()

pub proc count_steps {
  in0: chan<s32> in;
  out0: chan<s32> out;

  config(in0: chan<s32> in, out0: chan<s32> out) { (in0, out0) }

  init { (0, 0, true, false) }

  next(state: (s32, s32, bool, bool)) {
    let (acc0, acc1, index0, busy) = state;
    let (tok0, tmp0) = recv_if(join(), in0, !busy, acc0);
    let tmp1 = if (!busy) { tmp0 } else { acc1 };
    let tmp2 = (tmp1 > s32:1);
    let tmp3 = tmp2;
    let tmp4 = (tmp1 % s32:2);
    let tmp5 = (tmp4 == s32:0);
    let tmp6 = (tmp1 / s32:2);
    let tmp7 = tmp6;
    let tmp8 = (tmp1 as s33);
    let tmp9 = (tmp8 - s33:1);
    let tmp10 = (tmp9 as s32);
    let tmp11 = tmp10;
    let tmp12 = if (tmp5) { tmp7 } else { tmp11 };
    let tmp13 = (acc0 as s33);
    let tmp14 = (tmp13 + s33:1);
    let tmp15 = (tmp14 as s32);
    let tmp16 = tmp15;
    let tmp17 = if (tmp3) { tmp16 } else { acc0 };
    let tmp18 = if (tmp3) { tmp12 } else { tmp1 };
    let tmp19 = !tmp3;
    let tok1 = send_if(tok0, out0, tmp19, tmp17);
    let tmp20 =



In [22]:
# Validate count_steps proc
code.test([(1, 0), (2, 1), (4, 2), (7, 4), (16, 4)])


[ RUN UNITTEST  ] count_steps_test
[            OK ]





In [23]:
# GCD using WHILE loop (Euclidean algorithm)
# Two loop-carried values (x, y) are updated per iteration; the loop stops
# once y is no longer positive and x is returned.
def gcd(a: int32, b: int32) -> int32:
  x: int32 = a
  y: int32 = b
  while y > 0:
    # Save y before it is overwritten so it can become the new x.
    temp: int32 = y
    y = x % y
    x = temp
  return x

s = allo.customize(gcd)
# print("=== MLIR for gcd (WHILE loop with two state vars) ===")
# print(s.module)
# print("\n=== Building DSLX ===")
code = s.build(target='xls')
# print(code)
code.interpret()







In [24]:
# Validate gcd proc
code.test([(12, 8, 4), (48, 18, 6), (17, 13, 1), (100, 25, 25), (7, 7, 7)])


[ RUN UNITTEST  ] gcd_test
[            OK ]





In [25]:
# vector-vector add
# Element-wise sum of two 16-element int32 vectors into a new vector c.

def vvadd(a: int32[16], b: int32[16]) -> int32[16]:
  # Result buffer, initialised to 0 (appears as linalg.fill in the
  # printed module).
  c: int32[16] = 0
  for i in range(16):
    c[i] = a[i] + b[i]
  return c

s = allo.customize(vvadd)
# Show the MLIR module for the kernel.
print(s.module)
code = s.build(target='xls')
# Show the code produced by the XLS build.
print(code)
code.interpret()
# code.to_ir(False)
# code.opt()

module {
  func.func @vvadd(%arg0: memref<16xi32>, %arg1: memref<16xi32>) -> memref<16xi32> attributes {itypes = "ss", otypes = "s"} {
    %c0_i32 = arith.constant 0 : i32
    %c0_i32_0 = arith.constant 0 : i32
    %alloc = memref.alloc() {name = "c"} : memref<16xi32>
    linalg.fill ins(%c0_i32_0 : i32) outs(%alloc : memref<16xi32>)
    affine.for %arg2 = 0 to 16 {
      %0 = affine.load %arg0[%arg2] {from = "a"} : memref<16xi32>
      %1 = affine.load %arg1[%arg2] {from = "b"} : memref<16xi32>
      %2 = arith.extsi %0 : i32 to i33
      %3 = arith.extsi %1 : i32 to i33
      %4 = arith.addi %2, %3 : i33
      %5 = arith.trunci %4 : i33 to i32
      affine.store %5, %alloc[%arg2] {to = "c"} : memref<16xi32>
    } {loop_name = "i", op_name = "S_i_0"}
    return %alloc : memref<16xi32>
  }
}

// Simple dual-port RAM model with independent read and write ports.
// Parameterized on address width, data width, and depth.
// Reads observe the state before a concurrent write (read-before-wri



In [28]:
# Run the remaining flow for the vvadd build: convert to XLS IR, optimise
# it, then generate Verilog using the sky130 delay model with 3 pipeline
# stages and a RAM latency of 1.
# verbose=False presumably suppresses the printed IR/Verilog - TODO confirm.
code.to_ir(verbose=False)
code.opt(verbose=False)
code.to_vlog(ram_latency=1, pipeline_stages=3, delay_model="sky130", verbose=False)

In [None]:
import numpy as np

# Validate vvadd memory-based proc
# The kernel under test is declared as
#   vvadd(a: int32[16], b: int32[16]) -> int32[16]
# so the input and expected vectors must hold exactly 16 elements.
vec_len = 16
# a = [0, 1, 2, ...], b = [0, 2, 4, ...]; expected is their element-wise sum.
vec_a = np.arange(vec_len, dtype=np.int32)
vec_b = np.arange(vec_len, dtype=np.int32) * 2
expected = vec_a + vec_b
code.test(vec_a, vec_b, expected)


[ RUN UNITTEST  ] vvadd_test
[            OK ]





In [None]:
# Matrix-vector product C = A @ x for an N x N int32 matrix and a
# length-N int32 vector; N is a type parameter of the kernel.
def mv[N](A: int32[N, N], x: int32[N]) -> int32[N]:
  C: int32[N] = 0
  for i in range(N):
    # Dot product of row i of A with x, accumulated in a scalar.
    acc: int32 = 0
    for j in range(N):
      acc += A[i, j] * x[j]
    C[i] = acc
  return C

# instantiate=[4] supplies the value for the type parameter N.
mv_sched = allo.customize(mv, instantiate=[4])
mv_code = mv_sched.build(target='xls')
mv_code.interpret()







In [None]:
# Validate mv matvec
mat4 = (np.arange(16, dtype=np.int32).reshape(4, 4) - 3)
vec4 = np.array([1, -2, 3, -4], dtype=np.int32)
expected_vec = mat4 @ vec4
mv_code.test(mat4, vec4, expected_vec)



[ RUN UNITTEST  ] mv_test
[            OK ]





In [None]:
# Matrix-matrix product C = A @ B for N x N int32 matrices; N is a type
# parameter of the kernel.
def mm[N](A: int32[N, N], B: int32[N, N]) -> int32[N, N]:
  C: int32[N, N] = 0
  # allo.grid(N, N) yields the (i, j) index pairs of the output matrix.
  for i, j in allo.grid(N, N):
    # Dot product of row i of A with column j of B.
    acc: int32 = 0
    for k in range(N):
      acc += A[i, k] * B[k, j]
    C[i, j] = acc
  return C

# instantiate=[4] supplies the value for the type parameter N.
mm_sched = allo.customize(mm, instantiate=[4])
mm_code = mm_sched.build(target='xls')
mm_code.interpret()
# Show the code produced by the XLS build.
print(mm_code)



// Simple dual-port RAM model with independent read and write ports.
// Parameterized on address width, data width, and depth.
// Reads observe the state before a concurrent write (read-before-write).
// Writes always update a full word.

pub struct SimpleReadReq<ADDR_WIDTH: u32> {
  addr: uN[ADDR_WIDTH],
}

pub struct SimpleReadResp<DATA_WIDTH: u32> {
  data: uN[DATA_WIDTH],
}

pub struct SimpleWriteReq<ADDR_WIDTH: u32, DATA_WIDTH: u32> {
  addr: uN[ADDR_WIDTH],
  data: uN[DATA_WIDTH],
}

pub struct SimpleWriteResp {}

pub proc Simple1R1WRam<ADDR_WIDTH: u32, DATA_WIDTH: u32, SIZE: u32> {
  read_req: chan<SimpleReadReq<ADDR_WIDTH>> in;
  read_resp: chan<SimpleReadResp<DATA_WIDTH>> out;
  write_req: chan<SimpleWriteReq<ADDR_WIDTH, DATA_WIDTH>> in;
  write_resp: chan<SimpleWriteResp> out;

  config(read_req: chan<SimpleReadReq<ADDR_WIDTH>> in,
         read_resp: chan<SimpleReadResp<DATA_WIDTH>> out,
         write_req: chan<SimpleWriteReq<ADDR_WIDTH, DATA_WIDTH>> in,
         write_re

In [None]:
# Validate mm matrix multiply
mat_a = (np.arange(16, dtype=np.int32).reshape(4, 4) - 1)
mat_b = (np.arange(16, dtype=np.int32).reshape(4, 4) + 2)
expected_mat = mat_a @ mat_b
mm_code.test(mat_a, mat_b, expected_mat)


[ RUN UNITTEST  ] mm_test
[            OK ]





In [None]:
# Run the remaining flow for the mm build: convert to XLS IR, optimise it,
# then generate Verilog using the sky130 delay model with 9 pipeline
# stages and a RAM latency of 4.
# verbose=False presumably suppresses the printed IR/Verilog - TODO confirm.
mm_code.to_ir(verbose=False)
mm_code.opt(verbose=False)
mm_code.to_vlog(ram_latency=4, pipeline_stages=9, delay_model="sky130", verbose=False)

In [None]:
from allo.ir.types import float32

# Floating-point variant of the first example. This rebinds the notebook
# names add, s and code, replacing the earlier uint32 version.
def add(a: float32, b: float32) -> float32:
  return a + b

s = allo.customize(add)
# Show the MLIR module; the addition appears as arith.addf on f32.
print(s.module)
code = s.build(target='xls')
# Show the code produced by the XLS build; the printed proc performs the
# addition with apfloat::add.
print(code)
code.interpret()
code.flow()

module {
  func.func @add(%arg0: f32, %arg1: f32) -> f32 attributes {itypes = "__", otypes = "_"} {
    %0 = arith.addf %arg0, %arg1 : f32
    return %0 : f32
  }
}

import apfloat;

pub const F32_EXP_SZ = u32:8;
pub const F32_FRAC_SZ = u32:23;
pub type F32 = apfloat::APFloat<F32_EXP_SZ, F32_FRAC_SZ>;


pub proc add {
  in0: chan<F32> in;
  in1: chan<F32> in;
  out0: chan<F32> out;

  config(in0: chan<F32> in, in1: chan<F32> in, out0: chan<F32> out) { (in0, in1, out0) }

  init { () }

  next(state: ()) {
    let (tok0, tmp0) = recv(join(), in0);
    let (tok1, tmp1) = recv(join(), in1);
    let tmp2 = apfloat::add(tmp0, tmp1);
    let tok = join(tok0, tok1);
    send(tok, out0, tmp2);
  }
}




In [None]:
# Validate floating point add proc
code.test([
    (1.0, 2.0, 3.0), (0.0, 5.0, 5.0), (3.0, 0.0, 3.0), 
    (-1.5, 2.5, 1.0), (-5.0, 2.0, -3.0), (1.5, 2.5, 4.0),
    (10.5, -3.2, 7.3), (-10.0, -5.0, -15.0)
])



[ RUN UNITTEST  ] add_test
[            OK ]


[ RUN UNITTEST  ] add_test
[            OK ]


[ RUN UNITTEST  ] add_test
[            OK ]


[ RUN UNITTEST  ] add_test
[            OK ]


[ RUN UNITTEST  ] add_test
[            OK ]


[ RUN UNITTEST  ] add_test
[            OK ]


[ RUN UNITTEST  ] add_test
[            OK ]


[ RUN UNITTEST  ] add_test
[            OK ]



In [None]:
# Kernel with a fixed-trip-count loop (10 iterations), used to exercise
# loop unrolling: the result is the input plus 10.
def useless(a: int32) -> int32:
  b: int32 = a
  for i in range(10):
    b = b + 1
  return b

# NOTE(review): useless declares no type parameters (compare mv[N] / mm[N]),
# so instantiate=[int32] looks unnecessary here - TODO confirm and drop.
s = allo.customize(useless, instantiate=[int32])
# Unroll the loop identified by the name "i" before building.
s.unroll("i")
code = s.build(target='xls')
code.interpret()







In [None]:
# Validate useless proc (unrolled for loop test)
code.test([(5, 15), (0, 10), (100, 110), (-5, 5)])


[ RUN UNITTEST  ] useless_test
[            OK ]


[ RUN UNITTEST  ] useless_test
[            OK ]


[ RUN UNITTEST  ] useless_test
[            OK ]


[ RUN UNITTEST  ] useless_test
[            OK ]



In [None]:
# floating-point vector-vector add (32 elements)
# Rebinds the notebook names vvadd, s and code from the int32 version.

def vvadd(a: float32[32], b: float32[32]) -> float32[32]:
  # The integer literal 0 initialises a float32 buffer; the printed module
  # converts it with arith.sitofp before the linalg.fill.
  c: float32[32] = 0
  for i in range(32):
    c[i] = a[i] + b[i]
  return c

s = allo.customize(vvadd)
# Show the MLIR module for the kernel.
print(s.module)
code = s.build(target='xls')
# Show the code produced by the XLS build.
print(code)
code.interpret()
# code.to_ir(False)
# code.opt()

module {
  func.func @vvadd(%arg0: memref<32xf32>, %arg1: memref<32xf32>) -> memref<32xf32> attributes {itypes = "__", otypes = "_"} {
    %c0_i32 = arith.constant 0 : i32
    %c0_i32_0 = arith.constant 0 : i32
    %c0_i32_1 = arith.constant 0 : i32
    %c0_i32_2 = arith.constant 0 : i32
    %0 = arith.sitofp %c0_i32_2 : i32 to f32
    %alloc = memref.alloc() {name = "c"} : memref<32xf32>
    linalg.fill ins(%0 : f32) outs(%alloc : memref<32xf32>)
    affine.for %arg2 = 0 to 32 {
      %1 = affine.load %arg0[%arg2] {from = "a"} : memref<32xf32>
      %2 = affine.load %arg1[%arg2] {from = "b"} : memref<32xf32>
      %3 = arith.addf %1, %2 : f32
      affine.store %3, %alloc[%arg2] {to = "c"} : memref<32xf32>
    } {loop_name = "i", op_name = "S_i_0"}
    return %alloc : memref<32xf32>
  }
}

import apfloat;

pub const F32_EXP_SZ = u32:8;
pub const F32_FRAC_SZ = u32:23;
pub type F32 = apfloat::APFloat<F32_EXP_SZ, F32_FRAC_SZ>;


// Simple dual-port RAM model with independent read and wri

