c-mlir

A translator from C to MLIR

Only a subset of C is supported, based on the current semantics of the MLIR dialects:

  • Scalar builtin types -> MLIR builtin types
  • Statically sized arrays and their operations -> MemRef dialect
  • Arithmetic operations -> Arith dialect
  • A limited form of control flow (for/if) -> Affine/SCF dialects
  • Math function calls -> Math dialect (see the sketch after this list)
  • 1D operations -> Vector dialect
  • 2D operations -> Linalg dialect
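
The Math dialect mapping is not exercised by the examples below. A minimal sketch of the idea, assuming sqrtf is among the recognized calls (the README does not list the supported math functions, so this is an assumption rather than tool output):

float sqrtf(float);

void sqrt_example() {
  float x[1];
  // A libm-style call such as sqrtf is expected to be translated to the
  // corresponding Math dialect op (e.g. math.sqrt). Hypothetical example.
  x[0] = sqrtf(x[0]);
}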

A for loop is lowered to affine.for when possible; otherwise it falls back to scf.for, and failing that to scf.while.
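
As an illustration (a sketch of the expected behavior, not output from the tool): the GEMM loops below have affine bounds and become affine.for, while a loop like the following has a non-affine induction-variable update and would be expected to fall back to the SCF dialect:

void non_affine_loop(int n) {
  float a[4];
  // The multiplicative step means this loop cannot be an affine.for;
  // it is expected to lower to scf.for or scf.while instead (assumption).
  for (int i = 1; i < n; i *= 2) {
    a[0] += 1.0f;
  }
}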

For OpenCL, __local is mapped to memory space 1 and __global to memory space 2.
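
A minimal sketch of that mapping, distilled from the GEMM kernels below (the kernel name and shapes here are made up; the type mapping follows those examples):

__kernel void copy_one(__global float *src,     // -> memref<?xf32, 2>
                       __local  float dst[8]) { // -> memref<8xf32, 1>
  dst[0] = src[0];
}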

Dynamically sized arrays, pointer arithmetic, break, continue, goto, and switch/case are not supported.
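
For instance, the following (hypothetical) function uses several of the unsupported constructs and would be rejected:

void rejected(int n, float *p) {
  float a[n];                      // dynamically sized array
  p = p + 1;                       // pointer arithmetic
  for (int i = 0; i < n; ++i) {
    if (i == 2) break;             // break
  }
}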

Basic matmul example

#define N 1
#define M 2
#define K 3
void matmul() {
  float lhs[N][K];
  float rhs[K][M];
  float output[N][M];
  for (int i=0; i<N; ++i) {
    for (int j=0; j<M; ++j) {
      output[i][j] = 0.0;
      for (int k=0; k<K; ++k) {
        output[i][j] += lhs[i][k] * rhs[k][j];
      }
    }
  }
}

Output IR as below:

module  {
  func @matmul() attributes {llvm.emit_c_interface} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = memref.alloca() : memref<1x3xf32>
    %1 = memref.alloca() : memref<3x2xf32>
    %2 = memref.alloca() : memref<1x2xf32>
    affine.for %arg0 = 0 to 1 {
      affine.for %arg1 = 0 to 2 {
        affine.store %cst, %2[%arg0, %arg1] : memref<1x2xf32>
        affine.for %arg2 = 0 to 3 {
          %3 = affine.load %2[%arg0, %arg1] : memref<1x2xf32>
          %4 = affine.load %0[%arg0, %arg2] : memref<1x3xf32>
          %5 = affine.load %1[%arg2, %arg1] : memref<3x2xf32>
          %6 = arith.mulf %4, %5 : f32
          %7 = arith.addf %3, %6 : f32
          affine.store %7, %2[%arg0, %arg1] : memref<1x2xf32>
        }
      }
    }
    return
  }
}

OpenCL GEMM example

int get_global_id(int);

__kernel void GEMM(const int M, const int N, const int K,
                   const __global float* A,
                   const __global float* B,
                   __global float* C) {
    
    // Thread identifiers
    const int globalRow = get_global_id(0); // Row ID of C (0..M)
    const int globalCol = get_global_id(1); // Col ID of C (0..N)
 
    // Compute a single element (loop over K)
    float acc = 0.0f;
    for (int k=0; k<K; k++) {
        acc += A[k*M + globalRow] * B[globalCol*K + k];
    }
 
    // Store the result
    C[globalCol*M + globalRow] = acc;
}

Output IR as below:

module  {
  func private @get_global_id(i32) -> i32
  func @GEMM(%arg0: i32, %arg1: i32, %arg2: i32, %arg3: memref<?xf32, 2>, %arg4: memref<?xf32, 2>, %arg5: memref<?xf32, 2>) attributes {cl.kernel = true, llvm.emit_c_interface} {
    %cst = arith.constant 0.000000e+00 : f32
    %c1_i32 = arith.constant 1 : i32
    %c0_i32 = arith.constant 0 : i32
    %0 = arith.index_cast %arg0 : i32 to index
    %1 = arith.index_cast %arg2 : i32 to index
    %2 = call @get_global_id(%c0_i32) : (i32) -> i32
    %3 = arith.index_cast %2 : i32 to index
    %4 = call @get_global_id(%c1_i32) : (i32) -> i32
    %5 = arith.index_cast %4 : i32 to index
    %6 = memref.alloca() : memref<f32>
    affine.store %cst, %6[] : memref<f32>
    affine.for %arg6 = 0 to %1 {
      %8 = affine.load %6[] : memref<f32>
      %9 = affine.load %arg3[%arg6 * symbol(%0) + symbol(%3)] : memref<?xf32, 2>
      %10 = affine.load %arg4[%arg6 + symbol(%5) * symbol(%1)] : memref<?xf32, 2>
      %11 = arith.mulf %9, %10 : f32
      %12 = arith.addf %8, %11 : f32
      affine.store %12, %6[] : memref<f32>
    }
    %7 = affine.load %6[] : memref<f32>
    affine.store %7, %arg5[symbol(%5) * symbol(%0) + symbol(%3)] : memref<?xf32, 2>
    return
  }
}

OpenCL GEMM with local memory

#define TS 10
#define CLK_LOCAL_MEM_FENCE 1

int get_local_id(int);
int get_group_id(int);
void barrier(int);

// Tiled and coalesced version
__kernel void GEMM(const int M, const int N, const int K,
                   const __global float* A,
                   const __global float* B,
                   __global float* C,
                   // Local memory to fit a tile of TS*TS elements of A and B
                   __local float Asub[TS][TS],
                   __local float Bsub[TS][TS]) {
    
    // Thread identifiers
    const int row = get_local_id(0); // Local row ID (max: TS)
    const int col = get_local_id(1); // Local col ID (max: TS)
    const int globalRow = TS*get_group_id(0) + row; // Row ID of C (0..M)
    const int globalCol = TS*get_group_id(1) + col; // Col ID of C (0..N)
 
 
    // Initialise the accumulation register
    float acc = 0.0f;
    
    // Loop over all tiles
    const int numTiles = K/TS;
    for (int t=0; t<numTiles; t++) {
 
        // Load one tile of A and B into local memory
        const int tiledRow = TS*t + row;
        const int tiledCol = TS*t + col;
        Asub[col][row] = A[tiledCol*M + globalRow];
        Bsub[col][row] = B[globalCol*K + tiledRow];
 
        // Synchronise to make sure the tile is loaded
        barrier(CLK_LOCAL_MEM_FENCE);
 
        // Perform the computation for a single tile
        for (int k=0; k<TS; k++) {
            acc += Asub[k][row] * Bsub[col][k];
        }
 
        // Synchronise before loading the next tile
        barrier(CLK_LOCAL_MEM_FENCE);
    }
 
    // Store the final result in C
    C[globalCol*M + globalRow] = acc;
}

Output IR as below:

#map = affine_map<()[s0] -> (s0 floordiv 10)>
module  {
  func private @get_local_id(i32) -> i32
  func private @get_group_id(i32) -> i32
  func private @barrier(i32)
  func @GEMM(%arg0: i32, %arg1: i32, %arg2: i32, %arg3: memref<?xf32, 2>, %arg4: memref<?xf32, 2>, %arg5: memref<?xf32, 2>, %arg6: memref<10x10xf32, 1>, %arg7: memref<10x10xf32, 1>) attributes {cl.kernel = true, llvm.emit_c_interface} {
    %c10_i32 = arith.constant 10 : i32
    %cst = arith.constant 0.000000e+00 : f32
    %c1_i32 = arith.constant 1 : i32
    %c0_i32 = arith.constant 0 : i32
    %0 = arith.index_cast %arg0 : i32 to index
    %1 = arith.index_cast %arg2 : i32 to index
    %2 = call @get_local_id(%c0_i32) : (i32) -> i32
    %3 = arith.index_cast %2 : i32 to index
    %4 = call @get_local_id(%c1_i32) : (i32) -> i32
    %5 = arith.index_cast %4 : i32 to index
    %6 = call @get_group_id(%c0_i32) : (i32) -> i32
    %7 = arith.muli %6, %c10_i32 : i32
    %8 = arith.addi %7, %2 : i32
    %9 = arith.index_cast %8 : i32 to index
    %10 = call @get_group_id(%c1_i32) : (i32) -> i32
    %11 = arith.muli %10, %c10_i32 : i32
    %12 = arith.addi %11, %4 : i32
    %13 = arith.index_cast %12 : i32 to index
    %14 = memref.alloca() : memref<1xf32>
    affine.store %cst, %14[0] : memref<1xf32>
    affine.for %arg8 = 0 to #map()[%1] {
      %16 = affine.load %arg3[(%arg8 * 10 + symbol(%5)) * symbol(%0) + symbol(%9)] : memref<?xf32, 2>
      affine.store %16, %arg6[symbol(%5), symbol(%3)] : memref<10x10xf32, 1>
      %17 = affine.load %arg4[%arg8 * 10 + symbol(%3) + symbol(%13) * symbol(%1)] : memref<?xf32, 2>
      affine.store %17, %arg7[symbol(%5), symbol(%3)] : memref<10x10xf32, 1>
      call @barrier(%c1_i32) : (i32) -> ()
      affine.for %arg9 = 0 to 10 {
        %18 = affine.load %14[0] : memref<1xf32>
        %19 = affine.load %arg6[%arg9, symbol(%3)] : memref<10x10xf32, 1>
        %20 = affine.load %arg7[symbol(%5), %arg9] : memref<10x10xf32, 1>
        %21 = arith.mulf %19, %20 : f32
        %22 = arith.addf %18, %21 : f32
        affine.store %22, %14[0] : memref<1xf32>
      }
      call @barrier(%c1_i32) : (i32) -> ()
    }
    %15 = affine.load %14[0] : memref<1xf32>
    affine.store %15, %arg5[symbol(%13) * symbol(%0) + symbol(%9)] : memref<?xf32, 2>
    return
  }
}

Vector type example

typedef int int2 __attribute__((__ext_vector_type__(2)));

void foo() {
  int2 v0;
  int v1[2];
  vload(v1[0], &v0);
  v0 += v0;
  vstore(v0, v1[0]);
}

Output IR as below:

module  {
  func @foo() attributes {llvm.emit_c_interface} {
    %c0 = arith.constant 0 : index
    %0 = memref.alloca() : memref<1xvector<2xi32>>
    %1 = memref.alloca() : memref<2xi32>
    %2 = vector.load %1[%c0] : memref<2xi32>, vector<2xi32>
    affine.store %2, %0[0] : memref<1xvector<2xi32>>
    %3 = affine.load %0[0] : memref<1xvector<2xi32>>
    %4 = affine.load %0[0] : memref<1xvector<2xi32>>
    %5 = arith.addi %3, %4 : vector<2xi32>
    affine.store %5, %0[0] : memref<1xvector<2xi32>>
    %6 = affine.load %0[0] : memref<1xvector<2xi32>>
    vector.store %6, %1[%c0] : memref<2xi32>, vector<2xi32>
    return
  }
}

DMA example

void foo() {
  int src[2];
  int dst[2];
  int tag[1];
  dma_start(src[0], dst[0], tag[0], 2);
  dma_wait(tag[0], 2);
}

Output IR as below:

module  {
  func @foo() attributes {llvm.emit_c_interface} {
    %c2 = arith.constant 2 : index
    %c0 = arith.constant 0 : index
    %0 = memref.alloca() : memref<2xi32>
    %1 = memref.alloca() : memref<2xi32>
    %2 = memref.alloca() : memref<1xi32>
    affine.dma_start %0[%c0], %1[%c0], %2[%c0], %c2 : memref<2xi32>, memref<2xi32>, memref<1xi32>
    affine.dma_wait %2[%c0], %c2 : memref<1xi32>
    return
  }
}

Builtin Conv2D example

void foo() {
  float lhs[3][4][5][6];
  float rhs[1][4][1][1];
  float output[3][1][5][6];
  conv_2d_nchw_fchw(lhs, rhs, output, 1, 1, 1, 1);
}

Output IR as below:

module  {
  func @foo() attributes {llvm.emit_c_interface} {
    %0 = memref.alloca() : memref<3x4x5x6xf32>
    %1 = memref.alloca() : memref<1x4x1x1xf32>
    %2 = memref.alloca() : memref<3x1x5x6xf32>
    linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%0, %1 : memref<3x4x5x6xf32>, memref<1x4x1x1xf32>) outs(%2 : memref<3x1x5x6xf32>)
    return
  }
}

OpenCL example with linalg.matmul

#define TS 10
#define CLK_LOCAL_MEM_FENCE 1

int get_local_id(int);
int get_group_id(int);
void barrier(int);

// Tiled and coalesced version
__kernel void GEMM(const int M, const int N, const int K,
                   const __global float* A,
                   const __global float* B,
                   __global float* C,
                   // Local memory to fit a tile of TS*TS elements of A and B
                   __local float Asub[TS][TS],
                   __local float Bsub[TS][TS]) {
    
    // Thread identifiers
    const int row = get_local_id(0); // Local row ID (max: TS)
    const int col = get_local_id(1); // Local col ID (max: TS)
    const int globalRow = TS*get_group_id(0) + row; // Row ID of C (0..M)
    const int globalCol = TS*get_group_id(1) + col; // Col ID of C (0..N)
 
 
    // Initialise the accumulation register
    float acc[1][1];
    
    // Loop over all tiles
    const int numTiles = K/TS;
    for (int t=0; t<numTiles; t++) {
 
        // Load one tile of A and B into local memory
        const int tiledRow = TS*t + row;
        const int tiledCol = TS*t + col;
        Asub[row][col] = A[tiledCol*M + globalRow];
        Bsub[col][row] = B[globalCol*K + tiledRow];
 
        // Synchronise to make sure the tile is loaded
        barrier(CLK_LOCAL_MEM_FENCE);
 
        // Perform the computation for a single tile
        matmul((float [1][TS])Bsub[col][0], (float [TS][1])Asub[row][0], acc);
 
        // Synchronise before loading the next tile
        barrier(CLK_LOCAL_MEM_FENCE);
    }
 
    // Store the final result in C
    C[globalCol*M + globalRow] = acc[0][0];
}

Output IR as below:

#map0 = affine_map<()[s0] -> (s0 floordiv 10)>
#map1 = affine_map<(d0, d1)[s0, s1] -> (d1 + s0 + d0 * s1)>
module  {
  func private @get_local_id(i32) -> i32
  func private @get_group_id(i32) -> i32
  func private @barrier(i32)
  func @GEMM(%arg0: i32, %arg1: i32, %arg2: i32, %arg3: memref<?xf32, 2>, %arg4: memref<?xf32, 2>, %arg5: memref<?xf32, 2>, %arg6: memref<10x10xf32, 1>, %arg7: memref<10x10xf32, 1>) attributes {cl.kernel = true, llvm.emit_c_interface} {
    %c0_i32 = arith.constant 0 : i32
    %c1 = arith.constant 1 : index
    %c10 = arith.constant 10 : index
    %c10_i32 = arith.constant 10 : i32
    %c1_i32 = arith.constant 1 : i32
    %0 = arith.index_cast %arg0 : i32 to index
    %1 = arith.index_cast %arg2 : i32 to index
    %2 = call @get_local_id(%c0_i32) : (i32) -> i32
    %3 = arith.index_cast %2 : i32 to index
    %4 = call @get_local_id(%c1_i32) : (i32) -> i32
    %5 = arith.index_cast %4 : i32 to index
    %6 = call @get_group_id(%c0_i32) : (i32) -> i32
    %7 = arith.muli %6, %c10_i32 : i32
    %8 = arith.addi %7, %2 : i32
    %9 = arith.index_cast %8 : i32 to index
    %10 = call @get_group_id(%c1_i32) : (i32) -> i32
    %11 = arith.muli %10, %c10_i32 : i32
    %12 = arith.addi %11, %4 : i32
    %13 = arith.index_cast %12 : i32 to index
    %14 = memref.alloca() : memref<1x1xf32>
    affine.for %arg8 = 0 to #map0()[%1] {
      %16 = affine.load %arg3[(%arg8 * 10 + symbol(%5)) * symbol(%0) + symbol(%9)] : memref<?xf32, 2>
      affine.store %16, %arg6[symbol(%3), symbol(%5)] : memref<10x10xf32, 1>
      %17 = affine.load %arg4[%arg8 * 10 + symbol(%3) + symbol(%13) * symbol(%1)] : memref<?xf32, 2>
      affine.store %17, %arg7[symbol(%5), symbol(%3)] : memref<10x10xf32, 1>
      call @barrier(%c1_i32) : (i32) -> ()
      %18 = arith.index_cast %4 : i32 to index
      %19 = arith.muli %18, %c10 : index
      %20 = memref.reinterpret_cast %arg7 to offset: [%19], sizes: [1, 10], strides: [%c10, 1] : memref<10x10xf32, 1> to memref<1x10xf32, #map1, 1>
      %21 = arith.index_cast %2 : i32 to index
      %22 = arith.muli %21, %c10 : index
      %23 = memref.reinterpret_cast %arg6 to offset: [%22], sizes: [10, 1], strides: [%c1, 1] : memref<10x10xf32, 1> to memref<10x1xf32, #map1, 1>
      linalg.matmul ins(%20, %23 : memref<1x10xf32, #map1, 1>, memref<10x1xf32, #map1, 1>) outs(%14 : memref<1x1xf32>)
      call @barrier(%c1_i32) : (i32) -> ()
    }
    %15 = affine.load %14[0, 0] : memref<1x1xf32>
    affine.store %15, %arg5[symbol(%13) * symbol(%0) + symbol(%9)] : memref<?xf32, 2>
    return
  }
}

Launch example

__kernel void foo(__global float *input) {}

void launch(const char *, int, int, char **);

void main() {
    char *inputs[1];
    inputs[0] = malloc(10);
    launch("foo", 1, 1, (char **)inputs);
    free(inputs[0]);
}

Output IR as below:

module  {
  func private @launch(memref<?xi8>, i32, i32, memref<?xmemref<?xi8>>)
  func @foo(%arg0: memref<?xf32, 2>) attributes {cl.kernel = true, llvm.emit_c_interface} {
    return
  }
  func @main() attributes {llvm.emit_c_interface} {
    %c0 = arith.constant 0 : index
    %c1_i32 = arith.constant 1 : i32
    %cst = arith.constant dense<[102, 111, 111]> : vector<3xi8>
    %0 = memref.alloca() : memref<1xmemref<?xi8>>
    %1 = memref.alloc() : memref<10xi8>
    %2 = memref.cast %1 : memref<10xi8> to memref<?xi8>
    affine.store %2, %0[0] : memref<1xmemref<?xi8>>
    %3 = memref.alloca() : memref<3xi8>
    %4 = memref.cast %3 : memref<3xi8> to memref<?xi8>
    vector.store %cst, %3[%c0] : memref<3xi8>, vector<3xi8>
    %5 = memref.cast %0 : memref<1xmemref<?xi8>> to memref<?xmemref<?xi8>>
    call @launch(%4, %c1_i32, %c1_i32, %5) : (memref<?xi8>, i32, i32, memref<?xmemref<?xi8>>) -> ()
    %6 = affine.load %0[0] : memref<1xmemref<?xi8>>
    memref.dealloc %6 : memref<?xi8>
    return
  }
}

Install

Install stack (via GHCup)

curl https://get-ghcup.haskell.org -sSf | sh

Run tests

stack test

Translate a C file

stack run -- test.c

Translate to LLVM IR

stack run -- test.c --llvm

Dump IR with locations

stack run -- test.c --loc

JIT run

stack run -- test.c --jit=func_name
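
For example, assuming test.c contains the matmul example from above, its kernel can be JIT-executed with:

stack run -- test.c --jit=matmul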
