Skip to content

Commit

Permalink
[CUDA] Enable shared memory for CUDA (#5429)
Browse files Browse the repository at this point in the history
* Implement shared memory for CUDA

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
turbo0628 and pre-commit-ci[bot] committed Jul 20, 2022
1 parent 5f2e607 commit 7e74c76
Show file tree
Hide file tree
Showing 10 changed files with 112 additions and 12 deletions.
9 changes: 9 additions & 0 deletions python/taichi/lang/impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
MeshRelationAccessProxy,
MeshReorderedMatrixFieldProxy,
MeshReorderedScalarFieldProxy, element_type_name)
from taichi.lang.simt.block import SharedArray
from taichi.lang.snode import SNode
from taichi.lang.struct import Struct, StructField, _IntermediateStruct
from taichi.lang.util import (cook_dtype, get_traceback, is_taichi_class,
Expand All @@ -32,6 +33,12 @@ def expr_init_local_tensor(shape, element_type, elements):
shape, element_type, elements)


@taichi_scope
def expr_init_shared_array(shape, element_type):
    """Insert a shared-array allocation into the AST currently being built.

    Args:
        shape: Shape of the array, forwarded unchanged to the AST builder.
        element_type: Element data type, forwarded unchanged to the AST
            builder.

    Returns:
        Whatever ``expr_alloca_shared_array`` of the current AST builder
        returns for this allocation.
    """
    ast_builder = get_runtime().prog.current_ast_builder()
    return ast_builder.expr_alloca_shared_array(shape, element_type)


@taichi_scope
def expr_init(rhs):
if rhs is None:
Expand All @@ -40,6 +47,8 @@ def expr_init(rhs):
return type(rhs)(*rhs.to_list())
if isinstance(rhs, Matrix):
return Matrix(rhs.to_list())
if isinstance(rhs, SharedArray):
return rhs
if isinstance(rhs, Struct):
return Struct(rhs.to_dict(include_methods=True))
if isinstance(rhs, list):
Expand Down
14 changes: 14 additions & 0 deletions python/taichi/lang/simt/block.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
from taichi.lang import impl
from taichi.lang.util import taichi_scope


def sync():
    """Issue the ``block_barrier`` internal call (no runtime context passed).

    Returns:
        The result of ``impl.call_internal`` for that call.
    """
    barrier = impl.call_internal("block_barrier", with_runtime_context=False)
    return barrier


class SharedArray:
    """Array allocated through ``impl.expr_init_shared_array``.

    Args:
        shape (Union[int, Sequence[int]]): Extent of every dimension. A bare
            ``int`` is accepted as shorthand for a one-dimensional array.
        dtype: Element data type of the array.
    """
    _is_taichi_class = True

    def __init__(self, shape, dtype):
        # The allocation call expects a sequence of extents, so promote a bare
        # integer to a 1-tuple instead of letting it fail further down.
        if isinstance(shape, int):
            shape = (shape, )
        self.shape = shape
        self.dtype = dtype
        self.shared_array_proxy = impl.expr_init_shared_array(shape, dtype)

    @taichi_scope
    def _subscript(self, *indices, get_ref=False):
        """Build the index expression for ``self[indices]``.

        ``get_ref`` is accepted but not used here.
        """
        return impl.make_index_expr(self.shared_array_proxy, (indices, ))
24 changes: 23 additions & 1 deletion taichi/codegen/llvm/codegen_llvm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,29 @@ void CodeGenLLVM::visit(AllocaStmt *stmt) {
auto type = tlctx->get_data_type(tensor_type->get_element_type());
auto array_size = tlctx->get_constant(tensor_type->get_num_elements());
// Return type is [array_size x type]*.
llvm_val[stmt] = create_entry_block_alloca(type, 0, array_size);
if (stmt->is_shared) {
size_t data_element_size = tlctx->get_type_size(
tlctx->get_data_type(tensor_type->get_element_type()));
auto type = llvm::ArrayType::get(
llvm::Type::getInt8Ty(*llvm_context),
data_element_size * tensor_type->get_num_elements());
auto base = new llvm::GlobalVariable(
*module, type, false, llvm::GlobalValue::ExternalLinkage, nullptr,
fmt::format("shared_array_{}", stmt->id), nullptr,
llvm::GlobalVariable::NotThreadLocal, 3 /*addrspace=shared*/);
base->setAlignment(llvm::MaybeAlign(8));

auto ptr = builder->CreateGEP(
#ifdef TI_LLVM_15
base->getValueType(),
#endif
base, {tlctx->get_constant(0), tlctx->get_constant(0)});
auto ptr_type = llvm::PointerType::get(
tlctx->get_data_type(tensor_type->get_element_type()), 0);
llvm_val[stmt] = builder->CreatePointerCast(ptr, ptr_type);
} else {
llvm_val[stmt] = create_entry_block_alloca(type, 0, array_size);
}
} else {
TI_ASSERT(stmt->width() == 1);
llvm_val[stmt] =
Expand Down
10 changes: 10 additions & 0 deletions taichi/ir/frontend_ir.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -985,6 +985,16 @@ Expr ASTBuilder::expr_alloca_local_tensor(const std::vector<int> &shape,
return var;
}

// Creates a fresh identifier expression, inserts a FrontendAllocaStmt for a
// tensor of the given shape and element type with its is_shared flag set, and
// returns the identifier expression typed like the inserted statement.
Expr ASTBuilder::expr_alloca_shared_array(const std::vector<int> &shape,
                                          const DataType &element_type) {
  auto shared_var = Expr(std::make_shared<IdExpression>(get_next_id()));
  // Holding the downcast pointer keeps the identifier alive for the insert.
  auto id_expr = std::static_pointer_cast<IdExpression>(shared_var.expr);
  this->insert(std::make_unique<FrontendAllocaStmt>(
      id_expr->id, shape, element_type, /*is_shared=*/true));
  // The variable takes the tensor type computed by the alloca statement.
  shared_var->ret_type = this->get_last_stmt()->ret_type;
  return shared_var;
}

void ASTBuilder::expr_assign(const Expr &lhs, const Expr &rhs, std::string tb) {
TI_ASSERT(lhs->is_lvalue());
auto stmt = std::make_unique<FrontendAssignStmt>(lhs, rhs);
Expand Down
18 changes: 14 additions & 4 deletions taichi/ir/frontend_ir.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,17 +63,21 @@ class FrontendAllocaStmt : public Stmt {
public:
Identifier ident;

FrontendAllocaStmt(const Identifier &lhs, DataType type) : ident(lhs) {
FrontendAllocaStmt(const Identifier &lhs, DataType type)
: ident(lhs), is_shared(false) {
ret_type = TypeFactory::create_vector_or_scalar_type(1, type);
}

FrontendAllocaStmt(const Identifier &lhs,
std::vector<int> shape,
DataType element)
: ident(lhs) {
DataType element,
bool is_shared = false)
: ident(lhs), is_shared(is_shared) {
ret_type = DataType(TypeFactory::create_tensor_type(shape, element));
}

bool is_shared;

TI_DEFINE_ACCEPT
};

Expand Down Expand Up @@ -874,6 +878,8 @@ class ASTBuilder {
Expr expr_alloca_local_tensor(const std::vector<int> &shape,
const DataType &element_type,
const ExprGroup &elements);
Expr expr_alloca_shared_array(const std::vector<int> &shape,
const DataType &element_type);
void expr_assign(const Expr &lhs, const Expr &rhs, std::string tb);
void create_assert_stmt(const Expr &cond,
const std::string &msg,
Expand Down Expand Up @@ -907,7 +913,11 @@ class ASTBuilder {
}

// Records `v` as the block dimension in the pending for-loop configuration.
//
// On CUDA, `v` must be a multiple of 32 or a power of two (32 is presumably
// the warp size -- confirm). On every other arch it must be a power of two.
// The check is deliberately arch-dependent: an unconditional power-of-two
// assert ahead of it would reject values such as 96 on CUDA.
void block_dim(int v) {
  if (arch_ == Arch::cuda) {
    TI_ASSERT((v % 32 == 0) || bit::is_power_of_two(v));
  } else {
    TI_ASSERT(bit::is_power_of_two(v));
  }
  for_loop_dec_.config.block_dim = v;
}

Expand Down
12 changes: 8 additions & 4 deletions taichi/ir/statements.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,20 @@ class Function;
*/
class AllocaStmt : public Stmt {
public:
AllocaStmt(DataType type) {
AllocaStmt(DataType type) : is_shared(false) {
ret_type = TypeFactory::create_vector_or_scalar_type(1, type);
TI_STMT_REG_FIELDS;
}

AllocaStmt(int width, DataType type) {
AllocaStmt(int width, DataType type) : is_shared(false) {
ret_type = TypeFactory::create_vector_or_scalar_type(width, type);
TI_STMT_REG_FIELDS;
}

AllocaStmt(const std::vector<int> &shape, DataType type) {
AllocaStmt(const std::vector<int> &shape,
DataType type,
bool is_shared = false)
: is_shared(is_shared) {
ret_type = TypeFactory::create_tensor_type(shape, type);
TI_STMT_REG_FIELDS;
}
Expand All @@ -41,7 +44,8 @@ class AllocaStmt : public Stmt {
return false;
}

TI_STMT_DEF_FIELDS(ret_type);
bool is_shared;
TI_STMT_DEF_FIELDS(ret_type, is_shared);
TI_DEFINE_ACCEPT_AND_CLONE
};

Expand Down
1 change: 1 addition & 0 deletions taichi/python/export_lang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,7 @@ void export_lang(py::module &m) {
.def("insert_external_func_call", &ASTBuilder::insert_external_func_call)
.def("expr_alloca", &ASTBuilder::expr_alloca)
.def("expr_alloca_local_tensor", &ASTBuilder::expr_alloca_local_tensor)
.def("expr_alloca_shared_array", &ASTBuilder::expr_alloca_shared_array)
.def("create_assert_stmt", &ASTBuilder::create_assert_stmt)
.def("expr_assign", &ASTBuilder::expr_assign)
.def("begin_frontend_range_for", &ASTBuilder::begin_frontend_range_for)
Expand Down
6 changes: 4 additions & 2 deletions taichi/transforms/ir_printer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,8 @@ class IRPrinter : public IRVisitor {
}

// Prints a frontend alloca as "<type hint>$<id> = alloca[(shared)] <name>".
void visit(FrontendAllocaStmt *alloca) override {
  // Tag allocations whose is_shared flag is set so they stand out in the dump.
  const std::string shared_suffix = alloca->is_shared ? "(shared)" : "";
  print("{}${} = alloca{} {}", alloca->type_hint(), alloca->id, shared_suffix,
        alloca->ident.name());
}

Expand Down Expand Up @@ -187,7 +188,8 @@ class IRPrinter : public IRVisitor {
}

// Prints an alloca as "<type hint>$<id> = alloca[(shared)]".
void visit(AllocaStmt *alloca) override {
  // Emit exactly one line per statement; allocations whose is_shared flag is
  // set get a "(shared)" tag appended.
  const std::string shared_suffix = alloca->is_shared ? "(shared)" : "";
  print("{}${} = alloca{}", alloca->type_hint(), alloca->id, shared_suffix);
}

void visit(RandStmt *stmt) override {
Expand Down
3 changes: 2 additions & 1 deletion taichi/transforms/lower_ast.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@ class LowerAST : public IRVisitor {
if (stmt->ret_type->is<TensorType>()) {
auto tensor_type = stmt->ret_type->cast<TensorType>();
auto lowered = std::make_unique<AllocaStmt>(
tensor_type->get_shape(), tensor_type->get_element_type());
tensor_type->get_shape(), tensor_type->get_element_type(),
stmt->is_shared);
block->local_var_to_stmt.insert(std::make_pair(ident, lowered.get()));
stmt->parent->replace_with(stmt, std::move(lowered));
} else {
Expand Down
27 changes: 27 additions & 0 deletions tests/python/test_shared_array.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import taichi as ti
from tests import test_utils


@test_utils.test(arch=ti.cuda)
def test_shared_array_save():
    # Each thread writes 2 * tid into a SharedArray, then copies its own slot
    # back into the global field after a block barrier.
    block_dim = 128
    pad_num = 16
    total = block_dim * pad_num
    a = ti.field(dtype=ti.f32, shape=(total, ))

    @ti.kernel
    def func():
        ti.loop_config(block_dim=block_dim)
        for i in range(total):
            global_tid = ti.global_thread_idx()
            tid = global_tid % block_dim
            pad = ti.simt.block.SharedArray((block_dim, ), ti.f32)
            pad[tid] = tid * 2.0
            ti.simt.block.sync()
            a[i] = pad[tid]
            ti.simt.block.sync()

    func()
    # Spot-check three lanes in every block-sized slice of the field.
    for block in range(pad_num):
        base = block * block_dim
        for lane in (7, 29, 127):
            assert a[base + lane] == lane * 2.0

0 comments on commit 7e74c76

Please sign in to comment.