[CUDA] Enable shared memory for CUDA #5429

Merged · 12 commits · Jul 20, 2022
9 changes: 9 additions & 0 deletions python/taichi/lang/impl.py
@@ -19,6 +19,7 @@
MeshRelationAccessProxy,
MeshReorderedMatrixFieldProxy,
MeshReorderedScalarFieldProxy, element_type_name)
from taichi.lang.simt.block import SharedArray
from taichi.lang.snode import SNode
from taichi.lang.struct import Struct, StructField, _IntermediateStruct
from taichi.lang.util import (cook_dtype, get_traceback, is_taichi_class,
@@ -32,6 +33,12 @@ def expr_init_local_tensor(shape, element_type, elements):
shape, element_type, elements)


@taichi_scope
def expr_init_shared_array(shape, element_type):
return get_runtime().prog.current_ast_builder().expr_alloca_shared_array(
shape, element_type)


@taichi_scope
def expr_init(rhs):
if rhs is None:
@@ -40,6 +47,8 @@ def expr_init(rhs):
return type(rhs)(*rhs.to_list())
if isinstance(rhs, Matrix):
return Matrix(rhs.to_list())
if isinstance(rhs, SharedArray):
return rhs
if isinstance(rhs, Struct):
return Struct(rhs.to_dict(include_methods=True))
if isinstance(rhs, list):
14 changes: 14 additions & 0 deletions python/taichi/lang/simt/block.py
@@ -1,5 +1,19 @@
from taichi.lang import impl
from taichi.lang.util import taichi_scope


def sync():
return impl.call_internal("block_barrier", with_runtime_context=False)


class SharedArray:
_is_taichi_class = True

def __init__(self, shape, dtype):
self.shape = shape
self.dtype = dtype
self.shared_array_proxy = impl.expr_init_shared_array(shape, dtype)

@taichi_scope
def _subscript(self, indices, get_ref=False):
return impl.make_index_expr(self.shared_array_proxy, (indices, ))
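Note: the snippet below is a minimal usage sketch of the API added in block.py above, not part of this diff. It assumes the CUDA backend and that `ti.loop_config(block_dim=...)` pins the block size of the outer loop; `SharedArray`, `sync()` and `ti.global_thread_idx()` are the names introduced or used in this PR, everything else is standard Taichi.

```python
import taichi as ti

ti.init(arch=ti.cuda)

N = 1024
block_dim = 128
src = ti.field(dtype=ti.f32, shape=N)
dst = ti.field(dtype=ti.f32, shape=N)


@ti.kernel
def copy_through_shared():
    ti.loop_config(block_dim=block_dim)
    for i in range(N):
        tid = ti.global_thread_idx() % block_dim
        # One buffer per CUDA thread block, backed by shared memory.
        pad = ti.simt.block.SharedArray((block_dim, ), ti.f32)
        pad[tid] = src[i]
        ti.simt.block.sync()  # lowers to the "block_barrier" internal call above
        dst[i] = pad[tid]


copy_through_shared()
```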
24 changes: 23 additions & 1 deletion taichi/codegen/llvm/codegen_llvm.cpp
@@ -128,7 +128,29 @@ void CodeGenLLVM::visit(AllocaStmt *stmt) {
auto type = tlctx->get_data_type(tensor_type->get_element_type());
auto array_size = tlctx->get_constant(tensor_type->get_num_elements());
// Return type is [array_size x type]*.
llvm_val[stmt] = create_entry_block_alloca(type, 0, array_size);
if (stmt->is_shared) {
size_t data_element_size = tlctx->get_type_size(
tlctx->get_data_type(tensor_type->get_element_type()));
auto type = llvm::ArrayType::get(
llvm::Type::getInt8Ty(*llvm_context),
data_element_size * tensor_type->get_num_elements());
auto base = new llvm::GlobalVariable(
*module, type, false, llvm::GlobalValue::ExternalLinkage, nullptr,
fmt::format("shared_array_{}", stmt->id), nullptr,
llvm::GlobalVariable::NotThreadLocal, 3 /*addrspace=shared*/);
base->setAlignment(llvm::MaybeAlign(8));

auto ptr = builder->CreateGEP(
#ifdef TI_LLVM_15
base->getValueType(),
#endif
base, {tlctx->get_constant(0), tlctx->get_constant(0)});
auto ptr_type = llvm::PointerType::get(
tlctx->get_data_type(tensor_type->get_element_type()), 0);
llvm_val[stmt] = builder->CreatePointerCast(ptr, ptr_type);
} else {
llvm_val[stmt] = create_entry_block_alloca(type, 0, array_size);
}
} else {
TI_ASSERT(stmt->width() == 1);
llvm_val[stmt] =
10 changes: 10 additions & 0 deletions taichi/ir/frontend_ir.cpp
@@ -985,6 +985,16 @@ Expr ASTBuilder::expr_alloca_local_tensor(const std::vector<int> &shape,
return var;
}

Expr ASTBuilder::expr_alloca_shared_array(const std::vector<int> &shape,
const DataType &element_type) {
auto var = Expr(std::make_shared<IdExpression>(get_next_id()));
this->insert(std::make_unique<FrontendAllocaStmt>(
std::static_pointer_cast<IdExpression>(var.expr)->id, shape, element_type,
true));
var->ret_type = this->get_last_stmt()->ret_type;
return var;
}

void ASTBuilder::expr_assign(const Expr &lhs, const Expr &rhs, std::string tb) {
TI_ASSERT(lhs->is_lvalue());
auto stmt = std::make_unique<FrontendAssignStmt>(lhs, rhs);
18 changes: 14 additions & 4 deletions taichi/ir/frontend_ir.h
@@ -63,17 +63,21 @@ class FrontendAllocaStmt : public Stmt {
public:
Identifier ident;

FrontendAllocaStmt(const Identifier &lhs, DataType type) : ident(lhs) {
FrontendAllocaStmt(const Identifier &lhs, DataType type)
: ident(lhs), is_shared(false) {
ret_type = TypeFactory::create_vector_or_scalar_type(1, type);
}

FrontendAllocaStmt(const Identifier &lhs,
std::vector<int> shape,
DataType element)
: ident(lhs) {
DataType element,
bool is_shared = false)
: ident(lhs), is_shared(is_shared) {
ret_type = DataType(TypeFactory::create_tensor_type(shape, element));
}

bool is_shared;

TI_DEFINE_ACCEPT
};

@@ -874,6 +878,8 @@ class ASTBuilder {
Expr expr_alloca_local_tensor(const std::vector<int> &shape,
const DataType &element_type,
const ExprGroup &elements);
Expr expr_alloca_shared_array(const std::vector<int> &shape,
const DataType &element_type);
void expr_assign(const Expr &lhs, const Expr &rhs, std::string tb);
void create_assert_stmt(const Expr &cond,
const std::string &msg,
@@ -907,7 +913,11 @@
}

void block_dim(int v) {
TI_ASSERT(bit::is_power_of_two(v));
if (arch_ == Arch::cuda) {
TI_ASSERT((v % 32 == 0) || bit::is_power_of_two(v));
} else {
TI_ASSERT(bit::is_power_of_two(v));
}
for_loop_dec_.config.block_dim = v;
}

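Note: the relaxed `block_dim` assertion above accepts, on CUDA, any value that is a multiple of 32 or a power of two; other backends keep the power-of-two requirement. A hedged sketch of what this enables from the Python side, assuming `ti.loop_config(block_dim=...)` routes through `ASTBuilder::block_dim`:

```python
import taichi as ti

ti.init(arch=ti.cuda)

x = ti.field(dtype=ti.f32, shape=960)


@ti.kernel
def scale():
    # 96 is a multiple of the warp size (32) but not a power of two;
    # with the relaxed check it should now pass the assertion on CUDA,
    # while non-CUDA backends still require a power of two.
    ti.loop_config(block_dim=96)
    for i in x:
        x[i] *= 2.0


scale()
```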
12 changes: 8 additions & 4 deletions taichi/ir/statements.h
@@ -18,17 +18,20 @@ class Function;
*/
class AllocaStmt : public Stmt {
public:
AllocaStmt(DataType type) {
AllocaStmt(DataType type) : is_shared(false) {
ret_type = TypeFactory::create_vector_or_scalar_type(1, type);
TI_STMT_REG_FIELDS;
}

AllocaStmt(int width, DataType type) {
AllocaStmt(int width, DataType type) : is_shared(false) {
ret_type = TypeFactory::create_vector_or_scalar_type(width, type);
TI_STMT_REG_FIELDS;
}

AllocaStmt(const std::vector<int> &shape, DataType type) {
AllocaStmt(const std::vector<int> &shape,
DataType type,
bool is_shared = false)
: is_shared(is_shared) {
ret_type = TypeFactory::create_tensor_type(shape, type);
TI_STMT_REG_FIELDS;
}
@@ -41,7 +44,8 @@ class AllocaStmt : public Stmt {
return false;
}

TI_STMT_DEF_FIELDS(ret_type);
bool is_shared;
TI_STMT_DEF_FIELDS(ret_type, is_shared);
TI_DEFINE_ACCEPT_AND_CLONE
};

1 change: 1 addition & 0 deletions taichi/python/export_lang.cpp
@@ -303,6 +303,7 @@ void export_lang(py::module &m) {
.def("insert_external_func_call", &ASTBuilder::insert_external_func_call)
.def("expr_alloca", &ASTBuilder::expr_alloca)
.def("expr_alloca_local_tensor", &ASTBuilder::expr_alloca_local_tensor)
.def("expr_alloca_shared_array", &ASTBuilder::expr_alloca_shared_array)
.def("create_assert_stmt", &ASTBuilder::create_assert_stmt)
.def("expr_assign", &ASTBuilder::expr_assign)
.def("begin_frontend_range_for", &ASTBuilder::begin_frontend_range_for)
8 changes: 5 additions & 3 deletions taichi/transforms/ir_printer.cpp
@@ -119,8 +119,9 @@ class IRPrinter : public IRVisitor {
}

void visit(FrontendAllocaStmt *alloca) override {
print("{}${} = alloca {}", alloca->type_hint(), alloca->id,
alloca->ident.name());
std::string shared_suffix = (alloca->is_shared) ? " shared" : "";
print("{}${} = alloca {}{}", alloca->type_hint(), alloca->id,
alloca->ident.name(), shared_suffix);
}

void visit(FrontendAssertStmt *assert) override {
@@ -187,7 +188,8 @@
}

void visit(AllocaStmt *alloca) override {
print("{}${} = alloca", alloca->type_hint(), alloca->id);
std::string shared_suffix = (alloca->is_shared) ? " shared" : "";
print("{}${} = alloca{}", alloca->type_hint(), alloca->id, shared_suffix);
}

void visit(RandStmt *stmt) override {
3 changes: 2 additions & 1 deletion taichi/transforms/lower_ast.cpp
@@ -71,7 +71,8 @@ class LowerAST : public IRVisitor {
if (stmt->ret_type->is<TensorType>()) {
auto tensor_type = stmt->ret_type->cast<TensorType>();
auto lowered = std::make_unique<AllocaStmt>(
tensor_type->get_shape(), tensor_type->get_element_type());
tensor_type->get_shape(), tensor_type->get_element_type(),
stmt->is_shared);
block->local_var_to_stmt.insert(std::make_pair(ident, lowered.get()));
stmt->parent->replace_with(stmt, std::move(lowered));
} else {
27 changes: 27 additions & 0 deletions tests/python/test_shared_array.py
@@ -0,0 +1,27 @@
import taichi as ti
from tests import test_utils


@test_utils.test(arch=ti.cuda)
def test_shared_array_save():
block_dim = 128
pad_num = 16
a = ti.field(dtype=ti.f32, shape=(block_dim * pad_num, ))

@ti.kernel
def func():
ti.loop_config(block_dim=block_dim)
for i in range(block_dim * pad_num):
g_tid = ti.global_thread_idx()
tid = g_tid % block_dim
pad = ti.simt.block.SharedArray((block_dim, ), ti.f32)
pad[tid] = tid * 2.0
ti.simt.block.sync()
a[i] = pad[tid]
ti.simt.block.sync()

func()
for i in range(pad_num):
assert a[i * block_dim + 7] == 14.0
assert a[i * block_dim + 29] == 58.0
assert a[i * block_dim + 127] == 254.0
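Note: beyond the round-trip test above, a typical use of block-shared memory is an intra-block reduction. The sketch below is illustrative only and not part of this PR; it assumes the same contiguous thread-to-index mapping the test relies on, a power-of-two block_dim, and that SharedArray supports the indexing shown.

```python
import taichi as ti

ti.init(arch=ti.cuda)

block_dim = 128
n_blocks = 64
data = ti.field(dtype=ti.f32, shape=block_dim * n_blocks)
block_sums = ti.field(dtype=ti.f32, shape=n_blocks)


@ti.kernel
def block_reduce():
    ti.loop_config(block_dim=block_dim)
    for i in range(block_dim * n_blocks):
        tid = ti.global_thread_idx() % block_dim
        pad = ti.simt.block.SharedArray((block_dim, ), ti.f32)
        pad[tid] = data[i]
        ti.simt.block.sync()
        # Tree reduction within one thread block.
        stride = block_dim // 2
        while stride > 0:
            if tid < stride:
                pad[tid] += pad[tid + stride]
            ti.simt.block.sync()
            stride //= 2
        if tid == 0:
            block_sums[i // block_dim] = pad[0]


block_reduce()
```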