diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py
index 43940fce55439..4168454898c43 100644
--- a/python/taichi/lang/impl.py
+++ b/python/taichi/lang/impl.py
@@ -19,6 +19,7 @@
                               MeshRelationAccessProxy,
                               MeshReorderedMatrixFieldProxy,
                               MeshReorderedScalarFieldProxy, element_type_name)
+from taichi.lang.simt.block import SharedArray
 from taichi.lang.snode import SNode
 from taichi.lang.struct import Struct, StructField, _IntermediateStruct
 from taichi.lang.util import (cook_dtype, get_traceback, is_taichi_class,
@@ -32,6 +33,12 @@ def expr_init_local_tensor(shape, element_type, elements):
         shape, element_type, elements)
 
 
+@taichi_scope
+def expr_init_shared_array(shape, element_type):
+    return get_runtime().prog.current_ast_builder().expr_alloca_shared_array(
+        shape, element_type)
+
+
 @taichi_scope
 def expr_init(rhs):
     if rhs is None:
@@ -40,6 +47,8 @@ def expr_init(rhs):
         return type(rhs)(*rhs.to_list())
     if isinstance(rhs, Matrix):
         return Matrix(rhs.to_list())
+    if isinstance(rhs, SharedArray):
+        return rhs
     if isinstance(rhs, Struct):
         return Struct(rhs.to_dict(include_methods=True))
     if isinstance(rhs, list):
diff --git a/python/taichi/lang/simt/block.py b/python/taichi/lang/simt/block.py
index 427664473dd5d..0baf3f352b5de 100644
--- a/python/taichi/lang/simt/block.py
+++ b/python/taichi/lang/simt/block.py
@@ -1,5 +1,19 @@
 from taichi.lang import impl
+from taichi.lang.util import taichi_scope
 
 
 def sync():
     return impl.call_internal("block_barrier", with_runtime_context=False)
+
+
+class SharedArray:
+    _is_taichi_class = True
+
+    def __init__(self, shape, dtype):
+        self.shape = shape
+        self.dtype = dtype
+        self.shared_array_proxy = impl.expr_init_shared_array(shape, dtype)
+
+    @taichi_scope
+    def _subscript(self, *indices, get_ref=False):
+        return impl.make_index_expr(self.shared_array_proxy, (indices, ))
diff --git a/taichi/codegen/llvm/codegen_llvm.cpp b/taichi/codegen/llvm/codegen_llvm.cpp
index 6966c81f44d84..bb357a77de202 100644
--- a/taichi/codegen/llvm/codegen_llvm.cpp
+++ b/taichi/codegen/llvm/codegen_llvm.cpp
@@ -128,7 +128,29 @@ void CodeGenLLVM::visit(AllocaStmt *stmt) {
     auto type = tlctx->get_data_type(tensor_type->get_element_type());
     auto array_size = tlctx->get_constant(tensor_type->get_num_elements());
     // Return type is [array_size x type]*.
-    llvm_val[stmt] = create_entry_block_alloca(type, 0, array_size);
+    if (stmt->is_shared) {
+      size_t data_element_size = tlctx->get_type_size(
+          tlctx->get_data_type(tensor_type->get_element_type()));
+      auto type = llvm::ArrayType::get(
+          llvm::Type::getInt8Ty(*llvm_context),
+          data_element_size * tensor_type->get_num_elements());
+      auto base = new llvm::GlobalVariable(
+          *module, type, false, llvm::GlobalValue::ExternalLinkage, nullptr,
+          fmt::format("shared_array_{}", stmt->id), nullptr,
+          llvm::GlobalVariable::NotThreadLocal, 3 /*addrspace=shared*/);
+      base->setAlignment(llvm::MaybeAlign(8));
+
+      auto ptr = builder->CreateGEP(
+#ifdef TI_LLVM_15
+          base->getValueType(),
+#endif
+          base, {tlctx->get_constant(0), tlctx->get_constant(0)});
+      auto ptr_type = llvm::PointerType::get(
+          tlctx->get_data_type(tensor_type->get_element_type()), 0);
+      llvm_val[stmt] = builder->CreatePointerCast(ptr, ptr_type);
+    } else {
+      llvm_val[stmt] = create_entry_block_alloca(type, 0, array_size);
+    }
   } else {
     TI_ASSERT(stmt->width() == 1);
     llvm_val[stmt] =
diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp
index f2c03ea18c99d..56d5de025a9e8 100644
--- a/taichi/ir/frontend_ir.cpp
+++ b/taichi/ir/frontend_ir.cpp
@@ -985,6 +985,16 @@ Expr ASTBuilder::expr_alloca_local_tensor(const std::vector<int> &shape,
   return var;
 }
 
+Expr ASTBuilder::expr_alloca_shared_array(const std::vector<int> &shape,
+                                          const DataType &element_type) {
+  auto var = Expr(std::make_shared<IdExpression>(get_next_id()));
+  this->insert(std::make_unique<FrontendAllocaStmt>(
+      std::static_pointer_cast<IdExpression>(var.expr)->id, shape, element_type,
+      true));
+  var->ret_type = this->get_last_stmt()->ret_type;
+  return var;
+}
+
 void ASTBuilder::expr_assign(const Expr &lhs, const Expr &rhs, std::string tb) {
   TI_ASSERT(lhs->is_lvalue());
   auto stmt = std::make_unique<AssignStmt>(lhs, rhs);
diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h
index fdbe86c95c620..9859f6a9cb63d 100644
--- a/taichi/ir/frontend_ir.h
+++ b/taichi/ir/frontend_ir.h
@@ -63,17 +63,21 @@ class FrontendAllocaStmt : public Stmt {
  public:
   Identifier ident;
 
-  FrontendAllocaStmt(const Identifier &lhs, DataType type) : ident(lhs) {
+  FrontendAllocaStmt(const Identifier &lhs, DataType type)
+      : ident(lhs), is_shared(false) {
     ret_type = TypeFactory::create_vector_or_scalar_type(1, type);
   }
 
   FrontendAllocaStmt(const Identifier &lhs,
                      std::vector<int> shape,
-                     DataType element)
-      : ident(lhs) {
+                     DataType element,
+                     bool is_shared = false)
+      : ident(lhs), is_shared(is_shared) {
     ret_type = DataType(TypeFactory::create_tensor_type(shape, element));
   }
 
+  bool is_shared;
+
   TI_DEFINE_ACCEPT
 };
@@ -874,6 +878,8 @@ class ASTBuilder {
   Expr expr_alloca_local_tensor(const std::vector<int> &shape,
                                 const DataType &element_type,
                                 const ExprGroup &elements);
+  Expr expr_alloca_shared_array(const std::vector<int> &shape,
+                                const DataType &element_type);
   void expr_assign(const Expr &lhs, const Expr &rhs, std::string tb);
   void create_assert_stmt(const Expr &cond, const std::string &msg,
@@ -907,7 +913,11 @@ class ASTBuilder {
   }
 
   void block_dim(int v) {
-    TI_ASSERT(bit::is_power_of_two(v));
+    if (arch_ == Arch::cuda) {
+      TI_ASSERT((v % 32 == 0) || bit::is_power_of_two(v));
+    } else {
+      TI_ASSERT(bit::is_power_of_two(v));
+    }
     for_loop_dec_.config.block_dim = v;
   }
diff --git a/taichi/ir/statements.h b/taichi/ir/statements.h
index 0ffa885f1fb27..02937e9849d1f 100644
--- a/taichi/ir/statements.h
+++ b/taichi/ir/statements.h
@@ -18,17 +18,20 @@ class Function;
  */
 class AllocaStmt : public Stmt {
  public:
-  AllocaStmt(DataType type) {
+  AllocaStmt(DataType type) : is_shared(false) {
    ret_type = TypeFactory::create_vector_or_scalar_type(1, type);
     TI_STMT_REG_FIELDS;
   }
 
-  AllocaStmt(int width, DataType type) {
+  AllocaStmt(int width, DataType type) : is_shared(false) {
     ret_type = TypeFactory::create_vector_or_scalar_type(width, type);
     TI_STMT_REG_FIELDS;
   }
 
-  AllocaStmt(const std::vector<int> &shape, DataType type) {
+  AllocaStmt(const std::vector<int> &shape,
+             DataType type,
+             bool is_shared = false)
+      : is_shared(is_shared) {
     ret_type = TypeFactory::create_tensor_type(shape, type);
     TI_STMT_REG_FIELDS;
   }
@@ -41,7 +44,8 @@ class AllocaStmt : public Stmt {
     return false;
   }
 
-  TI_STMT_DEF_FIELDS(ret_type);
+  bool is_shared;
+  TI_STMT_DEF_FIELDS(ret_type, is_shared);
   TI_DEFINE_ACCEPT_AND_CLONE
 };
diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp
index a2ea36d1ecfc2..edf26ec7078c5 100644
--- a/taichi/python/export_lang.cpp
+++ b/taichi/python/export_lang.cpp
@@ -303,6 +303,7 @@ void export_lang(py::module &m) {
       .def("insert_external_func_call", &ASTBuilder::insert_external_func_call)
       .def("expr_alloca", &ASTBuilder::expr_alloca)
       .def("expr_alloca_local_tensor", &ASTBuilder::expr_alloca_local_tensor)
+      .def("expr_alloca_shared_array", &ASTBuilder::expr_alloca_shared_array)
       .def("create_assert_stmt", &ASTBuilder::create_assert_stmt)
       .def("expr_assign", &ASTBuilder::expr_assign)
       .def("begin_frontend_range_for", &ASTBuilder::begin_frontend_range_for)
diff --git a/taichi/transforms/ir_printer.cpp b/taichi/transforms/ir_printer.cpp
index eb94695e83125..ca462e42773e8 100644
--- a/taichi/transforms/ir_printer.cpp
+++ b/taichi/transforms/ir_printer.cpp
@@ -119,7 +119,8 @@ class IRPrinter : public IRVisitor {
   }
 
   void visit(FrontendAllocaStmt *alloca) override {
-    print("{}${} = alloca {}", alloca->type_hint(), alloca->id,
+    std::string shared_suffix = (alloca->is_shared) ? "(shared)" : "";
+    print("{}${} = alloca{} {}", alloca->type_hint(), alloca->id, shared_suffix,
           alloca->ident.name());
   }
 
@@ -187,7 +188,8 @@ class IRPrinter : public IRVisitor {
   }
 
   void visit(AllocaStmt *alloca) override {
-    print("{}${} = alloca", alloca->type_hint(), alloca->id);
+    std::string shared_suffix = (alloca->is_shared) ? "(shared)" : "";
"(shared)" : ""; + print("{}${} = alloca{}", alloca->type_hint(), alloca->id, shared_suffix); } void visit(RandStmt *stmt) override { diff --git a/taichi/transforms/lower_ast.cpp b/taichi/transforms/lower_ast.cpp index e7c390ce3c41e..90e33cba4e01b 100644 --- a/taichi/transforms/lower_ast.cpp +++ b/taichi/transforms/lower_ast.cpp @@ -71,7 +71,8 @@ class LowerAST : public IRVisitor { if (stmt->ret_type->is()) { auto tensor_type = stmt->ret_type->cast(); auto lowered = std::make_unique( - tensor_type->get_shape(), tensor_type->get_element_type()); + tensor_type->get_shape(), tensor_type->get_element_type(), + stmt->is_shared); block->local_var_to_stmt.insert(std::make_pair(ident, lowered.get())); stmt->parent->replace_with(stmt, std::move(lowered)); } else { diff --git a/tests/python/test_shared_array.py b/tests/python/test_shared_array.py new file mode 100644 index 0000000000000..64aa418807bea --- /dev/null +++ b/tests/python/test_shared_array.py @@ -0,0 +1,27 @@ +import taichi as ti +from tests import test_utils + + +@test_utils.test(arch=ti.cuda) +def test_shared_array_save(): + block_dim = 128 + pad_num = 16 + a = ti.field(dtype=ti.f32, shape=(block_dim * pad_num, )) + + @ti.kernel + def func(): + ti.loop_config(block_dim=block_dim) + for i in range(block_dim * pad_num): + g_tid = ti.global_thread_idx() + tid = g_tid % block_dim + pad = ti.simt.block.SharedArray((block_dim, ), ti.f32) + pad[tid] = tid * 2.0 + ti.simt.block.sync() + a[i] = pad[tid] + ti.simt.block.sync() + + func() + for i in range(pad_num): + assert a[i * block_dim + 7] == 14.0 + assert a[i * block_dim + 29] == 58.0 + assert a[i * block_dim + 127] == 254.0